实训大作业7
datawhale-Task7
# Crawl topic data from DXY (dxy.cn) and store it in the database.
from time import sleep

import requests
from sqlalchemy.exc import IntegrityError
from sqlalchemy.orm import sessionmaker

from dxy.model import User, Subject, Comment, db_engine

session = sessionmaker(bind=db_engine)
db = session()

# Seconds to wait for the server before a request is abandoned; without a
# timeout a stalled connection would block the crawl indefinitely.
REQUEST_TIMEOUT = 10

# Seconds to pause after each page has been stored.
PAGE_DELAY = 2

# NOTE(review): these are hard-coded session cookies (JSESSIONID, JUTE_TOKEN,
# ...). They are credentials and will expire; consider loading them from the
# environment or a config file instead of keeping them in source.
_COOKIE_STRING = (
    "DXY_USER_GROUP=49; __auc=f5db5ffc1693f4415a8b9b324af; _ga=GA1.2.406327704.1551544823; __"
    "utmz=1.1551655682.5.2.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr="
    "(not%20provided); __utmz=3004402.1551676197.1.1.utmcsr=(direct)|utmccn=(direct)|"
    "utmcmd=(none); _gid=GA1.2.13027547.1551889001; JUTE_BBS_DATA=15fded56752024cd95f26a0e8df"
    "09dabc0b65dec4e30437ed04c6b1520f500a3a8e02c6161f0a5bd48d0f3ef0959dd38b9276650ec31f4f0c1"
    "59e419c1b97cd34c3a0891d95f2a3926ef6fb7c3b40b4a551ebbb281325a043e4082b5e123d2287015bdcf2e"
    "4925add012fdb048e846953a845df43b4b505c; __utma=1.406327704.1551544823.1551889130.1551914"
    "990.8; JSESSIONID=6D78A1886433974BF211D4EA7FFBDA91; __utma=3004402.406327704.1551544823."
    "1551914699.1551963394.6; __utmc=3004402; JUTE_SESSION_ID=f4417a7d-8d0b-417d-8f62-70b7018"
    "78879; JUTE_TOKEN=b72f68e1-a1d1-45e8-8f7f-974abcad9dc9; JUTE_SESSION=04c9d3a941888796762"
    "cc09e4d6b56b7a4047b1d26ec72a4bbb0433ff00144852b0e659c4741b4b7151463e8a91fdd12db83bc5ecde"
    "5622b66b04b11d64be607c44fe976b21f8170"
)


def save(data):
    """Persist one model instance, or a list of them, via the module session.

    A list is committed one item at a time, so an item rejected with
    ``IntegrityError`` is skipped without losing the items around it.
    A single instance is committed directly and any database error
    propagates to the caller.

    Args:
        data: A model instance, or a list of model instances.
    """
    if isinstance(data, list):
        for record in data:
            try:
                db.add(record)
                db.commit()
            except IntegrityError:
                # After a failed commit the session must be rolled back
                # before it can be used again; flush() would just re-raise.
                db.rollback()
    else:
        db.add(data)
        db.commit()


def _parse_cookies(cookie_string):
    """Convert a ``name=value; name=value`` cookie string into a dict.

    Only the first ``=`` separates name from value, so values that contain
    ``=`` themselves are kept intact. Segments without ``=`` are ignored.
    When a name occurs more than once, the last occurrence wins.
    """
    cookies = {}
    for segment in cookie_string.split(';'):
        name, separator, value = segment.strip().partition('=')
        if not separator:
            continue
        cookies[name] = value
    return cookies


def _fetch_page(url, page, headers, cookies):
    """Request page number ``page`` of ``url`` and return the decoded JSON."""
    response = requests.get(
        url.format(page),
        headers=headers,
        cookies=cookies,
        timeout=REQUEST_TIMEOUT,
    )
    # Fail on HTTP errors here rather than on a confusing JSON decode error.
    response.raise_for_status()
    return response.json()


def crawler(url):
    """Crawl every page of a topic and store its subject, users and comments.

    Args:
        url: Topic API URL containing a ``{}`` placeholder that is filled
            with the page number.

    Raises:
        requests.RequestException: If a request fails, times out, or returns
            an HTTP error status.
    """
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
            '(KHTML, like Gecko) Chrome/72.0.3626.96 Safari/537.36'
        ),
    }
    cookies = _parse_cookies(_COOKIE_STRING)

    first_page = _fetch_page(url, 1, headers, cookies)
    # NOTE(review): 'total' is used as the number of pages - confirm it is
    # not the total number of items.
    max_page = first_page['pageBean']['total']

    # The first item's author is recorded as the owner of the subject.
    subject = Subject(
        title=first_page['subject'],
        user_id=first_page['items'][0]['userId'],
    )
    save(subject)

    # Collect the comment data and the user data page by page.
    for page in range(1, max_page + 1):
        # Page 1 was already downloaded above; do not request it twice.
        if page == 1:
            response = first_page
        else:
            response = _fetch_page(url, page, headers, cookies)

        items = response['items']

        # Users who wrote the comments on this page.
        users = [
            User(
                user_id=author['userId'],
                avatar=author['avatar'],
                nickname=author['nickname'],
            )
            for author in (item['user'] for item in items)
        ]
        # The comments themselves.
        comments = [
            Comment(
                user_id=item['userId'],
                content=item['body'],
                subject_id=subject.id,
            )
            for item in items
        ]

        save(users)
        save(comments)
        sleep(PAGE_DELAY)


def main(url):
    """Run the crawler for the given topic URL."""
    crawler(url)


if __name__ == '__main__':
    url = (
        'http://3g.dxy.cn/bbs/bbsapi/mobile?'
        's=view_topic&checkUserAction=1&with'
        'Good=1&order=0&size=20&id=509959&page={}'
    )
    main(url)

# Reposted from: https://www.cnblogs.com/yuzw/p/10493277.html
總結
- 上一篇: 公积金每个月交多少
- 下一篇: [UE4]虚幻引擎的C++环境安装