python抓取头条文章
生活随笔
收集整理的這篇文章主要介紹了
python抓取头条文章
小編覺得挺不錯的,現在分享給大家,幫大家做個參考.
python抓取頭條美文并存儲到mongodb
# Author:song
"""Scrape Toutiao search results and store the article text in MongoDB.

The script queries the Toutiao search endpoint page by page, follows each
``article_url`` found in the JSON response, extracts the page title and a
rough plain-text body, and inserts one document per article into the
``toutiaowenzhang`` collection of the ``toutiaowenzhang`` database.
"""
import json
import re
from multiprocessing import Pool
from urllib.parse import urlencode

import pymongo
import requests
from bs4 import BeautifulSoup
from requests import RequestException

# connect=False defers the actual connection until first use, so every
# worker process created by the Pool opens its own connection.
client = pymongo.MongoClient('localhost', connect=False)
db = client['toutiaowenzhang']

# Seconds to wait for a response before giving up on a request.
_REQUEST_TIMEOUT = 10

# Captures the first <div>...</div> that follows the word "content".
_ARTICLE_RE = re.compile('content.*?<div>(.*?)</div>', re.S)

# Characters stripped from the captured article body: ASCII letters, digits
# and a few punctuation marks (crude removal of markup and HTML entities).
_NOISE_RE = re.compile(r'[a-zA-Z0-9/#;&._]')


def _fetch_text(url):
    """Return the body of *url* as text, or None on any failure.

    A failure is either a ``RequestException`` raised by ``requests`` (the
    request itself must be inside the ``try`` for this to be caught) or a
    response whose status code is not 200.
    """
    try:
        response = requests.get(url, timeout=_REQUEST_TIMEOUT)
    except RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None


def get_index(offset):
    """Fetch one page of search results as raw JSON text.

    Args:
        offset: Value sent as the ``offset`` query parameter.

    Returns:
        The response text, or None if the request failed.
    """
    data = {
        'offset': offset,
        'format': 'json',
        'keyword': '美文',
        'autoload': 'true',
        'count': 20,
        'cur_tab': 1,
        'from': 'search_tab',
    }
    url = 'https://www.toutiao.com/search_content/?' + urlencode(data)
    return _fetch_text(url)


def get_urls(html):
    """Yield the ``article_url`` of every entry in a search-result payload.

    Args:
        html: JSON text as returned by ``get_index`` (may be None).

    Yields:
        The ``article_url`` value of each item under the top-level ``data``
        key. Items without that key yield None, so callers should filter
        out falsy values. Nothing is yielded when *html* is empty, is not
        valid JSON, or does not decode to an object with a ``data`` list.
    """
    if not html:
        return
    try:
        data = json.loads(html)
    except ValueError:
        return
    if not isinstance(data, dict):
        return
    # 'data' may be present but null; treat that like an empty list.
    for item in data.get('data') or []:
        if isinstance(item, dict):
            yield item.get('article_url')


def get_index_detail(url):
    """Fetch an article page.

    Args:
        url: Address of the article page.

    Returns:
        The page HTML as text, or None if the request failed.
    """
    return _fetch_text(url)


def parse_detail(html):
    """Extract the title and a rough plain-text body from an article page.

    Args:
        html: Page HTML as returned by ``get_index_detail`` (may be None).

    Returns:
        A dict with the keys ``title`` and ``article``, or None when *html*
        is not a non-empty string or the page has no ``<title>`` element
        (for example an error page).
    """
    if not isinstance(html, str) or not html:
        return None
    soup = BeautifulSoup(html, 'lxml')
    titles = soup.select('title')
    if not titles:
        return None
    title = titles[0].get_text()
    fragments = _ARTICLE_RE.findall(html)
    # Join the captured fragments instead of calling str() on the list, so
    # that list brackets, quotes and escape sequences from the list repr do
    # not end up in the stored text. Then strip letters/digits wholesale.
    article = _NOISE_RE.sub('', ''.join(fragments)).strip()
    return {'title': title, 'article': article}


def save_to_mongodb(result):
    """Insert *result* into the ``toutiaowenzhang`` collection.

    Prints ``successful`` when the insert is acknowledged and ``fail``
    otherwise (empty result, driver error, or unacknowledged write).

    Args:
        result: Document dict as returned by ``parse_detail``.
    """
    if not result:
        print('fail')
        return
    try:
        # insert_one replaces Collection.insert, which was deprecated in
        # PyMongo 3 and removed in PyMongo 4.
        outcome = db['toutiaowenzhang'].insert_one(result)
    except pymongo.errors.PyMongoError:
        print('fail')
        return
    if outcome.acknowledged:
        print('successful')
    else:
        print('fail')


def main(offset):
    """Crawl one search-result page and store every article found on it.

    Args:
        offset: Search offset passed to ``get_index``.
    """
    html = get_index(offset)
    for url in get_urls(html):
        if not url:
            continue
        result = parse_detail(get_index_detail(url))
        # Pages that could not be fetched or parsed are skipped.
        if result:
            save_to_mongodb(result)


if __name__ == '__main__':
    groups = [x * 20 for x in range(3)]
    # The context manager releases the worker processes once map() is done.
    with Pool() as pool:
        pool.map(main, groups)
轉載于:https://www.cnblogs.com/master-song/p/8922850.html
總結
以上是生活随笔為你收集整理的python抓取头条文章的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: MySQL数据类型(最大值 和 最小值)
- 下一篇: Verilog 编写规范