aio 爬虫,去重,入库
生活随笔
收集整理的这篇文章主要介绍了
aio 爬虫,去重,入库
小编觉得挺不错的,现在分享给大家,帮大家做个参考。
#aio 爬蟲,去重,入庫
import asyncio
import aiohttp
import aiomysql
import re
from pyquery import PyQuery

# --- Shared crawl state (used by all coroutines below) ---
stoping = False  # NOTE(review): typo for "stopping"; kept — consumer() reads this name
start_url = 'http://www.jobbole.com/'
waiting_urls = []   # URLs discovered but not yet processed
seen_urls = set()   # URL dedup set — a bloom filter would scale better for large crawls
sem = asyncio.Semaphore(3)  # cap the number of concurrent HTTP requests
async def fetch(url,session):async with sem:#await asyncio.sleep(0.5)try:async with session.get(url) as resp:print(resp.status)if resp.status in [200,201]:data = await resp.text()return dataexcept Exception as e :print(e)#因為不是耗費 io的 所以用普通函數
def extract_urls(html):urls = []pq = PyQuery(html)for link in pq.items('a'):url = link.attr('href')if url and url.startswith('http') and url not in seen_urls:urls.append(url)waiting_urls.append(url)return urlsasync def init_urls(url,session):html = await fetch(url,session)seen_urls.add(url)extract_urls(html)async def article_handeler(url,session,pool):#獲取文章詳情,并解析入庫html = await fetch(url,session)seen_urls.add(url)extract_urls(html)pq = PyQuery(html)title = pq('title').text()async with pool.acquire() as conn:async with conn.cursor() as cur:await cur.execute('SELECT 42;')insert_sql = 'insert into aiomysql_test(title) VALUES ("{}")'.format(title)await cur.execute(insert_sql)async def consumer(pool):async with aiohttp.ClientSession() as session:while not stoping:if len(waiting_urls) == 0:await asyncio.sleep(0.5)continueurl = waiting_urls.pop()print('start get url:{}'.format(url))if re.match('http://.*?jobbole.com/\d+/',url):if url not in seen_urls:asyncio.ensure_future(article_handeler(url,session,pool))await asyncio.sleep(0.5)else:if url not in seen_urls:asyncio.ensure_future(init_urls(url,session))async def main(loop):#等待mysql鏈接建立好pool = await aiomysql.create_pool(host='127.0.0.1',port = 3306,user = 'root',password='123456',db = 'aiomysql_test',loop=loop,charset = 'utf8',autocommit = True)async with aiohttp.ClientSession() as session:html = await fetch(start_url, session)seen_urls.add(start_url)extract_urls(html)asyncio.ensure_future(consumer(pool))if __name__ == "__main__":loop = asyncio.get_event_loop()asyncio.ensure_future(main(loop))loop.run_forever()
?
轉載于:https://www.cnblogs.com/Erick-L/p/8939607.html
總結
以上是生活随笔为你收集整理的aio 爬虫,去重,入库的全部内容,希望文章能够帮你解决所遇到的问题。
- 上一篇: rocketMq - commitLog
- 下一篇: 搭建Mysql-proxy实现主从同步读