A Python asynchronous I/O crawler for Lianjia (链家) second-hand housing listings, using asyncio, aiohttp and aiomysql
Many people run into web crawling when they first learn Python. Beginners usually start with synchronous libraries such as requests and urllib and crawl in a single thread, which is fairly slow. Later they move on to the Scrapy framework, which is much faster, because Scrapy is built on Twisted, an asynchronous I/O framework.
The asyncio library used in this example is also an asynchronous I/O framework. Since Python 3.5 the language has had the async keyword for coroutines, which clearly separates coroutines from generators and makes coroutines much more convenient to use.
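For readers new to the syntax, here is a minimal, self-contained sketch (not part of the crawler below) showing how a coroutine declared with async def is driven by the event loop instead of being called directly:

import asyncio

async def greet(name):
    # 'await' suspends this coroutine without blocking the event loop
    await asyncio.sleep(1)
    return 'hello, {}'.format(name)

loop = asyncio.get_event_loop()
print(loop.run_until_complete(greet('asyncio')))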
In testing, the crawler fetches about 30 detail pages per second on average.
asyncio.Semaphore can be used to cap the number of concurrent requests and thus throttle the crawler.
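The script below does not actually use a semaphore; the following is only a minimal sketch of the idea (the limit of 10 and the fetch_limited name are illustrative assumptions). Wrapping the request in async with semaphore ensures that only a bounded number of fetches are in flight at any moment:

import asyncio
import aiohttp

semaphore = asyncio.Semaphore(10)   # illustrative limit: at most 10 requests in flight

async def fetch_limited(url, session):
    # acquire a slot before issuing the request; the slot is released on exit
    async with semaphore:
        async with session.get(url) as resp:
            if resp.status in [200, 201]:
                return await resp.text()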
""":author: KK:url: http://github.com/PythonerKK:copyright: ? 2019 KK <705555262@qq.com.com>
"""
import asyncio
import re
import aiohttp
from pyquery
import PyQuery
import aiomysql
from lxml
import etreepool
= ''
stop
= False
headers
= {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
}
MAX_PAGE
= 10
TABLE_NAME
= 'data'
city
= 'zh'
url
= 'https://{}.lianjia.com/ershoufang/pg{}/'
urls
= []
links_detail
= set()
crawled_links_detail
= set() async def fetch(url
, session
):'''aiohttp獲取網頁源碼'''try:async with session
.get
(url
, headers
=headers
, verify_ssl
=False) as resp
:if resp
.status
in [200, 201]:data
= await resp
.text
()return data
except Exception
as e
:print(e
)def extract_links(source
):'''提取出詳情頁的鏈接'''pq
= PyQuery
(source
)for link
in pq
.items
("a"):_url
= link
.attr
("href")if _url
and re
.match
('https://.*?/\d+.html', _url
) and _url
.find
('{}.lianjia.com'.format(city
)):links_detail
.add
(_url
)print(links_detail
)def extract_elements(source
):'''提取出詳情頁里面的詳情內容'''try:dom
= etree
.HTML
(source
)id = dom
.xpath
('//link[@rel="canonical"]/@href')[0]title
= dom
.xpath
('//title/text()')[0]price
= dom
.xpath
('//span[@class="unitPriceValue"]/text()')[0]information
= dict(re
.compile('<li><span class="label">(.*?)</span>(.*?)</li>').findall
(source
))information
.update
(title
=title
, price
=price
, url
=id)print(information
)asyncio
.ensure_future
(save_to_database
(information
, pool
=pool
))except Exception
as e
:print('解析詳情頁出錯!')passasync def save_to_database(information
, pool
):'''使用異步IO方式保存數據到mysql中注:如果不存在數據表,則創建對應的表'''COLstr
= '' ROWstr
= '' ColumnStyle
= ' VARCHAR(255)'for key
in information
.keys
():COLstr
= COLstr
+ ' ' + key
+ ColumnStyle
+ ','ROWstr
= (ROWstr
+ '"%s"' + ',') % (information
[key
])async with pool
.acquire
() as conn
:async with conn
.cursor
() as cur
:try:await cur
.execute
("SELECT * FROM %s" % (TABLE_NAME
))await cur
.execute
("INSERT INTO %s VALUES (%s)"%(TABLE_NAME
, ROWstr
[:-1]))print('插入數據成功')except aiomysql
.Error
as e
:await cur
.execute
("CREATE TABLE %s (%s)" % (TABLE_NAME
, COLstr
[:-1]))await cur
.execute
("INSERT INTO %s VALUES (%s)" % (TABLE_NAME
, ROWstr
[:-1]))except aiomysql
.Error
as e
:print('mysql error %d: %s' % (e
.args
[0], e
.args
[1]))async def handle_elements(link
, session
):'''獲取詳情頁的內容并解析'''print('開始獲取: {}'.format(link
))source
= await fetch
(link
, session
)crawled_links_detail
.add
(link
)extract_elements
(source
)async def consumer():'''消耗未爬取的鏈接'''async with aiohttp
.ClientSession
() as session
:while not stop
:if len(urls
) != 0:_url
= urls
.pop
()source
= await fetch
(_url
, session
)print(_url
)extract_links
(source
)if len(links_detail
) == 0:print('目前沒有待爬取的鏈接')await asyncio
.sleep
(2)continuelink
= links_detail
.pop
()if link
not in crawled_links_detail
:asyncio
.ensure_future
(handle_elements
(link
, session
))async def main(loop
):global poolpool
= await aiomysql
.create_pool
(host
='127.0.0.1', port
=3306,user
='root', password
='xxxxxx',db
='aiomysql_lianjia', loop
=loop
, charset
='utf8',autocommit
=True)for i
in range(1, MAX_PAGE
):urls
.append
(url
.format(city
, str(i
)))print('爬取總頁數:{} 任務開始...'.format(str(MAX_PAGE
)))asyncio
.ensure_future
(consumer
())if __name__
== '__main__':loop
= asyncio
.get_event_loop
()asyncio
.ensure_future
(main
(loop
))loop
.run_forever
()