Scraping Fang.com Rental Listings with Python: A Hands-on Guide
This article is a hands-on walkthrough of scraping rental listings from Fang.com (房天下) with Python, shared here for reference.
Contents

1. Single-threaded crawler
2. Optimizing into a multi-threaded crawler
3. Further optimization with asyncio
4. Saving to a MySQL database
(1) Creating the table
(2) Writing the data into the database
Approach: start with a single-threaded crawler, verify that it scrapes successfully, then optimize it into a multi-threaded one, and finally store the results in a database.

Rental listings for Zhengzhou (zz.zu.fang.com) are used as the example.

Note: this project is for learning purposes only. To avoid putting too much load on the site, set num in the code to a small value and reduce the thread counts, as in the sketch below.
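For instance, a minimal way to throttle the crawl (a sketch using the getHtml/getLink helpers defined in section 1 below; the page cap and delay are arbitrary choices) might be:

import time

num = min(num, 3)  # e.g. cap the crawl at 3 listing pages while testing
for i in range(num):
    url = f'https://zz.zu.fang.com/house/i3{i+1}/'
    text = getHtml(url)
    getLink(text)
    time.sleep(1)  # pause between requests to keep the load on the site low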
1. Single-threaded crawler
# Use a session instead of bare requests
# Parser: bs4
# Concurrency: concurrent.futures (added in the later sections)
import requests
# from lxml import etree  # alternative: parse with XPath
from bs4 import BeautifulSoup
from urllib import parse
import re
import time

headers = {
    'referer': 'https://zz.zu.fang.com/',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
    'cookie': 'global_cookie=ffzvt3kztwck05jm6twso2wjw18kl67hqft; city=zz; integratecover=1; __utma=147393320.427795962.1613371106.1613371106.1613371106.1; __utmc=147393320; __utmz=147393320.1613371106.1.1.utmcsr=zz.fang.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utmt_t0=1; __utmt_t1=1; __utmt_t2=1; ASP.NET_SessionId=aamzdnhzct4i5mx3ak4cyoyp; Rent_StatLog=23d82b94-13d6-4601-9019-ce0225c092f6; Captcha=61584F355169576F3355317957376E4F6F7552365351342B7574693561766E63785A70522F56557370586E3376585853346651565256574F37694B7074576B2B34536C5747715856516A4D3D; g_sourcepage=zf_fy%5Elb_pc; unique_cookie=U_ffzvt3kztwck05jm6twso2wjw18kl67hqft*6; __utmb=147393320.12.10.1613371106'
}
data = {'agentbid': ''}

session = requests.session()
session.headers = headers

# Fetch a page
def getHtml(url):
    try:
        res = session.get(url)
        res.encoding = res.apparent_encoding
        return res.text
    except Exception as e:
        print(e)

# Get the total number of result pages
def getNum(text):
    soup = BeautifulSoup(text, 'lxml')
    txt = soup.select('.fanye .txt')[0].text
    # pull the number out of the "共**頁" ("** pages in total") text
    num = int(re.search(r'\d+', txt).group(0))
    return num

# Collect the detail-page links
def getLink(text):
    soup = BeautifulSoup(text, 'lxml')
    links = soup.select('.title a')
    for link in links:
        href = parse.urljoin('https://zz.zu.fang.com/', link['href'])
        hrefs.append(href)

# Parse a detail page
def parsePage(url):
    res = session.get(url)
    if res.status_code == 200:
        res.encoding = res.apparent_encoding
        soup = BeautifulSoup(res.text, 'lxml')
        try:
            title = soup.select('div .title')[0].text.strip().replace(' ', '')
            price = soup.select('div .trl-item')[0].text.strip()
            block = soup.select('.rcont #agantzfxq_C02_08')[0].text.strip()
            building = soup.select('.rcont #agantzfxq_C02_07')[0].text.strip()
            try:
                address = soup.select('.trl-item2 .rcont')[2].text.strip()
            except IndexError:
                address = soup.select('.trl-item2 .rcont')[1].text.strip()
            detail1 = soup.select('.clearfix')[4].text.strip().replace('\n\n\n', ',').replace('\n', '')
            detail2 = soup.select('.clearfix')[5].text.strip().replace('\n\n\n', ',').replace('\n', '')
            detail = detail1 + detail2
            name = soup.select('.zf_jjname')[0].text.strip()
            buserid = re.search(r"buserid: '(\d+)'", res.text).group(1)
            phone = getPhone(buserid)
            print(title, price, block, building, address, detail, name, phone)
            house = (title, price, block, building, address, detail, name, phone)
            info.append(house)
        except:
            pass
    else:
        print(res.status_code, res.text)

# Get the agent's virtual phone number
def getPhone(buserid):
    url = 'https://zz.zu.fang.com/RentDetails/Ajax/GetAgentVirtualMobile.aspx'
    data['agentbid'] = buserid
    res = session.post(url, data=data)
    if res.status_code == 200:
        return res.text
    else:
        print(res.status_code)
        return

if __name__ == '__main__':
    start_time = time.time()
    hrefs = []
    info = []
    init_url = 'https://zz.zu.fang.com/house/'
    num = getNum(getHtml(init_url))
    for i in range(0, num):
        url = f'https://zz.zu.fang.com/house/i3{i+1}/'
        text = getHtml(url)
        getLink(text)
    print(hrefs)
    for href in hrefs:
        parsePage(href)
    print("Fetched %d records in total" % len(info))
    print("Total time: {}".format(time.time() - start_time))
    session.close()
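Per the plan above, it's worth smoke-testing the single-threaded version on a single listing page before crawling everything. A minimal sketch, reusing the helpers defined above (the page URL follows the document's i3{n} pagination pattern):

hrefs = []
info = []
text = getHtml('https://zz.zu.fang.com/house/i31/')  # listing page 1
getLink(text)
parsePage(hrefs[0])  # parse just the first detail page
print(info)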
2. Optimizing into a multi-threaded crawler

# Use a session instead of bare requests
# Parser: bs4
# Concurrency: concurrent.futures
import requests
# from lxml import etree  # alternative: parse with XPath
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
from urllib import parse
import re
import time

headers = {
    'referer': 'https://zz.zu.fang.com/',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
    'cookie': 'global_cookie=ffzvt3kztwck05jm6twso2wjw18kl67hqft; integratecover=1; city=zz; keyWord_recenthousezz=%5b%7b%22name%22%3a%22%e6%96%b0%e5%af%86%22%2c%22detailName%22%3a%22%22%2c%22url%22%3a%22%2fhouse-a014868%2f%22%2c%22sort%22%3a1%7d%2c%7b%22name%22%3a%22%e4%ba%8c%e4%b8%83%22%2c%22detailName%22%3a%22%22%2c%22url%22%3a%22%2fhouse-a014864%2f%22%2c%22sort%22%3a1%7d%2c%7b%22name%22%3a%22%e9%83%91%e4%b8%9c%e6%96%b0%e5%8c%ba%22%2c%22detailName%22%3a%22%22%2c%22url%22%3a%22%2fhouse-a0842%2f%22%2c%22sort%22%3a1%7d%5d; __utma=147393320.427795962.1613371106.1613558547.1613575774.5; __utmc=147393320; __utmz=147393320.1613575774.5.4.utmcsr=zz.fang.com|utmccn=(referral)|utmcmd=referral|utmcct=/; ASP.NET_SessionId=vhrhxr1tdatcc1xyoxwybuwv; g_sourcepage=zf_fy%5Elb_pc; Captcha=4937566532507336644D6557347143746B5A6A6B4A7A48445A422F2F6A51746C67516F31357446573052634562725162316152533247514250736F72775566574A2B33514357304B6976343D; __utmt_t0=1; __utmt_t1=1; __utmt_t2=1; __utmb=147393320.9.10.1613575774; unique_cookie=U_0l0d1ilf1t0ci2rozai9qi24k1pkl9lcmrs*4'
}
data = {'agentbid': ''}

session = requests.session()
session.headers = headers

# Fetch a page
def getHtml(url):
    res = session.get(url)
    if res.status_code == 200:
        res.encoding = res.apparent_encoding
        return res.text
    else:
        print(res.status_code)

# Get the total number of result pages
def getNum(text):
    soup = BeautifulSoup(text, 'lxml')
    txt = soup.select('.fanye .txt')[0].text
    # pull the number out of the "共**頁" ("** pages in total") text
    num = int(re.search(r'\d+', txt).group(0))
    return num

# Collect the detail-page links (now takes a URL so it can run in a worker thread)
def getLink(url):
    text = getHtml(url)
    soup = BeautifulSoup(text, 'lxml')
    links = soup.select('.title a')
    for link in links:
        href = parse.urljoin('https://zz.zu.fang.com/', link['href'])
        hrefs.append(href)

# Parse a detail page
def parsePage(url):
    res = session.get(url)
    if res.status_code == 200:
        res.encoding = res.apparent_encoding
        soup = BeautifulSoup(res.text, 'lxml')
        try:
            title = soup.select('div .title')[0].text.strip().replace(' ', '')
            price = soup.select('div .trl-item')[0].text.strip()
            block = soup.select('.rcont #agantzfxq_C02_08')[0].text.strip()
            building = soup.select('.rcont #agantzfxq_C02_07')[0].text.strip()
            try:
                address = soup.select('.trl-item2 .rcont')[2].text.strip()
            except IndexError:
                address = soup.select('.trl-item2 .rcont')[1].text.strip()
            detail1 = soup.select('.clearfix')[4].text.strip().replace('\n\n\n', ',').replace('\n', '')
            detail2 = soup.select('.clearfix')[5].text.strip().replace('\n\n\n', ',').replace('\n', '')
            detail = detail1 + detail2
            name = soup.select('.zf_jjname')[0].text.strip()
            buserid = re.search(r"buserid: '(\d+)'", res.text).group(1)
            phone = getPhone(buserid)
            print(title, price, block, building, address, detail, name, phone)
            house = (title, price, block, building, address, detail, name, phone)
            info.append(house)
        except:
            pass
    else:
        print(res.status_code, res.text)

# Get the agent's virtual phone number
def getPhone(buserid):
    url = 'https://zz.zu.fang.com/RentDetails/Ajax/GetAgentVirtualMobile.aspx'
    data['agentbid'] = buserid
    res = session.post(url, data=data)
    if res.status_code == 200:
        return res.text
    else:
        print(res.status_code)
        return

if __name__ == '__main__':
    start_time = time.time()
    hrefs = []
    info = []
    init_url = 'https://zz.zu.fang.com/house/'
    num = getNum(getHtml(init_url))
    with ThreadPoolExecutor(max_workers=5) as t:
        for i in range(0, num):
            url = f'https://zz.zu.fang.com/house/i3{i+1}/'
            t.submit(getLink, url)
    print("Collected %d links in total" % len(hrefs))
    print(hrefs)
    with ThreadPoolExecutor(max_workers=30) as t:
        for href in hrefs:
            t.submit(parsePage, href)
    print("Fetched %d records in total" % len(info))
    print("Time taken: {}".format(time.time() - start_time))
    session.close()
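Two details make this version work: the worker threads all append to the shared hrefs and info lists, which is safe in CPython because list.append is atomic under the GIL; and exiting each with ThreadPoolExecutor(...) block implicitly waits for every submitted task to finish, so the print statements after the blocks see complete results. As an equivalent variation (not in the original code), executor.map could replace the submit loop:

with ThreadPoolExecutor(max_workers=30) as t:
    # map blocks until every page has been parsed; parsePage returns None,
    # so the list() is only there to drain the result iterator
    list(t.map(parsePage, hrefs))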
3. Further optimization with asyncio

# Use a session instead of bare requests
# Parser: bs4
# Concurrency: concurrent.futures driven through asyncio
import requests
# from lxml import etree  # alternative: parse with XPath
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
from urllib import parse
import re
import time
import asyncio

headers = {
    'referer': 'https://zz.zu.fang.com/',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
    'cookie': 'global_cookie=ffzvt3kztwck05jm6twso2wjw18kl67hqft; integratecover=1; city=zz; keyWord_recenthousezz=%5b%7b%22name%22%3a%22%e6%96%b0%e5%af%86%22%2c%22detailName%22%3a%22%22%2c%22url%22%3a%22%2fhouse-a014868%2f%22%2c%22sort%22%3a1%7d%2c%7b%22name%22%3a%22%e4%ba%8c%e4%b8%83%22%2c%22detailName%22%3a%22%22%2c%22url%22%3a%22%2fhouse-a014864%2f%22%2c%22sort%22%3a1%7d%2c%7b%22name%22%3a%22%e9%83%91%e4%b8%9c%e6%96%b0%e5%8c%ba%22%2c%22detailName%22%3a%22%22%2c%22url%22%3a%22%2fhouse-a0842%2f%22%2c%22sort%22%3a1%7d%5d; __utma=147393320.427795962.1613371106.1613558547.1613575774.5; __utmc=147393320; __utmz=147393320.1613575774.5.4.utmcsr=zz.fang.com|utmccn=(referral)|utmcmd=referral|utmcct=/; ASP.NET_SessionId=vhrhxr1tdatcc1xyoxwybuwv; g_sourcepage=zf_fy%5Elb_pc; Captcha=4937566532507336644D6557347143746B5A6A6B4A7A48445A422F2F6A51746C67516F31357446573052634562725162316152533247514250736F72775566574A2B33514357304B6976343D; __utmt_t0=1; __utmt_t1=1; __utmt_t2=1; __utmb=147393320.9.10.1613575774; unique_cookie=U_0l0d1ilf1t0ci2rozai9qi24k1pkl9lcmrs*4'
}
data = {'agentbid': ''}

session = requests.session()
session.headers = headers

# Fetch a page
def getHtml(url):
    res = session.get(url)
    if res.status_code == 200:
        res.encoding = res.apparent_encoding
        return res.text
    else:
        print(res.status_code)

# Get the total number of result pages
def getNum(text):
    soup = BeautifulSoup(text, 'lxml')
    txt = soup.select('.fanye .txt')[0].text
    # pull the number out of the "共**頁" ("** pages in total") text
    num = int(re.search(r'\d+', txt).group(0))
    return num

# Collect the detail-page links
def getLink(url):
    text = getHtml(url)
    soup = BeautifulSoup(text, 'lxml')
    links = soup.select('.title a')
    for link in links:
        href = parse.urljoin('https://zz.zu.fang.com/', link['href'])
        hrefs.append(href)

# Parse a detail page
def parsePage(url):
    res = session.get(url)
    if res.status_code == 200:
        res.encoding = res.apparent_encoding
        soup = BeautifulSoup(res.text, 'lxml')
        try:
            title = soup.select('div .title')[0].text.strip().replace(' ', '')
            price = soup.select('div .trl-item')[0].text.strip()
            block = soup.select('.rcont #agantzfxq_C02_08')[0].text.strip()
            building = soup.select('.rcont #agantzfxq_C02_07')[0].text.strip()
            try:
                address = soup.select('.trl-item2 .rcont')[2].text.strip()
            except IndexError:
                address = soup.select('.trl-item2 .rcont')[1].text.strip()
            detail1 = soup.select('.clearfix')[4].text.strip().replace('\n\n\n', ',').replace('\n', '')
            detail2 = soup.select('.clearfix')[5].text.strip().replace('\n\n\n', ',').replace('\n', '')
            detail = detail1 + detail2
            name = soup.select('.zf_jjname')[0].text.strip()
            buserid = re.search(r"buserid: '(\d+)'", res.text).group(1)
            phone = getPhone(buserid)
            print(title, price, block, building, address, detail, name, phone)
            house = (title, price, block, building, address, detail, name, phone)
            info.append(house)
        except:
            pass
    else:
        print(res.status_code, res.text)

# Get the agent's virtual phone number
def getPhone(buserid):
    url = 'https://zz.zu.fang.com/RentDetails/Ajax/GetAgentVirtualMobile.aspx'
    data['agentbid'] = buserid
    res = session.post(url, data=data)
    if res.status_code == 200:
        return res.text
    else:
        print(res.status_code)
        return

# Thread pool that collects the detail links
async def Pool1(num):
    loop = asyncio.get_event_loop()
    task = []
    with ThreadPoolExecutor(max_workers=5) as t:
        for i in range(0, num):
            url = f'https://zz.zu.fang.com/house/i3{i+1}/'
            task.append(loop.run_in_executor(t, getLink, url))
        await asyncio.wait(task)  # explicitly wait for all scheduled futures

# Thread pool that parses the detail pages
async def Pool2(hrefs):
    loop = asyncio.get_event_loop()
    task = []
    with ThreadPoolExecutor(max_workers=30) as t:
        for href in hrefs:
            task.append(loop.run_in_executor(t, parsePage, href))
        await asyncio.wait(task)  # explicitly wait for all scheduled futures

if __name__ == '__main__':
    start_time = time.time()
    hrefs = []
    info = []
    init_url = 'https://zz.zu.fang.com/house/'
    num = getNum(getHtml(init_url))
    loop = asyncio.get_event_loop()
    loop.run_until_complete(Pool1(num))
    print("Collected %d links in total" % len(hrefs))
    print(hrefs)
    loop.run_until_complete(Pool2(hrefs))
    loop.close()
    print("Fetched %d records in total" % len(info))
    print("Time taken: {}".format(time.time() - start_time))
    session.close()
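On Python 3.10 and later, calling asyncio.get_event_loop() outside a running loop is deprecated. A sketch of the same flow driven by asyncio.run() instead (same Pool1/Pool2 as above; inside the running coroutine their get_event_loop() calls simply return the running loop):

async def main():
    await Pool1(num)
    print("Collected %d links in total" % len(hrefs))
    await Pool2(hrefs)

asyncio.run(main())  # replaces the manual run_until_complete/close pair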
4. Saving to a MySQL database

(1) Creating the table
from sqlalchemy import create_engine
from sqlalchemy import String, Integer, Column, Text
from sqlalchemy.orm import sessionmaker
from sqlalchemy.orm import scoped_session  # avoids thread-safety issues in the multi-threaded crawler
from sqlalchemy.ext.declarative import declarative_base

BASE = declarative_base()  # instantiate the declarative base
engine = create_engine(
    "mysql+pymysql://root:root@127.0.0.1:3306/pytest?charset=utf8",
    max_overflow=300,  # extra connections allowed beyond the pool size
    pool_size=100,     # connection pool size
    echo=False,        # no debug output
)

class House(BASE):
    __tablename__ = 'house'
    id = Column(Integer, primary_key=True, autoincrement=True)
    title = Column(String(200))
    price = Column(String(200))
    block = Column(String(200))
    building = Column(String(200))
    address = Column(String(200))
    detail = Column(Text())
    name = Column(String(20))
    phone = Column(String(20))

BASE.metadata.create_all(engine)
Session = sessionmaker(engine)
sess = scoped_session(Session)
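One caveat: BASE.metadata.create_all(engine) creates the house table, but the pytest database named in the connection string must already exist on the MySQL server. Once rows have been written (next subsection), a quick hypothetical read-back check through the same scoped session could look like:

# print a few stored rows to verify the inserts worked
for h in sess.query(House).limit(3):
    print(h.title, h.price, h.phone)
sess.remove()  # release the thread-local session when done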
(2) Writing the data into the database

# Use a session instead of bare requests
# Parser: bs4
# Concurrency: concurrent.futures driven through asyncio
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
from urllib import parse
from mysqldb import sess, House  # the table-creation module from (1), saved as mysqldb.py
import re
import time
import asyncio

headers = {
    'referer': 'https://zz.zu.fang.com/',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
    'cookie': 'global_cookie=ffzvt3kztwck05jm6twso2wjw18kl67hqft; integratecover=1; city=zz; __utmc=147393320; ASP.NET_SessionId=vhrhxr1tdatcc1xyoxwybuwv; __utma=147393320.427795962.1613371106.1613575774.1613580597.6; __utmz=147393320.1613580597.6.5.utmcsr=zz.fang.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utmt_t0=1; __utmt_t1=1; __utmt_t2=1; Rent_StatLog=c158b2a7-4622-45a9-9e69-dcf6f42cf577; keyWord_recenthousezz=%5b%7b%22name%22%3a%22%e4%ba%8c%e4%b8%83%22%2c%22detailName%22%3a%22%22%2c%22url%22%3a%22%2fhouse-a014864%2f%22%2c%22sort%22%3a1%7d%2c%7b%22name%22%3a%22%e9%83%91%e4%b8%9c%e6%96%b0%e5%8c%ba%22%2c%22detailName%22%3a%22%22%2c%22url%22%3a%22%2fhouse-a0842%2f%22%2c%22sort%22%3a1%7d%2c%7b%22name%22%3a%22%e7%bb%8f%e5%bc%80%22%2c%22detailName%22%3a%22%22%2c%22url%22%3a%22%2fhouse-a014871%2f%22%2c%22sort%22%3a1%7d%5d; g_sourcepage=zf_fy%5Elb_pc; Captcha=6B65716A41454739794D666864397178613772676C75447A4E746C657144775A347A6D42554F446532357649643062344F6976756E563450554E59594B7833712B413579506C4B684958343D; unique_cookie=U_0l0d1ilf1t0ci2rozai9qi24k1pkl9lcmrs*14; __utmb=147393320.21.10.1613580597'
}
data = {'agentbid': ''}

session = requests.session()
session.headers = headers

# Fetch a page
def getHtml(url):
    res = session.get(url)
    if res.status_code == 200:
        res.encoding = res.apparent_encoding
        return res.text
    else:
        print(res.status_code)

# Get the total number of result pages
def getNum(text):
    soup = BeautifulSoup(text, 'lxml')
    txt = soup.select('.fanye .txt')[0].text
    # pull the number out of the "共**頁" ("** pages in total") text
    num = int(re.search(r'\d+', txt).group(0))
    return num

# Collect the detail-page links
def getLink(url):
    text = getHtml(url)
    soup = BeautifulSoup(text, 'lxml')
    links = soup.select('.title a')
    for link in links:
        href = parse.urljoin('https://zz.zu.fang.com/', link['href'])
        hrefs.append(href)

# Parse a detail page and write the record to MySQL
def parsePage(url):
    res = session.get(url)
    if res.status_code == 200:
        res.encoding = res.apparent_encoding
        soup = BeautifulSoup(res.text, 'lxml')
        try:
            title = soup.select('div .title')[0].text.strip().replace(' ', '')
            price = soup.select('div .trl-item')[0].text.strip()
            block = soup.select('.rcont #agantzfxq_C02_08')[0].text.strip()
            building = soup.select('.rcont #agantzfxq_C02_07')[0].text.strip()
            try:
                address = soup.select('.trl-item2 .rcont')[2].text.strip()
            except IndexError:
                address = soup.select('.trl-item2 .rcont')[1].text.strip()
            detail1 = soup.select('.clearfix')[4].text.strip().replace('\n\n\n', ',').replace('\n', '')
            detail2 = soup.select('.clearfix')[5].text.strip().replace('\n\n\n', ',').replace('\n', '')
            detail = detail1 + detail2
            name = soup.select('.zf_jjname')[0].text.strip()
            buserid = re.search(r"buserid: '(\d+)'", res.text).group(1)
            phone = getPhone(buserid)
            print(title, price, block, building, address, detail, name, phone)
            house = (title, price, block, building, address, detail, name, phone)
            info.append(house)
            try:
                house_data = House(
                    title=title,
                    price=price,
                    block=block,
                    building=building,
                    address=address,
                    detail=detail,
                    name=name,
                    phone=phone)
                sess.add(house_data)
                sess.commit()
            except Exception as e:
                print(e)         # print the error
                sess.rollback()  # roll back the failed transaction
        except:
            pass
    else:
        print(res.status_code, res.text)

# Get the agent's virtual phone number
def getPhone(buserid):
    url = 'https://zz.zu.fang.com/RentDetails/Ajax/GetAgentVirtualMobile.aspx'
    data['agentbid'] = buserid
    res = session.post(url, data=data)
    if res.status_code == 200:
        return res.text
    else:
        print(res.status_code)
        return

# Thread pool that collects the detail links
async def Pool1(num):
    loop = asyncio.get_event_loop()
    task = []
    with ThreadPoolExecutor(max_workers=5) as t:
        for i in range(0, num):
            url = f'https://zz.zu.fang.com/house/i3{i+1}/'
            task.append(loop.run_in_executor(t, getLink, url))
        await asyncio.wait(task)  # explicitly wait for all scheduled futures

# Thread pool that parses the detail pages
async def Pool2(hrefs):
    loop = asyncio.get_event_loop()
    task = []
    with ThreadPoolExecutor(max_workers=30) as t:
        for href in hrefs:
            task.append(loop.run_in_executor(t, parsePage, href))
        await asyncio.wait(task)  # explicitly wait for all scheduled futures

if __name__ == '__main__':
    start_time = time.time()
    hrefs = []
    info = []
    init_url = 'https://zz.zu.fang.com/house/'
    num = getNum(getHtml(init_url))
    loop = asyncio.get_event_loop()
    loop.run_until_complete(Pool1(num))
    print("Collected %d links in total" % len(hrefs))
    print(hrefs)
    loop.run_until_complete(Pool2(hrefs))
    loop.close()
    print("Fetched %d records in total" % len(info))
    print("Time taken: {}".format(time.time() - start_time))
    session.close()
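A note on why the per-row add/commit/rollback in parsePage is safe with 30 parser threads: sess is a scoped_session, so each worker thread transparently gets its own Session object and never shares a transaction with another thread — exactly the thread-safety concern flagged in the import comment of subsection (1).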
5. Final result

[Screenshot of the scraped output omitted; phone numbers were masked in the original.]
Summary
That's the complete walkthrough of scraping Fang.com rental listings with Python; hopefully it helps you solve the problems you run into.