python 从excel中抓取数据_使用Python抓取美团数据存于Excel中
0.程序是針對美團中的美食部分數據按好評排序采集。
要抓取保存的數據為:
商家名、類型、地理位置、評論人數、均價、最低價格
1.首先編寫網頁數據采集函數,使用urllib.request采集網頁源碼,具體實現如下
def getHtml(url, timeout=30):
    """Download *url* and return its body decoded as UTF-8 text.

    The request is sent with a desktop-browser ``User-Agent`` header.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds passed to the opener as the blocking-operation
            timeout, so a stalled server cannot hang the scraper forever.

    Returns:
        The response body as ``str``.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    headers = ('User-Agent',
               'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11')
    opener = urllib.request.build_opener()
    opener.addheaders = [headers]
    # The context manager closes the connection even if read() raises.
    with opener.open(url, timeout=timeout) as response:
        htmldata = response.read()
    return htmldata.decode('utf-8')
2.根據網頁源碼解析獲取已上線城市的url
class GetCityUrl(HTMLParser):
    """Collect city links from the page fed to the parser.

    Every ``<a>`` tag carrying the attribute ``gaevent="changecity/build"``
    is treated as a city link. Its ``href`` is stored as a key whose value
    is the same address with ``/category/meishi/all/rating`` appended.
    """

    # Attribute (name, value) pair that marks an <a> tag as a city link.
    part = ('gaevent', 'changecity/build')

    def __init__(self, **kwargs):
        """Create a parser with its own empty result mapping.

        Keyword arguments are forwarded to ``HTMLParser``.
        """
        super().__init__(**kwargs)
        # Per-instance dict: a class-level dict would be shared (and keep
        # growing) across every GetCityUrl instance.
        self.urldic = {}

    def handle_starttag(self, tag, attrs):
        """Record the href of each city link encountered."""
        if tag != 'a' or self.part not in attrs:
            return
        for att, value in attrs:
            # value is None for a bare ``href`` attribute; skip it rather
            # than fail on None + str.
            if att == 'href' and value:
                self.urldic[value] = value + '/category/meishi/all/rating'

    def getUrl(self):
        """Return the mapping of city href -> listing URL collected so far."""
        return self.urldic
3.獲取分頁url
class GetPages(HTMLParser):
    """Collect pagination links from one listing page.

    Every ``<a>`` whose ``href`` contains the substring ``page`` is turned
    into an absolute URL by prefixing the base set with ``setInitUrl`` and
    is stored once, in order of first appearance.
    """

    # Not used by this class; retained so existing attribute access works.
    flg = 0

    def __init__(self, **kwargs):
        """Create a parser with its own empty page list.

        Keyword arguments are forwarded to ``HTMLParser``.
        """
        super().__init__(**kwargs)
        # Per-instance state: with a class-level list, a fresh GetPages()
        # would still contain the pages collected by earlier instances.
        self.pagelist = []
        self.temphref = ''
        self.initurl = ''

    def setInitUrl(self, url):
        """Set the prefix that is prepended to every collected href."""
        self.initurl = url

    def handle_starttag(self, tag, attrs):
        """Store the absolute URL of each not-yet-seen pagination link."""
        if tag != 'a':
            return
        for attr, value in attrs:
            # value is None for a bare ``href`` attribute; skip it rather
            # than fail on ``'page' in None``.
            if attr == 'href' and value and 'page' in value:
                self.temphref = self.initurl + value
                if self.temphref not in self.pagelist:
                    self.pagelist.append(self.temphref)

    def getList(self):
        """Return the list of pagination URLs collected so far."""
        return self.pagelist
4.解析網頁源碼 獲取有效信息
class MyHTMLParser(HTMLParser):
    """Extract one tab-separated record per merchant tile.

    A record starts at a ``<div>`` whose ``class`` contains
    ``poi-tile-nodeal``. Text nodes seen afterwards are accumulated into
    ``tempstr`` as tab-separated fields, and the record is evaluated when
    the sixth ``</div>`` since the tile opened is reached. Records that
    contain a price sign and end up with exactly six fields are appended to
    the result list.
    """

    # Text of the "people rated" label that follows the review count.
    # NOTE(review): the Simplified spelling is accepted as well on the
    # assumption that the live page uses it -- confirm against real markup.
    _REVIEW_LABELS = ('人評價', '人评价')

    def __init__(self, results=None, **kwargs):
        """Create a parser.

        Args:
            results: List that finished records are appended to. When
                omitted, the module-level ``arraystr`` list is used.
            **kwargs: Forwarded to ``HTMLParser``.
        """
        super().__init__(**kwargs)
        self.results = arraystr if results is None else results
        self.tempstr = ''   # record being assembled
        self.divsum = 0     # </div> tags seen since the tile opened

    def handle_starttag(self, tag, attrs):
        """Start a fresh record when a merchant tile opens."""
        if tag != 'div':
            return
        for attr, value in attrs:
            # value is None for a bare ``class`` attribute.
            if attr == 'class' and value and 'poi-tile-nodeal' in value:
                self.tempstr = ''
                self.divsum = 0

    def handle_data(self, data):
        """Append one text node to the record being assembled."""
        if data.isspace():
            return
        if data == '¥':
            # First price sign of the record with nothing priced before it:
            # emit a placeholder field so the columns stay aligned.
            if '¥' not in self.tempstr:
                self.tempstr += '無' + '\t'
            self.tempstr += '¥'
        elif data in self._REVIEW_LABELS:
            # Glue the label onto the preceding field (drop its tab).
            self.tempstr = self.tempstr[:-1] + data + '\t'
        elif data == '人均 ':
            # No tab: the next text node continues this field.
            self.tempstr += '人均'
        elif data.startswith('起'):
            # Glue the suffix onto the preceding field (drop its tab).
            self.tempstr = self.tempstr[:-1] + '起'
        else:
            self.tempstr += data + '\t'

    def handle_endtag(self, tag):
        """Finish the record on the sixth ``</div>`` after the tile opened."""
        if tag != 'div':
            return
        self.divsum += 1
        if self.divsum != 6:
            return
        if '¥' in self.tempstr:
            fields = self.tempstr.split('\t')
            if len(fields) == 5:
                # One field short: insert a placeholder as the third field
                # (the location column). Joining without a trailing tab
                # keeps the count at six; a trailing tab would add a
                # seventh empty field and the record would be discarded.
                fields.insert(2, '無位置信息')
                self.tempstr = '\t'.join(fields)
            if len(fields) == 6:
                self.results.append(self.tempstr)
        self.divsum = 0
        self.tempstr = ''
5.將信息存放于Excel中
def SaveExcel(listdata, filename='test.xls'):
    """Write the collected records to a workbook with one sheet.

    Row 0 receives the column headers. Each following row receives one
    entry of *listdata*, split on tab characters into consecutive cells.

    Args:
        listdata: Iterable of tab-separated strings, one per row.
        filename: Path the workbook is saved to.
    """
    head = ['商家名', '類型', '地理位置', '評論人數', '均價', '最低價格']
    wbk = xlwt.Workbook()
    sheet1 = wbk.add_sheet("sheet1")
    for col, title in enumerate(head):
        sheet1.write(0, col, title)
    # Data rows start at 1, directly below the header row.
    for row, record in enumerate(listdata, start=1):
        for col, cell in enumerate(record.split('\t')):
            sheet1.write(row, col, cell)
    wbk.save(filename)
以下是Excel中的數據:
附錄完整代碼:
#encoding:utf-8
'''
Created on 2016年7月22日
python version 3.5
@author: baalhuo
'''
from html.parser import HTMLParser
import re
import urllib.request
import xlwt
import time
# Collected merchant rows (tab-separated strings); filled by the parser
# and later written out to Excel.
arraystr = []
# Parse the page source and extract the merchant information
class MyHTMLParser(HTMLParser):
    """Extract one tab-separated record per merchant tile.

    A record starts at a ``<div>`` whose ``class`` contains
    ``poi-tile-nodeal``. Text nodes seen afterwards are accumulated into
    ``tempstr`` as tab-separated fields, and the record is evaluated when
    the sixth ``</div>`` since the tile opened is reached. Records that
    contain a price sign and end up with exactly six fields are appended to
    the result list.
    """

    # Text of the "people rated" label that follows the review count.
    # NOTE(review): the Simplified spelling is accepted as well on the
    # assumption that the live page uses it -- confirm against real markup.
    _REVIEW_LABELS = ('人評價', '人评价')

    def __init__(self, results=None, **kwargs):
        """Create a parser.

        Args:
            results: List that finished records are appended to. When
                omitted, the module-level ``arraystr`` list is used.
            **kwargs: Forwarded to ``HTMLParser``.
        """
        super().__init__(**kwargs)
        self.results = arraystr if results is None else results
        self.tempstr = ''   # record being assembled
        self.divsum = 0     # </div> tags seen since the tile opened

    def handle_starttag(self, tag, attrs):
        """Start a fresh record when a merchant tile opens."""
        if tag != 'div':
            return
        for attr, value in attrs:
            # value is None for a bare ``class`` attribute.
            if attr == 'class' and value and 'poi-tile-nodeal' in value:
                self.tempstr = ''
                self.divsum = 0

    def handle_data(self, data):
        """Append one text node to the record being assembled."""
        if data.isspace():
            return
        if data == '¥':
            # First price sign of the record with nothing priced before it:
            # emit a placeholder field so the columns stay aligned.
            if '¥' not in self.tempstr:
                self.tempstr += '無' + '\t'
            self.tempstr += '¥'
        elif data in self._REVIEW_LABELS:
            # Glue the label onto the preceding field (drop its tab).
            self.tempstr = self.tempstr[:-1] + data + '\t'
        elif data == '人均 ':
            # No tab: the next text node continues this field.
            self.tempstr += '人均'
        elif data.startswith('起'):
            # Glue the suffix onto the preceding field (drop its tab).
            self.tempstr = self.tempstr[:-1] + '起'
        else:
            self.tempstr += data + '\t'

    def handle_endtag(self, tag):
        """Finish the record on the sixth ``</div>`` after the tile opened."""
        if tag != 'div':
            return
        self.divsum += 1
        if self.divsum != 6:
            return
        if '¥' in self.tempstr:
            fields = self.tempstr.split('\t')
            if len(fields) == 5:
                # One field short: insert a placeholder as the third field
                # (the location column). Joining without a trailing tab
                # keeps the count at six; a trailing tab would add a
                # seventh empty field and the record would be discarded.
                fields.insert(2, '無位置信息')
                self.tempstr = '\t'.join(fields)
            if len(fields) == 6:
                self.results.append(self.tempstr)
        self.divsum = 0
        self.tempstr = ''
# Collect the URLs of the cities Meituan is live in
# (844 cities/regions according to the original author's note)
class GetCityUrl(HTMLParser):
    """Collect city links from the page fed to the parser.

    Every ``<a>`` tag carrying the attribute ``gaevent="changecity/build"``
    is treated as a city link. Its ``href`` is stored as a key whose value
    is the same address with ``/category/meishi/all/rating`` appended.
    """

    # Attribute (name, value) pair that marks an <a> tag as a city link.
    part = ('gaevent', 'changecity/build')

    def __init__(self, **kwargs):
        """Create a parser with its own empty result mapping.

        Keyword arguments are forwarded to ``HTMLParser``.
        """
        super().__init__(**kwargs)
        # Per-instance dict: a class-level dict would be shared (and keep
        # growing) across every GetCityUrl instance.
        self.urldic = {}

    def handle_starttag(self, tag, attrs):
        """Record the href of each city link encountered."""
        if tag != 'a' or self.part not in attrs:
            return
        for att, value in attrs:
            # value is None for a bare ``href`` attribute; skip it rather
            # than fail on None + str.
            if att == 'href' and value:
                self.urldic[value] = value + '/category/meishi/all/rating'

    def getUrl(self):
        """Return the mapping of city href -> listing URL collected so far."""
        return self.urldic
# Collect the pagination URLs
class GetPages(HTMLParser):
    """Collect pagination links from one listing page.

    Every ``<a>`` whose ``href`` contains the substring ``page`` is turned
    into an absolute URL by prefixing the base set with ``setInitUrl`` and
    is stored once, in order of first appearance.
    """

    # Not used by this class; retained so existing attribute access works.
    flg = 0

    def __init__(self, **kwargs):
        """Create a parser with its own empty page list.

        Keyword arguments are forwarded to ``HTMLParser``.
        """
        super().__init__(**kwargs)
        # Per-instance state: with a class-level list, a fresh GetPages()
        # would still contain the pages collected by earlier instances.
        self.pagelist = []
        self.temphref = ''
        self.initurl = ''

    def setInitUrl(self, url):
        """Set the prefix that is prepended to every collected href."""
        self.initurl = url

    def handle_starttag(self, tag, attrs):
        """Store the absolute URL of each not-yet-seen pagination link."""
        if tag != 'a':
            return
        for attr, value in attrs:
            # value is None for a bare ``href`` attribute; skip it rather
            # than fail on ``'page' in None``.
            if attr == 'href' and value and 'page' in value:
                self.temphref = self.initurl + value
                if self.temphref not in self.pagelist:
                    self.pagelist.append(self.temphref)

    def getList(self):
        """Return the list of pagination URLs collected so far."""
        return self.pagelist
# Fetch the source of a web page
def getHtml(url, timeout=30):
    """Download *url* and return its body decoded as UTF-8 text.

    The request is sent with a desktop-browser ``User-Agent`` header.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds passed to the opener as the blocking-operation
            timeout, so a stalled server cannot hang the scraper forever.

    Returns:
        The response body as ``str``.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    headers = ('User-Agent',
               'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11')
    opener = urllib.request.build_opener()
    opener.addheaders = [headers]
    # The context manager closes the connection even if read() raises.
    with opener.open(url, timeout=timeout) as response:
        htmldata = response.read()
    return htmldata.decode('utf-8')
# Save the collected information to an Excel workbook
def SaveExcel(listdata, filename='e:/test3.xls'):
    """Write the collected records to a workbook with one sheet.

    Row 0 receives the column headers. Each following row receives one
    entry of *listdata*, split on tab characters into consecutive cells.

    Args:
        listdata: Iterable of tab-separated strings, one per row.
        filename: Path the workbook is saved to.
    """
    head = ['商家名', '類型', '地理位置', '評論人數', '均價', '最低價格']
    wbk = xlwt.Workbook()
    sheet1 = wbk.add_sheet("sheet1")
    for col, title in enumerate(head):
        sheet1.write(0, col, title)
    # Data rows start at 1, directly below the header row.
    for row, record in enumerate(listdata, start=1):
        for col, cell in enumerate(record.split('\t')):
            sheet1.write(row, col, cell)
    wbk.save(filename)
# Driver: collect the city list, scrape the first few cities, save to Excel.

# Number of cities scraped in one run (the loop stops after this many).
MAX_CITIES = 4

par = GetCityUrl()
par.feed(getHtml('http://www.meituan.com/index/changecity/initiative'))
urldic = par.getUrl()
par = MyHTMLParser()
# Print the start time; strftime uses the current local time by default.
print(time.strftime("%Y-%m-%d %H:%M:%S"))
for ffwait, url in enumerate(urldic, start=1):
    # urldic maps each city's base URL to its listing URL.
    data = getHtml(urldic.get(url))
    getpage = GetPages()
    getpage.setInitUrl(url)
    getpage.feed(data)
    pageurllist = getpage.getList()
    # Parse the first listing page, then every pagination link found on it.
    par.feed(data)
    for urltemp in pageurllist:
        par.feed(getHtml(urltemp))
    # Marker row separating one city's records from the next.
    arraystr.append('切換地區 ')
    if ffwait == MAX_CITIES:
        break
SaveExcel(arraystr)
# Print the end time.
print(time.strftime("%Y-%m-%d %H:%M:%S"))
print('Done')
學之,以記之。
總結
以上是生活随笔為你收集整理的python 从excel中抓取数据_使用Python抓取美团数据存于Excel中的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 异步接口同步返回_同步|异步
- 下一篇: 在python中requests模块怎么