# 正则中国邮政
#-*-coding:utf-8-*-
import re,json,requests
# Index page listing every province on ip138's postal-code section.
url = "http://www.ip138.com/post/"

# Browser-like User-Agent so the site does not reject the request as a bot.
# Only needed by the (commented-out) download steps.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.5558.400 QQBrowser/10.1.1695.400',
}

# One-time download of the index page (decoded from GBK and cached as UTF-8).
# Kept for reference; the script now works from the cached copy read below.
# response=requests.get(url=url,headers=headers).content.decode("gbk")
# with open("中國郵編首頁.html",'w',encoding='utf-8')as fq:
#     fq.write(response)
with open("中國郵編首頁.html", 'r', encoding='utf-8') as fq:
    response = fq.read()

# Two capture groups, so findall() yields a list of (url_slug, link_text)
# tuples -- one per province link on the index page.
p = re.compile(r'<a href="/(.*?)/" target="_blank">(.*?)</a></td>')
province = p.findall(response)
print(province)
print(len(province))

# dictall: province name -> mapping filled in by the province loop.
dictall = {}
# m: running count of (name, postal code) entries stored.
m = 0
# Patterns are compiled once here instead of on every loop iteration.
# One area chunk: everything from a white table row up to the empty
# 6-column separator row (splits a province page into its areas).
area_block_re = re.compile(r'<tr bgcolor="#ffffff">(.*?)<tr><td colspan="6"></td></tr>', re.S)
# Header cells of a chunk: bold, linked area name followed by its linked code.
area_header_re = re.compile(r'<td><a href=".*?/"><b>(.*?)</a></b></td><td><a href="/.*?/">(.*?)</a></td>', re.S)
# A (name, code) cell pair followed by a third linked cell that is not captured.
entry_re = re.compile(r'<td>(.*?)</td><td><a href="/.*?/">(.*?)</a></td><td><a href="/.*?/">.*?</a></td>', re.S)

# Build dictall[province_name] for every (url_slug, province_name) tuple.
# Resulting shape per province:
#   {area: code}                        for pages without separator rows and
#                                       for areas without sub-entries
#   {area: {area: code, county: code}}  for areas that list sub-entries
for i in province:
    print(i)
    dictcity = {}

    # One-time download of each province page (decoded from GBK and cached as
    # UTF-8). Kept for reference; the loop reads the cached copy instead.
    # url = f"http://www.ip138.com/{i[0]}"
    # response=requests.get(url=url,headers=headers).content.decode("gbk")
    # with open(f"各省郵編/{i[1]}.html",'w',encoding='utf-8')as fq:
    #     fq.write(response)
    with open(f"各省郵編/{i[1]}.html", 'r', encoding='utf-8') as fq:
        response = fq.read()

    cities = area_block_re.findall(response)
    if not cities:
        # No separator rows on this page (per the original note: the
        # municipalities and Hong Kong/Macao) -- store a flat mapping.
        for bigcity in entry_re.findall(response):
            dictcity[bigcity[0]] = bigcity[1]
            m += 1
        dictall[i[1]] = dictcity
        continue

    print('cities', cities)
    for city in cities:
        print(city)
        area_matches = area_header_re.findall(city)
        if not area_matches:
            # A chunk without a recognisable header row would otherwise raise
            # IndexError and abort the whole run; skip it and say so.
            print('skipped chunk without area header in', i[1])
            continue
        area = area_matches[0]
        # The first entry match is dropped; the area itself is taken from the
        # header pattern above.
        xians = entry_re.findall(city)[1:]
        m += 1  # counts the area's own entry
        if not xians:
            dictcity[area[0]] = area[1]
        else:
            dictxian = {area[0]: area[1]}
            for xian in xians:
                dictxian[xian[0]] = xian[1]
                m += 1
            dictcity[area[0]] = dictxian
        print(area, xians)
    dictall[i[1]] = dictcity
# Write the collected mapping as UTF-8 JSON. ensure_ascii=False keeps the
# Chinese names as-is instead of \uXXXX escapes. The context manager closes
# (and therefore flushes) the file; json.dump() itself returns None, so its
# result is not kept.
with open('中國郵編正則7.json', 'w', encoding="utf-8") as out_file:
    json.dump(dictall, out_file, ensure_ascii=False)

# Total number of (name, postal code) entries stored.
print(m)
# 創作挑戰賽新人創作獎勵來咯,堅持創作打卡瓜分現金大獎
# 總結
# - 上一篇: mongoDB如何将数据导成csv文件?
# - 下一篇: pandas常见错误类型TypeErro