import requests
import re
import xlsxwriter
import time
# Wall-clock start; read again at the end of the script to report run time.
time_start = time.time()

# Browser-like User-Agent header passed to requests.get for every page.
agent = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'}

# Number of leading digits of the 12-digit code kept at each of the five
# levels: [2, 4, 6, 9, 12].
choose_ls = [level * 2 if level <= 3 else 3 * (level - 1) for level in range(1, 6)]

# HTML row class that identifies the table of each level, indexed by depth.
match_level = ['provincetr', 'citytr', 'countytr', 'towntr', 'villagetr']

# Entry page of the crawl.
initurl = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/index.html'

# Leaf records collected so far: '-'-joined code path -> '-'-joined name path.
total_dict = {}

# Index of the level currently being crawled (0 = province list).
depth = 0

# Pages still to visit at the current depth:
# url -> ('-'-joined parent codes, '-'-joined parent names).
each_root = {initurl: ('', '')}
# Deepest level to crawl. Values 1-5 select province, prefecture, county,
# township and village level respectively; the crawl collects every leaf node
# found within that depth plus the not-yet-expanded root nodes at that depth.
max_depth = 5

# Seconds to wait for the server before treating a request as failed; without
# a timeout a stalled connection would block the crawl forever.
_REQUEST_TIMEOUT = 30

# NOTE(review): the non-ASCII string literals in this block look mis-encoded
# (stray "(x)"-style fragments inside the words). They are reproduced
# byte-for-byte; confirm the intended text before changing them.


def _locate_table(text, level):
    """Return ``(table_html, is_special)`` for the first known table in *text*.

    The row classes are tried from ``match_level[level]`` downwards.
    ``is_special`` is True when the table found belongs to a deeper level
    than the expected one. Returns None when no known row class occurs in
    *text*. Raises IndexError when a row class occurs but the table pattern
    does not match.
    """
    for offset, row_class in enumerate(match_level[level:]):
        if row_class in text:
            table_html = re.findall(r"class='%s'>(.*?)</table" % row_class, text)[0]
            return table_html, offset > 0
    return None


def _fetch_level_table(url, level, zone_name):
    """Download *url* until a level table is found; return ``(table_html, is_special)``.

    Retries every two seconds, without limit, whenever the request fails or
    the page contains no recognisable table. *zone_name* is only used in the
    message printed for a special division.
    """
    while True:
        try:
            response = requests.get(url, headers=agent, timeout=_REQUEST_TIMEOUT)
            # Force GBK. Do not use response.apparent_encoding: it detects
            # 'gb2312', which cannot decode many of the rarer characters.
            response.encoding = 'GBK'
            # The patterns use '.', which does not match a newline, so real
            # newlines are replaced with a literal backslash-n.
            text = response.text.replace('\n', '\\n')
            found = _locate_table(text, level)
        except (requests.RequestException, IndexError):
            # Network failure or a page whose table could not be extracted:
            # handled like a busy server and retried below.
            found = None
        if found is not None:
            if found[1]:
                # Some cities (the original comment names Dongguan, Zhongshan
                # and Danzhou) have no county-level units; their page already
                # lists a deeper level.
                print('特殊區(qū)劃:%s' % zone_name)
            return found
        print('服務(wù)器繁忙')
        time.sleep(2)


def _parse_rows(table_html, level):
    """Split a level table into linked rows and leaf rows.

    Returns ``(has_tree, no_tree)``: ``has_tree`` is a list of
    ``(href, code, name)`` for rows that link to a child page, ``no_tree`` a
    list of ``(code, name)`` for rows without a link.
    """
    if level != 0:
        has_tree = re.findall(r"href='(.*?)'>(\d+?)<.*?html'>(.*?)</a></td></tr>", table_html)
    else:
        # The province page shows no codes; the code is the link's file name.
        base_tree = re.findall(r"href='(.*?)'>(.*?)<br/", table_html)
        has_tree = [(href, href.split('.html')[0], name) for href, name in base_tree]
    base_no = re.findall(r"td>(\d+?)</td><td>(.*?)</td></tr>", table_html)
    no_tree = [
        (code, re.findall(r'<td>(.+)', name)[0] if 'td' in name else name)
        for code, name in base_no
    ]
    return has_tree, no_tree


# NOTE(review): the original statements were fused onto one line, so the
# nesting below (in particular where total_count is incremented and which
# `if` each `else` belongs to) is a best-effort reconstruction - verify
# against the original script.
while depth < max_depth:
    total_count = 0
    next_root = {}
    for url in each_root:
        parent_code, parent_zone = each_root[url]
        # Below the top level every path component is separated by '-'.
        code_join = parent_code + '-' if depth != 0 else parent_code
        zone_join = parent_zone + '-' if depth != 0 else parent_zone
        # Directory of the current page; child links are relative to it.
        change_root = '/'.join(url.split('/')[:-1]) + '/'

        match_text, special_sigh = _fetch_level_table(url, depth, parent_zone)

        if special_sigh:
            # Keep the same page for the next depth so that the missing level
            # becomes an empty path component.
            next_root[url] = (code_join, zone_join)
        else:
            has_tree, no_tree = _parse_rows(match_text, depth)
            for href, code, name in has_tree:
                each_dir = change_root + href
                next_root[each_dir] = (code_join + code[:choose_ls[depth]], zone_join + name)
                if depth == 3:
                    if (total_count + 1) % 100 == 0:
                        print('已爬取%d個(gè),在路徑%s處' % (total_count + 1, zone_join + name))
                else:
                    print('在路徑%s處' % (zone_join + name))
            for code, name in no_tree:
                total_dict[code_join + code[:choose_ls[depth]]] = zone_join + name
                if depth == 4:
                    if (total_count + 1) % 800 == 0:
                        print('已爬取%d個(gè),在路徑%s處' % (total_count + 1, zone_join + name))
                else:
                    print('已獲取路徑%s' % (zone_join + name))
        total_count += 1
    depth += 1
    each_root = next_root
def decompose(each, records=None, levels=None):
    """Flatten one record into ``[code1, name1, code2, name2, ...]``.

    Args:
        each: Key of the record in *records*.
        records: Mapping to read from; defaults to the module-level
            ``total_dict``. A value is either a '-'-joined name path (the key
            is then the '-'-joined code path) or a ``(codes, names)`` tuple
            of two '-'-joined paths.
        levels: Minimum number of (code, name) pairs to return; defaults to
            the module-level ``depth``. Shorter paths are padded with ''.

    Returns:
        A flat list alternating code and name, one pair per path component.
    """
    if records is None:
        records = total_dict
    if levels is None:
        levels = depth

    entry = records[each]
    if isinstance(entry, tuple):
        codelist = entry[0].split('-')
        namelist = entry[1].split('-')
    else:
        codelist = each.split('-')
        namelist = entry.split('-')

    # Pad both lists by the same amount so every row has the same width.
    missing = levels - len(codelist)
    if missing > 0:
        codelist.extend([''] * missing)
        namelist.extend([''] * missing)

    return [cell for pair in zip(codelist, namelist) for cell in pair]
# NOTE(review): the non-ASCII string literals in this block (level names,
# header suffixes, output file names) look mis-encoded. They are reproduced
# byte-for-byte; confirm the intended text before changing them.

# Display name of each level, indexed by depth.
sort_name = ['省級(jí)','地級(jí)','縣級(jí)','鄉(xiāng)級(jí)','村級(jí)']
# One (code header, name header) pair per crawled level.
real_column = [(sort_name[each] + '代碼', sort_name[each] + '名稱(chēng)') for each in range(depth)]
# Header row: the pairs above flattened into a single list.
flat_col = [title for pair in real_column for title in pair]

# Also export the nodes that were queued for the next depth but not expanded.
total_dict.update(each_root)

if depth <= 3:
    # County level and above is small (about three thousand rows), so an
    # Excel workbook is used. The context manager closes the workbook even
    # if a write fails.
    with xlsxwriter.Workbook('五級(jí)聯(lián)動(dòng).xlsx') as wk:
        sh = wk.add_worksheet('sheet1')
        for col, title in enumerate(flat_col[:2 * depth]):
            sh.write(0, col, title)
        for totalrow, each in enumerate(total_dict, start=1):
            flatlist = decompose(each)
            for col, value in enumerate(flatlist[:2 * depth]):
                sh.write(totalrow, col, value)
else:
    # Below county level there is far more data and Excel has no advantage,
    # so the rows are written as CSV. The context manager closes the file
    # even if a write fails.
    with open('五級(jí)聯(lián)動(dòng).csv', 'w', encoding='utf-8') as book:
        book.write(','.join(flat_col) + '\n')
        for each in total_dict:
            flatten = decompose(each)
            book.write(','.join(flatten) + '\n')
# Report the total wall-clock duration of the run as minutes and seconds.
# NOTE(review): the message literal looks mis-encoded; it is kept
# byte-for-byte.
time_end = time.time()
rest_second = time_end - time_start
elapsed_minutes, elapsed_seconds = divmod(rest_second, 60)
print('用時(shí)%d分%d秒' % (elapsed_minutes, elapsed_seconds))