全國行政區劃信息查詢平臺地址:http://xzqh.mca.gov.cn/map
檢查網頁源碼:
檢查網頁源碼可以發現:所有省級信息全部放在JavaScript中的一個JSON變量(var json = ...)里,頁面加載時會讀取這份JSON數據,并填充到頁面的option中。
1.第一步:使用正則表達式抓取JSON數據并解析,組成一個province字典(區劃代碼 → 省級名稱):
def get_province(self):
    """Return the provinces embedded in the landing page as ``{code: name}``.

    The page carries its province list in an inline script of the form
    ``var json = [...];``. This method locates that script in ``self.soup``,
    parses the JSON array and maps each entry's ``quHuaDaiMa`` value to its
    ``shengji`` value.

    Returns:
        dict: ``quHuaDaiMa`` -> ``shengji`` for every entry in the array.

    Raises:
        ValueError: if no ``<script>`` matching the pattern is present
            (e.g. the page layout changed or the download failed).
    """
    pattern = re.compile(r"var json =(.*?);", re.MULTILINE | re.DOTALL)
    # ``string=`` is the current spelling of bs4's deprecated ``text=`` filter.
    script = self.soup.find("script", string=pattern)
    if script is None:
        # Fail with a clear message instead of AttributeError on None.text.
        raise ValueError("no inline 'var json = ...;' script found in the page")
    json_list = json.loads(pattern.search(script.text).group(1))
    return {entry['quHuaDaiMa']: entry['shengji'] for entry in json_list}
2.第二步:檢查該網站實現級聯查詢的方式,找出查詢市區的方式
根據這段源碼可看出,在選擇省級之後,網頁會調用selectJson接口發起一個POST請求,上圖可以看到請求的body和header等信息。
于是事情就變得簡單起來:代碼可以這樣寫(如下)
def get_city(self, shengji):
    """Return the ``diji`` entries of one province as ``{code: name}``.

    Sends the same form POST to ``/selectJson`` that the web page issues when
    a province is picked, then maps each returned entry's ``quHuaDaiMa`` to
    its ``diji`` value.

    Args:
        shengji (str): province name exactly as returned by ``get_province``.

    Returns:
        dict: ``quHuaDaiMa`` -> ``diji`` for every entry in the response.

    Raises:
        requests.RequestException: on timeout, connection failure or a
            non-2xx HTTP status.
    """
    headers = {
        'Content-Type': "application/x-www-form-urlencoded; charset=utf-8",
        'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, "
                      "like Gecko) Chrome/77.0.3865.120 Safari/537.36",
    }
    response = requests.post(
        'http://xzqh.mca.gov.cn/selectJson',
        # A dict lets requests URL-encode the value instead of concatenating
        # raw text into the form body.
        data={'shengji': shengji},
        headers=headers,
        # Without a timeout a stalled server would block the crawl forever.
        timeout=10,
    )
    # Surface HTTP errors here rather than as a confusing JSON decode error.
    response.raise_for_status()
    json_list = json.loads(response.content)
    return {entry['quHuaDaiMa']: entry['diji'] for entry in json_list}
def get_area(self, shengji, diji):
    """Return the ``xianji`` entries of one city as ``{code: name}``.

    Sends a form POST with both the province and the city name to
    ``/selectJson`` and maps each returned entry's ``quHuaDaiMa`` to its
    ``xianji`` value.

    Args:
        shengji (str): province name exactly as returned by ``get_province``.
        diji (str): city name exactly as returned by ``get_city``.

    Returns:
        dict: ``quHuaDaiMa`` -> ``xianji`` for every entry in the response.

    Raises:
        requests.RequestException: on timeout, connection failure or a
            non-2xx HTTP status.
    """
    headers = {
        'Content-Type': "application/x-www-form-urlencoded; charset=utf-8",
        'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, "
                      "like Gecko) Chrome/77.0.3865.120 Safari/537.36",
    }
    response = requests.post(
        'http://xzqh.mca.gov.cn/selectJson',
        # A dict lets requests URL-encode both values and join them with "&",
        # so a name containing "&" or "=" cannot corrupt the form body.
        data={'shengji': shengji, 'diji': diji},
        headers=headers,
        # Without a timeout a stalled server would block the crawl forever.
        timeout=10,
    )
    # Surface HTTP errors here rather than as a confusing JSON decode error.
    response.raise_for_status()
    json_list = json.loads(response.content)
    return {entry['quHuaDaiMa']: entry['xianji'] for entry in json_list}
3.第三步:main函數(遍歷所有省市區+數據入庫)
數據庫表結構如下:
【全部代碼如下】:
import json
import re

import pymysql
import requests
from bs4 import BeautifulSoup
class allAreaDataNew(object):
    """Scrape all provinces, cities and districts from xzqh.mca.gov.cn into MySQL.

    NOTE: the landing page is downloaded and parsed while this class body is
    executed (i.e. at import time), and instantiating the class immediately
    connects to MySQL, runs :meth:`main` and closes the connection again.
    """

    base_url = 'http://xzqh.mca.gov.cn/map'
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36'}

    # Endpoint the page itself calls for its cascading province/city selects.
    _select_url = 'http://xzqh.mca.gov.cn/selectJson'
    # Same User-Agent as ``headers`` plus the form content type for POSTs.
    _form_headers = {
        'Content-Type': "application/x-www-form-urlencoded; charset=utf-8",
        **headers,
    }
    # Seconds to wait on any HTTP call; without it a stalled server hangs the job.
    _request_timeout = 10
    # Pseudo-cities that group counties administered directly by a province or
    # autonomous region; they carry no usable code of their own.
    # NOTE(review): the site presumably returns Simplified Chinese, so both the
    # Simplified and the Traditional spellings are accepted — confirm against
    # live data.
    _direct_county_names = frozenset({
        '省直辖县级行政单位',
        '自治区直辖县级行政单位',
        '省直轄縣級行政單位',
        '自治區直轄縣級行政單位',
    })

    wb_data = requests.get(base_url, headers=headers, timeout=_request_timeout)
    wb_data.encoding = 'GBK'
    soup = BeautifulSoup(wb_data.text, 'lxml')

    def __init__(self):
        """Connect to MySQL, refresh ``area_config`` and close the connection."""
        # Keyword arguments: PyMySQL >= 1.0 rejects positional connect() args.
        self.db = pymysql.connect(
            host="***",
            user="***",
            password="***",
            database="***",
            charset="utf8mb4",
        )
        try:
            self.main()
        finally:
            # Release the connection even when the crawl raises.
            self.db.close()

    def main(self):
        """Crawl province -> city -> district and rewrite the ``area_config`` table.

        One parameterized INSERT is queued per province, city and district, in
        crawl order, and the whole batch is handed to :meth:`connect_mysql`
        together with the DELETE that empties the table first.
        """
        insert_sql = "insert into area_config values (null,%s,%s,%s,%s)"
        statements = []

        for province_code, province in self.get_province().items():
            city_dict = self.get_city(province)
            statements.append((insert_sql, (province, 'PROVINCE', province_code, 0)))
            print(province_code + "----------------------------------省------------------------------------------" + province + "\n")

            for city_code, city in city_dict.items():
                area_dict = self.get_area(province, city)
                print(city_code + "*******************市****************" + city + "\n")

                # Directly-administered pseudo-cities borrow the province code,
                # both as their own code and as the parent of their districts.
                if city in self._direct_county_names:
                    own_code = province_code
                else:
                    own_code = city_code
                statements.append((insert_sql, (city, 'CITY', own_code, province_code)))

                for area_code, area in area_dict.items():
                    print(area_code + "-區-" + area + "\n")
                    statements.append((insert_sql, (area, 'DISTRICT', area_code, own_code)))

        # Drop exact duplicates while keeping the crawl order, so rows are
        # inserted parents-first and deterministically.
        sql_list = list(dict.fromkeys(statements))
        print(str(sql_list))
        empty_sql = "delete from area_config"
        self.connect_mysql(empty_sql, sql_list)

    def get_province(self):
        """Return the provinces embedded in the landing page as ``{code: name}``.

        The province list lives in an inline script of the form
        ``var json = [...];``; each entry's ``quHuaDaiMa`` is mapped to its
        ``shengji`` value.

        Raises:
            ValueError: if no ``<script>`` matching the pattern is present.
        """
        pattern = re.compile(r"var json =(.*?);", re.MULTILINE | re.DOTALL)
        # ``string=`` is the current spelling of bs4's deprecated ``text=`` filter.
        script = self.soup.find("script", string=pattern)
        if script is None:
            # Fail with a clear message instead of AttributeError on None.text.
            raise ValueError("no inline 'var json = ...;' script found in the page")
        json_list = json.loads(pattern.search(script.text).group(1))
        return {entry['quHuaDaiMa']: entry['shengji'] for entry in json_list}

    def _select_json(self, **form_fields):
        """POST *form_fields* to the selectJson endpoint and return the parsed JSON."""
        response = requests.post(
            self._select_url,
            # A dict lets requests URL-encode the values properly.
            data=form_fields,
            headers=self._form_headers,
            timeout=self._request_timeout,
        )
        # Surface HTTP errors here rather than as a confusing JSON decode error.
        response.raise_for_status()
        return json.loads(response.content)

    def get_city(self, shengji):
        """Return ``{quHuaDaiMa: diji}`` for every city of province *shengji*.

        Raises:
            requests.RequestException: on timeout, connection failure or a
                non-2xx HTTP status.
        """
        json_list = self._select_json(shengji=shengji)
        return {entry['quHuaDaiMa']: entry['diji'] for entry in json_list}

    def get_area(self, shengji, diji):
        """Return ``{quHuaDaiMa: xianji}`` for every district of city *diji*.

        Raises:
            requests.RequestException: on timeout, connection failure or a
                non-2xx HTTP status.
        """
        json_list = self._select_json(shengji=shengji, diji=diji)
        return {entry['quHuaDaiMa']: entry['xianji'] for entry in json_list}

    def connect_mysql(self, empty_sql, sql_list):
        """Run *empty_sql* followed by every statement in *sql_list* in one transaction.

        Args:
            empty_sql (str): statement executed first (the table wipe).
            sql_list: iterable whose items are either a plain SQL string or a
                ``(sql, params)`` pair for a parameterized statement.

        Any failure is printed and rolled back rather than re-raised, so the
        previous table content stays in place when the refresh fails.
        """
        cursor = self.db.cursor()
        try:
            cursor.execute(empty_sql)
            for entry in sql_list:
                if isinstance(entry, str):
                    cursor.execute(entry)
                else:
                    sql, params = entry
                    cursor.execute(sql, params)
            # Commit only on success; it used to run in ``finally`` even
            # after a rollback.
            self.db.commit()
            print('=================================更新所有數據完成!=================================')
        except Exception as e:
            print('=================================更新失敗!=================================')
            print(e)
            self.db.rollback()
        finally:
            cursor.close()


if __name__ == '__main__':
    allAreaDataNew()
代碼執行成功后就可以查到中國所有省市區啦!:
特殊情況:“省直轄縣級行政單位”和“自治區直轄縣級行政單位”
部分省有特殊的“直轄縣級行政單位”或“自治區直轄縣級行政單位”
參考:https://baike.baidu.com/item/省直轄縣級行政單位/6903180?fr=aladdin
遇到這種情況有點懵逼,因為沒有 區號代碼 所以無法關聯父子關系。
但是無妨,數據庫設計有type字段~直接將省級區號代碼作為唯一值給到市,作為區號代碼,再將這個值賦值給區,作為區的父級區號代碼,這樣后臺用type+區號代碼判斷關聯關系。
PS:后續
1.由于本身是java項目要用到中國的省市區 2.并且中國的省市區的變化很頻繁(市級區級的變化時有發生),因此后期可以用jython將其用到java項目,并且可以寫一個job定時任務,定時更新數據庫表。
參考文檔
https://jingyan.baidu.com/article/d169e1867bd27f436611d829.html
總結
以上是生活随笔為你收集整理的python+ BeautifulSoup抓取“全国行政区划信息查询平台”的省市区信息的全部內容,希望文章能夠幫你解決所遇到的問題。
如果覺得生活随笔網站內容還不錯,歡迎將生活随笔推薦給好友。