正则爬虫案例
# coding: utf-8
import json
import re

import requests
# First page of the Douban Top 250 list; the ``start`` query parameter is the
# zero-based offset of the first movie shown on the page.
url = 'https://movie.douban.com/top250?start=0&filter='

# NOTE(review): Douban is believed to reject the default python-requests
# User-Agent, so a browser-like one is sent instead -- confirm against the
# live site.
REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/120.0 Safari/537.36',
}
REQUEST_TIMEOUT = 10  # seconds; without a timeout requests may block forever

# Compiled once at import time instead of on every call. ``re.S`` lets ``.``
# cross newlines because each movie entry spans many lines of HTML.
MOVIE_PATTERN = re.compile(
    r'<div class="item">.*?<em.*?>(?P<id>\d+)</em>'
    r'.*?<span class="title">(?P<title>.*?)</span>'
    r'.*?<p .*?>(?P<info>.*?)</p>'
    r'.*?<span class="rating_num" .*?>(?P<rating>.*?)</span>'
    r'.*?<span>(?P<appraise>\w+)</span>',
    re.S,
)


def get_page(url):
    """Download ``url`` and return the response body as text.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.RequestException: on connection problems or timeout.
    """
    response_html = requests.get(
        url, headers=REQUEST_HEADERS, timeout=REQUEST_TIMEOUT
    )
    # Fail loudly rather than parsing an error page into an empty result.
    response_html.raise_for_status()
    return response_html.text


def parse_movies(html):
    """Extract the movie entries from one page of list HTML.

    Returns:
        A dict mapping the rank string captured from ``<em>`` to a
        ``(title, rating, appraise)`` tuple of strings. Only the first
        ``<span class="title">`` of each entry is captured as the title.
        Empty when nothing matches.
    """
    return {
        match.group('id'): (
            match.group('title'),
            match.group('rating'),
            match.group('appraise'),
        )
        for match in MOVIE_PATTERN.finditer(html)
    }


def page_urls(base_url, total=250, page_size=25):
    """Return the URLs of every list page, derived from ``base_url``.

    ``base_url`` must contain a ``start=<number>`` query parameter; it is
    rewritten to 0, ``page_size``, 2 * ``page_size``, ... while the offset
    stays below ``total``. With the defaults this gives 10 pages
    (start=0 .. start=225).
    """
    return [
        re.sub(r'start=\d+', 'start=' + str(start), base_url)
        for start in range(0, total, page_size)
    ]


def run(url):
    """Scrape one list page and append its movies to ``doubian.txt``.

    The page's movies are written as a single JSON object on one line,
    keyed by rank. Tuples are serialised as JSON arrays.
    """
    content = json.dumps(parse_movies(get_page(url)), ensure_ascii=False)
    # ``with`` closes the file; explicit UTF-8 because ensure_ascii=False
    # writes raw non-ASCII characters. Append mode already writes at the end.
    with open('doubian.txt', 'a', encoding='utf-8') as f:
        f.write(content + '\n')


def main():
    """Scrape every page of the list, printing each URL as it is fetched."""
    for page_url in page_urls(url):
        print(page_url)
        run(page_url)


if __name__ == '__main__':
    main()
?
转载于:https://www.cnblogs.com/mona524/p/7096190.html
总结
- 上一篇: Java基础总结之数组
- 下一篇: 修改CentOS 7.2系统的主机名