Python Web Crawler: Full Code for a Web Page Scraper
Part 1: Analyzing the Page Structure
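The crawler below leans on three facts about the target site's markup, all of which show up directly in the code: the video index is paginated as video_%E8%B6%B3%E7%90%83%E9%AD%94%E6%96%B9_N.html pages, each carrying a list of links to the individual video pages; a div with class="pagination" at the foot of the index exposes the page numbers; and every detail page embeds its real stream address in a createFlashVideo(...) call that ends in an .m3u8 URL.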
Part 2: The Code in Practice

#! /usr/bin/env python2
# encoding=utf-8
# BeautifulSoup and MySQLdb need to be installed first
import sys, os, re, hashlib
import time
import urllib2
import httplib2
import MySQLdb
from datetime import datetime as dt, timedelta
from lxml import etree
from BeautifulSoup import BeautifulSoup

reload(sys)
sys.setdefaultencoding('utf-8')  # the usual point of reload(sys) in Python 2; assumed lost from the original
h = httplib2.Http(timeout=10)
# Request headers that masquerade as a desktop browser
headers = {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)'
}
# Regex for matching <a> tags. The markup inside the original pattern was
# stripped when the post was published, so this is a reconstruction; the
# variable is never used below in any case.
pattern = r'<a[^>]*href="(.*?)"[^>]*>(.*?)</a>'
# Log file setup (the handle is opened but never written to below)
log_path = './sporttery'
log_file = '%s.log' % dt.now().strftime('%Y-%m-%d')
if not os.path.exists(log_path):
    os.makedirs(log_path)
log = open('%s/%s' % (log_path, log_file), 'w+')
# MySQL connection; force utf8 on the connection so Chinese titles survive
conn = MySQLdb.connect(
    host='localhost',
    port=3306,
    user='root',
    passwd='root',
    db='test',
)
conn.set_character_set('utf8')
cur = conn.cursor()
cur.execute('SET NAMES utf8;')
cur.execute('SET CHARACTER SET utf8;')
cur.execute('SET character_set_connection=utf8;')
cur.close()
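# The INSERT in saveDB() below expects a table like test.mytables to exist
# already. A compatible schema might look like this (the column types are
# assumptions, not from the original post):
#   CREATE TABLE IF NOT EXISTS test.mytables (
#       id INT AUTO_INCREMENT PRIMARY KEY,
#       current_url VARCHAR(512),
#       end_url_rex VARCHAR(512),
#       `names` VARCHAR(255),
#       end_content TEXT,
#       page VARCHAR(16),
#       create_time DATETIME
#   );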
# Fetch a URL, retrying on failure; give up after six attempts
def download(url):
    fails = 0
    while True:
        if fails > 5:
            return None
        try:
            res, content = h.request(url, 'GET', headers=headers)
            return content.decode('utf-8', 'ignore')
        except Exception:
            print(u'failed to open link ' + url)
            fails += 1
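# Usage sketch (example.com stands in for a real page):
#   html = download('http://example.com/')
#   if html is None:
#       pass  # six attempts failed; skip this URL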
# Return the substring of content between startStr and endStr
def GetMiddleStr(content, startStr, endStr):
    startIndex = content.index(startStr)
    if startIndex >= 0:
        startIndex += len(startStr)
    # search for endStr only after startStr, so an earlier occurrence of
    # the end marker cannot truncate the result
    endIndex = content.index(endStr, startIndex)
    return content[startIndex:endIndex]
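# A quick worked example (hypothetical input):
#   GetMiddleStr('<h2>Match highlights</h2>', '<h2>', '</h2>')  ->  'Match highlights'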
def get_ul(data):
    # Pull out the <ul> block that holds the video links. The exact markers
    # were stripped from the original post; '<ul' and '</ul>' are assumptions.
    mystring = GetMiddleStr(data, '<ul', '</ul>')
    return mystring
def test_sporttery(i):
    # Page i of the video index, e.g.
    # http://www.xxx.com/video/video_%E8%B6%B3%E7%90%83%E9%AD%94%E6%96%B9_2.html
    url = 'http://www.xxx.com/video/video_%E8%B6%B3%E7%90%83%E9%AD%94%E6%96%B9_' + str(i) + '.html'
    print url
    source = download(url)
    data = get_ul(source)
    # Split the list block into lines (the split marker was lost from the
    # original post; a literal newline is assumed)
    datas = data.split('\n')
    for each in datas:
        # Grab every href value, double- or single-quoted,
        # e.g. '<a href="http://x/1.html">' -> ['http://x/1.html']
        ret = re.findall(r"(?<=href=\").+?(?=\")|(?<=href=\').+?(?=\')", each)
        for urls in ret:
            detial = download(urls)
            if detial:
                detial_content = GetMiddleStr(detial, 'createFlashVideo', 'm3u8').replace(' ', '')
                if detial_content:
                    # The final .m3u8 stream URL
                    end_url_rex = GetMiddleStr(detial_content + '.m3u8', 'http://', '.m3u8') + 'm3u8'
                    # Title: the <h2> inside the #playVideo block
                    sstree = etree.HTML(detial)
                    ssnodes = sstree.xpath('//*[@id="playVideo"]/div[1]/h2')
                    for ssn in ssnodes:
                        name = ssn.text.strip().replace('/h2>', '')
                    # Description: cut out the video-info block (its end marker
                    # was lost from the original post; '</div>' is assumed),
                    # then strip the remaining tags
                    introduction = GetMiddleStr(detial, 'video-info">', '</div>').replace(' ', '')
                    dr = re.compile(r'<[^>]+>', re.S)
                    introductions = dr.sub('', introduction)
                    end_content = introductions.strip().replace('/span>', '')
                    # Timestamp shifted forward 8 hours (UTC+8)
                    end_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time() + 8 * 60 * 60))
                    saveDB(urls, end_url_rex, name, end_content, str(i), end_time)
def saveDB(current_url, end_url_rex, names, end_content, page, create_time):
    # Plain INSERT; a SELECT/UPDATE for de-duplication could be added here
    sql = ('INSERT INTO test.mytables'
           '(current_url,end_url_rex,`names`,end_content,page,create_time)'
           ' VALUES (%s,%s,%s,%s,%s,%s)')
    print sql
    cur = conn.cursor()
    # Parameterized execute() lets the driver escape the values
    cur.execute(sql, (current_url, end_url_rex, names, end_content, page, create_time))
    cur.close()
    conn.commit()
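# A call with stand-in values (all of them made up) looks like:
#   saveDB('http://www.xxx.com/video/1.html', 'http://www.xxx.com/stream/1.m3u8',
#          'some title', 'some description', '1', '2017-01-01 12:00:00')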
if __name__ == '__main__':
    # Read page 1 first to find out how many index pages there are
    first = 'http://www.xxx.com/video/video_%E8%B6%B3%E7%90%83%E9%AD%94%E6%96%B9_1.html'
    url = urllib2.urlopen(first)
    content = url.read()
    soup = BeautifulSoup(content)
    strs = soup.findAll(attrs={'class': 'pagination'})
    lists = str(strs[0])
    # Every number that appears in the pagination block
    listss = re.findall(r'\d+', lists)
    # Take the largest page number; note that set() does not preserve order,
    # so the original list(set(...))[-1] was not guaranteed to be the maximum
    str_num = max(set(listss), key=int)
    i = 1
    while i <= int(str_num):
        test_sporttery(i)
        i += 1
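As a quick sanity check of the pagination logic above, the same parsing applied to a made-up snippet (the HTML is invented; only the class name pagination comes from the script) behaves like this:

from BeautifulSoup import BeautifulSoup
import re

soup = BeautifulSoup('<div class="pagination"><a>1</a><a>2</a><a>10</a></div>')
block = str(soup.findAll(attrs={'class': 'pagination'})[0])
print max(set(re.findall(r'\d+', block)), key=int)   # prints 10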