笔趣阁 单篇小说采集
生活随笔
收集整理的這篇文章主要介紹了
笔趣阁 单篇小说采集
小編覺得挺不錯的,現在分享給大家,幫大家做個參考.
1、代理文檔格式:ip.txt 每行一個代理,以空白分隔三個欄位「IP 端口 協議」,例如 `1.2.3.4 8080 http`(代理采集地址 http://www.xicidaili.com)
2、免費代理穩定性不可靠,采用裝飾器重連同時切換代理
# coding: utf-8
# biquge single-novel scraper  http://www.biquge.com.tw
# Before running: replace the first-chapter URL (`url`) and the total
# chapter count (`page`) below.
# ip.txt is the proxy pool: one proxy per line, whitespace-separated
# "IP port protocol".
#
# NOTE: this is a Python 2 script (urllib2, reload(sys)).
import urllib2
from bs4 import BeautifulSoup
import sys
import traceback
import random
import functools

# Python 2 hack: make implicit str <-> unicode conversions use UTF-8 so the
# scraped unicode text can be written to the byte-mode output file.
reload(sys)
sys.setdefaultencoding('utf-8')

# Output file; chapters are appended so an interrupted run can be resumed by
# pointing `url` at the next chapter.
f = open("out.txt", "a+")

headers = {
    "Host": "www.biquge.com.tw",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "X-Requested-With": "XMLHttpRequest",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36",
    "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
}

url = "http://www.biquge.com.tw/17_17281/7647045.html"  # first chapter URL
page = 1301  # chapter count
nextHref = url  # URL of the next page to fetch; updated by getContent()

ipPool = []  # proxy entries, each a list [ip, port, protocol]


def IPpool():
    """Load the proxy pool from ``ip.txt`` into the global ``ipPool``.

    Each non-blank line is split on whitespace into ``[ip, port, protocol]``.
    Lines with fewer than three fields are skipped, because getContent()
    indexes all three and would otherwise fail on them.

    Raises:
        IOError: if ``ip.txt`` cannot be opened.
    """
    with open('ip.txt') as reader:
        for line in reader:
            fields = line.split()
            if len(fields) >= 3:
                ipPool.append(fields)


RETRIES = 0  # initial value of the retry counter
MAX_RETRIES = 5  # retries allowed per page; the log message below also says 5
count = {"num": RETRIES}  # shared retry counter; reset by getContent() on success


def conn_try_again(function):
    """Decorator: call ``function`` again whenever it raises.

    The number of retries is tracked in the module-level ``count`` dict. Once
    ``count['num']`` has reached MAX_RETRIES the last exception is re-raised
    unchanged (original type and traceback preserved). The counter is not
    reset here; the wrapped function resets it when it succeeds.
    """
    @functools.wraps(function)
    def wrapped(*args, **kwargs):
        while True:
            try:
                return function(*args, **kwargs)
            except Exception:
                print("--重試訪問,當前次數 %s ,(總次數5)--" % (count['num'] + 1))
                if count['num'] >= MAX_RETRIES:
                    raise
                count['num'] += 1
    return wrapped


bsObj = None  # parsed document of the most recently fetched page


@conn_try_again
def getContent(url):
    """Fetch one chapter page, append it to the output file, and advance.

    A proxy is picked at random from ``ipPool`` on every call, so each retry
    triggered by ``conn_try_again`` normally goes through a different proxy.
    Any failure (network error, or a page that lacks the expected chapter
    markup, e.g. junk returned by a bad proxy) raises and is therefore
    retried by the decorator.

    Args:
        url: address of the chapter page to download.

    Returns:
        True when the page has no next-chapter link (the caller should stop);
        None otherwise, after ``nextHref`` has been set to the next chapter.
    """
    global nextHref, bsObj
    # Proxy switch: set to False to always connect directly.
    proxySwitch = True

    if ipPool:
        entry = random.choice(ipPool)
        print(entry)
        # urllib2 matches ProxyHandler keys against the lower-case URL scheme,
        # so an entry written as "HTTP" would otherwise be silently ignored.
        scheme = entry[2].lower()
        proxy_host = scheme + "://" + entry[0] + ":" + entry[1]
        proxy_support = urllib2.ProxyHandler({scheme: proxy_host})
    else:
        print('--代理池當前無可用代理,使用本機地址訪問--')
        proxy_support = urllib2.ProxyHandler({})

    if proxySwitch:
        opener = urllib2.build_opener(proxy_support)
    else:
        opener = urllib2.build_opener(urllib2.ProxyHandler({}))
    urllib2.install_opener(opener)

    req = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(req, timeout=3)
    try:
        bsObj = BeautifulSoup(response, 'lxml')
    finally:
        # Release the connection even when parsing fails.
        response.close()

    # Any of these lookups raises AttributeError when the expected element is
    # missing, which sends the call back through the retry decorator.
    content = bsObj.find('div', id='content').get_text()
    preAndNextBar = bsObj.find('div', attrs={'class': 'bottem2'})
    title = bsObj.find('div', attrs={'class': 'bookname'}).h1.get_text()

    marker = "下一章"
    bar_text = preAndNextBar.get_text()
    next_link = None
    for anchor in preAndNextBar.findAll('a'):
        if marker in anchor.get_text():
            next_link = anchor  # keep the last matching anchor

    # Resolve the next URL before writing anything, so a failure here cannot
    # lead to the chapter being written twice by a retry.
    next_url = None
    if next_link is not None:
        next_url = "http://www.biquge.com.tw" + next_link.get('href')

    # Save the chapter even when it is the last one (no next link).
    print(title)
    f.write("#####" + '\n')
    f.write(title + '\n')
    f.write(content + '\n')
    count['num'] = 0  # success: give the next page a fresh retry budget

    if next_url is None:
        if marker in bar_text:
            print("下一章為空")
        return True

    nextHref = next_url
    print(nextHref)


def main():
    """Load the proxy pool, then download chapters until done or failed.

    The output file is closed on every exit path out of the download loop.
    """
    IPpool()
    try:
        # NOTE(review): range(1, page) performs page - 1 fetches; confirm
        # whether `page` is meant to be the exact chapter count.
        for _ in range(1, page):
            if getContent(nextHref):
                break
        print("--- end ---")
    except Exception:
        # print_exc() writes the traceback itself and returns None.
        traceback.print_exc()
    finally:
        f.close()


if __name__ == "__main__":
    main()

# Appendix: proxy collection write-up:
# https://blog.csdn.net/u012795120/article/details/80857990
下載地址:https://download.csdn.net/download/u012795120/10508330
總結
以上是生活随笔為你收集整理的笔趣阁 单篇小说采集的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: dw网页制作入学教程_简单的手机网页制作
- 下一篇: [C++][IO]读写二进制文件