當前位置：首頁 > 编程资源 > 编程问答 >内容正文

编程问答

书包网小说多线程爬虫

發布時間：2023/12/20 编程问答 32 豆豆

生活随笔收集整理的這篇文章主要介紹了书包网小说多线程爬虫，小編覺得挺不錯的，現在分享給大家，幫大家做個參考。

書包網是個很好的小說網站，提供了小說 txt 下載，并且網站后端支持高并發，不用擔心隨便抓一下就把網站抓崩了。

既然如此，何不拿來練手爬蟲項目呢。

直接上代碼吧。此多線程爬蟲支持爬取各種類似的網站，關鍵是需要網站支持高并發，否則分分鐘就會把網站抓崩。

畢竟 5 分鐘就能爬完一本 18 MB 的小說，屬于超級快的那種了。

"""Multi-threaded downloader for novels hosted on www.bookbao8.com.

The table-of-contents page is parsed for chapter links, the chapters are
split into fixed-size chunks, each chunk is downloaded by its own daemon
thread into ``downloads/<name>/<i>.txt``, and the chunk files are finally
concatenated into ``downloads/<name>.txt``.
"""
import os
# NOTE: ``enumerate`` here is threading.enumerate (list of alive threads) and
# shadows the builtin of the same name for the whole module.
from threading import Lock, Thread, enumerate
from time import sleep, time

from lxml import etree
import requests

# Seconds to wait for the server before a request is considered failed.
REQUEST_TIMEOUT = 30
# How many times one chapter is attempted before it is skipped.
MAX_RETRIES = 10
# Number of chapters handled by each worker thread.
CHAPTERS_PER_THREAD = 20

headers = {
    # HTTP/2 pseudo-headers captured from the browser; left disabled.
    # ':authority':'www.bookbao8.com',
    # ':method': 'GET',
    # ':path': '/book/201506/04/id_XNDMyMjA1.html',
    # ':scheme': 'https',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'zh-CN,zh;q=0.9',
    'cache-control': 'max-age=0',
    'cookie': 'Hm_lvt_79d6c18dfed73a9524dc37b056df45ec=1577182135; Hm_lpvt_79d6c18dfed73a9524dc37b056df45ec=1577182135; Hm_lvt_9e424f40a62d01a6b9036c7d25ce9a05=1577182142; trustedsite_visit=1; bk_ad=2; __cm_warden_uid=840a745a752905060cd14982b4bbc922coo; __cm_warden_upi=MTE5LjQuMjI4LjE1Nw%3D%3D; Hm_lpvt_9e424f40a62d01a6b9036c7d25ce9a05=1577185720',
    'referer': 'https://www.bookbao8.com/book/201506/04/id_XNDMyMjA1.html',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'same-origin',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
}


def thread_it(func, *args):
    """Run ``func(*args)`` in a daemon thread.

    Returns the started ``Thread`` so callers may ``join`` it; callers that
    ignore the return value behave exactly as before.
    """
    t = Thread(target=func, args=args, daemon=True)
    t.start()
    return t


def getAll(url="https://www.bookbao8.com/book/201506/04/id_XNDMyMjA1.html"):
    """Fetch a novel's table-of-contents page and extract its metadata.

    Returns:
        ``(name, author, novel_type, novel_list)`` where ``novel_list`` is a
        list of ``(chapter_title, absolute_chapter_url)`` tuples.  ``author``
        and ``novel_type`` are ``None`` when the page does not provide them.
        ``(None, None, None, None)`` is returned when the response status is
        not 200, or when no novel name or no chapter links are found.
    """
    r = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    if r.status_code != 200:
        return None, None, None, None

    # Let requests guess the charset from the body instead of the headers.
    r.encoding = r.apparent_encoding
    page_source = etree.HTML(r.text)

    name = page_source.xpath('//*[@id="info"]/h1/text()')
    author = page_source.xpath('//*[@id="info"]/p[1]/a/text()')
    novel_type = page_source.xpath('//*[@id="info"]/p[2]/a/text()')
    title = page_source.xpath('/html/body/div[7]/ul/li/a/text()')
    link = page_source.xpath('/html/body/div[7]/ul/li/a/@href')
    # The hrefs are site-relative; prefix every one with the site origin.
    link = ['https://www.bookbao8.com' + href for href in link]
    # Pair each chapter title with its absolute URL.
    novel_list = list(zip(title, link))

    if not novel_list or not name:
        return None, None, None, None
    return (
        name[0],
        author[0] if author else None,
        novel_type[0] if novel_type else None,
        novel_list,
    )


def getOne(link=('第0001章絕地中走出的少年', 'https://www.bookbao8.com/views/201506/04/id_XNDMyMjA1_1.html')):
    """Download a single chapter.

    Args:
        link: ``(chapter_title, chapter_url)`` tuple as produced by ``getAll``.

    Returns:
        ``(title, content)`` on success, or ``(None, None)`` when the
        response status is not 200 or the title is empty.
    """
    r = requests.get(link[1], headers=headers, timeout=REQUEST_TIMEOUT)
    if r.status_code != 200:
        return None, None

    r.encoding = r.apparent_encoding
    page_source = etree.HTML(r.text)
    node_title = link[0]
    node_content = page_source.xpath('//*[@id="contents"]/text()')
    # Join the text nodes and drop the indentation run after each newline.
    node_content = "".join(node_content).replace("\n \xa0 \xa0", "")

    if not node_title:
        return None, None
    return node_title, node_content


def writeOne(title, content):
    """Format one chapter as the text that is written to a chunk file."""
    txt = "\t\t" + title + "\n" + content + "\n\n"
    return txt


def runApp(novel_list, name, t1, cwd=''):
    """Download every chapter with worker threads, then merge the chunks.

    Args:
        novel_list: list of ``(chapter_title, chapter_url)`` tuples.
        name: novel name; used as the directory and output file name.
        t1: ``time()`` stamp of the start of the run, for the elapsed report.
        cwd: path prefix placed verbatim in front of ``downloads/``.

    Returns:
        The list of started worker threads (empty when there are no
        chapters).  The worker that finishes last performs the merge.
    """
    article_num = len(novel_list)
    # Ceiling division: no empty trailing worker when the chapter count is an
    # exact multiple of the chunk size.
    xc_num = (article_num + CHAPTERS_PER_THREAD - 1) // CHAPTERS_PER_THREAD
    print(f"待開啟線程數量為{xc_num}")

    os.makedirs(f"{cwd}downloads/{name}", exist_ok=True)

    # Completion is tracked with an explicit counter; counting alive threads
    # is racy (a fast first worker could trigger the merge before the other
    # workers have even been started).
    lock = Lock()
    remaining = xc_num

    def inter(link, f, i):
        """Fetch one chapter with bounded retries and append it to ``f``."""
        last_error = None
        for _ in range(MAX_RETRIES):
            try:
                title, content = getOne(link)
                if title is None:
                    raise RuntimeError("chapter request did not succeed")
                txt = writeOne(title, content)
            except Exception as e:  # worker boundary: report and retry
                last_error = e
                print("\n爬得太快被拒絕連接，等1s遞歸繼續")
                sleep(1)
                continue
            f.write(txt)
            print(f"\r線程{i}正在寫入 {title}", end="")
            return
        print(f"\n線程{i}下載 {link[0]} 失敗，已重試{MAX_RETRIES}次，跳過：{last_error!r}")

    def inner(name, i, begin, end, cwd):
        """Worker body: download chapters ``begin:end`` into chunk file ``i``."""
        nonlocal remaining
        with open(f"{cwd}downloads/{name}/{i}.txt", mode='w+', encoding='utf-8') as f:
            for link in novel_list[begin:end]:
                inter(link, f, i)
        # The chunk file is closed (and therefore flushed) before any merge.
        print(f"\n線程{i}執行完畢")
        with lock:
            remaining -= 1
            is_last = remaining == 0
        # Informational only: counts every alive thread, including this one.
        print(f"\n剩余線程數量{len(enumerate())}")
        if is_last:
            print("\n全本下載完畢")
            t2 = time()
            print(f"\n本次下載小說總共耗時{round(t2 - t1)}s")
            hebing(f"{cwd}downloads/{name}")

    threads = []
    for i in range(1, xc_num + 1):
        begin = CHAPTERS_PER_THREAD * (i - 1)
        end = min(CHAPTERS_PER_THREAD * i, article_num)
        threads.append(thread_it(inner, name, i, begin, end, cwd))
        if i == xc_num:
            print("\n全部線程開啟完畢")
        # Stagger thread start-up so the server is not hit all at once.
        sleep(0.5)
    return threads


def paixuRule(elem):
    """Sort key for chunk files: the integer before the first dot."""
    return int(elem.split(".")[0])


def hebing(path):
    """Concatenate the numbered chunk files in ``path`` into ``path + '.txt'``.

    Only ``<number>.txt`` entries are merged, in ascending numeric order;
    anything else in the directory is ignored.
    """
    chunks = [
        entry for entry in os.listdir(path)
        if entry.endswith(".txt") and entry.split(".")[0].isdecimal()
    ]
    chunks.sort(key=paixuRule, reverse=False)
    with open(path + ".txt", mode='w+', encoding='utf-8') as f:
        for file in chunks:
            with open(os.path.join(path, file), mode="r", encoding="utf-8") as f1:
                f.write(f1.read())
    print("小說合并完成")


if __name__ == '__main__':
    t1 = time()
    name, _, _, novel_list = getAll(url="https://www.bookbao8.com/book/201506/04/id_XNDMyMjA1.html")
    print(name)
    if name is None or not novel_list:
        raise SystemExit("未能獲取小說目錄，請檢查網址或網絡")
    os.makedirs("downloads/" + name, exist_ok=True)
    # Wait for the workers instead of busy-looping; the last worker to finish
    # also performs the merge, so joining all of them waits for that as well.
    for worker in runApp(novel_list, name, t1):
        worker.join()

總結

以上是生活随笔為你收集整理的书包网小说多线程爬虫的全部內容，希望文章能夠幫你解決所遇到的問題。

如果覺得生活随笔網站內容還不錯，歡迎將生活随笔推薦給好友。

上一篇：软件工程师薪水_13个薪水最高的技术工作
下一篇：编译原理之算符优先分析语法程序

3atv精品不卡视频,97人人超碰国产精品最新,中文字幕av一区二区三区人妻少妇,久久久精品波多野结衣,日韩一区二区三区精品

编程问答

书包网小说多线程爬虫

總結