當前位置：首頁 > 编程语言 > python >内容正文

python

python爬虫笔记（八）实例3：用Python批量爬取全站小说【以书趣阁为例】

發布時間：2024/3/26 python 27 豆豆

生活随笔收集整理的這篇文章主要介紹了 python爬虫笔记（八）实例3：用Python批量爬取全站小说【以书趣阁为例】，小編覺得挺不錯的，現在分享給大家，幫大家做個參考。

1. 用Python批量爬取全站小說

爬取這個網站小說：http://www.shuquge.com/txt/89644/index.html

2. 爬取一本書

# -*- coding: utf-8 -*-
"""Download the chapters of one shuquge.com novel into ``./jiuxin/``.

Each chapter page is fetched from the novel's index page and saved as
``<chapter number>.txt``, where the chapter number is the first run of digits
found in the first whitespace-separated token of the page's ``<h1>`` title.

Created on Sat Feb 8 20:31:43 2020
@author: douzi
"""
import os
import re
import time
from urllib.parse import urljoin

import requests
from parsel import Selector

# Index page of the novel to crawl.
INDEX_URL = 'http://www.shuquge.com/txt/89644/index.html'
# Directory that receives one text file per chapter.
OUTPUT_DIR = './jiuxin/'
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"}
# Seconds to wait for a response before giving up on a request.
REQUEST_TIMEOUT = 30
# Pause between chapter requests, in seconds.
REQUEST_DELAY = 0.5


def _get(url):
    """Fetch *url* and return the response with its text encoding detected.

    Raises:
        requests.RequestException: on connection errors, timeouts or an HTTP
            error status.
    """
    response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    response.encoding = response.apparent_encoding
    return response


def _chapter_number(title):
    """Return the digits in the first token of *title*, or None if absent.

    ``[0-9]+`` is used instead of ``[0-9]*`` because the latter also matches
    the empty string, which would map every unnumbered chapter to ``.txt``.
    """
    if not title:
        return None
    tokens = title.split()
    if not tokens:
        return None
    found = re.search(r'[0-9]+', tokens[0])
    return found.group(0) if found else None


def _save_chapter(url):
    """Download the chapter at *url* and write it to ``OUTPUT_DIR``.

    Pages without an ``<h1>`` title, or whose title carries no chapter number,
    are printed but not saved.
    """
    response = _get(url)
    print(response.request.url)

    sel = Selector(response.text)
    title = sel.css('h1::text').get()
    print(title)

    number = _chapter_number(title)
    if number is None:
        return

    # Chapter body: the text nodes directly under <div id="content">.
    body = ''.join(sel.css('#content::text').getall())
    path = os.path.join(OUTPUT_DIR, number + '.txt')
    with open(path, 'w', encoding='utf-8') as f:
        f.write(title)
        f.write(body)


def main():
    """Crawl the index page and save every linked chapter."""
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    index_sel = Selector(_get(INDEX_URL).text)
    # Chapter links look like <dd><a href="29287710.html">...</a></dd>
    # inside <div class="listmain">.
    hrefs = index_sel.css('.listmain a::attr(href)').getall()

    # dict.fromkeys drops repeated hrefs while keeping their order, so a
    # chapter linked more than once on the index is fetched only once.
    for href in dict.fromkeys(hrefs):
        url = urljoin(INDEX_URL, href)
        try:
            _save_chapter(url)
        except requests.RequestException as err:
            # One unreachable chapter should not abort the whole crawl.
            print('skipped', url, err)
        time.sleep(REQUEST_DELAY)


if __name__ == '__main__':
    main()

3. 爬取一個分類

# -*- coding: utf-8 -*-
"""Download every novel listed on the pages of one shuquge.com category.

Each book gets its own directory under the current working directory, and
each chapter is stored there as ``<chapter title>.txt``.

Created on Sat Feb 8 20:31:43 2020
@author: douzi
"""
import os
import re
import time
from urllib.parse import urljoin

import requests
from parsel import Selector

headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"}

# Seconds to wait for a response before giving up on a request.
REQUEST_TIMEOUT = 30
# Pause after each chapter request, in seconds.
REQUEST_DELAY = 0.5
# Characters that are path separators or are rejected in file names on
# common file systems, plus ASCII control characters.
_UNSAFE_CHARS = re.compile(r'[\\/:*?"<>|\x00-\x1f]')


def _safe_name(name):
    """Return *name* made usable as a single path component.

    Unsafe characters become ``_`` and surrounding spaces/dots are removed, so
    scraped text such as ``..`` or ``a/b`` cannot escape the target directory.
    The function is idempotent.
    """
    cleaned = _UNSAFE_CHARS.sub('_', name).strip(' .')
    return cleaned or '_'


def _get(url):
    """Fetch *url* and return the response with its text encoding detected.

    Raises:
        requests.RequestException: on connection errors, timeouts or an HTTP
            error status.
    """
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    response.encoding = response.apparent_encoding
    return response


def download_one_chapter(url, book_name):
    """Download the chapter at *url* and append it to the book's directory.

    Args:
        url: Address of the chapter page.
        book_name: Name of the book; used (sanitised) as the directory name.

    The file is opened in append mode, so chapters that share a title end up
    in the same file. Pages without an ``<h1>`` title are printed but not
    saved. The function always pauses ``REQUEST_DELAY`` seconds before
    returning or raising.

    Raises:
        requests.RequestException: if the page cannot be fetched.
    """
    try:
        response = _get(url)
        print(response.request.url)

        sel = Selector(response.text)
        title = sel.css('h1::text').get()
        print(title)
        if not title or not title.strip():
            return

        book_dir = os.path.join('.', _safe_name(book_name))
        os.makedirs(book_dir, exist_ok=True)
        chapter_path = os.path.join(book_dir, _safe_name(title) + '.txt')

        # Chapter body: the text nodes directly under <div id="content">.
        body = ''.join(sel.css('#content::text').getall())
        with open(chapter_path, 'a', encoding='utf-8') as f:
            f.write(title)
            f.write(body)
            # End the chapter with a newline only; a NUL byte does not belong
            # in a text file.
            f.write('\n')
    finally:
        time.sleep(REQUEST_DELAY)


def download_one_book(index_url, bname):
    """Download every chapter linked from a book's index page.

    Args:
        index_url: Index page of the book, for example
            ``http://www.shuquge.com/txt/89644/index.html``.
        bname: Name of the book, passed on as the output directory name.

    Chapters that cannot be fetched are reported and skipped.

    Raises:
        requests.RequestException: if the index page itself cannot be fetched.
    """
    index_sel = Selector(_get(index_url).text)
    # Chapter links look like <dd><a href="29287710.html">...</a></dd>
    # inside <div class="listmain">.
    hrefs = index_sel.css('.listmain a::attr(href)').getall()

    # dict.fromkeys drops repeated hrefs while keeping their order; with
    # append-mode files a repeated link would otherwise duplicate the text.
    for href in dict.fromkeys(hrefs):
        chapter_url = urljoin(index_url, href)
        try:
            download_one_chapter(chapter_url, bname)
        except requests.RequestException as err:
            print('skipped', chapter_url, err)


def download_one_category(category=7, first_page=1, last_page=3):
    """Download all books listed on a range of category pages.

    Args:
        category: Numeric category id used in the listing URL.
        first_page: First listing page to crawl (inclusive).
        last_page: Last listing page to crawl (inclusive).

    The defaults reproduce the original behaviour: pages 1-3 of category 7.
    Books whose index page cannot be fetched are reported and skipped.

    Raises:
        requests.RequestException: if a category listing page cannot be
            fetched.
    """
    tpl = 'http://www.shuquge.com/category/{}_{}.html'
    for page in range(first_page, last_page + 1):
        category_url = tpl.format(category, page)
        print(category_url)

        index_sel = Selector(_get(category_url).text)
        # Read href and text from the same <a> element so that name and URL
        # can never get out of step with each other.
        for link in index_sel.css('span.s2 a'):
            book_href = link.css('::attr(href)').get()
            book_name = link.css('::text').get()
            if not book_href or not book_name:
                continue
            book_url = urljoin(category_url, book_href)
            print(book_name, book_url)

            # Create the directory if needed and keep it if it already exists.
            os.makedirs(os.path.join('.', _safe_name(book_name)), exist_ok=True)
            try:
                download_one_book(book_url, book_name)
            except requests.RequestException as err:
                print('skipped', book_url, err)


if __name__ == '__main__':
    download_one_category()

總結

以上是生活随笔為你收集整理的python爬虫笔记（八）实例3：用Python批量爬取全站小说【以书趣阁为例】的全部內容，希望文章能夠幫你解決所遇到的問題。

如果覺得生活随笔網站內容還不錯，歡迎將生活随笔推薦給好友。

上一篇：迅为国产RK3568开发板Android
下一篇： 04 高级控件总结