Python crawler: scraping novel information
1. Open the novel's home page (the example below uses a novel I picked at random online) and extract its name, author, and description.
2. Extract the novel's full chapter list (most importantly, each chapter's link address, the href) — see the standalone sketch after this list.
3. Download and parse each chapter's content from its address.
4. Print the parsed content and write it to a file or a database.
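As a warm-up for steps 1 and 2, here is a minimal standalone sketch that fetches the sample book page used throughout this post and pulls out the name and chapter links; the XPath expressions are the same ones the full v1 code below relies on:

import requests
from lxml import html

headers = {'user-agent': 'Mozilla/5.0'}  # any reasonable browser UA string works here
page = requests.get('https://www.dianxs.com/book/64554/', headers=headers, timeout=10)
tree = html.fromstring(page.text)

# Step 1: the novel's name from the info panel.
name = tree.xpath('//div[@class="info"]/h1/text()')

# Step 2: chapter titles and, most importantly, their relative href links.
titles = tree.xpath('//div[@class="section-panel section-list"]/dl/dd/a/text()')
hrefs = tree.xpath('//div[@class="section-panel section-list"]/dl/dd/a/@href')
print(name, len(hrefs), 'chapters')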
Sample code, v1 (for learning and reference only):
# !/usr/bin/env python
# -*-coding:utf-8-*-
"""
@Author  : xiaofeng
@Time    : 2018/12/26 11:41
@Desc    : Less interests, More interest.
@Project : python_appliction
@FileName: dianxs.py
@Software: PyCharm
@Blog    : https://blog.csdn.net/zwx19921215
"""
import os
import time

import requests
from lxml import html


class Dianxs():
    """Simple crawler for novel content from the 'Dianxs' site (殿行說小說網)."""

    # Constructor: store the site host, entry URL, request headers and output path.
    def __init__(self, host, url, headers, path):
        self.host = host
        self.url = url
        self.headers = headers
        self.path = path

    def download_page(self):
        """Download and parse the novel's home page: metadata plus chapter list."""
        response = requests.get(url=self.url, headers=self.headers)
        text = html.fromstring(response.text)
        novel = text.xpath('//div[@class="info"]/h1/text()')
        author = text.xpath('//div[@class="info"]/p[1]/text()')
        act = text.xpath('//div[@class="info"]/p[2]/text()')
        last_update = text.xpath('//div[@class="info"]/p[3]/text()')
        last_chapter_href = text.xpath('//div[@class="info"]/p[4]/a/@href')
        last_chapter_title = text.xpath('//div[@class="info"]/p[4]/a/text()')
        introduce = text.xpath('//div[@class="info"]/p[@class="introduce"]/text()')
        print('---------------------------description--------------------------------')
        print(novel)
        # author[0].replace('\xa0', '')
        print(author)
        print(act)
        print(last_update)
        print(last_chapter_title, ' , ', last_chapter_href)
        print('Introduction:', introduce)
        print('-----------------------------------------------------------------------')
        print('\n')
        chapters = text.xpath('//div[@class="section-panel section-list"]/dl/dd/a/text()')
        hrefs = text.xpath('//div[@class="section-panel section-list"]/dl/dd/a/@href')
        print(chapters)
        print(hrefs)
        print('\n')
        for href in hrefs:
            time.sleep(1)  # be polite: pause between chapter requests
            address = self.host + href
            self.parse_html(address)

    def parse_html(self, address):
        """Parse one chapter page.

        :param address: absolute URL of the chapter page
        """
        response = requests.get(url=address, headers=self.headers, timeout=10)
        if response.status_code != 200:
            # Naive unbounded retry; a bounded alternative is sketched below.
            self.parse_html(address)
            return
        text = html.fromstring(response.text)
        title = text.xpath('//div[@class="read-title"]/h2/text()')
        content = text.xpath('//div[@class="read-content"]/p/text()')
        print('-------- ', title, '-----------')
        print(content)
        print('\n')
        # ''.join(content) joins the list into one string; list(string) would do the reverse.
        title_str = ''.join(title)
        content_str = ''.join(content)
        self.write_to_file(title_str, content_str)

    def write_to_file(self, title, content):
        """Append one chapter's title and content to the output file."""
        flag = os.path.exists(self.path)
        if not flag:
            # 'w' (write) mode creates the file if it does not exist yet.
            f = open(self.path, 'w')
            f.close()
        # 'with' closes the file automatically; 'a' appends.
        with open(self.path, mode='a', encoding='utf-8') as file:
            file.write(title + '\n')
            file.writelines(content)
            file.write('\n\n')


if __name__ == '__main__':
    host = 'https://www.dianxs.com'
    url = 'https://www.dianxs.com/book/64554/'
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                             '(KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
    path = 'G:/test/novel.txt'
    app = Dianxs(host, url, headers, path)
    app.download_page()
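One weak spot in v1: parse_html retries a failed request by calling itself with no limit, so a chapter that keeps failing would recurse until the stack overflows. A bounded retry loop is one way to cap this; the helper below is a sketch of that idea (the function name and limits are mine, not from the original post):

import time
import requests

def fetch_with_retry(url, headers, retries=3, delay=1.0):
    """Return the response for url, or None after a few failed attempts.

    A sketch of a bounded replacement for the unbounded recursive retry
    in parse_html; hypothetical helper, not part of the original code.
    """
    for attempt in range(retries):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            if response.status_code == 200:
                return response
        except requests.RequestException as e:
            print('request failed:', e)
        time.sleep(delay)  # back off briefly before the next attempt
    return None

parse_html could then call fetch_with_retry(address, self.headers) and simply skip the chapter when it returns None.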
Console output: (screenshot omitted)

File contents written: (screenshot omitted)
v2: starting from the site's home page, crawl depth-first through every category section and all novels listed under each section.

The improved v2 example is as follows:
# !/usr/bin/env python
# -*-coding:utf-8-*-
"""
@Author  : xiaofeng
@Time    : 2018/12/26 11:41
@Desc    : Less interests, More interest.
@Project : python_appliction
@FileName: dianxs2.py
@Software: PyCharm
@Blog    : https://blog.csdn.net/zwx19921215
"""
import os

import requests
from lxml import html


class Dianxs():
    """Simple crawler for novel content from the 'Dianxs' site (殿行說小說網)."""

    # Constructor: store the site host, entry URL, request headers and output root.
    def __init__(self, host, url, headers, path):
        self.host = host
        self.url = url
        self.headers = headers
        self.path = path
        self.novel_name = ''

    def nav_page(self):
        """Crawl the category list from the home page navigation bar."""
        print('------------------殿行說----------------------------')
        response = requests.get(url=self.host, headers=self.headers)
        text = html.fromstring(response.text)
        nav_list = text.xpath('//ul[@class="nav"]/li/a/text()')
        nav_href_list = text.xpath('//ul[@class="nav"]/li/a/@href')
        # Drop the first entry (the "home" link is not a category).
        nav_list.pop(0)
        nav_href_list.pop(0)
        print(nav_list)
        print(nav_href_list)
        i = 0
        for nav_item in nav_href_list:
            address = self.host + nav_item
            nav_title = nav_list[i]
            self.nav_item(address, nav_title)
            i += 1

    def nav_item(self, url, nav_title):
        """Crawl every novel listed under one category."""
        response = requests.get(url=url, headers=self.headers)
        text = html.fromstring(response.text)
        novel_list = text.xpath('//div[@class="panel new-xs-list w300 w265 fr simple"]'
                                '/ul/li//span[@class="xs-name"]/a/text()')
        novel_list_href = text.xpath('//div[@class="panel new-xs-list w300 w265 fr simple"]'
                                     '/ul/li//span[@class="xs-name"]/a/@href')
        print('--------------------', nav_title, '-----------------')
        print(novel_list)
        print(novel_list_href)
        print('\n')
        for nov_item in novel_list_href:
            self.url = self.host + nov_item
            self.download_page()

    def download_page(self):
        """Download and parse one novel's home page: metadata plus chapter list."""
        response = requests.get(url=self.url, headers=self.headers)
        text = html.fromstring(response.text)
        novel = text.xpath('//div[@class="info"]/h1/text()')
        author = text.xpath('//div[@class="info"]/p[1]/text()')
        act = text.xpath('//div[@class="info"]/p[2]/text()')
        last_update = text.xpath('//div[@class="info"]/p[3]/text()')
        last_chapter_href = text.xpath('//div[@class="info"]/p[4]/a/@href')
        last_chapter_title = text.xpath('//div[@class="info"]/p[4]/a/text()')
        introduce = text.xpath('//div[@class="info"]/p[@class="introduce"]/text()')
        print('---------------------------description--------------------------------')
        print(novel)
        # author[0].replace('\xa0', '')
        print(author)
        print(act)
        print(last_update)
        print(last_chapter_title, ' , ', last_chapter_href)
        print('Introduction:', introduce)
        print('-----------------------------------------------------------------------')
        print('\n')
        chapters = text.xpath('//div[@class="section-panel section-list"]/dl/dd/a/text()')
        hrefs = text.xpath('//div[@class="section-panel section-list"]/dl/dd/a/@href')
        print(chapters)
        print(hrefs)
        print('\n')
        for href in hrefs:
            # time.sleep(1)
            address = self.host + href
            self.novel_name = novel
            self.parse_html(address)

    def parse_html(self, address):
        """Parse one chapter page.

        :param address: absolute URL of the chapter page
        """
        response = requests.get(url=address, headers=self.headers, timeout=10)
        if response.status_code != 200:
            # Naive unbounded retry, as in v1; the bounded fetch_with_retry
            # sketched after the v1 listing would be safer here too.
            self.parse_html(address)
            return
        text = html.fromstring(response.text)
        title = text.xpath('//div[@class="read-title"]/h2/text()')
        content = text.xpath('//div[@class="read-content"]/p/text()')
        print('-------- ', title, '-----------')
        print(content)
        print('\n')
        # ''.join(content) joins the list into one string; list(string) would do the reverse.
        title_str = ''.join(title)
        content_str = ''.join(content)
        self.write_to_file(title_str, content_str)

    def write_to_file(self, title, content):
        """Write one chapter to its own file under a per-novel directory."""
        file_path = self.path + ''.join(self.novel_name)
        if not os.path.exists(file_path):
            os.makedirs(file_path)
        # Note: chapter titles may contain characters that are invalid in
        # Windows file names; see the sanitizer sketch after this listing.
        file_name = file_path + '/' + title + '.txt'
        flag = os.path.exists(file_name)
        if not flag:
            # 'w' (write) mode creates the file if it does not exist yet.
            try:
                f = open(file_name, 'w')
                f.close()
            except Exception as e:
                print(e)
                # Failed chapters could be collected here for later reprocessing.
                # todo
        # 'with' closes the file automatically; 'a' appends.
        with open(file_name, mode='a', encoding='utf-8') as file:
            file.write(title + '\n')
            file.writelines(content)
            file.write('\n\n')


if __name__ == '__main__':
    host = 'https://www.dianxs.com'
    url = 'https://www.dianxs.com/book/64554/'
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                             '(KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
    path = 'G:/殿興說/'
    app = Dianxs(host, url, headers, path)
    app.nav_page()
    # app.download_page()
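A caveat with v2's per-chapter files: file_name is built directly from the chapter title, and titles containing characters such as ?, *, or : are invalid in Windows file names, so writes under G:/ would fail for those chapters. A small sanitizing helper is one possible fix (the name safe_filename is hypothetical, not from the original code):

import re

def safe_filename(title):
    # Replace characters that Windows forbids in file names with underscores;
    # fall back to a placeholder if nothing printable remains.
    return re.sub(r'[\\/:*?"<>|]', '_', title).strip() or 'untitled'

# Possible use inside write_to_file:
#     file_name = file_path + '/' + safe_filename(title) + '.txt'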