Scraping every novel on the Dingdian novel site with Python's Scrapy
With some spare time on my hands, I practiced with Python's Scrapy framework by crawling the detailed information of every novel on the Dingdian novel site (顶点小说网, x23us.com).
First, a look at how the listing pages are structured:
The td cells inside each tr tag hold the information we want to scrape.
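Before writing the spider, it helps to verify those td positions interactively. Below is a quick sketch using scrapy shell; the category URL and the column order are assumptions inferred from the XPaths used later in the spider, not something fixed by the site:

# Run:  scrapy shell "https://www.x23us.com/class/5_1.html"
# Each novel is one <tr>; its <td> cells hold the link/title, latest chapter, author, size, date and status.
for row in response.xpath('//tr'):
    print(row.xpath('td[1]/a/@href').extract_first(),      # novel URL
          row.xpath('td[1]/a[2]/text()').extract_first(),  # novel title
          row.xpath('td[3]/text()').extract_first())       # author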
Each novel also has a second-level page, the detail page with its synopsis, which we want to crawl as well.
Now the code:
mydingdian.py
import scrapy
from scrapy.http import Request
from ..items import DingdianItem


class MydingdianSpider(scrapy.Spider):
    name = 'mydingdian'
    allowed_domains = ['www.x23us.com']
    start_url = ['https://www.x23us.com/class/']
    starturl = ['.html']

    def start_requests(self):
        # for i in range(1, 11):
        for i in range(5, 6):
            # print(i)
            url_con = str(i) + '_1'
            # print(url_con)
            url1 = self.start_url + list(url_con) + self.starturl
            # print(url1)
            url = ''
            for j in url1:
                url += j + ''
            # print(url)
            yield Request(url, self.parse)

    def parse(self, response):
        baseurl = response.url  # the actual URL of this category page
        # print(baseurl)
        max_num = response.xpath('//*[@id="pagelink"]/a[14]/text()').extract_first()  # largest page number shown in the pager
        # print(max_num)
        baseurl = baseurl[:-7]  # strip the trailing "_1.html"
        # print(baseurl)
        for num in range(1, int(max_num) + 1):
            # for num in range(1, 5):
            # print(list("_" + str(num)))
            newurl1 = list(baseurl) + list("_" + str(num)) + self.starturl
            # print(newurl1)
            newurl = ''
            for j in newurl1:
                newurl += j + ''
            print(newurl)
            # dont_filter matters here: with it the first page is crawled, without it it is skipped.
            # Scrapy de-duplicates request URLs (RFPDupeFilter); dont_filter exempts this URL from de-duplication.
            yield Request(newurl, dont_filter=True, callback=self.get_name)  # hand each listing page over to get_name

    def get_name(self, response):
        item = DingdianItem()
        for nameinfo in response.xpath('//tr'):
            # print(nameinfo)
            novelurl = nameinfo.xpath('td[1]/a/@href').extract_first()       # novel URL
            name = nameinfo.xpath('td[1]/a[2]/text()').extract_first()       # novel title
            newchapter = nameinfo.xpath('td[2]/a/text()').extract_first()    # latest chapter
            date = nameinfo.xpath('td[5]/text()').extract_first()            # update date
            author = nameinfo.xpath('td[3]/text()').extract_first()          # author
            serialstatus = nameinfo.xpath('td[6]/text()').extract_first()    # serialization status
            serialsize = nameinfo.xpath('td[4]/text()').extract_first()      # size
            # print(serialsize)
            # print('--==--' * 10)
            if novelurl:
                item['novel_name'] = name
                item['author'] = author
                item['novelurl'] = novelurl
                item['serialstatus'] = serialstatus
                item['serialsize'] = serialsize
                item['date'] = date
                item['newchapter'] = newchapter
                print('Novel title:', item['novel_name'])
                print('Author:', item['author'])
                print('Novel URL:', item['novelurl'])
                print('Status:', item['serialstatus'])
                print('Size:', item['serialsize'])
                print('Update date:', item['date'])
                print('Latest chapter:', item['newchapter'])
                print('====' * 5)
                # yield Request(novelurl, dont_filter=True, callback=self.get_novelcontent, meta={'item': item})
                yield item

    '''
    def get_novelcontent(self, response):
        # print(123124)  # confirm the callback is reached
        item = response.meta['item']
        novelurl = response.url
        # print(novelurl)
        serialnumber = response.xpath('//tr[2]/td[2]/text()').extract_first()       # word count
        # print(serialnumber)
        category = response.xpath('//tr[1]/td[1]/a/text()').extract_first()         # category
        # print(category)
        collect_num_total = response.xpath('//tr[2]/td[1]/text()').extract_first()  # total bookmarks
        # print(collect_num_total)
        click_num_total = response.xpath('//tr[3]/td[1]/text()').extract_first()    # total clicks
        novel_breif = response.xpath('//dd[2]/p[2]').extract_first()                # synopsis
        # item['serialnumber'] = serialnumber
        # item['category'] = category
        # item['collect_num_total'] = collect_num_total
        # item['click_num_total'] = click_num_total
        # item['novel_breif'] = novel_breif
        #
        # print('Word count:', item['serialnumber'])
        # print('Category:', item['category'])
        # print('Total bookmarks:', item['collect_num_total'])
        # print('Total clicks:', item['click_num_total'])
        # print('Synopsis:', item['novel_breif'])
        # print('====' * 10)
        yield item
    '''
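The URL building above (turning strings into character lists and gluing them back together) works, but plain string formatting produces the same URLs more directly. A small sketch, assuming the same category/page numbering scheme the spider uses; the helper name is just for illustration:

BASE = 'https://www.x23us.com/class/'

def category_page_urls(category, max_page):
    # e.g. category 5, pages 1..3 -> .../class/5_1.html, .../class/5_2.html, .../class/5_3.html
    return ['{}{}_{}.html'.format(BASE, category, page) for page in range(1, max_page + 1)]

print(category_page_urls(5, 3))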
items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class DingdianItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    novel_name = scrapy.Field()         # novel title
    author = scrapy.Field()             # author
    novelurl = scrapy.Field()           # novel URL
    serialstatus = scrapy.Field()       # serialization status
    serialsize = scrapy.Field()         # size
    date = scrapy.Field()               # update date
    newchapter = scrapy.Field()         # latest chapter

    serialnumber = scrapy.Field()       # word count
    category = scrapy.Field()           # category
    collect_num_total = scrapy.Field()  # total bookmarks
    click_num_total = scrapy.Field()    # total clicks
    novel_breif = scrapy.Field()        # synopsis

The pipeline that inserts the scraped items into the database:

iopipelines.py
from 爬虫大全.dingdian.dingdian import dbutil

# Assignment: a custom pipeline that saves the full set of scraped fields into MySQL.
class DingdianPipeline(object):
    def process_item(self, item, spider):
        dbu = dbutil.MYSQLdbUtil()
        dbu.getConnection()  # open the connection / start the transaction
        try:
            # sql = "insert into movies (电影排名,电影名称,电影短评,评价分数,评价人数) values(%s,%s,%s,%s,%s)"
            sql = "insert into ebook (novel_name,author,novelurl,serialstatus,serialsize,ebookdate,newchapter) values(%s,%s,%s,%s,%s,%s,%s)"
            # date = [item['rank'], item['title'], item['quote'], item['star']]
            # dbu.execute(sql, date, True)
            dbu.execute(sql, (item['novel_name'], item['author'], item['novelurl'],
                              item['serialstatus'], item['serialsize'], item['date'],
                              item['newchapter']), True)
            # dbu.execute(sql, True)
            dbu.commit()
            print('Inserted into the database successfully!')
        except:
            dbu.rollback()
            dbu.commit()  # commit after rolling back
        finally:
            dbu.close()
        return item
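The dbutil helper imported above comes from another project of mine and is not shown in this post. If you do not have it, an equivalent pipeline can talk to MySQL through pymysql directly. A minimal sketch, assuming the same ebook table and column names as the SQL above; the connection parameters are placeholders:

import pymysql

class DingdianMySQLPipeline(object):
    def open_spider(self, spider):
        # Connection parameters are placeholders; adjust to your own database.
        self.conn = pymysql.connect(host='localhost', user='root', password='root',
                                    db='test', charset='utf8mb4')

    def process_item(self, item, spider):
        sql = ("insert into ebook (novel_name,author,novelurl,serialstatus,"
               "serialsize,ebookdate,newchapter) values (%s,%s,%s,%s,%s,%s,%s)")
        try:
            with self.conn.cursor() as cursor:
                cursor.execute(sql, (item['novel_name'], item['author'], item['novelurl'],
                                     item['serialstatus'], item['serialsize'],
                                     item['date'], item['newchapter']))
            self.conn.commit()
        except Exception:
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        self.conn.close()

Opening the connection once in open_spider and reusing it also avoids reconnecting for every item, which the dbutil version above does.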
settings.py
# -*- coding: utf-8 -*-

# Scrapy settings for dingdian project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'dingdian'

SPIDER_MODULES = ['dingdian.spiders']
NEWSPIDER_MODULE = 'dingdian.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'dingdian (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 2

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
SPIDER_MIDDLEWARES = {
    'dingdian.middlewares.DingdianSpiderMiddleware': 543,
}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'dingdian.middlewares.DingdianDownloaderMiddleware': 543,
    'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': None,
    'dingdian.rotate_useragent.RotateUserAgentMiddleware': 400,
}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    #'dingdian.pipelines.DingdianPipeline': 300,
    'dingdian.iopipelines.DingdianPipeline': 301,  # the MySQL pipeline defined in iopipelines.py
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
HTTPCACHE_ENABLED = True
HTTPCACHE_EXPIRATION_SECS = 0
HTTPCACHE_DIR = 'httpcache'
HTTPCACHE_IGNORE_HTTP_CODES = []
HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

LOG_LEVEL = 'INFO'
LOG_FILE = 'dingdian.log'
While inserting the data into the database I ran into
pymysql.err.InterfaceError: (0, '')
and it took quite a bit of searching to sort out.
It happens because Scrapy hands items to the pipeline asynchronously; when the inserts come too fast, the MySQL connection ends up being used after it has already been closed.
The fix is simply to slow the crawl down, e.g. set DOWNLOAD_DELAY = 2 in settings.py.
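Besides hard-coding a larger DOWNLOAD_DELAY, Scrapy's built-in AutoThrottle extension can pace requests automatically based on server latency. A minimal settings.py sketch; the values are just a starting point, not taken from this project:

AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 2
AUTOTHROTTLE_MAX_DELAY = 10
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0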
The full code is on GitHub:
tyutltf/dingdianbook - scrape information on all novels of the Dingdian novel site: https://github.com/tyutltf/dingdianbook
Reposted from: https://www.cnblogs.com/yuxuanlian/p/10000968.html