Crawling Taoche.com with a Distributed Scrapy Spider
I. Master host configuration
1. Start the Redis server

The master hosts the Redis instance that holds the shared request queue, so Redis must accept connections from the worker machines; for example:
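A minimal sketch, assuming Redis is installed on the master; binding to all interfaces and disabling protected mode lets the workers connect without a password (suitable for a trusted LAN only):

```bash
# listen on all interfaces so the worker machines can reach this Redis
# (protected-mode off allows unauthenticated remote clients; LAN use only)
redis-server --bind 0.0.0.0 --protected-mode no --port 6379
```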
2. The city.py file

city.py defines the CITY_CODE and CAR_CODE_LIST lists used to build the start URLs; redis_url.py below imports them.
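A minimal sketch of its shape; the values below are illustrative assumptions, while the real lists enumerate every city subdomain and car-brand slug on taoche.com:

```python
# city.py - a sketch; the real lists are much longer and the values
# here are assumed examples, not taken from the original post
CITY_CODE = ['beijing', 'shanghai', 'guangzhou']   # city subdomains
CAR_CODE_LIST = ['audi', 'bmw', 'dazhong']         # car-brand url slugs
```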
3. The redis_url.py file

This script runs on the master and seeds the shared queue by pushing one listing-page URL per city/brand pair into Redis:
```python
from taoche.taoche.spiders.city import CITY_CODE, CAR_CODE_LIST
from redis import Redis


class Redis_url():
    def __init__(self):
        # connect to the local Redis server as a client
        self.re = Redis("localhost", 6379)

    def add(self, url):
        # lpush the url onto the "taoche:start_urls" list
        self.re.lpush("taoche:start_urls", url)

    def flushdb(self):
        # wipe everything left in Redis from earlier runs
        self.re.flushdb()


rd = Redis_url()  # instantiate the helper
rd.flushdb()      # first clear all pending requests in Redis
for city in CITY_CODE:
    for car_code in CAR_CODE_LIST:
        rd.add("https://{}.taoche.com/{}/".format(city, car_code))
```
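Running this once on the master (for example, `python redis_url.py`) fills the taoche:start_urls list, and the workers consume URLs from there. `redis-cli lrange taoche:start_urls 0 4` is a quick way to confirm that the URLs were queued.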
II. Worker (crawler) machine configuration

1. settings.py configuration
Add the scrapy-redis settings shown below to the project's settings.py.
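These are the lines that turn a stock Scrapy project into a scrapy-redis worker; they also appear in the complete file that follows:

```python
# use the shared scrapy-redis scheduler
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# deduplicate requests across all workers
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# address of the Redis server (the master host)
REDIS_HOST = '10.10.21.13'
# Redis port
REDIS_PORT = 6379
# shared request queue implementation
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue'
```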
The complete settings.py:
```python
# -*- coding: utf-8 -*-

# Scrapy settings for taoche project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'taoche'

SPIDER_MODULES = ['taoche.spiders']
NEWSPIDER_MODULE = 'taoche.spiders'

# use the shared scrapy-redis scheduler
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# deduplicate requests across all workers
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# address of the Redis server (the master host)
REDIS_HOST = '10.10.21.13'
# Redis port
REDIS_PORT = 6379
# shared request queue implementation
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'taoche (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False  # ignore robots.txt

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
SPIDER_MIDDLEWARES = {
    'taoche.middlewares.TaocheSpiderMiddleware': 543,
}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'taoche.middlewares.TaocheDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'taoche.pipelines.TaochePipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
```
2. taochec.py in the spiders folder

```python
# -*- coding: utf-8 -*-
import scrapy
from .city import *  # import everything from city.py in the same package
from lxml import etree
from ..items import TaocheItem
from scrapy_redis.spiders import RedisSpider


class TaochecSpider(RedisSpider):  # Redis-driven distributed spider
    name = 'taochec'
    redis_key = "taoche:start_urls"

    # Non-distributed version, kept commented out for reference:
    # class TaochecSpider(scrapy.Spider):
    #     name = 'taochec'
    #     allowed_domains = ['taoche.com']
    #     start_urls = []
    #     for city in CITY_CODE[:3]:
    #         for pinpai in CAR_CODE_LIST[:3]:
    #             url = f'https://{city}.taoche.com/{pinpai}/'
    #             start_urls.append(url)
    #             print(url)

    def parse(self, response):
        tree = etree.HTML(response.body.decode('utf-8'))
        # the <li> entries of the listing page
        li_list = tree.xpath('//ul[@class="gongge_ul"]//li')
        print(len(li_list))
        for li_data in li_list:
            item = TaocheItem()
            # title and basic listing data
            title = li_data.xpath('./div[@class="gongge_main"]//span/text()')[0]
            reg_date = li_data.xpath('./div[@class="gongge_main"]/p/i[1]/text()')[0]
            mile = li_data.xpath('./div[@class="gongge_main"]/p/i[2]/text()')[0]
            city_name = tree.xpath('//div[@class="nav_statusMain"]//a[2]/text()')[0]
            price = li_data.xpath('.//div[@class="price"]//i[@class="Total brand_col"]/text()')[0]
            try:
                all_price = li_data.xpath('.//div[@class="price"]//i[@class="onepaynor"]/text()')[0]
            except IndexError:
                all_price = li_data.xpath('.//div[@class="price"]//i[@class="original"]/text()')[0]
            # url of the detail page (the href is protocol-relative, so prepend the scheme)
            base_url = li_data.xpath('.//div[@class="item_img"]/a/@href')[0]
            detail_url = 'https:' + base_url
            item['title'] = title
            item['reg_date'] = reg_date
            item['mile'] = mile
            item['city_name'] = city_name
            item['price'] = price
            item['all_price'] = all_price
            item['detail_url'] = detail_url
            yield scrapy.Request(url=detail_url, callback=self.parse_detail,
                                 meta={'item': item}, dont_filter=True)

        # follow the "next page" link if there is one
        page_next = tree.xpath('//a[@class="pages-next"]')
        if page_next:
            next_url = 'http:' + tree.xpath('//a[@class="pages-next"]/@href')[0]
            yield scrapy.Request(next_url, callback=self.parse,
                                 encoding='utf-8', dont_filter=True)

    def parse_detail(self, response):
        item = response.meta["item"]
        print(response.url)
        tree = etree.HTML(response.body.decode('utf-8'))
        # first picture
        pic = tree.xpath('//div[@class="taoche-details-xs-picbox"]'
                         '//ul[@id="taoche-details-xs-pic"]//li[1]/img/@data-src')[0]
        # engine displacement
        displace = tree.xpath('//div[@class="summary-attrs"]//dl[3]/dd/text()')[0]
        # car source id, keeping only the part after the colon
        source_id = tree.xpath('//span[@class="car-number"]/text()')[0]
        source_id = source_id.split(':')[-1]
        item["pic"] = pic
        item["displace"] = displace
        item["source_id"] = source_id
        item["name"] = '天主極樂大帝'  # fixed marker string from the original spider
        yield item
```
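Note the difference from a plain scrapy.Spider: a RedisSpider has no start_urls of its own. Each worker blocks on the list named by redis_key ("taoche:start_urls") and pops URLs that the master pushed there, while the shared scheduler and dupefilter in Redis keep the workers from crawling the same page twice.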
3. items.py

```python
import scrapy


class TaocheItem(scrapy.Item):
    # listing-page fields
    title = scrapy.Field()       # title
    reg_date = scrapy.Field()    # registration date
    mile = scrapy.Field()        # mileage
    city_name = scrapy.Field()   # city name
    price = scrapy.Field()       # discounted price
    all_price = scrapy.Field()   # full price

    # detail-page fields
    detail_url = scrapy.Field()  # detail-page url
    pic = scrapy.Field()         # picture
    displace = scrapy.Field()    # engine displacement
    source_id = scrapy.Field()   # car source id
    name = scrapy.Field()        # fixed tag set by the spider
```
4. pipelines.py

```python
import pymongo


class TaochePipeline(object):
    def __init__(self):
        # connect to the MongoDB host and create a client object
        self.client = pymongo.MongoClient('10.10.21.13', port=27017)
        # select the database
        self.db = self.client['taoche']
        # select the collection
        self.collection = self.db['taoche']

    def process_item(self, item, spider):
        # store one document per scraped car
        self.collection.insert_one(dict(item))
        return item  # hand the item on to any later pipelines
```
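Every worker runs this pipeline against the same MongoDB host, so results from all machines land in one collection. A quick sanity check, sketched under the assumption that the machine running it can reach 10.10.21.13:

```python
import pymongo

# connect to the shared MongoDB and count the scraped items
client = pymongo.MongoClient('10.10.21.13', port=27017)
print(client['taoche']['taoche'].count_documents({}))
```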
III. MongoDB host configuration

1. Edit the mongo.config file
Add the following to that file.
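A minimal sketch, assuming the classic key = value mongod config format; the point is to let mongod accept connections from the worker machines rather than from localhost only:

```
# mongo.config - assumed contents, not reproduced from the original post
bind_ip = 0.0.0.0   # listen on all interfaces instead of 127.0.0.1 only
port = 27017
```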