Crawler Case Study: Scraping Trousers Listings from JD.com
This post walks through a small crawler case study: using Scrapy to collect trousers (長褲) listings from JD.com. It is shared here for reference.
1. Create the Scrapy project
Use the global command startproject to create the project: make a new folder, change into it, and create a Scrapy project named jingdong.
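The command snippet in the original post did not survive; assuming a standard Scrapy install and an arbitrary working-directory name (jd_spider below is just a placeholder), it would have been along these lines:

mkdir jd_spider
cd jd_spider
scrapy startproject jingdong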
2. Create a Spider with the project command genspider
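Again, the original command snippet is missing. Given that the spider below is named jd and restricted to www.jd.com, the command was presumably something like:

cd jingdong
scrapy genspider jd www.jd.com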
3. Send requests, receive responses, and extract the data
# -*- coding: utf-8 -*-
import scrapy
from jingdong.items import JingdongItem


class JdSpider(scrapy.Spider):
    name = "jd"
    allowed_domains = ["www.jd.com"]
    start_urls = ['http://www.jd.com/']
    search_url1 = 'https://search.jd.com/Search?keyword={key}&enc=utf-8&page={page}'
    # search_url2 = 'https://search.jd.com/s_new.php?keyword={key}&enc=utf-8&page={page}&scrolling=y&pos=30&show_items={goods_items}'
    search_url2 = 'https://search.jd.com/s_new.php?keyword={key}&enc=utf-8&page={page}&s=26&scrolling=y&pos=30&tpl=3_L&show_items={goods_items}'
    shop_url = 'http://mall.jd.com/index-{shop_id}.html'

    def start_requests(self):
        key = '長褲'  # search keyword: '長褲' (trousers)
        for num in range(1, 100):
            page1 = str(2 * num - 1)  # build the page numbers: odd page = first half of a results page
            page2 = str(2 * num)      # even page = lazily loaded second half
            yield scrapy.Request(url=self.search_url1.format(key=key, page=page1),
                                 callback=self.parse, dont_filter=True)
            yield scrapy.Request(url=self.search_url1.format(key=key, page=page1),
                                 callback=self.get_next_half,
                                 meta={'page2': page2, 'key': key}, dont_filter=True)

    def get_next_half(self, response):
        try:
            items = response.xpath('//*[@id="J_goodsList"]/ul/li/@data-pid').extract()
            key = response.meta['key']
            page2 = response.meta['page2']
            goods_items = ','.join(items)
            # dont_filter=True is needed here, otherwise Scrapy drops the request;
            # officially this is because it conflicts with allowed_domains
            yield scrapy.Request(url=self.search_url2.format(key=key, page=page2, goods_items=goods_items),
                                 callback=self.next_parse, dont_filter=True)
        except Exception as e:
            print('no data')

    def parse(self, response):
        all_goods = response.xpath('//div[@id="J_goodsList"]/ul/li')
        for one_good in all_goods:
            item = JingdongItem()
            try:
                data = one_good.xpath('div/div/a/em')
                item['title'] = data.xpath('string(.)').extract()[0]  # all of the text inside this tag
                item['comment_count'] = one_good.xpath('div/div[@class="p-commit"]/strong/a/text()').extract()[0]  # comment count
                item['goods_url'] = 'http:' + one_good.xpath('div/div[4]/a/@href').extract()[0]  # product link
                item['shops_id'] = one_good.xpath('div/div[@class="p-shop"]/@data-shopid').extract()[0]  # shop ID
                item['shop_url'] = self.shop_url.format(shop_id=item['shops_id'])
                goods_id = one_good.xpath('div/div[2]/div/ul/li[1]/a/img/@data-sku').extract()[0]
                if goods_id:
                    item['goods_id'] = goods_id
                price = one_good.xpath('div/div[3]/strong/i/text()').extract()  # price
                if price:
                    # some items have 0 comments and no price in the page source; they seem to be
                    # temporary front-page promotions, three or four per page, so we skip them
                    item['price'] = price[0]
                    # print(item)
                    yield item
            except Exception as e:
                pass

    def next_parse(self, response):
        all_goods = response.xpath('/html/body/li')
        for one_good in all_goods:
            item = JingdongItem()
            try:
                data = one_good.xpath('div/div/a/em')
                item['title'] = data.xpath('string(.)').extract()[0]  # all of the text inside this tag
                item['comment_count'] = one_good.xpath('div/div[@class="p-commit"]/strong/a/text()').extract()[0]  # comment count
                item['goods_url'] = 'http:' + one_good.xpath('div/div[4]/a/@href').extract()[0]  # product link
                item['shops_id'] = one_good.xpath('div/div[@class="p-shop"]/@data-shopid').extract()[0]  # shop ID
                item['shop_url'] = self.shop_url.format(shop_id=item['shops_id'])
                goods_id = one_good.xpath('div/div[2]/div/ul/li[1]/a/img/@data-sku').extract()[0]
                if goods_id:
                    item['goods_id'] = goods_id
                price = one_good.xpath('div/div[3]/strong/i/text()').extract()  # price
                if price:
                    # same as above: skip promotional items with no price in the source
                    item['price'] = price[0]
                    yield item
                    # print(item)
            except Exception as e:
                pass
                # print(e, 'no data')
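The spider imports JingdongItem from jingdong/items.py, which the original post never shows. A minimal definition consistent with the fields the spider sets would be:

# -*- coding: utf-8 -*-
# jingdong/items.py -- not in the original post; reconstructed from the fields used above
import scrapy


class JingdongItem(scrapy.Item):
    title = scrapy.Field()          # product title
    comment_count = scrapy.Field()  # number of comments
    goods_url = scrapy.Field()      # product page URL
    goods_id = scrapy.Field()       # SKU id (data-sku)
    shops_id = scrapy.Field()       # shop id (data-shopid)
    shop_url = scrapy.Field()       # shop page URL
    price = scrapy.Field()          # listed price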
4. Set up the pipeline to save the data: create the MySQL database and table
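The post does not include the SQL that creates the database or table. A one-off setup script whose column names match the pipeline's INSERT below might look like this; the column types and lengths are my own assumptions, not from the original:

# create_db.py -- hypothetical setup script; names match the pipeline, types are assumptions
import pymysql

conn = pymysql.connect(host='127.0.0.1', port=3306, user='root', passwd='mysql', charset='utf8')
cursor = conn.cursor()
cursor.execute("CREATE DATABASE IF NOT EXISTS jingdong DEFAULT CHARACTER SET utf8")
cursor.execute("""
    CREATE TABLE IF NOT EXISTS jingdong.jingdong_goods (
        title         VARCHAR(255),
        comment_count VARCHAR(50),
        shop_url      VARCHAR(255),
        price         VARCHAR(50),
        goods_url     VARCHAR(255),
        shops_id      VARCHAR(50),
        goods_id      BIGINT
    ) DEFAULT CHARSET=utf8
""")
conn.commit()
conn.close()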
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql
from pymongo import MongoClient


class JingdongPipeline(object):
    # MongoDB version, kept for reference:
    # def __init__(self):
    #     self.client = MongoClient()
    #     self.database = self.client['jingdong']
    #     self.db = self.database['jingdong_infomation']
    #
    # def process_item(self, item, spider):
    #     # upsert keyed on goods_id: update if it exists, otherwise insert
    #     self.db.update({'goods_id': item['goods_id']}, dict(item), True)
    #     return item
    #
    # def close_spider(self, spider):
    #     self.client.close()

    def __init__(self):
        self.conn = pymysql.connect(host='127.0.0.1', port=3306, user='root', passwd='mysql',
                                    db='jingdong', charset='utf8')
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        try:  # some titles repeat, so wrap everything in a try/except
            title = item['title']
            comment_count = item['comment_count']  # comment count
            shop_url = item['shop_url']            # shop link
            price = item['price']
            goods_url = item['goods_url']
            shops_id = item['shops_id']
            goods_id = int(item['goods_id'])
            # sql = 'insert into jingdong_goods(title,comment_count,shop_url,price,goods_url,shops_id) VALUES (%(title)s,%(comment_count)s,%(shop_url)s,%(price)s,%(goods_url)s,%(shops_id)s,)'
            try:
                self.cursor.execute(
                    "insert into jingdong_goods(title,comment_count,shop_url,price,goods_url,shops_id,goods_id)"
                    " values(%s,%s,%s,%s,%s,%s,%s)",
                    (title, comment_count, shop_url, price, goods_url, shops_id, goods_id))
                self.conn.commit()
            except Exception as e:
                pass
        except Exception as e:
            pass
        return item  # return the item so any later pipelines still receive it

    # def close_spider(self, spider):
    #     self.conn.close()
5. Configure settings
# -*- coding: utf-8 -*-

# Scrapy settings for jingdong project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'jingdong'

SPIDER_MODULES = ['jingdong.spiders']
NEWSPIDER_MODULE = 'jingdong.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'jingdong.middlewares.MyCustomSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'jingdong.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'jingdong.pipelines.JingdongPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
6. Run the crawl: execute the project command crawl to start the Spider
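The original command block is missing here as well; with the spider named jd, it is run from inside the project directory:

scrapy crawl jd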
Summary
That is the complete walkthrough for scraping trousers listings from JD.com; hopefully it helps you solve any problems you run into with similar crawls.