Scrapy Framework: Middleware, Signals, and Custom Commands
Middleware
Downloader Middleware
Writing a middleware
from scrapy.http import HtmlResponse
from scrapy.http import Request


class Md1(object):
    @classmethod
    def from_crawler(cls, crawler):
        # This method gives access to the current crawler
        s = cls()
        return s

    def process_request(self, request, spider):
        print('md1.process_request', request)
        # Returning None lets the next middleware's process_request run;
        # any other return value changes the flow:
        #
        # 1. Return a Response
        #    Execution jumps to the process_response chain, starting from the
        #    last downloader middleware, e.g.:
        #    import requests
        #    result = requests.get(request.url)
        #    return HtmlResponse(url=request.url, status=200, headers=None, body=result.content)
        #
        # 2. Return a Request
        #    The current request is ignored and the new request goes back to
        #    the scheduler, effectively creating a new task, e.g.:
        #    return Request('https://dig.chouti.com/r/tec/hot/1')
        #
        # 3. Raise an exception
        #    A raised exception must be caught by a process_exception method,
        #    otherwise an error is reported, e.g.:
        #    from scrapy.exceptions import IgnoreRequest
        #    raise IgnoreRequest
        #
        # 4. Modify the request (*)
        #    The most common case: tweak the request, then continue without
        #    returning anything, e.g.:
        #    request.headers['user-agent'] = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"
        #    return None
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Must either:
        # - return a Response object   (replaces the current response)
        # - return a Request object    (abandons the response, schedules a new task)
        # - or raise IgnoreRequest     (treated as an error for this request)
        print('m1.process_response', request, response)
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        #   (usually returning None is all we need)
        # - return a Response object: stops the process_exception() chain;
        #   the response is handed on to the process_response chain
        # - return a Request object: stops the process_exception() chain;
        #   the current task is dropped and the new request is scheduled
        pass

Configuration file
DOWNLOADER_MIDDLEWARES = {
    # 'xdb.middlewares.XdbDownloaderMiddleware': 543,
    # 'xdb.proxy.XdbProxyMiddleware': 751,
    'xdb.md.Md1': 666,   # priorities still run 0-1000; lower values sit closer to the engine and run first
    'xdb.md.Md2': 667,
}

Execution order walkthrough
When the scheduler hands a request to the downloader, each middleware's process_request runs first (from the first middleware to the last), and the return value decides where the flow goes next:
- return None: continue with the next middleware's process_request
- return a Response: jump into the process_response chain, starting at the last downloader middleware
- return a Request: go back to the scheduler and start it as a new task
- raise an exception: the process_exception chain handles it

When the downloader hands the response back towards the spider, each middleware's process_response runs (from the last middleware back to the first), and again the return value decides the direction:
- return a Response: it replaces the current response and continues into the previous middleware's process_response
- return a Request: abandon the current task and send the new request back to the scheduler
- raise IgnoreRequest: the request is dropped and its errback is called
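The settings above also register 'xdb.md.Md2' at priority 667, but only Md1 is shown. A minimal sketch — assuming Md2 lives in xdb/md.py alongside Md1 — that just logs each hook is enough to watch the ordering described here:

class Md2(object):

    def process_request(self, request, spider):
        # Runs after Md1.process_request (667 comes after 666 on the way out).
        print('md2.process_request', request)
        return None

    def process_response(self, request, response, spider):
        # Runs before Md1.process_response on the way back to the spider.
        print('md2.process_response', request, response)
        return response

    def process_exception(self, request, exception, spider):
        # Leave the exception to other middlewares / the default handling.
        return None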
Use case: random User-Agent
An open-source component (fake-useragent) handles this. Import it:

from fake_useragent import UserAgent

Set the selection mode in the settings file:

RANDOM_UA_TYPE = "random"

Then build a middleware that picks the mode from the settings:
class RandomUserAgentMiddlware(object):
    # Rotate the User-Agent header randomly
    def __init__(self, crawler):
        super(RandomUserAgentMiddlware, self).__init__()
        self.ua = UserAgent()
        self.ua_type = crawler.settings.get("RANDOM_UA_TYPE", "random")

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_request(self, request, spider):
        def get_ua():
            return getattr(self.ua, self.ua_type)

        request.headers.setdefault('User-Agent', get_ua())
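The middleware only takes effect once it is enabled in the settings. The module path below is an assumption — adjust it to wherever the class actually lives; disabling Scrapy's built-in UserAgentMiddleware is the usual companion step so the default header is not set first:

# settings.py -- 'xdb.md.RandomUserAgentMiddlware' is a placeholder path
DOWNLOADER_MIDDLEWARES = {
    'xdb.md.RandomUserAgentMiddlware': 543,
    # the built-in middleware would otherwise set a default User-Agent before ours runs
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}
RANDOM_UA_TYPE = "random"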
Use case: proxy IPs
Write a script that crawls the Xici (xicidaili) free proxy list and stores the results in a database:
# -*- coding: utf-8 -*-
import requests
from scrapy.selector import Selector
import pymysql

conn = pymysql.connect(host="127.0.0.1", user="root", passwd="root", db="article_spider", charset="utf8")
cursor = conn.cursor()


def crawl_ips():
    # Crawl the free proxies listed on xicidaili
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0"}
    for i in range(1568):
        re = requests.get("http://www.xicidaili.com/nn/{0}".format(i), headers=headers)

        selector = Selector(text=re.text)
        all_trs = selector.css("#ip_list tr")

        ip_list = []
        for tr in all_trs[1:]:
            speed_str = tr.css(".bar::attr(title)").extract()[0]
            if speed_str:
                speed = float(speed_str.split("秒")[0])
            all_texts = tr.css("td::text").extract()

            ip = all_texts[0]
            port = all_texts[1]
            proxy_type = all_texts[5]

            ip_list.append((ip, port, proxy_type, speed))

        for ip_info in ip_list:
            cursor.execute(
                "insert proxy_ip(ip, port, speed, proxy_type) VALUES('{0}', '{1}', {2}, 'HTTP')".format(
                    ip_info[0], ip_info[1], ip_info[3]
                )
            )
            conn.commit()


class GetIP(object):
    def delete_ip(self, ip):
        # Remove an invalid ip from the database
        delete_sql = """
            delete from proxy_ip where ip='{0}'
        """.format(ip)
        cursor.execute(delete_sql)
        conn.commit()
        return True

    def judge_ip(self, ip, port):
        # Check whether the ip/port combination is usable
        http_url = "http://www.baidu.com"
        proxy_url = "http://{0}:{1}".format(ip, port)
        try:
            proxy_dict = {
                "http": proxy_url,
            }
            response = requests.get(http_url, proxies=proxy_dict)
        except Exception as e:
            print("invalid ip and port")
            self.delete_ip(ip)
            return False
        else:
            code = response.status_code
            if code >= 200 and code < 300:
                print("effective ip")
                return True
            else:
                print("invalid ip and port")
                self.delete_ip(ip)
                return False

    def get_random_ip(self):
        # Fetch one random usable ip from the database
        random_sql = """
            SELECT ip, port FROM proxy_ip
            ORDER BY RAND()
            LIMIT 1
        """
        result = cursor.execute(random_sql)
        for ip_info in cursor.fetchall():
            ip = ip_info[0]
            port = ip_info[1]

            judge_re = self.judge_ip(ip, port)
            if judge_re:
                return "http://{0}:{1}".format(ip, port)
            else:
                return self.get_random_ip()


# print(crawl_ips())
if __name__ == "__main__":
    get_ip = GetIP()
    get_ip.get_random_ip()

Set up a middleware that calls this script to assign a proxy IP
class RandomProxyMiddleware(object):
    # Dynamically assign a proxy ip to every request
    def process_request(self, request, spider):
        get_ip = GetIP()
        request.meta["proxy"] = get_ip.get_random_ip()
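Registration works the same way as before; a sketch with the module path again assumed, keeping the priority just below Scrapy's built-in HttpProxyMiddleware (750), which also deals with request.meta['proxy']:

# settings.py -- module path is an assumption; adjust to your project layout
DOWNLOADER_MIDDLEWARES = {
    'xdb.md.RandomProxyMiddleware': 749,  # before the built-in HttpProxyMiddleware at 750
}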
Spider Middleware
Writing a middleware
class Sd1(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    # Runs only once, when the spider starts.
    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

Configuration file
SPIDER_MIDDLEWARES = {
    # 'xdb.middlewares.XdbSpiderMiddleware': 543,
    'xdb.sd.Sd1': 666,   # same 0-1000 priority mechanism as the downloader middlewares
    'xdb.sd.Sd2': 667,
}

Execution flow
1. When the spider file first starts and its requests have been built, process_start_requests runs before they are passed up to the engine.
2. The engine hands the wrapped requests to the scheduler.
3. The scheduler carries on and passes them to the downloader.
4. After downloading, the downloader gives the content back to the engine.
5. When the engine passes the response on to the spider file, process_spider_input runs.
6. If the spider file yields anything after processing, process_spider_output runs on the way back to the engine (the Sd2 sketch below logs each of these hooks).
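The configuration above also lists 'xdb.sd.Sd2', which is never shown. A minimal sketch — assumed to live in xdb/sd.py next to Sd1 — that just logs each hook makes the flow above easy to observe:

class Sd2(object):

    def process_spider_input(self, response, spider):
        # Step 5: runs as the response enters the spider.
        print('sd2.process_spider_input', response)
        return None

    def process_spider_output(self, response, result, spider):
        # Step 6: runs on whatever the spider yields.
        print('sd2.process_spider_output', response)
        for i in result:
            yield i

    def process_start_requests(self, start_requests, spider):
        # Step 1: runs once on the start requests.
        print('sd2.process_start_requests')
        for r in start_requests:
            yield r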
Applications
- Depth (see the sketch after this list)
- Priority
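As an illustration of the depth use case, here is a minimal, hedged sketch of a spider middleware that tracks and limits crawl depth through request.meta. It mirrors the idea behind Scrapy's built-in DepthMiddleware rather than reproducing it, and the names (DepthLimitMiddleware, MAX_DEPTH) are made up for the example; it would be registered in SPIDER_MIDDLEWARES just like Sd1.

from scrapy.http import Request


class DepthLimitMiddleware(object):
    # Hypothetical spider middleware: drop requests that go deeper than MAX_DEPTH.
    MAX_DEPTH = 3

    def process_spider_output(self, response, result, spider):
        current_depth = response.meta.get('depth', 0)
        for item in result:
            if isinstance(item, Request):
                # Each follow-up request is one level deeper than the response it came from.
                item.meta['depth'] = current_depth + 1
                if item.meta['depth'] > self.MAX_DEPTH:
                    continue  # too deep -- silently drop it
            yield item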
Signals
Signals let you hook into the slots the framework reserves and attach your own custom behaviour.
Usage example
from scrapy import signals


class MyExtend(object):
    def __init__(self):
        pass

    @classmethod
    def from_crawler(cls, crawler):
        self = cls()
        # Bind the functions to run when the signal fires
        crawler.signals.connect(self.x1, signal=signals.spider_opened)
        crawler.signals.connect(self.x2, signal=signals.spider_closed)
        return self

    def x1(self, spider):
        print('open')

    def x2(self, spider):
        print('close')


# Available signal types -- see `from scrapy import signals`:
engine_started = object()
engine_stopped = object()

spider_opened = object()
spider_idle = object()
spider_closed = object()
spider_error = object()

request_scheduled = object()
request_dropped = object()
response_received = object()
response_downloaded = object()

item_scraped = object()
item_dropped = object()
# settings.py
EXTENSIONS = {
    'xdb.ext.MyExtend': 666,
}
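The same connect() pattern works for any signal in the list above. For instance, a sketch of an extension counting scraped items via item_scraped (whose handler, as far as I recall, receives item, response and spider — verify against the Scrapy docs); it would be enabled through EXTENSIONS just like MyExtend:

from scrapy import signals


class ItemCounter(object):
    # Hypothetical extension: count items as they are scraped.
    def __init__(self):
        self.count = 0

    @classmethod
    def from_crawler(cls, crawler):
        ext = cls()
        crawler.signals.connect(ext.item_scraped, signal=signals.item_scraped)
        crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
        return ext

    def item_scraped(self, item, response, spider):
        self.count += 1

    def spider_closed(self, spider):
        print('total items scraped:', self.count)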
Custom Commands
Running a single spider
import sys
from scrapy.cmdline import execute

if __name__ == '__main__':
    execute(["scrapy", "crawl", "chouti", "--nolog"])

Running all spiders
- Create a directory (any name, e.g. commands) at the same level as spiders
- Create a crawlall.py file inside it (this file name becomes the custom command name)
- Add COMMANDS_MODULE = '<project name>.<directory name>' to settings.py
- Run the command from the project directory: scrapy crawlall

# crawlall.py

from scrapy.commands import ScrapyCommand
from scrapy.utils.project import get_project_settings


class Command(ScrapyCommand):
    requires_project = True

    def syntax(self):
        return '[options]'

    def short_desc(self):
        return 'Runs all of the spiders'

    def run(self, args, opts):
        spider_list = self.crawler_process.spiders.list()
        for name in spider_list:
            self.crawler_process.crawl(name, **opts.__dict__)
        self.crawler_process.start()

# settings.py
COMMANDS_MODULE = "xdb.commands"
Pausing and Resuming a Crawl
How it works
Pausing and resuming a crawl relies on state files kept on disk.
You pick a directory path in the launch command (JOBDIR).
Different spiders must not share the same directory;
if the same spider is started again with the same directory, it resumes crawling from the state saved there last time.
The crawl is interrupted with Ctrl+C on Windows, or by killing the process on Linux — ideally a single Ctrl+C / SIGTERM so Scrapy can shut down gracefully and persist its state.
Because of that, the interrupt cannot be triggered from inside PyCharm; it has to be handled on the command line.
scrapy crawl lagou -s JOBDIR=job_info/001

Via the settings file
The JOBDIR path can also be set in settings.py,
which makes it a global setting:

JOBDIR = "job_info/001"

Or set it on an individual spider class:
custom_settings = {"JOBDIR": "job_info/001"}
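Put together, a spider using this per-spider setting might look like the following sketch; the spider name lagou is taken from the command above, and everything else is illustrative:

import scrapy


class LagouSpider(scrapy.Spider):
    name = "lagou"
    start_urls = ["https://www.lagou.com/"]
    # Per-spider setting: persist scheduler state under job_info/001 so the
    # crawl can be resumed after an interrupt.
    custom_settings = {"JOBDIR": "job_info/001"}

    def parse(self, response):
        pass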
Summary
As noted above, though, the crawl cannot be interrupted from inside PyCharm, so this setting alone is of limited use;
the command-line approach is still the only real option.
Reposted from: https://www.cnblogs.com/shijieli/p/10358611.html