Checking a Python Queue for empty: a multi-threaded crawler example
Scraping the cnblogs article list
Scrape the article list from cnblogs, assuming the page URL is https://www.cnblogs.com/loaderman
Requirements: fetch the page with requests, extract the data with XPath / re, and save each post's title, description, link, and date to a JSON file.
Example:
# -*- coding:utf-8 -*-
import urllib2
import json
from lxml import etree

url = "https://www.cnblogs.com/loaderman/"
headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"}

request = urllib2.Request(url, headers=headers)
html = urllib2.urlopen(request).read()

# The response is a string; parse it into an HTML DOM
text = etree.HTML(html)

# Return every matching node; contains() does a fuzzy match:
# the first argument is the attribute to match, the second a substring of its value
node_list = text.xpath('//div[contains(@class, "post")]')

for each in node_list:
    title = each.xpath(".//h2/a[@class='postTitle2']/text()")[0]
    detailUrl = each.xpath(".//a[@class='postTitle2']/@href")[0]
    content = each.xpath(".//div[@class='c_b_p_desc']/text()")[0]
    date = each.xpath(".//p[@class='postfoot']/text()")[0]
    items = {
        "title": title,
        "link": detailUrl,
        "content": content,
        "date": date,
    }
    # Append one JSON object per post
    with open("loaderman.json", "a") as f:
        f.write(json.dumps(items, ensure_ascii=False).encode("utf-8") + "\n")
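The requirements above name requests, while this example sticks with urllib2. For reference, a minimal sketch of the same fetch using requests (same URL and headers, assuming Python 2 as in the rest of the article):

import requests
from lxml import etree

url = "https://www.cnblogs.com/loaderman/"
headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"}

html = requests.get(url, headers=headers).text   # requests builds the request and decodes the body
text = etree.HTML(html)                          # parse into an HTML DOM exactly as before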
Result: (screenshot omitted)
Multi-threaded crawler example
Queue is a Python standard-library module and can be pulled in directly with import Queue (Python 2; it was renamed to queue in Python 3). A queue is the most common way to exchange data between threads.
Some thoughts on multithreading in Python
Locking shared resources is an important point: Python's built-in list, dict, and so on are not thread-safe, whereas Queue is. When a queue fits the use case, prefer it over manual locking, as the sketch below illustrates.
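A minimal sketch of that claim (assumes Python 2, matching the article): four threads push into one shared Queue with no explicit lock, and nothing is lost.

import threading
from Queue import Queue

q = Queue()

def producer(n):
    # Queue.put() does its own internal locking, so no lock is needed here
    for i in range(n):
        q.put(i)

threads = [threading.Thread(target=producer, args=(1000,)) for _ in range(4)]
for t in threads:
    t.start()
for t in threads:
    t.join()

print q.qsize()   # 4000: all items arrived, with no lock in our own code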
Initialization: class Queue.Queue(maxsize), FIFO (first in, first out)
Commonly used methods:
Queue.qsize() returns the current size of the queue
Queue.empty() returns True if the queue is empty, otherwise False
Queue.full() returns True if the queue is full, otherwise False
Queue.full() corresponds to the maxsize bound
Queue.get([block[, timeout]]) takes an item off the queue; timeout is how long to wait for one
Create a queue object:
import Queue
myqueue = Queue.Queue(maxsize = 10)
Put a value into the queue:
myqueue.put(10)
Take a value out of the queue:
myqueue.get()
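A short sketch tying the status checks together (Python 2's Queue module; note that a non-blocking get() on an empty queue raises Queue.Empty rather than waiting):

from Queue import Queue, Empty

myqueue = Queue(maxsize=2)
print myqueue.empty()    # True: nothing has been put in yet
myqueue.put(1)
myqueue.put(2)
print myqueue.full()     # True: qsize() has reached maxsize
print myqueue.qsize()    # 2

myqueue.get()
myqueue.get()
try:
    myqueue.get(block=False)   # non-blocking get on an empty queue
except Empty:
    print "queue is empty"     # raised instead of blocking forever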
Diagram: (image omitted; the flow is that crawl threads take page numbers from pageQueue and push each page's HTML into dataQueue, while parse threads take HTML from dataQueue and write the extracted items to JSON)
# -*- coding:utf-8 -*-
# Threading library
import threading
# Thread-safe queue (Python 2 module name)
from Queue import Queue, Empty
# HTML parsing
from lxml import etree
# HTTP requests
import requests
# JSON handling
import json
import time


class ThreadCrawl(threading.Thread):
    def __init__(self, threadName, pageQueue, dataQueue):
        # Call the parent class initializer
        super(ThreadCrawl, self).__init__()
        # Thread name
        self.threadName = threadName
        # Queue of page numbers
        self.pageQueue = pageQueue
        # Queue of fetched page HTML
        self.dataQueue = dataQueue
        # Request headers
        self.headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"}

    def run(self):
        print "Starting " + self.threadName
        while not CRAWL_EXIT:
            try:
                # Take one page number, first in first out.
                # The optional block argument defaults to True:
                # 1. If the queue is empty and block is True, get() blocks until new data arrives.
                # 2. If the queue is empty and block is False, get() raises Queue.Empty immediately.
                page = self.pageQueue.get(False)
                url = "https://www.cnblogs.com/loaderman/default.html?page=" + str(page)
                content = requests.get(url, headers=self.headers).text
                # Be polite: pause between requests
                time.sleep(1)
                self.dataQueue.put(content)
            except Empty:
                pass
        print "Exiting " + self.threadName


class ThreadParse(threading.Thread):
    def __init__(self, threadName, dataQueue, output, lock):
        super(ThreadParse, self).__init__()
        # Thread name
        self.threadName = threadName
        # Queue of fetched page HTML
        self.dataQueue = dataQueue
        # Shared file object the parsed data is written to
        self.output = output
        # Lock serializing writes to the shared file
        self.lock = lock

    def run(self):
        print "Starting " + self.threadName
        while not PARSE_EXIT:
            try:
                html = self.dataQueue.get(False)
                self.parse(html)
            except Empty:
                pass
        print "Exiting " + self.threadName

    def parse(self, html):
        # Parse into an HTML DOM
        html = etree.HTML(html)
        node_list = html.xpath('//div[contains(@class, "post")]')
        for each in node_list:
            title = each.xpath(".//h2/a[@class='postTitle2']/text()")[0]
            detailUrl = each.xpath(".//a[@class='postTitle2']/@href")[0]
            content = each.xpath(".//div[@class='c_b_p_desc']/text()")[0]
            date = each.xpath(".//p[@class='postfoot']/text()")[0]
            items = {
                "title": title,
                "link": detailUrl,
                "content": content,
                "date": date,
            }
            # The file is shared by all parse threads, so write under the lock
            with self.lock:
                self.output.write(json.dumps(items, ensure_ascii=False).encode("utf-8") + "\n")
CRAWL_EXIT = False
PARSE_EXIT = False


def main():
    # Queue of page numbers, 20 pages in total
    pageQueue = Queue(20)
    # Put the numbers 1 to 20 in, first in first out
    for i in range(1, 21):
        pageQueue.put(i)
    # Queue for the crawl results (each page's HTML source); no size argument means unbounded
    dataQueue = Queue()
    # File all parse threads append to
    output = open("loadermanThread.json", "a")
    # Create the lock protecting the shared output file
    lock = threading.Lock()

    # Names of the three crawl threads
    crawlList = ["Crawl thread 1", "Crawl thread 2", "Crawl thread 3"]
    # Keep the crawl threads in a list
    threadcrawl = []
    for threadName in crawlList:
        thread = ThreadCrawl(threadName, pageQueue, dataQueue)
        thread.start()
        threadcrawl.append(thread)

    # Names of the three parse threads
    parseList = ["Parse thread 1", "Parse thread 2", "Parse thread 3"]
    # Keep the parse threads in a list
    threadparse = []
    for threadName in parseList:
        thread = ThreadParse(threadName, dataQueue, output, lock)
        thread.start()
        threadparse.append(thread)

    # Busy-wait until pageQueue is empty, i.e. every page number has been taken
    while not pageQueue.empty():
        pass
    # Once pageQueue is empty, let the crawl threads leave their loops
    global CRAWL_EXIT
    CRAWL_EXIT = True
    print "pageQueue is empty"
    for thread in threadcrawl:
        thread.join()

    # Likewise wait until every fetched page has been parsed
    while not dataQueue.empty():
        pass
    global PARSE_EXIT
    PARSE_EXIT = True
    for thread in threadparse:
        thread.join()

    with lock:
        # Close the output file
        output.close()
    print "Done. Thanks for reading!"


if __name__ == "__main__":
    main()
Result: (screenshot omitted)
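One possible refinement: the global exit flags and busy-wait loops above work, but the Queue module also provides task_done() and join(), which let the queue itself track completion. A minimal sketch of that pattern (a hypothetical rewrite, not the article's code; the worker body stands in for the fetch-and-parse work):

import threading
from Queue import Queue

def worker(pageQueue):
    while True:
        page = pageQueue.get()       # blocking get: sleeps until an item arrives
        try:
            print "handling page %d" % page   # ... fetch and parse would go here ...
        finally:
            pageQueue.task_done()    # mark this item as fully processed

pageQueue = Queue()
for i in range(1, 21):
    pageQueue.put(i)

for _ in range(3):
    t = threading.Thread(target=worker, args=(pageQueue,))
    t.setDaemon(True)                # daemon threads die with the main thread
    t.start()

pageQueue.join()                     # blocks until every put() item got task_done()
print "all pages handled"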