Multithreaded crawling of Baidu keyword search results, and retrieving the real URLs
Project purpose: practice.
Project requirement: given a set of keywords, query Baidu for each one and save the search results to a file.
Problems encountered:
1. Reading values from a Python list: when it is hard to see what a list actually contains, inspect it with for index, item in enumerate(array):
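For example (the sample list is purely illustrative):

array = ['<h3>…</h3>', '<div class="result">…</div>', None]  # made-up scraped items
for index, item in enumerate(array):
    print(index, repr(item))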
2. Picking out the desired elements; there are two ways:
One: tag.h3.a['href']
Two: tagh3 = result.find_all('h3'); for h3 in tagh3: href = h3.find('a').get('href')
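A tiny sketch of both selection styles on a made-up fragment (the real Baidu markup is much noisier; result below is just the BeautifulSoup tree of that fragment):

from bs4 import BeautifulSoup

html = '<div class="result c-container"><h3><a href="http://example.com/page">some title</a></h3></div>'
result = BeautifulSoup(html, 'lxml')

# way one: attribute-style navigation from a result <div>
tag = result.find('div', 'result c-container')
print(tag.h3.a['href'], tag.h3.a.text)

# way two: find_all('h3') and drill down from each heading
for h3 in result.find_all('h3'):
    href = h3.find('a').get('href')
    print(href)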
3. Building the request URLs:
out_url = [(key, page, "https://www.baidu.com/s?wd={}&pn={}".format(key, page * 10),) for key in keys for page in range(pages)]
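For instance, with one keyword and two pages the comprehension yields one (key, page, url) tuple per result page; Baidu paginates in steps of 10 through the pn parameter:

keys, pages = ['減肥'], 2
out_url = [(key, page, "https://www.baidu.com/s?wd={}&pn={}".format(key, page * 10),)
           for key in keys for page in range(pages)]
# [('減肥', 0, 'https://www.baidu.com/s?wd=減肥&pn=0'),
#  ('減肥', 1, 'https://www.baidu.com/s?wd=減肥&pn=10')]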
4. Filtering Baidu's own properties out of the results (a combined sketch follows item 5):
title = tag.h3.a.text; if '百度' in title: break
5. Removing Baidu's aggregated blocks, such as the "video collection" panels:
if not href.startswith('http'): break
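Put together, the two filters from items 4 and 5 sit at the top of the result loop; a minimal sketch, assuming allTags was collected as in the full listing further down:

for tag in allTags:
    href = tag.h3.a['href']
    title = tag.h3.a.text
    if '百度' in title:              # item 4: skip Baidu's own properties (Baike, Tieba, ...)
        break
    if not href.startswith('http'):  # item 5: skip aggregated blocks whose link is not a plain http URL
        break
    ...                              # item 6: resolve the real URL here

Note that break abandons the rest of that page's results once a filtered entry is hit, which is what the original code does.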
6. Getting the real URL behind a Baidu result link:
baidu_url = requests.get(url=href, headers=myhead, allow_redirects=False)
real_url = baidu_url.headers['Location']  # the page's original address
if real_url.startswith('http'):
allow_redirects=False is the key point: it stops requests from following the 302 redirect, so the target address can be read from the Location header. A small sketch follows.
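A self-contained sketch of the resolution step (the redirect link below is made up; in practice the full myhead dict from the listing should be used):

import requests

myhead = {'User-Agent': 'Mozilla/5.0'}  # trimmed header dict, for illustration only
href = 'http://www.baidu.com/link?url=abcdefg'  # hypothetical redirect link taken from a result page
resp = requests.get(url=href, headers=myhead, allow_redirects=False)  # do not follow the 302
if resp.status_code == 302 and 'Location' in resp.headers:
    real_url = resp.headers['Location']  # the target site's own address
    if real_url.startswith('http'):
        print(real_url)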
7. Passing tasks and results between threads (a combined sketch follows item 8):
self.work_queue = Queue()  # task queue
self.result_queue = Queue()  # result queue
8. Threads hanging: the worker loop must be written as while not self.work_queue.empty():, never as while True:.
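A minimal self-contained sketch of the queue pattern from items 7 and 8; the squaring job merely stands in for the real crawl function:

import threading
from queue import Queue, Empty

work_queue = Queue()    # task queue (item 7)
result_queue = Queue()  # result queue (item 7)
for n in range(20):
    work_queue.put(n)

def worker():
    # Drain the queue and then return (item 8); a while True loop here would never
    # return, so the join() below would hang -- the "threads freezing" problem.
    while not work_queue.empty():
        try:
            n = work_queue.get(block=False)
        except Empty:
            break
        result_queue.put(n * n)   # stand-in for the real crawl job
        work_queue.task_done()

threads = [threading.Thread(target=worker) for _ in range(4)]
for t in threads:
    t.start()
for t in threads:
    t.join()
print(result_queue.qsize())  # 20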
9. That is everything. The full code follows; it was tweaked slightly to make debugging easier, and the comments explain the details.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import time
import threading
from queue import Queue

import requests
from bs4 import BeautifulSoup
from retrying import retry

lock = threading.RLock()


class WorkManager(object):
    def __init__(self, do_job, works, thread_num=25):
        self.job = do_job
        self.work_queue = Queue()    # task queue
        self.result_queue = Queue()  # result queue
        self.threads = []
        self.__init_work_queue(works)
        self.__init_thread_pool(thread_num)

    # initialise the work queue: enqueue every job
    def __init_work_queue(self, works):
        for item in works:
            # print('__init_work_queue item:', item)  # the argument tuple
            self.work_queue.put((self.job, item))  # put the task function and its arguments into the queue

    # initialise the threads; thread_num limits how many run concurrently
    def __init_thread_pool(self, thread_num):
        for i in range(thread_num):
            self.threads.append(Work(self.work_queue, self.result_queue))

    # wait for all threads to finish
    def wait_allcomplete(self):
        '''
        @description: wait for the threads to finish and collect their results
        @return: result_list
        '''
        for item in self.threads:
            if item.is_alive():
                item.join()
        result_list = []
        for i in range(self.result_queue.qsize()):
            res = self.result_queue.get()
            # print('wait_allcomplete:', res)
            result_list.append(res)
        return result_list


class Work(threading.Thread):
    def __init__(self, work_queue, result_queue):
        threading.Thread.__init__(self)
        self.work_queue = work_queue
        self.result_queue = result_queue
        self.start()  # start the thread

    def run(self):
        # do NOT use an endless loop here
        while not self.work_queue.empty():
            try:
                do, args = self.work_queue.get(block=False)  # dequeue a task without blocking
                # print('Work args:', args)  # args is a list or tuple; check it here
                result = do(*args)  # unpack the list/tuple into positional arguments
                # print('work run result:', result, flush=True)
                self.result_queue.put(result)  # store the return value
                self.work_queue.task_done()  # tell the queue this task is finished
                with lock:
                    print('{}\tdone\twith\t{}\tat\t{}'.format(
                        threading.current_thread().name, args[0], get_stime()), flush=True)
            except Exception as error:
                print(error, flush=True)
                break


def get_stime():
    ct = time.time()
    local_time = time.localtime(ct)
    data_head = time.strftime("%Y-%m-%d %H:%M:%S", local_time)
    data_secs = (ct - int(ct)) * 1000
    stamp = "%s.%03d" % (data_head, data_secs)
    return stamp


myhead = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
    'Accept': 'image/webp,image/apng,image/*,*/*;q=0.8',
    'Accept-Encoding': 'gzip,deflate,sdch, br',
    'Accept-Language': 'zh-TW,zh;q=0.8,en-US;q=0.6,en;q=0.4',
    'Cache-Control': 'max-age=0',
    'Connection': 'close',
    'Proxy-Connection': 'no-cache'
}


def parse_url(url, params=None, headers=myhead, proxies=None, timeout=6, ecode='utf-8',
              wait_random_min=200, wait_random_max=3000, stop_max_attempt_number=100):
    @retry(wait_random_min=wait_random_min, wait_random_max=wait_random_max,
           stop_max_attempt_number=stop_max_attempt_number)
    def _parse_url(url):
        response = requests.get(url, params=params, headers=headers, proxies=proxies, timeout=timeout)
        assert response.status_code == 200
        # because it asserts status_code == 200, this helper cannot resolve the real
        # Baidu URLs, whose responses come back as 302
        return response.content.decode(ecode)

    try:
        response = _parse_url(url)
        soup = BeautifulSoup(response, 'lxml')
        [s.extract() for s in soup(["script", "style"])]
    except requests.exceptions.ConnectionError as e:
        print('ConnectionError:', e, url, flush=True)
        soup = None
    except requests.exceptions.ChunkedEncodingError as e:
        print('ChunkedEncodingError:', e, url, flush=True)
        soup = None
    except Exception as e:
        print('Unfortunately Unknown Error:', e, url, flush=True)
        soup = None
    return soup


def fd():
    import win32ui
    _dlg = win32ui.CreateFileDialog(1)  # 1 means an "open file" dialog
    _dlg.SetOFNInitialDir('c:/')  # initial directory shown in the dialog
    _dlg.DoModal()
    filename = _dlg.GetPathName()  # the selected file name
    return filename


def make_urls(pages):
    '''
    _k = []
    _file = fd()
    if not _file:
        return False
    res = _file.split('.')[0:-1]  # file name with full path, extension removed
    with open(_file) as f:
        for row in f.readlines():
            row = row.strip()  # strip whitespace  '#^\s*$'
            if len(row) == 0:
                break  # drop rows of length 0
            _k.append(row)
    keys = sorted(set(_k), key=_k.index)
    # for demonstration, the file reading above is replaced by a literal list
    '''
    keys = ["減肥計劃", "減肥運動", "如何減肥", "怎么減肥", "有效減肥", "鄭多燕減肥", "減肥視頻", "減肥",
            "減肥方法", "減肥食譜", " ", "減肚子", "腰腹減肥", "\t", "減腰", "減肥法", "減肥法"]
    out_url = [(key, page, "https://www.baidu.com/s?wd={}&pn={}".format(key, page * 10),)
               for key in keys for page in range(pages)]
    return 'baidu', out_url
    # return res[0], out_url


def getkeys(key, page, url):
    _texts = []
    result = parse_url(url=url)
    '''
    # method 1
    tagh3 = result.find_all('h3')
    index = 0
    for h3 in tagh3:
        href = h3.find('a').get('href')
        title = h3.find('a').text
        if '百度' in title:
            break
        if not href.startswith('http'):
            break
        baidu_url = requests.get(url=href, headers=myhead, allow_redirects=False)  # do not follow the redirect
        real_url = baidu_url.headers['Location']  # the page's original address
        if real_url.startswith('http'):
            index += 1
            _texts.append([index, title, real_url])
    # end of method 1
    '''
    # method 2, same effect as method 1
    allTags = result.findAll('div', ['result-op c-container xpath-log', 'result c-container'])
    # 'result-op c-container xpath-log' marks Baidu's own content
    index = 0
    for tag in allTags:
        href = tag.h3.a['href']
        title = tag.h3.a.text
        if '百度' in title:
            break
        if not href.startswith('http'):
            break
        baidu_url = requests.get(url=href, headers=myhead, allow_redirects=False)
        real_url = baidu_url.headers['Location']  # the page's original address
        if real_url.startswith('http'):
            index += 1
            _texts.append([key, page, index, title, real_url])
    # end of method 2
    return _texts


def savefile(_filename, lists):
    # write the crawled result lists to a file
    print('[' + _filename + '] start saving......', end='', flush=True)
    lists.sort()
    with open(_filename, 'a', encoding='utf-8') as f:
        for lists_line in lists:
            for index, item in enumerate(lists_line):
                f.write('key:' + item[0] + '\tpage:' + str(item[1]) + '\tindex:' + str(item[2]) +
                        '\ttitle:' + item[3] + '\turl:' + item[4] + '\n')
    print('[' + _filename + '] saved.', flush=True)


def main():
    start = time.time()
    try:
        _name, urls = make_urls(10)
    except Exception as e:
        print(e)
        return False
    work_manager = WorkManager(getkeys, urls)  # arguments: a list of tuples, and the number of threads
    texts = work_manager.wait_allcomplete()
    savefile(_name + '_百度詞頻.txt', texts)
    print("threadPool cost all time: %s" % (time.time() - start), flush=True)


if __name__ == "__main__":
    main()

# threadPool cost all time: 27.787729501724243
Summary
The above is the complete write-up of multithreaded crawling of Baidu keyword results and retrieving the real URLs; hopefully it helps with similar problems.
 
                            
