Mini crawler case study: batch-downloading images from Bing by keyword (version 2)
1. Requirements:
Given a keyword, batch-download the matching images from Bing and save them to a local directory.
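Bing serves image results through a paged "async" endpoint, and the crawler in the next section simply walks those pages. As a minimal sketch, the result-page URLs for a keyword and a target amount can be built like this (the URL pattern and the 1.5x over-fetch margin are taken from the full code below; `urllib.parse.quote` is an extra safety step not present in the original):

```python
from urllib.parse import quote

# Bing's paged image-results endpoint, as used by the spider below
BING_URL = 'https://www.bing.com/images/async?q={}&first={}&count={}&mmasync=1'

def page_urls(keyword, amount, per_page=30):
    """Build the result-page URLs needed to cover `amount` images.

    Requests 1.5x the amount, since some results repeat across pages.
    """
    pages = int(amount / per_page * 1.5) + 1
    return [BING_URL.format(quote(keyword), i * per_page, per_page)
            for i in range(pages)]

urls = page_urls('wallpaper', 100)
print(len(urls))  # 6 pages cover 100 images with the 1.5x margin
```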
2. Code:
import json
import os
from multiprocessing.dummy import Pool
from time import time

import requests
from lxml import etree


# Purpose: crawl Bing images by keyword and amount, saving them to a given directory.
# Usage: a single call, e.g. BingImagesSpider('美女壁紙', 200, r'E:\images').run()
class BingImagesSpider:
    thread_amount = 1000   # thread-pool size; the pool overlaps the many IO-bound HTTP requests to cut total time
    per_page_images = 30   # number of images requested from Bing per page
    count = 0              # running image counter
    success_count = 0

    # characters stripped from image titles (unsafe in filenames)
    ignore_chars = ['|', '.', ',', ',', '/', '@', ':', ':', ';', ';', '[', ']', '+']

    # accepted image file extensions
    image_types = ['bmp', 'jpg', 'png', 'tif', 'gif', 'pcx', 'tga', 'exif',
                   'fpx', 'svg', 'psd', 'cdr', 'pcd', 'dxf', 'ufo', 'eps', 'ai',
                   'raw', 'WMF', 'webp']

    # request headers
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36'}

    # Bing image-results URL pattern
    bing_image_url_pattern = 'https://www.bing.com/images/async?q={}&first={}&count={}&mmasync=1'

    def __init__(self, keyword, amount, path='./'):
        # keyword: search keyword
        # amount: number of images to crawl
        # path: directory to save images into
        self.keyword = keyword
        self.amount = amount
        self.path = path
        self.thread_pool = Pool(self.thread_amount)

    def __del__(self):
        self.thread_pool.close()
        self.thread_pool.join()

    # fetch one Bing results page
    def request_homepage(self, url):
        # url: URL of a Bing image-results page
        return requests.get(url, headers=self.headers)

    # Parse a Bing results page and return a list with one dict per image.
    # Each dict has the keys image_title, image_type, image_md5, image_url.
    def parse_homepage_response(self, response):
        # response: the Bing page response
        # each result's metadata sits as a JSON string in the 'm' attribute
        tree = etree.HTML(response.text)
        m_list = tree.xpath('//*[@class="imgpt"]/a/@m')

        # process each image in turn
        info_list = []
        for m in m_list:
            dic = json.loads(m)

            # drop characters that are not allowed in filenames
            image_title = dic['t']
            for char in self.ignore_chars:
                image_title = image_title.replace(char, ' ')
            image_title = image_title.strip()

            # some results carry no recognizable extension; default those to jpg
            image_type = dic['murl'].split('.')[-1]
            if image_type not in self.image_types:
                image_type = 'jpg'

            # store each image's info as a dict
            info = dict()
            info['image_title'] = image_title
            info['image_type'] = image_type
            info['image_md5'] = dic['md5']
            info['image_url'] = dic['murl']
            print(info)
            info_list.append(info)
        return info_list

    # download one image and save it to the path given at construction
    def request_and_save_image(self, info):
        # info: one image's info dict, keys image_title, image_type, image_md5, image_url
        filename = '{} {}.{}'.format(self.count, info['image_title'], info['image_type'])
        filepath = os.path.join(self.path, filename)

        try:
            # request the image
            response = requests.get(info['image_url'], headers=self.headers, timeout=1.5)

            # save the image; skip files that cannot be written (e.g. invalid name)
            try:
                with open(filepath, 'wb') as fp:
                    fp.write(response.content)
            except OSError:
                pass

            # log progress
            self.count += 1
            self.success_count += 1
            print('{}: saving {} done.'.format(self.count, filepath))
        except requests.exceptions.RequestException as e:
            self.count += 1
            print('{}: saving {} failed. url: {}'.format(self.count, filepath, info['image_url']))
            print('\t tip:', e)

    # remove duplicate image infos from the list
    def deduplication(self, info_list):
        result = []

        # use each image's md5 as its unique identifier
        md5_set = set()
        for info in info_list:
            if info['image_md5'] not in md5_set:
                result.append(info)
                md5_set.add(info['image_md5'])
        return result

    # run the crawler
    def run(self):
        # create the target directory if needed
        if not os.path.exists(self.path):
            os.mkdir(self.path)

        # Build the list of result-page URLs from the keyword and requested amount.
        # Some results repeat across pages, so request 1.5x the amount as a margin.
        homepage_urls = []
        for i in range(int(self.amount / self.per_page_images * 1.5) + 1):
            url = self.bing_image_url_pattern.format(self.keyword, i * self.per_page_images, self.per_page_images)
            homepage_urls.append(url)
        print('homepage_urls len {}'.format(len(homepage_urls)))

        # fetch all result pages through the thread pool
        homepage_responses = self.thread_pool.map(self.request_homepage, homepage_urls)

        # parse every page into per-image info dicts (image_title, image_type, image_md5, image_url)
        info_list = []
        for response in homepage_responses:
            info_list += self.parse_homepage_response(response)
        print('info amount before deduplication', len(info_list))

        # drop duplicates to avoid downloading the same image twice
        info_list = self.deduplication(info_list)
        print('info amount after deduplication', len(info_list))

        # keep only the requested amount
        info_list = info_list[:self.amount]
        print('info amount after split', len(info_list))

        # download and save every remaining image
        self.thread_pool.map(self.request_and_save_image, info_list)
        print('all done. {} successfully downloaded, {} failed.'.format(self.success_count, self.count - self.success_count))


if __name__ == '__main__':
    # keyword: 美女壁紙, amount: 100, save path: D:\images
    start = time()
    BingImagesSpider('美女壁紙', 100, r'D:\images').run()
    print(time() - start)
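Because the pages are over-fetched by 1.5x, removing duplicates before the download pass is what keeps the thread pool from wasting requests on the same image. The md5-based step can be exercised in isolation; this is a standalone re-implementation of the `deduplication` method above, run on hand-made records:

```python
def deduplicate_by_md5(info_list):
    # keep only the first record seen for each md5 hash, preserving order
    seen = set()
    result = []
    for info in info_list:
        if info['image_md5'] not in seen:
            seen.add(info['image_md5'])
            result.append(info)
    return result

# the same image served from two hosts shares one md5
records = [
    {'image_md5': 'a1', 'image_url': 'http://example.com/1.jpg'},
    {'image_md5': 'b2', 'image_url': 'http://example.com/2.jpg'},
    {'image_md5': 'a1', 'image_url': 'http://mirror.example.com/1.jpg'},
]
unique = deduplicate_by_md5(records)
print(len(unique))  # 2: the mirrored copy of 1.jpg is dropped
```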
Summary
That covers the full implementation of the Bing keyword-based batch image downloader (version 2); hopefully it serves as a useful reference.