Python Spider in Practice (1): Crawling a Person's Text + Images from Baidu Baike + a RESTful API
My GitHub repository: https://github.com/yuyongsheng1990/python_spider_from_bdbaike
1. Crawling the person's summary, infobox and images

```python
# -*- coding: UTF-8 -*-
# @Project -> File: python_spider_from_bdbaike -> spider_baike_text_picture
# @Time: 2021/6/3 20:13
# @Author: Yu Yongsheng
# @Description: crawl a person's summary, infobox data and images from Baidu Baike

import os
import re
import urllib.parse
import urllib.request
from urllib.error import HTTPError

import requests
from bs4 import BeautifulSoup
import xlwt
import xlrd
from xlutils.copy import copy

# Work around SSL certificate errors
import ssl
ssl._create_default_https_context = ssl._create_unverified_context


# Crawler
def claw(content):
    # Fetch and download the HTML page
    url = 'https://baike.baidu.com/item/' + urllib.parse.quote(content)  # request URL
    # Request headers that impersonate a browser, to avoid anti-crawler blocking
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                             'Chrome/67.0.3396.99 Safari/537.36'}
    # Build the request object from the URL and headers
    req = urllib.request.Request(url=url, headers=headers, method='GET')
    response = urllib.request.urlopen(req)   # send the request, get the response
    text = response.read().decode('utf-8')   # read the response body as text

    # Parse the HTML
    soup = BeautifulSoup(text, 'lxml')
    intro_tag = soup.find_all('div', class_="lemma-summary")        # summary paragraphs
    name_tag = soup.find_all('dt', class_="basicInfo-item name")    # infobox field names
    value_tag = soup.find_all('dd', class_="basicInfo-item value")  # infobox field values

    # Clean the summary: strip newlines and whitespace artifacts such as \xa0
    intro_after_filter = [re.sub('\n+', '', item.get_text()) for item in intro_tag]
    intro_after_filter = [''.join(i.split()) for i in intro_after_filter]
    intro_after_filter = ''.join(intro_after_filter)  # join the fragments into one string

    # Extract the infobox into a dict
    profile_info = {}
    namelist = []
    valuelist = []
    for i in name_tag:  # collect all <dt> texts
        name = ''.join(i.get_text().split())  # strip \xa0 artifacts
        namelist.append(name)
    for i in value_tag:  # collect all <dd> texts
        value = i.get_text().strip(' ')  # keep newlines inside the value
        valuelist.append(value)
    for i, j in zip(namelist, valuelist):  # zip() pairs each field name with its value
        profile_info[i] = j

    # Collect image URLs
    img_urllist = []
    resp = requests.get(url=url, headers=headers)
    soup = BeautifulSoup(resp.content, 'lxml')
    img_list = soup.select('a>div>img')
    for img in img_list:
        try:
            src = img.get('src')
            if re.match(r'https:(.*)image(.*)auto$', src):
                img_urllist.append(src)
        except Exception:
            continue
    return intro_after_filter, profile_info, img_urllist


# Save the crawled data: summary, infobox, images
def download(name, intro, profile_dict, img_list):
    project_path = os.getcwd()

    # Save the summary to a text file
    if not os.path.exists('introduction'):
        os.mkdir('introduction')
    introduction_file = project_path + '/introduction/' + name + '.txt'
    with open(introduction_file, 'w', encoding='utf-8') as f:  # 'w' creates or overwrites
        f.write(intro)

    # Append the infobox data to an Excel sheet
    if not os.path.exists('profile'):
        os.mkdir('profile')
    profile_file = project_path + '/profile/' + 'profile.xls'  # xlwt writes .xls format
    field_list = ['中文名', '外文名', '別名', '性別', '學位', '職稱', '國籍', '民族', '出生地', '籍貫',
                  '出生日期', '逝世日期', '星座', '血型', '身高', '體重', '畢業院校', '職業', '經紀公司',
                  '代表作品', '主要成就', '生肖', '語種', '特長', '粉絲名']
    if not os.path.exists(profile_file):  # first run: create the sheet with a header row
        workbook = xlwt.Workbook(encoding='utf-8')
        output_sheet = workbook.add_sheet('profile_sheet', cell_overwrite_ok=True)
        for i in range(len(field_list)):
            output_sheet.write(0, i, field_list[i])
        workbook.save(profile_file)
    rb = xlrd.open_workbook(profile_file)
    rows_num = rb.sheet_by_name('profile_sheet').nrows
    wb = copy(rb)  # xlrd workbooks are read-only; copy to a writable workbook
    output_sheet = wb.get_sheet(0)
    for i in range(len(field_list)):
        if profile_dict.get(field_list[i]):
            output_sheet.write(rows_num, i, profile_dict.get(field_list[i]))
    os.remove(profile_file)  # remove the old file before saving the updated workbook
    wb.save(profile_file)

    # Save the images
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                             'Chrome/67.0.3396.99 Safari/537.36'}
    download_limit = 10  # maximum number of images per person
    if not os.path.exists('img'):
        os.mkdir('img')
    name_path = project_path + '/img/' + name
    if not os.path.exists(name_path):
        os.mkdir(name_path)
    count = 1
    for img_url in img_list:
        try:
            response = requests.get(img_url, headers=headers)
            content = response.content
            if len(content) < 1000:  # skip images that are too small / low quality
                continue
            filename = name_path + '/' + name + '_%s.jpg' % count
            with open(filename, "wb") as f:
                f.write(content)  # save the image
            response.close()
            count += 1
            if count > download_limit:  # download at most download_limit images per person
                break
        except HTTPError as e:  # handle HTTP errors
            print(e.reason)


if __name__ == '__main__':
    name = '潘建偉'  # or: input('Search term: ')
    intro, profile_dict, img_list = claw(name)
    download(name, intro, profile_dict, img_list)
```

2. Outputting the person's profile data (career history, etc.) as JSON
2.1 json簡介
JSON is a widely used, human-readable data interchange format, but its default serialization adds redundant whitespace; the `separators` parameter of `json.dumps()` can be used to compress the output.
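For instance, a quick sketch (the sample dict is illustrative, not from the original post):

```python
import json

data = {"city_id": 1, "city_name": "北京"}
# The default separators (', ', ': ') add whitespace for readability.
print(json.dumps(data, ensure_ascii=False))
# -> {"city_id": 1, "city_name": "北京"}
# separators=(',', ':') drops the redundant whitespace for compact transport.
print(json.dumps(data, ensure_ascii=False, separators=(',', ':')))
# -> {"city_id":1,"city_name":"北京"}
```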
2.2 json.dumps()方法參數
- `sort_keys`: sorts the dict's keys in the output. A dict's own storage order is nothing to rely on for presentation (before Python 3.7 it was not guaranteed at all), so `sort_keys=True` yields a deterministic, sorted result.
A valid JSON document consists of objects (key-value pairs) enclosed in braces `{}` and arrays enclosed in brackets `[]`. For example:
```python
dict_city = {
    1: {"city_id": 1, "city_name": "北京", "area": ["城東區", "城南區"]},
    2: {"city_id": 2, "city_name": "上海", "area": ["浦東區", "朝陽區"]},
}
```

A JSON Schema document describing such an object looks like:

```json
{
    "$schema": "http://json-schema.org/draft-04/schema#",
    "type": "object",
    "properties": {
        "email": {"type": "string"},
        "firstName": {"type": "string"},
        "lastName": {"type": "string"}
    }
}
```

- If a dict containing Chinese text comes out escaped or raises encoding errors, pass `ensure_ascii=False`.
- `skipkeys`: when `json.dumps()` serializes a dict, keys must be of a basic type (str, int, float, bool, None); any other key type raises a `TypeError`. Setting `skipkeys=True` silently skips the offending key-value pairs instead.
- To stop `json.dumps()` from sorting the keys, pass `sort_keys=False` (which is also the default).
- To get pretty-printed output with line breaks and indentation, pass `indent=4` (the value is the indent width).
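Putting these parameters together on the `dict_city` example above (a minimal sketch; expected output shown in comments):

```python
import json

# dict_city from the example in section 2.2
dict_city = {
    1: {"city_id": 1, "city_name": "北京", "area": ["城東區", "城南區"]},
    2: {"city_id": 2, "city_name": "上海", "area": ["浦東區", "朝陽區"]},
}

# ensure_ascii=False keeps the Chinese strings readable instead of \uXXXX escapes;
# indent=4 pretty-prints with 4-space indentation; sort_keys=False keeps key order.
print(json.dumps(dict_city, ensure_ascii=False, indent=4, sort_keys=False))
# Note: the integer keys 1 and 2 are coerced to the strings "1" and "2".

# A key that is not a basic type (e.g. a tuple) raises TypeError,
# unless skipkeys=True silently drops that key-value pair.
bad = {('a', 'b'): 1, 'ok': 2}
print(json.dumps(bad, skipkeys=True))  # -> {"ok": 2}
```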
3. Implementing a RESTful Flask API in Python
Official Flask documentation: https://dormousehole.readthedocs.io/en/latest/
Video tutorial on building a Python RESTful API with Flask: https://www.bilibili.com/video/BV1Kx411Q7gE?from=search&seid=11300633336131170833
To keep the Flask REST endpoints from returning garbled (ASCII-escaped) Chinese, set:

```python
# prevent the Flask RESTful API from returning garbled Chinese
app.config['JSON_AS_ASCII'] = False
```
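A minimal Flask sketch tying this to the crawler (the route, port, and response shape are my assumptions, not from the original post; `claw()` is the function from section 1):

```python
from flask import Flask, jsonify

from spider_baike_text_picture import claw  # assumed import path for the crawler above

app = Flask(__name__)
app.config['JSON_AS_ASCII'] = False  # return Chinese characters instead of \uXXXX escapes

@app.route('/person/<name>', methods=['GET'])
def get_person(name):
    # Crawl Baidu Baike on demand and return the result as JSON
    intro, profile_dict, img_list = claw(name)
    return jsonify({'name': name, 'introduction': intro,
                    'profile': profile_dict, 'images': img_list})

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)  # e.g. GET http://localhost:5000/person/潘建偉
```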