简单网页爬虫
目錄
- 爬蟲
- 1.文字爬蟲
- 2.圖片爬蟲
- 3.視頻爬蟲
爬蟲
安裝requests模塊:pip install requests
1.文字爬蟲
# Text crawler: download the ishuo.cn front page and print every quote on it.
import re  # regular expressions, used to pull the quotes out of the HTML

import requests  # third-party HTTP client (pip install requests)

# Without a timeout requests can wait forever on an unresponsive server.
REQUEST_TIMEOUT = 10  # seconds

# Fetch the page source.
response = requests.get('https://ishuo.cn/', timeout=REQUEST_TIMEOUT)
# Stop on an HTTP error (4xx/5xx) instead of scraping the error page.
response.raise_for_status()
data = response.text  # page source as text

# Each quote on the page is wrapped like this:
#     <div class="content">...quote text...</div>
# so the shared prefix/suffix around the wanted text becomes the pattern and
# the non-greedy group (.*?) captures only the text between the tags.
result_list = re.findall('<div class="content">(.*?)</div>', data)

# Print the captured quotes one per line.
for result in result_list:
    print(result)

# 2. Image crawler (heading of the next section of the article)
# Image crawler: walk listing pages of pic.netbian.com and save every .jpg
# referenced by a src="..." attribute into the current directory.
import re
from urllib.parse import urljoin

import requests  # third-party HTTP client (pip install requests)

# Without a timeout requests can wait forever on an unresponsive server.
REQUEST_TIMEOUT = 10  # seconds

count = 0  # number of images saved so far

# Listing pages are named index_<i>.html; this visits pages 2 and 3.
for i in range(2, 4):
    page_url = f'http://pic.netbian.com/index_{i}.html'
    respone = requests.get(page_url, timeout=REQUEST_TIMEOUT)
    # Stop on an HTTP error instead of scraping an error page.
    respone.raise_for_status()
    data = respone.text  # listing page source as text

    # Collect the value of every src="..." attribute on the page.
    result_list = re.findall('src="(.*?)"', data)
    for result in result_list:
        # Only .jpg sources are wanted; skip everything else.
        if not result.endswith('jpg'):
            continue

        # Resolve the src against the page URL. For site-rooted paths such as
        # "/uploads/x.jpg" this yields the same URL as prefixing the host, and
        # it also stays correct for absolute or non-rooted src values.
        result = urljoin(page_url, result)

        img_respone = requests.get(result, timeout=REQUEST_TIMEOUT)
        if not img_respone.ok:
            # Do not save an HTTP error body under an image file name.
            continue

        img_name = result.split('/')[-1]  # last path segment is the file name
        img_data = img_respone.content  # raw image bytes

        # The with-block closes (and therefore flushes) the file on exit.
        with open(img_name, 'wb') as f:
            f.write(img_data)

        count += 1
        print(f'爬取了{count}張圖片')

# 3. Video crawler (heading of the next section of the article)
# Video crawler: read the ku6.com index page, follow each video detail link,
# extract the flvURL from the detail page and save the video as <id>.mp4.
import re

import requests  # third-party HTTP client (pip install requests)

# Without a timeout requests can wait forever on an unresponsive server.
REQUEST_TIMEOUT = 10  # seconds

response = requests.get('https://www.ku6.com/index', timeout=REQUEST_TIMEOUT)
# Stop on an HTTP error instead of scraping an error page.
response.raise_for_status()
data = response.text  # index page source as text
count = 0  # number of videos saved so far

# Detail-page links share this anchor markup; capture the href value.
result_list = re.findall(
    '<a class="video-image-warp" target="_blank" href="(.*?)">', data
)

for result in result_list:
    # Only site-relative links under /video are followed.
    if not result.startswith('/video'):
        continue

    result = f'https://www.ku6.com{result}'  # complete the detail-page URL

    detail_response = requests.get(result, timeout=REQUEST_TIMEOUT)
    if not detail_response.ok:
        # Skip a detail page that could not be fetched.
        continue
    detail_data = detail_response.text  # detail page source as text

    # The detail page source carries the media address in the form
    #     flvURL: "https://rbv01.ku6.com/wifi/o_1dab1luo5oao1jnk1bpnk321hevckvs"
    video_url = re.findall('flvURL: "(.*?)"', detail_data)
    if not video_url:
        # No flvURL on this page; indexing [0] would raise IndexError.
        continue

    video_response = requests.get(video_url[0], timeout=REQUEST_TIMEOUT)
    if not video_response.ok:
        # Do not save an HTTP error body under a video file name.
        continue
    video_data = video_response.content  # raw video bytes

    # Name the file after the last path segment of the media URL.
    video_name = f"{video_url[0].split('/')[-1]}.mp4"

    # The with-block closes (and therefore flushes) the file on exit.
    with open(video_name, 'wb') as fw:
        fw.write(video_data)

    count += 1
    print(f'爬取了{count}個視頻')

# Reposted from: https://www.cnblogs.com/yellowcloud/p/10858775.html
總結
- 上一篇: memcached 扩展安装(windo
- 下一篇: bzoj5368 [Pkusc2018]