A Taobao scraper test
Building on code found online, I put together a first-pass demo that takes a search keyword and scrapes the matching product listings. The cookies.txt file has to be captured while you log in, as follows:
While logging in, open the developer tools with F12, switch to the Network tab, and tick Preserve log.
After you log in, the request document for the search page appears in the log.
Save its cookie information to cookies.txt and the code below will run.
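cookies.txt is just the raw value of that request's Cookie header: name=value pairs separated by semicolons, all on one line, exactly as read_cookies() below expects. The names and values here are placeholders for illustration, not real Taobao cookies; paste your own header verbatim:

thw=cn; t=0ab12cd34ef56; cookie2=1fe2d3c4b5a6; _tb_token_=e3f5a7b9c1d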
import random
import re

import pandas as pd
import requests


def set_user_agent():
    """Return a randomly chosen User-Agent string so requests look less uniform."""
    USER_AGENTS = [
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
        "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
        "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
        "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
    ]
    return random.choice(USER_AGENTS)


class TaoBao:
    def __init__(self, url):
        self.test_url = url
        self.headers = {
            "Origin": "https://login.taobao.com",
            "Upgrade-Insecure-Requests": "1",
            "Content-Type": "application/x-www-form-urlencoded",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "Referer": "https://login.taobao.com/member/login.jhtml?redirectURL=https%3A%2F%2Fwww.taobao.com%2F",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "User-Agent": set_user_agent(),
        }
        self.cookies = {}
        self.res_cookies_txt = ""

    def read_cookies(self):
        """Parse cookies.txt ("name=value; name=value; ...") into a CookieJar."""
        with open("cookies.txt", "r", encoding="utf-8") as f:
            cookies_txt = f.read().strip(";")
        for cookie in cookies_txt.split(";"):
            name, value = cookie.strip().split("=", 1)
            self.cookies[name] = value
        return requests.utils.cookiejar_from_dict(self.cookies, cookiejar=None, overwrite=True)

    def set_cookies(self, cookies):
        """Merge the cookies the server sent back and rewrite cookies.txt."""
        res_cookies_dic = requests.utils.dict_from_cookiejar(cookies)
        for i in res_cookies_dic.keys():
            self.cookies[i] = res_cookies_dic[i]
        for k in self.cookies.keys():
            self.res_cookies_txt += k + "=" + self.cookies[k] + ";"
        with open("cookies.txt", "w", encoding="utf-8") as f:
            f.write(self.res_cookies_txt)

    def login(self):
        """Fetch the search page with the saved cookies, then refresh them."""
        session = requests.session()
        session.headers = self.headers
        session.cookies = self.read_cookies()
        response = session.get(self.test_url)
        self.set_cookies(response.cookies)
        return response

    def deal(self, wen):
        """Pull the item fields out of the JSON data embedded in the page source."""
        tir = wen.text
        # Non-ASCII text is embedded as \uXXXX escapes; decode each distinct
        # escape in place, e.g. \u53e3 -> 口.
        reps = list(set(re.findall(r'u\d[a-z\d]{3}', tir)))
        for rep in reps:
            try:
                hou = '\\' + rep
                bian = hou.encode("gb18030").decode("unicode-escape")
                tir = tir.replace('\\\\' + rep, bian)
            except Exception:
                continue
        text = re.findall(r'"icon":(.*?)"icon"', tir)  # icon blocks (not used below)
        detail = re.findall(r'"detail_url":"(.*?)",', tir)
        title = re.findall(r'"raw_title":"(.*?)",', tir)
        nid = re.findall(r'"nid":"(.*?)",', tir)
        pic_url = re.findall(r'"pic_url":"(.*?)",', tir)
        item_loc = re.findall(r'"item_loc":"(.*?)",', tir)
        price = re.findall(r'"view_price":"(.*?)",', tir)
        # The embedded URLs are protocol-relative ("//item.taobao.com/...").
        web = ['https:' + de for de in detail]
        pic = ['https:' + pp for pp in pic_url]
        biao = pd.DataFrame()
        biao['id'] = nid
        biao['title'] = title
        biao['price'] = price
        biao['detail'] = web
        biao['pic_url'] = pic
        biao['location'] = item_loc
        return biao


if __name__ == '__main__':
    something = input('找什么')  # search keyword
    # Crude percent-encoding: the bytes repr "b'\xe5\x8f...'" becomes
    # "%E5%8F..." once the repr's quotes are stripped.
    a = something.encode("utf-8")
    b = str(a).replace("b'", '').strip("'").replace('\\x', '%').upper()
    url = 'https://s.taobao.com/search?q=' + b
    taobao = TaoBao(url)
    wen = taobao.login()
    biao = taobao.deal(wen)
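The script stops at the first result page and never saves the DataFrame. A natural next step is walking several pages with polite delays between requests. Below is a minimal sketch of that, reusing the TaoBao class above; it assumes Taobao's search URL still pages by item offset (s = 44 * page, 44 items per page), which the site can change at any time, and it swaps the manual byte-string trick for urllib.parse.quote, which produces the same uppercase percent-escapes.

import random
import time
from urllib.parse import quote

import pandas as pd

# Sketch only: assumes the TaoBao class above is in scope and that the
# search URL's "s" offset parameter (44 items per page) still works.
keyword = quote('口罩')  # same uppercase escapes as the manual trick: %E5%8F%A3%E7%BD%A9
pages = []
for page in range(3):  # first three result pages
    url = 'https://s.taobao.com/search?q={}&s={}'.format(keyword, 44 * page)
    taobao = TaoBao(url)
    pages.append(taobao.deal(taobao.login()))
    time.sleep(random.uniform(2, 5))  # random pause between requests

result = pd.concat(pages, ignore_index=True)
result.to_csv('taobao_items.csv', index=False, encoding='utf-8-sig')  # BOM so Excel renders the Chinese titles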