python爬虫：西刺代理ip爬取
import requests
from lxml import etree
import time
import random
import csv


def test_ip(ip_address):
    """Probe each candidate proxy and persist the ones that work.

    :param ip_address: list of proxy dicts in requests' ``proxies`` format,
                       e.g. ``[{'http': 'http://1.2.3.4:80'}]``
    :return: None (usable proxies are printed and written to CSV)
    """
    # icanhazip simply echoes the caller's IP — a cheap liveness check.
    url = 'http://icanhazip.com/'
    headers = {
        # Pretend to be a regular browser so the endpoint doesn't reject us.
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0',
    }
    ip_pool = []
    for ip_test in ip_address:
        try:
            response = requests.get(url=url, headers=headers, proxies=ip_test, timeout=5)
            # BUG FIX: status_code is an int. The original compared it to the
            # string '200', which is never equal, so no proxy was ever kept.
            if response.status_code == 200:
                ip_pool.append(ip_test)
            # Randomized delay to avoid hammering the test endpoint.
            time.sleep(random.randint(2, 8))
        except requests.RequestException:
            # Dead/slow proxies are expected — skip them silently.
            pass
    print(ip_pool)
    files_save(ip_pool)


def files_save(ip_list):
    """Append the list of usable proxies as one row of a CSV file.

    :param ip_list: list of usable proxy dicts
    :return: None
    """
    # newline='' is required by the csv module to avoid blank lines on Windows.
    with open('./代理ip.csv', 'a+', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(ip_list)
def get_page_data(nums):
    """Scrape the xicidaili proxy listing pages and test the collected proxies.

    :param nums: number of listing pages to scrape (pages 1..nums)
    :return: None (delegates to test_ip, which prints and saves the results)
    """
    ip_list = []
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0',
    }
    for i in range(1, nums + 1):
        url = "https://www.xicidaili.com/nn/{}".format(i)
        response = requests.request('get', url=url, headers=headers)
        page_data = etree.HTML(response.text)
        # Rows with class 'odd' and class '' together cover every proxy entry
        # (the site alternates the two classes for table striping).
        page_infos = page_data.xpath(".//tr[@class='odd']|.//tr[@class='']")
        for info in page_infos:
            ip_address = info.xpath(".//td[2]/text()")[0]
            ip_port = info.xpath(".//td[3]/text()")[0]
            ip_type = info.xpath(".//td[6]/text()")[0].lower()
            # Build the dict shape requests expects for its `proxies` argument:
            # {'http': 'http://ip:port'} / {'https': 'https://ip:port'}
            ip_list.append({ip_type: ip_type + '://' + ip_address + ':' + ip_port})
    # BUG FIX: test once after all pages are collected. The original called
    # test_ip(ip_list) inside the page loop on the cumulative list, so proxies
    # from earlier pages were re-tested and re-saved once per extra page.
    test_ip(ip_list)
if __name__ == '__main__':
    # Notes on scraping proxy IPs:
    #   - every scraped IP must be tested before use
    #   - keep the crawl rate low
    # Listing-page URL pattern:
    #   page 1  https://www.xicidaili.com/nn/
    #   page 2  https://www.xicidaili.com/nn/2
    #   page 3  https://www.xicidaili.com/nn/3
    # nums = int(input("請輸入爬取頁數(shù)>>"))
    nums = 2
    get_page_data(nums)
總結(jié)
以上是生活随笔為你收集整理的西刺代理python_python爬虫西刺代理ip爬取的全部內(nèi)容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: C# 修改项目文件夹名称完全版
- 下一篇: steelray project vie