6、通过xpath获取网页数据
生活随笔
收集整理的這篇文章主要介紹了
6、通过xpath获取网页数据
小編覺得挺不錯的,現在分享給大家,幫大家做個參考.
1、xpath解析網頁源文件
from urllib import request

from lxml import etree

# URL of the page to request.
url = "http://www.dfenqi.cn/Product/Index"

# Request headers; a browser User-Agent is sent with the request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36"
}

# Build the request object.
req = request.Request(url, headers=headers)

# Build the handler and the opener that will send the request.
httpHandler = request.HTTPHandler()
opener = request.build_opener(httpHandler)

# Send the request and read the page source. The response is used as a
# context manager so the underlying connection is closed once the body
# has been read.
with opener.open(req) as response:
    html = response.read().decode('utf-8')

# XPath expression selecting the text of each product title.
xpath = "//div[@class='liebiao']/ul/li/p/text()"
# To get the list of attribute values instead, use:
# xpath = "//div[@class='liebiao']/ul/li/p/@class"

# Convert the HTML into a parseable element tree.
selector = etree.HTML(html)

# Run the XPath query; the result is a list.
goodsList = selector.xpath(xpath)

# Print the product titles.
for goods in goodsList:
    print(goods)

# 2. Parse the page source with XPath and download the images locally
import os
from urllib import request
from urllib.parse import urljoin

from lxml import etree


class Spilder:
    """Fetch a web page, extract image URLs via XPath, and save the images."""

    def __init__(self, pageUrl):
        """Store the page URL and prepare the headers and opener.

        :param pageUrl: URL of the page to crawl
        """
        # URL of the page to crawl.
        self.pageUrl = pageUrl
        # Request headers sent with every request.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36"
        }
        # Handler and opener used to send the requests.
        self.httpHandler = request.HTTPHandler()
        self.opener = request.build_opener(self.httpHandler)

    def _fetch(self, url):
        """Request *url* with the stored headers and return the body as bytes."""
        req = request.Request(url, headers=self.headers)
        # Context manager closes the response once the body has been read.
        with self.opener.open(req) as response:
            return response.read()

    def loadPage(self):
        """Request the page.

        :return: the page source as bytes
        """
        return self._fetch(self.pageUrl)

    def getImageUrls(self, html, xpath):
        """Parse the page source with an XPath expression.

        :param html: page source
        :param xpath: XPath query string
        :return: list of query results
        """
        selector = etree.HTML(html)
        return selector.xpath(xpath)

    def loadImage(self, url):
        """Download an image.

        :param url: image URL; a relative or protocol-relative value is
            resolved against the page URL, an absolute URL is used as-is
        :return: the image data as bytes
        """
        # request.Request rejects relative URLs, so resolve them first.
        return self._fetch(urljoin(self.pageUrl, url))

    def writeImage(self, img, imgName):
        """Write image data into an ``image`` sub-folder of the current directory.

        The folder is created if it does not exist yet.

        :param img: image data (bytes)
        :param imgName: file name of the image
        :return: None
        """
        folderName = os.path.join(os.path.abspath(os.curdir), "image")
        # exist_ok avoids the check-then-create race of isdir() + mkdir().
        os.makedirs(folderName, exist_ok=True)
        # Write through the same path that was just created.
        with open(os.path.join(folderName, imgName), 'wb') as f:
            f.write(img)


if __name__ == "__main__":
    url = "http://www.dfenqi.cn/Product/Index"
    spilder = Spilder(url)
    html = spilder.loadPage()
    xpath = "//div[@class='liebiao']/ul/li/div/a/img/@src"
    imgUrls = spilder.getImageUrls(html, xpath)
    # Images are numbered from 1: img1.jpg, img2.jpg, ...
    for index, imgUrl in enumerate(imgUrls, start=1):
        img = spilder.loadImage(imgUrl)
        spilder.writeImage(img, 'img%s.jpg' % index)

# Reposted from: https://www.cnblogs.com/toloy/p/8618007.html
總結
以上是生活随笔為你收集整理的6、通过xpath获取网页数据的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 格式工厂软件处理视频
- 下一篇: Swoole练习 Web