當前位置：首頁 > 编程语言 > python > 内容正文

python

html调用python_对Python3 解析html的几种操作方式小结

發布時間：2023/12/19 python 24 豆豆

生活随笔收集整理的這篇文章主要介紹了 html调用python_对Python3 解析html的几种操作方式小结，小編覺得挺不錯的，現在分享給大家，幫大家做個參考。

解析html是爬蟲抓取頁面后處理數據的一個重要環節。以下記錄解析html的幾種方式。

先介紹基礎的輔助函數，主要用于獲取html并輸出解析后的結果。

# The parsing function is passed in so that it can be swapped easily below.
def get_html(url, paraser=None):
    """Download *url* and return the result of running *paraser* on its body.

    Args:
        url: Address of the page to fetch.
        paraser: Callable that takes the page body and returns the parsed
            value.  When omitted, ``bs4_paraser`` is used; it is looked up at
            call time so this function may be defined before the parsers.

    Returns:
        Whatever *paraser* returns, or ``None`` when the response status is
        not 200.
    """
    if paraser is None:
        paraser = bs4_paraser

    # No explicit 'Host' entry: urllib2 fills it in from the URL, which stays
    # correct for other sites and across redirects.
    headers = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Proxy-Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'
    }
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)
    if response.code != 200:
        return None

    data = response.read()
    # Gunzip only when the server declares a gzip body; feeding an
    # uncompressed body to GzipFile raises an error.
    if response.info().get('Content-Encoding', '').lower() == 'gzip':
        data = gzip.GzipFile(fileobj=StringIO.StringIO(data)).read()
    return paraser(data)

# Fetch one review page and print every record the lxml-based parser found.
value = get_html('http://www.360kan.com/m/haPkY0osd0r5UB.html', paraser=lxml_parser)

# get_html yields None for a non-200 response, so fall back to an empty list.
for row in value or []:
    # Parenthesised form prints the same on Python 2 and is valid on Python 3.
    print(row)

1，使用lxml.html的方式進行解析。

The lxml XML toolkit is a Pythonic binding for the C libraries libxml2 and libxslt. It is unique in that it combines the speed and XML feature completeness of these libraries with the simplicity of a native Python API, mostly compatible but superior to the well-known ElementTree API. The latest release works with all CPython versions from 2.6 to 3.5. See the introduction for more information about background and goals of the lxml project. Some common questions are answered in the FAQ. [官網](http://lxml.de/)

def lxml_parser(page):
    """Extract review records from *page* with lxml XPath queries.

    Args:
        page: HTML source of the page.

    Returns:
        A list of dicts, one per review item that has a title block, with
        the keys ``'title'``, ``'title_href'``, ``'score'``, ``'time'`` and
        ``'people'``.
    """
    data = []
    doc = etree.HTML(page)
    if doc is None:
        # NOTE(review): lxml appears to return None for input that contains
        # no markup at all -- confirm against the installed version.
        return data

    for row in doc.xpath('//div[@class="yingping-list-wrap"]'):
        # Every div.item below the wrapper is one review.
        for r in row.xpath('.//div[@class="item"]'):
            # Title block of the review.
            title = r.xpath('.//div[@class="g-clear title-wrap"][1]')
            if not title:
                # Same guard as bs4_paraser: ignore items without a title block.
                continue
            header = title[0]

            value = {}
            value['title'] = header.xpath('./a/text()')[0]
            value['title_href'] = header.xpath('./a/@href')[0]

            # The first number in the style attribute is divided by 20
            # (presumably a percentage width mapped to stars -- confirm).
            # Floor division keeps the result an int on Python 2 and 3 alike.
            score_text = header.xpath('./div/span/span/@style')[0]
            score_text = re.search(r'\d+', score_text).group()
            value['score'] = int(score_text) // 20

            # Time of the review.
            value['time'] = header.xpath('./div/span[@class="time"]/text()')[0]

            # How many people liked it.
            value['people'] = int(
                re.search(r'\d+', header.xpath('./div[@class="num"]/span/text()')[0]).group())

            data.append(value)
    return data

2，使用BeautifulSoup，不多說了，大家網上找資料看看。

def bs4_paraser(html):
    """Extract review records from *html* with BeautifulSoup.

    Args:
        html: HTML source of the page.

    Returns:
        A list of dicts, one per review item that has a title block, with
        the keys ``'title'``, ``'title_href'``, ``'score'``, ``'time'`` and
        ``'people'``.
    """
    all_value = []
    soup = BeautifulSoup(html, 'html.parser')

    # The review section; only the first matching wrapper is used.
    all_div = soup.find_all('div', attrs={'class': 'yingping-list-wrap'}, limit=1)
    for row in all_div:
        # Every div.item below the wrapper is one review.
        for r in row.find_all('div', attrs={'class': 'item'}):
            # Title block of the review.
            title = r.find_all('div', attrs={'class': 'g-clear title-wrap'}, limit=1)
            if not title:
                continue
            header = title[0]

            # A fresh dict per review, so records never share state.
            value = {}
            value['title'] = header.a.string
            value['title_href'] = header.a['href']

            # The first number in the style attribute is divided by 20
            # (presumably a percentage width mapped to stars -- confirm).
            # Floor division keeps the result an int on Python 2 and 3 alike.
            score_text = header.div.span.span['style']
            score_text = re.search(r'\d+', score_text).group()
            value['score'] = int(score_text) // 20

            # Time of the review.
            value['time'] = header.div.find_all('span', attrs={'class': 'time'})[0].string

            # How many people liked it.
            value['people'] = int(
                re.search(r'\d+', header.find_all('div', attrs={'class': 'num'})[0].span.string).group())

            all_value.append(value)
    return all_value

3，使用SGMLParser，主要是通過start、end tag的方式進行解析，解析過程比較明朗，但是有點麻煩，而且該案例的場景不太適合該方法（哈哈）。

class CommentParaser(SGMLParser):
    """Collect review records from a page through SGMLParser tag callbacks.

    One boolean flag is kept per ``<div>`` class of interest, plus a small
    integer state for ``<span>`` elements.  Finished records are appended to
    ``self.data`` as dicts that may hold the keys ``'href'``, ``'title'``,
    ``'score'``, ``'time'`` and ``'people'``.

    NOTE: a closing ``</div>`` is matched against the flags, not against the
    real nesting depth: the innermost flag still set is the one cleared,
    whichever ``<div>`` actually closed.
    """

    def __init__(self):
        SGMLParser.__init__(self)
        # Which of the interesting <div> elements are currently open.
        self.__start_div_yingping = False
        self.__start_div_item = False
        self.__start_div_gclear = False
        self.__start_div_ratingwrap = False
        self.__start_div_num = False
        # Inside the title <a>.
        self.__start_a = False
        # <span> state: 0 none, 1 inside span.rating, 2 inside span.time,
        # 3 score span seen, 4 span inside div.num.
        self.__span_state = 0
        # Record being built, and the finished records.
        self.__value = {}
        self.data = []

    def start_div(self, attrs):
        """Raise the flag matching the ``class`` of an opening ``<div>``."""
        for k, v in attrs:
            if k != 'class':
                continue
            if v == 'yingping-list-wrap':
                self.__start_div_yingping = True
            elif v == 'item':
                self.__start_div_item = True
            elif v == 'g-clear title-wrap':
                self.__start_div_gclear = True
            elif v == 'rating-wrap g-clear':
                self.__start_div_ratingwrap = True
            elif v == 'num':
                self.__start_div_num = True

    def end_div(self):
        """Clear the innermost flag still set; closing an item stores the record."""
        if not self.__start_div_yingping:
            return
        if not self.__start_div_item:
            self.__start_div_yingping = False
            return
        if not self.__start_div_gclear:
            # The item itself closed: emit the record and start a new one.
            self.data.append(self.__value)
            self.__value = {}
            self.__start_div_item = False
            return
        if self.__start_div_num or self.__start_div_ratingwrap:
            self.__start_div_num = False
            self.__start_div_ratingwrap = False
        else:
            self.__start_div_gclear = False

    def start_a(self, attrs):
        """Record the link target of an ``<a>`` inside the title block."""
        if self.__start_div_yingping and self.__start_div_item and self.__start_div_gclear:
            self.__start_a = True
            for k, v in attrs:
                if k == 'href':
                    # NOTE: the lxml/bs4 parsers store this as 'title_href';
                    # the key is kept here so existing consumers still work.
                    self.__value['href'] = v

    def end_a(self):
        """Leave the title link."""
        if self.__start_div_yingping and self.__start_div_item and self.__start_div_gclear and self.__start_a:
            self.__start_a = False

    def start_span(self, attrs):
        """Advance the span state; read the score from the span nested in span.rating."""
        if not (self.__start_div_yingping and self.__start_div_item and self.__start_div_gclear):
            return
        if self.__start_div_ratingwrap:
            if self.__span_state != 1:
                for k, v in attrs:
                    if k == 'class' and v == 'rating':
                        self.__span_state = 1
                    elif k == 'class' and v == 'time':
                        self.__span_state = 2
            else:
                for k, v in attrs:
                    if k == 'style':
                        # Skip a style without digits instead of crashing.
                        match = re.search(r'\d+', v)
                        if match:
                            # Floor division: an int on Python 2 and 3 alike.
                            self.__value['score'] = int(match.group()) // 20
                self.__span_state = 3
        elif self.__start_div_num:
            self.__span_state = 4

    def end_span(self):
        """Any closing ``</span>`` resets the span state."""
        self.__span_state = 0

    def handle_data(self, data):
        """Store text as title, time or like-count depending on the current state."""
        if self.__start_a:
            self.__value['title'] = data
        elif self.__span_state == 2:
            self.__value['time'] = data
        elif self.__span_state == 4:
            # Text without digits carries no count; ignore it.
            match = re.search(r'\d+', data)
            if match:
                self.__value['people'] = int(match.group())

def sgl_parser(html):
    """Run *html* through a fresh ``CommentParaser`` and return its ``data`` list."""
    extractor = CommentParaser()
    extractor.feed(html)
    records = extractor.data
    return records

4，HTMLParser，與3原理相似，就是調用的方法不太一樣，基本上可以共用。

class CommentHTMLParser(HTMLParser.HTMLParser):
    """Collect review records from a page through HTMLParser callbacks.

    One boolean flag is kept per ``<div>`` class of interest, plus a small
    integer state for ``<span>`` elements.  Finished records are appended to
    ``self.data`` as dicts that may hold the keys ``'href'``, ``'title'``,
    ``'score'``, ``'time'`` and ``'people'``.

    NOTE: a closing ``</div>`` is matched against the flags, not against the
    real nesting depth: the innermost flag still set is the one cleared,
    whichever ``<div>`` actually closed.
    """

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        # Which of the interesting <div> elements are currently open.
        self.__start_div_yingping = False
        self.__start_div_item = False
        self.__start_div_gclear = False
        self.__start_div_ratingwrap = False
        self.__start_div_num = False
        # Inside the title <a>.
        self.__start_a = False
        # <span> state: 0 none, 1 inside span.rating, 2 inside span.time,
        # 3 score span seen, 4 span inside div.num.
        self.__span_state = 0
        # Record being built, and the finished records.
        self.__value = {}
        self.data = []

    def handle_starttag(self, tag, attrs):
        """Dispatch an opening tag to the div / a / span handler."""
        if tag == 'div':
            self._open_div(attrs)
        elif tag == 'a':
            self._open_a(attrs)
        elif tag == 'span':
            self._open_span(attrs)

    def handle_endtag(self, tag):
        """Dispatch a closing tag; any ``</span>`` resets the span state."""
        if tag == 'div':
            self._close_div()
        elif tag == 'a':
            if self.__start_div_yingping and self.__start_div_item and self.__start_div_gclear and self.__start_a:
                self.__start_a = False
        elif tag == 'span':
            self.__span_state = 0

    def handle_data(self, data):
        """Store text as title, time or like-count depending on the current state."""
        if self.__start_a:
            self.__value['title'] = data
        elif self.__span_state == 2:
            self.__value['time'] = data
        elif self.__span_state == 4:
            # Text without digits carries no count; ignore it.
            match = re.search(r'\d+', data)
            if match:
                self.__value['people'] = int(match.group())

    def _open_div(self, attrs):
        """Raise the flag matching the ``class`` of an opening ``<div>``."""
        for k, v in attrs:
            if k != 'class':
                continue
            if v == 'yingping-list-wrap':
                self.__start_div_yingping = True
            elif v == 'item':
                self.__start_div_item = True
            elif v == 'g-clear title-wrap':
                self.__start_div_gclear = True
            elif v == 'rating-wrap g-clear':
                self.__start_div_ratingwrap = True
            elif v == 'num':
                self.__start_div_num = True

    def _close_div(self):
        """Clear the innermost flag still set; closing an item stores the record."""
        if not self.__start_div_yingping:
            return
        if not self.__start_div_item:
            self.__start_div_yingping = False
            return
        if not self.__start_div_gclear:
            # The item itself closed: emit the record and start a new one.
            self.data.append(self.__value)
            self.__value = {}
            self.__start_div_item = False
            return
        if self.__start_div_num or self.__start_div_ratingwrap:
            self.__start_div_num = False
            self.__start_div_ratingwrap = False
        else:
            self.__start_div_gclear = False

    def _open_a(self, attrs):
        """Record the link target of an ``<a>`` inside the title block."""
        if self.__start_div_yingping and self.__start_div_item and self.__start_div_gclear:
            self.__start_a = True
            for k, v in attrs:
                if k == 'href':
                    # NOTE: the lxml/bs4 parsers store this as 'title_href';
                    # the key is kept here so existing consumers still work.
                    self.__value['href'] = v

    def _open_span(self, attrs):
        """Advance the span state; read the score from the span nested in span.rating."""
        if not (self.__start_div_yingping and self.__start_div_item and self.__start_div_gclear):
            return
        if self.__start_div_ratingwrap:
            if self.__span_state != 1:
                for k, v in attrs:
                    if k == 'class' and v == 'rating':
                        self.__span_state = 1
                    elif k == 'class' and v == 'time':
                        self.__span_state = 2
            else:
                for k, v in attrs:
                    if k == 'style':
                        # Skip a style without digits instead of crashing.
                        match = re.search(r'\d+', v)
                        if match:
                            # Floor division: an int on Python 2 and 3 alike.
                            self.__value['score'] = int(match.group()) // 20
                self.__span_state = 3
        elif self.__start_div_num:
            self.__span_state = 4

def html_parser(html):
    """Run *html* through a fresh ``CommentHTMLParser`` and return its ``data`` list."""
    extractor = CommentHTMLParser()
    extractor.feed(html)
    records = extractor.data
    return records

3、4對于該案例來說確實不太適合，趁現在有空記錄下來，供學習使用！

以上這篇對Python3 解析html的幾種操作方式小結就是小編分享給大家的全部內容了，希望能給大家一個參考，也希望大家多多支持我們。

本文標題: 對Python3 解析html的幾種操作方式小結

本文地址: http://www.cppcns.com/jiaoben/python/252307.html

總結

以上是生活随笔為你收集整理的html调用python_对Python3 解析html的几种操作方式小结的全部內容，希望文章能夠幫你解決所遇到的問題。

如果覺得生活随笔網站內容還不錯，歡迎將生活随笔推薦給好友。

3atv精品不卡视频,97人人超碰国产精品最新,中文字幕av一区二区三区人妻少妇,久久久精品波多野结衣,日韩一区二区三区精品

python

html调用python_对Python3 解析html的几种操作方式小结

總結