day 03 selenium与Beautifulsoup4的原理与使用
                                                            生活随笔
收集整理的這篇文章主要介紹了
                                day 03 selenium与Beautifulsoup4的原理与使用
小編覺得挺不錯的,現在分享給大家,幫大家做個參考.
                                
                            
                            
# Scrape product data from JD.com (爬取京東商品數據)
import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
def get_good(driver, delay=5, output_path='jd.txt', max_pages=None):
    """Scrape the product list of a JD search result, page after page.

    On each page the window is scrolled down, every element with class
    ``gl-item`` is read (name, link, price, review text), and the record is
    printed and appended to *output_path*. The ``pn-next`` element is then
    clicked and the following page is processed the same way.

    The driver is deliberately NOT closed here: the caller created it and is
    responsible for closing it. Closing it inside this function made every
    later driver call (including the caller's own cleanup) fail.

    Args:
        driver: Selenium WebDriver that already shows a search-result page.
        delay: Seconds to sleep before scrolling, after scrolling and after
            clicking "next", giving the page time to load its content.
        output_path: Text file the records are appended to (UTF-8).
        max_pages: Upper bound on the number of pages to scrape, or ``None``
            to continue until no usable "next" button is found.

    Returns:
        The number of product records written.
    """
    # Locator strategies are given as plain strings (the values behind
    # By.CLASS_NAME / By.CSS_SELECTOR); find_element(by, value) exists in both
    # Selenium 3 and 4, whereas the find_element_by_* helpers were removed in 4.3.
    by_class = 'class name'
    by_css = 'css selector'
    js_code = '\nwindow.scrollTo(0,5000)\n'

    written = 0
    pages_done = 0
    if max_pages is not None and max_pages <= 0:
        return written

    # A loop instead of self-recursion: one stack frame no matter how many
    # result pages there are.
    while True:
        time.sleep(delay)
        # Scroll down 5000px so that lazily loaded products get rendered.
        driver.execute_script(js_code)
        # Wait for the product data to load.
        time.sleep(delay)

        good_list = driver.find_elements(by_class, 'gl-item')
        # Open the output once per page rather than once per product.
        with open(output_path, 'a', encoding='utf-8') as f:
            for good in good_list:
                good_name = good.find_element(by_css, '.p-name em').text
                good_url = good.find_element(by_css, '.p-name a').get_attribute('href')
                good_price = good.find_element(by_class, 'p-price').text
                # Review / comment count text.
                good_commit = good.find_element(by_class, 'p-commit').text
                good_content = (
                    '\n'
                    f'商品名稱:{good_name}\n'
                    f'商品鏈接:{good_url}\n'
                    f'商品價格:{good_price}\n'
                    f'商品評價:{good_commit}\n'
                    '\n\n'
                )
                print(good_content)
                f.write(good_content)
                written += 1
        print('商品信息寫入成功!')

        pages_done += 1
        if max_pages is not None and pages_done >= max_pages:
            break

        # find_elements returns an empty list (instead of raising) when there
        # is no "next page" button, which is the normal end of the crawl.
        next_buttons = driver.find_elements(by_class, 'pn-next')
        if not next_buttons:
            break
        next_tag = next_buttons[0]
        # NOTE(review): assumes the last page marks its next button with a
        # "disabled" class - confirm against the live page markup.
        if 'disabled' in (next_tag.get_attribute('class') or ''):
            break
        next_tag.click()
        time.sleep(delay)

    return written
if __name__ == '__main__':
    # Script entry point: open Chrome, search JD for a keyword and scrape the results.
    driver = webdriver.Chrome()
    try:
        # Let element lookups wait up to 10 seconds before failing.
        driver.implicitly_wait(10)
        # Open the JD home page.
        driver.get('http://www.jd.com/')
        # Type the search keyword into the search box (id "key") and press Enter.
        # find_element(by, value) is used because find_element_by_id was
        # removed in Selenium 4.3; 'id' is the value behind By.ID.
        input_tag = driver.find_element('id', 'key')
        input_tag.send_keys('墨菲定律')
        input_tag.send_keys(Keys.ENTER)
        # Scrape the product information from the result pages.
        get_good(driver)
    finally:
        # quit() ends the whole browser session and the chromedriver process;
        # close() would only close the current window and leave them running.
        driver.quit()
Beautifulsoup4的原理與使用
# Walkthrough of BeautifulSoup4 tag navigation on a small sample document.
from bs4 import BeautifulSoup

html_doc = '''
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="sister"><b>$37</b></p>
<p class="story" id="p">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" >Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
'''

# Build the soup object with the lxml parser. Passing 'html.parser' instead
# would select the parser that ships with Python.
soup = BeautifulSoup(html_doc, features='lxml')

# Not executed here: print(soup) shows the parsed document, print(type(soup))
# its class, and soup.prettify() returns a nicely indented version.

# 1. Attribute access selects the first matching tag and returns an object.
first_link = soup.a
first_paragraph = soup.p
print(first_link)       # first <a> tag
print(first_paragraph)  # first <p> tag
print(type(first_link))  # <class 'bs4.element.Tag'>

# 2. Name of a tag.
print(first_link.name)

# 3. Attributes of a tag: the whole mapping, then a single entry.
link_attrs = first_link.attrs
print(link_attrs)
print(link_attrs['href'])

# 4. Text content of a tag.
print(first_paragraph.text)  # $37

# 5. Nested selection: the <b> tag inside the first <p> tag, then its text.
bold_in_paragraph = soup.p.b
print(bold_in_paragraph)
print(bold_in_paragraph.text)

# 6. Child nodes of the first <p> tag: .children is an iterator, so it is
#    materialised into a list to see the actual nodes.
print(soup.p.children)
print([child for child in soup.p.children])

# 7. Parent node and ancestor nodes of the first <b> tag.
first_bold = soup.b
print(first_bold.parent)
print(first_bold.parents)
print([ancestor for ancestor in first_bold.parents])
 
                        
                        
                        import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
def get_good(driver, delay=5, output_path='jd.txt', max_pages=None):
    """Scrape the product list of a JD search result, page after page.

    On each page the window is scrolled down, every element with class
    ``gl-item`` is read (name, link, price, review text), and the record is
    printed and appended to *output_path*. The ``pn-next`` element is then
    clicked and the following page is processed the same way.

    The driver is deliberately NOT closed here: the caller created it and is
    responsible for closing it. Closing it inside this function made every
    later driver call (including the caller's own cleanup) fail.

    Args:
        driver: Selenium WebDriver that already shows a search-result page.
        delay: Seconds to sleep before scrolling, after scrolling and after
            clicking "next", giving the page time to load its content.
        output_path: Text file the records are appended to (UTF-8).
        max_pages: Upper bound on the number of pages to scrape, or ``None``
            to continue until no usable "next" button is found.

    Returns:
        The number of product records written.
    """
    # Locator strategies are given as plain strings (the values behind
    # By.CLASS_NAME / By.CSS_SELECTOR); find_element(by, value) exists in both
    # Selenium 3 and 4, whereas the find_element_by_* helpers were removed in 4.3.
    by_class = 'class name'
    by_css = 'css selector'
    js_code = '\nwindow.scrollTo(0,5000)\n'

    written = 0
    pages_done = 0
    if max_pages is not None and max_pages <= 0:
        return written

    # A loop instead of self-recursion: one stack frame no matter how many
    # result pages there are.
    while True:
        time.sleep(delay)
        # Scroll down 5000px so that lazily loaded products get rendered.
        driver.execute_script(js_code)
        # Wait for the product data to load.
        time.sleep(delay)

        good_list = driver.find_elements(by_class, 'gl-item')
        # Open the output once per page rather than once per product.
        with open(output_path, 'a', encoding='utf-8') as f:
            for good in good_list:
                good_name = good.find_element(by_css, '.p-name em').text
                good_url = good.find_element(by_css, '.p-name a').get_attribute('href')
                good_price = good.find_element(by_class, 'p-price').text
                # Review / comment count text.
                good_commit = good.find_element(by_class, 'p-commit').text
                good_content = (
                    '\n'
                    f'商品名稱:{good_name}\n'
                    f'商品鏈接:{good_url}\n'
                    f'商品價格:{good_price}\n'
                    f'商品評價:{good_commit}\n'
                    '\n\n'
                )
                print(good_content)
                f.write(good_content)
                written += 1
        print('商品信息寫入成功!')

        pages_done += 1
        if max_pages is not None and pages_done >= max_pages:
            break

        # find_elements returns an empty list (instead of raising) when there
        # is no "next page" button, which is the normal end of the crawl.
        next_buttons = driver.find_elements(by_class, 'pn-next')
        if not next_buttons:
            break
        next_tag = next_buttons[0]
        # NOTE(review): assumes the last page marks its next button with a
        # "disabled" class - confirm against the live page markup.
        if 'disabled' in (next_tag.get_attribute('class') or ''):
            break
        next_tag.click()
        time.sleep(delay)

    return written
if __name__ == '__main__':
    # Script entry point: open Chrome, search JD for a keyword and scrape the results.
    driver = webdriver.Chrome()
    try:
        # Let element lookups wait up to 10 seconds before failing.
        driver.implicitly_wait(10)
        # Open the JD home page.
        driver.get('http://www.jd.com/')
        # Type the search keyword into the search box (id "key") and press Enter.
        # find_element(by, value) is used because find_element_by_id was
        # removed in Selenium 4.3; 'id' is the value behind By.ID.
        input_tag = driver.find_element('id', 'key')
        input_tag.send_keys('墨菲定律')
        input_tag.send_keys(Keys.ENTER)
        # Scrape the product information from the result pages.
        get_good(driver)
    finally:
        # quit() ends the whole browser session and the chromedriver process;
        # close() would only close the current window and leave them running.
        driver.quit()
Beautifulsoup4的原理與使用
# Walkthrough of BeautifulSoup4 tag navigation on a small sample document.
from bs4 import BeautifulSoup

html_doc = '''
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="sister"><b>$37</b></p>
<p class="story" id="p">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" >Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
'''

# Build the soup object with the lxml parser. Passing 'html.parser' instead
# would select the parser that ships with Python.
soup = BeautifulSoup(html_doc, features='lxml')

# Not executed here: print(soup) shows the parsed document, print(type(soup))
# its class, and soup.prettify() returns a nicely indented version.

# 1. Attribute access selects the first matching tag and returns an object.
first_link = soup.a
first_paragraph = soup.p
print(first_link)       # first <a> tag
print(first_paragraph)  # first <p> tag
print(type(first_link))  # <class 'bs4.element.Tag'>

# 2. Name of a tag.
print(first_link.name)

# 3. Attributes of a tag: the whole mapping, then a single entry.
link_attrs = first_link.attrs
print(link_attrs)
print(link_attrs['href'])

# 4. Text content of a tag.
print(first_paragraph.text)  # $37

# 5. Nested selection: the <b> tag inside the first <p> tag, then its text.
bold_in_paragraph = soup.p.b
print(bold_in_paragraph)
print(bold_in_paragraph.text)

# 6. Child nodes of the first <p> tag: .children is an iterator, so it is
#    materialised into a list to see the actual nodes.
print(soup.p.children)
print([child for child in soup.p.children])

# 7. Parent node and ancestor nodes of the first <b> tag.
first_bold = soup.b
print(first_bold.parent)
print(first_bold.parents)
print([ancestor for ancestor in first_bold.parents])
轉載于:https://www.cnblogs.com/RuiZi/p/11130042.html
總結
以上是生活随笔為你收集整理的day 03 selenium与Beautifulsoup4的原理与使用的全部內容,希望文章能夠幫你解決所遇到的問題。
 
                            
                        - 上一篇: Postgresql 物理热备份 --
- 下一篇: 小波变换和motion信号处理(三)(转
