3.使用Selenium模拟浏览器抓取淘宝商品美食信息
生活随笔
收集整理的這篇文章主要介紹了
3.使用Selenium模拟浏览器抓取淘宝商品美食信息
小編覺得挺不錯的,現在分享給大家,幫大家做個參考.
# 使用selenium+phantomJS模擬瀏覽器爬取淘寶商品信息
# 思路:
# 第一步:利用selenium驅動瀏覽器,搜索商品信息,得到商品列表
# 第二步:分析商品頁數,驅動瀏覽器翻頁,并得到商品信息
# 第三步:爬取商品信息
# 第四步:存儲到mongodb
import re

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

from config import *

# NOTE(review): PhantomJS support is deprecated in newer Selenium releases;
# confirm the installed Selenium version still provides webdriver.PhantomJS.
browser = webdriver.PhantomJS(executable_path='/usr/bin/phantomjs', service_args=SERVICE_ARGS)

# Explicit waits on this browser give up after 10 seconds.
wait = WebDriverWait(browser, 10)

# Upper bound on search() attempts; the original retried by unbounded
# recursion, which never terminates if the page keeps timing out.
MAX_SEARCH_ATTEMPTS = 3


def search():
    """Search Taobao for KEYWORD and return the pager's "total" text.

    Opens the Taobao home page, types KEYWORD into the search box, clicks
    the search button and waits for the pager element that shows the total
    number of result pages.

    Returns:
        The text of the ``div.total`` pager element.

    Raises:
        TimeoutException: if all MAX_SEARCH_ATTEMPTS attempts time out.
    """
    for attempt in range(1, MAX_SEARCH_ATTEMPTS + 1):
        print('正在搜索……')
        try:
            # Open the Taobao home page.
            browser.get('http://www.taobao.com')
            # Wait until the search input box is present in the DOM.
            search_box = wait.until(EC.presence_of_element_located((By.ID, 'q')))
            # Wait until the search button can be clicked.
            submit = wait.until(EC.element_to_be_clickable(
                (By.CSS_SELECTOR, '#J_TSearchForm > div.search-button > button')))
            # Type the keyword and run the search.
            search_box.send_keys(KEYWORD)
            submit.click()
            # Locate the element that displays the total page count.
            total = wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.total')))
            return total.text
        except TimeoutException:
            print('超時')
            if attempt == MAX_SEARCH_ATTEMPTS:
                raise


def main():
    """Run the search and print the raw pager text and the parsed page count."""
    try:
        total = search()
        print(total)
        # Raw string: '\d' in a normal string literal is an invalid escape.
        total = int(re.search(r'(\d+)', total).group(1))
        print(total)
    finally:
        # Always end the driver session so no PhantomJS process is left behind.
        browser.quit()


if __name__ == '__main__':
    main()
# Excerpt: parsing and storing the products. The function header and its
# first two statements are restored from the complete listing further down
# so that the excerpt is valid Python on its own.
def get_products():
    """Parse every product on the current result page and save each one."""
    # Wait until at least one product item has been loaded.
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#mainsrp-itemlist .items .item")))
    html = browser.page_source
    # Parse the page source with PyQuery.
    doc = pq(html)
    items = doc("#mainsrp-itemlist .items .item").items()
    for item in items:
        # Saved fields: price, image, name, location and shop.
        product = {
            # Strip the line breaks out of the price text.
            "price": item.find(".price").text().replace("\n", ""),
            "image": item.find(".pic .img").attr("src"),
            "name": item.find(".title").text(),
            "location": item.find(".location").text(),
            "shop": item.find(".shop").text(),
        }
        # Example product:
        # {'price': '¥32.80',
        #  'image': '//g-search1.alicdn.com/img/bao/uploaded/i4/imgextra/i4/95948676/TB2IdzpcnnI8KJjSszbXXb4KFXa_!!0-saturn_solar.jpg_230x230.jpg',
        #  'name': '...', 'location': '浙江 杭州', 'shop': '...'}
        save_products(product)


def save_products(result):
    """Store one product dict in MongoDB and append it to products.txt.

    Args:
        result: the product dict built by get_products().

    Any failure is reported on stdout together with the exception; nothing
    is re-raised, so one bad record does not stop the crawl.
    """
    try:
        # insert_one replaces Collection.insert, which is deprecated in
        # PyMongo 3 and removed in PyMongo 4. It adds an '_id' key to result.
        db[MONGO_TABLE_SELENIUM].insert_one(result)
        print("存儲到MongoDB數據庫成功!", result)
        # Also append the record to a text file. The with-block closes the
        # file, so no explicit close() is needed.
        if result:
            with open("products.txt", "a", encoding="utf-8") as f:
                f.write(str(result) + "\n")
    except Exception as exc:
        print("存儲失敗!", result, exc)


def main():
    """Crawl every result page, then shut the browser down."""
    try:
        total = search()
        # Raw string: '\d' in a normal string literal is an invalid escape.
        total = int(re.search(r'(\d+)', total).group(1))
        # Page 1 is handled by search(); continue from page 2.
        for i in range(2, total + 1):
            next_page(i)
    except Exception as exc:
        print("出錯啦!", exc)
    finally:
        # Always end the driver session, even after an error.
        browser.quit()


if __name__ == '__main__':
    main()
import re

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

from config import *

# NOTE(review): PhantomJS support is deprecated in newer Selenium releases;
# confirm the installed Selenium version still provides webdriver.PhantomJS.
browser = webdriver.PhantomJS(executable_path='/usr/bin/phantomjs', service_args=SERVICE_ARGS)

# Explicit waits on this browser give up after 10 seconds.
wait = WebDriverWait(browser, 10)

# Upper bound on search() attempts; the original retried by unbounded
# recursion, which never terminates if the page keeps timing out.
MAX_SEARCH_ATTEMPTS = 3


def search():
    """Search Taobao for KEYWORD and return the pager's "total" text.

    Opens the Taobao home page, types KEYWORD into the search box, clicks
    the search button and waits for the pager element that shows the total
    number of result pages.

    Returns:
        The text of the ``div.total`` pager element.

    Raises:
        TimeoutException: if all MAX_SEARCH_ATTEMPTS attempts time out.
    """
    for attempt in range(1, MAX_SEARCH_ATTEMPTS + 1):
        print('正在搜索……')
        try:
            # Open the Taobao home page.
            browser.get('http://www.taobao.com')
            # Wait until the search input box is present in the DOM.
            search_box = wait.until(EC.presence_of_element_located((By.ID, 'q')))
            # Wait until the search button can be clicked.
            submit = wait.until(EC.element_to_be_clickable(
                (By.CSS_SELECTOR, '#J_TSearchForm > div.search-button > button')))
            # Type the keyword and run the search.
            search_box.send_keys(KEYWORD)
            submit.click()
            # Locate the element that displays the total page count.
            total = wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.total')))
            return total.text
        except TimeoutException:
            print('超時')
            if attempt == MAX_SEARCH_ATTEMPTS:
                raise


def main():
    """Run the search and print the raw pager text and the parsed page count."""
    try:
        total = search()
        print(total)
        # Raw string: '\d' in a normal string literal is an invalid escape.
        total = int(re.search(r'(\d+)', total).group(1))
        print(total)
    finally:
        # Always end the driver session so no PhantomJS process is left behind.
        browser.quit()


if __name__ == '__main__':
    main()
PhantomJS爬數據比較慢:下面的測試結果大概經過5分多鐘才返回,「正在搜索」和「超時」的提示也返回得比較慢。
PhantomJS的其他配置方法:
import random

# Bring in the DesiredCapabilities configuration object.
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

# NOTE(review): this snippet expects `webdriver` and `USER_AGENTS` to be
# defined or imported elsewhere — confirm before running it on its own.
dcap = dict(DesiredCapabilities.PHANTOMJS)

# Pick one user agent at random from USER_AGENTS to disguise the browser.
# The original assigned the whole USER_AGENTS value; a single string is
# still accepted as-is in case USER_AGENTS is not a list.
dcap["phantomjs.page.settings.userAgent"] = (
    USER_AGENTS if isinstance(USER_AGENTS, str) else random.choice(USER_AGENTS)
)

# Do not load images; per the author this makes page crawling much faster.
dcap["phantomjs.page.settings.loadImages"] = False

# PhantomJS command-line switches: enable the disk cache, skip image loading.
service_args = ['--disk-cache=true', '--load-images=false']

# Start PhantomJS with the capabilities and switches configured above.
browser = webdriver.PhantomJS(
    executable_path='/usr/bin/phantomjs',
    desired_capabilities=dcap,
    service_args=service_args,
)

# Implicit wait of 5 seconds; adjust as needed.
browser.implicitly_wait(5)

# 10-second page-load timeout, similar to the timeout option of
# requests.get(); driver.get() itself takes no timeout argument.
# The author previously hit a case where driver.get(url) never returned and
# raised no error, hanging the program; this timeout addresses that.
browser.set_page_load_timeout(10)

# 10-second script timeout.
browser.set_script_timeout(10)
完整代碼
# Crawl Taobao product listings by driving a browser with Selenium + PhantomJS.
# Approach:
#   Step 1: drive the browser with Selenium, search for the keyword and get
#           the product list
#   Step 2: work out the number of result pages and page through them
#   Step 3: scrape the product information
#   Step 4: store it in MongoDB
import re

import pymongo
from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

from config import *

# Alternative noted by the author: pymongo.MongoClient(MONGO_URL)
client = pymongo.MongoClient(host='192.168.33.12', port=27017)
db = client[MONGO_DB_SELENIUM]

# NOTE(review): PhantomJS support is deprecated in newer Selenium releases;
# confirm the installed Selenium version still provides webdriver.PhantomJS.
browser = webdriver.PhantomJS(executable_path=EXECUTABLE_PATH, service_args=SERVICE_ARGS)
browser.set_window_size(1400, 1000)

# Explicit waits on this browser give up after 10 seconds.
wait = WebDriverWait(browser, 10)

# Upper bound on attempts for search() and next_page(); the original retried
# by unbounded recursion, which never terminates if the page keeps timing out.
MAX_ATTEMPTS = 3


def search():
    """Search Taobao for KEYWORD_SELENIUM and scrape the first result page.

    Returns:
        The text of the pager's ``div.total`` element, which contains the
        total number of result pages.

    Raises:
        TimeoutException: if all MAX_ATTEMPTS attempts time out.
    """
    for attempt in range(1, MAX_ATTEMPTS + 1):
        print('正在搜索……')
        try:
            # Open the Taobao home page.
            browser.get('http://www.taobao.com')
            # Wait until the search input box is present in the DOM.
            search_box = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#q')))
            # Wait until the search button can be clicked.
            submit = wait.until(EC.element_to_be_clickable(
                (By.CSS_SELECTOR, '#J_TSearchForm > div.search-button > button')))
            # Type the keyword and run the search.
            search_box.send_keys(KEYWORD_SELENIUM)
            submit.click()
            # Locate the element that displays the total page count.
            total = wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.total')))
            # Scrape the products on the first page.
            get_products()
            return total.text
        except TimeoutException:
            print('超時')
            if attempt == MAX_ATTEMPTS:
                raise


def next_page(page_number):
    """Jump to result page ``page_number`` and scrape its products.

    Args:
        page_number: the page to jump to.

    Raises:
        TimeoutException: if all MAX_ATTEMPTS attempts time out.
    """
    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            # Input box that takes the page number to jump to.
            page_input = wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.form > input")))
            # Button that confirms the jump.
            submit = wait.until(EC.element_to_be_clickable(
                (By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit")))
            page_input.clear()
            page_input.send_keys(str(page_number))
            submit.click()
            # Wait until the highlighted pager item shows the requested page.
            wait.until(EC.text_to_be_present_in_element(
                (By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > ul > li.item.active > span"),
                str(page_number)))
            # Scrape the products on this page.
            get_products()
            return
        except TimeoutException:
            if attempt == MAX_ATTEMPTS:
                raise


def get_products():
    """Parse every product on the current result page and save each one."""
    # Wait until at least one product item has been loaded.
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#mainsrp-itemlist .items .item")))
    html = browser.page_source
    # Parse the page source with PyQuery.
    doc = pq(html)
    items = doc("#mainsrp-itemlist .items .item").items()
    for item in items:
        # Saved fields: price, image, name, location and shop.
        product = {
            # Strip the line breaks out of the price text.
            "price": item.find(".price").text().replace("\n", ""),
            "image": item.find(".pic .img").attr("src"),
            "name": item.find(".title").text(),
            "location": item.find(".location").text(),
            "shop": item.find(".shop").text(),
        }
        # Example product:
        # {'price': '¥32.80',
        #  'image': '//g-search1.alicdn.com/img/bao/uploaded/i4/imgextra/i4/95948676/TB2IdzpcnnI8KJjSszbXXb4KFXa_!!0-saturn_solar.jpg_230x230.jpg',
        #  'name': '...', 'location': '浙江 杭州', 'shop': '...'}
        save_products(product)


def save_products(result):
    """Store one product dict in MongoDB and append it to products.txt.

    Args:
        result: the product dict built by get_products().

    Any failure is reported on stdout together with the exception; nothing
    is re-raised, so one bad record does not stop the crawl.
    """
    try:
        # insert_one replaces Collection.insert, which is deprecated in
        # PyMongo 3 and removed in PyMongo 4. It adds an '_id' key to result.
        db[MONGO_TABLE_SELENIUM].insert_one(result)
        print("存儲到MongoDB數據庫成功!", result)
        # Also append the record to a text file. The with-block closes the
        # file, so no explicit close() is needed.
        if result:
            with open("products.txt", "a", encoding="utf-8") as f:
                f.write(str(result) + "\n")
    except Exception as exc:
        print("存儲失敗!", result, exc)


def main():
    """Crawl every result page, then shut the browser down."""
    try:
        total = search()
        # Raw string: '\d' in a normal string literal is an invalid escape.
        total = int(re.search(r'(\d+)', total).group(1))
        # Page 1 is handled by search(); continue from page 2.
        for i in range(2, total + 1):
            next_page(i)
    except Exception as exc:
        print("出錯啦!", exc)
    finally:
        # Always end the driver session, even after an error.
        browser.quit()


if __name__ == '__main__':
    main()
?
?
參考博文:
Selenium分手PhantomJS
盤點(diǎn)selenium phantomJS使用的坑
?
轉(zhuǎn)載于:https://www.cnblogs.com/zouke1220/p/9375276.html
新人創作打卡挑戰賽發博客就能抽獎!定制產品紅包拿不停!總結
以上是生活随笔為你收集整理的3.使用Selenium模拟浏览器抓取淘宝商品美食信息的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 浅析Unity 坐标系
- 下一篇: Jboss未授权访问部署木马 利用exp