selenium+chromedriver爬取淘宝美食信息保存到MongoDB
生活随笔
收集整理的這篇文章主要介紹了
selenium+chromedriver爬取淘宝美食信息保存到MongoDB
小編覺得挺不錯的，現在分享給大家，幫大家做個參考。
配置文件
# MongoDB connection settings (imported by the crawler via `from config import *`).
MONGO_URL = 'localhost'      # MongoDB host (default port 27017 is assumed)
MONGO_DB = 'taobao_food'     # database that receives the scraped items
MONGO_TABLE = 'products'     # collection that stores one document per product
# Code implementation follows
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import re
from pyquery import PyQuery as pq
from config import *
import pymongo

# Module-level resources shared by every function below.
client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]
# chromedriver executable path — raw string so the backslash is never
# interpreted as an escape sequence.
driver = webdriver.Chrome(executable_path=r"C:\chromedriver.exe")
# Shared explicit wait (10 s timeout) — see the Selenium waits documentation.
wait = WebDriverWait(driver, 10)


def search():
    """Open the Taobao portal, search for the keyword, scrape page 1.

    Returns:
        str: the pager footer text (e.g. "共 100 頁") from which main()
        extracts the total page count. Retries itself on timeout.
    """
    try:
        driver.get("https://world.taobao.com/")
        # Wait until the search box is present in the DOM.
        search_input = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#mq')))
        # Wait until the submit button is clickable.
        submit = wait.until(
            EC.element_to_be_clickable((
                By.CSS_SELECTOR,
                '#J_PopSearch > div.sb-search > '
                'div > form > input[type="submit"]'
                ':nth-child(2)')))
        # Type the search keyword and submit.
        search_input.send_keys("美食")
        submit.click()
        # Pager footer shows the total number of result pages.
        total_page_num = wait.until(
            EC.presence_of_element_located((
                By.CSS_SELECTOR,
                '#mainsrp-pager > div > '
                'div > div > div.total')))
        # Scrape the first result page before returning.
        get_products()
        return total_page_num.text
    except TimeoutException:
        # Page load timed out — retry the whole search.
        return search()


def next_page(page_num):
    """Jump to result page *page_num* and scrape it. Retries on timeout."""
    try:
        page_input = wait.until(
            EC.presence_of_element_located((
                By.CSS_SELECTOR,
                '#mainsrp-pager > div > div > '
                'div > div.form > input')))
        # Clear any previous value, then type the target page number.
        page_input.clear()
        page_input.send_keys(page_num)
        submit = wait.until(
            EC.element_to_be_clickable((
                By.CSS_SELECTOR,
                '#mainsrp-pager > div > '
                'div > div > div.form > span.btn.J_Submit')))
        submit.click()
        # Confirm the highlighted pager item now shows the requested page.
        wait.until(
            EC.text_to_be_present_in_element((
                By.CSS_SELECTOR,
                '#mainsrp-pager>div>'
                'div>div>ul>li.item.active>'
                'span'), str(page_num)))
        get_products()
    except TimeoutException:
        # Timed out — retry the same page.
        next_page(page_num)


def get_products():
    """Parse the current result page with pyquery and persist each item."""
    wait.until(
        EC.presence_of_element_located((
            By.CSS_SELECTOR, '#mainsrp-itemlist .items .item')))
    # Full rendered HTML of the current page.
    html = driver.page_source
    doc = pq(html)
    items = doc('#mainsrp-itemlist .items .item').items()
    for item in items:
        product = {
            '商品名稱': item.find('.title').text(),
            '圖片地址': item.find('.pic .img').attr('src'),
            '商品價格': item.find('.price').text().strip().replace("\n", ""),
            # Strip the trailing 3-char suffix (e.g. "人付款") from the count.
            '付款人數': item.find('.deal-cnt').text()[:-3],
            '商品地址': item.find('.pic .pic-link').attr('href'),
            '發貨地': item.find('.location').text(),
        }
        save2mongo(product)


def save2mongo(result):
    """Insert one product document into MongoDB, logging success/failure."""
    try:
        # insert_one replaces the Collection.insert API removed in pymongo 4.
        if db[MONGO_TABLE].insert_one(result):
            print("保存成功", result)
    except Exception:
        # Best-effort persistence: report and keep crawling.
        print("出錯", result)


def main():
    """Scrape page 1, extract the total page count, then scrape the rest."""
    total_page_num = search()
    # Footer text is "共 ?? 頁" — keep only the digits (raw-string regex).
    total_page_num = int(re.search(r'(\d+)', total_page_num).group(1))
    # Pages 2..N (page 1 was already scraped by search()).
    for i in range(2, total_page_num + 1):
        next_page(i)


if __name__ == "__main__":
    main()
轉載于:https://www.cnblogs.com/whz0215/p/9330266.html
總結
以上是生活随笔為你收集整理的selenium+chromedriver爬取淘宝美食信息保存到MongoDB的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: Nginx 安装配置教程
- 下一篇: 给自己一点恒心,加油打气~