爬取58二手数据.py (Scraping 58.com Second-Hand Listings)
This article, collected and organized here, mainly introduces 爬取58二手数据.py. The editor found it quite good and shares it now for everyone's reference.
# Module 1: scrape all channel links
from bs4 import BeautifulSoup
import requests

start_url = 'http://bj.58.com/sale.shtml'
url_host = 'http://bj.58.com'

def get_index_url(url):
    # fetch the category index page and print every channel's absolute URL
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    links = soup.select('ul.ym-submnu > li > b > a')
    for link in links:
        page_url = url_host + link.get('href')
        print(page_url)

get_index_url(start_url)
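Module 3 below imports a `channel_list` from this file (`channel_extact`) and calls `channel_list.split()`, so the URLs printed here were evidently pasted back in as one whitespace-separated string. A minimal sketch of what that constant might look like, with two illustrative channel URLs standing in for the real output:

# channel_extact.py -- channel_list is assumed to be a multi-line string
# built from get_index_url's printed output; these two URLs are placeholders
channel_list = '''
    http://bj.58.com/shouji/
    http://bj.58.com/danche/
'''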
# Module 2: scrape all item links and detail data
from bs4 import BeautifulSoup
import requests
import time
import pymongo

client = pymongo.MongoClient('localhost', 27017)
ceshi = client['ceshi']
url_list = ceshi['url_list4']    # left side: the Python object's name; the string: the collection name in the database
item_info = ceshi['item_info4']

# spider 1
def get_links_from(channel, pages):
    # if the page has no td.t cell, we have run past the last page -- stop
    list_view = '{}/pn{}/'.format(channel, str(pages))
    wb_data = requests.get(list_view)
    time.sleep(1)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    if soup.find('td', 't'):
        for link in soup.select('td.t a.t'):
            item_link = link.get('href').split('?')[0]
            if item_link != 'http://jump.zhineng.58.com/jump':
                url_list.insert({'url': item_link})    # legacy pymongo API; insert_one on pymongo 3+
                print(item_link)
        # return urls
    else:
        # It's the last page!
        pass

# spider 2
def get_item_info(url):
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    if url[:25] == 'http://zhuanzhuan.58.com/':
        data = {
            'title': soup.title.text,
            'price': soup.select('span.price_now')[0].text,
            'area': soup.select('div.palce_li > span > i')[0].text,
            'url': url
        }
        item_info.insert(data)
    else:
        data = {
            'title': soup.title.text,
            'price': soup.select('span.price.c_f50')[0].text,
            'area': soup.select('div.su_con > a')[0].get_text(),
            'sale_man': soup.select('ul.vcard > li > a')[0].text,
            'url': url
        }
        item_info.insert(data)
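Every `soup.select(...)[0]` lookup above assumes the selector matches; a delisted item or a layout change raises IndexError instead, and under the multiprocessing pool in module 3 that exception kills the worker. A minimal defensive wrapper one might map instead (the helper name is an assumption, not part of the original post):

def get_item_info_safe(url):
    # skip pages whose layout does not match the hard-coded selectors,
    # rather than letting one bad page abort a whole pool worker
    try:
        get_item_info(url)
    except (IndexError, AttributeError) as err:
        print('skipped {}: {}'.format(url, err))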
# Module 3: main file -- run this to start scraping
from multiprocessing import Pool
from pages_parsing import get_item_info, url_list, item_info, get_links_from
from channel_extact import channel_list

item_url = (item['url'] for item in url_list.find())
index_urls0 = (item['url'] for item in item_info.find())
x = set(item_url)
y = set(index_urls0)
rest_of_urls = x - y    # only URLs whose details have not been scraped yet

def get_all_links_from(channel):
    for i in range(1, 100):
        get_links_from(channel, i)
    return rest_of_urls

if __name__ == '__main__':
    pool = Pool()
    # pool = Pool(processes=6)
    # pool.map(get_all_links_from, channel_list.split())
    pool.map(get_item_info, rest_of_urls)
    # count = 0
    # for url in rest_of_urls:
    #     print(url)
    #     count += 1
    # print(count)
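The commented-out call shows the intended two-phase workflow: first run with `pool.map(get_all_links_from, channel_list.split())` enabled to harvest listing URLs into `url_list`, then rerun with `pool.map(get_item_info, rest_of_urls)` to fetch the details. The set difference makes the second phase resumable: anything already in `item_info` is skipped. A toy illustration of that checkpoint idea (the URLs are made up):

harvested = {'http://bj.58.com/a.shtml', 'http://bj.58.com/b.shtml'}    # from url_list
scraped = {'http://bj.58.com/a.shtml'}                                  # from item_info
print(harvested - scraped)    # only b.shtml still needs a detail fetch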
# Module 4: watch the data flow
import time
from pages_parsing import url_list

while True:
    print(url_list.find().count())
    time.sleep(5)
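`Cursor.count()` was deprecated in pymongo 3.x and removed in 4.x, so this monitor fails on a current driver. A rough equivalent using `count_documents`, assuming the same `ceshi` database and `url_list4` collection as above:

import time
import pymongo

client = pymongo.MongoClient('localhost', 27017)
url_list = client['ceshi']['url_list4']

while True:
    # count_documents({}) counts every document in the collection
    print(url_list.count_documents({}))
    time.sleep(5)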
Reposted from: https://www.cnblogs.com/dws-love-jfl-1314/p/6045670.html
Summary
The above is the full content of 爬取58二手数据.py; hopefully it helps you solve the problems you have run into.