Python requests+BeautifulSoup 采集 安居客_新房信息
生活随笔收集整理的這篇文章主要介紹了《Python requests+BeautifulSoup 采集 安居客_新房信息》,小編覺得挺不錯的,現在分享給大家,幫大家做個參考。
Python requests+BeautifulSoup 采集 安居客_新房信息
# -*- coding: utf-8 -*-
import datetime
from urllib.parse import quote

import pandas as pd
import requests
from bs4 import BeautifulSoup
# Anjuke (Shanghai) new-home listings scraper.
# Listing index pages look like: https://sh.fang.anjuke.com/loupan/all/p1/

# Column headers for the exported CSV: the listing name, ten detail-page
# navigation links, and the photo-gallery link.
titles = [
    '品名',
    '樓盤首頁',
    '樓盤詳情',
    '戶型',
    '相冊',
    '用戶點評',
    '問答',
    '地圖交通',
    '樓盤房源',
    '動態資訊',
    '樓盤評測',
    '看圖鏈接',
]

# One entry per scraped detail page; appended to by getByNameFindUrl().
url_lists = []
# Fetch a page and return it as a parsed BeautifulSoup object.
def getHtmlbsObj(url, timeout=10):
    """Download *url* and return its HTML parsed as a BeautifulSoup tree.

    Args:
        url: Absolute URL of the page to fetch.
        timeout: Seconds to wait on the server before giving up; passed
            straight through to ``requests.get``.

    Returns:
        A ``BeautifulSoup`` object built with the ``html.parser`` backend.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
    """
    # HTTP request headers: send a browser-style User-Agent string.
    headers = {
        'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)'
    }
    # Without a timeout, requests waits indefinitely on a stalled connection
    # and the whole scrape hangs.
    response = requests.get(url, headers=headers, timeout=timeout)
    # Decode the body as UTF-8 regardless of what the response declares.
    response.encoding = 'utf-8'
    return BeautifulSoup(response.text, "html.parser")
# Names of the listings already processed. Shared across calls so a listing
# that shows up on several index pages is only scraped once.
pages = list()


def getHouseTitleLink(page):
    """Scrape one listing-index page and process each listing not yet seen.

    For every listing on the page that is not marked sold out and whose name
    is not already in ``pages``, the name is printed, handed to
    ``getByNameFindUrl`` and recorded in ``pages``.

    Args:
        page: Page number (int or str) inserted into the index-page URL.

    Returns:
        The module-level ``pages`` list (all listing names seen so far).
    """
    url = "https://sh.fang.anjuke.com/loupan/all/p" + str(page) + "/"
    bsObj = getHtmlbsObj(url)
    h_div = bsObj.find_all("div", {"class": "infos"})
    # find_all() returns an empty result set, never None, when nothing
    # matches, so emptiness is the condition that has to be tested.
    if not h_div:
        print("h_div data is None ERROR")
        return pages

    num = 1
    for link in h_div:
        name_tag = link.find("span", {"class": "items-name"})
        if name_tag is None:
            # No name to search for; skip this entry instead of crashing.
            continue

        # The sale status lives in the <i> inside the tag panel; a listing
        # without that element is treated as not sold out.
        tag_panel = link.find("div", {"class": "tag-panel"})
        status_tag = tag_panel.i if tag_panel is not None else None
        status = status_tag.get_text().strip() if status_tag is not None else ""

        # Skip listings that are sold out.
        if status == "售罄":
            continue

        name = name_tag.get_text()
        if name in pages:
            continue

        print(str(num) + ":" + name)
        getByNameFindUrl(name)
        pages.append(name)
        num = num + 1
    return pages
# Look a listing up by name and scrape the detail page of every search hit.
def getByNameFindUrl(name):
    """Search for *name* and collect the links of each matching detail page.

    The detail-page URL of every search result is passed to
    ``getPageInfoUrl`` and each non-empty result is appended to the
    module-level ``url_lists``.

    Args:
        name: Listing name used as the ``kw`` search keyword.
    """
    # Percent-encode the keyword so characters such as '&', '#' or spaces in
    # a listing name cannot change the meaning of the query string.
    url = "https://sh.fang.anjuke.com/loupan/s?kw=" + quote(name)
    # Search-results page.
    bsObj = getHtmlbsObj(url)
    h_div = bsObj.find_all("div", {"class": "infos"})
    # find_all() yields an empty result set (not None) when nothing matches.
    if not h_div:
        print("h_div_name data is None ERROR")
        return

    for h_link in h_div:
        # Detail links look like:
        # https://sh.fang.anjuke.com/loupan/426516.html?from=AF_RANK_1
        anchor = h_link.find("a", {"class": "lp-name"})
        if anchor is None:
            continue
        nameurl = anchor.get('href')
        list_urls = getPageInfoUrl(nameurl)
        # An empty list means the detail page could not be parsed; it would
        # only add a blank row to the output.
        if list_urls:
            url_lists.append(list_urls)
# Parse a listing detail page and collect all of its links.
def getPageInfoUrl(nameurl):
    """Collect the title and link URLs from one listing detail page.

    Args:
        nameurl: URL of the detail page. ``None`` or an empty string is
            reported and yields an empty list.

    Returns:
        A list holding the listing title, then the ``href`` of every anchor
        in the page's ``lp-nav`` block (``None`` for an anchor without one),
        then the photo-gallery link when the page has one. The list is empty
        when *nameurl* is missing or the title element cannot be found.
    """
    links_url = list()
    if not nameurl:
        print("nameurl data is None ERROR")
        return links_url

    bsObj = getHtmlbsObj(nameurl)

    # Listing name.
    title_box = bsObj.find("div", {"class": "lp-tit"})
    title_tag = None
    if title_box is not None:
        title_tag = title_box.find("h1", {"id": "j-triggerlayer"})
    if title_tag is None:
        # The page does not have the expected layout; report it rather than
        # raising AttributeError on None.
        print("title data is None ERROR")
        return links_url
    title = title_tag.get_text()
    links_url.append(title)
    print("title:" + title)

    # Links in the navigation bar.
    lp_nav_div = bsObj.find("div", {"class": "lp-nav"})
    if lp_nav_div is None:
        print("lp_nav_div data is None ERROR")
    else:
        for nav_anchor in lp_nav_div.find_all("a"):
            a_link = nav_anchor.get('href')
            links_url.append(a_link)
            # str() keeps the log line working when the anchor has no href.
            print("113" + nav_anchor.get_text() + ":" + str(a_link))

    # Photo-gallery link.
    img_link_div = bsObj.find("div", {"class": "clip"})
    img_anchor = img_link_div.find("a") if img_link_div is not None else None
    if img_anchor is not None:
        link = img_anchor.get("href")
        links_url.append(link)
        print("119看圖鏈接:" + str(link))
    else:
        print("img_link_div data is None ERROR")
    return links_url
def pad_rows_to_width(rows, width):
    """Return copies of *rows*, each exactly *width* items long.

    Short rows are padded on the right with ``None``; longer rows are cut
    off after *width* items. The input rows are not modified.
    """
    return [(list(row) + [None] * width)[:width] for row in rows]


if __name__ == '__main__':
    # Start time.
    start = datetime.datetime.now()
    print(start)
    fileName = "D:/anjuke_house_url.csv"
    # Walk index pages 1 through 9.
    for i in range(1, 10):
        lists = getHouseTitleLink(str(i))
        print(len(lists))
    print("寫入cvs格式文件")
    print(len(url_lists))
    # Scraped rows vary in length (the number of navigation links differs and
    # the gallery link is optional). DataFrame raises when the data width
    # does not match the column list, so fit every row to the header first
    # rather than losing the whole run at the final step.
    mismatched = sum(1 for row in url_lists if len(row) != len(titles))
    if mismatched:
        print("rows padded/truncated to match columns:", mismatched)
    rows = pad_rows_to_width(url_lists, len(titles))
    test = pd.DataFrame(columns=titles, data=rows)
    test.to_csv(fileName)
    # Finish time.
    end = datetime.datetime.now()
    print(end)
    print("寫入完成")
    useSeconds = (end - start).total_seconds()  # elapsed time in seconds
    print(useSeconds)
轉載于:https://www.cnblogs.com/ignorelove/p/9257382.html
總結
以上是生活随笔為你收集整理的Python requests+BeautifulSoup 采集 安居客_新房信息的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: n维椭球体积公式_为了方差无偏估计为什么
- 下一篇: iPhone4S国行、港版、美版、妖机识