Python crawler: scraping Lianjia second-hand housing listings with requests
#coding=utf-8
import requests
from requests.adapters import HTTPAdapter
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
import json
import csv
import time

# Build the request headers with a random Chrome user-agent
userAgent = UserAgent()
headers = {
    'user-agent': userAgent.chrome
}
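# Note: UserAgent() may download and cache a list of real user-agent strings
# the first time it runs, so the first call can be slow; userAgent.chrome
# then yields a random Chrome user-agent string on each access.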
# A list that accumulates one dict per listing
data_list = []

def start_spider(page):
    # Use a session whose adapter retries transient failures
    # (assigning requests.adapters.DEFAULT_RETRIES after import has no effect)
    s = requests.Session()
    s.mount('https://', HTTPAdapter(max_retries=15))
    # Ask the server to close the connection after each request
    s.headers['Connection'] = 'close'
    # URL to crawl; by default this scrapes Lianjia listings for Nanjing
    url = 'https://nj.lianjia.com/ershoufang/pg{}/'.format(page)
    # Request the page
    resp = s.get(url, headers=headers, timeout=10)
    # Parse the response body with BeautifulSoup
    soup = BeautifulSoup(resp.content, 'lxml')
    # Select every listing <li> element in the result list
    sellListContent = soup.select('.sellListContent li.LOGCLICKDATA')
    # Iterate over the listings
    for sell in sellListContent:
        try:
            # Listing title
            title = sell.select('div.title a')[0].string
            # Grab the whole houseInfo div first, then pull each field out of it
            houseInfo = list(sell.select('div.houseInfo')[0].stripped_strings)
            # Split the combined info string on '|'
            info = houseInfo[0].split('|')
            # Community (estate) name
            loupan = info[0].strip()
            # Apartment layout
            house_type = info[1].strip()
            # Floor area
            area = info[2].strip()
            # Orientation
            toward = info[3].strip()
            # Renovation status
            renovation = info[4].strip()
            # Address / position info
            positionInfo = ''.join(sell.select('div.positionInfo')[0].stripped_strings)
            # Total price
            totalPrice = ''.join(sell.select('div.totalPrice')[0].stripped_strings)
            # Unit price
            unitPrice = list(sell.select('div.unitPrice')[0].stripped_strings)[0]
            # Store the fields in a dict
            data_dict = {}
            data_dict['title'] = title
            data_dict['loupan'] = loupan
            data_dict['house_type'] = house_type
            data_dict['area'] = area
            data_dict['toward'] = toward
            data_dict['renovation'] = renovation
            data_dict['positionInfo'] = positionInfo
            data_dict['totalPrice'] = totalPrice
            data_dict['unitPrice'] = unitPrice
            data_list.append(data_dict)
        except Exception as e:
            # Skip listings (e.g. ads) whose markup does not match
            print(e)
            continue
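
# For reference, the field extraction in start_spider assumes houseInfo[0]
# looks roughly like '某某小区 | 2室1厅 | 74.5平米 | 南 | 精装' (an illustrative
# sample only -- Lianjia's markup may have changed since this was written),
# so after splitting on '|', info[1] is the layout, info[2] the area,
# info[3] the orientation and info[4] the renovation status.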

def main():
    # Scrape only the first 10 pages
    for page in range(1, 11):
        start_spider(page)
        time.sleep(3)
    # Write the data to a JSON file ('w', not 'a+', so re-runs do not
    # append a second JSON document to the same file)
    with open('data_json.json', 'w', encoding='utf-8') as f:
        json.dump(data_list, f, ensure_ascii=False, indent=4)
    print('JSON file written')
    # Write the data to a CSV file
    with open('./data_csv.csv', 'w', encoding='utf-8', newline='') as f:
        # The header row comes from the dict keys
        title = data_list[0].keys()
        # Create the writer object
        writer = csv.DictWriter(f, title)
        # Write the header row
        writer.writeheader()
        # Write all rows in one call
        writer.writerows(data_list)
    print('CSV file written')

if __name__ == '__main__':
    main()
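
Once the script has run, a quick sanity check is to read the CSV back and
print a few rows. Below is a minimal sketch, assuming data_csv.csv was
produced by the script above:

#coding=utf-8
import csv

# Read the CSV written by main() and print the first three records
with open('data_csv.csv', encoding='utf-8') as f:
    reader = csv.DictReader(f)
    for i, row in enumerate(reader):
        if i >= 3:
            break
        print(row['loupan'], row['totalPrice'], row['unitPrice'])

The same records can also be loaded from data_json.json with json.load,
since the script writes the whole data_list as a single JSON array.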