import requests
from bs4 import BeautifulSoup
import urllib3
from sqlalchemy import create_engine  # BUG FIX: this import was swallowed by the comment below

# Suppress the InsecureRequestWarning that urllib3 emits because every
# request in this script is made with verify=False.
urllib3.disable_warnings()
import pandas as pd


def gethtml(url):
    """Fetch *url* and return its decoded HTML text, or '' on any failure."""
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'}
    try:
        # verify=False matches the urllib3.disable_warnings() call at the top
        # of the file: certificate checking is deliberately skipped.
        r = requests.get(url, headers=headers, timeout=3, verify=False)
        r.raise_for_status()
        r.encoding = 'utf8'
        return r.text
    except Exception as e:
        print('error of ', e.args)
        return ''


def get_url(url, list_url, M):
    """Collect the sub-filter links of one Anjuke result page into *list_url*.

    url      -- (title, href, url_no) triple for the page to scan
    list_url -- dict filled with href -> (link_text, href, url_no-count, page_count)
    M        -- which filter level to expand:
                1 district, 2 road, 3 price band, 4 floor area, 5 room layout
    """
    html = gethtml(url[1])
    demo = BeautifulSoup(html, 'html.parser')
    div = demo.find_all('div', attrs={'class': 'items'})
    # BUG FIX: hf starts as an empty list so a parse failure above no longer
    # cascades into a NameError in the collection loop below.
    hf = []
    try:
        if M == 1:
            hf = div[0].find_all('a')  # district links
        elif M == 2:
            hf = div[0].find('div', attrs={'class': 'sub-items'}).find_all('a')  # road URLs
        elif M == 3:
            hf = div[1].find_all('a')  # price bands
        elif M == 4:
            hf = div[2].find_all('a')  # floor-area bands
        elif M == 5:
            hf = div[3].find_all('a')  # room layouts
    except Exception as e:
        print('error', e.args)
    # Work out how many result pages this filter combination produces.
    page = 999
    try:
        # The pager shows "prev ... next": subtract 2 for the arrow buttons.
        page = len(demo.find('div', attrs={'class': 'sale-left'})
                       .find('div', attrs={'class': 'multi-page'})
                       .find_all(['a', 'i'])) - 2
    except Exception:
        # No pager: fall back to counting listed items (0 or 1 page).
        try:
            num = len(demo.find('div', attrs={'class': 'sale-left'})
                          .find('ul', attrs={'id': 'houselist-mod-new'})
                          .find_all('a'))
            if num == 0:
                page = 0
            else:
                page = 1
        except Exception:
            page = 0
    if page > 0:
        try:
            count = 0
            for i in hf:
                count = count + 1
                list_url[i.attrs['href']] = (i.text, i.attrs['href'], url[2] + '-' + str(count), page)
        except Exception as e:
            print('error', e.args)


def save_mysql(data, table_url='hours_url_6'):
    """Append the crawled URL dict *data* to MySQL table *table_url*."""
    conn = create_engine('mysql+pymysql://root:root123@localhost:3306/gethtml?charset=utf8')
    # Keys become the (discarded) index, value tuples become the rows.
    df = pd.DataFrame(data).T
    df.columns = ['name', 'url', 'num', 'page']
    # if_exists='append' accumulates rows across runs ('replace' would reset).
    pd.io.sql.to_sql(df, table_url, con=conn, if_exists='append', index=False)
    conn.dispose()


def main():
    """Breadth-first crawl: districts, then road/price/area/layout filters.

    list_url maps url -> (title, url, url_no, page); the key equals the url.
    Filter combinations with more than 5 result pages are expanded one level
    deeper on the next pass.
    """
    url = 'https://xm.anjuke.com/sale/'
    url_no = '1'  # hierarchical numbering for this URL level
    ls_1 = {}
    list_url = {}
    get_url(['城市', url, url_no], list_url, 1)  # level 1: districts
    save_mysql(list_url)
    ls = list_url.copy()
    for depth in range(2, 6):
        list_url.clear()
        ls_1.clear()
        count = 0
        for i in ls:
            count += 1
            url = ls[i]
            get_url(url, list_url, depth)
            if len(list_url) > 0:
                # NOTE(review): this adds a sentinel *row* keyed 'get_id'; a
                # get_id *column* was probably intended — confirm against the
                # hours_url_8 schema.
                list_url['get_id'] = None
                save_mysql(list_url, 'hours_url_8')
                for k in list_url:
                    if k == 'get_id':
                        # BUG FIX: the sentinel's value is None, so the
                        # original crashed here with "None[3]" (TypeError).
                        continue
                    if list_url[k][3] > 5:
                        ls_1[k] = list_url[k]
                list_url.clear()
            print('\r', '當前深度:%s, 獲取進度:%s/%s' % (depth, count, len(ls)), end='')
        print('\r', '當前進度:%s , 獲取總數:%s' % (depth, len(ls_1)))
        ls = ls_1.copy()


def main_1():
    """Resume variant of main(): restart from a hand-picked district set."""
    list_url = {
        # 'https://xm.anjuke.com/sale/siming/': ('思明', 'https://xm.anjuke.com/sale/siming/', '1-1', 8),
        # 'https://xm.anjuke.com/sale/huli/': ('湖里', 'https://xm.anjuke.com/sale/huli/', '1-2', 8),
        # 'https://xm.anjuke.com/sale/jimei/': ('集美', 'https://xm.anjuke.com/sale/jimei/', '1-3', 8),
        # 'https://xm.anjuke.com/sale/haicang/': ('海滄', 'https://xm.anjuke.com/sale/haicang/', '1-4', 8),
        # 'https://xm.anjuke.com/sale/tongana/': ('同安', 'https://xm.anjuke.com/sale/tongana/', '1-5', 8),
        'https://xm.anjuke.com/sale/xiangana/': ('翔安', 'https://xm.anjuke.com/sale/xiangana/', '1-6', 8),
        'https://xm.anjuke.com/sale/xiamenzhoubian/': ('廈門周邊', 'https://xm.anjuke.com/sale/xiamenzhoubian/', '1-7', 8)}
    ls = list_url.copy()
    ls_1 = {}
    list_url.clear()
    for depth in range(2, 6):
        list_url.clear()
        ls_1.clear()
        count = 0
        for i in ls:
            count += 1
            url = ls[i]
            get_url(url, list_url, depth)
            if len(list_url) > 0:
                save_mysql(list_url, 'hours_url_9')
                for k in list_url:
                    if list_url[k][3] > 5:
                        ls_1[k] = list_url[k]
                list_url.clear()
            print('\r', '當前深度:%s, 獲取進度:%s/%s' % (depth, count, len(ls)), end='')
        print('\r', '當前進度:%s , 獲取總數:%s' % (depth, len(ls_1)))
        ls = ls_1.copy()


main()
# ======================================================================
# 2. Crawl house details from the collected URLs
#    (fetch each listing by ID; check first whether its data was already saved)
# ======================================================================
import pandas as pd
import pymysql
from bs4 import BeautifulSoup
import requests
import re
from sqlalchemy import create_engine
from requests.packages.urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)


def get_url(url):
    """Fetch *url*; return the HTML text, 404/503 for those statuses, 1 otherwise on error."""
    req = None
    try:
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'}
        req = requests.get(url, headers=headers, timeout=3, verify=False)
        req.raise_for_status()
        req.encoding = 'utf-8'
        return req.text
    except Exception as e:
        # BUG FIX: if requests.get() itself raised (timeout, DNS, ...), req was
        # never bound and reading req.status_code raised NameError, masking the
        # real error. Guard with None.
        error_no = req.status_code if req is not None else None
        if error_no == 404 or error_no == 503:
            return error_no
        print('error', e.args)
        return 1


def html_demo(html, list_data):
    """Parse one listing page and merge its fields into the dict *list_data*."""
    pat = re.compile(r'[\:]')  # strips half- and full-width colons
    demo = BeautifulSoup(html, 'html.parser')
    # Property attributes: each item carries a label <div> and a value <div>.
    li = demo.find_all('li', attrs={'class': 'houseInfo-detail-item'})
    for i in li:
        div = i.find_all('div')
        count = 1
        for j in div:
            if count == 1:
                a = ''.join(pat.sub('', j.text.strip()).split())
            elif count == 2:
                b = ''.join(pat.sub('', j.text.strip()).split())
                list_data[a] = b
            else:
                continue  # only the first two divs carry data
            count += 1
    # Broker / sales info.
    b_info = demo.find('div', attrs={'class': 'broker-wrap'})
    brokercard_name = b_info.find('div', attrs={'class': 'brokercard-name'}).text.split()[0]
    list_data['銷售'] = brokercard_name
    # Ratings arrive as alternating <em>label</em><em>value</em> pairs; the
    # page is assumed to carry at least three pairs — TODO confirm.
    em = b_info.find_all('em')
    for i in range(3):
        list_data[pat.sub('', em[i * 2].text)] = em[i * 2 + 1].text
    # Agency name ("label:value" in the link title); optional on the page.
    try:
        gs_name = b_info.find('a', attrs={'class': 'text-overflow'}).attrs['title'].split(':')
        list_data[gs_name[0]] = gs_name[1]
    except Exception:
        pass


def save_mysql(df, database='hours_data'):
    """Append DataFrame *df* to MySQL table *database*."""
    conn = create_engine('mysql+pymysql://root:root123@localhost:3306/gethtml?charset=utf8')
    # if_exists='append' accumulates rows across runs ('replace' would reset).
    pd.io.sql.to_sql(df, database, con=conn, if_exists='append', index=False)
    conn.dispose()


def update_id(id_num, database='hours_url_id'):
    """Mark URL id *id_num* as fetched (get_id=1) in table *database*."""
    conn = pymysql.connect('localhost', 'root', 'root123', charset='utf8', database='gethtml')
    try:
        cursor = conn.cursor()
        # NOTE(review): string-built SQL; id_num comes from our own DB, but a
        # parameterized query (%s placeholder in execute) would still be safer.
        sql = "update %s set get_id=1 where ID='%s'" % (database, id_num)
        cursor.execute(sql)
        conn.commit()
        cursor.close()
    finally:
        conn.close()


def get_id(id_table):
    """Return all distinct IDs in *id_table* that were not yet fetched."""
    conn = pymysql.connect('localhost', 'root', 'root123', database='gethtml', charset='utf8')
    try:
        cursor = conn.cursor()
        sql = 'select distinct ID from %s where get_id is null ' % id_table
        cursor.execute(sql)
        data_id = cursor.fetchall()
        cursor.close()
        # BUG FIX: the original closed cursor and connection *after* the
        # return statement, so the cleanup never ran; finally guarantees it.
        return data_id
    finally:
        conn.close()


def data_exists(table_name, url_no):
    """Return the number of rows in *table_name* whose ID equals *url_no*."""
    conn = pymysql.connect('localhost', 'root', 'root123', charset='utf8', database='gethtml')
    try:
        cursor = conn.cursor()
        sql = 'select ID from %s where ID="%s"' % (table_name, url_no)
        # BUG FIX: the original returned cursor.arraysize, which is the
        # fetchmany() batch size (1 by default) and not the number of matching
        # rows, so every ID looked "already fetched". cursor.execute() returns
        # the number of rows the SELECT produced.
        found = cursor.execute(sql)
        cursor.close()
        return found
    finally:
        conn.close()


def main():
    """Fetch every pending listing ID, parse its page and store it in MySQL."""
    data = get_id('hours_url_id')
    start_url = 'https://xm.anjuke.com/prop/view/'
    count = 0
    n = len(data)
    for row in data:
        count += 1
        print('\r', '獲取數據進度:%s/%s' % (count, n), end='')
        # NOTE(review): this checks the same table the pending IDs came from,
        # so it looks always-true — confirm whether 'hours_data' was meant.
        count_id = data_exists('hours_url_id', row[0])
        if count_id >= 1:
            update_id(row[0])
            continue
        try:
            url = start_url + row[0]
            html = get_url(url)
            if html == 1:        # other error: skip this listing
                continue
            elif html == 404:    # listing gone: ignore
                pass
            elif html == 503:    # anti-bot captcha page: stop the whole run
                print('網頁需驗證:503')
                break
            else:
                # BUG FIX: build a fresh dict per listing; the original reused
                # one dict for the whole run, so fields of a previous house
                # leaked into any later listing that lacked them.
                list_data = {}
                html_demo(html, list_data)
                list_data['ID'] = row[0]
                df = pd.DataFrame(list_data, index=row)
                save_mysql(df)
                update_id(row[0])
        except Exception as e:
            # BUG FIX: dropped the extra count+=1 here, which double-counted
            # progress whenever a listing failed.
            print('error', url, e.args)
            continue


main()
# ======================================================================
# 3. Data processing for the scraped house data
# ======================================================================
import numpy as np
import pandas as pd
import pymysql


def get_hours_data():
    """Load the raw hours_data table from MySQL, indexed by listing ID."""
    conn = pymysql.connect('localhost', 'root', 'root123', database='gethtml', charset='utf8')
    sql = 'select * from %s ' % 'hours_data'
    df = pd.read_sql(sql, conn, index_col='ID')
    conn.close()
    return df


def hours_data_extract(df):
    """Clean the raw listing DataFrame and encode its fields numerically.

    Strips units, maps categorical Chinese labels to integer codes, derives
    室/廳/衛 (rooms/halls/baths), 均面積 (area per room), 樓高 (building
    height), 層級 (floor band) and price_group, and drops free-text columns.
    """
    df.drop_duplicates(keep='first', inplace=True)
    df = df[~df.index.duplicated()]
    # Sanity checks: warn if the unit suffixes are not uniform.
    if df['建筑面積'].str[-3:].drop_duplicates(keep='first').count() > 1:
        print('建筑面積異常,存在單位不統一')
    if df['房屋單價'].str[-4:].drop_duplicates(keep='first').count() > 1:
        print('房屋單價異常,存在單位不統一')
    # Drop malformed floor values ("共..." totals-only, "地..." basement).
    df = df[(df.所在樓層.str[:1] != '共') & (df.所在樓層.str[:1] != '地')]
    # df.所在樓層=df.所在樓層.str.replace(['共90層','共74層'],['共9層','共7層'])
    # Strip units and convert to numbers.
    # BUG FIX: np.float/np.int were deprecated aliases of the builtins and
    # were removed in NumPy 1.24; the builtins are behavior-identical here.
    df.建筑面積 = df.建筑面積.str[:-3].astype(float)
    df.房屋單價 = df.房屋單價.str[:-4].astype(int)
    df.建造年代 = df.建造年代.str[:-1].astype(int)
    df.所在位置 = df.所在位置.str.split('-').str[0].replace(
        ['思明', '湖里', '集美', '海滄', '同安', '翔安', '廈門周邊'], [1, 2, 3, 4, 5, 6, 7])
    df.房屋類型 = df.房屋類型.replace(['普通住宅', '平房', '公寓', '暫無'], [1, 2, 3, 4])
    df[['室', '廳', '衛']] = df['房屋戶型'].str.extract(r'(\d+)室(\d+)廳(\d+)衛', expand=False).astype(int)
    df['均面積'] = df['建筑面積'] / (df['室'] + df['廳'])
    df['樓高'] = df.loc[:, '所在樓層'].str[4:].str[:-2].astype(int)
    # Repair two known typos in the source data (90 -> 9, 74 -> 7 floors).
    df.樓高 = df.樓高.replace([90, 74], [9, 7])
    df['層級'] = df.loc[:, '所在樓層'].str[:2].replace(['低層', '中層', '高層'], [1, 2, 3]).astype(int)
    df.裝修程度 = df.裝修程度.replace(['精裝修', '簡單裝修', '豪華裝修', '毛坯'], [1, 2, 3, 4])
    df.產權年限 = df.產權年限.replace(['70年', '50年', '40年'], [1, 2, 3])
    df.配套電梯 = df.配套電梯.replace(['無', '有'], [0, 1])
    df.房本年限 = df.房本年限.replace(['滿五年', '滿二年', '不滿二年', '暫無'], [1, 2, 3, 4])
    df.產權性質 = df.產權性質.replace(
        ['商品房住宅', '商住兩用', '單位集體自建房', '使用權', '保障性住房', '動遷配套房', '其他'],
        [1, 2, 3, 4, 5, 6, 7])
    df.唯一住房 = df.唯一住房.replace(['是', '否', '暫無'], [1, 0, 0])
    df.房屋朝向 = df.房屋朝向.replace(
        ['東', '南', '西', '北', '東西', '南北', '東南', '東北', '西南', '西北'],
        [1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
    df['price_group'] = df.房屋單價 // 10000
    # Drop columns that are free text or not useful as model features.
    df = df.drop(['所屬小區', '房屋戶型', '參考月供', '所在樓層', '參考首付',
                  '銷售', '房源', '服務', '評價', '工商注冊名稱', '一手房源'], axis=1)
    return df


def getdata():
    """Convenience wrapper: load from MySQL and return the cleaned DataFrame."""
    df = get_hours_data()
    data = hours_data_extract(df)
    return data
import pandas as pd
import pymysql
import numpy as np
import sklearn.decomposition as dp
import matplotlib.pyplot as plt
from sklearn import preprocessing


def get_data(data_table):
    """Load commodity-housing rows (產權性質='商品房住宅') from *data_table*."""
    conn = pymysql.connect('localhost', 'root', 'root123', charset='utf8', database='gethtml')
    sql = "select * from %s where 產權性質='商品房住宅'" % data_table
    df = pd.read_sql(sql, conn, index_col='ID')
    conn.close()
    return df


def data_revision(df):
    """Deduplicate and convert the unit-suffixed string columns to numbers."""
    df = df.drop_duplicates(keep='first')   # drop fully duplicated rows
    df = df[~df.index.duplicated()]         # drop rows with a duplicated ID
    if df['建筑面積'].str[-3:].drop_duplicates(keep='first').count() > 1:
        print('建筑面積異常,存在單位不統一')
    df['建筑面積'] = df['建筑面積'].str[:-3]
    # BUG FIX: np.float/np.int were removed in NumPy 1.24; the builtin types
    # are behavior-identical replacements.
    df['建筑面積'] = df['建筑面積'].astype(float)
    if df['房屋單價'].str[-4:].drop_duplicates(keep='first').count() > 1:
        print('房屋單價異常,存在單位不統一')
    df['房屋單價'] = df['房屋單價'].str[:-4]
    df['房屋單價'] = df['房屋單價'].astype(int)
    df['建造年代'] = df['建造年代'].str[:-1]
    df['建造年代'] = df['建造年代'].astype(int)
    df['所在位置'] = df['所在位置'].str.split('-').str[0]
    df = df.drop(['參考月供'], axis=1)
    return df


def data_pca(df):
    """Project (year, area, district) onto 2 PCA components and scatter-plot by price band."""
    y = df['房屋單價']
    # .copy() so the label-encoding below writes to an owned frame instead of
    # a view of df (avoids chained-assignment ambiguity).
    X = df[['建造年代', '建筑面積', '所在位置']].copy()
    # Integer-encode the district names.
    X[['所在位置']] = X[['所在位置']].apply(preprocessing.LabelEncoder().fit_transform)
    # Bucket unit prices into three bands.
    bins = [0, 20000, 50000, 100000]
    labels = [1, 2, 3]
    y = pd.cut(y, bins, right=True, labels=labels)
    pca = dp.PCA(n_components=2)      # reduce the features to 2 dimensions
    train_X = pca.fit_transform(X)
    x_ = []
    y_ = []
    # NOTE(review): range(2, 0, -1) only plots bands 2 and 1; band 3 is never
    # drawn — confirm whether that is intentional.
    for i in range(2, 0, -1):
        for j in range(len(train_X)):
            # BUG FIX: y's index is the string listing IDs, so positional
            # y[j] is invalid in modern pandas; iloc selects by position.
            if y.iloc[j] == i:
                x_.append(train_X[j][0])
                y_.append(train_X[j][1])
        plt.scatter(x_, y_)
        x_.clear()
        y_.clear()
    plt.show()


def main():
    """Load, clean and visualize the commodity-housing data."""
    df = get_data('hours_data')
    data = data_revision(df)
    data_pca(data)
    # for i in data.columns:
    #     print(data[i].head())
    # print(data.count())


main()