加州房价预测数据预处理
本文是該系列讀書筆記的第二章數據預處理部分
獲取數據
數據的初步分析,數據探索
地理分布
數據特征的相關性
創建新的特征
數據清洗, 創建處理流水線
本文是該系列讀書筆記的第二章數據預處理部分
導入常用的數據分析庫
import pandas as pd
import numpy as np
import os
import tarfile
from six.moves import urllib
獲取數據
# Base URL of the repository that hosts the dataset archive.
download_root="https://raw.githubusercontent.com/ageron/handson-ml/master/"
# Dataset path; used both as the URL suffix and as the default local directory.
house_path="datasets/housing"
# Full URL of the compressed housing archive.
housing_url=download_root+house_path+"/housing.tgz"
def fecthing_housing_data(housing_url=housing_url,house_path=house_path):
    """Download the housing archive and extract it into ``house_path``.

    Args:
        housing_url: URL of the ``housing.tgz`` archive to download.
        house_path: Local directory that receives the archive and its
            extracted contents; it is created if it does not exist.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(house_path, exist_ok=True)
    tgz_path = os.path.join(house_path, 'housing.tgz')
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even if extraction fails.
    # NOTE(review): extractall() trusts the member paths stored in the
    # archive; only use this with a trusted download source.
    with tarfile.open(tgz_path) as housing_tgz:
        housing_tgz.extractall(path=house_path)
def load_housing_data(house_path=house_path):
    """Load ``housing.csv`` from ``house_path`` into a DataFrame.

    Args:
        house_path: Directory that contains ``housing.csv``.

    Returns:
        The DataFrame produced by ``pd.read_csv`` for that file.
    """
    csv_path = os.path.join(house_path, "housing.csv")
    return pd.read_csv(csv_path)
數據的初步分析,數據探索
# fecthing_housing_data() # download the data and extract the csv file
housing=load_housing_data()
housing.head()
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
| longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | median_house_value | ocean_proximity | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -122.23 | 37.88 | 41.0 | 880.0 | 129.0 | 322.0 | 126.0 | 8.3252 | 452600.0 | NEAR BAY |
| 1 | -122.22 | 37.86 | 21.0 | 7099.0 | 1106.0 | 2401.0 | 1138.0 | 8.3014 | 358500.0 | NEAR BAY |
| 2 | -122.24 | 37.85 | 52.0 | 1467.0 | 190.0 | 496.0 | 177.0 | 7.2574 | 352100.0 | NEAR BAY |
| 3 | -122.25 | 37.85 | 52.0 | 1274.0 | 235.0 | 558.0 | 219.0 | 5.6431 | 341300.0 | NEAR BAY |
| 4 | -122.25 | 37.85 | 52.0 | 1627.0 | 280.0 | 565.0 | 259.0 | 3.8462 | 342200.0 | NEAR BAY |
housing.info()
# total_bedrooms has missing values.
# The first 9 columns are float: longitude, latitude, median housing age, total rooms, total bedrooms, population, households, median income, median house value.
# The last column, ocean proximity, is of object type.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
longitude 20640 non-null float64
latitude 20640 non-null float64
housing_median_age 20640 non-null float64
total_rooms 20640 non-null float64
total_bedrooms 20433 non-null float64
population 20640 non-null float64
households 20640 non-null float64
median_income 20640 non-null float64
median_house_value 20640 non-null float64
ocean_proximity 20640 non-null object
dtypes: float64(9), object(1)
memory usage: 1.6+ MB
# Check which categories ocean_proximity contains.
housing['ocean_proximity'].value_counts()
<1H OCEAN 9136
INLAND 6551
NEAR OCEAN 2658
NEAR BAY 2290
ISLAND 5
Name: ocean_proximity, dtype: int64
# Compute summary statistics for the numeric features.
housing.describe()
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
| longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | median_house_value | |
|---|---|---|---|---|---|---|---|---|---|
| count | 20640.000000 | 20640.000000 | 20640.000000 | 20640.000000 | 20433.000000 | 20640.000000 | 20640.000000 | 20640.000000 | 20640.000000 |
| mean | -119.569704 | 35.631861 | 28.639486 | 2635.763081 | 537.870553 | 1425.476744 | 499.539680 | 3.870671 | 206855.816909 |
| std | 2.003532 | 2.135952 | 12.585558 | 2181.615252 | 421.385070 | 1132.462122 | 382.329753 | 1.899822 | 115395.615874 |
| min | -124.350000 | 32.540000 | 1.000000 | 2.000000 | 1.000000 | 3.000000 | 1.000000 | 0.499900 | 14999.000000 |
| 25% | -121.800000 | 33.930000 | 18.000000 | 1447.750000 | 296.000000 | 787.000000 | 280.000000 | 2.563400 | 119600.000000 |
| 50% | -118.490000 | 34.260000 | 29.000000 | 2127.000000 | 435.000000 | 1166.000000 | 409.000000 | 3.534800 | 179700.000000 |
| 75% | -118.010000 | 37.710000 | 37.000000 | 3148.000000 | 647.000000 | 1725.000000 | 605.000000 | 4.743250 | 264725.000000 |
| max | -114.310000 | 41.950000 | 52.000000 | 39320.000000 | 6445.000000 | 35682.000000 | 6082.000000 | 15.000100 | 500001.000000 |
%matplotlib inline
import matplotlib.pyplot as plt
# Look at the distribution of every numeric feature.
housing.hist(bins=50,figsize=(20,15))
# plt.show()
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x00000000179D4A20>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000000019A2A128>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000000019A557B8>],
[<matplotlib.axes._subplots.AxesSubplot object at 0x0000000019A7AE48>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000000019AAB518>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000000019AAB550>],
[<matplotlib.axes._subplots.AxesSubplot object at 0x0000000019B03278>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000000019B29908>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000000019B53F98>]],
dtype=object)
地理分布
# Scatter plot of longitude against latitude.
housing.plot(kind="scatter", x="longitude", y="latitude")
<matplotlib.axes._subplots.AxesSubplot at 0x19bbfcc0>
housing.plot(kind="scatter", x="longitude", y="latitude",alpha=0.4)
# alpha: optional scalar, default None; the alpha blending value, between 0 (transparent) and 1 (opaque).
# With transparency the scatter plot shows the high-density areas: the darker the colour, the more points overlap there (the author reads this as denser population, while noting unfamiliarity with California's geography).
<matplotlib.axes._subplots.AxesSubplot at 0x1a705b70>
# Scatter plot where marker size is population/50 and colour encodes median_house_value.
housing.plot(kind='scatter',x='longitude',y='latitude',alpha=0.4,
s=housing['population']/50,label='population',
c='median_house_value',cmap=plt.get_cmap("jet"),colorbar=True,
figsize=(9,6))
# import matplotlib
# plt.figure(figsize=(15,9))
# sc=plt.scatter(housing['longitude'],housing['latitude'],alpha=0.4,
# s=housing['population']/100,label='population',
# c=housing['median_house_value'],cmap=plt.get_cmap("jet"))
# plt.legend()
# matplotlib.rcParams["font.sans-serif"]=["SimHei"]
# matplotlib.rcParams['axes.unicode_minus'] = False
# matplotlib.rcParams['font.size'] =15
# plt.xlabel('經度')
# plt.ylabel('緯度')
# color_bar=plt.colorbar(sc)
# color_bar.set_label('meidan_house_value')
# plt.show()
# The commented-out code above is the complete pure-pyplot version: it sets the axis labels, adds the colorbar, and configures Chinese axis titles.
<matplotlib.axes._subplots.AxesSubplot at 0x19ffb390>
# House prices are closely tied to location and population density. To describe
# the relationship between variables numerically, use the standard correlation
# coefficient; the most commonly used one is the Pearson correlation coefficient.
# Select the numeric columns explicitly: ocean_proximity holds strings, and
# recent pandas versions raise in DataFrame.corr() instead of silently skipping it.
corr_matrix = housing.select_dtypes(include=[np.number]).corr()
corr_matrix
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
| longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | median_house_value | |
|---|---|---|---|---|---|---|---|---|---|
| longitude | 1.000000 | -0.924664 | -0.108197 | 0.044568 | 0.069608 | 0.099773 | 0.055310 | -0.015176 | -0.045967 |
| latitude | -0.924664 | 1.000000 | 0.011173 | -0.036100 | -0.066983 | -0.108785 | -0.071035 | -0.079809 | -0.144160 |
| housing_median_age | -0.108197 | 0.011173 | 1.000000 | -0.361262 | -0.320451 | -0.296244 | -0.302916 | -0.119034 | 0.105623 |
| total_rooms | 0.044568 | -0.036100 | -0.361262 | 1.000000 | 0.930380 | 0.857126 | 0.918484 | 0.198050 | 0.134153 |
| total_bedrooms | 0.069608 | -0.066983 | -0.320451 | 0.930380 | 1.000000 | 0.877747 | 0.979728 | -0.007723 | 0.049686 |
| population | 0.099773 | -0.108785 | -0.296244 | 0.857126 | 0.877747 | 1.000000 | 0.907222 | 0.004834 | -0.024650 |
| households | 0.055310 | -0.071035 | -0.302916 | 0.918484 | 0.979728 | 0.907222 | 1.000000 | 0.013033 | 0.065843 |
| median_income | -0.015176 | -0.079809 | -0.119034 | 0.198050 | -0.007723 | 0.004834 | 0.013033 | 1.000000 | 0.688075 |
| median_house_value | -0.045967 | -0.144160 | 0.105623 | 0.134153 | 0.049686 | -0.024650 | 0.065843 | 0.688075 | 1.000000 |
數據特征的相關性
import seaborn as sns
# plt.figure (lowercase) creates the figure and makes it current, so the heatmap
# is drawn on it; plt.Figure only instantiates an unregistered Figure object,
# which left the requested figsize unused.
plt.figure(figsize=(25,20))
hm=sns.heatmap(corr_matrix,cbar=True,annot=True,square=True,fmt='.2f',annot_kws={'size':9}, cmap="YlGnBu")
plt.show()
# Correlation of every feature with the target, strongest positive first.
corr_matrix['median_house_value'].sort_values(ascending=False)
"""
相關系數的范圍是 -1 到 1。當接近 1 時,意味強正相關;
例如,當收入中位數增加時,房價中位數也會增加。
當相關系數接近 -1 時,意味強負相關;
緯度和房價中位數有輕微的負相關性(即,越往北,房價越可能降低)。
最后,相關系數接近 0,意味沒有線性相關性。
"""
# pandas' scatter_matrix offers another way to examine the relationships between several variables.
from pandas.plotting import scatter_matrix
attributes=['median_house_value',"median_income","total_bedrooms","housing_median_age"]
scatter_matrix(housing[attributes],figsize=(12,9))
# sns.pairplot(housing[['median_house_value',"median_income",]],height=5)
# seaborn's pairplot can produce the same result.
housing.plot(kind="scatter",x='median_income',y='median_house_value',alpha=0.2)
<matplotlib.axes._subplots.AxesSubplot at 0x1e3df9e8>
創建新的特征
重點關注收入的中位數與房屋價值的中位數之間的關系,從上圖以及相關系數都可以得到兩者之間存在很明顯的正相關
可以清晰地看到向上的趨勢,并且數據點不是非常分散,
我們之前統計得到的最高房價位于500000美元的水平線
從頻率分布直方圖hist可以看到housing_median_age、median_house_value 具有長尾分布,可以嘗試對其進行log或者開根號等轉化
當然,不同項目的處理方法各不相同,但大體思路是相似的。
# Derive ratio features from the raw totals.
housing['rooms_per_household']=housing['total_rooms']/housing['households']
housing['bedrooms_per_room']= housing['total_bedrooms']/housing['total_rooms']
housing['population_per_household']=housing['population']/housing['households']
# Select the numeric columns explicitly: ocean_proximity holds strings, and
# recent pandas versions raise in DataFrame.corr() instead of silently skipping it.
corr_matrix = housing.select_dtypes(include=[np.number]).corr()
corr_matrix['median_house_value'].sort_values(ascending=False)
# Among the new features, bedrooms_per_room has a noticeably stronger negative
# correlation with the median house value: the lower the ratio, the higher the price.
# rooms_per_household is also more informative than the district's total_rooms:
# the larger the houses, the higher the price.
median_house_value 1.000000
median_income 0.688075
rooms_per_household 0.151948
total_rooms 0.134153
housing_median_age 0.105623
households 0.065843
total_bedrooms 0.049686
population_per_household -0.023737
population -0.024650
longitude -0.045967
latitude -0.144160
bedrooms_per_room -0.255880
Name: median_house_value, dtype: float64
數據清洗, 創建處理流水線
缺失值處理
處理object文本數據類型
特征放縮
構建模型pipeline
以上幾個步驟我們在之前的博客中基本上都已經用過,這里作為讀書筆記不會再過多的詳細解釋
# total_bedrooms特征缺失值處理
"""
- 去掉含有缺失值的樣本,dropna()
- 去掉含有缺失值的特征 dropna(axis=1)
- 進行填充(中位數,平均值,0,插值填充) fillna(housing['total_bedrooms'].median()) 較為方便的使用pandas中的方法
"""
# Imputer was removed from sklearn.preprocessing in scikit-learn 0.22; its
# replacement is sklearn.impute.SimpleImputer. Prefer the new class under the
# same name and fall back to the legacy one on old installations.
try:
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:
    from sklearn.preprocessing import Imputer
imputer=Imputer(strategy='mean')
# ocean_proximity is a text column, so drop it before fitting the numeric imputer.
housing_num=housing.drop('ocean_proximity',axis=1)
imputer.fit(housing_num)
Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)
# Wrap the imputed array back into a DataFrame with the original column names.
housing_num_trans=pd.DataFrame(imputer.transform(housing_num),columns=housing_num.columns)
housing_num_trans.info()
# Missing values are now filled in. For plain missing-value handling, calling pandas' fillna directly on the original data would arguably save some time, and later steps would not need to worry about it.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 12 columns):
longitude 20640 non-null float64
latitude 20640 non-null float64
housing_median_age 20640 non-null float64
total_rooms 20640 non-null float64
total_bedrooms 20640 non-null float64
population 20640 non-null float64
households 20640 non-null float64
median_income 20640 non-null float64
median_house_value 20640 non-null float64
rooms_per_household 20640 non-null float64
bedrooms_per_room 20640 non-null float64
population_per_household 20640 non-null float64
dtypes: float64(12)
memory usage: 1.9 MB
# Handle the text (object-typed) data.
from sklearn.preprocessing import LabelEncoder
encoder= LabelEncoder()
house_cat=housing['ocean_proximity']
house_cat_encode=encoder.fit_transform(house_cat)
house_cat_encode
array([3, 3, 3, ..., 1, 1, 1], dtype=int64)
# Show the category labels learned by the fitted encoder.
encoder.classes_
array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'],
dtype=object)
在之前博客中也提到類似的操作,該操作可能會讓算法認為兩個臨近的值
比兩個疏遠的值更為相似,因此一般情況下,對于類標才會使用LabelEncoder,對于特征不會使用該方式進行轉換
更為常用的操作是獨熱編碼,給每個分類創建一個二元屬性,比如當分類是INLAND,有則是1,沒有則是0
sklearn中提供了編碼器OneHotEncoder,類似于pandas中pd.get_dummies()
from sklearn.preprocessing import OneHotEncoder
# Author's note: OneHotEncoder only handles numeric data and only accepts 2D arrays, hence the reshape of the integer codes.
# NOTE(review): the numeric-only restriction presumably applies to the scikit-learn version used here — confirm against the installed version.
encoder=OneHotEncoder()
housing_cat_1hot=encoder.fit_transform(house_cat_encode.reshape((-1,1)))
housing_cat_1hot
<20640x5 sparse matrix of type '<class 'numpy.float64'>'
with 20640 stored elements in Compressed Sparse Row format>
# Convert the sparse one-hot matrix to a dense array for display.
housing_cat_1hot.toarray()
array([[0., 0., 0., 1., 0.],
[0., 0., 0., 1., 0.],
[0., 0., 0., 1., 0.],
...,
[0., 1., 0., 0., 0.],
[0., 1., 0., 0., 0.],
[0., 1., 0., 0., 0.]])
# LabelBinarizer achieves the same effect, working directly on the text labels.
from sklearn.preprocessing import LabelBinarizer
encoder=LabelBinarizer()
housing_cat_1hot=encoder.fit_transform(house_cat)
housing_cat_1hot
array([[0, 0, 0, 1, 0],
[0, 0, 0, 1, 0],
[0, 0, 0, 1, 0],
...,
[0, 1, 0, 0, 0],
[0, 1, 0, 0, 0],
[0, 1, 0, 0, 0]])
# Calling pandas.get_dummies() directly on the original data is the simplest approach.
pd.get_dummies(housing[['ocean_proximity']]).head()
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
| ocean_proximity_<1H OCEAN | ocean_proximity_INLAND | ocean_proximity_ISLAND | ocean_proximity_NEAR BAY | ocean_proximity_NEAR OCEAN | |
|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 1 | 0 |
| 1 | 0 | 0 | 0 | 1 | 0 |
| 2 | 0 | 0 | 0 | 1 | 0 |
| 3 | 0 | 0 | 0 | 1 | 0 |
| 4 | 0 | 0 | 0 | 1 | 0 |
# Feature scaling: the two scalers commonly used are MinMaxScaler and StandardScaler.
# Features on different ranges are usually scaled, which helps optimisation algorithms converge faster (especially gradient-descent-based ones).
# Normalisation (min-max): subtract the minimum, then divide by (max - min).
# Standardisation: subtract the mean, then divide by the standard deviation, giving zero mean and unit variance; it is less affected by outliers. Decision trees and random forests do not need feature scaling.
# Scaling is normally applied with fit_transform on the training set and transform on the test set.
# From splitting the dataset → pipeline
from sklearn.model_selection import train_test_split
housing=load_housing_data()
# train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42) # random sampling
from sklearn.model_selection import StratifiedShuffleSplit # stratified sampling
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
housing["income_cat"] = np.ceil(housing["median_income"] / 1.5)
# Cap the category at 5. The result is assigned back to the column rather than
# using where(..., inplace=True) on the selected Series, because chained in-place
# mutation does not update the frame under pandas copy-on-write.
housing["income_cat"] = housing["income_cat"].where(housing["income_cat"] < 5, 5.0)
# NOTE(review): income_cat is not dropped after the split, so it stays in the
# frame as an extra numeric column (see the info() output) — confirm this is intended.
for train_index, test_index in split.split(housing, housing["income_cat"]): # stratify on the income category
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]
housing = strat_train_set.copy() # work on a copy so the training set is not damaged
housing.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 16512 entries, 17606 to 15775
Data columns (total 11 columns):
longitude 16512 non-null float64
latitude 16512 non-null float64
housing_median_age 16512 non-null float64
total_rooms 16512 non-null float64
total_bedrooms 16354 non-null float64
population 16512 non-null float64
households 16512 non-null float64
median_income 16512 non-null float64
median_house_value 16512 non-null float64
ocean_proximity 16512 non-null object
income_cat 16512 non-null float64
dtypes: float64(10), object(1)
memory usage: 1.5+ MB
# Transformation pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
# Numeric pipeline: median imputation followed by standardisation.
num_pipeline=Pipeline([('imputer',Imputer(strategy='median')),('std_scaler',StandardScaler())])
# Separate the features from the target column.
housing = strat_train_set.drop("median_house_value", axis=1)
housing_labels = strat_train_set["median_house_value"].copy()
housing_num=housing.drop('ocean_proximity',axis=1)
housing_num_tr = num_pipeline.fit_transform(housing_num)
housing_cat=housing['ocean_proximity']
housing_cat_tr= LabelBinarizer().fit_transform(housing_cat)
# Concatenate the transformed numeric and categorical columns side by side.
housing_train=np.c_[housing_num_tr,housing_cat_tr]
housing_train.shape
# Numeric and categorical features cannot be transformed by the same step, so a FeatureUnion is needed.
# You give it a list of transformers (any transformers); when its transform() method is called, each transformer's transform() is run in parallel,
# it waits for their outputs, then concatenates the outputs and returns the result.
# Alternatively, transform the parts separately and merge them with numpy; there is no essential difference, except that the test set still has to be transformed and merged the same way.
(16512, 14)
import os
import sys
# Add the current working directory to the import path so that the
# future_encoders module imported below can be found there.
sys.path.append(os.getcwd())
from future_encoders import ColumnTransformer
# This rebinds OneHotEncoder, replacing the sklearn.preprocessing class imported earlier.
from future_encoders import OneHotEncoder
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]
# Apply the numeric pipeline to the numeric columns and one-hot encoding to the categorical column.
full_pipeline = ColumnTransformer([
("num", num_pipeline, num_attribs),
("cat", OneHotEncoder(), cat_attribs),
])
housing_prepared = full_pipeline.fit_transform(housing)
housing_prepared
array([[-1.15604281, 0.77194962, 0.74333089, ..., 0. ,
1. , 0. ],
[-1.17602483, 0.6596948 , -1.1653172 , ..., 0. ,
1. , 0. ],
[ 1.18684903, -1.34218285, 0.18664186, ..., 0. ,
1. , 1. ],
...,
[ 1.58648943, -0.72478134, -1.56295222, ..., 0. ,
1. , 0. ],
[ 0.78221312, -0.85106801, 0.18664186, ..., 0. ,
1. , 0. ],
[-1.43579109, 0.99645926, 1.85670895, ..., 0. ,
1. , 0. ]])
# Check that the ColumnTransformer output matches the manually assembled matrix.
np.allclose(housing_prepared, housing_train)
True
后續內容已經放在github上,篇幅過大就只能把數據預處理的部分整理在這里,然后把后續的算法的實現部分整理在github中
總結
以上是生活随笔為你收集整理的加州房价预测数据预处理的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 工行随用金怎么还款?两种方式任你选择
- 下一篇: 2019信用卡管理软件哪个好用?这三款软