import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Read the data; the 'Id' column becomes the row index of both frames.
X_full = pd.read_csv('../input/train.csv', index_col='Id')
X_test_full = pd.read_csv('../input/test.csv', index_col='Id')

# Obtain target and predictors.
y = X_full.SalePrice
features = [
    'LotArea',
    'YearBuilt',
    '1stFlrSF',
    '2ndFlrSF',
    'FullBath',
    'BedroomAbvGr',
    'TotRmsAbvGrd',
]
# Copies keep later modifications from touching the frames read from disk.
X = X_full[features].copy()
X_test = X_test_full[features].copy()

# Break off a validation set (20%) from the training data; the fixed seed
# makes the split reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

# Define the models: five random-forest configurations follow.
from sklearn.metrics import mean_absolute_error

# Five random-forest configurations to compare.
model_1 = RandomForestRegressor(n_estimators=50, random_state=0)
model_2 = RandomForestRegressor(n_estimators=100, random_state=0)
# NOTE(review): scikit-learn >= 1.0 renames criterion 'mae' to
# 'absolute_error' (and 1.2 drops the old spelling); adjust if you run a
# newer release.
model_3 = RandomForestRegressor(n_estimators=100, criterion='mae', random_state=0)
model_4 = RandomForestRegressor(n_estimators=200, min_samples_split=20, random_state=0)
model_5 = RandomForestRegressor(n_estimators=100, max_depth=7, random_state=0)

models = [model_1, model_2, model_3, model_4, model_5]


def score_model(model, X_t=X_train, X_v=X_valid, y_t=y_train, y_v=y_valid):
    """Fit ``model`` and return its mean absolute error on held-out data.

    Args:
        model: Estimator exposing ``fit`` and ``predict``. It is fitted in
            place.
        X_t: Training predictors (defaults to ``X_train`` as bound when this
            function is defined).
        X_v: Validation predictors (defaults to ``X_valid``).
        y_t: Training target (defaults to ``y_train``).
        y_v: Validation target (defaults to ``y_valid``).

    Returns:
        The mean absolute error between ``y_v`` and the predictions for
        ``X_v``.
    """
    model.fit(X_t, y_t)
    preds = model.predict(X_v)
    return mean_absolute_error(y_v, preds)


# Find the model with the smallest validation error.
mae_scores = []
for i in range(len(models)):
    mae = score_model(models[i])
    mae_scores.append(mae)
    print("Model %d MAE: %d" % (i + 1, mae))

# Pick the winner from the measured scores instead of a hard-coded index;
# on a tie the earliest model in the list is kept.
best_model = models[mae_scores.index(min(mae_scores))]
# Refit the selected model on all of the training data before predicting.
my_model = best_model
my_model.fit(X, y)

# Generate test predictions.
preds_test = my_model.predict(X_test)

# Save predictions in the format used for competition scoring.
output = pd.DataFrame({'Id': X_test.index, 'SalePrice': preds_test})
output.to_csv('submission.csv', index=False)
評分:mae誤差 20998.83780
2. Missing Values 缺失值處理
缺失值的處理:
丟棄整列,缺點是信息丟失嚴重
# Names of the training columns that contain at least one missing value.
cols_with_missing = [col for col in X_train.columns if X_train[col].isnull().any()]

# Drop those columns from both the training and the validation data, so the
# two frames keep the same set of features.
reduced_X_train = X_train.drop(cols_with_missing, axis=1)
reduced_X_valid = X_valid.drop(cols_with_missing, axis=1)
插值填補(imputation),比如用均值填充等
from sklearn.impute import SimpleImputer

# Print the SimpleImputer API reference.
help(SimpleImputer)

# The default strategy fills missing values with the column mean; use
# SimpleImputer(strategy="median") to fill with the median instead.
imp = SimpleImputer()

# Fit on the training data only, then apply the same fill values to the
# validation data. The imputer returns bare arrays, so the column names and
# the row index are put back when the frames are rebuilt.
imputed_X_train = pd.DataFrame(
    imp.fit_transform(X_train), columns=X_train.columns, index=X_train.index
)
imputed_X_valid = pd.DataFrame(
    imp.transform(X_valid), columns=X_valid.columns, index=X_valid.index
)
SimpleImputer 參考如下
class SimpleImputer(_BaseImputer)
 |  SimpleImputer(missing_values=nan, strategy='mean', fill_value=None, verbose=0, copy=True, add_indicator=False)
 |
 |  Imputation transformer for completing missing values.
 |
 |  Read more in the :ref:`User Guide <impute>`.
 |
 |  Parameters
 |  ----------
 |  missing_values : number, string, np.nan (default) or None
 |      The placeholder for the missing values. All occurrences of
 |      `missing_values` will be imputed.
 |
 |  strategy : string, default='mean'
 |      The imputation strategy.
 |
 |      - If "mean", then replace missing values using the mean along
 |        each column. Can only be used with numeric data.
 |      - If "median", then replace missing values using the median along
 |        each column. Can only be used with numeric data.
 |      - If "most_frequent", then replace missing using the most frequent
 |        value along each column. Can be used with strings or numeric data.
 |      - If "constant", then replace missing values with fill_value. Can be
 |        used with strings or numeric data.
# Get the list of categorical variables, i.e. the columns whose dtype is
# 'object' (non-numeric).
s = (X_train.dtypes == 'object')
object_cols = list(s[s].index)  # s[s] keeps only the entries that are True

print("Categorical variables:")
print(object_cols)
Categorical variables:['Type','Method','Regionname'] # 特征名稱
from sklearn.preprocessing import LabelEncoder

# Make copies to avoid changing the original data.
label_X_train = X_train.copy()
label_X_valid = X_valid.copy()

# Apply the label encoder to each column with categorical data. The encoder
# is refitted per column on the training values; the validation column is
# then mapped with that same fit, so a value that never occurs in training
# makes transform() fail.
label_encoder = LabelEncoder()
for col in object_cols:
    label_X_train[col] = label_encoder.fit_transform(X_train[col])
    label_X_valid[col] = label_encoder.transform(X_valid[col])
One-Hot Encoding
from sklearn.preprocessing import OneHotEncoder

# Apply the one-hot encoder to each column with categorical data.
# handle_unknown='ignore' keeps transform() from failing on categories that
# were not seen during fit.
# NOTE(review): scikit-learn >= 1.2 renames `sparse` to `sparse_output`;
# adjust the keyword if you run a newer release.
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[object_cols]))
OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_valid[object_cols]))

# One-hot encoding removed the index; put it back so the rows line up with
# the numerical columns when concatenating.
OH_cols_train.index = X_train.index
OH_cols_valid.index = X_valid.index

# Remove the categorical columns (they are replaced by the one-hot columns),
# leaving only the numerical features.
num_X_train = X_train.drop(object_cols, axis=1)
num_X_valid = X_valid.drop(object_cols, axis=1)

# Add the one-hot encoded columns to the numerical features.
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)

# The one-hot columns are labelled with integers while the numerical ones
# have string names; newer scikit-learn releases reject such mixed labels,
# so make every column label a string.
OH_X_train.columns = OH_X_train.columns.astype(str)
OH_X_valid.columns = OH_X_valid.columns.astype(str)
遇見訓練集和測試集的文字變量種類不一樣:
檢查哪些特征在兩個集合里都是一樣的,不一樣的話直接編碼會出錯
# All categorical columns.
object_cols = [col for col in X_train.columns if X_train[col].dtype == "object"]

# Columns that can be safely label encoded: the training and validation data
# contain exactly the same set of values.
good_label_cols = [
    col for col in object_cols if set(X_train[col]) == set(X_valid[col])
]

# Problematic columns that will be dropped from the dataset.
bad_label_cols = list(set(object_cols) - set(good_label_cols))
這里處理的方法是,丟棄不一致的,對一致的進行編碼轉換
from sklearn.preprocessing import LabelEncoder

# Drop the categorical columns that will not be encoded.
label_X_train = X_train.drop(bad_label_cols, axis=1)
label_X_valid = X_valid.drop(bad_label_cols, axis=1)

# Apply the label encoder to the remaining categorical columns: fit on the
# training values, then reuse that fit for the validation values.
labEncoder = LabelEncoder()
for feature in set(good_label_cols):
    label_X_train[feature] = labEncoder.fit_transform(label_X_train[feature])
    label_X_valid[feature] = labEncoder.transform(label_X_valid[feature])
查看文字特征里,有多少種變量值
# Get the number of unique entries in each column with categorical data.
object_nunique = list(map(lambda col: X_train[col].nunique(), object_cols))
d = dict(zip(object_cols, object_nunique))

# Print the number of unique entries by column, in ascending order.
print(sorted(d.items(), key=lambda x: x[1]))
# Sample output from the original write-up:
# [('Street', 2),          # Street has 2 distinct values
#  ('Utilities', 2),
#  ('CentralAir', 2),
#  ...
#  ('Exterior2nd', 16),
#  ('Neighborhood', 25)]
# Features with many distinct values are a poor fit for one-hot encoding
# because the dataset grows a lot; label-encode them or drop them instead.
# Columns that will be one-hot encoded: categorical features with fewer than
# 10 distinct values in the training data.
low_cardinality_cols = list(
    filter(lambda name: X_train[name].nunique() < 10, object_cols)
)

# Columns that will be dropped from the dataset: whatever categorical
# features remain (difference of the two sets).
all_categorical = set(object_cols)
kept_categorical = set(low_cardinality_cols)
high_cardinality_cols = list(all_categorical - kept_categorical)
from sklearn.preprocessing import OneHotEncoder

# One-hot encoder; handle_unknown='ignore' keeps transform() from failing on
# categories that were not seen during fit.
# NOTE(review): scikit-learn >= 1.2 renames `sparse` to `sparse_output`;
# adjust the keyword if you run a newer release.
ohEnc = OneHotEncoder(handle_unknown='ignore', sparse=False)

# One-hot encode the features with fewer than 10 distinct values.
OH_X_train = pd.DataFrame(ohEnc.fit_transform(X_train[low_cardinality_cols]))
OH_X_valid = pd.DataFrame(ohEnc.transform(X_valid[low_cardinality_cols]))

# Encoding loses the index; put it back so the rows line up on concat.
OH_X_train.index = X_train.index
OH_X_valid.index = X_valid.index

# Numerical features: the original data with every categorical column
# dropped.
num_X_train = X_train.drop(object_cols, axis=1)
num_X_valid = X_valid.drop(object_cols, axis=1)

# Combine the one-hot columns with the numerical features. The
# high-cardinality categorical columns are left out entirely.
OH_X_train = pd.concat([OH_X_train, num_X_train], axis=1)
OH_X_valid = pd.concat([OH_X_valid, num_X_valid], axis=1)

# The one-hot columns are labelled with integers while the numerical ones
# have string names; newer scikit-learn releases reject such mixed labels,
# so make every column label a string.
OH_X_train.columns = OH_X_train.columns.astype(str)
OH_X_valid.columns = OH_X_valid.columns.astype(str)