import pandas as pd

# Load the Adult census CSV. The file carries no header row (header=None),
# so the 15 column labels are supplied explicitly through `names`;
# index_col=False keeps pandas from using the first column as the index.
data = pd.read_csv(
    'adult.csv',
    header=None,
    index_col=False,
    names=[
        '年齡', '單位性質', '權重', '學歷', '受教育時長',
        '婚姻狀況', '職業', '家庭狀況', '種族', '性別',
        '資產所得', '資產損失', '周工作時長', '原籍', '收入',
    ],
)

# Keep only the seven columns used in the rest of the walkthrough.
data_lite = data[['年齡', '單位性質', '學歷', '性別', '周工作時長', '職業', '收入']]

# NOTE(review): `display` is not imported here — presumably this runs in a
# Jupyter/IPython session where it is predefined; confirm before running as a
# plain script.
display(data_lite.head())
| 年齡 | 單位性質 | 學歷 | 性別 | 周工作時長 | 職業 | 收入 |
0| 39 | State-gov | Bachelors | Male | 40 | Adm-clerical | <=50K |
1| 50 | Self-emp-not-inc | Bachelors | Male | 13 | Exec-managerial | <=50K |
2| 38 | Private | HS-grad | Male | 40 | Handlers-cleaners | <=50K |
3| 53 | Private | 11th | Male | 40 | Handlers-cleaners | <=50K |
4| 28 | Private | Bachelors | Female | 40 | Prof-specialty | <=50K |
# One-hot encode every categorical column of data_lite; numeric columns are
# passed through unchanged.
# dtype=int is given explicitly because pandas >= 2.0 produces bool dummy
# columns by default, which would not match the 0/1 indicators used below.
data_dummies = pd.get_dummies(data_lite, dtype=int)

# Compare the column sets before and after encoding, then report the shape
# and type of the encoded frame.
print('樣本原始特征:\n', list(data_lite.columns), '\n')
print('虛擬變量特征:\n', list(data_dummies.columns), '\n')
print('data_dummies.shape:\n', data_dummies.shape, '\n')
print('data_dummies的類型:\n', type(data_dummies))
樣本原始特征:
 ['年齡', '單位性質', '學歷', '性別', '周工作時長', '職業', '收入'] 

虛擬變量特征:
 ['年齡', '周工作時長', '單位性質_ ?', '單位性質_ Federal-gov', '單位性質_ Local-gov', '單位性質_ Never-worked', '單位性質_ Private', '單位性質_ Self-emp-inc', '單位性質_ Self-emp-not-inc', '單位性質_ State-gov', '單位性質_ Without-pay', '學歷_ 10th', '學歷_ 11th', '學歷_ 12th', '學歷_ 1st-4th', '學歷_ 5th-6th', '學歷_ 7th-8th', '學歷_ 9th', '學歷_ Assoc-acdm', '學歷_ Assoc-voc', '學歷_ Bachelors', '學歷_ Doctorate', '學歷_ HS-grad', '學歷_ Masters', '學歷_ Preschool', '學歷_ Prof-school', '學歷_ Some-college', '性別_ Female', '性別_ Male', '職業_ ?', '職業_ Adm-clerical', '職業_ Armed-Forces', '職業_ Craft-repair', '職業_ Exec-managerial', '職業_ Farming-fishing', '職業_ Handlers-cleaners', '職業_ Machine-op-inspct', '職業_ Other-service', '職業_ Priv-house-serv', '職業_ Prof-specialty', '職業_ Protective-serv', '職業_ Sales', '職業_ Tech-support', '職業_ Transport-moving', '收入_ <=50K', '收入_ >50K'] 

data_dummies.shape:
 (32561, 46) 

data_dummies的類型:
 <class 'pandas.core.frame.DataFrame'>
# Lift pandas' display truncation so every column and row is printed.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Preview the first rows of the one-hot encoded frame (as the last expression
# of a notebook cell, its value is rendered as the cell output).
data_dummies.head()
| 年齡 | 周工作時長 | 單位性質_ ? | 單位性質_ Federal-gov | 單位性質_ Local-gov | 單位性質_ Never-worked | 單位性質_ Private | 單位性質_ Self-emp-inc | 單位性質_ Self-emp-not-inc | 單位性質_ State-gov | 單位性質_ Without-pay | 學歷_ 10th | 學歷_ 11th | 學歷_ 12th | 學歷_ 1st-4th | 學歷_ 5th-6th | 學歷_ 7th-8th | 學歷_ 9th | 學歷_ Assoc-acdm | 學歷_ Assoc-voc | 學歷_ Bachelors | 學歷_ Doctorate | 學歷_ HS-grad | 學歷_ Masters | 學歷_ Preschool | 學歷_ Prof-school | 學歷_ Some-college | 性別_ Female | 性別_ Male | 職業_ ? | 職業_ Adm-clerical | 職業_ Armed-Forces | 職業_ Craft-repair | 職業_ Exec-managerial | 職業_ Farming-fishing | 職業_ Handlers-cleaners | 職業_ Machine-op-inspct | 職業_ Other-service | 職業_ Priv-house-serv | 職業_ Prof-specialty | 職業_ Protective-serv | 職業_ Sales | 職業_ Tech-support | 職業_ Transport-moving | 收入_ <=50K | 收入_ >50K |
0| 39 | 40 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
1| 50 | 13 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
2| 38 | 40 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
3| 53 | 40 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
4| 28 | 40 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
# Feature columns: a label-based slice from '年齡' through
# '職業_ Transport-moving', which leaves out the two trailing '收入_*'
# indicator columns that encode the target.
features = data_dummies.loc[:, '年齡':'職業_ Transport-moving']
print('features的類型:\n', type(features), '\n')

# Convert the feature frame to a plain NumPy array for scikit-learn.
X = features.values
print('X的類型:\n', type(X), '\n')
print('打印X的前五行:\n', X[:5, :])

# Label vector: the indicator column for income above 50K.
y = data_dummies['收入_ >50K'].values
print('特征形態:{} 標簽形態{}'.format(X.shape, y.shape))
features的類型:
 <class 'pandas.core.frame.DataFrame'> 

X的類型:
 <class 'numpy.ndarray'> 

打印X的前五行:
 [[39 40 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0
  0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [50 13 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
  0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0]
 [38 40 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
  0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0]
 [53 40 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0]
 [28 40 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
  0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0]]
特征形態:(32561, 44) 標簽形態(32561,)
# Inspect the column labels of the feature frame, then the type of the
# object that holds them.
features.columns
type(features.columns)
pandas.core.indexes.base.Index
from sklearn import tree
from sklearn.model_selection import train_test_split

# Hold out part of the samples for evaluation; random_state=0 makes the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Fit a decision tree whose depth is capped at 5 levels.
go_dating_tree = tree.DecisionTreeClassifier(max_depth=5)
go_dating_tree.fit(X_train, y_train)

# Report the score on the held-out split, formatted to two decimals.
print('模型的分:{:.2f}'.format(go_dating_tree.score(X_test, y_test)))
模型的分:0.80
總結
以上是生活随笔為你收集整理的深入浅出python机器学习_6.3.1_随机森林实例——要不要和相亲对象进一步发展的全部內容,希望文章能夠幫你解決所遇到的問題。
如果覺得生活随笔網站內容還不錯,歡迎將生活随笔推薦給好友。