python数据处理与机器学习
生活随笔收集整理的這篇文章主要介紹了《python数据处理与机器学习》,小編覺得挺不錯的,現在分享給大家,幫大家做個參考。
提綱
numpy:
# NumPy basics: building arrays, boolean indexing, element-wise vs. matrix
# arithmetic, reshaping, aliases / views / copies, tiling and sorting.
import numpy as np

# genfromtxt: load a delimited text file into an array.
# genformtxtdata = np.genfromtxt("genfromtxtdata")
# print(help(np.genfromtxt))

# A matrix is a list of lists.
matrix = np.array([[12, 12], [12, 12], [1, 13]])
print(matrix)

# Mixed inputs are coerced to one common dtype (here: float).
dataa = np.array([1, 2, 4.0, 1])

# Slicing / comparisons
# datab = dataa
# A comparison returns an array of True/False values ...
# ... which can be used to pick out the elements equal to some value.
# booldata = (datab == 1)
# print(datab[booldata])

# Select the rows whose second column equals 13.
boolmatrix = (matrix[:, 1] == 13)
print(matrix[boolmatrix, :])
# Use & and | to combine boolean masks (and / or).

# Type conversion: astype returns a NEW array, so keep the result.
dataa_float = dataa.astype(float)
# Extremes
dataa_min = dataa.min()
# Sum along an axis (axis=1 -> one sum per row).
row_sums = matrix.sum(axis=1)

# np.zeros((3, 4))               -> the shape is passed as a tuple
# np.arange(12).reshape(3, 4)    -> element count must equal 3 * 4
# np.random.random()             -> function in the np.random module; samples from [0, 1)
# np.linspace(0, 2 * np.pi, 100) -> 100 evenly spaced values
# np.exp()
# Subtraction: arrays of the same shape subtract element-wise; a scalar is
# subtracted from every element.
A = np.array([[1, 2], [1, 1]])
B = np.array([[1, 2], [1, 1]])
print(A * B)         # element-wise product
print(A.dot(B))      # matrix product
print(np.dot(A, B))  # matrix product, function form

# Matrix manipulation
# Round down
a = np.floor(10 * np.random.random((3, 4)))
b = np.floor(10 * np.random.random((3, 4)))
# Flatten the matrix into a vector.
print(a)
print(a.ravel())

# Concatenation
# print(np.hstack((a, b)))
# print(np.vstack((a, b)))
# Splitting
# print(np.hsplit(a, 2))
# print(np.vsplit(a, 2))

# Copying: plain assignment only creates a second name for the same array.
b = a
b.shape = 4, 3
# Reshaping b reshaped a as well ...
print(a)
# ... because a and b have the same id, i.e. they are the same object.
print(id(a), id(b))

# Shallow copy: c is a different object from a but shares its data, so
# writing through c also changes a.
c = a.view()
c.shape = 2, 6
c[1, 1] = 11
print(a.shape)
print(a)

# Deep copy: d is fully independent of a.
d = a.copy()

# Indexing helpers
# Position of the maximum in every column.
intt = a.argmax(axis=0)
print(intt)

# Repeat an array to build a larger one.
a = np.arange(1, 20, 10)
b = np.tile(a, (2, 3))
print(b)

# Sorting
a = np.array([[1, 2, 3], [3, 2, 1]])
# Indices that would sort each row in ascending order.
j = np.argsort(a)
a.sort(axis=1)
print(j)
print(a)

# pandas:
# pandas basics: reading CSV files, inspecting a DataFrame, sorting, missing
# values, pivot tables and the Series structure.
import os

import numpy as np
import pandas as pd
from pandas import Series

# `%pwd` only works inside IPython/Jupyter; os.getcwd() is the plain-Python
# equivalent.
current_path = os.getcwd()
print(current_path)

# food_info = pd.read_csv("food_info.csv")
# read_csv returns a DataFrame.
# print(type(food_info))
# print(food_info.dtypes)
# food_info.head()
# food_info.tail(4)
# print(food_info.columns)
# print(food_info.shape)

# Indexing and computation
# print(food_info.loc[0])
# Pass a list of names to select several columns.
# print(food_info[["NDB_No", "Shrt_Desc"]])
# column_list = food_info.columns.tolist()
# print(column_list)

# Pre-processing
# food_info.sort_values("NDB_No", inplace=True)
# After sorting, missing values are placed last.
# Ascending order:
# print(food_info["NDB_No"])
# Descending order:
# food_info.sort_values("NDB_No", inplace=True, ascending=False)
# print(food_info["NDB_No"])

titanic_train_info = pd.read_csv("titanic_train.csv")
# print(titanic_train_info.head())
# age = titanic_train_info["Age"]
# print(age.loc[0:10])
# age_is_null = pd.isnull(age)
# print(age_is_null)
# age_null_true = age[age_is_null]
# age_null_count = len(age_null_true)
# print(age_null_count)
# Mean with the missing values removed:
# age_null_false = titanic_train_info["Age"][age_is_null == False]
# average_age = sum(age_null_false) / len(age_null_false)
# average_age1 = titanic_train_info["Age"].mean()
# print(average_age, average_age1)

# Pivot tables: index (grouping key) - values (what to aggregate) - aggfunc.
# The mean is the default aggregation.
# passager_survival = titanic_train_info.pivot_table(index="Pclass", values="Survived", aggfunc="mean")
# print(passager_survival)
# passager_age = titanic_train_info.pivot_table(index="Pclass", values="Age", aggfunc="mean")
# print(passager_age)
# port_stats = titanic_train_info.pivot_table(index="Embarked", values=["Fare", "Survived"], aggfunc="sum")
# print(port_stats)

# Drop rows with missing values.
# This assignment was commented out in the original, which left
# titanic_train_info1 undefined for the statements below.
titanic_train_info1 = titanic_train_info
drop_na_columns = titanic_train_info1.dropna(axis=0, subset=["Age", "Sex"])
print(drop_na_columns.head())

# Look up one specific cell (row label 83, column "Age").
row_index_83_age = titanic_train_info1.loc[83, "Age"]
print(row_index_83_age)

# Custom functions
# titanic_train_info1.apply("function name")

# The Series structure
score_csv = pd.read_csv("fandango_score_comparison.csv")
series_FILM = score_csv["FILM"]
# print(type(series_FILM))
film_names = series_FILM.values
# print(type(film_names))
series_rt = score_csv["RottenTomatoes"]
# print(series_rt)
rt_scores = series_rt.values
print(rt_scores)

# Use the film names as the index.
series_customer = Series(rt_scores, index=film_names)
print(series_customer["Minions (2015)"])
# Positional slice; .iloc makes that explicit for a string-labelled index.
print(series_customer.iloc[5:10])

# matplotlib:
# matplotlib basics: line plots, bar charts, scatter plots, histograms,
# box plots and a few styling details.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from numpy import arange


def _plot_gender_gap(ax, category, women_color, men_color, linewidth=None):
    """Draw the women's and men's share of degrees for one category on `ax`.

    Reads the module-level `women_degrees` DataFrame. The men's share is
    computed as 100 minus the women's share. Spines and tick marks are hidden
    and the axis limits are fixed so that all subplots are comparable.
    `linewidth=None` keeps Matplotlib's default line width.
    """
    ax.plot(women_degrees['Year'], women_degrees[category],
            c=women_color, label='Women', linewidth=linewidth)
    ax.plot(women_degrees['Year'], 100 - women_degrees[category],
            c=men_color, label='Men', linewidth=linewidth)
    for spine in ax.spines.values():
        spine.set_visible(False)
    ax.set_xlim(1968, 2011)
    ax.set_ylim(0, 100)
    ax.set_title(category)
    # Booleans are required here: the string "off" is truthy and would leave
    # the tick marks switched on in current Matplotlib.
    ax.tick_params(bottom=False, top=False, left=False, right=False)


# ---- Line plot ----
unrate = pd.read_csv("UNRATE.csv")
unrate["DATE"] = pd.to_datetime(unrate["DATE"])
# print(unrate.head(12))

# first_twelve = unrate[0:100]
# plt.plot(first_twelve["DATE"], first_twelve["VALUE"])
# plt.xticks(rotation=45)
# plt.xlabel("month")
# plt.ylabel("rate")
# plt.title("失業率")
# plt.show()

# fig = plt.figure()
# ax1 = fig.add_subplot(4, 3, 1)
# ax2 = fig.add_subplot(4, 3, 2)
# ax2 = fig.add_subplot(4, 3, 6)

# fig = plt.figure(figsize=(10, 6))
# ax1 = fig.add_subplot(2, 1, 1)
# ax2 = fig.add_subplot(2, 1, 2)
# ax1.plot(np.random.randint(1, 5, 5), np.arange(5))
# ax2.plot(np.arange(10) * 3, np.arange(10))
# plt.show()

unrate["Month"] = unrate["DATE"].dt.month
# fig = plt.figure(figsize=(6, 3))
# plt.plot(unrate[0:12]["Month"], unrate[0:12]["VALUE"], c="red")
# plt.plot(unrate[12:24]["Month"], unrate[12:24]["VALUE"], c="blue")

# One line per 12-row slice, labelled 1948, 1949, ...
fig = plt.figure(figsize=(10, 5))
colors = ["red", "blue", "green", "orange", "black"]
for i, color in enumerate(colors):
    start_index = i * 12
    end_index = (i + 1) * 12
    subset = unrate[start_index:end_index]
    label = str(1948 + i)
    plt.plot(subset["Month"], subset["VALUE"], c=color, label=label)
plt.legend(loc="best")
plt.show()

# ---- Bar chart ----
reviews = pd.read_csv('fandango_scores.csv')
cols = ['FILM', 'RT_user_norm', 'Metacritic_user_nom', 'IMDB_norm', 'Fandango_Ratingvalue', 'Fandango_Stars']
norm_reviews = reviews[cols]
# print(norm_reviews[:1])

# Axes.bar() takes the bar positions and the bar heights as its two required
# arguments (the first was called `left` in old Matplotlib, `x` today);
# the third positional argument is the bar width.
num_cols = ['RT_user_norm', 'Metacritic_user_nom', 'IMDB_norm', 'Fandango_Ratingvalue', 'Fandango_Stars']
# DataFrame.ix was removed in pandas 1.0; .loc is the label-based equivalent.
bar_heights = norm_reviews.loc[0, num_cols].values
bar_positions = arange(5) + 0.75
tick_positions = range(1, 6)
fig, ax = plt.subplots()

ax.bar(bar_positions, bar_heights, 0.5)
# Horizontal bars
ax.barh(bar_positions, bar_heights, 0.5)
ax.set_xticks(tick_positions)
ax.set_xticklabels(num_cols, rotation=45)

ax.set_xlabel('Rating Source')
ax.set_ylabel('Average Rating')
ax.set_title('Average User Rating For Avengers: Age of Ultron (2015)')
plt.show()

# ---- Scatter plot ----
# Let's look at a plot that can help us visualize many points.
# plt.subplots() would return a figure plus an array of axes; here the two
# axes are added to the figure one at a time instead.
fig = plt.figure(figsize=(10, 5))
ax1 = fig.add_subplot(2, 1, 1)
ax2 = fig.add_subplot(2, 1, 2)
ax1.scatter(norm_reviews['Fandango_Ratingvalue'], norm_reviews['RT_user_norm'])
ax1.set_xlabel('Fandango')
ax1.set_ylabel('Rotten Tomatoes')
ax2.scatter(norm_reviews['RT_user_norm'], norm_reviews['Fandango_Ratingvalue'])
ax2.set_xlabel('Rotten Tomatoes')
ax2.set_ylabel('Fandango')
plt.show()

# ---- Histogram ----
reviews = pd.read_csv('fandango_scores.csv')
cols = ['FILM', 'RT_user_norm', 'Metacritic_user_nom', 'IMDB_norm', 'Fandango_Ratingvalue']
norm_reviews = reviews[cols]
# print(norm_reviews[:5])
# Count how often each value occurs ...
fandango_distribution = norm_reviews['Fandango_Ratingvalue'].value_counts()
# ... and order the counts by value, ascending.
fandango_distribution = fandango_distribution.sort_index()
imdb_distribution = norm_reviews['IMDB_norm'].value_counts()
imdb_distribution = imdb_distribution.sort_index()
# print(fandango_distribution)
# print(imdb_distribution)
fig, ax = plt.subplots()
# ax.hist(norm_reviews['Fandango_Ratingvalue'])
# `bins` sets the number of bins, `range` the interval that is binned.
ax.hist(norm_reviews['Fandango_Ratingvalue'], bins=20)
ax.hist(norm_reviews['Fandango_Ratingvalue'], range=(4, 5), bins=20)
ax.set_ylim(0, 20)

# ---- Box plot (quartiles) ----
num_cols = ['RT_user_norm', 'Metacritic_user_nom', 'IMDB_norm', 'Fandango_Ratingvalue']
fig, ax = plt.subplots()
ax.boxplot(norm_reviews[num_cols].values)
ax.set_xticklabels(num_cols, rotation=90)
ax.set_ylim(0, 5)
plt.show()

# ---- Styling details ----
# NOTE(review): the original never loads `women_degrees`; the file name below
# is an assumption - confirm it against the data set actually used.
women_degrees = pd.read_csv('percent-bachelors-degrees-women-usa.csv')

fig, ax = plt.subplots()
ax.plot(women_degrees['Year'], women_degrees['Biology'], label='Women')
ax.plot(women_degrees['Year'], 100 - women_degrees['Biology'], label='Men')
# Remove the tick marks.
ax.tick_params(bottom=False, top=False, left=False, right=False)
ax.set_title('Percentage of Biology Degrees Awarded By Gender')
ax.legend(loc="upper right")

# A 2x2 grid, one subplot per major, with spines and ticks hidden.
# (An earlier draft of this loop drew the same lines without hiding the
# spines or fixing the axis limits.)
major_cats = ['Biology', 'Computer Science', 'Engineering', 'Math and Statistics']
fig = plt.figure(figsize=(12, 12))
for sp, category in enumerate(major_cats):
    ax = fig.add_subplot(2, 2, sp + 1)
    _plot_gender_gap(ax, category, 'blue', 'green')
# Calling pyplot.legend() here will add the legend to the last subplot that was created.
plt.legend(loc='upper right')
plt.show()

# Setting line width (colors from a color-blind-friendly palette, as RGB in 0..1)
cb_dark_blue = (0/255, 107/255, 164/255)
cb_orange = (255/255, 128/255, 14/255)

fig = plt.figure(figsize=(12, 12))
for sp, category in enumerate(major_cats):
    ax = fig.add_subplot(2, 2, sp + 1)
    # Set the line width when specifying how each line should look.
    _plot_gender_gap(ax, category, cb_dark_blue, cb_orange, linewidth=10)
plt.legend(loc='upper right')
plt.show()

# One row of six subplots; the lines are labelled with text on the first and
# last subplot instead of a legend.
stem_cats = ['Engineering', 'Computer Science', 'Psychology', 'Biology', 'Physical Sciences', 'Math and Statistics']
fig = plt.figure(figsize=(18, 3))
for sp, category in enumerate(stem_cats):
    ax = fig.add_subplot(1, 6, sp + 1)
    _plot_gender_gap(ax, category, cb_dark_blue, cb_orange, linewidth=3)
    if sp == 0:
        ax.text(2005, 87, 'Men')
        ax.text(2002, 8, 'Women')
    elif sp == 5:
        ax.text(2005, 62, 'Men')
        ax.text(2001, 35, 'Women')
plt.show()

# seaborn:
# seaborn style templates, plotting contexts and color palettes.
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np

# In a Jupyter notebook, run `%matplotlib inline` first; it is IPython magic
# and not valid in a plain Python file.


def sinplot(flip=1):
    """Plot six phase-shifted sine curves of decreasing amplitude.

    `flip` multiplies every curve, so passing -1 mirrors the plot vertically.
    Curves are drawn on the current pyplot axes.
    """
    x = np.linspace(0, 14, 100)
    for i in range(1, 7):
        plt.plot(x, np.sin(x + i * 0.5) * (7 - i) * flip)


# seaborn's default style (there are five built-in themes)
# sns.set()
# sinplot()
# sns.set_style("whitegrid")
# sns.set_style("dark")
# sns.set_style("white")
# sns.set_style("ticks")
# data = np.random.normal(size=(20, 6)) + np.arange(6) / 2
# sns.boxplot(data=data)
# Remove the top and right spines.
# sns.despine()
# sns.despine(offset=10)
# sns.despine(left=True)
# Everything inside the `with` block uses that style only.
# with sns.axes_style("darkgrid"):
#     plt.subplot(211)
#     sinplot()
# plt.subplot(212)
# sinplot(-1)

# Overall layout: style plus plotting context.
sns.set_style("whitegrid")
# Other contexts: "poster", "notebook"
sns.set_context("paper", font_scale=2.5, rc=({"lines.linewidth": 4.5}))
plt.figure(figsize=(8, 6))
sinplot()

# Colors (discrete and continuous) - color matters:
#   - color_palette() accepts any color Matplotlib supports
#   - color_palette() with no arguments returns the default palette
#   - set_palette() sets the colors for all plots

# Categorical palettes
# The default plotting colors
current_palette = sns.color_palette()
sns.palplot(current_palette)
# "hls" is the default color space.
sns.palplot(sns.color_palette("hls", 8))
# Apply a palette to data.
fig = plt.figure(figsize=(10, 6))
data = np.random.normal(size=(20, 6)) + np.arange(6) / 2
sns.boxplot(data=data, palette=sns.color_palette("hls", 8))
# Change the lightness (l) and saturation (s) of the palette.
# fig = plt.figure(figsize=(10, 6))
# sns.palplot(sns.hls_palette(8, l=.2, s=.9))
# sns.boxplot(data=data, palette=sns.hls_palette(8, l=.2, s=.9))
# Paired colors
sns.palplot(sns.color_palette("Paired", 8))

# Naming colors with xkcd:
# xkcd ran a crowd-sourced effort to name random RGB colors, which produced
# 954 named colors that can be looked up in the xkcd_rgb dictionary.
plt.plot([0, 1], [0, 1], sns.xkcd_rgb["pale red"], lw=3)
plt.plot([0, 1], [0, 2], sns.xkcd_rgb["medium green"], lw=3)
plt.plot([0, 1], [0, 3], sns.xkcd_rgb["denim blue"], lw=3)

# Sequential palettes
# The color can vary with the data, e.g. to show how important a value is.
sns.palplot(sns.color_palette("Blues"))
# Dark to light
sns.palplot(sns.color_palette("Blues_r"))
# Palettes with linearly changing brightness
sns.palplot(sns.color_palette("cubehelix", 8))
sns.palplot(sns.cubehelix_palette(8, start=.5, rot=-0.75))
# Choose a base color and how light / dark the ramp is.
sns.palplot(sns.light_palette("green"))
sns.palplot(sns.dark_palette("purple"))

x, y = np.random.multivariate_normal([0, 0], [[1, -.5], [-.5, 1]], size=300).T
# plt.scatter(x, y)
fig = plt.figure(figsize=(10, 6))
pal = sns.dark_palette("green", as_cmap=True)
# Current seaborn only accepts `data` positionally, so x and y are passed by
# keyword.
sns.kdeplot(x=x, y=y, cmap=pal)
轉載于:https://www.cnblogs.com/janghe/p/8013000.html
與50位技術專家面對面 | 20年技術見證,附贈技術全景圖總結
以上是生活随笔為你收集整理的python数据处理与机器学习的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 81.游戏项目-物体任意角度飞行和停止
- 下一篇: Android 第三方库RxLifecy