Datawha组队——Pandas(下)综合练习(打卡)
生活随笔
收集整理的這篇文章主要介紹了
Datawha组队——Pandas(下)综合练习(打卡)
小編覺得挺不錯的,現在分享給大家,幫大家做個參考.
import pandas as pd
import numpy as np
import missingno as msno
import matplotlib.pyplot as plt
# Matplotlib settings: the font entry is there so Chinese labels display
# correctly, the unicode_minus entry so the minus sign displays correctly.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})

# Read the data.
df = pd.read_csv('端午粽子數(shù)據(jù).csv')
# Trim surrounding whitespace from the column names.
df.columns = df.columns.str.strip()
df.columns
# Missing-value matrix of the whole frame.
print(msno.matrix(df))
# Keep only the rows that have a shipping address.
has_address = df['發(fā)貨地址'].notnull()
df = df[has_address]
def is_number(x):
    """Return True when ``x`` can be converted with ``float()``, else False.

    Parameters:
        x: any value, typically one cell of the price column.

    Returns:
        bool: False when ``float(x)`` raises ``TypeError`` (e.g. ``None``),
        ``ValueError`` (non-numeric text) or ``OverflowError`` (an integer too
        large for a float); True otherwise.
    """
    try:
        float(x)
    except (TypeError, ValueError, OverflowError):
        return False
    return True


# Show the rows whose price cannot be parsed as a number.
# NOTE(review): the column key contains a "(pinyin)" fragment that looks like
# scrape damage; it is kept verbatim here - confirm against the CSV header.
df[~df['價(jià)格'].map(is_number)]
# Overwrite the price of rows 538 and 4376 with clean numeric strings
# (presumably the rows reported by the non-numeric check - verify), then
# convert the whole price column to float.
# NOTE(review): the column keys contain "(pinyin)" fragments that look like
# scrape damage; they are kept verbatim - confirm against the CSV header.
df.loc[[538, 4376], '價(jià)格'] = ['45.9', '45.0']
df['價(jià)格'] = df['價(jià)格'].astype(float)

# Mean price of the items whose shipping address contains "杭州".
# A literal match is used: the former pattern r'[杭州]{2}' was a character
# class and therefore also matched "杭杭", "州州" and "州杭".
df_1 = df[df['發(fā)貨地址'].str.contains('杭州', regex=False)]
df_1['價(jià)格'].mean()
結果為:
# NOTE(review): several column keys below contain "(pinyin)" fragments that
# look like scrape damage; they are kept verbatim - confirm against the CSV
# header before running.

# Items whose title mentions "嘉興" but whose shipping address does not.
# Literal matches replace the former character-class patterns ('[嘉興]{2}'),
# which also matched e.g. "嘉嘉"; na=False treats a missing title/address as
# "does not contain".
title_mentions = df['標(biāo)題'].str.contains('嘉興', regex=False, na=False)
ships_from = df['發(fā)貨地址'].str.contains('嘉興', regex=False, na=False)
df[title_mentions & ~ships_from]

# 20/40/60/80 % price quantiles.
df['價(jià)格'].describe(percentiles=[.2, .4, .6, .8]).loc[['20%', '40%', '60%', '80%']]

# Bin the price into five ordered levels.
price_levels = ['低', '較低', '中', '較高', '高']
df['new_價(jià)格'] = pd.cut(
    df['價(jià)格'],
    [0.0, 29.3, 43.9, 69.84, 124.80, np.inf],
    labels=price_levels,
)
df.set_index('new_價(jià)格').sort_index(ascending=False).head()

# Parse the payer count: take the first number in the text and multiply it by
# 10000 when the text contains the "萬" unit.
payer_number = pd.to_numeric(
    df['付款人數(shù)'].astype('string').str.extract(r'(\d+(?:\.\d+)?)', expand=False),
    errors='coerce',
)
df['付款人數(shù)'] = df['付款人數(shù)'].apply(str)
in_wan = df['付款人數(shù)'].str.contains('萬', regex=False)
df['new_付款人數(shù)'] = payer_number.where(~in_wan, payer_number * 10000)

# Number of missing values, the affected row labels, and one sample row.
print(df['new_付款人數(shù)'].isnull().sum())
print(df.index[df['new_付款人數(shù)'].isnull()])
print(df.loc[183])

g = df.groupby(df['new_價(jià)格'])
# Missing payer counts per price level.
for level in price_levels:
    print(g.get_group(level)['new_付款人數(shù)'].isnull().sum())
# Mean payer count per price level.
for level in price_levels:
    print(g.get_group(level)['new_付款人數(shù)'].mean())

# Fill each missing payer count with the mean of its own price level.
# The original notebook filled every gap with the '低' group's mean and noted
# that it could only fill everything at once, not per group, leaving that as
# an open problem; transform('mean') yields one mean per row, aligned to df,
# which makes the per-group fill a single fillna call.
level_mean = g['new_付款人數(shù)'].transform('mean')
df['new_付款人數(shù)'] = df['new_付款人數(shù)'].fillna(level_mean)
df['new_付款人數(shù)'].isnull().sum()
# NOTE(review): several column keys, file names and text fragments below
# contain "(pinyin)" pieces that look like scrape damage; they are kept
# verbatim - confirm against the real CSV headers and file names.

# --- String concatenation ---------------------------------------------------
# Join the first two space-separated parts of the shipping address (or keep
# the single part when there is only one).
address = [
    parts[0] + parts[1] if len(parts) > 1 else parts[0]
    for parts in df['發(fā)貨地址'].str.split(' ')
]
df['new_發(fā)貨地址'] = address

# Build one descriptive sentence per row by concatenation. The price column is
# cast to str because it is numeric and str + float raises TypeError.
('商品發(fā)貨地為' + df['new_發(fā)貨地址']
 + ',店鋪為' + df['店鋪']
 + ',共計(jì)' + df['付款人數(shù)']
 + ',單價(jià)為' + df['價(jià)格'].astype(str)).to_frame().rename(columns={0: 'ID'})

# Same sentence built row by row with apply.
s = df.apply(
    lambda r: f'商品發(fā)貨地址為{r["new_發(fā)貨地址"]},店鋪為{r["店鋪"]},共計(jì){r["付款人數(shù)"]},單價(jià)為{r["價(jià)格"]}',
    axis=1,
).to_frame().rename(columns={0: 'ID'})
s

# Split every sentence back into its four fields.
address = []
shops = []
persons = []
prices = []
for fields in s['ID'].str.split(','):
    add = fields[0].split('為')[1]
    if len(add) > 3:
        # Re-insert the space after the first two characters of the address.
        add = add[:2] + ' ' + add[2:]
    address.append(add)
    shops.append(fields[1].split('為')[1])
    persons.append(fields[2].split('計(jì)')[1])
    prices.append(fields[3].split('為')[1])
s['發(fā)貨地址'] = address
s['店鋪'] = shops
s['付款人數(shù)'] = persons
s['價(jià)格'] = prices
print(s)

# --- Melbourne temperature data ---------------------------------------------
df = pd.read_csv('墨爾本溫度數(shù)據(jù).csv')
df

# Dates to exclude: 1-3 May and 1-7 October of every year 1981-1990, plus the
# 'BMS' dates of that period. The original hand-written chain listed October
# 1989 twice and omitted October 1990; generating the ranges per year covers
# all ten years.
# NOTE(review): 'BMS' is the first business day of each month - confirm that
# this is the intended exclusion.
holiday_years = range(1981, 1991)
holiday_parts = (
    [pd.date_range(start=f'{y}0501', end=f'{y}0503') for y in holiday_years]
    + [pd.date_range(start=f'{y}1001', end=f'{y}1007') for y in holiday_years]
    + [pd.date_range(start='19810101', end='19901231', freq='BMS')]
)
holiday = holiday_parts[0].append(holiday_parts[1:]).drop_duplicates()

df['Date'] = pd.to_datetime(df['Date'])
# Monthly mean over the remaining days.
result = df[~df['Date'].isin(holiday)].set_index('Date').resample('M').mean()
result

# --- Per-month index, computed with plain loops ------------------------------
# Year and month as strings (month without a leading zero).
df['Y'] = df['Date'].dt.year.astype(str)
df['M'] = df['Date'].dt.month.astype(str)
Y = df.groupby('Y')
M = df.groupby(['Y', 'M'])
# Minimum temperature of every (year, month) pair.
monthly_min = M['Temp'].min()

tempYlist = []
tempYZlist = []
for year in range(1981, 1991):
    tempYlist = [monthly_min.get((str(year), str(month)), np.nan) for month in range(1, 13)]
    # NOTE(review): Ymean is overwritten on every pass, so after the loop it
    # only reflects the last year (1990), and tempYZlist is never filled.
    # This is kept as in the original - confirm the intended definition.
    Ymean = np.sum(np.mean(tempYlist))

tempMZlist = []
for month in range(1, 13):
    tempMlist = [monthly_min.get((str(year), str(month)), np.nan) for year in range(1981, 1991)]
    print(tempMlist)
    tempMZlist.append(np.mean(tempMlist))

Sj = np.array(tempMZlist) / Ymean
Sj

# --- Mobike data --------------------------------------------------------------
import pandas as pd
import numpy as np
import datetime

df = pd.read_csv('摩拜單車數(shù)據(jù).csv')
df['start_time'] = pd.to_datetime(df['start_time'])
# Calendar day of the ride start (time of day set to midnight).
df['new_start_time'] = df['start_time'].dt.normalize()
df['work_week'] = df['start_time'].dt.dayofweek
# Group the data by weekday (0-6 stand for Monday to Sunday) and count the
# rides on each weekday.
df.groupby('work_week').size()
# For every day from 2016-08-01 to 2016-08-31, count the rides that start in
# the morning window (07:30-09:30) and in the evening window (17:30-19:00),
# both bounds inclusive.
morning_start = pd.Timedelta(hours=7, minutes=30)
morning_end = pd.Timedelta(hours=9, minutes=30)
evening_start = pd.Timedelta(hours=17, minutes=30)
evening_end = pd.Timedelta(hours=19)

times = []
countZs = []
countWs = []
for day in pd.date_range(start='2016-08-01', end='2016-08-31'):
    # Selecting with a mask (instead of GroupBy.get_group) lets a day without
    # any ride count as 0 rather than raising KeyError.
    starts = df.loc[df['new_start_time'] == day, 'start_time']
    times.append(day)
    countZs.append(int(starts.between(day + morning_start, day + morning_end).sum()))
    countWs.append(int(starts.between(day + evening_start, day + evening_end).sum()))

workdf = pd.DataFrame({'time': times, 'countZ': countZs, 'countW': countWs})
workdf['time'] = pd.to_datetime(workdf['time'])

# Business days on which the morning window had more rides than the evening
# window. Both conditions are combined into one mask; chaining two boolean
# selections re-indexes the second mask against an already filtered frame.
workday = pd.date_range(start='2016-08-01', end='2016-08-31', freq='B')
workdf[workdf['time'].isin(workday) & (workdf['countZ'] > workdf['countW'])]

# Next step: count the records of every Friday in August.
# Number of rides on each Friday (dayofweek == 4), grouped by calendar day.
f = df[df['work_week'] == 4].groupby('new_start_time')
print(f.size())

# Ride durations on 2016-08-26. The subset is copied so that adding columns
# does not write into a slice of df.
data = df[df['new_start_time'] == pd.to_datetime('2016-08-26')].copy()
data['end_time'] = pd.to_datetime(data['end_time'])
data['start_time'] = pd.to_datetime(data['start_time'])
# Duration in minutes; total_seconds() keeps whole days, which .dt.seconds
# would silently drop.
data['time_sep'] = (data['end_time'] - data['start_time']).dt.total_seconds() / 60
data['new_time_sep'] = pd.cut(data['time_sep'], [0, 30, 120, 360], labels=['one', 'two', 'three'])
data.set_index(['new_time_sep'])
# Mean duration inside each of the three duration bins.
one = data[data['new_time_sep'] == 'one']['time_sep'].mean()
two = data[data['new_time_sep'] == 'two']['time_sep'].mean()
three = data[data['new_time_sep'] == 'three']['time_sep'].mean()
print(one, two, three)

# --- Distance by formula (haversine) -----------------------------------------
import math

# The trigonometric functions expect radians. The same columns are passed to
# geopy below as (lat, lon) pairs, i.e. as degrees, so they are converted
# here; the original fed degrees straight into sin/cos.
lon1 = np.radians(df['start_location_x'])
lat1 = np.radians(df['start_location_y'])
lon2 = np.radians(df['end_location_x'])
lat2 = np.radians(df['end_location_y'])
R = 6371  # Earth radius in km
dlon = lon2 - lon1
dlat = lat2 - lat1
a = (np.sin(dlat / 2)) ** 2 + np.cos(lat1) * np.cos(lat2) * (np.sin(dlon / 2)) ** 2
c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
d = R * c

# --- Distance with geopy -------------------------------------------------------
# "!pip install geopy" is notebook-only syntax; install geopy beforehand.
import geopy.distance

lon1 = df['start_location_x'].tolist()
lat1 = df['start_location_y'].tolist()
lon2 = df['end_location_x'].tolist()
lat2 = df['end_location_y'].tolist()
coords_1 = list(zip(lat1, lon1))
coords_2 = list(zip(lat2, lon2))
dist = [geopy.distance.distance(i, j).km for i, j in zip(coords_1, coords_2)]

# Distance (the haversine result is the one stored).
df['dis'] = d
# Average speed = distance / time. The duration was previously only computed
# on the one-day subset, so df['time_sep'] did not exist; compute it here for
# every ride (in minutes).
df['time_sep'] = (
    pd.to_datetime(df['end_time']) - pd.to_datetime(df['start_time'])
).dt.total_seconds() / 60
df['sudu'] = df['dis'] / df['time_sep']

# 3-sigma rule for outliers.
Dmean = df['sudu'].mean()
Dstd = df['sudu'].std()
# Thresholds
thre1 = Dmean - 3 * Dstd
thre2 = Dmean + 3 * Dstd
# Outliers
outlies = df[(df['sudu'] < thre1) | (df['sudu'] > thre2)]

# The plots follow.
def _plot_speed(frame, title):
    """Scatter-plot the 'sudu' column of ``frame`` against the row position.

    Parameters:
        frame: DataFrame with a 'sudu' column.
        title: text used as the figure title.
    """
    plt.figure()
    plt.scatter(range(frame.shape[0]), frame['sudu'].tolist())
    plt.xlabel('用戶')
    plt.ylabel('速度值')
    plt.title(title)
    plt.show()


# Before the outliers are removed.
_plot_speed(df, '未處理缺失值-速度圖像')

# After: drop every row whose speed lies outside mean +/- 3 standard deviations.
Dmean = df['sudu'].mean()
Dstd = df['sudu'].std()

thre1 = Dmean - 3 * Dstd
thre2 = Dmean + 3 * Dstd

outlies = df.index[(df['sudu'] < thre1) | (df['sudu'] > thre2)]

data = df.drop(outlies, axis=0)

_plot_speed(data, '處理缺失值-速度圖像')
總結
以上是生活随笔為你收集整理的Datawha组队——Pandas(下)综合练习(打卡)的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: tomcat日志设置与详解
- 下一篇: i3wmvim终极配置