pandas matplotlib 直播数据分析
生活随笔
收集整理的這篇文章主要介紹了
pandas matplotlib 直播数据分析
小編覺得挺不錯的,現在分享給大家,幫大家做個參考。
直播數據分析
針對 douyu_60937 直播間 (2018/11/19 19:04:18 - 2018/11/20 7:56:42) 這個時間段的數據分析
基礎數據展示
以上數據是從直播間的彈幕中提取的相關數據,每個字段解釋為
依賴
pandas==0.23.4 matplotlib==3.0.2 numpy==1.15.4 datetime
數據處理
基礎準備
import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Spreadsheet holding the records extracted from the danmaku (chat) stream of
# live room douyu_60937; every later snippet reads from `data`.
path = "douyu_60937.xlsx"
data = pd.read_excel(path)
根據牌子名稱統計最大值、最小值、平均值
- 計算牌子的最大值、最小值、平均值的時候需要根據 user 將數據刪除重複項,避免多次計算
- 需要做成圖,所以我們把返回值設置成 dict()
{'mean': {'小人參': 11.848837209302326, '196': 16.475254730713246, '女流': 11.418693982074263, 'Amss': 10.0, '水煮肉': 26.0, '小僵尸': 9.93750495049505, '339': 18.941176470588236, '金發雅': 24.0, '猛男': 7.768361581920904, '小豆包': 12.676724137931034, '阿冷': 10.416666666666666, '寅子': 7.059241706161138, '小癢蟲': 10.718562874251496, 'S1un': 22.0, '小緑帽': 9.938271604938272, '集團軍': 7.909323116219668, '小烏賊': 8.377464788732395, '點子王': 5.686131386861314, '小肚皮': 7.034911587538053, '林Q': 21.0}, 'min': {'小人參': 2, '196': 4, '女流': 3, 'Amss': 4, '水煮肉': 26, '小僵尸': 1, '339': 12, '金發雅': 24, '猛男': 1, '小豆包': 3, '阿冷': 6, '寅子': 1, '小癢蟲': 2, 'S1un': 22, '小緑帽': 1, '集團軍': 1, '小烏賊': 1, '點子王': 3, '小肚皮': 1, '林Q': 21}, 'max': {'小人參': 30, '196': 30, '女流': 30, 'Amss': 27, '水煮肉': 26, '小僵尸': 25, '339': 25, '金發雅': 24, '猛男': 23, '小豆包': 23, '阿冷': 22, '寅子': 22, '小癢蟲': 22, 'S1un': 22, '小緑帽': 22, '集團軍': 22, '小烏賊': 22, '點子王': 22, '小肚皮': 21, '林Q': 21}}
根據牌子名稱統計數量
- 分類統計的時候我們要修改列名,用 rename(columns={'老列名': '新列名'})
{'sign_count': {'小肚皮': 15439, '小僵尸': 12625, '集團軍': 1566, '196': 1374, '女流': 781, '影魔王': 640, '大馬猴': 429, '寅子': 422, '小緑帽': 405, '小8路': 397, '小烈驢': 370, '小烏賊': 355, '小贏僧': 355, '保安團': 343, '豬芳芳': 243, '小豆包': 232, '王菠蘿': 187, '二帆': 184, '猛男': 177, '汽車人': 173}}
統計各個等級的用戶數量
# Number of distinct users at each user level, most common level first.
# Keep one row per user so that a user who sent many messages is counted once.
now_data = data[['user', 'level']].drop_duplicates(subset=['user'])
# Group the de-duplicated frame. Grouping `data` again here would discard the
# de-duplication above and count messages instead of users.
now_data = now_data.groupby('level').size().to_frame('level_count')
now_data.sort_values(['level_count'], ascending=False, inplace=True)

# Output published with the article. NOTE(review): these figures add up to far
# more than the number of distinct speakers reported for the same period, so
# they appear to come from the un-de-duplicated data (messages per level);
# re-run the snippet above to obtain per-user figures.
# {'level_count': {16: 3159, 19: 3122, 17: 3086, 21: 2984, 18: 2882, 15: 2832,
#                  22: 2624, 23: 2564, 20: 2545, 13: 2379, 14: 2308, 24: 2274,
#                  11: 2066, 12: 1894, 7: 1781, 9: 1753, 10: 1690, 8: 1678,
#                  5: 1645, 25: 1554}}
每小時發言數量
- 根據時間統計需要構造一個 datetime 數據類型的列並將其設為索引,利用 resample("時間標識符") + count() 進行統計
- 為了後續製圖方便,我在這裡直接把 <class 'pandas._libs.tslibs.timestamps.Timestamp'> 轉換成 Python 內置的 datetime 類,方法是 to_pydatetime()
{'user_count': {datetime.datetime(2018, 11, 19, 19, 0): 12707, datetime.datetime(2018, 11, 19, 20, 0): 12374, datetime.datetime(2018, 11, 19, 21, 0): 19340, datetime.datetime(2018, 11, 19, 22, 0): 13530, datetime.datetime(2018, 11, 19, 23, 0): 8, datetime.datetime(2018, 11, 20, 0, 0): 2, datetime.datetime(2018, 11, 20, 1, 0): 1, datetime.datetime(2018, 11, 20, 2, 0): 0, datetime.datetime(2018, 11, 20, 3, 0): 0, datetime.datetime(2018, 11, 20, 4, 0): 5, datetime.datetime(2018, 11, 20, 5, 0): 1, datetime.datetime(2018, 11, 20, 6, 0): 11, datetime.datetime(2018, 11, 20, 7, 0): 23}}
每小時在線人數(發言人)
# Distinct speakers per hour ("online" users are approximated by the users who
# sent at least one message in that hour).
# resample() works on the index, so the timestamp column becomes the index.
df = data.loc[:, ['user', 'uptime']].set_index('uptime')
# nunique() counts every user once *per hour*. De-duplicating users over the
# whole period first would count each user only in the hour of their first
# message and under-report every later hour.
result = df.resample('H')['user'].nunique().to_frame('user_count')
print(result)

result = result.to_dict()
# Convert pandas Timestamp keys to built-in datetime objects for plotting.
s = {stamp.to_pydatetime(): count for stamp, count in result['user_count'].items()}
result['user_count'] = s
print(result)

# Output published with the article. NOTE(review): it was produced by the
# earlier version that de-duplicated users across the whole period, i.e. it
# shows first-time speakers per hour; re-run the snippet above for per-hour
# distinct speakers.
# {'user_count': {datetime.datetime(2018, 11, 19, 19, 0): 4223,
#                 datetime.datetime(2018, 11, 19, 20, 0): 2207,
#                 datetime.datetime(2018, 11, 19, 21, 0): 3843,
#                 datetime.datetime(2018, 11, 19, 22, 0): 1875,
#                 datetime.datetime(2018, 11, 19, 23, 0): 6,
#                 datetime.datetime(2018, 11, 20, 0, 0): 0,
#                 datetime.datetime(2018, 11, 20, 1, 0): 1,
#                 datetime.datetime(2018, 11, 20, 2, 0): 0,
#                 datetime.datetime(2018, 11, 20, 3, 0): 0,
#                 datetime.datetime(2018, 11, 20, 4, 0): 1,
#                 datetime.datetime(2018, 11, 20, 5, 0): 1,
#                 datetime.datetime(2018, 11, 20, 6, 0): 1,
#                 datetime.datetime(2018, 11, 20, 7, 0): 4}}
制作圖表
粉絲牌等級情況 柱狀圖
def autolabel(ax, rects, xpos='center'):
    """Write each bar's height as a text label just above the bar.

    :param ax: object whose ``text`` method draws the label (a matplotlib Axes)
    :param rects: iterable of bars exposing ``get_x``, ``get_width`` and
        ``get_height``
    :param xpos: horizontal placement of the label relative to the bar:
        'center', 'left' or 'right' (case-insensitive)
    :raises KeyError: if ``xpos`` is not one of the three supported values
    :return: None
    """
    xpos = xpos.lower()
    # A label placed to the right of centre is left-aligned, and vice versa.
    ha = {'center': 'center', 'right': 'left', 'left': 'right'}
    # Fraction of the bar width at which the label is anchored.
    offset = {'center': 0.5, 'right': 0.57, 'left': 0.43}
    for rect in rects:
        height = rect.get_height()
        ax.text(rect.get_x() + rect.get_width() * offset[xpos], 1.01 * height,
                '{}'.format(height), ha=ha[xpos], va='bottom')


def sign_bar_wiht_leve(t1):
    """Draw a grouped bar chart of min / mean / max level per fan badge.

    The figure is saved as "粉絲牌等級.jpg" and then displayed.

    :param t1: dict with the entries 'mean', 'min' and 'max', each mapping a
        badge name to a number
    :raises KeyError: if a badge listed under 'mean' is missing from 'min' or
        'max'
    :return: None
    """
    names = list(t1['mean'])
    mean_levels = [round(t1['mean'][name], 1) for name in names]
    # Look the values up by badge name so the three bars of one group always
    # describe the same badge, even if the sub-dicts are ordered differently.
    min_levels = [t1['min'][name] for name in names]
    max_levels = [t1['max'][name] for name in names]

    ind = np.arange(len(names))
    width = 0.35
    fig, ax = plt.subplots(figsize=(30, 10.5))
    rects_2 = ax.bar(ind, min_levels, width / 2, color='IndianRed', label='最低等級')
    rects_1 = ax.bar(ind + width / 2, mean_levels, width / 2, color='SkyBlue', label='平均等級')
    rects_3 = ax.bar(ind + width, max_levels, width / 2, color='Black', label='最高等級')

    ax.set_ylabel('等級')
    ax.set_title('粉絲牌等級 (2018/11/19 19:04:18 - 2018/11/20 7:56:42)')
    # The three bars are centred at ind, ind + width/2 and ind + width, so the
    # middle of each group is ind + width/2.
    ax.set_xticks(ind + width / 2)
    ax.set_xticklabels(names)
    ax.legend()

    for rects in (rects_1, rects_2, rects_3):
        autolabel(ax=ax, rects=rects, xpos="center")

    fig.savefig("粉絲牌等級.jpg")
    # plt.show() matches the other chart functions in this article.
    plt.show()
粉絲牌占比 餅圖
def sign_pie_with_count(t2, top_n=5):
    """Draw a pie chart of the share of messages per fan badge.

    Only the first ``top_n`` entries of ``t2['sign_count']`` (in the dict's
    iteration order) are drawn, so the percentages are relative to those
    entries only. The chart is saved as "粉絲牌占比.jpg" and then displayed.

    :param t2: dict whose 'sign_count' entry maps badge name -> count;
        presumably already sorted by count in descending order — verify
        against the caller
    :param top_n: number of leading entries to include (default 5)
    :return: None
    """
    top = list(t2['sign_count'].items())[:top_n]
    labels = [name for name, _ in top]
    sizes = [count for _, count in top]

    plt.figure(figsize=(8, 4))  # figure size in inches
    plt.pie(sizes,
            labels=labels,
            autopct='%3.2f%%',  # percentage with two decimals
            shadow=False,       # no drop shadow
            startangle=90,      # first wedge starts at 90 degrees
            pctdistance=0.8)    # percentage text at 0.8 x radius from the centre
    plt.axis('equal')  # equal aspect ratio keeps the pie circular
    plt.legend()
    plt.title('粉絲牌占比 \n(2018/11/19 19:04:18 - 2018/11/20 7:56:42)')
    plt.savefig("粉絲牌占比.jpg")
    plt.show()
用戶等級分布 折線圖
def user_line_with_count(t4):
    """Plot the count per user level as a dashed line, one label per point.

    The chart is saved as "用戶等級情況.jpg" and then displayed.

    :param t4: dict whose 'level_count' entry maps user level -> count
    :return: None
    """
    # Sort by level so the line runs left to right along the x axis.
    points = sorted(t4['level_count'].items(), key=lambda item: item[0])
    levels = [level for level, _ in points]
    counts = [count for _, count in points]

    plt.figure(figsize=(8, 4))
    plt.plot(levels, counts, "b--", linewidth=1)
    # Print the count above every data point.
    for level, count in points:
        plt.text(level, count, count, ha='center', va='bottom', fontsize=10)
    plt.xlabel("用戶等級")
    # The label previously contained stray pinyin annotations ("數(shù)量").
    plt.ylabel("數量")
    plt.title("用戶等級情況\n(2018/11/19 19:04:18 - 2018/11/20 7:56:42)")
    plt.savefig("用戶等級情況.jpg")
    plt.show()
每個時間段用戶以及彈幕量 折線圖
def show_label(x, y, plt):
    """Print every y value as a text label at its (x, y) position.

    :param x: iterable of x positions
    :param y: iterable of y values, paired with ``x`` element by element
    :param plt: object whose ``text`` method draws the label (the pyplot
        module is passed by ``user_time``)
    :return: None
    """
    for a, b in zip(x, y):
        plt.text(a, b, b, ha='center', va='bottom', fontsize=10)


def user_time(t5, t6):
    """Plot two hourly series on one chart: message count and speaker count.

    The chart is saved as "每個時間段用戶以及彈幕量.jpg" and then displayed.

    :param t5: dict whose 'user_count' entry maps a datetime (hour) to the
        value drawn as the message-count line
    :param t6: dict whose 'user_count' entry maps a datetime (hour) to the
        value drawn as the speaker-count line
    :return: None
    """
    # Use "YYYY-MM-DD HH" strings as x values so each hour is one tick.
    t_5 = {k.strftime("%Y-%m-%d %H"): v for k, v in t5['user_count'].items()}
    t_6 = {k.strftime("%Y-%m-%d %H"): v for k, v in t6['user_count'].items()}
    # Materialise the dict views as lists before handing them to matplotlib.
    x_1, y_1 = list(t_5.keys()), list(t_5.values())
    x_2, y_2 = list(t_6.keys()), list(t_6.values())

    plt.figure(figsize=(9, 5))
    # The legend and axis labels previously contained stray pinyin
    # annotations (e.g. "數(shù)量").
    plt.plot(x_1, y_1, "o-", linewidth=1, label='彈幕數量')
    plt.plot(x_2, y_2, "g--", linewidth=1, label='在線人數')
    show_label(x_1, y_1, plt)
    show_label(x_2, y_2, plt)
    plt.xticks(rotation=30)
    plt.legend()
    plt.xlabel("小時")
    plt.ylabel("人數")
    plt.title("每個時間段用戶以及彈幕量\n(2018/11/19 19:04:18 - 2018/11/20 7:56:42)")
    plt.savefig("每個時間段用戶以及彈幕量.jpg")
    plt.show()
總結
以上是生活随笔為你收集整理的pandas matplotlib 直播数据分析的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 关于Java中next() nextLi
- 下一篇: Qt 常量中有换行符 中文