Applying pandas to data analysis (outlier detection): derived-feature computation as an example (with a case from the 2022 national service outsourcing competition)
We use problem A03 from the 2022 national service outsourcing competition as the running example for the derived-feature computation.
The main task of that problem is to identify commodities with abnormal sales volumes and abnormal prices. Four months of commodity data are provided (over 17 million rows) together with four months of shop data (over 600,000 rows), and the evaluation emphasizes time and space complexity as well as the anomaly detection rate and precision. We use shop-level analysis to support the commodity-level anomaly detection, which improves both credibility and precision.
Shop data (partial): https://pan.baidu.com/s/1iAp-s2JwG_YTB35BevMNyQ (extraction code: jhnb)
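Because the commodity table alone has more than 17 million rows and the problem scores time and space complexity, it usually cannot be loaded all at once without care. Below is a minimal sketch of chunked reading with pandas; the file name `goods.tsv`, the chunk size, and the idea of column pruning are placeholders and assumptions, not the competition's actual schema or a tuned configuration.

```python
import pandas as pd

# Chunked reading keeps peak memory bounded while scanning a very large TSV.
# "goods.tsv" and the 1_000_000 chunk size are placeholders, not the real file or a tuned value.
parts = []
for chunk in pd.read_csv("goods.tsv", sep="\t", encoding="utf-8", chunksize=1_000_000):
    parts.append(chunk)  # in practice, keep only the columns needed downstream before appending
goods = pd.concat(parts, ignore_index=True)
print(goods.shape)
```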
Our current work is the feature-mining part of the anomalous-shop detection pipeline shown in the figure above.
The figure shows the full anomalous-shop detection workflow; what we are doing now is its second step: derived-variable computation. Some of the derived features involve the shop category, but the category field in the shop data contains missing values (for the earlier data preprocessing and data overview, see https://blog.csdn.net/Hjh1906008151/article/details/124313507). Because these derived features are relatively important and nearly 10% of the values are missing, we split the task into two parts.
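As a quick sanity check of the roughly 10% missing rate mentioned above, the shop table can be inspected directly. This is only a sketch: `MAIN_CATEGORY` is a placeholder for the actual category column name, which is not shown in this post.

```python
import pandas as pd

# Count missing values per column and the missing rate of the (hypothetical) category field.
shop = pd.read_csv("../Distribution testing/shop.tsv", sep="\t", encoding="utf-8")
print(shop.isna().sum())
print(f"category missing rate: {shop['MAIN_CATEGORY'].isna().mean():.2%}")  # MAIN_CATEGORY is a placeholder name
```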
Derived features that do not require missing-value imputation
This part draws on NLP feature engineering and on composite evaluation methods; it is mainly a record of some pandas usage:
```python
import re

import gensim
import jieba
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.preprocessing import StandardScaler

read_file = r"../Distribution testing/shop.tsv"
write_file = r""  # not used below


def clear_character(sentence):
    # keep Chinese characters only; strip letters, digits and other symbols
    pattern = re.compile('[^\u4e00-\u9fa5]')
    line = re.sub(pattern, '', sentence)
    new_sentence = ''.join(line.split())
    return new_sentence


def make_bigrams(texts):
    # higher threshold -> fewer phrases
    bigram = gensim.models.Phrases(texts, min_count=3, threshold=100)
    # trigram = gensim.models.Phrases(bigram[texts], threshold=1)  # trigrams are skipped for now
    bigram_mod = gensim.models.phrases.Phraser(bigram)  # faster phrase lookup
    # trigram_mod = gensim.models.phrases.Phraser(trigram)
    return [bigram_mod[doc] for doc in texts]


def data_cleaning(df):
    train_text = [clear_character(data) for data in df["SHOP_NAME"]]
    # word segmentation; StanfordCoreNLP is awkward to install and too slow here, so jieba is used
    train_seg_text = [jieba.lcut(s) for s in train_text]
    # stop-word removal was dropped: analysing the raw term frequencies worked better
    # train_st_text = [drop_stopwords(s, stopwords) for s in train_seg_text]
    # build the bigram model (only bigrams for now, no trigrams)
    data_words_bigrams = make_bigrams(train_seg_text)
    return data_words_bigrams


def Normalization(data):
    # min-max scaling is not suitable here, so z-score standardisation is applied column by column
    # min_max = MinMaxScaler(feature_range=(0, 1))
    # data[["ITEMDESC_SCORE", "SERVICE_SCORE", "DELIVERY_SCORE"]] = min_max.fit_transform(...)
    columns = []
    for i in data.columns:
        columns.append(StandardScaler().fit_transform(np.array(data[i]).reshape(-1, 1)))
    return np.hstack(columns)


def E_j_fun(data, rows, columns):
    # entropy of each column (entropy-weight method); kept here but not called below
    E = np.zeros((rows, columns))
    for i in range(rows):
        for j in range(columns):
            if data[i][j] == 0:
                e_ij = 0.0
            else:
                P_ij = data[i][j] / data.sum(axis=0)[j]  # proportion within the column
                e_ij = (-1 / np.log(rows)) * P_ij * np.log(P_ij)
            E[i][j] = e_ij
    E_j = E.sum(axis=0)  # information entropy of each column (indicator)
    return E_j


def critic(data, rows, columns, our_weight):
    # CRITIC-style weighting: feature weight = sample std * summed correlation, then scaled by manual weights
    Z_ij = np.zeros((columns, rows))
    data_std = np.std(data, axis=1, ddof=1)  # sample standard deviation (n - 1)
    data_rela = np.corrcoef(data).sum(axis=1)
    C_i = data_rela * data_std  # element-wise product
    W_i = C_i / sum(C_i)
    W_i = W_i * our_weight
    print(W_i)
    for i in range(columns):
        for j in range(rows):
            Z_ij[i][j] = data[i][j] * W_i[i]
    ret = Z_ij.sum(axis=0)
    return ret


def get_benrate(series):
    # average unit price = sales amount / sales volume, guarding against a zero divisor
    BeiChuShu = series['SHOP_SALES_AMOUNT']
    ChuShu = series['SHOP_SALES_VOLUME']
    if ChuShu == 0:
        return 0
    return BeiChuShu / ChuShu


def shop(file, weight_crdict, weight_reputation, keyword, keyword2):
    # number the rows first, for later use
    file["index"] = list(range(file.shape[0]))

    # months in operation as of 2021-09: 13311 / 654400 rows (under 5%) lack SHOP_OPEN_DATE, so just drop them
    file1 = file.dropna(axis=0, how='any', subset=["SHOP_OPEN_DATE"])
    year = [(2021 - int(x[:4])) * 12 + 9 - int(x[5:7]) for x in file1["SHOP_OPEN_DATE"]]
    file1["year"] = year
    file = pd.merge(file, file1, how="outer")  # outer merge adds "year", leaving NaN for the dropped rows

    # keyword statistics on the segmented shop names
    word_cut = data_cleaning(file)
    # an earlier attempt wrote the segmentation result to 切詞結果.csv and read it back,
    # but that blew up memory, so the trust flags are computed in memory instead
    trust1 = [1 - int(set(keyword).isdisjoint(set(x))) for x in word_cut]
    trust2 = [-1 + int(set(keyword2).isdisjoint(set(x))) for x in word_cut]
    trust = [i + j for i, j in zip(trust1, trust2)]
    file["trust"] = trust

    # average unit price, also a small exercise with apply
    file['ave_price'] = file.apply(get_benrate, axis=1)

    # credit score
    file2 = file.dropna(axis=0, how='any', subset=["trust", "year"])
    Standard_data = Normalization(file2[["trust", "year"]]).T
    Credit_Score = critic(Standard_data, Standard_data.shape[1], Standard_data.shape[0], weight_crdict)
    Credit_Score = (Credit_Score - min(Credit_Score)) / (max(Credit_Score) - min(Credit_Score)) * 100
    file2["Credit_Score"] = Credit_Score
    file = pd.merge(file, file2, how="outer")

    # reputation score: 2157 / 654400 rows (under 0.5%) lack the three rating columns, so drop them as well
    file3 = file.dropna(axis=0, how='any', subset=["ITEMDESC_SCORE", "SERVICE_SCORE", "DELIVERY_SCORE"])
    Standard_data = Normalization(file3[["ITEMDESC_SCORE", "SERVICE_SCORE", "DELIVERY_SCORE"]]).T
    Reputation_Score = critic(Standard_data, Standard_data.shape[1], Standard_data.shape[0], weight_reputation)
    Reputation_Score = (Reputation_Score - min(Reputation_Score)) / (max(Reputation_Score) - min(Reputation_Score)) * 100
    file3["Reputation_Score"] = Reputation_Score
    file = pd.merge(file, file3, how="outer")

    # visualise and save
    print(file)
    file["index1"] = list(range(file.shape[0]))
    file.plot.scatter(x='Credit_Score', y='index1', s=2, c="pink")
    plt.show()
    print(file.value_counts(["Credit_Score"]))
    file.plot.scatter(x='Reputation_Score', y='index1', s=2, c="lightskyblue")
    plt.show()
    file.plot.scatter(x='Reputation_Score', y='index1', s=2, c="lightcoral")
    plt.show()
    file.plot.scatter(x='Reputation_Score', y='index1', s=2, c="mediumspringgreen")
    plt.show()
    print(file.value_counts(["Reputation_Score"]))
    file = file.drop(labels='index1', axis=1)
    file.to_csv("店鋪數據.csv", encoding="utf-8")


def main():
    file = pd.read_csv(read_file, sep="\t", encoding="utf-8")
    weight_crdict = [5, 4]  # manual weights for trust and year
    weight_reputation = [3, 1, 1]  # manual weights for item-description, service and delivery scores
    keyword = ["旗艦店", "官方", "直銷店", "直銷", "廠家直銷", "直營店"]  # names containing these words are treated as trustworthy shops
    keyword2 = ["小店", "折扣店", "特賣"]  # names containing these words are treated as untrustworthy shops
    shop(file, weight_crdict, weight_reputation, keyword, keyword2)


if __name__ == '__main__':
    main()
```

The composite-evaluation implementation used here is described in this earlier post: https://blog.csdn.net/Hjh1906008151/article/details/123433270
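To make the weighting inside `critic()` easier to follow, here is a small, self-contained sketch of the same computation on a made-up toy matrix (rows are standardized features, columns are shops). Note that classic CRITIC uses σ_i · Σ_j (1 − r_ij), whereas the code above uses σ_i · Σ_j r_ij and then multiplies in the manual weights.

```python
import numpy as np

# Toy feature matrix: 2 standardized features (e.g. trust, year) for 4 shops; values are invented.
data = np.array([[ 0.2,  0.8, -0.5,  0.9],
                 [ 0.1, -0.4,  0.7,  0.3]])
our_weight = np.array([5, 4])            # manual weights, as in weight_crdict

std = np.std(data, axis=1, ddof=1)       # sample standard deviation per feature
rela = np.corrcoef(data).sum(axis=1)     # summed correlation per feature
C_i = std * rela                         # "information content" per feature
W_i = C_i / C_i.sum() * our_weight       # normalized, then scaled by the manual weights

score = W_i @ data                       # weighted sum over features -> one raw score per shop
print(W_i, score)                        # the full script then rescales these scores to 0-100
```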
An overview of the resulting composite-score distributions:
Derived features that require missing-value imputation
For the missing-value imputation itself, see this post: https://blog.csdn.net/Hjh1906008151/article/details/124338450
It was also touched on in the data preprocessing post mentioned above, but the random-forest imputer performed poorly there, so it is not recommended for this step.
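For comparison only, a much cheaper baseline than a model-based imputer is filling the category with the column mode. This is a minimal sketch and not the method used in the linked post; `MAIN_CATEGORY` is again a placeholder column name.

```python
import pandas as pd

# Cheap baseline: fill missing shop categories with the most frequent category.
# MAIN_CATEGORY is a placeholder; the post linked above covers better-performing imputers.
shop = pd.read_csv("../Distribution testing/shop.tsv", sep="\t", encoding="utf-8")
most_common = shop["MAIN_CATEGORY"].mode().iloc[0]
shop["MAIN_CATEGORY"] = shop["MAIN_CATEGORY"].fillna(most_common)
```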
Summary