【知识发现】隐语义模型LFM算法python实现(二)
生活随笔
收集整理的這篇文章主要介紹了
【知识发现】隐语义模型LFM算法python实现(二)
小編覺得挺不錯的,現在分享給大家,幫大家做個參考.
http://blog.csdn.net/fjssharpsword/article/details/78015956
基于該篇文章中的代碼優化,主要是在生成負樣例上提高執行速度,代碼參考如下:
# -*- coding: utf-8 -*-
"""Latent Factor Model (LFM) recommender, trained by SGD on implicit feedback.

Reconstructed from a collapsed blog listing and modernized:

* ``DataFrame.ix`` (removed in pandas 1.0)      -> ``.loc``
* ``Series.iteritems()`` (removed in pandas 2.0) -> ``Series.items()``
* ``time.clock()`` (removed in Python 3.8)       -> ``time.perf_counter()``
* ``np.mat`` (deprecated)                        -> plain ``np.dot``
* chained assignment ``p[f][userID] += ...``     -> ``DataFrame.at``
* BUG FIX in ``recallAndPrecision``: ``item in <Series>`` tests the Series
  *index*, not its values, so hits were miscounted; membership is now tested
  against a set of the user's item ids.
"""
import math
import time
from math import exp

import numpy as np
import pandas as pd


class LFM:
    """Latent factor model over implicit-feedback data.

    ``traindata`` is a DataFrame with 'userid' and 'itemid' columns; each row
    is one positive user-item interaction.
    """

    def __init__(self, lclass, iters, alpha, lamda, topk, ratio, traindata):
        self.lclass = lclass        # number of latent classes (major quality knob)
        self.iters = iters          # SGD passes; best value must be tuned
        self.alpha = alpha          # gradient-descent step size
        self.lamda = lamda          # L2 regularization coefficient
        self.topk = topk            # size of the recommendation list
        self.ratio = ratio          # negative/positive sample ratio (major quality knob)
        self.traindata = traindata

    def getUserPositiveItem(self, userid):
        """Return the list of items the user interacted with (positive samples)."""
        traindata = self.traindata
        series = traindata[traindata['userid'] == userid]['itemid']
        return list(series.values)

    def getUserNegativeItem(self, userid):
        """Sample negatives: the most popular items the user did NOT interact with.

        Walking items most-popular-first makes the negatives "hard": a popular
        item the user ignored is strong evidence of disinterest.
        """
        traindata = self.traindata
        itemLen = self.itemLen      # item popularity, sorted descending (set in initModel)
        userItemlist = set(traindata[traindata['userid'] == userid]['itemid'])
        negativeItemList = []
        count = self.ratio * len(userItemlist)   # how many negatives to draw
        for key, _ in itemLen.items():           # iteritems() was removed in pandas 2.0
            if count == 0:
                break
            if key in userItemlist:
                continue
            negativeItemList.append(key)
            count -= 1
        return negativeItemList

    def initUserItem(self, userid):
        """Build {itemid: label} for one user: 1 = positive, 0 = sampled negative."""
        itemDict = {}
        for item in self.getUserPositiveItem(userid):
            itemDict[item] = 1
        for item in self.getUserNegativeItem(userid):
            itemDict[item] = 0
        return itemDict

    def initModel(self):
        """Initialize the factor matrices and the per-user training samples.

        Returns
        -------
        (p, q, userItem)
            ``p`` is a (user x class) and ``q`` a (class x item) DataFrame of
            uniform [0, 1) values, labelled with real user/item ids;
            ``userItem`` is a list of ``{userid: {itemid: 0/1}}`` dicts.
        """
        traindata = self.traindata
        lclass = self.lclass
        self.userID = userID = list(set(traindata['userid'].values))
        self.itemID = itemID = list(set(traindata['itemid'].values))
        # Per-item interaction count = popularity, kept sorted descending so
        # negative sampling can walk it most-popular-first.
        itemCount = [len(traindata[traindata['itemid'] == item]['userid'])
                     for item in itemID]
        self.itemLen = pd.Series(itemCount, index=itemID).sort_values(ascending=False)
        p = pd.DataFrame(np.random.rand(len(userID), lclass),
                         columns=range(lclass), index=userID)
        q = pd.DataFrame(np.random.rand(lclass, len(itemID)),
                         columns=itemID, index=range(lclass))
        userItem = [{userid: self.initUserItem(userid)} for userid in userID]
        return p, q, userItem

    def sigmod(self, x):
        """Logistic function: squash a raw score into (0, 1)."""
        return 1.0 / (1 + exp(-x))

    def lfmPredict(self, p, q, userID, itemID):
        """Predicted interest of ``userID`` in ``itemID``: sigmoid(p_u . q_i)."""
        pu = p.loc[userID].values       # .ix was removed in pandas 1.0
        qi = q[itemID].values
        return self.sigmod(float(np.dot(pu, qi)))

    def latenFactorModel(self):
        """Train p and q by SGD on the sampled (user, item, 0/1) labels."""
        lclass = self.lclass
        alpha = self.alpha
        lamda = self.lamda
        p, q, userItem = self.initModel()
        for _step in range(self.iters):
            for user in userItem:
                for userID, samples in user.items():
                    for itemID, rui in samples.items():
                        eui = rui - self.lfmPredict(p, q, userID, itemID)
                        for f in range(lclass):
                            # .at avoids the chained-assignment pitfalls
                            # (and SettingWithCopy warnings) of p[f][userID].
                            p.at[userID, f] += alpha * (eui * q.at[f, itemID]
                                                        - lamda * p.at[userID, f])
                            q.at[f, itemID] += alpha * (eui * p.at[userID, f]
                                                        - lamda * q.at[f, itemID])
            alpha *= 0.9    # decay the learning rate after each pass
        return p, q

    def recommend(self, userid, p, q):
        """Return the top-k items for ``userid`` as a Series (itemid -> score)."""
        itemID = self.itemID
        predictList = [self.lfmPredict(p, q, userid, itemid) for itemid in itemID]
        series = pd.Series(predictList, index=itemID)
        return series.sort_values(ascending=False)[:self.topk]

    def recallAndPrecision(self, p, q):
        """Return (recall, precision) of the top-k lists over all users."""
        traindata = self.traindata
        hit = recall = precision = 0
        for userid in self.userID:
            # BUG FIX: `item in <Series>` tests the index, not the values;
            # compare against the set of the user's interacted item ids.
            trueItem = set(traindata[traindata['userid'] == userid]['itemid'])
            preItem = list(self.recommend(userid, p, q).index)
            hit += sum(1 for item in preItem if item in trueItem)
            recall += len(trueItem)
            precision += len(preItem)
        return hit / (recall * 1.0), hit / (precision * 1.0)

    def coverage(self, p, q):
        """Fraction of catalog items that appear in at least one top-k list."""
        traindata = self.traindata
        recommend_items = set()
        all_items = set()
        for userid in self.userID:
            all_items.update(traindata[traindata['userid'] == userid]['itemid'])
            recommend_items.update(self.recommend(userid, p, q).index)
        return len(recommend_items) / (len(all_items) * 1.0)

    def popularity(self, p, q):
        """Mean log-popularity of recommended items (lower = more novel)."""
        itemLen = self.itemLen
        ret = 0
        n = 0
        for userid in self.userID:
            for item in self.recommend(userid, p, q).index:
                ret += math.log(1 + itemLen[item])
                n += 1
        return ret / (n * 1.0)


if __name__ == "__main__":
    start = time.perf_counter()     # time.clock() was removed in Python 3.8
    # df_sample = pd.read_csv("D:\\dev\\workspace\\PyRecSys\\demo\\ratings.csv",names=['userid','itemid','ratings'],header=0)
    df_sample = pd.read_csv("D:\\tmp\\ratings.csv",
                            names=['userid', 'itemid', 'ratings'], header=0)
    traindata = df_sample[['userid', 'itemid']]
    # Grid-search the two most influential hyper-parameters.
    for ratio in [1, 2, 3, 5, 10, 20]:
        for lclass in [5, 10, 20, 30, 50]:
            lfm = LFM(lclass, 2, 0.02, 0.01, 10, ratio, traindata)
            p, q = lfm.latenFactorModel()
            # Model evaluation
            print("%3s%20s%20s%20s%20s%20s"
                  % ('ratio', 'lcalss', "recall", 'precision', 'coverage', 'popularity'))
            recall, precision = lfm.recallAndPrecision(p, q)
            coverage = lfm.coverage(p, q)
            popularity = lfm.popularity(p, q)
            print("%3d%20d%19.3f%%%19.3f%%%19.3f%%%20.3f"
                  % (ratio, lclass, recall * 100, precision * 100,
                     coverage * 100, popularity))
    end = time.perf_counter()
    print('finish all in %s' % str(end - start))
1)性能受正負樣例比率、隱類數量影響最大,要訓練出一個最佳參數。
2)對于梯度下降的收斂條件,即迭代次數,限定步長為0.02,迭代次數n要訓練出一個最佳值。
3)對于增量數據的訓練:保存p、q矩陣,對于增量樣本集,可以在p、q基礎上訓練,有待實踐驗證,避免每次全量訓練耗費性能。
總結
以上是生活随笔為你收集整理的【知识发现】隐语义模型LFM算法python实现(二)的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 【正一专栏】曼城攻击力惊人露出冠军相
- 下一篇: 【知识发现】隐语义模型LFM算法pyth