Event Recommendation Engine Challenge分步解析第五步
一、請知曉
本文是基于:
Event Recommendation Engine Challenge分步解析第一步
Event Recommendation Engine Challenge分步解析第二步
Event Recommendation Engine Challenge分步解析第三步
Event Recommendation Engine Challenge分步解析第四步
需要讀者先閱讀前四篇文章解析
?
二、活躍度/event熱度數據
由于用到event_attendees.csv.gz文件,我們先看看該文件
import pandas as pd
df_events_attendees = pd.read_csv('event_attendees.csv.gz', compression='gzip')
df_events_attendees.head()
代碼示例結果(該文件保存了某event出席情況信息):
1)變量解釋
nevents:train.csv和test.csv中總共的events數目,這里值為13418
self.eventPopularity:稀疏矩陣,shape為(nevents,1),保存每個event的熱度。計算方式:逐行處理上述文件,若該行的event出現在eventIndex中,則取得該event的index,將yes列按空格分割后的人數減去no列按空格分割后的人數作為該event的值;全部處理完后,再對整列做L1歸一化
?
import pandas as pd
import scipy.io as sio
eventPopularity = sio.mmread('EA_eventPopularity').todense()
pd.DataFrame(eventPopularity)
代碼示例結果:
?
第五步完整代碼:
?
from collections import defaultdict
import datetime
import gzip
import hashlib
import itertools
import locale
# cPickle was replaced by _pickle in Python 3; the old alias is kept so the
# rest of the script reads the same as the Python 2 original.
import _pickle as cPickle

import numpy as np
import pycountry
import scipy.io as sio
import scipy.sparse as ss
import scipy.spatial.distance as ssd
from sklearn.preprocessing import normalize


class ProgramEntities:
    """Collect the users and events that appear in train.csv / test.csv.

    Only users and events seen in those two files are of interest (the
    original author reports 3391 users and 13418 events in total).

    Attributes written by the constructor:
        userIndex / eventIndex: dict mapping the raw id string to a row or
            column index.
        userEventScores: sparse (users x events) matrix filled from train.csv.
        uniqueUserPairs: set of user-id pairs that share at least one event.
        uniqueEventPairs: set of event-id pairs that share at least one user.

    Side effects: writes ``PE_userEventScores`` (Matrix Market) and the
    pickles ``PE_userIndex.pkl`` / ``PE_eventIndex.pkl``.
    """

    def __init__(self):
        uniqueUsers = set()    # every user id seen in train/test
        uniqueEvents = set()   # every event id seen in train/test
        eventsForUser = defaultdict(set)   # user id  -> event ids
        usersForEvent = defaultdict(set)   # event id -> user ids

        for filename in ['train.csv', 'test.csv']:
            with open(filename) as f:
                f.readline()  # skip the header row
                for line in f:
                    cols = line.strip().split(',')
                    uniqueUsers.add(cols[0])
                    uniqueEvents.add(cols[1])
                    eventsForUser[cols[0]].add(cols[1])
                    usersForEvent[cols[1]].add(cols[0])

        self.userEventScores = ss.dok_matrix(
            (len(uniqueUsers), len(uniqueEvents)))
        self.userIndex = {u: i for i, u in enumerate(uniqueUsers)}
        self.eventIndex = {e: i for i, e in enumerate(uniqueEvents)}

        with open('train.csv') as ftrain:
            ftrain.readline()  # skip the header row
            for line in ftrain:
                cols = line.strip().split(',')
                i = self.userIndex[cols[0]]
                j = self.eventIndex[cols[1]]
                # presumably "interested" minus "not_interested" -- confirm
                # against the train.csv header
                self.userEventScores[i, j] = int(cols[4]) - int(cols[5])
        sio.mmwrite('PE_userEventScores', self.userEventScores)

        # To avoid needless similarity computations later, record only the
        # user pairs that acted on a common event and the event pairs that
        # share a common user.
        # NOTE(review): the "> 2" threshold skips groups of exactly two, even
        # though two members already form a pair -- confirm whether ">= 2"
        # was intended. Kept as-is so the generated files do not change.
        self.uniqueUserPairs = set()
        self.uniqueEventPairs = set()
        for event in uniqueEvents:
            users = usersForEvent[event]
            if len(users) > 2:
                self.uniqueUserPairs.update(itertools.combinations(users, 2))
        for user in uniqueUsers:
            events = eventsForUser[user]
            if len(events) > 2:
                self.uniqueEventPairs.update(
                    itertools.combinations(events, 2))

        with open('PE_userIndex.pkl', 'wb') as fout:
            cPickle.dump(self.userIndex, fout)
        with open('PE_eventIndex.pkl', 'wb') as fout:
            cPickle.dump(self.eventIndex, fout)


class DataCleaner:
    """Helpers that turn raw CSV string fields into numeric features."""

    def __init__(self):
        # locale alias -> 1-based id; unknown keys fall back to 0 because the
        # maps are defaultdict(int)
        self.localeIdMap = defaultdict(int)
        for i, l in enumerate(locale.locale_alias.keys()):
            self.localeIdMap[l] = i + 1

        # country name -> 1-based id
        self.countryIdMap = defaultdict(int)
        ctryIdx = defaultdict(int)
        for i, c in enumerate(pycountry.countries):
            self.countryIdMap[c.name.lower()] = i + 1
            # NOTE(review): this relies on pycountry naming the country
            # exactly "usa" / "canada"; verify the first test ever matches.
            if c.name.lower() == 'usa':
                ctryIdx['US'] = i
            if c.name.lower() == 'canada':
                ctryIdx['CA'] = i
        # map US / CA subdivisions onto the id of their parent country
        for cc in ctryIdx.keys():
            for s in pycountry.subdivisions.get(country_code=cc):
                self.countryIdMap[s.name.lower()] = ctryIdx[cc] + 1

        self.genderIdMap = defaultdict(int, {'male': 1, 'female': 2})

    def getLocaleId(self, locstr):
        """Return the id of a locale string, or 0 when it is unknown."""
        return self.localeIdMap[locstr.lower()]

    def getBirthYearInt(self, birthYear):
        """Return the birth year as int; 'None' or unparsable input gives 0."""
        try:
            return 0 if birthYear == 'None' else int(birthYear)
        except (TypeError, ValueError):
            return 0

    def getGenderId(self, genderStr):
        """Return 1 for 'male', 2 for 'female' and 0 for anything else."""
        return self.genderIdMap[genderStr]

    def getJoinedYearMonth(self, dateString):
        """Return year and month of an ISO-like timestamp concatenated as str.

        The month is not zero padded (January 2012 becomes "20121").
        Raises ValueError when ``dateString`` does not match the format.
        """
        dttm = datetime.datetime.strptime(dateString, "%Y-%m-%dT%H:%M:%S.%fZ")
        return "".join([str(dttm.year), str(dttm.month)])

    def getCountryId(self, location):
        """Return the country id for a location string, or 0 if unknown.

        The country is taken as the text after the last two-space separator.
        The previous code searched for a single space but then skipped two
        characters, which cut off the first letter of the country name.
        Assumes city and country are separated by two spaces, which is what
        the two-character offset implies -- confirm against users.csv.
        """
        if not isinstance(location, str) or len(location.strip()) == 0:
            return 0
        _, separator, country = location.rpartition('  ')
        if not separator:
            return 0
        return self.countryIdMap[country.lower()]

    def getTimezoneInt(self, timezone):
        """Return the timezone as int, or 0 when it cannot be parsed."""
        try:
            return int(timezone)
        except (TypeError, ValueError):
            return 0

    def getFeatureHash(self, value):
        """Return a 16-bit hash of ``value``, or -1 for a blank string."""
        if len(value.strip()) == 0:
            return -1
        # Python 3 requires bytes for hashing, hence the explicit encode.
        return int(hashlib.sha224(value.encode('utf-8')).hexdigest()[0:4], 16)

    def getFloatValue(self, value):
        """Return ``value`` as float, or 0.0 for a blank string."""
        if len(value.strip()) == 0:
            return 0.0
        return float(value)


class Users:
    """Build the user feature matrix and the user/user similarity matrix.

    Args:
        programEntities: object exposing ``userIndex`` and
            ``uniqueUserPairs`` (see ProgramEntities).
        sim: callable taking two row vectors and returning their distance;
            defaults to ``scipy.spatial.distance.correlation``.

    Side effects: writes ``US_userMatrix`` and ``US_userSimMatrix``.
    """

    def __init__(self, programEntities, sim=ssd.correlation):
        cleaner = DataCleaner()
        nusers = len(programEntities.userIndex.keys())

        with open('users.csv') as fin:
            colnames = fin.readline().strip().split(',')
            # one feature column per CSV column except the user id
            self.userMatrix = ss.dok_matrix((nusers, len(colnames) - 1))
            for line in fin:
                cols = line.strip().split(',')
                # userIndex holds users from both train.csv and test.csv, so
                # users from either file are kept here.
                if cols[0] in programEntities.userIndex:
                    i = programEntities.userIndex[cols[0]]
                    self.userMatrix[i, 0] = cleaner.getLocaleId(cols[1])
                    self.userMatrix[i, 1] = cleaner.getBirthYearInt(cols[2])
                    self.userMatrix[i, 2] = cleaner.getGenderId(cols[3])
                    # NOTE(review): getJoinedYearMonth returns a str;
                    # presumably the matrix coerces it to float -- verify.
                    self.userMatrix[i, 3] = cleaner.getJoinedYearMonth(cols[4])
                    self.userMatrix[i, 4] = cleaner.getCountryId(cols[5])
                    self.userMatrix[i, 5] = cleaner.getTimezoneInt(cols[6])

        # L1-normalise every feature column
        self.userMatrix = normalize(
            self.userMatrix, norm='l1', axis=0, copy=False)
        sio.mmwrite('US_userMatrix', self.userMatrix)

        # user/user similarity, used by later steps
        self.userSimMatrix = ss.dok_matrix((nusers, nusers))
        for i in range(0, nusers):
            self.userSimMatrix[i, i] = 1.0
        for u1, u2 in programEntities.uniqueUserPairs:
            i = programEntities.userIndex[u1]
            j = programEntities.userIndex[u2]
            if (i, j) not in self.userSimMatrix:
                usim = sim(self.userMatrix.getrow(i).todense(),
                           self.userMatrix.getrow(j).todense())
                self.userSimMatrix[i, j] = usim
                self.userSimMatrix[j, i] = usim
        sio.mmwrite('US_userSimMatrix', self.userSimMatrix)


class UserFriends:
    """Derive social features from user_friends.csv.gz.

    The idea is simple:
    1) a user with more friends may be more outgoing and attend more events;
    2) a user may follow friends to the events those friends attend.

    Args:
        programEntities: object exposing ``userIndex`` and
            ``userEventScores`` (see ProgramEntities).

    Attributes written:
        numFriends: per-user friend count divided by the sum of all counts.
        userFriends: sparse (users x users) matrix of friend influence
            scores, L1-normalised per column.

    Side effects: prints progress, writes ``UF_numFriends`` and
    ``UF_userFriends``.
    """

    def __init__(self, programEntities):
        nusers = len(programEntities.userIndex.keys())
        self.numFriends = np.zeros((nusers))
        self.userFriends = ss.dok_matrix((nusers, nusers))

        with gzip.open('user_friends.csv.gz') as fin:
            print('Header In User_friends.csv.gz:', fin.readline())
            ln = 0
            for line in fin:
                if ln % 200 == 0:
                    print('Loading line:', ln)
                cols = line.decode().strip().split(',')
                user = cols[0]
                # only users present in userIndex are of interest
                if user in programEntities.userIndex:
                    # split() without a separator yields [] for an empty
                    # field; split(' ') yielded [''] and counted one friend.
                    friends = cols[1].split()
                    i = programEntities.userIndex[user]
                    self.numFriends[i] = len(friends)
                    for friend in friends:
                        if friend in programEntities.userIndex:
                            j = programEntities.userIndex[friend]
                            # The score estimates how strongly, and in which
                            # direction, this friend influences the user: it
                            # is the friend's mean user/event score over all
                            # event columns.
                            eventsForUser = (
                                programEntities.userEventScores
                                .getrow(j).todense())
                            score = (eventsForUser.sum()
                                     / np.shape(eventsForUser)[1])
                            self.userFriends[i, j] += score
                            self.userFriends[j, i] += score
                ln += 1

        # turn friend counts into each user's share of all friendships
        sumNumFriends = self.numFriends.sum(axis=0)
        self.numFriends = self.numFriends / sumNumFriends
        sio.mmwrite('UF_numFriends', np.matrix(self.numFriends))
        self.userFriends = normalize(
            self.userFriends, norm='l1', axis=0, copy=False)
        sio.mmwrite('UF_userFriends', self.userFriends)


class Events:
    """Build event feature matrices and two event/event similarity matrices.

    Two similarities are computed for every pair in ``uniqueEventPairs``:
    1) ``eventPropSim`` from the event property columns (time, place);
    2) ``eventContSim`` from the 100 content columns of events.csv.gz.

    Args:
        programEntities: object exposing ``eventIndex`` and
            ``uniqueEventPairs`` (see ProgramEntities).
        psim: distance callable for property vectors.
        csim: distance callable for content vectors.

    Side effects: prints the event count, writes ``EV_eventPropMatrix``,
    ``EV_eventContMatrix``, ``EV_eventPropSim`` and ``EV_eventContSim``.
    """

    def __init__(self, programEntities, psim=ssd.correlation, csim=ssd.cosine):
        cleaner = DataCleaner()
        nevents = len(programEntities.eventIndex)
        print(nevents)
        self.eventPropMatrix = ss.dok_matrix((nevents, 7))
        self.eventContMatrix = ss.dok_matrix((nevents, 100))

        with gzip.open('events.csv.gz') as fin:
            fin.readline()  # skip header
            for line in fin:
                cols = line.decode().strip().split(',')
                eventId = cols[0]
                if eventId in programEntities.eventIndex:
                    i = programEntities.eventIndex[eventId]
                    # start_time, city, state, zip, country, lat, lon
                    self.eventPropMatrix[i, 0] = cleaner.getJoinedYearMonth(cols[2])
                    self.eventPropMatrix[i, 1] = cleaner.getFeatureHash(cols[3])
                    self.eventPropMatrix[i, 2] = cleaner.getFeatureHash(cols[4])
                    self.eventPropMatrix[i, 3] = cleaner.getFeatureHash(cols[5])
                    self.eventPropMatrix[i, 4] = cleaner.getFeatureHash(cols[6])
                    self.eventPropMatrix[i, 5] = cleaner.getFloatValue(cols[7])
                    self.eventPropMatrix[i, 6] = cleaner.getFloatValue(cols[8])
                    # columns 9..108 are copied as the 100 content features
                    # NOTE(review): values are assigned as text; presumably
                    # the matrix coerces them to float -- verify.
                    for j in range(9, 109):
                        self.eventContMatrix[i, j - 9] = cols[j]

        self.eventPropMatrix = normalize(
            self.eventPropMatrix, norm='l1', axis=0, copy=False)
        sio.mmwrite('EV_eventPropMatrix', self.eventPropMatrix)
        self.eventContMatrix = normalize(
            self.eventContMatrix, norm='l1', axis=0, copy=False)
        sio.mmwrite('EV_eventContMatrix', self.eventContMatrix)

        # calculate similarity between event pairs based on the two matrices
        self.eventPropSim = ss.dok_matrix((nevents, nevents))
        self.eventContSim = ss.dok_matrix((nevents, nevents))
        for e1, e2 in programEntities.uniqueEventPairs:
            i = programEntities.eventIndex[e1]
            j = programEntities.eventIndex[e2]
            if not ((i, j) in self.eventPropSim):
                epsim = psim(self.eventPropMatrix.getrow(i).todense(),
                             self.eventPropMatrix.getrow(j).todense())
                if np.isnan(epsim):
                    epsim = 0
                self.eventPropSim[i, j] = epsim
                self.eventPropSim[j, i] = epsim
            if not ((i, j) in self.eventContSim):
                # The distance is nan when one of the vectors is all zeros
                # (division by a zero norm), so nan is mapped to 0.
                ecsim = csim(self.eventContMatrix.getrow(i).todense(),
                             self.eventContMatrix.getrow(j).todense())
                if np.isnan(ecsim):
                    ecsim = 0
                self.eventContSim[i, j] = ecsim
                self.eventContSim[j, i] = ecsim
        sio.mmwrite('EV_eventPropSim', self.eventPropSim)
        sio.mmwrite('EV_eventContSim', self.eventContSim)


class EventAttendees:
    """Compute event popularity from event_attendees.csv.gz.

    Popularity of an event is the number of ids in its "yes" field minus the
    number of ids in its "no" field, L1-normalised over all events.

    Args:
        programEntities: object exposing ``eventIndex``.

    Side effects: writes ``EA_eventPopularity``.
    """

    def __init__(self, programEntities):
        nevents = len(programEntities.eventIndex)
        self.eventPopularity = ss.dok_matrix((nevents, 1))

        with gzip.open('event_attendees.csv.gz') as f:
            f.readline()  # skip header
            for line in f:
                cols = line.decode().strip().split(',')
                eventId = cols[0]
                if eventId in programEntities.eventIndex:
                    i = programEntities.eventIndex[eventId]
                    # yes count minus no count. split() without a separator
                    # returns [] for an empty field, so an empty list counts
                    # as 0 people; split(' ') returned [''] and counted 1.
                    self.eventPopularity[i, 0] = (
                        len(cols[1].split()) - len(cols[4].split()))

        self.eventPopularity = normalize(
            self.eventPopularity, norm='l1', axis=0, copy=False)
        sio.mmwrite('EA_eventPopularity', self.eventPopularity)


def data_prepare():
    """Run all five preparation steps and store the results on disk.

    The matrices and index pickles written here are meant to be loaded by
    the later feature-extraction and modelling steps.
    """
    print('第1步:統計user和event相關信息...')
    pe = ProgramEntities()
    print('第1步完成...\n')

    print('第2步:計算用戶相似度信息,并用矩陣形式存儲...')
    Users(pe)
    print('第2步完成...\n')

    print('第3步:計算用戶社交關系信息,并存儲...')
    UserFriends(pe)
    print('第3步完成...\n')

    print('第4步:計算event相似度信息,并用矩陣形式存儲...')
    Events(pe)
    print('第4步完成...\n')

    print('第5步:計算event熱度信息...')
    EventAttendees(pe)
    print('第5步完成...\n')


# run the data preparation when executed as a script
if __name__ == '__main__':
    data_prepare()
綜上完成數據的預處理和保存功能
下面我們來看看特征構建:Event Recommendation Engine Challenge分步解析第六步
?
轉載于:https://www.cnblogs.com/always-fight/p/10505454.html
總結
以上是生活随笔為你收集整理的Event Recommendation Engine Challenge分步解析第五步的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: Go语言交叉编译工具gox
- 下一篇: 小组会谈(2019.3.14)