# 代碼來源 # phanein/deepwalk# Random walkwithopen(f,'w')asfout:forwalkingraph.build_deepwalk_corpus_iter(G=G,# 圖num_paths=num_paths,# 路徑數(shù)path_length=path_length,# 路徑長度alpha=alpha,#rand=rand):#fout.write(u"{}\n".format(u" ".join(vforvinwalk)))classGraph(defaultdict):"""
Efficient basic implementation of nx
這里我們看到,
該類繼承defaultdict,
圖其實可以簡單的表示為dict,
key為節(jié)點,value為與之相連的節(jié)點
"""def__init__(self):super(Graph,self).__init__(list)defnodes(self):returnself.keys()defadjacency_iter(self):returnself.iteritems()defsubgraph(self,nodes={}):# 提取子圖subgraph=Graph()forninnodes:ifninself:subgraph[n]=[xforxinself[n]ifxinnodes]returnsubgraphdefmake_undirected(self):#因為是無向圖,所以v in self[u]并且 u in self[v]t0=time()forvinlist(self):forotherinself[v]:ifv!=other:self[other].append(v)t1=time()logger.info('make_directed: added missing edges {}s'.format(t1-t0))self.make_consistent()returnselfdefmake_consistent(self):# 去重t0=time()forkiniterkeys(self):self[k]=list(sorted(set(self[k])))t1=time()logger.info('make_consistent: made consistent in {}s'.format(t1-t0))self.remove_self_loops()returnselfdefremove_self_loops(self):# 自已不會與自己有連接removed=0t0=time()forxinself:ifxinself[x]:self[x].remove(x)removed+=1t1=time()logger.info('remove_self_loops: removed {} loops in {}s'.format(removed,(t1-t0)))returnselfdefcheck_self_loops(self):forxinself:foryinself[x]:ifx==y:returnTruereturnFalsedefhas_edge(self,v1,v2):# 兩個節(jié)點是否有邊ifv2inself[v1]orv1inself[v2]:returnTruereturnFalsedefdegree(self,nodes=None):# 節(jié)點的度數(shù)ifisinstance(nodes,Iterable):return{v:len(self[v])forvinnodes}else:returnlen(self[nodes])deforder(self):"Returns the number of nodes in the graph"returnlen(self)defnumber_of_edges(self):# 圖中邊的數(shù)量"Returns the number of nodes in the graph"returnsum([self.degree(x)forxinself.keys()])/2defnumber_of_nodes(self):# 圖中結(jié)點數(shù)量"Returns the number of nodes in the graph"returnself.order()# 核心代碼defrandom_walk(self,path_length,alpha=0,rand=random.Random(),start=None):""" Returns a truncated random walk.
path_length: Length of the random walk.
alpha: probability of restarts.
start: the start node of the random walk.
"""G=selfifstart:path=[start]else:# Sampling is uniform w.r.t V, and not w.r.t Epath=[rand.choice(list(G.keys()))]whilelen(path)<path_length:cur=path[-1]iflen(G[cur])>0:ifrand.random()>=alpha:path.append(rand.choice(G[cur]))# 相鄰節(jié)點隨機選else:path.append(path[0])# 有一定概率選擇回到起點else:breakreturn[str(node)fornodeinpath]# TODO add build_walks in heredefbuild_deepwalk_corpus(G,num_paths,path_length,alpha=0,rand=random.Random(0)):walks=[]nodes=list(G.nodes())# 這里和上面論文算法流程對應(yīng)forcntinrange(num_paths):# 外循環(huán),相當于要迭代多少epochrand.shuffle(nodes)# 打亂nodes順序,加速收斂fornodeinnodes:# 每個node都會產(chǎn)生一條路徑walks.append(G.random_walk(path_length,rand=rand,alpha=alpha,start=node))returnwalksdefbuild_deepwalk_corpus_iter(G,num_paths,path_length,alpha=0,rand=random.Random(0)):# 流式處理用walks=[]nodes=list(G.nodes())forcntinrange(num_paths):rand.shuffle(nodes)fornodeinnodes:yieldG.random_walk(path_length,rand=rand,alpha=alpha,start=node)defclique(size):returnfrom_adjlist(permutations(range(1,size+1)))# How do you split a list into evenly sized chunks?defgrouper(n,iterable,padvalue=None):"grouper(3, 'abcdefg', 'x') --> ('a','b','c'), ('d','e','f'), ('g','x','x')"returnzip_longest(*[iter(iterable)]*n,fillvalue=padvalue)defparse_adjacencylist(f):adjlist=[]forlinf:iflandl[0]!="#":introw=[int(x)forxinl.strip().split()]row=[introw[0]]row.extend(set(sorted(introw[1:])))adjlist.extend([row])returnadjlistdefparse_adjacencylist_unchecked(f):adjlist=[]forlinf:iflandl[0]!="#":adjlist.extend([[int(x)forxinl.strip().split()]])returnadjlistdefload_adjacencylist(file_,undirected=False,chunksize=10000,unchecked=True):ifunchecked:parse_func=parse_adjacencylist_uncheckedconvert_func=from_adjlist_uncheckedelse:parse_func=parse_adjacencylistconvert_func=from_adjlistadjlist=[]t0=time()total=0withopen(file_)asf:foridx,adj_chunkinenumerate(map(parse_func,grouper(int(chunksize),f))):adjlist.extend(adj_chunk)total+=len(adj_chunk)t1=time()logger.info('Parsed {} edges with {} chunks in {}s'.format(total,idx,t1-t0))t0=time()G=convert_func(adjlist)t1=time()logger.info('Converted edges to graph in {}s'.format(t1-t0))ifundirected:t0=time()G=G.make_undirected()t1=time()logger.info('Made graph undirected in {}s'.format(t1-t0))returnGdefload_edgelist(file_,undirected=True):G=Graph()withopen(file_)asf:forlinf:x,y=l.strip().split()[:2]x=int(x)y=int(y)G[x].append(y)ifundirected:G[y].append(x)G.make_consistent()returnGdefload_matfile(file_,variable_name="network",undirected=True):mat_varables=loadmat(file_)mat_matrix=mat_varables[variable_name]returnfrom_numpy(mat_matrix,undirected)deffrom_networkx(G_input,undirected=True):G=Graph()foridx,xinenumerate(G_input.nodes()):foryiniterkeys(G_input[x]):G[x].append(y)ifundirected:G.make_undirected()returnGdeffrom_numpy(x,undirected=True):G=Graph()ifissparse(x):cx=x.tocoo()fori,j,vinzip(cx.row,cx.col,cx.data):G[i].append(j)else:raiseException("Dense matrices not yet supported.")ifundirected:G.make_undirected()G.make_consistent()returnGdeffrom_adjlist(adjlist):G=Graph()forrowinadjlist:node=row[0]neighbors=row[1:]G[node]=list(sorted(set(neighbors)))returnGdeffrom_adjlist_unchecked(adjlist):G=Graph()forrowinadjlist:node=row[0]neighbors=row[1:]G[node]=neighborsreturnG
至于skipgram,大家可以直接用gensim工具即可.
from gensim.models import Word2Vec
from gensim.models.word2vec import Vocablogger = logging.getLogger("deepwalk")class Skipgram(Word2Vec):"""A subclass to allow more customization of the Word2Vec internals."""def __init__(self, vocabulary_counts=None, **kwargs):self.vocabulary_counts = Nonekwargs["min_count"] = kwargs.get("min_count", 0)kwargs["workers"] = kwargs.get("workers", cpu_count())kwargs["size"] = kwargs.get("size", 128)kwargs["sentences"] = kwargs.get("sentences", None)kwargs["window"] = kwargs.get("window", 10)kwargs["sg"] = 1kwargs["hs"] = 1if vocabulary_counts != None:self.vocabulary_counts = vocabulary_countssuper(Skipgram, self).__init__(**kwargs)