simhash
聽聞SimHash很強,對海量文檔相似度的計算有很高的效率。查了查文檔,大致的流程如下:
大致流程就是:分詞, 配合詞頻計算哈希串(每個分出來的詞最終會計算出同樣的長度), 降維,計算海明距離。
# coding: utf8
"""SimHash fingerprints for estimating text similarity.

Pipeline: segment the text, hash every keyword to a fixed-width bit string,
combine the weighted bit vectors column by column, and compare two
fingerprints by Hamming distance.
"""
import math

try:
    import jieba
    import jieba.analyse
except ImportError:  # only SimHash.simHash needs jieba; the helpers do not
    jieba = None


class SimHash(object):
    """Compute fixed-width SimHash fingerprints and compare them."""

    # Width of every keyword hash and of the final fingerprint.
    HASH_BITS = 64

    def __init__(self):
        pass

    def getBinStr(self, source):
        """Hash *source* to a ``HASH_BITS``-character string of '0'/'1'.

        Args:
            source: Keyword to hash.

        Returns:
            The low ``HASH_BITS`` bits of the hash as a zero-padded binary
            string. An empty *source* maps to all zeros.
        """
        if source == "":
            # Keep the return type uniform: callers iterate over the bits.
            return "0" * self.HASH_BITS
        m = 1000003
        mask = 2 ** 128 - 1
        x = ord(source[0]) << 7
        for c in source:
            x = ((x * m) ^ ord(c)) & mask
        x ^= len(source)
        low_bits = x & (2 ** self.HASH_BITS - 1)
        return format(low_bits, "b").zfill(self.HASH_BITS)

    def getWeight(self, source):
        """Return a placeholder weight: the sum of *source*'s code points.

        For a single character this equals ``ord(source)``; longer keywords
        no longer raise ``TypeError``.
        """
        return sum(ord(c) for c in source)

    def unwrap_weight(self, arr):
        """Map each item of *arr* to '1' if ``int(item) > 0`` else '0'.

        Returns:
            The resulting characters concatenated into one string.
        """
        return "".join("1" if int(item) > 0 else "0" for item in arr)

    def simHash(self, rawstr):
        """Return the SimHash fingerprint of *rawstr* as a bit string.

        The text is segmented with jieba in full mode, the tokens are
        re-joined with '|', and up to 100 weighted keywords are extracted
        and folded into a single fingerprint.

        Args:
            rawstr: Text to fingerprint.

        Returns:
            A ``HASH_BITS``-character string of '0'/'1'. Text that yields
            no keywords (including the empty string) maps to all zeros.

        Raises:
            ImportError: If jieba is not installed.
        """
        if not rawstr:
            return "0" * self.HASH_BITS
        if jieba is None:
            raise ImportError("jieba is required for SimHash.simHash")
        seg = jieba.cut(rawstr, cut_all=True)
        keywords = jieba.analyse.extract_tags(
            "|".join(seg), topK=100, withWeight=True
        )
        return self._fingerprint(keywords)

    def _fingerprint(self, keywords):
        """Fold ``(keyword, weight)`` pairs into one fingerprint string."""
        rows = []
        for keyword, weight in keywords:
            # NOTE(review): rounding up means every weight in (0, 1] counts
            # as 1, which may flatten the keyword weighting - confirm this
            # is intended.
            magnitude = int(math.ceil(weight))
            rows.append([
                magnitude if bit == "1" else -magnitude
                for bit in self.getBinStr(keyword)
            ])
        if not rows:
            return "0" * self.HASH_BITS
        # "Dimension reduction": sum each bit position over all keywords
        # and keep only the sign of the total.
        return "".join(
            "1" if sum(column) > 0 else "0" for column in zip(*rows)
        )

    def getDistince(self, hashstr1, hashstr2):
        """Return the Hamming distance between two equal-length bit strings.

        Raises:
            ValueError: If the two strings differ in length.
        """
        if len(hashstr1) != len(hashstr2):
            raise ValueError("hash strings must have the same length")
        return sum(1 for a, b in zip(hashstr1, hashstr2) if a != b)


def main():
    """Compare the contents of a.txt and b.txt and print the verdict.

    The original write-up also tried short inline samples such as
    "100元=38萬星幣,加微信" versus "38萬星幣100元,加VX".
    """
    simhash = SimHash()
    with open("a.txt", "r", encoding="utf-8") as file:
        s1 = file.read()
    with open("b.txt", "r", encoding="utf-8") as file:
        s2 = file.read()
    hash1 = simhash.simHash(s1)
    hash2 = simhash.simHash(s2)
    distince = simhash.getDistince(hash1, hash2)
    # Fixed decision threshold: at most this many differing bits is "similar".
    value = 5
    print("海明距離:", distince, "判定距離:", value, "是否相似:", distince <= value)


if __name__ == "__main__":
    main()

# Observation from the original write-up: the fingerprint discriminates well
# on large texts, but for short texts the similarity is slightly off and the
# Hamming distance can be inaccurate.
Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/d0/d4zzr4n51m7_vj9ryfb633pc0000gn/T/jieba.cache
Loading model cost 0.764 seconds.
Prefix dict has been built succesfully.
海明距離: 1 判定距離: 5 是否相似: True

參考鏈接:
- https://blog.csdn.net/gzt940726/article/details/80460419
- https://blog.csdn.net/madujin/article/details/53152619
總結
- 上一篇: paddlehub自动抠图-人像
- 下一篇: 文章采集器-免费文章采集器