simhash是locality sensitive hash(局部敏感哈希)的一种,最早由Moses Charikar在《similarity estimation techniques from rounding algorithms》一文中提出。Google就是基于此算法实现网页文件查重的。
simhash将一个文档转换成一个64位的simhash值,然后判断重复只需要判断它们的simhash值之间的海明距离是不是小于n,就可以判断两个文档是否相似。

#-*- coding:UTF-8 -*-
#
# 文章内容查重模块

import jieba
from simhash import Simhash, SimhashIndex

class CheckDup():
    """Near-duplicate detection for article content based on simhash.

    Text is segmented with jieba's search-engine mode; tokens of length
    <= 2 are dropped before hashing (short Chinese tokens carry little
    discriminating weight here).  Keys are caller-supplied document
    identifiers stored in a SimhashIndex.
    """

    # Hamming-distance tolerance passed to SimhashIndex: two documents
    # whose 64-bit simhash values differ in at most this many bits are
    # reported as near-duplicates.
    _TOLERANCE = 16

    def __init__(self):
        # Created lazily on the first add_index() call, because
        # SimhashIndex requires at least one (key, simhash) pair
        # at construction time.
        self.__index = None

    @staticmethod
    def _simhash(value):
        """Return the Simhash of *value* built from tokens longer than 2 chars."""
        return Simhash([tag for tag in jieba.cut_for_search(value) if len(tag) > 2])

    def add_index(self, key, value):
        """Index document text *value* under identifier *key*."""
        s = self._simhash(value)
        if self.__index is None:
            print("init check dup...")
            self.__index = SimhashIndex([(key, s)], k=self._TOLERANCE)
        else:
            self.__index.add(key, s)

    def del_index(self, key, value):
        """Remove the entry for *key*.

        *value* must be the same text that was indexed: SimhashIndex
        needs the simhash recomputed to locate the stored entry.
        No-op if nothing has been indexed yet.
        """
        if self.__index:
            self.__index.delete(key, self._simhash(value))

    def update_index(self, key, old_v, new_v):
        """Re-index *key*, replacing old text *old_v* with new text *new_v*."""
        if self.__index:
            self.__index.delete(key, self._simhash(old_v))
            self.__index.add(key, self._simhash(new_v))

    def find_near(self, value):
        """Return the keys of indexed documents near-duplicate to *value*.

        Returns an empty list when nothing has been indexed yet.
        """
        if self.__index is None:
            return []
        return self.__index.get_near_dups(self._simhash(value))