Skip to content

Instantly share code, notes, and snippets.

@wilhelmjung
Created October 13, 2014 03:23
Show Gist options
  • Select an option

  • Save wilhelmjung/efc950890b454e5f5609 to your computer and use it in GitHub Desktop.

Select an option

Save wilhelmjung/efc950890b454e5f5609 to your computer and use it in GitHub Desktop.
#string hash functions
def bkdr(str):
seed = 13131 # 31,131,13131,...
hash = 0
for ch in str:
hash = hash * seed + ord(ch) # char to int
return hash & 0xFFFFFFFF # unit32
def english_words_hash_test(wtxt):
dict = {} # hash, words
f = open(wtxt, 'r')
lines = f.readlines()
print '\nread %s lines from file "%s."'%(len(lines), wtxt)
for line in lines:
line = line.split()[0] # 1st column.
hash = bkdr(line)
if dict.has_key(hash):
dict[hash].append(line)
print 'got dup hash %s for %s'%(hash, dict[hash])
else:
dict[hash] = [line]
print '%s hashes computed.'%(len(dict))
ndup = len(lines) - len(dict)
print 'collision cnt:', ndup
print 'collision rate:', 1.0 * ndup / len(lines)
if __name__ == "__main__":
# OSWI.txt, wordsEn.txt,
english_words_hash_test('E:/OSWI.txt')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment