Created
October 13, 2014 03:23
-
-
Save wilhelmjung/efc950890b454e5f5609 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #string hash functions | |
| def bkdr(str): | |
| seed = 13131 # 31,131,13131,... | |
| hash = 0 | |
| for ch in str: | |
| hash = hash * seed + ord(ch) # char to int | |
| return hash & 0xFFFFFFFF # unit32 | |
| def english_words_hash_test(wtxt): | |
| dict = {} # hash, words | |
| f = open(wtxt, 'r') | |
| lines = f.readlines() | |
| print '\nread %s lines from file "%s."'%(len(lines), wtxt) | |
| for line in lines: | |
| line = line.split()[0] # 1st column. | |
| hash = bkdr(line) | |
| if dict.has_key(hash): | |
| dict[hash].append(line) | |
| print 'got dup hash %s for %s'%(hash, dict[hash]) | |
| else: | |
| dict[hash] = [line] | |
| print '%s hashes computed.'%(len(dict)) | |
| ndup = len(lines) - len(dict) | |
| print 'collision cnt:', ndup | |
| print 'collision rate:', 1.0 * ndup / len(lines) | |
| if __name__ == "__main__": | |
| # OSWI.txt, wordsEn.txt, | |
| english_words_hash_test('E:/OSWI.txt') | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment