Created
June 11, 2019 07:50
-
-
Save Siyeong-Lee/a006df6b44cc73d74a435d17d05da118 to your computer and use it in GitHub Desktop.
countingWordsinFile.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import multiprocessing | |
| import string | |
| from multiprocessing_mapreduce import SimpleMapReduce | |
| def file_to_words(filename): | |
| """Read a file and return a sequence of (word, occurances) values. | |
| """ | |
| STOP_WORDS = set([ | |
| 'a', 'an', 'and', 'are', 'as', 'be', 'by', 'for', 'if', 'in', | |
| 'is', 'it', 'of', 'or', 'py', 'rst', 'that', 'the', 'to', 'with', | |
| ]) | |
| TR = string.maketrans(string.punctuation, ' ' * len(string.punctuation)) | |
| print multiprocessing.current_process().name, 'reading', filename | |
| output = [] | |
| with open(filename, 'rt') as f: | |
| for line in f: | |
| if line.lstrip().startswith('..'): # Skip rst comment lines | |
| continue | |
| line = line.translate(TR) # Strip punctuation | |
| for word in line.split(): | |
| word = word.lower() | |
| if word.isalpha() and word not in STOP_WORDS: | |
| output.append( (word, 1) ) | |
| return output | |
| def count_words(item): | |
| """Convert the partitioned data for a word to a | |
| tuple containing the word and the number of occurances. | |
| """ | |
| word, occurances = item | |
| return (word, sum(occurances)) | |
| if __name__ == '__main__': | |
| import operator | |
| import glob | |
| input_files = glob.glob('*.rst') | |
| mapper = SimpleMapReduce(file_to_words, count_words) | |
| word_counts = mapper(input_files) | |
| word_counts.sort(key=operator.itemgetter(1)) | |
| word_counts.reverse() | |
| print '\nTOP 20 WORDS BY FREQUENCY\n' | |
| top20 = word_counts[:20] | |
| longest = max(len(word) for word, count in top20) | |
| for word, count in top20: | |
| print '%-*s: %5s' % (longest+1, word, count) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment