Last active
August 29, 2015 13:55
-
-
Save pcx/8750818 to your computer and use it in GitHub Desktop.
Calculate most frequently occurring words (including their synonyms)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Calculate the most frequently occurring words in a text file, weighting
each top word by the corpus frequency of its WordNet synonyms."""
import sys

from nltk.corpus import stopwords, wordnet
from nltk.probability import FreqDist


def main():
    """Read the file named on the command line and print the 20 most
    frequent non-stopword tokens, each with its synonym-weighted frequency."""
    if len(sys.argv) < 2:
        print("Usage: python eflu.py path-to-file")
        sys.exit(1)

    # Context manager guarantees the handle is closed (original leaked it).
    with open(sys.argv[1]) as f:
        data = f.read()

    # A set gives O(1) stopword membership tests instead of O(n) list scans.
    stop_words = set(stopwords.words('english'))
    real_words = [word.lower() for word in data.split()
                  if word.lower() not in stop_words]

    fdist = FreqDist(real_words)
    # BUG FIX: fdist.keys() is an unordered, non-sliceable view in NLTK 3 /
    # Python 3; most_common() is the supported way to get the top-N samples.
    top_words = [word for word, _ in fdist.most_common(20)]

    weighted_top_words = dict()
    for top_word in top_words:
        # BUG FIX: the original passed Synset objects to fdist.freq(), which
        # always returns 0.0 because Synsets never occur as samples in the
        # distribution.  Use the synonyms' surface forms (lemma names).
        synonyms = {lemma.name().lower()
                    for synset in wordnet.synsets(top_word)
                    for lemma in synset.lemmas()}
        synonyms.discard(top_word)  # don't count the word itself twice
        weight = fdist.freq(top_word)
        for synonym in synonyms:
            weight += fdist.freq(synonym)
        weighted_top_words[top_word] = weight

    for k in weighted_top_words.keys():
        print(k + " - " + str(weighted_top_words.get(k)))


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment