Created
June 8, 2016 22:41
-
-
Save w-garcia/89812f9de7e0a4f1e64f6318b3223925 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import nltk | |
| from nltk.corpus import stopwords | |
def strip_redundant_info(line):
    """Return the description portion of *line* with English stop words removed.

    The description is taken to start two characters after the first ']' in
    *line* and to end one character before the first newline; i.e. the
    character immediately preceding the newline is dropped, exactly as the
    original slicing did (presumably a trailing period -- TODO confirm
    against the input format).  If *line* contains no newline, the end of the
    string is used in its place so the final line of a file is trimmed the
    same way as every other line.

    The description is tokenized with ``nltk.word_tokenize`` and every token
    that appears in NLTK's English stop-word list is discarded.  The match is
    case-sensitive, as before.

    :param line: one line of text; callers in this file only pass lines that
        begin with '['.
    :returns: the surviving tokens joined by single spaces, followed by a
        newline.
    """
    begin_key = line.find(']') + 2
    newline_at = line.find('\n')
    if newline_at == -1:
        # str.find returned "not found"; without this guard the slice below
        # would silently chop an extra character off an unterminated line.
        newline_at = len(line)

    # extract auto-generated string
    useful_description = line[begin_key:newline_at - 1]

    # TODO: look for parts of speech first and preserve?
    # TODO: remove punctuation

    # remove stop words
    # Load the stop-word list once and use a set for O(1) membership tests,
    # instead of re-reading the corpus list for every token.
    stop_words = set(stopwords.words('english'))
    # Build a new list rather than calling list.remove() while iterating:
    # removing during iteration skips the element after each removal, so
    # consecutive stop words were left in the output.
    kept_words = [
        word
        for word in nltk.word_tokenize(useful_description)
        if word not in stop_words
    ]
    return ' '.join(kept_words) + '\n'
def process_system(system):
    """Filter the text file for *system* and write the result alongside it.

    Reads ``<system>.txt`` line by line and writes ``<system>_kw.txt``.
    Lines that start with '[' are passed through ``strip_redundant_info``
    before being written; all other lines are copied unchanged.  For the
    existing caller (``system == 'cassandra'``) these are the same
    ``cassandra.txt`` / ``cassandra_kw.txt`` paths that were previously
    hard-coded.

    :param system: base name of the input file, without the ``.txt`` suffix.
    :returns: None.  If the input file cannot be opened, a message is printed
        and the function returns without creating the output file.
    """
    source_path = system + '.txt'
    target_path = system + '_kw.txt'

    # open() raises on failure rather than returning None, so the missing
    # file case has to be handled as an exception.
    try:
        source = open(source_path, 'r')
    except IOError:
        print("Couldn't find " + system + ". Aborting.")
        return

    # Context managers guarantee both files are closed (and the output is
    # flushed) even if processing a line raises.
    with source, open(target_path, 'w') as target:
        for line in source:
            if line.startswith('['):
                line = strip_redundant_info(line)
            target.write(line)

    print("Processed " + system + ".")
def main():
    """Script entry point: run the keyword filter for the 'cassandra' system."""
    target_system = 'cassandra'
    process_system(target_system)
# Only run the pipeline when executed as a script, not when imported.
if "__main__" == __name__:
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment