Created
June 9, 2016 15:13
-
-
Save w-garcia/40bd7e32dbf4ee313517757562013919 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import nltk | |
| from nltk.corpus import stopwords | |
# Base names of the systems to process; each one maps to a '<name>.txt'
# input file and a '<name>_kw.txt' output file.
systems = [
    'cassandra',
    'flume',
    'hbase',
    'hdfs',
    'mapreduce',
    'zookeeper',
]
def strip_redundant_info(line):
    """Reduce a bracket-prefixed line to its non-stop-word tokens.

    The kept text starts two characters after the first ``]`` (skipping the
    bracket and the character that follows it) and stops one character
    before the line terminator. That text is tokenized with
    ``nltk.word_tokenize`` and every token that appears in NLTK's English
    stop-word list is dropped. The comparison is case-sensitive.

    Args:
        line: One line of text. The caller passes lines that start with
            ``[``.

    Returns:
        The remaining tokens joined by single spaces, followed by a
        newline character.
    """
    begin_key = line.find(']') + 2
    newline_at = line.find('\n')
    if newline_at == -1:
        # No terminator (e.g. the last line of a file): measure from the end
        # of the string so the slice still drops exactly one trailing
        # character instead of two.
        newline_at = len(line)

    # extract auto-generated string
    useful_description = line[begin_key:newline_at - 1]

    # TODO: look for parts of speech first and preserve?
    # TODO: remove punctuation

    # Remove stop words. Build the lookup set once per call, and filter into
    # a new list: removing from the list being iterated skips the token that
    # follows each removal, which left consecutive stop words in the output.
    english_stopwords = set(stopwords.words('english'))
    kept_words = [
        word
        for word in nltk.word_tokenize(useful_description)
        if word not in english_stopwords
    ]

    return ' '.join(kept_words) + '\n'
def process_system(system):
    """Write ``<system>_kw.txt`` from ``<system>.txt``.

    Lines that start with ``[`` are rewritten by ``strip_redundant_info``;
    every other line is copied through unchanged. If the input file cannot
    be opened, a message is printed, no output file is created, and the
    function returns.

    Args:
        system: Base name (without extension) of the file to process.

    Returns:
        None.
    """
    try:
        source = open(system + '.txt', 'r')
    except IOError:
        # open() raises on failure rather than returning None, so the
        # failure has to be caught here for the abort message to be reached.
        print("Couldn't find " + system + ". Aborting.")
        return

    # Context managers guarantee both files are closed (and the output
    # flushed) even if processing a line raises.
    with source:
        with open(system + '_kw.txt', 'w') as sink:
            for line in source:
                if line.startswith('['):
                    line = strip_redundant_info(line)
                sink.write(line)

    print("Processed " + system + ".")
def combine(system_names=None, output_path='my_data.txt'):
    """Concatenate the per-system ``<name>_kw.txt`` files into one file.

    Files are appended in the order the names are given. The output file is
    truncated first.

    Args:
        system_names: Iterable of base names whose ``<name>_kw.txt`` files
            are merged. Defaults to the module-level ``systems`` list.
        output_path: Path of the combined file. Defaults to
            ``'my_data.txt'``.

    Returns:
        None.

    Raises:
        IOError: If the output file or any ``<name>_kw.txt`` file cannot be
            opened.
    """
    if system_names is None:
        system_names = systems

    # `with` closes every handle deterministically so the combined output is
    # fully flushed to disk before the function returns.
    with open(output_path, 'w') as combined:
        for system_name in system_names:
            with open(system_name + '_kw.txt', 'r') as part:
                for line in part:
                    combined.write(line)

    print("Combined files.")
def main():
    """Build the keyword file for each configured system, then merge them."""
    for name in systems:
        process_system(name)
    combine()


# Run the pipeline only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment