Skip to content

Instantly share code, notes, and snippets.

@w-garcia
Created June 9, 2016 15:13
Show Gist options
  • Select an option

  • Save w-garcia/40bd7e32dbf4ee313517757562013919 to your computer and use it in GitHub Desktop.

Select an option

Save w-garcia/40bd7e32dbf4ee313517757562013919 to your computer and use it in GitHub Desktop.
import nltk
from nltk.corpus import stopwords
# Base names of the systems to process: each one is read from '<name>.txt'
# and its filtered output is written to '<name>_kw.txt'.
systems = ['cassandra', 'flume', 'hbase', 'hdfs', 'mapreduce', 'zookeeper']
def strip_redundant_info(line):
    """Return the description part of a tagged line with stop words removed.

    The kept text starts two positions past the first ']' and ends just
    before the last character that precedes the newline. It is tokenized
    with ``nltk.word_tokenize``, English stop words are dropped, and the
    remaining tokens are joined with single spaces.

    Args:
        line: One line of input text, presumably of the form
            ``"[tag] description.\\n"`` (confirm against the data files).

    Returns:
        The remaining tokens joined by spaces, terminated by a newline.
    """
    begin_key = line.find(']') + 2
    end_key = line.find('\n') - 2
    # extract auto-generated string
    useful_description = line[begin_key:end_key + 1]
    # TODO: look for parts of speech first and preserve?
    # TODO: remove punctuation
    # Build the stop-word set once per call instead of re-reading the corpus
    # list for every token; set membership is also O(1).
    english_stop_words = set(stopwords.words('english'))
    word_list = nltk.word_tokenize(useful_description)
    # Filter into a new list. Removing items from the list while iterating
    # over it skips the element after each removal, which let consecutive
    # stop words slip through.
    kept_words = [word for word in word_list if word not in english_stop_words]
    return ' '.join(kept_words) + '\n'
def process_system(system):
    """Write a filtered copy of '<system>.txt' to '<system>_kw.txt'.

    Lines that start with '[' are passed through ``strip_redundant_info``
    before being written.

    Args:
        system: Base name of the input file, without the '.txt' extension.

    Returns:
        None. If the input file cannot be opened, a message is printed and
        no output file is created.
    """
    try:
        f = open(system + '.txt', 'r')
    except IOError:
        # open() raises instead of returning None, so the missing-file case
        # has to be handled as an exception.
        print("Couldn't find " + system + ". Aborting.")
        return
    # The with-block closes both files even if processing a line fails.
    with f, open(system + '_kw.txt', 'w') as fw:
        for line in f:
            if line.startswith('['):
                line = strip_redundant_info(line)
            # NOTE(review): lines that do not start with '[' are assumed to
            # be copied through unchanged -- confirm against the original.
            fw.write(line)
    print("Processed " + system + ".")
def combine(system_names=None, output_path='my_data.txt'):
    """Concatenate the per-system '<name>_kw.txt' files into a single file.

    Args:
        system_names: Iterable of system base names to read, in order.
            Defaults to the module-level ``systems`` list.
        output_path: File that receives the combined text; it is created
            or truncated.

    Raises:
        IOError: If the output file or any '<name>_kw.txt' file cannot be
            opened.
    """
    if system_names is None:
        system_names = systems
    # The with-blocks make sure every handle is flushed and closed, including
    # when one of the input files is missing.
    with open(output_path, 'w') as combined:
        for system_name in system_names:
            with open(system_name + '_kw.txt', 'r') as keyword_file:
                combined.writelines(keyword_file)
    print("Combined files.")
def main():
    """Process every configured system, then merge the outputs into one file."""
    for name in systems:
        process_system(name)
    combine()


# Run the pipeline only when executed as a script, not on import.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment