Skip to content

Instantly share code, notes, and snippets.

@w-garcia
Created June 8, 2016 22:41
Show Gist options
  • Select an option

  • Save w-garcia/89812f9de7e0a4f1e64f6318b3223925 to your computer and use it in GitHub Desktop.

Select an option

Save w-garcia/89812f9de7e0a4f1e64f6318b3223925 to your computer and use it in GitHub Desktop.
import nltk
from nltk.corpus import stopwords
def strip_redundant_info(line):
    """Reduce a bracket-prefixed line to its non-stop-word tokens.

    The text kept for processing starts two characters after the first
    ``]`` (skipping the bracket and the character that follows it) and
    stops one character before the line's newline, mirroring the original
    slicing. That text is tokenized with ``nltk.word_tokenize`` and every
    token found in NLTK's English stop-word list is dropped.

    Args:
        line: One line of input text; the caller only passes lines that
            begin with ``[``.

    Returns:
        The surviving tokens joined by single spaces, terminated by a
        newline.
    """
    begin_key = line.find(']') + 2
    newline_pos = line.find('\n')
    if newline_pos == -1:
        # No trailing newline (e.g. last line of a file): treat the end of
        # the string as the newline position instead of letting find()'s -1
        # silently chop an extra character off the description.
        newline_pos = len(line)
    # extract auto-generated string (the character right before the newline
    # is intentionally excluded, as in the original slice)
    useful_description = line[begin_key:max(newline_pos - 1, 0)]
    # TODO: look for parts of speech first and preserve?
    # TODO: remove punctuation
    # remove stop words
    # Load the stop-word list once per call and use a set for O(1) lookups.
    english_stopwords = set(stopwords.words('english'))
    # Build a new list rather than removing from the list being iterated;
    # in-place removal skips the token after each removed word.
    # Matching stays case-sensitive, exactly as before.
    kept_words = [
        word
        for word in nltk.word_tokenize(useful_description)
        if word not in english_stopwords
    ]
    return ' '.join(kept_words) + '\n'
def process_system(system):
    """Filter ``<system>.txt`` into ``<system>_kw.txt``.

    Each line that starts with ``[`` is passed through
    ``strip_redundant_info``; every other line is copied unchanged.
    NOTE(review): the original indentation was lost, so "write every line,
    transform only bracketed ones" is the assumed loop shape -- confirm.

    Args:
        system: Base name of the system to process, e.g. ``'cassandra'``
            reads ``cassandra.txt`` and writes ``cassandra_kw.txt``.

    Returns:
        None. Prints a status message on success, or an abort message when
        the input file cannot be opened.
    """
    # Derive both paths from the argument instead of hard-coding
    # 'cassandra'; for the existing caller the file names are unchanged.
    source_path = system + '.txt'
    target_path = system + '_kw.txt'

    # open() never returns None; a missing or unreadable file raises, so
    # the abort path has to be an exception handler.
    try:
        source = open(source_path, 'r')
    except IOError:
        print("Couldn't find " + system + ". Aborting.")
        return

    # The with-block guarantees both files are closed, even on error.
    with source, open(target_path, 'w') as sink:
        for line in source:
            if line.startswith('['):
                line = strip_redundant_info(line)
            sink.write(line)

    # Parenthesised single-argument print works on Python 2 and 3 alike.
    print("Processed " + system + ".")
def main():
    """Script entry point: run the keyword filter for the Cassandra data."""
    target_system = 'cassandra'
    process_system(target_system)


# Only run the pipeline when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment