w-garcia · June 8, 2016 22:41
diff --git a/keyword_preprocess.py b/keyword_preprocess.py
 import nltk
 from nltk.corpus import stopwords


 def strip_redundant_info(line):
    """Return the description part of a bracket-prefixed line without stop words.

    The description is the slice of ``line`` that starts two characters
    after the first ``]`` and stops one character short of the character
    preceding the first newline. It is tokenized with
    ``nltk.word_tokenize`` and every token found in NLTK's English
    stop-word list is dropped.

    :param line: one raw input line, as read from the source file.
    :returns: the remaining tokens joined by single spaces, followed by a
        newline.
    """
    begin_key = line.find(']') + 2
    end_key = line.find('\n') - 2

    # extract auto-generated string
    # NOTE(review): the end offset leaves out the last character before the
    # newline -- presumably trailing punctuation; confirm against the data.
    useful_description = line[begin_key:end_key + 1]
    # TODO: look for parts of speech first and preserve?
    # TODO: remove punctuation
    # remove stop words
    # Load the stop-word list once per call instead of once per token; a set
    # gives constant-time membership tests.
    stop_words = set(stopwords.words('english'))
    # Build a new list rather than calling remove() on the list being
    # iterated: removing in place shifts the remaining items and makes the
    # loop skip the token that follows each removed stop word.
    # The comparison is exact, so tokens that differ in case from the list
    # entries are kept.
    word_list = [word for word in nltk.word_tokenize(useful_description)
                 if word not in stop_words]

    return ' '.join(word_list) + '\n'


 def process_system(system):
    """Write a keyword-only copy of ``<system>.txt`` to ``<system>_kw.txt``.

    Every line that starts with ``[`` is passed through
    ``strip_redundant_info``; all other lines are copied unchanged.

    The file names are derived from ``system`` (previously they were
    hard-coded to the cassandra files, so any other argument still processed
    cassandra). If the input file cannot be opened, a message is printed and
    the function returns without creating the output file.

    :param system: base name of the system, e.g. ``'cassandra'``.
    :returns: None.
    """
    source_path = system + '.txt'
    target_path = system + '_kw.txt'

    # open() never returns None; a missing or unreadable file raises IOError
    # (an alias of OSError on Python 3), so that is what has to be handled.
    try:
        f = open(source_path, 'r')
    except IOError:
        # A single parenthesized string prints identically on Python 2 and 3.
        print("Couldn't find " + system + ". Aborting.")
        return

    # The with-statement closes both files, flushing the output, even when
    # processing a line raises.
    with f, open(target_path, 'w') as fw:
        for line in f:
            if line.startswith('['):
                line = strip_redundant_info(line)

            fw.write(line)
    print("Processed " + system + ".")


 def main():
    """Script entry point: preprocess the 'cassandra' system."""
    target_system = 'cassandra'
    process_system(target_system)

 # Run the preprocessing only when this file is executed as a script, so
 # importing the module has no side effects.
 if __name__ == '__main__':
    main()
	import nltk
	from nltk.corpus import stopwords


	def strip_redundant_info(line):
	    """Return the description part of a bracket-prefixed line without stop words.

	    The description is the slice of ``line`` that starts two characters
	    after the first ``]`` and stops one character short of the character
	    preceding the first newline. It is tokenized with
	    ``nltk.word_tokenize`` and every token found in NLTK's English
	    stop-word list is dropped.

	    :param line: one raw input line, as read from the source file.
	    :returns: the remaining tokens joined by single spaces, followed by a
	        newline.
	    """
	    begin_key = line.find(']') + 2
	    end_key = line.find('\n') - 2

	    # extract auto-generated string
	    # NOTE(review): the end offset leaves out the last character before
	    # the newline -- presumably trailing punctuation; confirm.
	    useful_description = line[begin_key:end_key + 1]
	    # TODO: look for parts of speech first and preserve?
	    # TODO: remove punctuation
	    # remove stop words
	    # Load the stop-word list once per call; a set gives constant-time
	    # membership tests.
	    stop_words = set(stopwords.words('english'))
	    # Build a new list instead of removing from the list being iterated,
	    # which would skip the token following each removed stop word.
	    word_list = [word for word in nltk.word_tokenize(useful_description)
	                 if word not in stop_words]

	    return ' '.join(word_list) + '\n'


	def process_system(system):
	    """Write a keyword-only copy of ``<system>.txt`` to ``<system>_kw.txt``.

	    Every line that starts with ``[`` is passed through
	    ``strip_redundant_info``; all other lines are copied unchanged.

	    The file names are derived from ``system`` rather than being
	    hard-coded to the cassandra files. If the input file cannot be
	    opened, a message is printed and the function returns without
	    creating the output file.

	    :param system: base name of the system, e.g. ``'cassandra'``.
	    :returns: None.
	    """
	    source_path = system + '.txt'
	    target_path = system + '_kw.txt'

	    # open() never returns None; a missing or unreadable file raises
	    # IOError (an alias of OSError on Python 3).
	    try:
	        f = open(source_path, 'r')
	    except IOError:
	        # A single parenthesized string prints identically on Python 2 and 3.
	        print("Couldn't find " + system + ". Aborting.")
	        return

	    # The with-statement closes both files, flushing the output, even
	    # when processing a line raises.
	    with f, open(target_path, 'w') as fw:
	        for line in f:
	            if line.startswith('['):
	                line = strip_redundant_info(line)

	            fw.write(line)
	    print("Processed " + system + ".")


	def main():
	    """Script entry point: preprocess the 'cassandra' system."""
	    process_system('cassandra')

	# Run the preprocessing only when this file is executed as a script, so
	# importing the module has no side effects.
	if __name__ == '__main__':
	    main()
No results found