Skip to content

Instantly share code, notes, and snippets.

@micvbang
Last active September 5, 2016 19:48
Show Gist options
  • Select an option

  • Save micvbang/806fb2fd93a21aaa67d90046dfdc7c3b to your computer and use it in GitHub Desktop.

Select an option

Save micvbang/806fb2fd93a21aaa67d90046dfdc7c3b to your computer and use it in GitHub Desktop.
#!/usr/bin/env python2.7
import os
import codecs
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.snowball import DanishStemmer
def here(*x):
    """Return a path built by joining *x* onto this file's directory.

    The directory is taken from the real (symlink-resolved) path of
    ``__file__``, so the result does not depend on the current working
    directory.
    """
    script_dir = os.path.dirname(os.path.realpath(__file__))
    return os.path.join(script_dir, *x)
def readlines(path, removenewline=True, codec='utf8'):
    """Lazily yield the decoded lines of the text file at *path*.

    Args:
        path: Location of the file to read.
        removenewline: When true (the default), ``'\\n'`` characters are
            stripped from both ends of every line. When false, lines are
            yielded exactly as read.
        codec: Name of the encoding used to decode the file.

    Returns:
        A generator of unicode strings, one per line.

    The file is opened immediately, so a missing or unreadable path raises
    at call time rather than on first iteration. The underlying stream is
    closed once the generator is exhausted or closed.
    """
    # Open eagerly: callers see I/O errors when they call readlines(),
    # not later when they start consuming the generator.
    stream = codecs.open(path, 'r', codec)

    def _lines():
        try:
            for line in stream:
                yield line.strip('\n') if removenewline else line
        finally:
            stream.close()

    return _lines()
def vectorize(data):
    """Fit a ``CountVectorizer`` on *data* and return the transformed result.

    The vectorizer is configured with ``ngram_range=(2, 3)`` and
    ``max_features=100``; the value returned is whatever
    ``fit_transform`` produces for *data*.
    """
    return CountVectorizer(
        max_features=100,
        ngram_range=(2, 3),
    ).fit_transform(data)
# Read every line of 'unclassified.txt' (located next to this script) into
# memory; the list is reused below to pair each line with its cluster.
raw = list(readlines(here('unclassified.txt')))

# Convert the lines into n-gram count features.
data = vectorize(raw)

# Cluster with scikit-learn's default KMeans settings.
kmeans = KMeans()
labels = kmeans.fit_predict(data)

# Emit one "cluster, line" row per input line, encoded as UTF-8 bytes.
for line, cluster in zip(raw, labels):
    print(u"{}, {}".format(cluster, line).encode('utf8'))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment