Last active
August 13, 2018 17:09
-
-
Save cstrelioff/4e84d18fc13b0de8aac4 to your computer and use it in GitHub Desktop.
textmine + lda in python
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python
# -*- coding: utf-8 -*-
# vim:fenc=utf-8
#
# Copyright © 2015 Christopher C. Strelioff <chris.strelioff@gmail.com>
#
# Distributed under terms of the MIT license.
"""
An example of getting titles and vocab for lda using the textmining package.

Three short sample sentences are fed to a textmining TermDocumentMatrix; the
resulting rows are split into a vocabulary tuple and a numpy document-term
array, and everything is printed together with a hand-written titles tuple.

-- adapted from: http://www.christianpeccei.com/textmining/
"""
from __future__ import print_function

import numpy as np
import textmining

# Create some very short sample documents
doc1 = 'John and Bob are brothers.'
doc2 = 'John went to the store. The store was closed.'
doc3 = 'Bob went to the store too.'

# make a titles tuple
# -- these should be the "titles" for the "documents" above, in the same
#    order the documents are added to the matrix below
titles = ("sentence 1 -- brothers",
          "sentence 2 -- john to store",
          "sentence 3 -- bob to store")

# Initialize class to create term-document matrix
tdm = textmining.TermDocumentMatrix()

# Add the documents
tdm.add_doc(doc1)
tdm.add_doc(doc2)
tdm.add_doc(doc3)

# create a temp variable with doc-term info
# -- rows() is materialized into a list so it can be indexed and sliced
# -- NOTE(review): cutoff=1 presumably keeps every word that occurs in at
#    least one document; confirm against the textmining documentation
temp = list(tdm.rows(cutoff=1))

# get the vocab from first row
vocab = tuple(temp[0])

# get document-term matrix from remaining rows
X = np.array(temp[1:])

##
## print out info, as in blog post with a little extra info
##
## post: http://bit.ly/1bxob2E
##

# document-term matrix
print("type(X): {}".format(type(X)))
print("shape: {}".format(X.shape))
print("X:\n\n", X, "\n")

# the vocab
print("type(vocab): {}".format(type(vocab)))
print("len(vocab): {}".format(len(vocab)))
print("vocab:\n\n", vocab, "\n")

# titles for each story
print("type(titles): {}".format(type(titles)))
print("len(titles): {}".format(len(titles)))
print("titles:\n\n", titles, "\n")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment