@cstrelioff
Last active August 13, 2018 17:09
Revisions

  1. cstrelioff revised this gist Jun 22, 2015. 1 changed file with 6 additions and 9 deletions.
    15 changes: 6 additions & 9 deletions README.md
    @@ -4,9 +4,9 @@ In a blog post I wrote about the python package
    [lda](https://github.com/ariddell/lda), see
    [here](http://chrisstrelioff.ws/sandbox/2014/11/13/getting_started_with_latent_dirichlet_allocation_in_python.html),
    I used the pre-processed data (included with the lda package) for the example.
-I have since eceived many questions regarding the document-term matrix, the
+I have since received many questions regarding the document-term matrix, the
 titles, and the vocabulary-- where do they come from? This gist will use the
-*textmining* package to help answer these questions.
+*textmining* package to (hopefully) help answer these types of questions.

    ## Install textmining package

    @@ -24,7 +24,7 @@ $ pip install textmining
The script can be run from the command line with the usual command:

    ```bash
-$ python lda_
+$ python lda_textmine_ex.py
    ```

    The output should look like:
    @@ -74,10 +74,7 @@ titles:
    ('Brothers.', 'John to the store.', 'Bob to the store.')
    ```

-Hopefully this gives a sense of how a set of documents relates to the
-document-term matrix X, the vocab, and the titls mentioned in the original
-post.
+Hopefully this gives a sense of how a set of documents (a corpus) relates to
+the *document-term matrix*, the *vocabulary*, and the *titles* mentioned in the
+original post.

  2. cstrelioff revised this gist Jun 22, 2015. 1 changed file with 83 additions and 0 deletions.
    83 changes: 83 additions & 0 deletions README.md
    @@ -0,0 +1,83 @@
    # process corpus for lda

    In a blog post I wrote about the python package
    [lda](https://github.com/ariddell/lda), see
    [here](http://chrisstrelioff.ws/sandbox/2014/11/13/getting_started_with_latent_dirichlet_allocation_in_python.html),
    I used the pre-processed data (included with the lda package) for the example.
    I have since eceived many questions regarding the document-term matrix, the
    titles, and the vocabulary-- where do they come from? This gist will use the
    *textmining* package to help answer these questions.

    ## Install textmining package

    To install *textmining* use pip (create a virtual environment first, if you'd
    like):

    ```bash
    $ pip install textmining
    ```

    ## Usage

    ### Run script from command line

The script can be run from the command line with the usual command:

    ```bash
    $ python lda_
    ```

    The output should look like:

    ```
**These are the 'documents', making up our 'corpus':
document 1: John and Bob are brothers.
document 2: John went to the store. The store was closed.
document 3: Bob went to the store too.
-- In real applications, these 'documents' might be read from files, websites, etc.

**These are the 'document titles':
title 1: Brothers.
title 2: John to the store.
title 3: Bob to the store.
-- In real applications, these 'titles' might be the file name, the story title, webpage title, etc.

** The textmining package is one tool for creating the 'document-term' matrix, 'vocabulary', etc.
   You can write your own, if needed.

** Output produced by the textmining package...
* The 'document-term' matrix
type(X): <type 'numpy.ndarray'>
shape: (3, 12)
X:
[[1 0 1 0 1 0 1 1 0 0 0 0]
 [0 2 0 1 0 1 0 1 1 1 2 0]
 [0 1 0 1 0 0 1 0 0 1 1 1]]
-- Notice there are 3 rows, for 3 'documents' and
   12 columns, for 12 'vocabulary' words
-- The number of rows and columns depends on the number of documents
   and number of unique words in -all- documents

* The 'vocabulary':
type(vocab): <type 'tuple'>
len(vocab): 12
vocab:
('and', 'the', 'brothers', 'to', 'are', 'closed', 'bob', 'john', 'was', 'went', 'store', 'too')
-- These are the 12 words in the vocabulary
-- Often common 'stop' words, like 'and', 'the', 'to', etc are
   filtered out -before- creating the document-term matrix and vocab

* Again, the 'titles' for this 'corpus':
type(titles): <type 'tuple'>
len(titles): 3
titles:
('Brothers.', 'John to the store.', 'Bob to the store.')
    ```

    Hopefully this gives a sense of how a set of documents relates to the
    document-term matrix X, the vocab, and the titls mentioned in the original
    post.
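To make that relationship concrete, here is a minimal sketch (my own illustration, not part of the original gist or the textmining package's actual implementation) of how a document-term matrix and vocabulary could be built with only the standard library:

```python
# Minimal document-term matrix builder -- an illustration only,
# not how the textmining package does it internally.
import re


def doc_term_matrix(docs):
    """Return (vocab, X) where X[i][j] counts vocab[j] in docs[i]."""
    tokenized = [re.findall(r"[a-z]+", doc.lower()) for doc in docs]
    vocab = sorted({word for tokens in tokenized for word in tokens})
    index = {word: j for j, word in enumerate(vocab)}
    X = [[0] * len(vocab) for _ in tokenized]
    for i, tokens in enumerate(tokenized):
        for word in tokens:
            X[i][index[word]] += 1
    return tuple(vocab), X


docs = ("John and Bob are brothers.",
        "John went to the store. The store was closed.",
        "Bob went to the store too.")
vocab, X = doc_term_matrix(docs)
print(len(vocab))         # 12 unique words, as in the output above
print(len(X), len(X[0]))  # 3 documents x 12 vocabulary words
```

Note this sketch sorts the vocabulary, so the column ordering differs from the textmining output above; for lda only the counts and the vocab-to-column mapping matter, not the order.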



  3. cstrelioff revised this gist Jun 22, 2015. 1 changed file with 34 additions and 6 deletions.
    40 changes: 34 additions & 6 deletions lda_textmine_ex.py
    @@ -22,13 +22,30 @@
 doc2 = 'John went to the store. The store was closed.'
 doc3 = 'Bob went to the store too.'

+print("\n**These are the 'documents', making up our 'corpus':")
+for n, doc in enumerate([doc1, doc2, doc3]):
+    print("document {}: {}".format(n+1, doc))
+
+print("-- In real applications, these 'documents' "
+      "might be read from files, websites, etc.")
+
 # make a titles tuple
 # -- these should be the "titles" for the "documents" above
-titles = ("sentence 1 -- brothers",
-          "sentence 2 -- john to store",
-          "sentence 3 -- bob to store")
+titles = ("Brothers.",
+          "John to the store.",
+          "Bob to the store.")
+
+print("\n**These are the 'document titles':")
+for n, title in enumerate(titles):
+    print("title {}: {}".format(n+1, title))
+
+print("-- In real applications, these 'titles' might "
+      "be the file name, the story title, webpage title, etc.")
+
 # Initialize class to create term-document matrix
+print("\n** The textmining package is one tool for creating the "
+      "'document-term' matrix, 'vocabulary', etc."
+      "\n   You can write your own, if needed.")
 tdm = textmining.TermDocumentMatrix()

    # Add the documents
    @@ -50,18 +67,29 @@
 ##
 ## post: http://bit.ly/1bxob2E
 ##
+print("\n** Output produced by the textmining package...")

 # document-term matrix
+print("* The 'document-term' matrix")
 print("type(X): {}".format(type(X)))
 print("shape: {}".format(X.shape))
-print("X:\n\n", X, "\n")
+print("X:", X, sep="\n")
+print("-- Notice there are 3 rows, for 3 'documents' and\n"
+      "   12 columns, for 12 'vocabulary' words\n"
+      "-- The number of rows and columns depends on the number of documents\n"
+      "   and number of unique words in -all- documents")

 # the vocab
+print("\n* The 'vocabulary':")
 print("type(vocab): {}".format(type(vocab)))
 print("len(vocab): {}".format(len(vocab)))
-print("vocab:\n\n", vocab, "\n")
+print("vocab:", vocab, sep="\n")
+print("-- These are the 12 words in the vocabulary\n"
+      "-- Often common 'stop' words, like 'and', 'the', 'to', etc are\n"
+      "   filtered out -before- creating the document-term matrix and vocab")

 # titles for each story
+print("\n* Again, the 'titles' for this 'corpus':")
 print("type(titles): {}".format(type(titles)))
 print("len(titles): {}".format(len(titles)))
-print("titles:\n\n", titles, "\n")
+print("titles:", titles, sep="\n", end="\n\n")
  4. cstrelioff created this gist Apr 27, 2015.
    67 changes: 67 additions & 0 deletions lda_textmine_ex.py
    @@ -0,0 +1,67 @@
    #! /usr/bin/env python
    # -*- coding: utf-8 -*-
    # vim:fenc=utf-8
    #
    # Copyright © 2015 Christopher C. Strelioff <chris.strelioff@gmail.com>
    #
    # Distributed under terms of the MIT license.

    """
An example of getting titles and vocab for lda using the textmining package.
    -- adapted from: http://www.christianpeccei.com/textmining/
    """
    from __future__ import print_function

    import numpy as np
    import textmining

    # Create some very short sample documents
    doc1 = 'John and Bob are brothers.'
    doc2 = 'John went to the store. The store was closed.'
    doc3 = 'Bob went to the store too.'

    # make a titles tuple
    # -- these should be the "titles" for the "documents" above
titles = ("sentence 1 -- brothers",
          "sentence 2 -- john to store",
          "sentence 3 -- bob to store")

    # Initialize class to create term-document matrix
    tdm = textmining.TermDocumentMatrix()

    # Add the documents
    tdm.add_doc(doc1)
    tdm.add_doc(doc2)
    tdm.add_doc(doc3)

    # create a temp variable with doc-term info
    temp = list(tdm.rows(cutoff=1))

    # get the vocab from first row
    vocab = tuple(temp[0])

    # get document-term matrix from remaining rows
    X = np.array(temp[1:])

    ##
    ## print out info, as in blog post with a little extra info
    ##
    ## post: http://bit.ly/1bxob2E
    ##

    # document-term matrix
    print("type(X): {}".format(type(X)))
    print("shape: {}".format(X.shape))
    print("X:\n\n", X , "\n")

    # the vocab
    print("type(vocab): {}".format(type(vocab)))
    print("len(vocab): {}".format(len(vocab)))
    print("vocab:\n\n", vocab, "\n")

    # titles for each story
    print("type(titles): {}".format(type(titles)))
    print("len(titles): {}".format(len(titles)))
    print("titles:\n\n", titles , "\n")
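The README output notes that common 'stop' words are often filtered out before the document-term matrix is built. A rough sketch of that preprocessing step (my own illustration with a deliberately tiny stop-word list; the textmining package has its own stop-word handling, and real applications use much fuller lists) might look like:

```python
# Stop-word filtering before building the document-term matrix --
# an illustration only, with a tiny hand-picked stop-word list.
import re

STOP_WORDS = {"and", "the", "to", "are", "was", "too"}


def filter_stop_words(doc):
    """Lowercase and tokenize a document, dropping stop words."""
    tokens = re.findall(r"[a-z]+", doc.lower())
    return [t for t in tokens if t not in STOP_WORDS]


for doc in ("John and Bob are brothers.",
            "John went to the store. The store was closed.",
            "Bob went to the store too."):
    print(filter_stop_words(doc))
```

With these six stop words removed, the vocabulary for the three example documents shrinks from 12 words to 6 (bob, brothers, closed, john, store, went), which is why filtering is usually done before, not after, building the matrix.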