kennyballou · August 29, 2015 14:03 · Jul 3, 2014 · Dec 7, 2010 · Dec 7, 2010 · Dec 7, 2010
diff --git a/naivebayes.py b/naivebayes.py
@@ -1,58 +1,97 @@
-#Author: Krishnamurthy Koduvayur Viswanathan
+#!/usr/bin/env python
+'''Sample Naive Bayes Classifier
+'''
 
-from __future__ import division
 import collections
 import math
+import sys
 
-class Model: 
-        def __init__(self, arffFile):
-                self.trainingFile = arffFile
-                self.features = {}      #all feature names and their possible values (including the class label)
-                self.featureNameList = []       #this is to maintain the order of features as in the arff
-                self.featureCounts = collections.defaultdict(lambda: 1)#contains tuples of the form (label, feature_name, feature_value)
-                self.featureVectors = []        #contains all the values and the label as the last entry
-                self.labelCounts = collections.defaultdict(lambda: 0)   #these will be smoothed later
-
-        def TrainClassifier(self):
-                for fv in self.featureVectors:
-                        self.labelCounts[fv[len(fv)-1]] += 1 #udpate count of the label
-                        for counter in range(0, len(fv)-1):
-                                self.featureCounts[(fv[len(fv)-1], self.featureNameList[counter], fv[counter])] += 1
-
-                for label in self.labelCounts:  #increase label counts (smoothing). remember that the last feature is actually the label
-                        for feature in self.featureNameList[:len(self.featureNameList)-1]:
-                                self.labelCounts[label] += len(self.features[feature])
-
-        def Classify(self, featureVector):      #featureVector is a simple list like the ones that we use to train
-                probabilityPerLabel = {}
-                for label in self.labelCounts:
-                        logProb = 0
-                        for featureValue in featureVector:
-                                logProb += math.log(self.featureCounts[(label, self.featureNameList[featureVector.index(featureValue)], featureValue)]/self.labelCounts[label])
-                        probabilityPerLabel[label] = (self.labelCounts[label]/sum(self.labelCounts.values())) * math.exp(logProb)
-                print probabilityPerLabel
-                return max(probabilityPerLabel, key = lambda classLabel: probabilityPerLabel[classLabel])
-
-        def GetValues(self):
-                file = open(self.trainingFile, 'r')
-                for line in file:
-                        if line[0] != '@':  #start of actual data
-                                self.featureVectors.append(line.strip().lower().split(','))
-                        else:   #feature definitions
-                                if line.strip().lower().find('@data') == -1 and (not line.lower().startswith('@relation')):
-                                        self.featureNameList.append(line.strip().split()[1])
-                                        self.features[self.featureNameList[len(self.featureNameList) - 1]] = line[line.find('{')+1: line.find('}')].strip().split(',')
-                file.close()
-
-        def TestClassifier(self, arffFile):
-                file = open(arffFile, 'r')
-                for line in file:
-                        if line[0] != '@':
-                                vector = line.strip().lower().split(',')
-                                print "classifier: " + self.Classify(vector) + " given " + vector[len(vector) - 1]                                
-
-if __name__ == "__main__":
-        model = Model("/home/tennis.arff")
-        model.GetValues()
-        model.TrainClassifier()
-        model.TestClassifier("/home/tennis.arff")
+__author__ = 'Krishnamurthy Koduvayur Viswanathan'
+__credits__ = ['Kenny Ballou',]
+
+class Model(object):
+    '''Simple classifier model'''
+    def __init__(self, arff_file):
+        self.training_file = arff_file
+        # all feature names and possible values
+        self.features = {}
+        # feature names, kept in the order they are declared in the arff
+        self.feature_name_list = []
+        # keys are tuples of the form (label, feature_name, feature_value)
+        self.feature_counts = collections.defaultdict(lambda: 1)
+        # rows of feature values, with the label as the last entry
+        self.feature_vectors = []
+        # smoothing will occur later
+        self.label_counts = collections.defaultdict(lambda: 0)
+
+    def get_values(self):
+        '''Parse training file and build model'''
+        with open(self.training_file, 'r') as training_file:
+            for line in training_file:
+                line = line.strip().lower()
+                # start of actual data
+                if line[0] != '@':
+                    self.feature_vectors.append(line.split(','))
+                # feature definitions
+                elif ('@data' not in line and
+                      (not line.startswith('@relation'))):
+                    self.feature_name_list.append(line.strip().split()[1])
+                    feature = line[line.index('{') + 1:
+                                   line.index('}')].strip().split(',')
+                    self.features[self.feature_name_list[-1]] = feature
+
+    def train_classifier(self):
+        '''Train the model'''
+        for feature_vector in self.feature_vectors:
+            # update count for label
+            self.label_counts[feature_vector[-1]] += 1
+            for counter in range(0, len(feature_vector)-1):
+                self.feature_counts[
+                    (feature_vector[-1],
+                     self.feature_name_list[counter],
+                     feature_vector[counter])] += 1
+
+        # increase label counts (smoothing). Recall, last element is the label
+        for label in self.label_counts:
+            for feature in self.feature_name_list[:-1]:
+                self.label_counts[label] += len(self.features[feature])
+
+    def classify(self, feature_vector):
+        '''Classify features given by feature_vector
+
+        :param feature_vector: simple list similar to ones given for training
+        '''
+        prob_per_label = {}
+        for label in self.label_counts:
+            log_prob = 0
+            for feature_value in feature_vector:
+                feature_name = self.feature_name_list[
+                    feature_vector.index(feature_value)]
+                log_prob += math.log(
+                    self.feature_counts[(label, feature_name, feature_value)] /
+                    self.label_counts[label])
+            prob_per_label[label] = ((self.label_counts[label] /
+                                      sum(self.label_counts.values())) *
+                                     math.exp(log_prob))
+        print(prob_per_label)
+        return max(prob_per_label, key=lambda c: prob_per_label[c])
+
+    def test_classifier(self, arff_file):
+        '''Test our model'''
+        with open(arff_file, 'r') as arff:
+            for line in arff:
+                if line[0] != '@':
+                    vector = line.strip().lower().split(',')
+                    print("classifier: %s given %s" % (
+                        self.classify(vector), vector[-1]))
+
+def main(arff_file):
+    '''main'''
+    model = Model(arff_file)
+    model.get_values()
+    model.train_classifier()
+    model.test_classifier(arff_file)
+
+if __name__ == '__main__':
+    assert len(sys.argv[1:]) > 0
+    main(sys.argv[1])
diff --git a/naivebayes.py b/naivebayes.py
@@ -1,10 +1,11 @@
+#Author: Krishnamurthy Koduvayur Viswanathan
+
 from __future__ import division
 import collections
 import math
 
 class Model: 
         def __init__(self, arffFile):
-                print "hello"
                 self.trainingFile = arffFile
                 self.features = {}      #all feature names and their possible values (including the class label)
                 self.featureNameList = []       #this is to maintain the order of features as in the arff

diff --git a/naivebayes.py b/naivebayes.py
@@ -18,7 +18,7 @@ def TrainClassifier(self):
                         for counter in range(0, len(fv)-1):
                                 self.featureCounts[(fv[len(fv)-1], self.featureNameList[counter], fv[counter])] += 1
 
-                for label in self.labelCounts:  #increment the counts for each label. remember that the last feature is actually the label
+                for label in self.labelCounts:  #increase label counts (smoothing). remember that the last feature is actually the label
                         for feature in self.featureNameList[:len(self.featureNameList)-1]:
                                 self.labelCounts[label] += len(self.features[feature])
 
@@ -28,7 +28,7 @@ def Classify(self, featureVector):      #featureVector is a simple list like the
                         logProb = 0
                         for featureValue in featureVector:
                                 logProb += math.log(self.featureCounts[(label, self.featureNameList[featureVector.index(featureValue)], featureValue)]/self.labelCounts[label])
-                        probabilityPerLabel[label] = math.exp(logProb)
+                        probabilityPerLabel[label] = (self.labelCounts[label]/sum(self.labelCounts.values())) * math.exp(logProb)
                 print probabilityPerLabel
                 return max(probabilityPerLabel, key = lambda classLabel: probabilityPerLabel[classLabel])
 
@@ -51,7 +51,7 @@ def TestClassifier(self, arffFile):
                                 print "classifier: " + self.Classify(vector) + " given " + vector[len(vector) - 1]                                
 
 if __name__ == "__main__":
-        model = Model("/home/wirelive/Dropbox/code/NaiveBayesPython/tennis.arff")
+        model = Model("/home/tennis.arff")
         model.GetValues()
         model.TrainClassifier()
-        model.TestClassifier("/home/wirelive/Dropbox/code/NaiveBayesPython/tennis.arff")
+        model.TestClassifier("/home/tennis.arff")
diff --git a/naivebayes.py b/naivebayes.py
@@ -6,9 +6,9 @@ class Model:
         def __init__(self, arffFile):
                 print "hello"
                 self.trainingFile = arffFile
-                self.features = {} #feature names and vals (including class labels)
-                self.featureNameList = []  #maintain the order of features as in the arff
-                self.featureCounts = collections.defaultdict(lambda: 1)#(label, feature_name, feature_value)
+                self.features = {}      #all feature names and their possible values (including the class label)
+                self.featureNameList = []       #this is to maintain the order of features as in the arff
+                self.featureCounts = collections.defaultdict(lambda: 1)#contains tuples of the form (label, feature_name, feature_value)
                 self.featureVectors = []        #contains all the values and the label as the last entry
                 self.labelCounts = collections.defaultdict(lambda: 0)   #these will be smoothed later
 
@@ -18,11 +18,11 @@ def TrainClassifier(self):
                         for counter in range(0, len(fv)-1):
                                 self.featureCounts[(fv[len(fv)-1], self.featureNameList[counter], fv[counter])] += 1
 
-                for label in self.labelCounts:  #last feature is actually the label
+                for label in self.labelCounts:  #increment the counts for each label. remember that the last feature is actually the label
                         for feature in self.featureNameList[:len(self.featureNameList)-1]:
                                 self.labelCounts[label] += len(self.features[feature])
 
-        def Classify(self, featureVector):      
+        def Classify(self, featureVector):      #featureVector is a simple list like the ones that we use to train
                 probabilityPerLabel = {}
                 for label in self.labelCounts:
                         logProb = 0
@@ -38,8 +38,7 @@ def GetValues(self):
                         if line[0] != '@':  #start of actual data
                                 self.featureVectors.append(line.strip().lower().split(','))
                         else:   #feature definitions
-                                if line.strip().lower().find('@data') == -1 and 
-           (not line.lower().startswith('@relation')):
+                                if line.strip().lower().find('@data') == -1 and (not line.lower().startswith('@relation')):
                                         self.featureNameList.append(line.strip().split()[1])
                                         self.features[self.featureNameList[len(self.featureNameList) - 1]] = line[line.find('{')+1: line.find('}')].strip().split(',')
                 file.close()

diff --git a/naivebayes.py b/naivebayes.py
@@ -38,7 +38,8 @@ def GetValues(self):
                         if line[0] != '@':  #start of actual data
                                 self.featureVectors.append(line.strip().lower().split(','))
                         else:   #feature definitions
-                                if line.strip().lower().find('@data') == -1 and (not line.lower().startswith('@relation')):
+                                if line.strip().lower().find('@data') == -1 and 
+           (not line.lower().startswith('@relation')):
                                         self.featureNameList.append(line.strip().split()[1])
                                         self.features[self.featureNameList[len(self.featureNameList) - 1]] = line[line.find('{')+1: line.find('}')].strip().split(',')
                 file.close()

diff --git a/naivebayes.py b/naivebayes.py
@@ -6,9 +6,9 @@ class Model:
         def __init__(self, arffFile):
                 print "hello"
                 self.trainingFile = arffFile
-                self.features = {}      #all feature names and their possible values (including the class label)
-                self.featureNameList = []       #this is to maintain the order of features as in the arff
-                self.featureCounts = collections.defaultdict(lambda: 1)#contains tuples of the form (label, feature_name, feature_value)
+                self.features = {} #feature names and vals (including class labels)
+                self.featureNameList = []  #maintain the order of features as in the arff
+                self.featureCounts = collections.defaultdict(lambda: 1)#(label, feature_name, feature_value)
                 self.featureVectors = []        #contains all the values and the label as the last entry
                 self.labelCounts = collections.defaultdict(lambda: 0)   #these will be smoothed later
 
@@ -18,11 +18,11 @@ def TrainClassifier(self):
                         for counter in range(0, len(fv)-1):
                                 self.featureCounts[(fv[len(fv)-1], self.featureNameList[counter], fv[counter])] += 1
 
-                for label in self.labelCounts:  #increment the counts for each label. remember that the last feature is actually the label
+                for label in self.labelCounts:  #last feature is actually the label
                         for feature in self.featureNameList[:len(self.featureNameList)-1]:
                                 self.labelCounts[label] += len(self.features[feature])
 
-        def Classify(self, featureVector):      #featureVector is a simple list like the ones that we use to train
+        def Classify(self, featureVector):      
                 probabilityPerLabel = {}
                 for label in self.labelCounts:
                         logProb = 0

diff --git a/naivebayes.py b/naivebayes.py
@@ -0,0 +1,57 @@
+from __future__ import division
+import collections
+import math
+
+class Model: 
+        def __init__(self, arffFile):
+                print "hello"
+                self.trainingFile = arffFile
+                self.features = {}      #all feature names and their possible values (including the class label)
+                self.featureNameList = []       #this is to maintain the order of features as in the arff
+                self.featureCounts = collections.defaultdict(lambda: 1)#contains tuples of the form (label, feature_name, feature_value)
+                self.featureVectors = []        #contains all the values and the label as the last entry
+                self.labelCounts = collections.defaultdict(lambda: 0)   #these will be smoothed later
+
+        def TrainClassifier(self):
+                for fv in self.featureVectors:
+                        self.labelCounts[fv[len(fv)-1]] += 1 #udpate count of the label
+                        for counter in range(0, len(fv)-1):
+                                self.featureCounts[(fv[len(fv)-1], self.featureNameList[counter], fv[counter])] += 1
+
+                for label in self.labelCounts:  #increment the counts for each label. remember that the last feature is actually the label
+                        for feature in self.featureNameList[:len(self.featureNameList)-1]:
+                                self.labelCounts[label] += len(self.features[feature])
+
+        def Classify(self, featureVector):      #featureVector is a simple list like the ones that we use to train
+                probabilityPerLabel = {}
+                for label in self.labelCounts:
+                        logProb = 0
+                        for featureValue in featureVector:
+                                logProb += math.log(self.featureCounts[(label, self.featureNameList[featureVector.index(featureValue)], featureValue)]/self.labelCounts[label])
+                        probabilityPerLabel[label] = math.exp(logProb)
+                print probabilityPerLabel
+                return max(probabilityPerLabel, key = lambda classLabel: probabilityPerLabel[classLabel])
+
+        def GetValues(self):
+                file = open(self.trainingFile, 'r')
+                for line in file:
+                        if line[0] != '@':  #start of actual data
+                                self.featureVectors.append(line.strip().lower().split(','))
+                        else:   #feature definitions
+                                if line.strip().lower().find('@data') == -1 and (not line.lower().startswith('@relation')):
+                                        self.featureNameList.append(line.strip().split()[1])
+                                        self.features[self.featureNameList[len(self.featureNameList) - 1]] = line[line.find('{')+1: line.find('}')].strip().split(',')
+                file.close()
+
+        def TestClassifier(self, arffFile):
+                file = open(arffFile, 'r')
+                for line in file:
+                        if line[0] != '@':
+                                vector = line.strip().lower().split(',')
+                                print "classifier: " + self.Classify(vector) + " given " + vector[len(vector) - 1]                                
+
+if __name__ == "__main__":
+        model = Model("/home/wirelive/Dropbox/code/NaiveBayesPython/tennis.arff")
+        model.GetValues()
+        model.TrainClassifier()
+        model.TestClassifier("/home/wirelive/Dropbox/code/NaiveBayesPython/tennis.arff")
No results found