-
-
Save kennyballou/e26ddeb469509f059b70 to your computer and use it in GitHub Desktop.
Revisions
-
kennyballou revised this gist
Jul 3, 2014 . 1 changed file with 93 additions and 54 deletions. There are no files selected for viewing
#!/usr/bin/env python
'''Sample Naive Bayes Classifier

Trains a categorical naive Bayes model from an ARFF file and classifies the
rows of an ARFF file with it.  Every attribute is expected to be nominal
(``@attribute name {v1,v2,...}``) and the last attribute is the class label.
'''

import collections
import math
import sys

__author__ = 'Krishnamurthy Koduvayur Viswanathan'
__credits__ = ['Kenny Ballou',]


def _split_values(text):
    '''Split a comma separated string and trim whitespace around each item.'''
    return [value.strip() for value in text.split(',')]


class Model(object):
    '''Simple classifier model'''

    def __init__(self, arff_file):
        '''Create an empty, untrained model.

        :param arff_file: path of the ARFF file read by :meth:`get_values`
        '''
        self.training_file = arff_file
        # all feature names and possible values
        self.features = {}
        # order maintenance -- maintain label order
        self.feature_name_list = []
        # keyed by tuples of the form (label, feature_name, feature_value);
        # every count starts at 1 (add-one smoothing)
        self.feature_counts = collections.defaultdict(lambda: 1)
        # each vector holds the feature values with the label as last entry
        self.feature_vectors = []
        # smoothing will occur later
        self.label_counts = collections.defaultdict(lambda: 0)

    def _add_feature(self, line):
        '''Record one ``@attribute name {v1,v2,...}`` declaration.

        :param line: stripped, lower-cased attribute line
        :raises ValueError: if the line has no name or no ``{...}`` value list
        '''
        fields = line.split()
        opening = line.find('{')
        closing = line.find('}')
        if len(fields) < 2 or opening == -1 or closing < opening:
            raise ValueError('unsupported attribute declaration: %r' % line)
        name = fields[1]
        self.feature_name_list.append(name)
        self.features[name] = _split_values(line[opening + 1:closing])

    def get_values(self):
        '''Parse training file and build model

        Fills ``features``, ``feature_name_list`` and ``feature_vectors``.
        All text is lower-cased; blank lines and ``%`` comment lines are
        ignored.

        :raises ValueError: if an attribute line has no ``{...}`` value list
        '''
        with open(self.training_file, 'r') as training_file:
            for raw_line in training_file:
                line = raw_line.strip().lower()
                # nothing to parse on blank lines and '%' comment lines
                if not line or line.startswith('%'):
                    continue
                if not line.startswith('@'):
                    # start of actual data
                    self.feature_vectors.append(_split_values(line))
                elif not line.startswith(('@data', '@relation')):
                    # feature definitions
                    self._add_feature(line)

    def train_classifier(self):
        '''Train the model

        Counts labels and (label, feature, value) co-occurrences over
        ``feature_vectors``.  Call once per parsed data set: every call adds
        the smoothing term to ``label_counts`` again.
        '''
        for feature_vector in self.feature_vectors:
            label = feature_vector[-1]
            # update count for label
            self.label_counts[label] += 1
            for name, value in zip(self.feature_name_list,
                                   feature_vector[:-1]):
                self.feature_counts[(label, name, value)] += 1

        # increase label counts (smoothing).
        # Recall, last element is the label
        # NOTE(review): the cardinality of *every* feature is added to each
        # label count, so the per-feature conditional estimates do not sum
        # to one; textbook add-one smoothing would add only the cardinality
        # of the feature being estimated.  Kept to preserve existing output.
        smoothing = sum(len(self.features[name])
                        for name in self.feature_name_list[:-1])
        for label in self.label_counts:
            self.label_counts[label] += smoothing

    def classify(self, feature_vector):
        '''Classify features given by feature_vector

        :param feature_vector: simple list similar to ones given for
            training; a trailing label entry, if present, is ignored
        :returns: the label with the highest score
        :raises ValueError: if the model has not been trained
        '''
        if not self.label_counts:
            raise ValueError('classifier has not been trained')

        # Pair values with feature names by position.  Looking the position
        # up with list.index() would mis-assign any value that occurs in
        # more than one column.  The last name is the label, so it is left
        # out and a trailing label in feature_vector is never scored.
        observed = list(zip(self.feature_name_list[:-1], feature_vector))
        total = sum(self.label_counts.values())

        prob_per_label = {}
        for label, label_count in self.label_counts.items():
            log_prob = 0.0
            for name, value in observed:
                # .get() keeps unseen combinations at the smoothed count of
                # 1 without inserting them into the trained counts
                count = self.feature_counts.get((label, name, value), 1)
                # float() keeps this a true division on Python 2 as well
                log_prob += math.log(float(count) / label_count)
            prob_per_label[label] = ((float(label_count) / total) *
                                     math.exp(log_prob))
        print(prob_per_label)
        return max(prob_per_label, key=prob_per_label.get)

    def test_classifier(self, arff_file):
        '''Test our model

        Prints the predicted and the given label for every data row.

        :param arff_file: path of the ARFF file whose rows are classified
        '''
        with open(arff_file, 'r') as arff:
            for raw_line in arff:
                line = raw_line.strip().lower()
                # skip blank lines, '@' directives and '%' comments
                if not line or line[0] in '@%':
                    continue
                vector = _split_values(line)
                print("classifier: %s given %s" % (
                    self.classify(vector), vector[-1]))


def main(arff_file):
    '''main

    :param arff_file: ARFF file used both for training and for testing
    '''
    model = Model(arff_file)
    model.get_values()
    model.train_classifier()
    model.test_classifier(arff_file)


if __name__ == '__main__':
    # explicit check instead of assert: asserts are stripped under -O
    if len(sys.argv) < 2:
        sys.exit('usage: %s <training.arff>' % sys.argv[0])
    main(sys.argv[1])
kvorion revised this gist
Dec 7, 2010 . 1 changed file with 2 additions and 1 deletion. There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1,10 +1,11 @@ #Author: Krishnamurthy Koduvayur Viswanathan from __future__ import division import collections import math class Model: def __init__(self, arffFile): self.trainingFile = arffFile self.features = {} #all feature names and their possible values (including the class label) self.featureNameList = [] #this is to maintain the order of features as in the arff -
kvorion revised this gist
Dec 7, 2010 . 1 changed file with 4 additions and 4 deletions. There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -18,7 +18,7 @@ def TrainClassifier(self): for counter in range(0, len(fv)-1): self.featureCounts[(fv[len(fv)-1], self.featureNameList[counter], fv[counter])] += 1 for label in self.labelCounts: #increase label counts (smoothing). remember that the last feature is actually the label for feature in self.featureNameList[:len(self.featureNameList)-1]: self.labelCounts[label] += len(self.features[feature]) @@ -28,7 +28,7 @@ def Classify(self, featureVector): #featureVector is a simple list like the logProb = 0 for featureValue in featureVector: logProb += math.log(self.featureCounts[(label, self.featureNameList[featureVector.index(featureValue)], featureValue)]/self.labelCounts[label]) probabilityPerLabel[label] = (self.labelCounts[label]/sum(self.labelCounts.values())) * math.exp(logProb) print probabilityPerLabel return max(probabilityPerLabel, key = lambda classLabel: probabilityPerLabel[classLabel]) @@ -51,7 +51,7 @@ def TestClassifier(self, arffFile): print "classifier: " + self.Classify(vector) + " given " + vector[len(vector) - 1] if __name__ == "__main__": model = Model("/home/tennis.arff") model.GetValues() model.TrainClassifier() model.TestClassifier("/home/tennis.arff") -
kvorion revised this gist
Dec 7, 2010 . 1 changed file with 6 additions and 7 deletions. There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -6,9 +6,9 @@ class Model: def __init__(self, arffFile): print "hello" self.trainingFile = arffFile self.features = {} #all feature names and their possible values (including the class label) self.featureNameList = [] #this is to maintain the order of features as in the arff self.featureCounts = collections.defaultdict(lambda: 1)#contains tuples of the form (label, feature_name, feature_value) self.featureVectors = [] #contains all the values and the label as the last entry self.labelCounts = collections.defaultdict(lambda: 0) #these will be smoothed later @@ -18,11 +18,11 @@ def TrainClassifier(self): for counter in range(0, len(fv)-1): self.featureCounts[(fv[len(fv)-1], self.featureNameList[counter], fv[counter])] += 1 for label in self.labelCounts: #increment the counts for each label. remember that the last feature is actually the label for feature in self.featureNameList[:len(self.featureNameList)-1]: self.labelCounts[label] += len(self.features[feature]) def Classify(self, featureVector): #featureVector is a simple list like the ones that we use to train probabilityPerLabel = {} for label in self.labelCounts: logProb = 0 @@ -38,8 +38,7 @@ def GetValues(self): if line[0] != '@': #start of actual data self.featureVectors.append(line.strip().lower().split(',')) else: #feature definitions if line.strip().lower().find('@data') == -1 and (not line.lower().startswith('@relation')): self.featureNameList.append(line.strip().split()[1]) self.features[self.featureNameList[len(self.featureNameList) - 1]] = line[line.find('{')+1: line.find('}')].strip().split(',') file.close() -
kvorion revised this gist
Dec 7, 2010 . 1 changed file with 2 additions and 1 deletion. There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -38,7 +38,8 @@ def GetValues(self): if line[0] != '@': #start of actual data self.featureVectors.append(line.strip().lower().split(',')) else: #feature definitions if line.strip().lower().find('@data') == -1 and (not line.lower().startswith('@relation')): self.featureNameList.append(line.strip().split()[1]) self.features[self.featureNameList[len(self.featureNameList) - 1]] = line[line.find('{')+1: line.find('}')].strip().split(',') file.close() -
kvorion revised this gist
Dec 7, 2010 . 1 changed file with 5 additions and 5 deletions. There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -6,9 +6,9 @@ class Model: def __init__(self, arffFile): print "hello" self.trainingFile = arffFile self.features = {} #feature names and vals (including class labels) self.featureNameList = [] #maintain the order of features as in the arff self.featureCounts = collections.defaultdict(lambda: 1)#(label, feature_name, feature_value) self.featureVectors = [] #contains all the values and the label as the last entry self.labelCounts = collections.defaultdict(lambda: 0) #these will be smoothed later @@ -18,11 +18,11 @@ def TrainClassifier(self): for counter in range(0, len(fv)-1): self.featureCounts[(fv[len(fv)-1], self.featureNameList[counter], fv[counter])] += 1 for label in self.labelCounts: #last feature is actually the label for feature in self.featureNameList[:len(self.featureNameList)-1]: self.labelCounts[label] += len(self.features[feature]) def Classify(self, featureVector): probabilityPerLabel = {} for label in self.labelCounts: logProb = 0 -
There are no files selected for viewing
from __future__ import division, print_function

import collections
import math
import sys


class Model:
    '''Categorical naive Bayes classifier trained from an ARFF file.

    All attributes are expected to be nominal (``{v1,v2,...}``) and the last
    attribute is the class label.
    '''

    def __init__(self, arffFile):
        '''Create an empty, untrained model that will read ``arffFile``.'''
        self.trainingFile = arffFile
        self.features = {}  # all feature names and their possible values (including the class label)
        self.featureNameList = []  # this is to maintain the order of features as in the arff
        self.featureCounts = collections.defaultdict(lambda: 1)  # keyed by tuples of the form (label, feature_name, feature_value)
        self.featureVectors = []  # contains all the values and the label as the last entry
        self.labelCounts = collections.defaultdict(lambda: 0)  # these will be smoothed later

    def TrainClassifier(self):
        '''Count labels and (label, feature, value) triples, then smooth.

        Every call adds the smoothing term to ``labelCounts`` again, so call
        it once per parsed data set.
        '''
        for fv in self.featureVectors:
            label = fv[-1]
            self.labelCounts[label] += 1  # update count of the label
            for featureName, featureValue in zip(self.featureNameList, fv[:-1]):
                self.featureCounts[(label, featureName, featureValue)] += 1

        # increment the counts for each label.
        # remember that the last feature is actually the label
        smoothing = sum(len(self.features[name]) for name in self.featureNameList[:-1])
        for label in self.labelCounts:
            self.labelCounts[label] += smoothing

    def Classify(self, featureVector):
        '''Return the most probable label for ``featureVector``.

        ``featureVector`` is a simple list like the ones used for training;
        a trailing label entry, if present, is ignored.  Raises ValueError
        when the model has not been trained.
        '''
        if not self.labelCounts:
            raise ValueError('classifier has not been trained')

        # Pair values with feature names by position: list.index() would
        # mis-assign a value that occurs in more than one column.  The last
        # name is the label, so a trailing label is never scored.
        observed = list(zip(self.featureNameList[:-1], featureVector))
        total = sum(self.labelCounts.values())

        probabilityPerLabel = {}
        for label, labelCount in self.labelCounts.items():
            logProb = 0.0
            for featureName, featureValue in observed:
                # .get() leaves the trained counts untouched for unseen keys
                count = self.featureCounts.get((label, featureName, featureValue), 1)
                logProb += math.log(count / labelCount)
            # weight the likelihood by the class prior
            probabilityPerLabel[label] = (labelCount / total) * math.exp(logProb)
        print(probabilityPerLabel)
        return max(probabilityPerLabel, key=probabilityPerLabel.get)

    def GetValues(self):
        '''Read attribute declarations and data rows from the training file.

        Data rows are lower-cased; blank lines and ``%`` comment lines are
        ignored.
        '''
        with open(self.trainingFile, 'r') as arff:
            for rawLine in arff:
                line = rawLine.strip()
                if not line or line.startswith('%'):
                    continue
                if line[0] != '@':  # start of actual data
                    self.featureVectors.append([value.strip() for value in line.lower().split(',')])
                elif line.lower().find('@data') == -1 and (not line.lower().startswith('@relation')):
                    # feature definitions
                    featureName = line.split()[1]
                    self.featureNameList.append(featureName)
                    declared = line[line.find('{') + 1: line.find('}')]
                    self.features[featureName] = [value.strip() for value in declared.split(',')]

    def TestClassifier(self, arffFile):
        '''Print the predicted and the given label for each row of ``arffFile``.'''
        with open(arffFile, 'r') as arff:
            for rawLine in arff:
                line = rawLine.strip().lower()
                if not line or line[0] in '@%':
                    continue
                vector = [value.strip() for value in line.split(',')]
                print("classifier: " + self.Classify(vector) + " given " + vector[len(vector) - 1])


if __name__ == "__main__":
    # The original hard-coded path stays the default so that running the
    # script without arguments behaves as before.
    arffPath = sys.argv[1] if len(sys.argv) > 1 else "/home/wirelive/Dropbox/code/NaiveBayesPython/tennis.arff"
    model = Model(arffPath)
    model.GetValues()
    model.TrainClassifier()
    model.TestClassifier(arffPath)