Skip to content

Instantly share code, notes, and snippets.

@kennyballou
Forked from kvorion/naivebayes.py
Last active August 29, 2015 14:03
Show Gist options
  • Select an option

  • Save kennyballou/e26ddeb469509f059b70 to your computer and use it in GitHub Desktop.

Select an option

Save kennyballou/e26ddeb469509f059b70 to your computer and use it in GitHub Desktop.

Revisions

  1. kennyballou revised this gist Jul 3, 2014. 1 changed file with 93 additions and 54 deletions.
    147 changes: 93 additions & 54 deletions naivebayes.py
    Original file line number Diff line number Diff line change
    @@ -1,58 +1,97 @@
    #Author: Krishnamurthy Koduvayur Viswanathan
    #!/usr/bin/env python
    '''Sample Naive Bayes Classifier
    '''

    from __future__ import division
    import collections
    import math
    import sys

    class Model:
        '''Naive Bayes classifier trained from a nominal ARFF file.

        Call GetValues() to load the training file, TrainClassifier() to build
        the counts, then Classify() or TestClassifier().  The last attribute of
        the file is used as the class label.
        '''

        def __init__(self, arffFile):
            '''Create an untrained model.

            arffFile -- path of the ARFF file read by GetValues()
            '''
            self.trainingFile = arffFile
            # feature name -> possible values (the class label included)
            self.features = {}
            # feature names in the order of the arff; the label is the last one
            self.featureNameList = []
            # (label, feature_name, feature_value) -> count, 1 when unseen
            self.featureCounts = collections.defaultdict(lambda: 1)
            # training rows: all the values, with the label as the last entry
            self.featureVectors = []
            # label -> count; smoothed at the end of TrainClassifier()
            self.labelCounts = collections.defaultdict(lambda: 0)

        def TrainClassifier(self):
            '''Count labels and (label, feature, value) triples.'''
            for fv in self.featureVectors:
                label = fv[-1]
                self.labelCounts[label] += 1  # update count of the label
                for counter, value in enumerate(fv[:-1]):
                    key = (label, self.featureNameList[counter], value)
                    self.featureCounts[key] += 1

            # smoothing: every label count grows by the number of possible
            # values of each non-label feature
            for label in self.labelCounts:
                for feature in self.featureNameList[:-1]:
                    self.labelCounts[label] += len(self.features[feature])

        def Classify(self, featureVector):
            '''Return the most probable label for featureVector.

            featureVector -- feature values in arff order, like a training
                             row; a trailing class label is ignored

            The per-label scores are printed as a side effect.
            '''
            # Pair values with feature names by position.  list.index() returns
            # the first match, so it names the wrong feature whenever two
            # features hold the same value.  zip() also drops a trailing label,
            # which must not be scored as if it were a feature.
            observed = list(zip(self.featureNameList[:-1], featureVector))
            total = sum(self.labelCounts.values())
            probabilityPerLabel = {}
            for label in self.labelCounts:
                logProb = 0
                for name, value in observed:
                    logProb += math.log(
                        self.featureCounts[(label, name, value)] /
                        self.labelCounts[label])
                probabilityPerLabel[label] = (
                    (self.labelCounts[label] / total) * math.exp(logProb))
            # single-argument call form prints the same under Python 2 and 3
            print(probabilityPerLabel)
            return max(probabilityPerLabel,
                       key=lambda classLabel: probabilityPerLabel[classLabel])

        def GetValues(self):
            '''Read feature definitions and training rows from the training file.'''
            # "with" closes the file even when a line fails to parse
            with open(self.trainingFile, 'r') as arff:
                for line in arff:
                    stripped = line.strip()
                    # blank lines and '%' comments are not training rows
                    if not stripped or stripped.startswith('%'):
                        continue
                    if line[0] != '@':  # start of actual data
                        self.featureVectors.append(stripped.lower().split(','))
                    elif (stripped.lower().find('@data') == -1 and
                          not line.lower().startswith('@relation')):
                        # feature definition: "@attribute name {v1,v2,...}"
                        name = stripped.split()[1]
                        self.featureNameList.append(name)
                        self.features[name] = line[
                            line.find('{') + 1: line.find('}')].strip().split(',')

        def TestClassifier(self, arffFile):
            '''Classify every data row of arffFile and print the result.

            arffFile -- path of an ARFF file whose rows end with the expected
                        label
            '''
            # the original never closed this file
            with open(arffFile, 'r') as arff:
                for line in arff:
                    stripped = line.strip()
                    if (not stripped or stripped.startswith('%') or
                            line[0] == '@'):
                        continue
                    vector = stripped.lower().split(',')
                    print("classifier: " + self.Classify(vector) +
                          " given " + vector[-1])

    if __name__ == "__main__":
        import sys

        # The ARFF file may be given as the first command-line argument; the
        # previously hard-coded location remains the default.
        arffPath = sys.argv[1] if len(sys.argv) > 1 else "/home/tennis.arff"
        model = Model(arffPath)
        model.GetValues()
        model.TrainClassifier()
        # the classifier is evaluated on the same file it was trained on
        model.TestClassifier(arffPath)
    __author__ = 'Krishnamurthy Koduvayur Viswanathan'
    __credits__ = ['Kenny Ballou',]

    class Model(object):
        '''Naive Bayes classifier for nominal ARFF data.

        Call ``get_values()`` to load the training file, ``train_classifier()``
        to build the counts, then ``classify()`` or ``test_classifier()``.  The
        last attribute of the file is used as the class label.
        '''

        def __init__(self, arff_file):
            '''Create an untrained model.

            :param arff_file: path of the ARFF file read by ``get_values()``
            '''
            self.training_file = arff_file
            # feature name -> list of its possible values (label included)
            self.features = {}
            # feature names in file order; the last entry is the class label
            self.feature_name_list = []
            # (label, feature_name, feature_value) -> count; a combination
            # that was never seen counts as 1
            self.feature_counts = collections.defaultdict(lambda: 1)
            # one list per training row: feature values, then the label
            self.feature_vectors = []
            # label -> count; smoothed at the end of train_classifier()
            self.label_counts = collections.defaultdict(lambda: 0)

        def get_values(self):
            '''Parse the training file into feature definitions and rows.

            Blank lines and ``%`` comment lines are skipped.

            :raises ValueError: if an attribute line has no ``{...}`` value list
            '''
            with open(self.training_file, 'r') as training_file:
                for line in training_file:
                    line = line.strip().lower()
                    # an empty line would make line[0] raise IndexError, and a
                    # comment must not be stored as a training row
                    if not line or line.startswith('%'):
                        continue
                    if line[0] != '@':
                        # data row
                        self.feature_vectors.append(
                            [value.strip() for value in line.split(',')])
                    elif ('@data' not in line and
                          not line.startswith('@relation')):
                        # feature definition: "@attribute name {v1,v2,...}"
                        name = line.split()[1]
                        values = line[line.index('{') + 1:line.index('}')]
                        self.feature_name_list.append(name)
                        self.features[name] = [
                            value.strip() for value in values.split(',')]

        def train_classifier(self):
            '''Count labels and (label, feature, value) triples.

            Counts are rebuilt from scratch on every call, so training twice
            does not double count.
            '''
            self.label_counts.clear()
            self.feature_counts.clear()
            for feature_vector in self.feature_vectors:
                label = feature_vector[-1]
                self.label_counts[label] += 1
                for position, value in enumerate(feature_vector[:-1]):
                    name = self.feature_name_list[position]
                    self.feature_counts[(label, name, value)] += 1

            # smoothing: every label count grows by the number of possible
            # values of each non-label feature
            value_total = sum(len(self.features[name])
                              for name in self.feature_name_list[:-1])
            for label in self.label_counts:
                self.label_counts[label] += value_total

        def classify(self, feature_vector):
            '''Return the most probable label for feature_vector.

            The per-label scores are printed as a side effect.

            :param feature_vector: feature values in file order; a trailing
                class label, as found in a data row, is ignored
            :returns: the label with the highest score
            :raises ValueError: if the model has not been trained
            '''
            if not self.label_counts:
                raise ValueError('model has not been trained')
            # Pair values with feature names by position.  list.index() returns
            # the first match, so it names the wrong feature whenever two
            # features hold the same value.  zip() also drops a trailing label,
            # which must not be scored as if it were a feature.
            observed = list(zip(self.feature_name_list[:-1], feature_vector))
            total = sum(self.label_counts.values())
            prob_per_label = {}
            for label, label_count in self.label_counts.items():
                log_prob = 0
                for name, value in observed:
                    # .get() keeps scoring from adding keys to the model
                    count = self.feature_counts.get((label, name, value), 1)
                    log_prob += math.log(count / label_count)
                prob_per_label[label] = (
                    (label_count / total) * math.exp(log_prob))
            print(prob_per_label)
            return max(prob_per_label, key=prob_per_label.get)

        def test_classifier(self, arff_file):
            '''Classify every data row of arff_file and print the result.

            :param arff_file: path of an ARFF file whose rows end with the
                expected label
            '''
            with open(arff_file, 'r') as arff:
                for line in arff:
                    line = line.strip().lower()
                    # skip blank lines, '%' comments and '@' header lines
                    if not line or line[0] in '%@':
                        continue
                    vector = [value.strip() for value in line.split(',')]
                    print("classifier: %s given %s" % (
                        self.classify(vector), vector[-1]))

    def main(arff_file):
        '''Train a model on arff_file, then classify the rows of that same file.

        :param arff_file: path of the ARFF file used for training and testing
        '''
        classifier = Model(arff_file)
        # load the data, build the counts, then report on every row
        classifier.get_values()
        classifier.train_classifier()
        classifier.test_classifier(arff_file)

    if __name__ == '__main__':
        # assert statements are removed under "python -O", so the argument
        # check is done explicitly and reports how to call the script
        if len(sys.argv) < 2:
            sys.exit('usage: %s ARFF_FILE' % sys.argv[0])
        main(sys.argv[1])
  2. kvorion revised this gist Dec 7, 2010. 1 changed file with 2 additions and 1 deletion.
    3 changes: 2 additions & 1 deletion naivebayes.py
    Original file line number Diff line number Diff line change
    @@ -1,10 +1,11 @@
    #Author: Krishnamurthy Koduvayur Viswanathan

    from __future__ import division
    import collections
    import math

    class Model:
    def __init__(self, arffFile):
    print "hello"
    self.trainingFile = arffFile
    self.features = {} #all feature names and their possible values (including the class label)
    self.featureNameList = [] #this is to maintain the order of features as in the arff
  3. kvorion revised this gist Dec 7, 2010. 1 changed file with 4 additions and 4 deletions.
    8 changes: 4 additions & 4 deletions naivebayes.py
    Original file line number Diff line number Diff line change
    @@ -18,7 +18,7 @@ def TrainClassifier(self):
    for counter in range(0, len(fv)-1):
    self.featureCounts[(fv[len(fv)-1], self.featureNameList[counter], fv[counter])] += 1

    for label in self.labelCounts: #increment the counts for each label. remember that the last feature is actually the label
    for label in self.labelCounts: #increase label counts (smoothing). remember that the last feature is actually the label
    for feature in self.featureNameList[:len(self.featureNameList)-1]:
    self.labelCounts[label] += len(self.features[feature])

    @@ -28,7 +28,7 @@ def Classify(self, featureVector): #featureVector is a simple list like the
    logProb = 0
    for featureValue in featureVector:
    logProb += math.log(self.featureCounts[(label, self.featureNameList[featureVector.index(featureValue)], featureValue)]/self.labelCounts[label])
    probabilityPerLabel[label] = math.exp(logProb)
    probabilityPerLabel[label] = (self.labelCounts[label]/sum(self.labelCounts.values())) * math.exp(logProb)
    print probabilityPerLabel
    return max(probabilityPerLabel, key = lambda classLabel: probabilityPerLabel[classLabel])

    @@ -51,7 +51,7 @@ def TestClassifier(self, arffFile):
    print "classifier: " + self.Classify(vector) + " given " + vector[len(vector) - 1]

    if __name__ == "__main__":
    model = Model("/home/wirelive/Dropbox/code/NaiveBayesPython/tennis.arff")
    model = Model("/home/tennis.arff")
    model.GetValues()
    model.TrainClassifier()
    model.TestClassifier("/home/wirelive/Dropbox/code/NaiveBayesPython/tennis.arff")
    model.TestClassifier("/home/tennis.arff")
  4. kvorion revised this gist Dec 7, 2010. 1 changed file with 6 additions and 7 deletions.
    13 changes: 6 additions & 7 deletions naivebayes.py
    Original file line number Diff line number Diff line change
    @@ -6,9 +6,9 @@ class Model:
    def __init__(self, arffFile):
    print "hello"
    self.trainingFile = arffFile
    self.features = {} #feature names and vals (including class labels)
    self.featureNameList = [] #maintain the order of features as in the arff
    self.featureCounts = collections.defaultdict(lambda: 1)#(label, feature_name, feature_value)
    self.features = {} #all feature names and their possible values (including the class label)
    self.featureNameList = [] #this is to maintain the order of features as in the arff
    self.featureCounts = collections.defaultdict(lambda: 1)#contains tuples of the form (label, feature_name, feature_value)
    self.featureVectors = [] #contains all the values and the label as the last entry
    self.labelCounts = collections.defaultdict(lambda: 0) #these will be smoothed later

    @@ -18,11 +18,11 @@ def TrainClassifier(self):
    for counter in range(0, len(fv)-1):
    self.featureCounts[(fv[len(fv)-1], self.featureNameList[counter], fv[counter])] += 1

    for label in self.labelCounts: #last feature is actually the label
    for label in self.labelCounts: #increment the counts for each label. remember that the last feature is actually the label
    for feature in self.featureNameList[:len(self.featureNameList)-1]:
    self.labelCounts[label] += len(self.features[feature])

    def Classify(self, featureVector):
    def Classify(self, featureVector): #featureVector is a simple list like the ones that we use to train
    probabilityPerLabel = {}
    for label in self.labelCounts:
    logProb = 0
    @@ -38,8 +38,7 @@ def GetValues(self):
    if line[0] != '@': #start of actual data
    self.featureVectors.append(line.strip().lower().split(','))
    else: #feature definitions
    if line.strip().lower().find('@data') == -1 and
    (not line.lower().startswith('@relation')):
    if line.strip().lower().find('@data') == -1 and (not line.lower().startswith('@relation')):
    self.featureNameList.append(line.strip().split()[1])
    self.features[self.featureNameList[len(self.featureNameList) - 1]] = line[line.find('{')+1: line.find('}')].strip().split(',')
    file.close()
  5. kvorion revised this gist Dec 7, 2010. 1 changed file with 2 additions and 1 deletion.
    3 changes: 2 additions & 1 deletion naivebayes.py
    Original file line number Diff line number Diff line change
    @@ -38,7 +38,8 @@ def GetValues(self):
    if line[0] != '@': #start of actual data
    self.featureVectors.append(line.strip().lower().split(','))
    else: #feature definitions
    if line.strip().lower().find('@data') == -1 and (not line.lower().startswith('@relation')):
    if line.strip().lower().find('@data') == -1 and
    (not line.lower().startswith('@relation')):
    self.featureNameList.append(line.strip().split()[1])
    self.features[self.featureNameList[len(self.featureNameList) - 1]] = line[line.find('{')+1: line.find('}')].strip().split(',')
    file.close()
  6. kvorion revised this gist Dec 7, 2010. 1 changed file with 5 additions and 5 deletions.
    10 changes: 5 additions & 5 deletions naivebayes.py
    Original file line number Diff line number Diff line change
    @@ -6,9 +6,9 @@ class Model:
    def __init__(self, arffFile):
    print "hello"
    self.trainingFile = arffFile
    self.features = {} #all feature names and their possible values (including the class label)
    self.featureNameList = [] #this is to maintain the order of features as in the arff
    self.featureCounts = collections.defaultdict(lambda: 1)#contains tuples of the form (label, feature_name, feature_value)
    self.features = {} #feature names and vals (including class labels)
    self.featureNameList = [] #maintain the order of features as in the arff
    self.featureCounts = collections.defaultdict(lambda: 1)#(label, feature_name, feature_value)
    self.featureVectors = [] #contains all the values and the label as the last entry
    self.labelCounts = collections.defaultdict(lambda: 0) #these will be smoothed later

    @@ -18,11 +18,11 @@ def TrainClassifier(self):
    for counter in range(0, len(fv)-1):
    self.featureCounts[(fv[len(fv)-1], self.featureNameList[counter], fv[counter])] += 1

    for label in self.labelCounts: #increment the counts for each label. remember that the last feature is actually the label
    for label in self.labelCounts: #last feature is actually the label
    for feature in self.featureNameList[:len(self.featureNameList)-1]:
    self.labelCounts[label] += len(self.features[feature])

    def Classify(self, featureVector): #featureVector is a simple list like the ones that we use to train
    def Classify(self, featureVector):
    probabilityPerLabel = {}
    for label in self.labelCounts:
    logProb = 0
  7. @invalid-email-address Anonymous created this gist Dec 7, 2010.
    57 changes: 57 additions & 0 deletions naivebayes.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,57 @@
    from __future__ import division
    import collections
    import math

    class Model:
        '''Naive Bayes classifier trained from a nominal ARFF file.

        Call GetValues() to load the training file, TrainClassifier() to build
        the counts, then Classify() or TestClassifier().  The last attribute of
        the file is used as the class label.
        '''

        def __init__(self, arffFile):
            '''Create an untrained model.

            arffFile -- path of the ARFF file read by GetValues()
            '''
            # the leftover debug output ("hello") is no longer printed here
            self.trainingFile = arffFile
            # feature name -> possible values (the class label included)
            self.features = {}
            # feature names in the order of the arff; the label is the last one
            self.featureNameList = []
            # (label, feature_name, feature_value) -> count, 1 when unseen
            self.featureCounts = collections.defaultdict(lambda: 1)
            # training rows: all the values, with the label as the last entry
            self.featureVectors = []
            # label -> count; smoothed at the end of TrainClassifier()
            self.labelCounts = collections.defaultdict(lambda: 0)

        def TrainClassifier(self):
            '''Count labels and (label, feature, value) triples.'''
            for fv in self.featureVectors:
                label = fv[-1]
                self.labelCounts[label] += 1  # update count of the label
                for counter, value in enumerate(fv[:-1]):
                    key = (label, self.featureNameList[counter], value)
                    self.featureCounts[key] += 1

            # smoothing: every label count grows by the number of possible
            # values of each non-label feature
            for label in self.labelCounts:
                for feature in self.featureNameList[:-1]:
                    self.labelCounts[label] += len(self.features[feature])

        def Classify(self, featureVector):
            '''Return the most probable label for featureVector.

            featureVector -- feature values in arff order, like a training
                             row; a trailing class label is ignored

            The per-label scores are printed as a side effect.
            '''
            # Pair values with feature names by position.  list.index() returns
            # the first match, so it names the wrong feature whenever two
            # features hold the same value.  zip() also drops a trailing label,
            # which must not be scored as if it were a feature.
            observed = list(zip(self.featureNameList[:-1], featureVector))
            total = sum(self.labelCounts.values())
            probabilityPerLabel = {}
            for label in self.labelCounts:
                logProb = 0
                for name, value in observed:
                    logProb += math.log(
                        self.featureCounts[(label, name, value)] /
                        self.labelCounts[label])
                # weight the likelihood by the class prior, which the score
                # previously left out
                probabilityPerLabel[label] = (
                    (self.labelCounts[label] / total) * math.exp(logProb))
            # single-argument call form prints the same under Python 2 and 3
            print(probabilityPerLabel)
            return max(probabilityPerLabel,
                       key=lambda classLabel: probabilityPerLabel[classLabel])

        def GetValues(self):
            '''Read feature definitions and training rows from the training file.'''
            # "with" closes the file even when a line fails to parse
            with open(self.trainingFile, 'r') as arff:
                for line in arff:
                    stripped = line.strip()
                    # blank lines and '%' comments are not training rows
                    if not stripped or stripped.startswith('%'):
                        continue
                    if line[0] != '@':  # start of actual data
                        self.featureVectors.append(stripped.lower().split(','))
                    elif (stripped.lower().find('@data') == -1 and
                          not line.lower().startswith('@relation')):
                        # feature definition: "@attribute name {v1,v2,...}"
                        name = stripped.split()[1]
                        self.featureNameList.append(name)
                        self.features[name] = line[
                            line.find('{') + 1: line.find('}')].strip().split(',')

        def TestClassifier(self, arffFile):
            '''Classify every data row of arffFile and print the result.

            arffFile -- path of an ARFF file whose rows end with the expected
                        label
            '''
            # the original never closed this file
            with open(arffFile, 'r') as arff:
                for line in arff:
                    stripped = line.strip()
                    if (not stripped or stripped.startswith('%') or
                            line[0] == '@'):
                        continue
                    vector = stripped.lower().split(',')
                    print("classifier: " + self.Classify(vector) +
                          " given " + vector[-1])

    if __name__ == "__main__":
        import sys

        # The ARFF file may be given as the first command-line argument; the
        # previously hard-coded location remains the default.
        defaultPath = "/home/wirelive/Dropbox/code/NaiveBayesPython/tennis.arff"
        arffPath = sys.argv[1] if len(sys.argv) > 1 else defaultPath
        model = Model(arffPath)
        model.GetValues()
        model.TrainClassifier()
        # the classifier is evaluated on the same file it was trained on
        model.TestClassifier(arffPath)