andreiz · August 27, 2024 09:42 · Jun 27, 2013 · Feb 22, 2013 · Jan 26, 2013
diff --git a/classifier.php b/classifier.php
@@ -49,6 +49,9 @@
 for ($j=0; $j < NUM_FEATURES+1; $j++)
     $weights[$j] = mt_rand()/mt_getrandmax()*5.0;
 
+// Calculate the data we need for feature scaling (mean/variance)
+$scaling = calc_feature_scaling($training);
+
 $learning_rate = 0.05;
 $steps = 20000; // number of steps to take for gradient descent
 
@@ -59,9 +62,10 @@
     for ($j = 0; $j < NUM_FEATURES+1; $j++) {
         $sum_m = 0.0;
         for ($i = 0; $i < $NUM_SAMPLES; $i++) {
-            $h = hypothesis($training[$i], $weights);
+            $scaled_data = scale($training[$i], $scaling);
+            $h = hypothesis($scaled_data, $weights);
             // The first weight has a dummy 1 "feature" value
-            $part = ($h - $labels[$i]) * ($j==0 ? 1.0 : $training[$i][$j-1]);
+            $part = ($h - $labels[$i]) * ($j==0 ? 1.0 : $scaled_data[$j-1]);
             $sum_m = $sum_m + $part;
         }
         $temp[$j] = $weights[$j] - $learning_rate * $sum_m/$NUM_SAMPLES;
@@ -77,7 +81,7 @@
 print "\nValidating training\n";
 $correct = 0;
 for ($i = 0; $i < $NUM_SAMPLES; $i++) {
-    $predict = predict($training[$i], $weights);
+    $predict = predict(scale($training[$i], $scaling), $weights);
     printf("Input: %-16s actual: %d, predict: %d", vector_to_str($training[$i]), $labels[$i], $predict);
     if ($labels[$i] != $predict)
         print " - miss";
@@ -96,7 +100,7 @@
     array(-14., 1.1, 1.),
 );
 for ($i = 0; $i < sizeof($test); $i++) {
-    $predict = predict($test[$i], $weights);
+    $predict = predict(scale($test[$i], $scaling), $weights);
     printf("Input: %-16s predict: %d\n", vector_to_str($test[$i]), $predict);
 }
 
@@ -122,6 +126,43 @@ function predict($input, $weights)
     return $predict;
 }
 
+function scale($input, $scaling)
+{
+    foreach ($input as $f => &$value) {
+        $value = ($value - $scaling['mean'][$f]) /
+                    $scaling['variance'][$f];
+    }
+    return $input;
+}
+
+function calc_feature_scaling($data)
+{
+    $mins = array_fill(0, NUM_FEATURES, INF);
+    $maxs = array_fill(0, NUM_FEATURES, -INF);
+    $sums = array_fill(0, NUM_FEATURES, 0);
+    $scaling = array('mean' => array(),
+                     'variance' => array());
+    $N = sizeof($data);
+    foreach ($data as $i => $row) {
+        foreach ($row as $f => $value) {
+            if ($value > $maxs[$f])
+                $maxs[$f] = $value;
+            if ($value < $mins[$f])
+                $mins[$f] = $value;
+            $sums[$f] += $value;
+        }
+    }
+
+    for ($f = 0; $f < NUM_FEATURES; $f++) {
+        $scaling['mean'][$f] = $sums[$f] / $N;
+        $scaling['variance'][$f] = $maxs[$f] - $mins[$f];
+        if ($scaling['variance'][$f] == 0)
+            throw new Exception("Feature #$f has the same value in all the samples, invalid data");
+    }
+
+    return $scaling;
+}
+
 function vector_to_str($x)
 {
     return '['.implode(", ", $x).']';

diff --git a/classifier.php b/classifier.php
@@ -51,8 +51,8 @@
 
 $learning_rate = 0.05;
 $steps = 20000; // number of steps to take for gradient descent
-$temp = array(); // temp array to hold updates for weights during the loop
 
+$temp = array(); // temp array to hold updates for weights during the loop
 for ($n = 0; $n < $steps; $n++) {
 
     // For each weight, perform the gradient descent step and save the result to temp
@@ -78,7 +78,10 @@
 $correct = 0;
 for ($i = 0; $i < $NUM_SAMPLES; $i++) {
     $predict = predict($training[$i], $weights);
-    printf("Input: %s, actual: %d, predict: %d\n", vector_to_str($training[$i]), $labels[$i], $predict);
+    printf("Input: %-16s actual: %d, predict: %d", vector_to_str($training[$i]), $labels[$i], $predict);
+    if ($labels[$i] != $predict)
+        print " - miss";
+    print "\n";
     if ($predict == $labels[$i])
         $correct++;
 }
@@ -94,7 +97,7 @@
 );
 for ($i = 0; $i < sizeof($test); $i++) {
     $predict = predict($test[$i], $weights);
-    printf("Input: %s, predict: %d\n", vector_to_str($test[$i]), $predict);
+    printf("Input: %-16s predict: %d\n", vector_to_str($test[$i]), $predict);
 }
 
 function hypothesis($x, $weights)

diff --git a/classifier.php b/classifier.php
@@ -0,0 +1,127 @@
+<?php
+
+error_reporting(E_ALL);
+
+define('NUM_FEATURES', 3);
+
+// My dataset describes cities around the world where I might consider living.
+// Each sample (city) consists of 3 features:
+//  * Feature 1: average low winter temperature in the city
+//  * Feature 2: city population, in millions
+//  * Feature 3: does the city have an airport I can fly to from USA directly?
+//
+// The labels (categories) are 1 (yes) and 0 (no).
+// All the data is floating-point.
+
+$training = array(
+    array(-11., 2.6,  1.),
+    array(  8., 0.78, 1.),
+    array( 15., 4.2,  0.),
+    array(-16., 0.18, 0.),
+    array(  3., 1.1,  0.),
+    array(  7., 1.4,  1.),
+    array( -3., 1.44, 1.),
+    array( -7., 0.52, 0.),
+    array( 30., 0.82, 1.),
+    array( 20., 1.32, 0.),
+);
+
+$labels = array(
+    0.,
+    1.,
+    0.,
+    0.,
+    1.,
+    1.,
+    1.,
+    0.,
+    0.,
+    1
+);
+
+$NUM_SAMPLES = sizeof($training);
+
+// Initialize the weights array to random starting values.
+// There are always 1+NUM_FEATURES weights, because the first weight
+// does not correspond to a feature value, since:
+//   weights * features = weight0 + weight1 * feature1 + weight2 * feature2 + ...
+$weights = array();
+for ($j=0; $j < NUM_FEATURES+1; $j++)
+    $weights[$j] = mt_rand()/mt_getrandmax()*5.0;
+
+$learning_rate = 0.05;
+$steps = 20000; // number of steps to take for gradient descent
+$temp = array(); // temp array to hold updates for weights during the loop
+
+for ($n = 0; $n < $steps; $n++) {
+
+    // For each weight, perform the gradient descent step and save the result to temp
+    for ($j = 0; $j < NUM_FEATURES+1; $j++) {
+        $sum_m = 0.0;
+        for ($i = 0; $i < $NUM_SAMPLES; $i++) {
+            $h = hypothesis($training[$i], $weights);
+            // The first weight has a dummy 1 "feature" value
+            $part = ($h - $labels[$i]) * ($j==0 ? 1.0 : $training[$i][$j-1]);
+            $sum_m = $sum_m + $part;
+        }
+        $temp[$j] = $weights[$j] - $learning_rate * $sum_m/$NUM_SAMPLES;
+    }
+
+    $weights = $temp;
+}
+
+echo "Executed $n steps\n";
+echo "Weights: ", vector_to_str($weights), "\n";
+
+// Validate the results
+print "\nValidating training\n";
+$correct = 0;
+for ($i = 0; $i < $NUM_SAMPLES; $i++) {
+    $predict = predict($training[$i], $weights);
+    printf("Input: %s, actual: %d, predict: %d\n", vector_to_str($training[$i]), $labels[$i], $predict);
+    if ($predict == $labels[$i])
+        $correct++;
+}
+printf("Correctness = %.0f%%\n", $correct/$NUM_SAMPLES*100.0);
+
+// Try some predictions
+print "\nTesting the model\n";
+$test = array(
+    array(-1., 1.1, 1.),
+    array(23., 0.9, 0.),
+    array( 4., 1.9, 0.),
+    array(-14., 1.1, 1.),
+);
+for ($i = 0; $i < sizeof($test); $i++) {
+    $predict = predict($test[$i], $weights);
+    printf("Input: %s, predict: %d\n", vector_to_str($test[$i]), $predict);
+}
+
+function hypothesis($x, $weights)
+{
+    $score = $weights[0]; // free weight
+    $k = sizeof($x);
+    // Calculate dot product
+    for ($i = 0; $i < $k; $i++)
+        $score += $weights[$i+1] * $x[$i];
+    // Run through the sigmoid (logistic) function
+    return 1.0/(1.0 + exp(-$score));
+}
+
+function predict($input, $weights)
+{
+    $output = hypothesis($input, $weights);
+    // Threshold on 0.5
+    if ($output >= 0.50)
+        $predict = 1;
+    else
+        $predict = 0;
+    return $predict;
+}
+
+function vector_to_str($x)
+{
+    return '['.implode(", ", $x).']';
+}
+
+?>
No results found