@RobinfWu
Created December 10, 2018 21:54
<html><head><meta content="text/html; charset=UTF-8" http-equiv="content-type"><style type="text/css">.lst-kix_ltk9e9jr8w77-0>li:before{content:"\0025cf "}.lst-kix_ltk9e9jr8w77-1>li:before{content:"\0025cb "}ul.lst-kix_ozbimzt2zbqm-5{list-style-type:none}ul.lst-kix_ozbimzt2zbqm-4{list-style-type:none}ul.lst-kix_hjjcnlx2j62y-8{list-style-type:none}ul.lst-kix_ozbimzt2zbqm-7{list-style-type:none}ul.lst-kix_hjjcnlx2j62y-7{list-style-type:none}ul.lst-kix_ozbimzt2zbqm-6{list-style-type:none}ul.lst-kix_hjjcnlx2j62y-6{list-style-type:none}ul.lst-kix_ozbimzt2zbqm-1{list-style-type:none}ul.lst-kix_hjjcnlx2j62y-5{list-style-type:none}ul.lst-kix_ozbimzt2zbqm-0{list-style-type:none}.lst-kix_ltk9e9jr8w77-2>li:before{content:"\0025a0 "}ul.lst-kix_hjjcnlx2j62y-4{list-style-type:none}ul.lst-kix_ozbimzt2zbqm-3{list-style-type:none}ul.lst-kix_hjjcnlx2j62y-3{list-style-type:none}ul.lst-kix_ozbimzt2zbqm-2{list-style-type:none}.lst-kix_ltk9e9jr8w77-5>li:before{content:"\0025a0 "}ul.lst-kix_hjjcnlx2j62y-2{list-style-type:none}ul.lst-kix_hjjcnlx2j62y-1{list-style-type:none}ul.lst-kix_hjjcnlx2j62y-0{list-style-type:none}.lst-kix_ltk9e9jr8w77-3>li:before{content:"\0025cf "}ul.lst-kix_ozbimzt2zbqm-8{list-style-type:none}.lst-kix_ltk9e9jr8w77-4>li:before{content:"\0025cb "}.lst-kix_ltk9e9jr8w77-6>li:before{content:"\0025cf "}.lst-kix_ozbimzt2zbqm-0>li:before{content:"\0025cf "}.lst-kix_ltk9e9jr8w77-7>li:before{content:"\0025cb "}.lst-kix_ozbimzt2zbqm-1>li:before{content:"\0025cb "}.lst-kix_ltk9e9jr8w77-8>li:before{content:"\0025a0 "}ul.lst-kix_ltk9e9jr8w77-8{list-style-type:none}ul.lst-kix_ltk9e9jr8w77-3{list-style-type:none}ul.lst-kix_ltk9e9jr8w77-2{list-style-type:none}ul.lst-kix_ltk9e9jr8w77-1{list-style-type:none}ul.lst-kix_ltk9e9jr8w77-0{list-style-type:none}ul.lst-kix_ltk9e9jr8w77-7{list-style-type:none}ul.lst-kix_ltk9e9jr8w77-6{list-style-type:none}.lst-kix_hjjcnlx2j62y-7>li:before{content:"\0025cb "}.lst-kix_hjjcnlx2j62y-8>li:before{content:"\0025a0 
"}ul.lst-kix_ltk9e9jr8w77-5{list-style-type:none}ul.lst-kix_ltk9e9jr8w77-4{list-style-type:none}.lst-kix_hjjcnlx2j62y-5>li:before{content:"\0025a0 "}.lst-kix_hjjcnlx2j62y-6>li:before{content:"\0025cf "}.lst-kix_ozbimzt2zbqm-7>li:before{content:"\0025cb "}.lst-kix_hjjcnlx2j62y-2>li:before{content:"\0025a0 "}.lst-kix_ozbimzt2zbqm-8>li:before{content:"\0025a0 "}.lst-kix_dw3s0lcncm8v-7>li:before{content:"\0025cb "}.lst-kix_hjjcnlx2j62y-3>li:before{content:"\0025cf "}.lst-kix_hjjcnlx2j62y-4>li:before{content:"\0025cb "}.lst-kix_dw3s0lcncm8v-8>li:before{content:"\0025a0 "}.lst-kix_ozbimzt2zbqm-3>li:before{content:"\0025cf "}.lst-kix_ozbimzt2zbqm-2>li:before{content:"\0025a0 "}.lst-kix_ozbimzt2zbqm-4>li:before{content:"\0025cb "}.lst-kix_dw3s0lcncm8v-1>li:before{content:"\0025cb "}.lst-kix_dw3s0lcncm8v-3>li:before{content:"\0025cf "}.lst-kix_dw3s0lcncm8v-2>li:before{content:"\0025a0 "}.lst-kix_dw3s0lcncm8v-6>li:before{content:"\0025cf "}.lst-kix_hjjcnlx2j62y-1>li:before{content:"\0025cb "}.lst-kix_ozbimzt2zbqm-6>li:before{content:"\0025cf "}.lst-kix_dw3s0lcncm8v-5>li:before{content:"\0025a0 "}.lst-kix_ozbimzt2zbqm-5>li:before{content:"\0025a0 "}.lst-kix_hjjcnlx2j62y-0>li:before{content:"\0025cf "}.lst-kix_dw3s0lcncm8v-4>li:before{content:"\0025cb "}.lst-kix_dw3s0lcncm8v-0>li:before{content:"\0025cf "}ul.lst-kix_dw3s0lcncm8v-7{list-style-type:none}ul.lst-kix_dw3s0lcncm8v-8{list-style-type:none}ul.lst-kix_dw3s0lcncm8v-3{list-style-type:none}ul.lst-kix_dw3s0lcncm8v-4{list-style-type:none}ul.lst-kix_dw3s0lcncm8v-5{list-style-type:none}ul.lst-kix_dw3s0lcncm8v-6{list-style-type:none}ul.lst-kix_dw3s0lcncm8v-0{list-style-type:none}ul.lst-kix_dw3s0lcncm8v-1{list-style-type:none}ul.lst-kix_dw3s0lcncm8v-2{list-style-type:none}ol{margin:0;padding:0}table td,table 
th{padding:0}.c2{color:#000000;font-weight:400;text-decoration:none;vertical-align:baseline;font-size:12pt;font-family:"Arial";font-style:normal}.c1{padding-top:0pt;padding-bottom:0pt;line-height:1.15;orphans:2;widows:2;text-align:center}.c0{padding-top:0pt;padding-bottom:0pt;line-height:1.15;orphans:2;widows:2;text-align:left}.c6{color:#000000;text-decoration:none;vertical-align:baseline;font-family:"Arial";font-style:normal}.c15{background-color:#ffffff;max-width:540pt;padding:36pt 36pt 36pt 36pt}.c3{font-size:12pt;font-weight:700}.c12{font-weight:400;font-size:18pt}.c9{font-weight:700;font-size:24pt}.c10{font-weight:400;font-size:11pt}.c11{margin-left:36pt;padding-left:0pt}.c7{margin-left:55pt;text-indent:-27pt}.c14{padding:0;margin:0}.c4{height:11pt}.c8{font-style:italic}.c5{font-size:12pt}.c16{margin-left:36pt}.c13{vertical-align:super}.title{padding-top:0pt;color:#000000;font-size:26pt;padding-bottom:3pt;font-family:"Arial";line-height:1.15;page-break-after:avoid;orphans:2;widows:2;text-align:left}.subtitle{padding-top:0pt;color:#666666;font-size:15pt;padding-bottom:16pt;font-family:"Arial";line-height:1.15;page-break-after:avoid;orphans:2;widows:2;text-align:left}li{color:#000000;font-size:11pt;font-family:"Arial"}p{margin:0;color:#000000;font-size:11pt;font-family:"Arial"}h1{padding-top:20pt;color:#000000;font-size:20pt;padding-bottom:6pt;font-family:"Arial";line-height:1.15;page-break-after:avoid;orphans:2;widows:2;text-align:left}h2{padding-top:18pt;color:#000000;font-size:16pt;padding-bottom:6pt;font-family:"Arial";line-height:1.15;page-break-after:avoid;orphans:2;widows:2;text-align:left}h3{padding-top:16pt;color:#434343;font-size:14pt;padding-bottom:4pt;font-family:"Arial";line-height:1.15;page-break-after:avoid;orphans:2;widows:2;text-align:left}h4{padding-top:14pt;color:#666666;font-size:12pt;padding-bottom:4pt;font-family:"Arial";line-height:1.15;page-break-after:avoid;orphans:2;widows:2;text-align:left}h5{padding-top:12pt;color:#666666;font-size:11p
t;padding-bottom:4pt;font-family:"Arial";line-height:1.15;page-break-after:avoid;orphans:2;widows:2;text-align:left}h6{padding-top:12pt;color:#666666;font-size:11pt;padding-bottom:4pt;font-family:"Arial";line-height:1.15;page-break-after:avoid;font-style:italic;orphans:2;widows:2;text-align:left}</style></head><body class="c15"><p class="c0"><span class="c9">Introduction</span></p><p class="c0 c4"><span class="c6 c3"></span></p><p class="c0"><span class="c2">In this blog post, we will expand on the Perceptron Algorithm.</span></p><p class="c0 c4"><span class="c2"></span></p><p class="c0"><span class="c5">The perceptron is </span><span class="c2">a machine learning algorithm.</span></p><p class="c0 c4"><span class="c6 c3"></span></p><p class="c0"><span class="c5">Machine learning is, in essence, a way of analyzing data by building analytical models automatically. </span><span class="c2">A common definition of machine learning is a system that learns a task without being explicitly programmed to perform that task. As such, machine learning has been the driving engine for artificial intelligence. </span></p><p class="c0 c4"><span class="c2"></span></p><p class="c0"><span class="c2">Machine learning is often categorized into four types: supervised learning, unsupervised learning, semi-supervised learning, and reinforcement learning. </span></p><p class="c0 c4"><span class="c2"></span></p><p class="c0"><span class="c3">Supervised learning</span><span class="c2">&nbsp;</span></p><p class="c0"><span class="c2">You know how some practice problems have an answer key? Have you ever tried to study the solution to understand the problem? Welcome to supervised learning, where your dataset comes with the answer key!</span></p><p class="c0 c4"><span class="c2"></span></p><p class="c0"><span class="c2">Supervised learning is a machine learning setting where the learning algorithm knows the outcomes of the training dataset provided. 
Using these outcomes, the algorithm continually makes predictions until it reaches an acceptable level of accuracy.</span></p><p class="c0 c4"><span class="c2"></span></p><p class="c0"><span class="c2">Supervised learning involves learning a mapping between x and y, where x is the feature vector (independent variable) and y is the label (dependent variable). If the labeled data, y, is discrete, such as boolean values or textual labels, then this becomes a classification problem. If the labeled data, y, is continuous, then this becomes a regression problem.</span></p><p class="c0 c4"><span class="c2"></span></p><p class="c0"><span class="c2">Supervised learning problems can thus be divided into classification problems and regression problems. Classification is where the problem is trying to sort data into different categories; in supervised learning, the training outcomes are the categories each data point is associated with in the training dataset. Regression is where the outcomes are represented by real values like floats. </span></p><p class="c1"><span style="overflow: hidden; display: inline-block; margin: 0.00px 0.00px; border: 0.00px solid #000000; transform: rotate(0.00rad) translateZ(0px); -webkit-transform: rotate(0.00rad) translateZ(0px); width: 563.64px; height: 275.50px;"><img alt="" src="images/image23.png" style="width: 563.64px; height: 275.50px; margin-left: 0.00px; margin-top: 0.00px; transform: rotate(0.00rad) translateZ(0px); -webkit-transform: rotate(0.00rad) translateZ(0px);" title=""></span></p><p class="c0 c4"><span class="c2"></span></p><p class="c0"><span class="c6 c3">Unsupervised Learning</span></p><p class="c0"><span class="c2">You know how some practice problems do not come with an answer key? The best you can do might be to cluster the problems of your exam into different types of questions, but without the answer key, you don&rsquo;t know if you&rsquo;re right or wrong. 
Welcome to unsupervised learning, which is more difficult than supervised learning. K-Means Clustering is an example of an unsupervised learning algorithm.</span></p><p class="c0 c4"><span class="c2"></span></p><p class="c0"><span class="c2">Unsupervised learning is when a problem provides only the training data and no corresponding outcomes. The purpose of an unsupervised learning system is to understand the underlying model or distribution of the data rather than to relate the data to known outcomes. </span></p><p class="c0 c4"><span class="c2"></span></p><p class="c0"><span class="c2">Unsupervised learning can be divided into clustering and association. In clustering problems, we are trying to figure out the groupings of data. In association problems, we are trying to learn a set of rules that describe the dataset.</span></p><p class="c0 c4"><span class="c2"></span></p><p class="c1"><span style="overflow: hidden; display: inline-block; margin: 0.00px 0.00px; border: 0.00px solid #000000; transform: rotate(0.00rad) translateZ(0px); -webkit-transform: rotate(0.00rad) translateZ(0px); width: 496.86px; height: 289.50px;"><img alt="" src="images/image26.png" style="width: 496.86px; height: 289.50px; margin-left: 0.00px; margin-top: 0.00px; transform: rotate(0.00rad) translateZ(0px); -webkit-transform: rotate(0.00rad) translateZ(0px);" title=""></span></p><p class="c0 c4"><span class="c3 c6"></span></p><p class="c0"><span class="c6 c3">Semi-Supervised Learning</span></p><p class="c0"><span class="c2">Semi-supervised learning is when the dataset is partially labeled: some of the data points have outcomes while others do not. Semi-supervised learning is a mix of both supervised and unsupervised learning. 
In these problems, we can use unsupervised learning techniques to learn about the structure of the input data, and we can use supervised learning techniques to make predictions on the unlabeled data, using the outcomes of the labeled data for training. </span></p><p class="c0 c4"><span class="c6 c3"></span></p><p class="c0"><span class="c3">Reinforcement Learning</span></p><p class="c0"><span class="c2">Even more challenging than supervised and unsupervised learning is reinforcement learning, which deals with how an agent acts in a dynamic environment to maximize its reward. For example, OpenAI has used reinforcement learning to compete with expert human players of the video game known as Dota 2. In this example, the &lsquo;agents&rsquo; would be the characters, and the environment would be the entire stage where the characters can roam around. The &lsquo;reward&rsquo; that these characters want to maximize would be winning the game.</span></p><p class="c0 c4"><span class="c6 c3"></span></p><p class="c0"><span class="c2">This blog post will focus on supervised learning tasks for binary classification.</span></p><p class="c0 c4"><span class="c2"></span></p><p class="c0"><span class="c9">Perceptron Algorithm</span></p><p class="c0 c4"><span class="c2"></span></p><p class="c0"><span class="c2">The perceptron is an algorithm for supervised learning of binary classifiers. A binary classifier is essentially a function that maps an input x to a single binary output y. The perceptron algorithm is a linear classifier. This means that if the training set is not linearly separable, the classifier can never be one hundred percent accurate. </span></p><p class="c0 c4"><span class="c2"></span></p><p class="c0"><span class="c2">The most basic means of classifying data into two categories would be to draw a line that separates them. 
</span></p><p class="c0 c4"><span class="c2"></span></p><p class="c0"><span style="overflow: hidden; display: inline-block; margin: 0.00px 0.00px; border: 0.00px solid #000000; transform: rotate(0.00rad) translateZ(0px); -webkit-transform: rotate(0.00rad) translateZ(0px); width: 240.00px; height: 227.00px;"><img alt="" src="images/image18.png" style="width: 240.00px; height: 227.00px; margin-left: 0.00px; margin-top: 0.00px; transform: rotate(0.00rad) translateZ(0px); -webkit-transform: rotate(0.00rad) translateZ(0px);" title=""></span></p><p class="c0"><span class="c2">The diagram above shows two features (which are the two axes) and two labels (marked as red dots or blue dots). There are three different lines shown that perfectly separate the data points into two categories. In fact, there are an infinite number of ways to draw a line such that these two labels are separated. </span></p><p class="c0 c4"><span class="c2"></span></p><p class="c0"><span style="overflow: hidden; display: inline-block; margin: 0.00px 0.00px; border: 0.00px solid #000000; transform: rotate(0.00rad) translateZ(0px); -webkit-transform: rotate(0.00rad) translateZ(0px); width: 444.00px; height: 205.00px;"><img alt="" src="images/image20.png" style="width: 444.00px; height: 205.00px; margin-left: 0.00px; margin-top: 0.00px; transform: rotate(0.00rad) translateZ(0px); -webkit-transform: rotate(0.00rad) translateZ(0px);" title=""></span></p><p class="c0 c4"><span class="c2"></span></p><p class="c0"><span class="c5">The Perceptron algorithm starts by initializing the weights to 0. It then visits each data point x to check if Sign(x</span><span class="c5 c13">T</span><span class="c5">w) is equal to the label of x. Note that Sign refers to the</span><span class="c2">&nbsp;sign function, which outputs either 1 or -1. 
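</span></p><p class="c0 c4"><span class="c2"></span></p><p class="c0"><span class="c2">As a rough sketch of this procedure (a minimal NumPy version; the toy data, labels, and epoch count below are made up for illustration):</span></p>

```python
import numpy as np

def perceptron_train(X, y, epochs=10):
    """Train a perceptron on features X (m x n) and labels y in {-1, +1}."""
    w = np.zeros(X.shape[1])          # weights initialized to 0
    for _ in range(epochs):
        for xi, yi in zip(X, y):
            # Predict with the sign of x^T w; on a mismatch,
            # add x for a positive label or subtract x for a negative one.
            if np.sign(xi @ w) != yi:
                w += yi * xi
    return w

# Toy linearly separable data: the label is the sign of the first feature.
X = np.array([[2.0, 1.0], [1.5, -1.0], [-2.0, 0.5], [-1.0, -2.0]])
y = np.array([1, 1, -1, -1])
w = perceptron_train(X, y)
preds = np.sign(X @ w)
```

<p class="c0"><span class="c2">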
Whenever there&rsquo;s a mismatch between the predicted label and the actual label, the algorithm adds x to the weights if the actual label is positive or subtracts x from the weights if the actual label is negative.</span></p><p class="c0 c4"><span class="c2"></span></p><p class="c1"><span style="overflow: hidden; display: inline-block; margin: 0.00px 0.00px; border: 0.00px solid #000000; transform: rotate(0.00rad) translateZ(0px); -webkit-transform: rotate(0.00rad) translateZ(0px); width: 468.00px; height: 200.00px;"><img alt="" src="images/image25.png" style="width: 468.00px; height: 200.00px; margin-left: 0.00px; margin-top: 0.00px; transform: rotate(0.00rad) translateZ(0px); -webkit-transform: rotate(0.00rad) translateZ(0px);" title=""></span></p><p class="c0 c4"><span class="c2"></span></p><p class="c0"><span class="c2">The perceptron algorithm is a building block for neural networks and is considered the simplest feedforward neural network. </span></p><p class="c0 c4"><span class="c2"></span></p><p class="c0"><span class="c6 c9">Logistic Regression</span></p><p class="c0"><span class="c2">Logistic regression is a statistical model that uses the logistic function to model a binary dependent variable. Unlike the perceptron algorithm, which uses the sign function, logistic regression uses the sigmoid function and introduces an important machine learning concept called gradient descent.</span></p><p class="c0 c4"><span class="c6 c10"></span></p><p class="c0"><span class="c5">We can represent a line as: W</span><span class="c5 c13">T</span><span class="c2">x + b, where W is the slope of the line, x is the feature vector, and b is the y-intercept. More formally, W is often referred to as the weight and b is referred to as the bias. </span></p><p class="c0"><span class="c2">We already know what x is because that&rsquo;s our data. 
But what about the weights and bias?</span></p><p class="c0 c4"><span class="c2"></span></p><p class="c0"><span class="c2">For binary classification, you may want to know the probability of an event happening given some data. </span></p><p class="c1"><img src="images/image1.png"></p><p class="c0"><span class="c5">You could try </span><img src="images/image2.png"><span class="c2">, which wouldn&rsquo;t work because a probability must be between 0 and 1. </span></p><p class="c0"><span class="c5">By applying a sigmoid function </span><img src="images/image3.png"><span class="c2">, you can bound the output between 0 and 1. </span></p><p class="c1"><span style="overflow: hidden; display: inline-block; margin: 0.00px 0.00px; border: 0.00px solid #000000; transform: rotate(0.00rad) translateZ(0px); -webkit-transform: rotate(0.00rad) translateZ(0px); width: 294.45px; height: 271.50px;"><img alt="_images/sigmoid.png" src="images/image29.png" style="width: 294.45px; height: 271.50px; margin-left: 0.00px; margin-top: 0.00px; transform: rotate(0.00rad) translateZ(0px); -webkit-transform: rotate(0.00rad) translateZ(0px);" title=""></span></p><p class="c1"><img src="images/image4.png"></p><p class="c1"><img src="images/image5.png"></p><p class="c0"><span class="c6 c3">Loss Functions: </span></p><p class="c0 c4"><span class="c2"></span></p><p class="c0"><span class="c2">A loss function measures the error between the predicted output and the actual output from the dataset. 
</span></p><p class="c0 c4"><span class="c2"></span></p><p class="c0"><span class="c2">There are many types of loss functions:</span></p><p class="c0 c4"><span class="c2"></span></p><p class="c0"><span class="c2">Hinge Loss - this is what a Support Vector Machine would use.</span></p><p class="c0 c4"><span class="c2"></span></p><p class="c0"><span class="c5">Cross-Entropy Loss (also known as log loss or logistic loss) - this is what Logistic Regression would use.</span><span class="c2">&nbsp;</span></p><p class="c1"><img src="images/image6.png"></p><p class="c1 c4"><span class="c2"></span></p><p class="c1"><img src="images/image7.png"></p><p class="c1 c4"><span class="c2"></span></p><p class="c0"><span class="c2">A cost function measures the average error across the entire dataset:</span></p><p class="c0 c4"><span class="c2"></span></p><p class="c1"><span class="c5">&nbsp;</span><img src="images/image8.png"></p><p class="c0 c4"><span class="c2"></span></p><p class="c0"><span class="c2">Ideally, we would like no errors: a perfect model, with a cost of zero. We want to find the w and b that minimize the cost function as much as possible. </span></p><p class="c0 c4"><span class="c2"></span></p><p class="c0"><span class="c3">Gradient Descent</span><span class="c2">&nbsp;is an algorithm to find the w and b that minimize a convex function:</span></p><p class="c0 c4"><span class="c2"></span></p><p class="c0"><span class="c2">Repeat {</span></p><p class="c0 c16"><img src="images/image9.png"></p><p class="c0 c16"><img src="images/image10.png"></p><p class="c0"><span class="c2">}</span></p><p class="c0 c4"><span class="c2"></span></p><p class="c0"><span class="c2">Both w and b can be initialized to 0. Alternatively, w can also be initialized to a random value. </span></p><p class="c0"><span class="c5">The alpha value, </span><img src="images/image11.png"><span class="c2">, is the learning rate, which controls the size of each step taken toward a local minimum. 
If alpha is too large, gradient descent may overshoot the local minimum entirely, and if alpha is too small, training can be very slow. </span></p><p class="c0 c4"><span class="c2"></span></p><p class="c0"><img src="images/image12.png"><span class="c5">&nbsp;and </span><img src="images/image13.png"><span class="c2">&nbsp;are the gradients of the cost function with respect to the weights and bias.</span></p><p class="c0 c4"><span class="c2"></span></p><p class="c0"><span class="c2">How do we calculate these gradients? </span></p><p class="c0 c4"><span class="c2"></span></p><p class="c0"><span style="overflow: hidden; display: inline-block; margin: 0.00px 0.00px; border: 0.00px solid #000000; transform: rotate(0.00rad) translateZ(0px); -webkit-transform: rotate(0.00rad) translateZ(0px); width: 720.00px; height: 420.00px;"><img alt="" src="images/image21.png" style="width: 720.00px; height: 420.00px; margin-left: 0.00px; margin-top: 0.00px; transform: rotate(0.00rad) translateZ(0px); -webkit-transform: rotate(0.00rad) translateZ(0px);" title=""></span></p><p class="c0"><span class="c2">We use a calculus trick known as the Chain Rule, as shown above. This involves taking the derivative of the loss function with respect to the activation function (sigmoid in this case), the derivative of the activation function with respect to its input z, and the derivative of z with respect to the weights or bias; through the Chain Rule, we can then derive the derivative of the loss function with respect to the weights and bias. </span></p><p class="c0 c4"><span class="c2"></span></p><p class="c0"><span class="c2">It is important to note that gradient descent is only guaranteed to find the global minimum when the cost function is convex; on the non-convex cost functions that arise with neural networks, it may settle in a local minimum, and optimization is an active area of research. 
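</span></p><p class="c0 c4"><span class="c2"></span></p><p class="c0"><span class="c2">Putting the sigmoid, the cross-entropy cost, and these chain-rule gradients together, here is a minimal logistic regression sketch (NumPy assumed; the toy data, learning rate, and iteration count below are made up for illustration):</span></p>

```python
import numpy as np

def sigmoid(z):
    # Bounds any real input to the interval (0, 1).
    return 1.0 / (1.0 + np.exp(-z))

def logistic_gd(X, y, alpha=0.1, iters=1000):
    """Minimize the cross-entropy cost with gradient descent.

    The gradients follow from the chain rule:
    dw = X^T (a - y) / m and db = mean(a - y), where a = sigmoid(Xw + b).
    """
    m, n = X.shape
    w, b = np.zeros(n), 0.0
    for _ in range(iters):
        a = sigmoid(X @ w + b)   # predicted probabilities
        dw = X.T @ (a - y) / m   # gradient w.r.t. the weights
        db = np.mean(a - y)      # gradient w.r.t. the bias
        w -= alpha * dw          # step toward a minimum
        b -= alpha * db
    return w, b

# Toy 1-D data with labels in {0, 1}.
X = np.array([[0.5], [1.5], [-0.5], [-1.5]])
y = np.array([1, 1, 0, 0])
w, b = logistic_gd(X, y)
probs = sigmoid(X @ w + b)
```

<p class="c0"><span class="c2">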
</span></p><p class="c0 c4"><span class="c2"></span></p><p class="c0"><span class="c6 c3">The XOR Problem </span></p><p class="c0 c4"><span class="c6 c3"></span></p><p class="c0"><span class="c2">Here&rsquo;s another labeled dataset: </span></p><p class="c0 c4"><span class="c2"></span></p><p class="c0"><span style="overflow: hidden; display: inline-block; margin: 0.00px 0.00px; border: 0.00px solid #000000; transform: rotate(0.00rad) translateZ(0px); -webkit-transform: rotate(0.00rad) translateZ(0px); width: 281.00px; height: 140.00px;"><img alt="" src="images/image22.png" style="width: 281.00px; height: 140.00px; margin-left: 0.00px; margin-top: 0.00px; transform: rotate(0.00rad) translateZ(0px); -webkit-transform: rotate(0.00rad) translateZ(0px);" title=""></span></p><p class="c0"><span class="c2">And below is the dataset plotted on a graph. There are only four points, but can you draw a line that separates them into their respective labels?</span></p><p class="c0 c4"><span class="c2"></span></p><p class="c0"><span style="overflow: hidden; display: inline-block; margin: 0.00px 0.00px; border: 0.00px solid #000000; transform: rotate(0.00rad) translateZ(0px); -webkit-transform: rotate(0.00rad) translateZ(0px); width: 336.61px; height: 324.50px;"><img alt="" src="images/image27.png" style="width: 336.61px; height: 324.50px; margin-left: 0.00px; margin-top: 0.00px; transform: rotate(0.00rad) translateZ(0px); -webkit-transform: rotate(0.00rad) translateZ(0px);" title=""></span></p><p class="c0"><span class="c2">Well, you can&rsquo;t!</span></p><p class="c0 c4"><span class="c6 c3"></span></p><p class="c0"><span class="c2">What you see above is the XOR Problem, and it is impossible for the perceptron algorithm to solve this simple problem. 
There does not exist a single line that can separate these four points into their respective labels.</span></p><p class="c0 c4"><span class="c2"></span></p><p class="c0"><span class="c2">That is the problem with linear models: they assume that the data is linearly separable. Not everything can be so conveniently separated by a straight line.</span></p><p class="c0 c4"><span class="c2"></span></p><p class="c0"><span class="c2">To solve the XOR problem &ndash; and any problem that involves complicated manifolds &ndash; we need to increase the model capacity, meaning that the mapping between the feature vectors and the labels must be modeled by something more sophisticated than a hyperplane.</span></p><p class="c0 c4"><span class="c2"></span></p><p class="c0"><span class="c2">By building upon the perceptron algorithm and logistic regression, we can create a neural network.</span></p><p class="c0"><span class="c2">&nbsp;</span></p><p class="c0"><span class="c6 c9">Neural Network</span></p><p class="c0 c4"><span class="c2"></span></p><p class="c0"><span class="c2">A neural network is a machine learning system inspired by the biological neural networks in animal brains. A neural network is essentially a stack of multiple perceptrons and can consist of millions of interconnected processing nodes. An individual node can be connected to nodes in the layer beneath it, from which it receives data, and to nodes in the layer above it, to which it sends data. The term Deep Neural Network simply means that there are many layers to the neural network. 
Deep Learning is a buzzword the media loves to use, but it simply means learning with deep neural networks.</span></p><p class="c0 c4"><span class="c2"></span></p><p class="c0"><span style="overflow: hidden; display: inline-block; margin: 0.00px 0.00px; border: 0.00px solid #000000; transform: rotate(0.00rad) translateZ(0px); -webkit-transform: rotate(0.00rad) translateZ(0px); width: 720.00px; height: 446.67px;"><img alt="" src="images/image24.png" style="width: 720.00px; height: 446.67px; margin-left: 0.00px; margin-top: 0.00px; transform: rotate(0.00rad) translateZ(0px); -webkit-transform: rotate(0.00rad) translateZ(0px);" title=""></span></p><p class="c0"><span class="c2">A neural network has an input layer, at least one hidden layer, and an output layer. Training data is fed to the input layer. The data then passes through the hidden layer(s) in the middle and finally comes out at the output layer. The Perceptron Algorithm can be thought of as a minimal neural network: an input layer connected directly to a single output &lsquo;neuron&rsquo;. </span></p><p class="c0 c4"><span class="c2"></span></p><p class="c0"><span class="c5">Another way to think about neural networks is that they are essentially a composite function. If x represents the </span><span class="c3">input layer</span><span class="c5">, f represents </span><span class="c3">hidden layer 1</span><span class="c5">, and g represents </span><span class="c3">hidden layer 2</span><span class="c2">, then the network above could be viewed as a composite function: g(f(x)). It&rsquo;s like Inception, except instead of dreams within dreams, we have a function within a function. The more layers a neural network has, the &lsquo;deeper&rsquo; it becomes, and the more sophisticated the mapping it can model between x and y. 
It is this potential for finding complex models that makes neural networks incredibly powerful, especially with big data.</span></p><p class="c0 c4"><span class="c2"></span></p><p class="c0"><span class="c2">The activation functions for the hidden layers do not have to use a sigmoid. In fact, tanh (Hyperbolic Tangent) almost always works better than sigmoid in hidden layers because its output is centered at the origin.</span></p><p class="c0"><img src="images/image14.png"><span class="c5">&nbsp; &nbsp; </span><img src="images/image15.png"></p><p class="c0 c4"><span class="c2"></span></p><p class="c0"><span style="overflow: hidden; display: inline-block; margin: 0.00px 0.00px; border: 0.00px solid #000000; transform: rotate(0.00rad) translateZ(0px); -webkit-transform: rotate(0.00rad) translateZ(0px); width: 384.28px; height: 275.50px;"><img alt="" src="images/image28.png" style="width: 384.28px; height: 275.50px; margin-left: 0.00px; margin-top: 0.00px; transform: rotate(0.00rad) translateZ(0px); -webkit-transform: rotate(0.00rad) translateZ(0px);" title=""></span></p><p class="c0 c4"><span class="c2"></span></p><p class="c0"><span class="c2">ReLU (Rectified Linear Unit) is usually the default activation function for hidden layers. It is the most widely used activation function. 
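</span></p><p class="c0 c4"><span class="c2"></span></p><p class="c0"><span class="c2">As a sketch of how these activations fit into a small network, here is a hypothetical two-layer forward pass with a ReLU hidden layer and a sigmoid output (the layer shapes and random weights are made up for illustration):</span></p>

```python
import numpy as np

def relu(z):
    # max(0, z) elementwise: passes positives through, zeroes out negatives.
    return np.maximum(0.0, z)

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def forward(x, W1, b1, W2, b2):
    """Two-layer forward pass, i.e. the composite function
    sigmoid(W2 @ relu(W1 @ x + b1) + b2)."""
    h = relu(W1 @ x + b1)        # hidden layer activations
    return sigmoid(W2 @ h + b2)  # output in (0, 1)

rng = np.random.default_rng(0)
W1, b1 = rng.normal(size=(3, 2)), np.zeros(3)  # 2 inputs -> 3 hidden units
W2, b2 = rng.normal(size=(1, 3)), np.zeros(1)  # 3 hidden units -> 1 output
out = forward(np.array([1.0, -1.0]), W1, b1, W2, b2)
```

<p class="c0"><span class="c2">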
</span></p><p class="c0"><img src="images/image16.png"><span class="c5">&nbsp; &nbsp; </span><img src="images/image17.png"></p><p class="c0"><span style="overflow: hidden; display: inline-block; margin: 0.00px 0.00px; border: 0.00px solid #000000; transform: rotate(0.00rad) translateZ(0px); -webkit-transform: rotate(0.00rad) translateZ(0px); width: 268.00px; height: 233.00px;"><img alt="" src="images/image19.png" style="width: 268.00px; height: 233.00px; margin-left: 0.00px; margin-top: 0.00px; transform: rotate(0.00rad) translateZ(0px); -webkit-transform: rotate(0.00rad) translateZ(0px);" title=""></span></p><p class="c0 c4"><span class="c2"></span></p><p class="c0"><span class="c2">For binary classification, we would want the output layer to use the sigmoid function, whose output can be thresholded to give a binary prediction. </span></p><p class="c0 c4"><span class="c2"></span></p><p class="c0"><span class="c2">Logistic Regression uses gradient descent to minimize the cost function. The same holds true for a neural network, but how does one calculate the gradient on a neural network? </span></p><p class="c0 c4"><span class="c2"></span></p><p class="c0"><span class="c2">Neural networks use an algorithm known as backpropagation, which calculates the gradients using the chain rule. In the backpropagation algorithm, we are looking for the minimum value of the error function in weight space, and this is done using gradient descent. The output of the algorithm is the set of weights that minimizes the error function; these weights are considered a solution to the problem.</span></p><p class="c0 c4"><span class="c2"></span></p><p class="c0"><span class="c9">Tips on Training a Neural Network</span></p><p class="c0"><span class="c5">It is common to split the dataset into a training set, validation set (development set), and test set, </span><span class="c2">or into just a training set and a test set. 
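</span></p><p class="c0 c4"><span class="c2"></span></p><p class="c0"><span class="c2">One way to sketch such a split (a hypothetical helper assuming NumPy arrays: 80/20 train/test, with 20 percent of the training portion held out for validation &ndash; common conventions, not requirements):</span></p>

```python
import numpy as np

def split_dataset(X, y, seed=0):
    """Shuffle, take 20% as the test set, then hold out 20% of the
    remaining training portion as a validation (dev) set."""
    rng = np.random.default_rng(seed)
    idx = rng.permutation(len(X))       # shuffle so the split is random
    X, y = X[idx], y[idx]
    n_test = len(X) // 5                # 20% test
    X_test, y_test = X[:n_test], y[:n_test]
    X_rest, y_rest = X[n_test:], y[n_test:]
    n_val = len(X_rest) // 5            # 20% of the training portion
    return (X_rest[n_val:], y_rest[n_val:],   # training set
            X_rest[:n_val], y_rest[:n_val],   # validation set
            X_test, y_test)                   # test set

X = np.arange(100).reshape(100, 1)
y = np.arange(100)
X_tr, y_tr, X_val, y_val, X_te, y_te = split_dataset(X, y)
```

<p class="c0"><span class="c2">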
The dataset is generally split in an 80:20 ratio, where the training set is 80 percent and the test set is 20 percent. If there is a validation set as well, it is taken as 20 percent of the training set. These numbers are not set in stone; they are a recommendation.</span></p><p class="c0 c4"><span class="c2"></span></p><p class="c0"><span class="c3">Normalize the data </span><span class="c2">&ndash; normalizing your data can make it easier and faster to optimize the weights. Here are two very common ways to normalize the data. Both can be used at the same time.</span></p><ul class="c14 lst-kix_ltk9e9jr8w77-0 start"><li class="c0 c11"><span class="c2">Subtract Mean: The mean of the dataset becomes zero. This can be done by subtracting the mean of the entire feature vector from every single value in that vector.</span></li><li class="c0 c11"><span class="c2">Normalize Variance: This can be done by dividing each value of a given feature vector by the standard deviation of the vector, so the variance becomes one.</span></li></ul><p class="c0 c4"><span class="c2"></span></p><p class="c0"><span class="c2">Deep Learning suffers from exploding/vanishing gradients. A partial remedy is that the more features you have, the smaller you want the initial weights to be. With n being the number of input features, you can set the variance of the initial weights to 1/n for tanh and 2/n for ReLU.</span></p><p class="c0 c4"><span class="c2"></span></p><p class="c0"><span class="c6 c3">Bias and Variance</span></p><p class="c0"><span class="c2">Bias and variance are important concepts in the field of machine learning.</span></p><p class="c0 c4"><span class="c2"></span></p><p class="c0"><span class="c2">If you have high bias, that indicates underfitting: a poor training set error. Underfitting means that the model is not sophisticated enough to accurately model the training set. 
Solutions to this might be using a bigger network, training longer, or trying a different neural network architecture.</span></p><p class="c0 c4"><span class="c2"></span></p><p class="c0"><span class="c5">High variance indicates overfitting, which shows up as a good training set error but a poor validation set error. Solutions to this might include regularization, a collection of techniques to counteract overfitting. </span><span class="c3">Dropout </span><span class="c5">is one method of regularization, which randomly turns off some percentage of &lsquo;neurons&rsquo; in the hidden layers as the model is training. Another common technique is </span><span class="c3">Early Stopping</span><span class="c5">. By plotting the validation set error and the training set error over the course of training, this technique picks the point right before the validation set error starts to increase, and stops training right before overfitting happens. Another solution is </span><span class="c3">Data Augmentation</span><span class="c2">, meaning artificially increasing the size of the dataset, for example by transforming existing examples. </span></p><p class="c0 c4"><span class="c2"></span></p><p class="c0"><span class="c2">Understanding bias and variance is important because it helps you deduce where the problem lies. For instance, if you have high bias, increasing the size of your dataset will not help. </span></p><p class="c0 c4"><span class="c2"></span></p><p class="c0"><span class="c3">Mini-Batch Gradient Descent</span></p><p class="c0"><span class="c2">Suppose you have a dataset of size m = 5 million. With plain gradient descent, the neural network would have to process all the data before taking a single step. Using mini-batch gradient descent, you can instead split the training set into mini-batches of 1,000 examples each, giving 5,000 mini-batches. By taking one gradient step per mini-batch, the model can converge towards a local minimum faster. 
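The mini-batch splitting just described can be sketched as a simple generator. This is a minimal illustration (the function name and the assumption that the data is already shuffled are my own), using a small dataset so it runs quickly:

```python
import numpy as np

def mini_batches(X, y, batch_size=1000):
    """Yield (X_batch, y_batch) slices of size batch_size.

    With m = 5,000,000 examples and batch_size = 1,000 this would yield
    5,000 mini-batches; the model takes one gradient step per batch
    instead of one step per full pass over the data.
    """
    for start in range(0, len(X), batch_size):
        yield X[start:start + batch_size], y[start:start + batch_size]

# Small demonstration: 10,000 examples -> 10 mini-batches of 1,000.
X = np.zeros((10_000, 3))
y = np.zeros(10_000)
batches = list(mini_batches(X, y))
print(len(batches), batches[0][0].shape)  # 10 (1000, 3)
```

In practice the dataset is reshuffled before each epoch so every pass sees the batches in a different order.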
</span></p><p class="c0 c4"><span class="c2"></span></p><p class="c0"><span class="c2">If the mini-batch size is m, that is called Batch Gradient Descent. The problem is that each iteration takes too long on a large dataset; it is fine only on a small training set (Ex: m &lt;= 2000).</span></p><p class="c0 c4"><span class="c2"></span></p><p class="c0"><span class="c2">If the mini-batch size is 1, that is called Stochastic Gradient Descent. The problem is that it loses the speedup from vectorization.</span></p><p class="c0 c4"><span class="c2"></span></p><p class="c0"><span class="c9">Conclusion</span></p><p class="c0 c4"><span class="c2"></span></p><p class="c0"><span class="c5">Machine learning is </span><span class="c2">a really broad field, and this is just a small part of it. There are a multitude of algorithms and models built for different types of problems. Decision trees, for example, are a form of supervised learning used to depict every possible outcome of a decision. Another example is the nearest neighbors algorithm, which estimates how likely a data point is to belong to one grouping or another based on the points closest to it. Like these, there are many more useful algorithms waiting to be studied!</span></p><p class="c0 c4"><span class="c2"></span></p><p class="c0 c4"><span class="c2"></span></p><p class="c0"><span class="c6 c3">References </span></p><p class="c0 c7"><span class="c5">&ldquo;Artificial Neural Network.&rdquo; </span><span class="c5 c8">Wikipedia</span><span class="c2">, Wikimedia Foundation, 6 Dec. 2018, en.wikipedia.org/wiki/Artificial_neural_network.</span></p><p class="c0 c7"><span class="c5">&ldquo;Deep Learning.&rdquo; </span><span class="c8 c5">Coursera</span><span class="c2">, Rice University, www.coursera.org/specializations/deep-learning.</span></p><p class="c0 c7"><span class="c5">Hardesty, Larry. &ldquo;Explained: Neural Networks.&rdquo; </span><span class="c8 c5">MIT News</span><span class="c2">, 14 Apr. 
2017, news.mit.edu/2017/explained-neural-networks-deep-learning-0414.</span></p><p class="c0 c7"><span class="c5">&ldquo;Perceptron.&rdquo; </span><span class="c8 c5">Wikipedia</span><span class="c2">, Wikimedia Foundation, 26 Nov. 2018, en.wikipedia.org/wiki/Perceptron.</span></p><p class="c0 c7"><span class="c5">Saurabh. &ldquo;What Is Backpropagation? | Training A Neural Network | Edureka.&rdquo; </span><span class="c8 c5">Edureka Blog</span><span class="c2">, Edureka, 6 Dec. 2018, www.edureka.co/blog/backpropagation/.</span></p><p class="c0 c7"><span class="c5">&ldquo;Supervised and Unsupervised Machine Learning Algorithms.&rdquo; </span><span class="c8 c5">Machine Learning Mastery</span><span class="c2">, 22 Sept. 2016, machinelearningmastery.com/supervised-and-unsupervised-machine-learning-algorithms/.</span></p><p class="c0 c4"><span class="c6 c3"></span></p><p class="c0 c4"><span class="c6 c3"></span></p></body></html>