########################################################################
#1-var Linear Regression
########################################################################
#lm(formula to fit, data frame)
cmodel <- lm(temperature ~ chirps_per_sec, data = cricket)

#Formula
fmla <- temperature ~ chirps_per_sec
fmla <- blood_pressure ~ age + weight
#LHS: outcome, RHS: inputs.
fmla <- as.formula("temperature ~ chirps_per_sec") #converts string to formula

#conveniently package diagnostics from summary(cmodel) in a data frame
broom::glance(cmodel)

sigr::wrapFTest(cmodel) #R^2 test

#by default, predict() returns training data predictions
cricket$prediction <- predict(cmodel)

ggplot(cricket, aes(x = prediction, y = temperature)) + geom_point() + geom_abline(color = "darkblue") + ggtitle("temperature vs linear model prediction")

#apply the model to new data
predict(cmodel, newdata = newChirps)

########################################################################
#Predicting once you fit a model
########################################################################

# unemployment is in your workspace
summary(unemployment)

# newrates is in your workspace
newrates

# Predict female unemployment in the unemployment data set
unemployment$prediction <- predict(unemployment_model)

# load the ggplot2 package
library(ggplot2)

# Make a plot to compare predictions to actual (prediction on x axis).
ggplot(unemployment, aes(x = prediction, y = female_unemployment)) +
  geom_point() +
  geom_abline(color = "blue")

# Predict female unemployment rate when male unemployment is 5%
pred <- predict(unemployment_model, newdata = newrates)
# Print it
pred

########################################################################
#Multivariate Linear Regression
########################################################################

# bloodpressure is in the workspace
summary(bloodpressure)

# Create the formula and print it
fmla <- blood_pressure ~ age + weight
fmla

# Fit the model: bloodpressure_model
bloodpressure_model <- lm(fmla, data = bloodpressure)

# Print bloodpressure_model and call summary()
bloodpressure_model
summary(bloodpressure_model)

# predict blood pressure using bloodpressure_model: prediction
bloodpressure$prediction <- predict(bloodpressure_model)

# plot the prediction vs actual
ggplot(bloodpressure, aes(x = prediction, y = blood_pressure)) +
  geom_point() +
  geom_abline(color = "blue")

#Pros of Linear Regression
#easy to fit and to apply, concise, less prone to overfitting, interpretable
#Cons of Linear Regression
#can only express linear and additive relationships
#collinearity -- when input variables are partially correlated
#when variables are highly correlated, the signs of coefficients might not be what you expect, but this does not affect model accuracy
#the model may be unstable (e.g. unusually high coefficients when variables are highly correlated)
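
#a quick collinearity check (a sketch, using the bloodpressure inputs above):
cor(bloodpressure$age, bloodpressure$weight) #high correlation warns of unstable coefficients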

########################################################################
#Evaluating Regression Models (graphically, RMSE, R^2, (and F-test from link))
########################################################################
#http://facweb.cs.depaul.edu/sjost/csc423/documents/f-test-reg.htm
#https://feliperego.github.io/blog/2015/10/23/Interpreting-Model-Output-In-R

#Evaluating a model graphically
#(by looking at plots of ground truth vs predictions, aka y vs y_hat)
#a well-fitted model would have:
#the x=y line running through the center of the points
#(the line of perfect prediction)
#a poorly fitting model would have:
#regions where points are all on one side of the x=y line
#systematic errors - errors that are correlated with the value of the outcome
#either you don't have all the variables in the model, or
#you need an algorithm that can find more complex relationships in the data

#Residual Plot
#residual plot: difference between outcome & prediction, vs prediction
#good fit: no systematic errors (errors evenly distributed between positive and negative, with the same magnitude above and below)
#bad fit: systematic errors (clusters of all-positive or all-negative residuals)

#Gain Curve/Plot (http://www2.cs.uregina.ca/~dbd/cs831/notes/lift_chart/lift_chart.html)
#useful when sorting instances is more important than predicting exact outcome values
#measures how well the model sorts the outcome
#x-axis: fraction of items in model-sorted order (decreasing)
#y-axis: fraction of total accumulated outcome (e.g. home sales)
#Wizard curve: perfect model
GainCurvePlot(frame, xvar, truthvar, title)
#frame is a data frame
#xvar and truthvar are strings naming the prediction and actual outcome columns of frame
#title is the title of the plot
#When the predictions sort in exactly the same order as the outcome, the relative Gini coefficient is 1.
#When the model sorts poorly, the relative Gini coefficient is close to zero, or even negative.
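
#a concrete call (a sketch; GainCurvePlot() is from the WVPlots package, and the
#houseprices columns are the hypothetical ones used in the RMSE notes below):
GainCurvePlot(houseprices, "predictedPrices", "actualPrices", "home price model gain curve")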

#Residuals
unemployment$residuals <- unemployment$female_unemployment - unemployment$prediction
ggplot(unemployment, aes(x = prediction, y = residuals)) +
  geom_pointrange(aes(ymin = 0, ymax = residuals)) +
  geom_hline(yintercept = 0, linetype = 3) +
  ggtitle("residuals vs. linear model prediction")

#RMSE
err <- houseprices$predictedPrices - houseprices$actualPrices #residuals
rmse <- sqrt(mean(err^2))
#is RMSE large or small? one way to tell is to compare it to the standard deviation of the outcome
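
#sketch of that comparison, using the same hypothetical houseprices columns:
sd(houseprices$actualPrices) #if rmse is much smaller than this, the model beats predicting the mean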

#R^2
#near 1: model fits well
#near 0: no better than guessing the average value
#R^2 = 1 - RSS (variance from model)/SST (variance of data)

err <- houseprices$predictedPrices - houseprices$actualPrices #residuals
rss <- sum(err^2)
toterr <- houseprices$actualPrices - mean(houseprices$actualPrices)
sstot <- sum(toterr^2)
rsq <- 1 - (rss/sstot)

#Correlation and R^2
rho <- cor(prediction, actual) #0.8995709
rho^2 #0.8092278 = R^2
#true for models that minimize squared error: linear regression & GAM regression

#can get R^2 from summary(), or from glance(): $r.squared or $adj.r.squared

cor(pred_y, y) #= sqrt(R^2) ONLY for training data & models that minimize squared error

#FAST RMSE/R_SQUARED FUNCTIONS
rmse(predcol, ycol)
r_squared(predcol, ycol)
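
#rmse() and r_squared() aren't base R; a minimal sketch of helpers with this signature (assumption):
rmse <- function(predcol, ycol) sqrt(mean((predcol - ycol)^2))
r_squared <- function(predcol, ycol) 1 - sum((ycol - predcol)^2)/sum((ycol - mean(ycol))^2)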

########################################################################
#Training a Model
########################################################################
#if we don't have enough data to split into training vs test sets, use cross validation

#N-fold Cross Validation
#partition data into n subsets
#e.g. 3-fold cross validation:
#first, train a model on A and B and use that model to make predictions on C
#then B and C to predict A
#then A and C to predict B

#splitPlan = vtreat::kWayCrossValidation(nRows in training data, nSplits aka n-way, dframe, y)
#dframe and y aren't technically used, so you can set them both to NULL
#The resulting splitPlan is a list of nSplits elements; each element contains two vectors:
#train: the indices of dframe that will form the training set
#app: the indices of dframe that will form the test (or application) set

#split <- splitPlan[[1]]; model <- lm(fmla, data = df[split$train,])
#df$pred.cv[split$app] <- predict(model, newdata = df[split$app,])

#if the n-fold cross validation tests look good enough, create the final model using all the data
#unfortunately you can't then evaluate the final model on held-out data, because all the data was used for training
#cross validation only tests the modelling process, whereas a training/test split tests the specific model you'll use for future predictions

########################################################################
#EX. Generating a random split of training/test set
#If you have a dataset dframe of size N & want a random subset of about a fraction X of its rows, where 0 < X < 1:
gp = runif(N) #vector of N uniform random numbers
dframe[gp < X,] #will be what you want
dframe[gp >= X,] #will be the complement
#note: close to an X/(1-X) split, but not exact, because the split is random
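
#a concrete sketch, assuming a 75/25 split of the mpg data used below:
gp <- runif(nrow(mpg))
mpg_train <- mpg[gp < 0.75, ]
mpg_test <- mpg[gp >= 0.75, ]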

#train the model
fmla = cty ~ hwy
mpg_model = lm(fmla, data = mpg_train)

#test the model using rmse(predcol, ycol) & r_squared(predcol, ycol)
mpg_train$pred <- predict(mpg_model, mpg_train)
mpg_test$pred <- predict(mpg_model, mpg_test)
rmse_train <- rmse(mpg_train$pred, mpg_train$cty)
rmse_test <- rmse(mpg_test$pred, mpg_test$cty)
rsq_train <- r_squared(mpg_train$pred, mpg_train$cty)
rsq_test <- r_squared(mpg_test$pred, mpg_test$cty)
ggplot(mpg_test, aes(x = pred, y = cty)) +
  geom_point() +
  geom_abline()
########################################################################
#EX. Create a Cross Validation Plan
k <- 3 # Number of folds
splitPlan <- kWayCrossValidation(nRows, k, NULL, NULL)

# Run the 3-fold cross validation plan from splitPlan
mpg$pred.cv <- 0
for(i in 1:k) {
  split <- splitPlan[[i]]
  model <- lm(cty ~ hwy, data = mpg[split$train,])
  mpg$pred.cv[split$app] <- predict(model, newdata = mpg[split$app,])
}

# Predict from a full model
mpg$pred <- predict(lm(cty ~ hwy, data = mpg))

# Get the rmse of the full model's predictions
rmse(mpg$pred, mpg$cty)

# Get the rmse of the cross-validation predictions
rmse(mpg$pred.cv, mpg$cty)

########################################################################
#Categorical Inputs (One-hot encoding)
########################################################################
model.matrix(WtLoss24 ~ Diet + age + BMI, data = diet)
#all numerical values; converts a categorical variable with N levels into N-1 indicator variables

#use unique() to see how many possible values Time takes (two: "Late" and "Early")
unique(flowers$Time)
fmla <- as.formula("Flowers ~ Intensity + Time")
mmat <- model.matrix(fmla, data = flowers) #Time column changes to an indicator: TimeLate (0 or 1)

########################################################################
#Dummy Code Categorical Variables
#https://stats.stackexchange.com/questions/115049/why-do-we-need-to-dummy-code-categorical-variables

########################################################################
#Variable Interactions that are bad for Additive/Linear Regression
########################################################################
#Additive relationship: e.g. plant_height ~ bacteria + sun
#an increase in sun causes an increase in plant_height regardless of the bacteria level

#plant_height ~ bacteria + sun + bacteria:sun (interaction between bacteria and sunlight)
#the change in height is more (or less) than the sum of the effects due to sun/bacteria
#e.g. at higher levels of sunlight, a 1 unit change in bacteria causes more change in height

#Expressing Interactions in R
#Interaction - Colon (:)
y ~ a:b

#Main Effects and Interaction:
y ~ a*b #equivalent to y ~ a + b + a:b

#Expressing the product of two variables: I()
y ~ I(a*b)
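
#a minimal sketch (hypothetical plants data frame with plant_height, bacteria, sun columns):
model_int <- lm(plant_height ~ bacteria * sun, data = plants)
summary(model_int) #the bacteria:sun coefficient estimates the interaction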

########################################################################
#Transforming the response before modelling
########################################################################
#sometimes you get better models by transforming the output rather than predicting the output directly

#e.g. log transform

#e.g. Log Transform for Monetary Data
#monetary values tend to be lognormally distributed
#long tail, wide dynamic range (e.g. $60 to $700k)
#in such a case, mean > median (e.g. $50k vs $39k)
#predicting the mean (as regression typically does) will overpredict typical values
#if you take the log of monetary data, the distribution becomes approximately normal
#AKA mean ~= median again, and a more reasonable dynamic range (1.8 - 5.8)

#The Procedure
model <- lm(log(y) ~ x, data = train)
logpred <- predict(model, newdata = test)
pred <- exp(logpred) #transform predictions back to outcome space

#Consequences
#prediction errors are multiplicative in outcome space (the size of the error is relative to the size of the outcome)
#multiplicative error: pred/y
#relative error: (pred - y)/y = pred/y - 1
#reducing multiplicative error reduces relative error
#log transformations are useful when you want to reduce relative error rather than additive error
#good example: monetary amounts. Being off by $100 matters less for a $100,000 income than for a $200 income

#Root Mean Squared Relative Error
#RMSRE = sqrt(mean(((pred - y)/y)^2))
#predicting the log-outcome reduces RMS-relative error
#but the model will often have larger RMSE
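
#the formula above as a helper function (sketch):
rmsre <- function(pred, y) sqrt(mean(((pred - y)/y)^2))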

########################################################################
#Ex. Log Transform

# Examine Income2005 in the training set
summary(income_train$Income2005)

# Write the formula for log income as a function of the tests and print it
(fmla.log <- log(Income2005) ~ Arith + Word + Parag + Math + AFQT)

# Fit the linear model
model.log <- lm(fmla.log, data = income_train)

# Make predictions on income_test
income_test$logpred <- predict(model.log, newdata = income_test)
summary(income_test$logpred)

# Convert the predictions to monetary units
income_test$pred.income <- exp(income_test$logpred)
summary(income_test$pred.income)

# Plot predicted income (x axis) vs income
ggplot(income_test, aes(x = pred.income, y = Income2005)) +
  geom_point() +
  geom_abline(color = "blue")

# Add predictions to the test set
income_test <- income_test %>%
  mutate(pred.absmodel = predict(model.abs, income_test),      # model.abs: fit to Income2005 directly
         pred.logmodel = exp(predict(model.log, income_test))) # model.log: fit to log(Income2005)

# Gather the predictions and calculate residuals and relative error
income_long <- income_test %>%
  gather(key = modeltype, value = pred, pred.absmodel, pred.logmodel) %>%
  mutate(residual = pred - Income2005,  # residuals
         relerr = residual/Income2005)  # relative error

# Calculate RMSE and relative RMSE and compare
income_long %>%
  group_by(modeltype) %>%                            # group by modeltype
  summarize(rmse = sqrt(mean(residual^2)),           # RMSE
            rmse.rel = sqrt(mean((relerr)^2)))       # root mean squared relative error

########################################################################
#Transforming the inputs before modelling
########################################################################

#Why Transform Input Variables
#domain knowledge/synthetic variables
#e.g. animal intelligence ~ mass.brain/mass.body^(2/3)
#pragmatic reasons
#log transform to reduce dynamic range
#log transform because meaningful changes in the variable are multiplicative
#y approximately linear in f(x) rather than in x

#anx ~ I(hassles^3)
#I(): treat an expression literally (not as an interaction)

#Linear, Quadratic, and Cubic Models
mod_lin <- lm(anx ~ hassles, hassleframe)
summary(mod_lin)$r.squared #0.5334847
mod_quad <- lm(anx ~ I(hassles^2), hassleframe)
summary(mod_quad)$r.squared #0.6241029
mod_cubic <- lm(anx ~ I(hassles^3), hassleframe)
summary(mod_cubic)$r.squared #0.6474421 - clearly the best

########################################################################
#EX. Transforming inputs

# houseprice is in the workspace
summary(houseprice)

# Create the formula for price as a function of squared size
(fmla_sqr <- price ~ I(size^2))

# Fit a model of price as a function of squared size (use fmla_sqr)
model_sqr <- lm(fmla_sqr, data = houseprice)

# Fit a model of price as a linear function of size
model_lin <- lm(price ~ size, data = houseprice)

# Make predictions and compare
houseprice %>%
  mutate(pred_lin = predict(model_lin),     # predictions from linear model
         pred_sqr = predict(model_sqr)) %>% # predictions from quadratic model
  gather(key = modeltype, value = pred, pred_lin, pred_sqr) %>% # gather the predictions
  ggplot(aes(x = size)) +
  geom_point(aes(y = price)) +                  # actual prices
  geom_line(aes(y = pred, color = modeltype)) + # the predictions
  scale_color_brewer(palette = "Dark2")

########################################################################
#Logistic Regression to Predict Probabilities https://www.stat.wisc.edu/courses/st572-larget/handouts08-4.pdf
########################################################################
#Generalized Linear Model
glm(formula, data = train, family = binomial(link = "logit"))
#assumes inputs are additive and linear in the log-odds: log(p/(1-p))
#family: describes the error distribution of the model
#logistic regression: family = binomial
#outcome: two classes, e.g. a and b
#model returns Prob(b)
#Recommended encoding: 0/1 or FALSE/TRUE

predict(model, newdata = test, type = "response")
#newdata: by default, the training data
#to get probabilities: use type = "response"
#by default: returns the log-odds of the event, not the probability

#Methods of Evaluating a Logistic Regression Model: pseudo-R^2
#R^2 = 1 - RSS/TSS
#pseudo-R^2 = 1 - deviance/null.deviance
#Deviance: analogous to variance (RSS)
#Null Deviance: similar to TSS
#Pseudo R^2: deviance explained

#Pseudo-R^2 on Training Data
#Using broom::glance()
glance(model) %>% summarize(pseudoR2 = 1 - deviance/null.deviance)
##   pseudoR2
## 1 0.5922402

#Using sigr::wrapChiSqTest()
wrapChiSqTest(model)
## "... pseudo-R2=0.59 ..."

#Pseudo-R^2 on Test Data
test %>% mutate(pred = predict(model, newdata = test, type = "response")) %>%
  wrapChiSqTest("predColName", "trueValuesColName of 0 or 1", TRUE) #the TRUE is the target value/target event

#GainCurvePlot is great for logistic regression
GainCurvePlot(test, "pred", "has_dmd", "DMD model on test")

########################################################################
#EX. Logistic Regression on Sparrow Survival Probability

#predict status (survived vs perished) based on total_length, weight, and humerus

# sparrow is in the workspace
summary(sparrow)

# Create the survived column based on values TRUE/FALSE rather than Survived/Perished
sparrow$survived <- sparrow$status == "Survived"

# Create the formula
(fmla <- as.formula("survived ~ total_length + weight + humerus"))

# Fit the logistic regression model
sparrow_model <- glm(fmla, data = sparrow, family = binomial(link = "logit"))

# Call summary
summary(sparrow_model)

# Call glance
(perf <- glance(sparrow_model))

# Calculate pseudo-R-squared
(pseudoR2 <- 1 - (sparrow_model$deviance)/(sparrow_model$null.deviance))

# Make predictions
sparrow$pred <- predict(sparrow_model, type = "response")

# Look at the gain curve
GainCurvePlot(sparrow, "pred", "survived", "sparrow survival model")

########################################################################
#Poisson and Quasipoisson Regression to Predict Counts
########################################################################
#Predicting Counts
#linear regression: predicts values in (-infinity, infinity)
#counts: integers in the range [0, infinity)

glm(formula, data, family = poisson_or_quasipoisson)
#inputs are additive and linear in log(count)
#outcome: integers
#counts: e.g. number of traffic tickets a driver gets
#rates: e.g. number of website hits/day
#prediction: expected rate or intensity (not an integer)
#e.g. expected # of traffic tickets; expected hits/day

#Poisson vs Quasipoisson
#poisson assumes that mean(y) = var(y) ("the same" as in, of a similar order of magnitude)
#if var(y) is much different from mean(y) - use quasipoisson
#generally requires a large sample size
#if rates/counts >> 0 - regular regression is fine

#use the same pseudo R^2 as logistic regression

#use the same predict() method as logistic regression, with type = "response"

#usage of the paste function to build a formula string:
paste(outcome, "~", paste(vars, collapse = " + "))
#"cnt ~ hr + holiday + workingday + weathersit + temp + atemp + hum + windspeed"

########################################################################
#EX. Poisson/Quasipoisson

# The outcome column
outcome #"cnt"

# The inputs to use
vars <- c("hr", "holiday", "workingday", "weathersit", "temp", "atemp", "hum", "windspeed")

# Create the formula string for bikes rented as a function of the inputs
(fmla <- paste(outcome, "~", paste(vars, collapse = " + ")))

# Calculate the mean and variance of the outcome
(mean_bikes <- mean(bikesJuly$cnt))
(var_bikes <- var(bikesJuly$cnt))

# Fit the model
bike_model <- glm(fmla, data = bikesJuly, family = quasipoisson)

# Call glance
(perf <- glance(bike_model))

# Calculate pseudo-R-squared
(pseudoR2 <- 1 - perf$deviance/perf$null.deviance)

# bikesAugust is in the workspace
str(bikesAugust)

# bike_model is in the workspace
summary(bike_model)

# Make predictions on August data
bikesAugust$pred <- predict(bike_model, newdata = bikesAugust, type = "response")

# Calculate the RMSE (CAN USE RMSE AS LONG AS YOU DON'T TRANSFORM THE OUTPUT RESPONSES)
bikesAugust %>%
  mutate(residual = pred - cnt) %>%
  summarize(rmse = sqrt(mean(residual^2)))

########################################################################
#EX. Visualize the Bike Rental Predictions as a Function of Time
# Plot predictions and cnt by date/time
bikesAugust %>%
  # set start to 0, convert unit to days
  mutate(instant = (instant - min(instant))/24) %>%
  # gather cnt and pred into a value column
  gather(key = valuetype, value = value, cnt, pred) %>%
  filter(instant < 14) %>% # restrict to first 14 days
  # plot value by instant
  ggplot(aes(x = instant, y = value, color = valuetype, linetype = valuetype)) +
  geom_point() +
  geom_line() +
  scale_x_continuous("Day", breaks = 0:14, labels = 0:14) +
  scale_color_brewer(palette = "Dark2") +
  ggtitle("Predicted August bike rentals, Quasipoisson model")

########################################################################
#Generalized Additive Models (GAM) to learn Non-Linear Transformations
########################################################################
#GAM: output depends additively on unknown smooth functions of the input variables
#e.g. y ~ b0 + s1(x1) + s2(x2)
#a GAM learns the intercept b0 and the functions si of the inputs
#use a GAM to learn the relationship: quadratic? cubic? quartic?

mgcv::gam(formula, family, data)

#family:
#gaussian (default): "regular" regression
#binomial: probabilities
#poisson/quasipoisson: counts
#best for larger datasets, as GAMs are complex and more likely to overfit

#s() function (fits a spline between input and outcome)
anxiety ~ s(hassles)
#to specify that an input has a non-linear relationship with the outcome
#use s() with continuous variables, NOT CATEGORICAL ones
#(continuous = more than about 10 unique values)

model <- gam(anxiety ~ s(hassles), data = hassleFrame, family = gaussian)
summary(model)
##
## ...
##
## R-sq.(adj) = 0.619   Deviance explained = 64.1% (pseudo R-squared)
## GCV = 49.132  Scale est. = 45.153    n = 40
#the "deviance explained" reports the model's unadjusted R^2

plot(model) #gives plots of the variable transformations that the algorithm learned

#y-values of these plots
predict(model, type = "terms")

#make predictions with the model; use as.numeric to convert the matrix returned by predict(gam) to a vector
as.numeric(predict(model, newdata = hassleFrame, type = "response"))

########################################################################
#EX. GAM Example

# soybean_train is in the workspace
summary(soybean_train)

# Plot weight vs Time (Time on x axis)
ggplot(soybean_train, aes(x = Time, y = weight)) +
  geom_point()

# Load the package mgcv
library(mgcv)

# Create the formula
(fmla.gam <- weight ~ s(Time))

# Fit the GAM Model
model.gam <- gam(fmla.gam, data = soybean_train, family = gaussian)

# Call summary() on model.lin (the linear model of weight ~ Time, in the workspace) and look for R-squared
summary(model.lin)

# Call summary() on model.gam and look for R-squared
summary(model.gam)

# Call plot() on model.gam
plot(model.gam)

# soybean_test is in the workspace
summary(soybean_test)

# Get predictions from the linear model
soybean_test$pred.lin <- predict(model.lin, newdata = soybean_test)

# Get predictions from the gam model
soybean_test$pred.gam <- as.numeric(predict(model.gam, newdata = soybean_test))

# Gather the predictions into a "long" dataset
soybean_long <- soybean_test %>%
  gather(key = modeltype, value = pred, pred.lin, pred.gam)

# Calculate the rmse
soybean_long %>%
  mutate(residual = weight - pred) %>%     # residuals
  group_by(modeltype) %>%                  # group by modeltype
  summarize(rmse = sqrt(mean(residual^2))) # calculate the RMSE

# Compare the predictions against actual weights on the test data
soybean_long %>%
  ggplot(aes(x = Time)) +                        # the column for the x axis
  geom_point(aes(y = weight)) +                  # the y-column for the scatterplot
  geom_point(aes(y = pred, color = modeltype)) + # the predictions
  geom_line(aes(y = pred, color = modeltype, linetype = modeltype)) +
  scale_color_brewer(palette = "Dark2")

########################################################################
#Decision Trees
########################################################################
#Rules of the form:
#if A and B and C then Y
#can express non-linear concepts:
#intervals
#non-monotonic relationships
#non-additive interactions
#AND: similar to multiplication

#Pros:
#trees have an expressive concept space (lower RMSE)
#Cons:
#coarse-grained predictions (a tree with, say, six leaves can only predict six possible values, whereas a linear model gives continuous predictions)
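
#a toy illustration of coarse-grained predictions (hypothetical thresholds):
#a depth-2 tree is just nested rules, so it can only output a handful of distinct values (here, four)
pred_tree <- function(x1, x2) {
  ifelse(x1 < 5,
         ifelse(x2 < 3, 10, 20),  #leaf values
         ifelse(x2 < 3, 30, 40))
}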

#Ensembles of Trees (Several Trees) (Random Forest & Gradient-Boosted Trees)
#an ensemble model fits the animal intelligence data better than a single tree
#ensembles give finer-grained predictions than single trees

########################################################################
#Random Forests
########################################################################

#Multiple diverse decision trees averaged together
#reduces overfit
#increases model expressiveness
#gives finer-grained predictions

#Building a Random Forest Model
#1. Draw a bootstrapped sample from the training data
#2. For each sample grow a tree
#at each node, pick the best variable to split on (from a random subset of all variables)
#continue until the tree is grown
#3. To score a datum, evaluate it with all the trees and average the results

#CAN BE USED FOR BOTH CLASSIFICATION AND REGRESSION PROBLEMS
model <- ranger::ranger(formula, trainingData, num.trees = 500, respect.unordered.factors = "order")
#use at least 200 trees
#mtry - number of variables to try at each node
#by default, the square root of the total number of variables
#respect.unordered.factors
#specifies how to treat unordered factor variables; setting it to "order" is recommended for regression

## Ranger result
## ...
## OOB (out of bag) prediction error (MSE): 3103.623
## R squared (OOB): 0.7837386
#NOTE: the random forest algorithm returns estimates of out-of-sample performance

predict(model, test_data)$predictions
#predict() inputs: model and data
#predictions can be accessed in the element $predictions

########################################################################
#EX. Random Forests

# bikesJuly is in the workspace
str(bikesJuly)

# Random seed to reproduce results
seed

# The outcome column
(outcome <- "cnt")

# The input variables
(vars <- c("hr", "holiday", "workingday", "weathersit", "temp", "atemp", "hum", "windspeed"))

# Create the formula string for bikes rented as a function of the inputs
(fmla <- paste(outcome, "~", paste(vars, collapse = " + ")))

# Load the package ranger
library(ranger)

# Fit and print the random forest model
(bike_model_rf <- ranger(fmla, # formula
                         bikesJuly, # data
                         num.trees = 500,
                         respect.unordered.factors = "order",
                         seed = seed))

# bikesAugust is in the workspace
str(bikesAugust)

# bike_model_rf is in the workspace
bike_model_rf

# Make predictions on the August data (ranger's predict() takes the data as its second argument, not newdata)
bikesAugust$pred <- predict(bike_model_rf, bikesAugust)$predictions

# Calculate the RMSE of the predictions
bikesAugust %>%
  mutate(residual = pred - cnt) %>%        # calculate the residual
  summarize(rmse = sqrt(mean(residual^2))) # calculate rmse

# Plot actual outcome vs predictions (predictions on x-axis)
ggplot(bikesAugust, aes(x = pred, y = cnt)) +
  geom_point() +
  geom_abline()

########################################################################
#Categorical Inputs (One-hot encoding) MANUALLY
########################################################################
#Most R functions manage the conversion for you, i.e. model.matrix as we learned before
#however, some do not, e.g. xgboost()
#must convert categorical variables to a numeric representation
#conversion to indicators: one-hot encoding

#vtreat one-hot encodes categorical variables (transforming them into indicator variables,
#coded "lev") & cleans bad values out of numerical variables (coded "clean")
#Basic Idea:
#designTreatmentsZ() to design a treatment plan from the training data
#prepare() to create "clean" data:
#all numerical
#no missing values
#use prepare() with the treatment plan for all future data

#DESIGN TREATMENT PLAN
vars <- c("x", "u")
treatplan <- designTreatmentsZ(dframe, varslist, verbose = FALSE)
#varslist: list of input variable names as strings
#set verbose to FALSE to suppress progress messages
#designTreatmentsZ has a property scoreFrame that describes the variable mapping & the types of each new variable

scoreFrame <- treatplan %>%
  magrittr::use_series(scoreFrame) %>%
  select(varName, origName, code)

newvars <- scoreFrame %>%
  filter(code %in% c("clean", "lev")) %>%
  magrittr::use_series(varName)

#PREPARE
training.treat <- prepare(treatmentplan, dframe, varRestriction = newvars)
#inputs to prepare():
#treatmentplan: treatment plan
#dframe: data frame
#varRestriction: list of variables to prepare (optional)
#default: prepare all variables

########################################################################
#EX. Categorical Inputs

# dframe is in the workspace
dframe

# Create and print a vector of variable names
(vars <- c("color", "size"))

# Load the package vtreat
library(vtreat)

# Create the treatment plan
treatplan <- designTreatmentsZ(dframe, vars)

# Examine the scoreFrame
(scoreFrame <- treatplan %>%
  use_series(scoreFrame) %>%
  select(varName, origName, code))

# We only want the rows with codes "clean" or "lev"
(newvars <- scoreFrame %>%
  filter(code %in% c("clean", "lev")) %>%
  use_series(varName))

# Create the treated training data
(dframe.treat <- prepare(treatplan, dframe, varRestriction = newvars))

########################################################################
#EX. Novel Levels that e.g. model.matrix can't handle but vtreat can

# newvars is in the workspace
newvars

# Print dframe and testframe
dframe    #has 'r', 'g', and 'b'
testframe #has 'y'

# Use prepare() to one-hot-encode testframe
(testframe.treat <- prepare(treatplan, testframe, varRestriction = newvars))

########################################################################
#Gradient Boosting
########################################################################
#an ensemble method that builds up a model by incrementally improving the existing one

#1. Fit a shallow tree T1 to the data. M1 = T1
#2. Fit a tree T2 to the residuals. Find gamma such that M2 = M1 + gamma*T2 is the best fit to the data.
#For regularized boosting, scale each new tree's contribution by a factor eta in the range (0,1):
#M2 = M1 + eta*gamma*T2. A larger eta = faster learning; a smaller eta = less risk of overfit
#3. Repeat (2) until a stopping condition is met.

#Final Model: M = M1 + eta*sum(gamma_i * Ti)

#Because gradient boosting optimizes error on the training data, it's very easy to overfit the model.
#best practice is to estimate out-of-sample error via cross validation for each incremental model,
#then retroactively decide how many trees to use

#xgboost()
#1. Run xgb.cv() with a large number of rounds (trees). Fits the model & calculates cross-validated error as well.
#2. xgb.cv()$evaluation_log: records the estimated RMSE for each round.
#find the number of trees that minimizes estimated RMSE: n_best
#3. Run xgboost(), setting nrounds = n_best

#EX.
treatplan <- designTreatmentsZ(bikesJan, vars)
newvars <- treatplan$scoreFrame %>%
  filter(code %in% c("clean", "lev")) %>%
  use_series(varName)

bikesJan.treat <- prepare(treatplan, bikesJan, varRestriction = newvars)

cv <- xgb.cv(data = as.matrix(bikesJan.treat), #a numeric matrix
             label = bikesJan$cnt,             #vector of outcomes (numeric)
             objective = "reg:linear",
             nrounds = 100, nfold = 5, eta = 0.3, max_depth = 6)

#Key inputs to xgb.cv() and xgboost()
#data: input data as matrix; label: outcome
#objective: for regression - "reg:linear"
#nrounds: maximum number of trees to fit
#eta: learning rate
#max_depth: maximum depth of individual trees
#nfold (xgb.cv() only): number of folds for cross validation
#early_stopping_rounds: after this many rounds without improvement, stop.

elog <- as.data.frame(cv$evaluation_log)
nrounds <- which.min(elog$test_rmse_mean) #out-of-sample error
## 78 #the index of the minimum value corresponds to the best number of trees

#run xgboost() with the right number of trees to get the final model
nrounds <- 78
model <- xgboost(data = as.matrix(bikesJan.treat),
                 label = bikesJan$cnt,
                 nrounds = nrounds,
                 objective = "reg:linear",
                 eta = 0.3,
                 max_depth = 6)

#Predict with an xgboost() model
bikesFeb.treat <- prepare(treatplan, bikesFeb, varRestriction = newvars)
bikesFeb$pred <- predict(model, as.matrix(bikesFeb.treat))

########################################################################
#EX. Gradient Boosting Machines

# bikesJuly & bikesJuly.treat are in the workspace

# Load the package xgboost
library(xgboost)

# Run xgb.cv
cv <- xgb.cv(data = as.matrix(bikesJuly.treat),
             label = bikesJuly$cnt,
             nrounds = 100,
             nfold = 5,
             objective = "reg:linear",
             eta = 0.3,
             max_depth = 6,
             early_stopping_rounds = 10,
             verbose = 0 # silent
)

# Get the evaluation log
elog <- as.data.frame(cv$evaluation_log)

# Determine and print how many trees minimize training and test error
elog %>%
  summarize(ntrees.train = which.min(train_rmse_mean), # find the index of min(train_rmse_mean)
            ntrees.test = which.min(test_rmse_mean))   # find the index of min(test_rmse_mean)
#ntrees.train = 67, ntrees.test = 57

#In most cases, ntrees.test is less than ntrees.train. The training error keeps decreasing
#even after the test error starts to increase. It's important to use cross-validation to find
#the right number of trees (as determined by ntrees.test) and avoid an overfit model.

# Examine the workspace
ls()

# The number of trees to use, as determined by xgb.cv
ntrees

# Run xgboost
bike_model_xgb <- xgboost(data = as.matrix(bikesJuly.treat), # training data as matrix
                          label = bikesJuly$cnt,    # column of outcomes
                          nrounds = ntrees,         # number of trees to build
                          objective = "reg:linear", # objective
                          eta = 0.3,
                          max_depth = 6,
                          verbose = 0 # silent
)

# Make predictions
bikesAugust$pred <- predict(bike_model_xgb, as.matrix(bikesAugust.treat))

# Plot predictions (on x axis) vs actual bike rental count
ggplot(bikesAugust, aes(x = pred, y = cnt)) +
  geom_point() +
  geom_abline()

#Sometimes the model makes negative predictions. However, its RMSE is smaller than the other models'.
#perhaps rounding negative predictions up to zero is a reasonable tradeoff