@jangsean
Forked from cherryyingzizhang/Regression.R
Created September 19, 2023 16:09

Revisions

  1. @cherryyingzizhang cherryyingzizhang revised this gist Nov 14, 2017. 1 changed file with 44 additions and 0 deletions.
    44 changes: 44 additions & 0 deletions Regression.R
    Original file line number Diff line number Diff line change
    @@ -243,6 +243,50 @@ red <- lm(y~x, data=dat)
    anova(red,full)
    #Look at the F value & Pr(>F) this anova prints out.

    ########################################################################
    #EX. Categorical Inputs

    # Call str on flowers to see the types of each column
    str(flowers)

    # Use unique() to see how many possible values Time takes
    unique(flowers$Time)

    # Build a formula to express Flowers as a function of Intensity and Time: fmla. Print it
    (fmla <- as.formula("Flowers ~ Intensity + Time"))

    # Use fmla and model.matrix to see how the data is represented for modeling
    mmat <- model.matrix(fmla, data = flowers)

    # Examine the first 20 lines of flowers
    head(flowers, n = 20)

    # Examine the first 20 lines of mmat
    head(mmat, n = 20)

# flowers is in the workspace
    str(flowers)

    # fmla is in the workspace
    fmla

    # Fit a model to predict Flowers from Intensity and Time : flower_model
    flower_model <- lm(fmla, data = flowers)

    # Use summary on mmat to remind yourself of its structure
    summary(mmat)

    # Use summary to examine flower_model
    summary(flower_model)

    # Predict the number of flowers on each plant
    flowers$predictions <- predict(flower_model, newdata = flowers)

    # Plot predictions vs actual flowers (predictions on x-axis)
    ggplot(flowers, aes(x = predictions, y = Flowers)) +
    geom_point() +
    geom_abline(color = "blue")

    ########################################################################
    #Dummy Code Categorical Variables
    #https://stats.stackexchange.com/questions/115049/why-do-we-need-to-dummy-code-categorical-variables
  2. @cherryyingzizhang cherryyingzizhang revised this gist Nov 14, 2017. 1 changed file with 1 addition and 5 deletions.
    6 changes: 1 addition & 5 deletions Regression.R
    Original file line number Diff line number Diff line change
    @@ -962,13 +962,9 @@ lambda <- 10^seq(10, -2, length=100)

    #create test and training set
    library(glmnet)
    set.seed(489)
    train = sample(x = 1:nrow(x), size= nrow(x)/2)
    test = (-train)
    ytest = y[test]

    #OLS
    swisslm <- lm(Fertility~., data = swiss, subset = train)
    swisslm <- lm(Fertility~., data = swiss[train,])
    s.pred <- predict(swisslm, newdata = swiss[test,])

    #Ridge (alpha = 0)
  3. @cherryyingzizhang cherryyingzizhang revised this gist Nov 14, 2017. 1 changed file with 4 additions and 4 deletions.
    8 changes: 4 additions & 4 deletions Regression.R
    Original file line number Diff line number Diff line change
    @@ -972,15 +972,15 @@ swisslm <- lm(Fertility~., data = swiss, subset = train)
    s.pred <- predict(swisslm, newdata = swiss[test,])

    #Ridge (alpha = 0)
    ridge.mod <- glmnet(x[train,], y[train], alpha = 0, lambda = lambda)
    cv.out <- cv.glmnet(x[train,], y[train], alpha = 0)
    cv.out <- cv.glmnet(x[train,], y[train], alpha = 0, lambda = vectorOfPossibleLambdas)
    bestlamRidge <- cv.out$lambda.min
    ridge.mod <- glmnet(x[train,], y[train], alpha = 0, lambda = bestlamRidge)
    ridge.pred <- predict(ridge.mod, s = bestlamRidge, newx = x[test,])

    #Lasso (alpha = 1)
    lasso.mod <- glmnet(x[train,], y[train], alpha = 1, lambda = lambda)
    cv.out <- cv.glmnet(x[train,], y[train], alpha = 1)
    cv.out <- cv.glmnet(x[train,], y[train], alpha = 1, lambda = vectorOfPossibleLambdas)
    bestlamLasso <- cv.out$lambda.min
    lasso.mod <- glmnet(x[train,], y[train], alpha = 1, lambda = bestlamLasso)
    lasso.pred <- predict(lasso.mod, s = bestlamLasso, newx = x[test,])

    #Check RMSE LINEAR
  4. @cherryyingzizhang cherryyingzizhang revised this gist Nov 14, 2017. 1 changed file with 76 additions and 1 deletion.
    77 changes: 76 additions & 1 deletion Regression.R
    Original file line number Diff line number Diff line change
    @@ -944,4 +944,79 @@ ggplot(bikesAugust, aes(x = pred, y = cnt)) +
    geom_abline()

    #Sometimes it might make negative prediction. However if you look at RMSE it will be smaller than other models.
    #perhaps rounding negative predictions up to zero is a reasonable tradeoff
    #perhaps rounding negative predictions up to zero is a reasonable tradeoff

    ########################################################################
    #RIDGE AND LASSO
    ########################################################################

    #REFERENCE: https://www.r-bloggers.com/ridge-regression-and-the-lasso/

    swiss <- datasets::swiss
    #remove the Fertility column and remove the intercept column
    x <- model.matrix(Fertility~., data=swiss)[,-1]
    y <- swiss$Fertility
    lambda <- 10^seq(10, -2, length=100)

#When lambda = 0, we get the same coefficients as the OLS model

    #create test and training set
    library(glmnet)
    set.seed(489)
    train = sample(x = 1:nrow(x), size= nrow(x)/2)
    test = (-train)
    ytest = y[test]

    #OLS
    swisslm <- lm(Fertility~., data = swiss, subset = train)
    s.pred <- predict(swisslm, newdata = swiss[test,])

    #Ridge (alpha = 0)
    ridge.mod <- glmnet(x[train,], y[train], alpha = 0, lambda = lambda)
    cv.out <- cv.glmnet(x[train,], y[train], alpha = 0)
    bestlamRidge <- cv.out$lambda.min
    ridge.pred <- predict(ridge.mod, s = bestlamRidge, newx = x[test,])

    #Lasso (alpha = 1)
    lasso.mod <- glmnet(x[train,], y[train], alpha = 1, lambda = lambda)
    cv.out <- cv.glmnet(x[train,], y[train], alpha = 1)
    bestlamLasso <- cv.out$lambda.min
    lasso.pred <- predict(lasso.mod, s = bestlamLasso, newx = x[test,])

    #Check RMSE LINEAR
    sqrt(mean((s.pred-ytest)^2))
    #Check RMSE RIDGE
    sqrt(mean((ridge.pred - ytest)^2))
    #Check RMSE LASSO
    sqrt(mean((lasso.pred-ytest)^2))

    ########################################################################
    #REFERENCE:https://www.youtube.com/watch?v=fAPCaue8UKQ
    ########################################################################
    #Elastic net: mix of lasso and ridge regression
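#A minimal elastic-net sketch (an illustrative addition, not from the original notes): it assumes the
#same trainX/trainY used below; alpha between 0 and 1 mixes the ridge (alpha = 0) and lasso (alpha = 1)
#penalties, and alpha = 0.5 is only an arbitrary choice. Add family = "binomial" for a binary outcome.
library(glmnet)
cv.enet <- cv.glmnet(x = trainX, y = trainY, alpha = 0.5, nlambda = 100)
enet.fit <- glmnet(x = trainX, y = trainY, alpha = 0.5, lambda = cv.enet$lambda.min)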


    #Cross-validation to find optimal lambda for lasso regression
CV = cv.glmnet(x=trainX, y=trainY, family="binomial", type.measure="deviance", alpha = 1, nlambda = 100)
#use family = "gaussian" for a numeric outcome; with gaussian, type.measure = "mse" is the usual error measure

    #Following plot shows the models (with different lambda values) that glmnet has fit, along with the error type for each of the models
    plot(CV)

    #Fit model
    fit = glmnet(x=trainX, y=trainY, family = "binomial", alpha=1, lambda=CV$lambda.1se)

    #???lambda.1se vs lambda.min???
# This is a question about parsimony. The lambda.min option refers to the value of λ with the lowest CV error.
# The error at this value of λ is the average of the errors over the k folds, and hence this estimate of the error is uncertain.
# The lambda.1se option selects the largest value of λ whose model is simpler than the best model (lambda.min) but whose error
# is within 1 standard error of the best model's. In other words, using lambda.1se as the selected value for λ results
# in a model that is slightly simpler than the best model but which cannot be distinguished from the best model in terms of error,
# given the uncertainty in the k-fold CV estimate of the error of the best model.
    #
    # The choice is yours:
    #
    # The best model that may be too complex or slightly overfitted: lambda.min
    # The simplest model that has comparable error to the best model given the uncertainty: lambda.1se
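#A minimal sketch of the two choices (assumes the CV object and trainX/trainY from above; refitting at
#each lambda is shown only for illustration):
fit.min <- glmnet(x = trainX, y = trainY, family = "binomial", alpha = 1, lambda = CV$lambda.min)
fit.1se <- glmnet(x = trainX, y = trainY, family = "binomial", alpha = 1, lambda = CV$lambda.1se)
#The lambda.1se fit is usually sparser (fewer nonzero coefficients)
sum(fit.min$beta[, 1] != 0)
sum(fit.1se$beta[, 1] != 0)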

    #List nonzero coefficients:
    fit$beta[,1]
  5. @cherryyingzizhang cherryyingzizhang revised this gist Nov 9, 2017. 1 changed file with 7 additions and 5 deletions.
    12 changes: 7 additions & 5 deletions Regression.R
    Original file line number Diff line number Diff line change
    @@ -237,6 +237,12 @@ unique(flowers$Time)
    fmla <- as.formula("Flowers ~ Intensity + Time")
    mmat <- model.matrix(fmla, data = flowers) #Time column changes to boolean: TimeLate (0 or 1)

    #Partial F-Test on a Factor Coded as a Set of Dummies
    full <- lm(y~x+as.factor(color), data=dat)
    red <- lm(y~x, data=dat)
    anova(red,full)
    #Look at the F value & Pr(>F) this anova prints out.

    ########################################################################
    #Dummy Code Categorical Variables
    #https://stats.stackexchange.com/questions/115049/why-do-we-need-to-dummy-code-categorical-variables
    @@ -938,8 +944,4 @@ ggplot(bikesAugust, aes(x = pred, y = cnt)) +
    geom_abline()

    #Sometimes it might make negative prediction. However if you look at RMSE it will be smaller than other models.
    #perhaps rounding negative predictions up to zero is a reasonable tradeoff




    #perhaps rounding negative predictions up to zero is a reasonable tradeoff
  6. @cherryyingzizhang cherryyingzizhang created this gist Nov 9, 2017.
    945 changes: 945 additions & 0 deletions Regression.R
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,945 @@
    ########################################################################
    #1-var Linear Regression
    ########################################################################
#lm(formula to fit, data = data frame)
    cmodel <- lm(temperature ~ chirps_per_sec, data = cricket)

    #Formula
    fmla <- temperature ~ chirps_per_sec
    fmla <- blood_pressure ~ age + weight
    #LHS: outcome, RHS: inputs.
    fmla <- as.formula("temperature ~ chirps_per_sec") #converts string to formula

    #conveniently package diagnostics from summary(cmodel) in a dataframe
    broom::glance(cmodel)

sigr::wrapFTest(cmodel) #R^2 test

    #by default returns training data predictions
    predict(model)

    ggplot(cricket, aes(x = prediction, y = temperature)) + geom_point() + geom_abline(color = "darkblue") + ggtitle("temperature vs linear model prediction")

    #apply model to new data
    predict(model, newdata = newChirps)

    ########################################################################
    #Predicting once you fit a model
    ########################################################################

    # unemployment is in your workspace
    summary(unemployment)

    # newrates is in your workspace
    newrates

    # Predict female unemployment in the unemployment data set
    unemployment$prediction <- predict(unemployment_model)

    # load the ggplot2 package
    library(ggplot2)

    # Make a plot to compare predictions to actual (prediction on x axis).
    ggplot(unemployment, aes(x = prediction, y = female_unemployment)) +
    geom_point() +
    geom_abline(color = "blue")

    # Predict female unemployment rate when male unemployment is 5%
    pred <- predict(unemployment_model, newdata = newrates)
    # Print it
    pred

    ########################################################################
    #Multivariate Linear Regression
    ########################################################################

    # bloodpressure is in the workspace
    summary(bloodpressure)

    # Create the formula and print it
    fmla <- blood_pressure ~ age + weight
    fmla

    # Fit the model: bloodpressure_model
    bloodpressure_model <- lm(fmla, data = bloodpressure)

    # Print bloodpressure_model and call summary()
    bloodpressure_model
    summary(bloodpressure_model)

    # predict blood pressure using bloodpressure_model :prediction
    bloodpressure$prediction <- predict(bloodpressure_model)

    # plot the prediction vs actual
    ggplot(bloodpressure, aes(x = prediction, y = blood_pressure)) +
    geom_point() +
    geom_abline(color = "blue")

    #Pros of Linear Regression
    #easy to fit and to apply, concise, less prone to overfitting, interpretable
    #Cons of Linear Regression
    #can only express linear and additive relationships
    #collinearity -- when input variables are partially correlated.
#when variables are highly correlated, signs of coefficients might not be what you expect, but this does not affect prediction accuracy
    #model may be unstable (e.g. unusually high coefficients when variables are highly correlated)

    ########################################################################
    #Evaluating Regression Models (graphically, RMSE, R^2, (and F-test from link))
    ########################################################################
    #http://facweb.cs.depaul.edu/sjost/csc423/documents/f-test-reg.htm
    #https://feliperego.github.io/blog/2015/10/23/Interpreting-Model-Output-In-R

    #Evaluating model graphically
    #(by looking at plots of ground truth vs predictions aka y vs y_hat)
    #a well-fitted model would have:
    #x=y line runs through center of points
    #line of perfect prediction
    #a poorly fitting model would have:
    #regions where points are all on one side of x=y line
    #systematic errors - errors that are correlated w/ value of outcome
    #either you don't have all the variables in the model or
    #you need an algorithm that can find more complex relationships in data

    #Residual Plot
    #residual plot: difference between outcome & prediction vs prediction
    #good fit: no systematic errors (all the errors evenly distributed between positive and negative and same magnitude above and below)
    #bad fit: systematic errors (clusters of all positives or all negatives of residuals)

    #Gain Curve/Plot (http://www2.cs.uregina.ca/~dbd/cs831/notes/lift_chart/lift_chart.html)
    #useful when sorting instances is more important than predicting exact outcome values
    #measures how well model sorts the outcome
    #x-axis: fraction of items in model-sorted order (decreasing)
    #y-axis: fraction of total accumulated home sales
    #Wizard curve: perfect model
    GainCurvePlot(frame, xvar, truthvar, title)
    #frame is a data frame
    #xvar and truthvar are strings naming the prediction and actual outcome columns of frame
    #title is the title of the plot
    #When the predictions sort in exactly the same order, the relative Gini coefficient is 1.
    #When the model sorts poorly, the relative Gini coefficient is close to zero, or even negative.


    #Residuals
    unemployment$residuals <- unemployment$female_unemployment - unemployment$predictions
    ggplot(unemployment, aes(x = predictions, y = residuals)) +
    geom_pointrange(aes(ymin = 0, ymax = residuals)) +
    geom_hline(yintercept = 0, linetype = 3) +
    ggtitle("residuals vs. linear model prediction")


    #RMSE
err <- houseprices$predictedPrices - houseprices$actualPrices #residuals
err <- err^2 #squared residuals
    rmse <- sqrt(mean(err))
    #is RMSE large or small? one way to tell is to compare it to the standard deviation of outcome
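#For scale, one can compare the RMSE to that standard deviation (a minimal sketch, reusing the
#illustrative houseprices$actualPrices column from above):
sd(houseprices$actualPrices) #an RMSE well below this beats simply predicting the mean price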

    #R^2
    #near 1: model fits well
    #near 0: no better than guessing the average value
    #R^2 = 1 - RSS (variance from model)/SST (variance of data)

err <- houseprices$predictedPrices - houseprices$actualPrices #residuals
rss <- sum(err^2)
toterr <- houseprices$actualPrices - mean(houseprices$actualPrices)
sstot <- sum(toterr^2)
r_squared <- 1 - (rss/sstot)

    #Correlation and R^2
rho <- cor(prediction, actual) #e.g. 0.8995709
rho^2 #e.g. 0.8092278 = R^2
    #true for models that minimize squared error: linear regression & GAM regression

    #can get R^2 from summary() or glance()$r.squared or adj.r.squared

cor(pred_y, y) #= sqrt(R^2) ONLY for training data & models that minimize squared error

    #FAST RMSE/R_SQUARED FUNCTIONS
    rmse(actual, predicted)
    r_squared(actual, predicted)
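#rmse() and r_squared() are convenience helpers from the course environment, not base R.
#A minimal sketch of equivalent definitions (argument order assumed as predicted, actual):
rmse <- function(predicted, actual) sqrt(mean((predicted - actual)^2))
r_squared <- function(predicted, actual) 1 - sum((actual - predicted)^2)/sum((actual - mean(actual))^2)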

    ########################################################################
    #Training a Model
    ########################################################################
    #if we don't have enough data to split training vs test sets, use cross validation

    #N-fold Cross Validation
    #partition data into n subsets
    #e.g. 3-fold cross validation
    #first, train a model using A and B and use that model to make pred. on C
    #then B and C to predict A
    #then A and C to predict B

    #splitPlan = vtreat::kWayCrossValidation(nRows in training data, nSplits aka n-way, dframe, y)
    #dframe and y aren't technically used so can set them both to be NULL
    #The resulting splitPlan is list of nSplits elements; each element contains two vectors:
    #train: the indices of dframe that will form the training set
    #app: the indices of dframe that will form the test (or application) set

    #split <- splitPlan[[1]], model <- lm(fmla, data = df[split$train,])
    #df$pred.cv[split$app] <- predict(model, newdata = df[split$app,])

#if the n-fold cross validation results look good enough, create the final model using all the data
#unfortunately you can't directly estimate the final model's performance on future data, since no data is held out
#cross validation evaluates the modelling process, whereas a training/test split evaluates the specific model you will use for future predictions

    ########################################################################
    #EX. Generating a random split of training/test set
#If you have a dataset dframe with N rows & want a random subset containing a fraction X of them, where 0 < X < 1,
    gp = runif(N) #vector of uniform random numbers
    dframe[gp < X,] #will be what you want
    dframe[gp >= X,] #will be complement
#note: close to an X/(1-X) split, but not exact because the assignment is random

    #train the model
    fmla = cty ~ hwy
    mpg_model = lm(fmla, data = mpg_train)

    #test model using rmse(predCol, ycol) & r_squared(predcol, ycol)
    mpg_train$pred <- predict(mpg_model, mpg_train)
    mpg_test$pred <- predict(mpg_model, mpg_test)
    rmse_train <- rmse(mpg_train$pred, mpg_train$cty)
    rmse_test <- rmse(mpg_test$pred, mpg_test$cty)
    rsq_train <- r_squared(mpg_train$pred, mpg_train$cty)
    rsq_test <- r_squared(mpg_test$pred, mpg_test$cty)
    ggplot(mpg_test, aes(x = pred, y = cty)) +
    geom_point() +
    geom_abline()
    ########################################################################
    #EX. Create a Cross Validation Plan
    k <- 3 # Number of folds
    splitPlan <- kWayCrossValidation(nRows, k, NULL, NULL)

    # Run the 3-fold cross validation plan from splitPlan
    mpg$pred.cv <- 0
    for(i in 1:k) {
    split <- splitPlan[[i]]
    model <- lm(cty ~ hwy, data = mpg[split$train,])
    mpg$pred.cv[split$app] <- predict(model, newdata = mpg[split$app,])
    }

    # Predict from a full model
    mpg$pred <- predict(lm(cty ~ hwy, data = mpg))

    # Get the rmse of the full model's predictions
    rmse(mpg$cty, mpg$pred)

    # Get the rmse of the cross-validation predictions
    rmse(mpg$cty, mpg$pred.cv)

    ########################################################################
    #Categorical Inputs (One-hot encoding)
    ########################################################################
    model.matrix(WtLoss24 ~ Diet + age + BMI, data = diet)
    #all numerical values, converts categorical var. with N levels into N-1 indicator var

    #use unique() to see how many possible values Time takes (two: "Late" and "Early")
    unique(flowers$Time)
    fmla <- as.formula("Flowers ~ Intensity + Time")
    mmat <- model.matrix(fmla, data = flowers) #Time column changes to boolean: TimeLate (0 or 1)

    ########################################################################
    #Dummy Code Categorical Variables
    #https://stats.stackexchange.com/questions/115049/why-do-we-need-to-dummy-code-categorical-variables

    ########################################################################
    #Variable Interactions that are bad for Additive/Linear Regression
    ########################################################################
    #Additive relationship: e.g. plant_height ~ bacteria + sun
#an increase in sun causes an increase in plant_height regardless of the bacteria level

#plant_height ~ bacteria + sun + bacteria:sun (interaction between bacteria and sunlight)
#change in height is more (or less) than the sum of the effects due to sun/bacteria
    #at higher levels of sunlight, 1 unit change in bacteria causes more change in height

    #Expressing Interactions in R
    #Interaction - Colon (:)
    y ~ a:b

    #Main Effects and Interaction:
y ~ a*b #equivalent to y ~ a + b + a:b

    #Expressing product of two variables: I
    y ~ I(a*b)
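#A minimal worked sketch on simulated data (all names below are illustrative, not from the course):
set.seed(1)
dat_int <- data.frame(a = runif(100), b = runif(100))
dat_int$y <- 1 + 2*dat_int$a + 3*dat_int$b + 4*dat_int$a*dat_int$b + rnorm(100, sd = 0.1)
mod_main <- lm(y ~ a + b, data = dat_int) #main effects only
mod_int <- lm(y ~ a*b, data = dat_int) #main effects plus the a:b interaction
summary(mod_int)$r.squared #should exceed summary(mod_main)$r.squared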

    ########################################################################
    #Transforming the response before modelling
    ########################################################################
    #Sometimes get better models by transforming output rather than predicting output directly

    #e.g. log transform

    #e.g. Log Transform for Monetary Data.
    #monetary values tend to be lognormally distributed
    #long tail wide dynamic range (60-700k)
    #in such a case, mean > median (50k vs 39k)
#predicting the mean will overpredict typical values (which is what regression typically does)
    #if you take the log of monetary data, it will be transformed into a normal distribution
    #AKA mean ~= median again, more reasonable dynamic range (1.8 - 5.8)

    #The Procedure
    model <- lm(log(y) ~ x, data = train)
logpred <- predict(model, newdata = test)
    pred <- exp(logpred) #transform predictions to outcome space

    #Consequences
    #prediction errors are multiplicative in outcome space (size of error relative to size of outcome)
    #multiplicative error: pred/y
    #relative error: (pred - y)/y = pred/y - 1
    #reducing multiplicative error reduces relative error
    #log transformations are useful when you want to reduce relative error rather than additive error
    #good example: monetary. Being off by $100 is less important for $100000 but more for $200 income

    #Root Mean Squared Relative Error
#RMSRE = sqrt(mean(((pred-y)/y)^2))
    #predicting log-outcome reduces RMS-relative error
    #but model will often have larger RMSE

    ########################################################################
    #Ex. Log Transform

    # Examine Income2005 in the training set
    summary(income_train$Income2005)

    # Write the formula for log income as a function of the tests and print it
    (fmla.log <- log(Income2005) ~ Arith + Word + Parag + Math + AFQT)

    # Fit the linear model
    model.log <- lm(fmla.log, data=income_train)

    # Make predictions on income_test
    income_test$logpred <- predict(model.log, newdata= income_test)
    summary(income_test$logpred)

    # Convert the predictions to monetary units
    income_test$pred.income <- exp(income_test$logpred)
    summary(income_test$pred.income)

    # Plot predicted income (x axis) vs income
    ggplot(income_test, aes(x = pred.income, y = Income2005)) +
    geom_point() +
    geom_abline(color = "blue")

    # Add predictions to the test set
    income_test <- income_test %>%
    mutate(pred.absmodel = predict(model.abs, income_test),#model.abs
    pred.logmodel = exp(predict(model.log, income_test)))#model.log

    # Gather the predictions and calculate residuals and relative error
    income_long <- income_test %>%
    gather(key = modeltype, value = pred, pred.absmodel, pred.logmodel) %>%
    mutate(residual = pred - Income2005, # residuals
    relerr = residual/Income2005 ) # relative error

    # Calculate RMSE and relative RMSE and compare
    income_long %>%
    group_by(modeltype) %>% # group by modeltype
    summarize(rmse = sqrt(mean(residual^2)), # RMSE
    rmse.rel = sqrt(mean((relerr)^2))) # Root mean squared relative error

    ########################################################################
    #Transforming the inputs before modelling
    ########################################################################

    #Why to Transform Input Variables
    #domain knowledge/synthetic variables
    #intelligenceAnimals ~ mass.brains/mass.body^2.3
    #pragmatic reasons
    #log transform to reduce dynamic range
    #log transform because meaningful changes in variable are multiplicative
    #y approximately linear in f(x) rather than in x

    #anx ~ I(hassles^3)
    #I(): treat an expression literally (not as an interaction)

    #Linear, Quadratic, and Cubic Models
    mod_lin <- lm(anx ~ hassles, hassleframe)
    summary(mod_lin)$r.squared #0.5334847
    mod_quad <- lm(anx ~ I(hassles^2), hassleframe)
    summary(mod_quad)$r.squared #0.6241029
mod_cubic <- lm(anx ~ I(hassles^3), hassleframe)
summary(mod_cubic)$r.squared #0.6474421 - clearly the best

    ########################################################################
    #EX. Transforming inputs

    # houseprice is in the workspace
    summary(houseprice)

    # Create the formula for price as a function of squared size
    (fmla_sqr <- (price ~ I(size^2)))

    # Fit a model of price as a function of squared size (use fmla_sqr)
    model_sqr <- lm(fmla_sqr, data = houseprice)

    # Fit a model of price as a linear function of size
    model_lin <- lm(price ~ size, data = houseprice)

    # Make predictions and compare
    houseprice %>%
    mutate(pred_lin = predict(model_lin), # predictions from linear model
    pred_sqr = predict(model_sqr)) %>% # predictions from quadratic model
    gather(key = modeltype, value = pred, pred_lin, pred_sqr) %>% # gather the predictions
    ggplot(aes(x = size)) +
    geom_point(aes(y = price)) + # actual prices
    geom_line(aes(y = pred, color = modeltype)) + # the predictions
    scale_color_brewer(palette = "Dark2")

    ########################################################################
    #Logistic Regression to Predict Probabilities https://www.stat.wisc.edu/courses/st572-larget/handouts08-4.pdf
    ########################################################################
    #Generalized Linear Model
    glm(formula, data = train, family = binomial(link = "logit"))
    #assumes inputs additive, linear in log-odds: log(p/(1-p))
    #family: describes error distribution of model
#logistic regression: family = binomial
    #outcome: two classes, e.g. a and b
    #model returns Prob(b)
    #Recommend: 0/1 or FALSE/TRUE

    predict(model, newdata = test, type = "response")
#newdata: by default, the training data
    #to get probabilities: use type = "response"
    #by default: returns log-odds of the event, not the probability

    #Methods of Evaluating Logistic Regression Model: pseudo-R^2
    #R^2 = 1 - RSS/TSS
#pseudo-R^2 = 1 - deviance/null.deviance
    #Deviance: analogous to variance (RSS)
    #Null Deviance: Similar to TSS
    #Pseudo R^2: Deviance explained

    #Pseudo-R^2 on Training Data
    #Using broom::glance()
    glance(model) %>% summarize(pR2 = 1 - deviance/null.deviance)
    ## pseudoR2
    ## 1 0.5922402

    #Using sigr::wrapChiSqTest()
    wrapChiSqTest(model)
    ## "... pseudo-R2=0.59 ..."

    #Pseudo R^2 on Test Data
    test %>% mutate(pred = predict(model, newdata = test, type = "response")) %>%
    wrapChiSqTest("predColName", "trueValuesColName of 0 or 1", TRUE) #the "TRUE" is the target value/target event

    #GainCurvePlot great for logistic regression
    GainCurvePlot(test, "pred", "has_dmd", "DMd model on test")

    ########################################################################
    #EX. Logistic Regresssion on Sparrow Survival Probability

#predict status (survived vs perished) based on total_length, weight, and humerus

    # sparrow is in the workspace
    summary(sparrow)

    # Create the survived column based on values TRUE/FALSE rather than Survived/Perished
    sparrow$survived <- sparrow$status == "Survived"

    # Create the formula
    (fmla <- as.formula("survived ~ total_length + weight + humerus"))

    # Fit the logistic regression model
    sparrow_model <- glm(fmla, data = sparrow, family = binomial(link = "logit"))

    # Call summary
    summary(sparrow_model)

    # Call glance
    (perf <- glance(sparrow_model))

    # Calculate pseudo-R-squared
    (pseudoR2 <- 1 - (sparrow_model$deviance)/(sparrow_model$null.deviance))

    # Make predictions
    sparrow$pred <- predict(sparrow_model, type = "response")

    # Look at gain curve
    GainCurvePlot(sparrow, "pred", "survived", "sparrow survival model")

    ########################################################################
    #Poisson and Quasipoisson Regression to Predict Counts
    ########################################################################
    #Predicting Counts
    #linear regression: predict values in [-infinity, infinity]
    #counts: integers in range [0, infinity]

    glm(formula, data, family = poisson_or_quasipoisson)
    #inputs are additive and linear in log(count)
    #outcome: integers
    #counts: e.g. number of traffic tickets a driver gets
    #rates: e.g. number of website hits/day
#prediction: expected rate or intensity (not necessarily an integer)
    #expected # traffic tickets; expected hits/day

    #Poisson vs Quasipoisson
    #poisson assumes that mean(y) = var(y) ("the same" as in, should be of a similar order of magnitude***)
#if var(y) is much different from mean(y) - use quasipoisson
#generally requires a large sample size
    #if rates/counts >> 0 - regular regression is fine

    #use same pseudo R^2

    #use same predict method as logistic regression, along with type = "response"

    #usage of paste function:
    paste(outcome, "~", paste(vars, collapse = " + "))
    #"cnt ~ hr + holiday + workingday + weathersit + temp + atemp + hum + windspeed"

    ########################################################################
    #EX. Poisson/Quasipoisson

    # The outcome column
    outcome #"cnt"

    # The inputs to use
    vars ["hr", "holiday", "workingday", "weathersit", "temp", "atemp", "hum", "windspeed"]

    # Create the formula string for bikes rented as a function of the inputs
    (fmla <- paste(outcome, "~", paste(vars, collapse = " + ")))

    # Calculate the mean and variance of the outcome
    (mean_bikes <- mean(bikesJuly$cnt))
    (var_bikes <- var(bikesJuly$cnt))

    # Fit the model
    bike_model <- glm(fmla, data = bikesJuly, family = quasipoisson)

    # Call glance
    (perf <- glance(bike_model))

    # Calculate pseudo-R-squared
    (pseudoR2 <- 1 - perf$deviance/perf$null.deviance)

    # bikesAugust is in the workspace
    str(bikesAugust)

    # bike_model is in the workspace
    summary(bike_model)

    # Make predictions on August data
    bikesAugust$pred <- predict(bike_model, newdata = bikesAugust, type = "response")

    # Calculate the RMSE (CAN USE RMSE AS LONG AS YOU DON'T TRANSFORM THE OUTPUT RESPONSES)
    bikesAugust %>%
    mutate(residual = pred - cnt) %>%
    summarize(rmse = sqrt(mean(residual^2)))

    ########################################################################
    #EX. Visualize the Bike Rental Predictions as a Function of Time
    # Plot predictions and cnt by date/time
    bikesAugust %>%
    # set start to 0, convert unit to days
    mutate(instant = (instant - min(instant))/24) %>%
    # gather cnt and pred into a value column
    gather(key = valuetype, value = value, cnt, pred) %>%
filter(instant < 14) %>% # restrict to first 14 days
    # plot value by instant
    ggplot(aes(x = instant, y = value, color = valuetype, linetype = valuetype)) +
    geom_point() +
    geom_line() +
    scale_x_continuous("Day", breaks = 0:14, labels = 0:14) +
    scale_color_brewer(palette = "Dark2") +
    ggtitle("Predicted August bike rentals, Quasipoisson model")

    ########################################################################
    #Generalized Additive Models (GAM) to learn Non-Linear Transformations
    ########################################################################
    #GAM: output depends additively on unknown smooth functions of input variables
    #e.g. y ~ b0 + s1(x1) + s2(x2)
    #a GAM learns the intercept b0 and functions si's of the input
    #Use GAM to learn the relationship: quadratic? cubic? quartic?

    mgcv::gam(formula, family, data)

    #family:
#gaussian (default): "regular" regression
    #binomial: probabilities
#poisson/quasipoisson: counts
#Best for larger datasets, as GAMs are complex and more likely to overfit on small data

    #s() function (fits a spline between input and outcome)
    anxiety ~ s(hassles)
    #to specify that an input has a non-linear relationship with the outcome
    #use s() with continuous variables NOT CATEGORICAL******************
    #more than about 10 unique values

    model <- gam(anxiety ~ s(hassles), data = hassleFrame, family = gaussian)
    summary(model)
    ##
    ## ...
    ##
    ## R-sq.(adj) = 0.619 Deviance explained = 64.1% (pseudo R-squared)
    ## GCV = 49.132 Scale est. = 45.153 n = 40
    #The "deviance explained" reports the model's unadjusted R^2

    plot(model) #gives plots of variable transformations that the algorithm learned

    #y-values of these plots
    predict(model, type = "terms")

    #make predictions with model. use as.numeric to convert the matrix from predict(gam) to a vector
    as.numeric(predict(model, newdata = hassleFrame, type = "response"))

    ########################################################################
    #EX. GAM Example

    # soybean_train is in the workspace
    summary(soybean_train)

    # Plot weight vs Time (Time on x axis)
    ggplot(soybean_train, aes(x = Time, y = weight)) +
    geom_point()

    # Load the package mgcv
    library(mgcv)

    # Create the formula
    (fmla.gam <- weight ~ s(Time))

    # Fit the GAM Model
    model.gam <- gam(fmla.gam, data = soybean_train, family = gaussian)

    # Call summary() on model.lin and look for R-squared
    summary(model.lin)

    # Call summary() on model.gam and look for R-squared
    summary(model.gam)

    # Call plot() on model.gam
    plot(model.gam)

    # soybean_test is in the workspace
    summary(soybean_test)

    # Get predictions from linear model
    soybean_test$pred.lin <- predict(model.lin, newdata = soybean_test)

    # Get predictions from gam model
    soybean_test$pred.gam <- as.numeric(predict(model.gam, newdata = soybean_test))

    # Gather the predictions into a "long" dataset
    soybean_long <- soybean_test %>%
    gather(key = modeltype, value = pred, pred.lin, pred.gam)

    # Calculate the rmse
    soybean_long %>%
    mutate(residual = weight - pred) %>% # residuals
    group_by(modeltype) %>% # group by modeltype
    summarize(rmse = sqrt(mean(residual^2))) # calculate the RMSE

    # Compare the predictions against actual weights on the test data
    soybean_long %>%
    ggplot(aes(x = Time)) + # the column for the x axis
    geom_point(aes(y = weight)) + # the y-column for the scatterplot
    geom_point(aes(y = pred, color = modeltype)) + # the y-column for the point-and-line plot
    geom_line(aes(y = pred, color = modeltype, linetype = modeltype)) + # the y-column for the point-and-line plot
    scale_color_brewer(palette = "Dark2")

    ########################################################################
    #Decision Trees
    ########################################################################
    #Rules of form:
    #if A and B and C then Y
    #non-linear concepts
    #intervals
    #non-monotonic relationships
    #non-additive interactions
    #AND: similar to multiplication

    #Pros:
    #trees have an expressive concept space (lower RMSE)
    #Cons:
#coarse-grained predictions (a tree with six leaves can only give six possible values, whereas linear regression gives continuous predictions; see the single-tree sketch below)

    #Ensembles of Trees (Several Trees) (Random Forest & Gradient-Boosted Trees)
    #ensemble model fits animal intelligence data better than single tree
    #ensembles give finer-grained predictions than single trees
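#A minimal single-tree sketch (an illustrative addition; rpart is not used elsewhere in these notes,
#and cty/hwy/displ from ggplot2's mpg data are just an example choice):
library(rpart)
tree_model <- rpart(cty ~ hwy + displ, data = ggplot2::mpg)
tree_pred <- predict(tree_model, newdata = ggplot2::mpg)
length(unique(tree_pred)) #only a handful of distinct predicted values - the coarse-grained predictions noted above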

    ########################################################################
    #Random Forests
    ########################################################################

    #Multiple diverse decision trees averaged together
    #reduces overfit
    #increases model expressiveness
    #finer grain predictions

    #Building a Random Forest Model
    #1. Draw bootstrapped sample from training data
    #2. For each sample grow a tree
    #at each node, pick best variable to split on (from a random subset of all variables)
    #continue until tree is grown
    #3. To score a datum, evaluate it with all the trees and average the results

    #CAN BE USED FOR BOTH CLASSIFICATION AND REGRESSION PROBLEMS
model <- ranger::ranger(formula, trainingData, num.trees = 500, respect.unordered.factors = "order")
    #use at least 200 trees
    #mtry - number of variables to try at each node
    #by default, square root of the total number of variables
    #respect.unordered.factors
    #Specifies how to treat unordered factor variables. We recommend setting this to "order" for regression.

    ## Ranger result
    ## ...
    ## OOB (OUT OF BAG) estimate, AKA prediction error (MSE): 3103.623
    ## R squared (OOB): 0.7837386
    #NOTE: Random forest algorithm returns estimates of out-of-sample performance

    predict(model, test_data)$predictions
    #predict() inputs: model and data
    #predictions can be accessed in the element $predictions

    ########################################################################
    #EX. Random Forests

    # bikesJuly is in the workspace
    str(bikesJuly)

    # Random seed to reproduce results
    seed

    # The outcome column
    (outcome <- "cnt")

    # The input variables
    (vars <- c("hr", "holiday", "workingday", "weathersit", "temp", "atemp", "hum", "windspeed"))

    # Create the formula string for bikes rented as a function of the inputs
    (fmla <- paste(outcome, "~", paste(vars, collapse = " + ")))

    # Load the package ranger
    library(ranger)

    # Fit and print the random forest model
    (bike_model_rf <- ranger(fmla, # formula
    bikesJuly, # data
    num.trees = 500,
    respect.unordered.factors = "order",
    seed = seed))

    # bikesAugust is in the workspace
    str(bikesAugust)

    # bike_model_rf is in the workspace
    bike_model_rf

    # Make predictions on the August data
    bikesAugust$pred <- predict(bike_model_rf, newdata = bikesAugust)$predictions

    # Calculate the RMSE of the predictions
    bikesAugust %>%
    mutate(residual = pred - cnt) %>% # calculate the residual
    summarize(rmse = sqrt(mean(residual^2))) # calculate rmse

    # Plot actual outcome vs predictions (predictions on x-axis)
    ggplot(bikesAugust, aes(x = pred, y = cnt)) +
    geom_point() +
    geom_abline()

    ########################################################################
    #Categorical Inputs (One-hot encoding) MANUALLY
    ########################################################################
#Most R functions manage the conversion for you, e.g. model.matrix as we learned before
    #however, some do not e.g xgboost()
    #must convert categorical variables to numeric representation
    #conversion to indicators: one-hot encoding

#vtreat one-hot encodes categorical variables (by transforming them into indicator variables
#coded "lev") & cleans bad values out of numerical variables (coded "clean")
    #Basic Idea:
    #designTreatmentsZ() to design a treatment plan from the training data
    #prepare() to create "clean" data
    #all numerical
    #no missing values
    #use prepare() with treatment plan for all future data

    #DESIGN TREATMENT PLAN
    vars <- c("x", "u")
treatplan <- designTreatmentsZ(dframe, vars, verbose = FALSE)
#second argument: vector of input variable names as strings
    #Set verbose to FALSE to suppress progress messages
#designTreatmentsZ returns a treatment plan whose scoreFrame describes the variable mapping & the type of each new variable.

    scoreFrame <- treatplan %>%
    magrittr::use_series(scoreFrame) %>%
    select(varName, origName, code)

    newvars <- scoreFrame %>%
    filter(code %in% c("clean", "lev")) %>%
    magrittr::use_series(varName)

    #PREPARE
    training.treat <- prepare(treatmentplan, dframe, varRestriction = newvars)
    #inputs to prepare():
    #treatmentplan: treatment plan
    #dframe: data frame
#varRestriction: list of variables to prepare (optional)
    #default: prepare all variables

    ########################################################################
    #EX. Categorical Inputs

    # dframe is in the workspace
    dframe

    # Create and print a vector of variable names
    (vars <- c("color", "size"))

    # Load the package vtreat
    library(vtreat)

    # Create the treatment plan
    treatplan <- designTreatmentsZ(dframe, vars)

    # Examine the scoreFrame
    (scoreFrame <- treatplan %>%
    use_series(scoreFrame) %>%
    select(varName, origName, code))

    # We only want the rows with codes "clean" or "lev"
    (newvars <- scoreFrame %>%
    filter(code %in% c("clean", "lev")) %>%
    use_series(varName))

    # Create the treated training data
    (dframe.treat <- prepare(treatplan, dframe, varRestriction = newvars))

    ########################################################################
    #EX. Novel Levels that e.g. model.matrix can't handle but vtreat can

    # newvars is in the workspace
    newvars

    # Print dframe and testframe
    dframe #has 'r', 'g', and 'b'
    testframe #has 'y'

    # Use prepare() to one-hot-encode testframe
    (testframe.treat <- prepare(treatplan, testframe, varRestriction = newvars))

    ########################################################################
    #Gradient Boosting
    ########################################################################
    #ensemble method that builds up a model by incrementally improving the existing one

    #1. Fit a shallow tree T1 to the data. M1 = T1
    #2. Fit a tree T2 to the residuals. Find gamma such that M2 = M1 + gamma*T2 is the best fit to data.
    #For regularized boosting, decrease learning by a factor eta in range (0,1).
    #M2 = M1 + eta*gamma*T2. A larger eta = faster learning, smaller eta = less risk of overfit
    #3. Repeat (2) until stopping condition is met.

    #Final Model: M = M1 + eta*sum(gamma_i * Ti)

    #Because gradient boosting optimizes error on the training data, it's very easy to overfit the model.
    #best practice is to estimate out of sample error via cross validation for each incremental model
    #then retroactively decide how many trees to use

    #xgboost()
    #1. Run xgb.cv() with a large number of rounds (trees). Fits the model & calculates cross-validated error as well.
    #2. xgb.cv()$evaluation_log: records estimated RMSE for each round.
    #find the number of trees that minimizes estimated RMSE: n_best
    #3. Run xgboost(), setting nrounds = n_best

    #EX.
    treatplan <- designTreatmentsZ(bikesJan, vars)
    newvars <- treatplan$scoreFrame %>%
    filter(code %in% c("clean", "lev")) %>%
    use_series(varName)

    bikesJan.treat <- prepare(treatplan, bikesJan, varRestriction = newvars)

    cv <- xgb.cv(data = as.matrix(bikesJan.treat), #a numeric matrix
    label = bikesJan$cnt, #vector of outcomes (numeric)
    objective = "reg:linear",
    nrounds = 100, nfold = 5, eta = 0.3, max_depth = 6)

    #Key inputs to xgb.cv() and xgboost()
    #data: input data as matrix; label: outcome
    #objective: for regression - "reg:linear"
    #nrounds: maximum number of trees to fit
    #eta: learning rate
    #max_depth: maximum depth of individual trees
    #nfold: (xgb.cv() only): number of folds for cross validation
    #early_stopping_rounds: after this many rounds without improvement, stop.

    elog <- as.data.frame(cv$evaluation_log)
    nrounds <- which.min(elog$test_rmse_mean) #out of sample error
    ## 78 #the index of the minimum value corresponds to best number of trees

#run xgboost() with the right number of trees to get the final model
    nrounds <- 78
    model <- xgboost(data = as.matrix(bikesJan.treat),
    label = bikesJan$cnt,
    nrounds = nrounds,
    objective = "reg:linear",
    eta = 0.3,
max_depth = 6)

    #Predict with xgboost() model
    bikesFeb.treat <- prepare(treatplan, bikesFeb, varRestriction = newvars)
    bikesFeb$pred <- predict(model, as.matrix(bikesFeb.treat))

    ########################################################################
    #EX. Gradient Boosting Machines

    # bikesJuly & bikesJuly.treat are in the workspace

    # Load the package xgboost
    library(xgboost)

    # Run xgb.cv
    cv <- xgb.cv(data = as.matrix(bikesJuly.treat),
    label = bikesJuly$cnt,
    nrounds = 100,
    nfold = 5,
    objective = "reg:linear",
    eta = 0.3,
    max_depth = 6,
    early_stopping_rounds = 10,
    verbose = 0 # silent
    )

    # Get the evaluation log
    elog <- as.data.frame(cv$evaluation_log)

    # Determine and print how many trees minimize training and test error
    elog %>%
    summarize(ntrees.train = which.min(train_rmse_mean), # find the index of min(train_rmse_mean)
    ntrees.test = which.min(test_rmse_mean)) # find the index of min(test_rmse_mean)
    #ntrees.train = 67, ntrees.test = 57

    #In most cases, ntrees.test is less than ntrees.train. The training error keeps decreasing
    #even after the test error starts to increase. It's important to use cross-validation to find
    #the right number of trees (as determined by ntrees.test) and avoid an overfit model.

    # Examine the workspace
    ls()

    # The number of trees to use, as determined by xgb.cv
    ntrees

    # Run xgboost
    bike_model_xgb <- xgboost(data = as.matrix(bikesJuly.treat), # training data as matrix
    label = bikesJuly$cnt, # column of outcomes
    nrounds = ntrees, # number of trees to build
    objective = "reg:linear", # objective
    eta = 0.3,
max_depth = 6,
    verbose = 0 # silent
    )

    # Make predictions
    bikesAugust$pred <- predict(bike_model_xgb, as.matrix(bikesAugust.treat))

    # Plot predictions (on x axis) vs actual bike rental count
    ggplot(bikesAugust, aes(x = pred, y = cnt)) +
    geom_point() +
    geom_abline()

    #Sometimes it might make negative prediction. However if you look at RMSE it will be smaller than other models.
    #perhaps rounding negative predictions up to zero is a reasonable tradeoff