-
-
Save jangsean/4d390ee684deebd109d8c1757e42e42a to your computer and use it in GitHub Desktop.
Regression.R
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| ######################################################################## | |
| #1-var Linear Regression | |
| ######################################################################## | |
| #lm(formula you wanna fit, dataFrame) | |
| cmodel <- lm(temperature ~ chirps_per_sec, data = cricket) | |
| #Formula | |
| fmla <- temperature ~ chirps_per_sec | |
| fmla <- blood_pressure ~ age + weight | |
| #LHS: outcome, RHS: inputs. | |
| fmla <- as.formula("temperature ~ chirps_per_sec") #converts string to formula | |
| #conveniently package diagnostics from summary(cmodel) in a dataframe | |
| broom::glance(cmodel) | |
| sigr:wrapFTest(cmodel) #R^2 test | |
| #by default returns training data predictions | |
| predict(model) | |
| ggplot(cricket, aes(x = prediction, y = temperature)) + geom_point() + geom_abline(color = "darkblue") + ggtitle("temperature vs linear model prediction") | |
| #apply model to new data | |
| predict(model, newdata = newChirps) | |
| ######################################################################## | |
| #Predicting once you fit a model | |
| ######################################################################## | |
| # unemployment is in your workspace | |
| summary(unemployment) | |
| # newrates is in your workspace | |
| newrates | |
| # Predict female unemployment in the unemployment data set | |
| unemployment$prediction <- predict(unemployment_model) | |
| # load the ggplot2 package | |
| library(ggplot2) | |
| # Make a plot to compare predictions to actual (prediction on x axis). | |
| ggplot(unemployment, aes(x = prediction, y = female_unemployment)) + | |
| geom_point() + | |
| geom_abline(color = "blue") | |
| # Predict female unemployment rate when male unemployment is 5% | |
| pred <- predict(unemployment_model, newdata = newrates) | |
| # Print it | |
| pred | |
| ######################################################################## | |
| #Multivariate Linear Regression | |
| ######################################################################## | |
| # bloodpressure is in the workspace | |
| summary(bloodpressure) | |
| # Create the formula and print it | |
| fmla <- blood_pressure ~ age + weight | |
| fmla | |
| # Fit the model: bloodpressure_model | |
| bloodpressure_model <- lm(fmla, data = bloodpressure) | |
| # Print bloodpressure_model and call summary() | |
| bloodpressure_model | |
| summary(bloodpressure_model) | |
| # predict blood pressure using bloodpressure_model :prediction | |
| bloodpressure$prediction <- predict(bloodpressure_model) | |
| # plot the prediction vs actual | |
| ggplot(bloodpressure, aes(x = prediction, y = blood_pressure)) + | |
| geom_point() + | |
| geom_abline(color = "blue") | |
| #Pros of Linear Regression | |
| #easy to fit and to apply, concise, less prone to overfitting, interpretable | |
| #Cons of Linear Regression | |
| #can only express linear and additive relationships | |
| #collinearity -- when input variables are partially correlated. | |
| #when variables are highly correlated, signs of coefficients might not be what you expect, but does not affect model accuracy | |
| #model may be unstable (e.g. unusually high coefficients when variables are highly correlated) | |
| ######################################################################## | |
| #Evaluating Regression Models (graphically, RMSE, R^2, (and F-test from link)) | |
| ######################################################################## | |
| #http://facweb.cs.depaul.edu/sjost/csc423/documents/f-test-reg.htm | |
| #https://feliperego.github.io/blog/2015/10/23/Interpreting-Model-Output-In-R | |
| #Evaluating model graphically | |
| #(by looking at plots of ground truth vs predictions aka y vs y_hat) | |
| #a well-fitted model would have: | |
| #x=y line runs through center of points | |
| #line of perfect prediction | |
| #a poorly fitting model would have: | |
| #regions where points are all on one side of x=y line | |
| #systematic errors - errors that are correlated w/ value of outcome | |
| #either you don't have all the variables in the model or | |
| #you need an algorithm that can find more complex relationships in data | |
| #Residual Plot | |
| #residual plot: difference between outcome & prediction vs prediction | |
| #good fit: no systematic errors (all the errors evenly distributed between positive and negative and same magnitude above and below) | |
| #bad fit: systematic errors (clusters of all positives or all negatives of residuals) | |
| #Gain Curve/Plot (http://www2.cs.uregina.ca/~dbd/cs831/notes/lift_chart/lift_chart.html) | |
| #useful when sorting instances is more important than predicting exact outcome values | |
| #measures how well model sorts the outcome | |
| #x-axis: fraction of items in model-sorted order (decreasing) | |
| #y-axis: fraction of total accumulated home sales | |
| #Wizard curve: perfect model | |
| GainCurvePlot(frame, xvar, truthvar, title) | |
| #frame is a data frame | |
| #xvar and truthvar are strings naming the prediction and actual outcome columns of frame | |
| #title is the title of the plot | |
| #When the predictions sort in exactly the same order, the relative Gini coefficient is 1. | |
| #When the model sorts poorly, the relative Gini coefficient is close to zero, or even negative. | |
| #Residuals | |
| unemployment$residuals <- unemployment$female_unemployment - unemployment$predictions | |
| ggplot(unemployment, aes(x = predictions, y = residuals)) + | |
| geom_pointrange(aes(ymin = 0, ymax = residuals)) + | |
| geom_hline(yintercept = 0, linetype = 3) + | |
| ggtitle("residuals vs. linear model prediction") | |
| #RMSE | |
| error <- houseprices$predictedPrices - houseprices$actualPrices #residuals | |
| err <- err^2 | |
| rmse <- sqrt(mean(err)) | |
| #is RMSE large or small? one way to tell is to compare it to the standard deviation of outcome | |
| #R^2 | |
| #near 1: model fits well | |
| #near 0: no better than guessing the average value | |
| #R^2 = 1 - RSS (variance from model)/SST (variance of data) | |
| error <- houseprices$predictedPrices - houseprices$actualPrices #residuals | |
| rss <- sum(err^2) | |
| totalerr <- houseprices$actualPrices - mean(houseprices$actualPrices) | |
| sstot <- sum(toterr^2) | |
| s_squared <- 1 - (rss/sstot) | |
| #Correlation and R^2 | |
| p = cor(prediction, actual) = 0.8995709 | |
| p^2 = 0.8092278 = R^2 | |
| #true for models that minimize squared error: linear regression & GAM regression | |
| #can get R^2 from summary() or glance()$r.squared or adj.r.squared | |
| cor(pred_y, y) #= sqrt(R^2) ONLY for trainng data & models that minimize square error | |
| #FAST RMSE/R_SQUARED FUNCTIONS | |
| rmse(actual, predicted) | |
| r_squared(actual, predicted) | |
| ######################################################################## | |
| #Training a Model | |
| ######################################################################## | |
| #if we don't have enough data to split training vs test sets, use cross validation | |
| #N-fold Cross Validation | |
| #partition data into n subsets | |
| #e.g. 3-fold cross validation | |
| #first, train a model using A and B and use that model to make pred. on C | |
| #then B and C to predict A | |
| #then A and C to predict B | |
| #splitPlan = vtreat::kWayCrossValidation(nRows in training data, nSplits aka n-way, dframe, y) | |
| #dframe and y aren't technically used so can set them both to be NULL | |
| #The resulting splitPlan is list of nSplits elements; each element contains two vectors: | |
| #train: the indices of dframe that will form the training set | |
| #app: the indices of dframe that will form the test (or application) set | |
| #split <- splitPlan[[1]], model <- lm(fmla, data = df[split$train,]) | |
| #df$pred.cv[split$app] <- predict(model, newdata = df[split$app,]) | |
| #if the n-fold cross validation tests look good enough, create the final model via all data | |
| #unfortunately you can't evaluate final model for future data because training/test | |
| #cross validation only tests the modelling process whereas training/test tests for future value prediction | |
| ######################################################################## | |
| #EX. Generating a random split of training/test set | |
| #If you have a dataset dframe of size N & want random subset of X% of N, where 0 < X < 1, | |
| gp = runif(N) #vector of uniform random numbers | |
| dframe[gp < X,] #will be what you want | |
| dframe[gp >= X,] #will be complement | |
| #note: (close to x/1-x split, but not exactly due to uniform random distribution) | |
| #train the model | |
| fmla = cty ~ hwy | |
| mpg_model = lm(fmla, data = mpg_train) | |
| #test model using rmse(predCol, ycol) & r_squared(predcol, ycol) | |
| mpg_train$pred <- predict(mpg_model, mpg_train) | |
| mpg_test$pred <- predict(mpg_model, mpg_test) | |
| rmse_train <- rmse(mpg_train$pred, mpg_train$cty) | |
| rmse_test <- rmse(mpg_test$pred, mpg_test$cty) | |
| rsq_train <- r_squared(mpg_train$pred, mpg_train$cty) | |
| rsq_test <- r_squared(mpg_test$pred, mpg_test$cty) | |
| ggplot(mpg_test, aes(x = pred, y = cty)) + | |
| geom_point() + | |
| geom_abline() | |
| ######################################################################## | |
| #EX. Create a Cross Validation Plan | |
| k <- 3 # Number of folds | |
| splitPlan <- kWayCrossValidation(nRows, k, NULL, NULL) | |
| # Run the 3-fold cross validation plan from splitPlan | |
| mpg$pred.cv <- 0 | |
| for(i in 1:k) { | |
| split <- splitPlan[[i]] | |
| model <- lm(cty ~ hwy, data = mpg[split$train,]) | |
| mpg$pred.cv[split$app] <- predict(model, newdata = mpg[split$app,]) | |
| } | |
| # Predict from a full model | |
| mpg$pred <- predict(lm(cty ~ hwy, data = mpg)) | |
| # Get the rmse of the full model's predictions | |
| rmse(mpg$cty, mpg$pred) | |
| # Get the rmse of the cross-validation predictions | |
| rmse(mpg$cty, mpg$pred.cv) | |
| ######################################################################## | |
| #Categorical Inputs (One-hot encoding) | |
| ######################################################################## | |
| model.matrix(WtLoss24 ~ Diet + age + BMI, data = diet) | |
| #all numerical values, converts categorical var. with N levels into N-1 indicator var | |
| #use unique() to see how many possible values Time takes (two: "Late" and "Early") | |
| unique(flowers$Time) | |
| fmla <- as.formula("Flowers ~ Intensity + Time") | |
| mmat <- model.matrix(fmla, data = flowers) #Time column changes to boolean: TimeLate (0 or 1) | |
| #Partial F-Test on a Factor Coded as a Set of Dummies | |
| full <- lm(y~x+as.factor(color), data=dat) | |
| red <- lm(y~x, data=dat) | |
| anova(red,full) | |
| #Look at the F value & Pr(>F) this anova prints out. | |
| ######################################################################## | |
| #EX. Categorical Inputs | |
| # Call str on flowers to see the types of each column | |
| str(flowers) | |
| # Use unique() to see how many possible values Time takes | |
| unique(flowers$Time) | |
| # Build a formula to express Flowers as a function of Intensity and Time: fmla. Print it | |
| (fmla <- as.formula("Flowers ~ Intensity + Time")) | |
| # Use fmla and model.matrix to see how the data is represented for modeling | |
| mmat <- model.matrix(fmla, data = flowers) | |
| # Examine the first 20 lines of flowers | |
| head(flowers, n = 20) | |
| # Examine the first 20 lines of mmat | |
| head(mmat, n = 20) | |
| # flowers in is the workspace | |
| str(flowers) | |
| # fmla is in the workspace | |
| fmla | |
| # Fit a model to predict Flowers from Intensity and Time : flower_model | |
| flower_model <- lm(fmla, data = flowers) | |
| # Use summary on mmat to remind yourself of its structure | |
| summary(mmat) | |
| # Use summary to examine flower_model | |
| summary(flower_model) | |
| # Predict the number of flowers on each plant | |
| flowers$predictions <- predict(flower_model, newdata = flowers) | |
| # Plot predictions vs actual flowers (predictions on x-axis) | |
| ggplot(flowers, aes(x = predictions, y = Flowers)) + | |
| geom_point() + | |
| geom_abline(color = "blue") | |
| ######################################################################## | |
| #Dummy Code Categorical Variables | |
| #https://stats.stackexchange.com/questions/115049/why-do-we-need-to-dummy-code-categorical-variables | |
| ######################################################################## | |
| #Variable Interactions that are bad for Additive/Linear Regression | |
| ######################################################################## | |
| #Additive relationship: e.g. plant_height ~ bacteria + sun | |
| #increase in sun causes increase in plant_height regardless of level of bacteria level | |
| #plant_head ~ bacteria + sun + bacteria:sun (interaction between bacteria and sunlight) | |
| #change in height is more (or less) the sum of the effets due to sun/bacteria | |
| #at higher levels of sunlight, 1 unit change in bacteria causes more change in height | |
| #Expressing Interactions in R | |
| #Interaction - Colon (:) | |
| y ~ a:b | |
| #Main Effects and Interaction: | |
| y ~ a*b === y ~ a + b + a:b | |
| #Expressing product of two variables: I | |
| y ~ I(a*b) | |
| ######################################################################## | |
| #Transforming the response before modelling | |
| ######################################################################## | |
| #Sometimes get better models by transforming output rather than predicting output directly | |
| #e.g. log transform | |
| #e.g. Log Transform for Monetary Data. | |
| #monetary values tend to be lognormally distributed | |
| #long tail wide dynamic range (60-700k) | |
| #in such a case, mean > median (50k vs 39k) | |
| #predicting the mean will overpredict typical values (as what regression typically does) | |
| #if you take the log of monetary data, it will be transformed into a normal distribution | |
| #AKA mean ~= median again, more reasonable dynamic range (1.8 - 5.8) | |
| #The Procedure | |
| model <- lm(log(y) ~ x, data = train) | |
| logpred <- predict(model, data = test) | |
| pred <- exp(logpred) #transform predictions to outcome space | |
| #Consequences | |
| #prediction errors are multiplicative in outcome space (size of error relative to size of outcome) | |
| #multiplicative error: pred/y | |
| #relative error: (pred - y)/y = pred/y - 1 | |
| #reducing multiplicative error reduces relative error | |
| #log transformations are useful when you want to reduce relative error rather than additive error | |
| #good example: monetary. Being off by $100 is less important for $100000 but more for $200 income | |
| #Root Mean Squared Relative Error | |
| #RMSRE = sqrt(mean((pred-y)/y)^2)) | |
| #predicting log-outcome reduces RMS-relative error | |
| #but model will often have larger RMSE | |
| ######################################################################## | |
| #Ex. Log Transform | |
| # Examine Income2005 in the training set | |
| summary(income_train$Income2005) | |
| # Write the formula for log income as a function of the tests and print it | |
| (fmla.log <- log(Income2005) ~ Arith + Word + Parag + Math + AFQT) | |
| # Fit the linear model | |
| model.log <- lm(fmla.log, data=income_train) | |
| # Make predictions on income_test | |
| income_test$logpred <- predict(model.log, newdata= income_test) | |
| summary(income_test$logpred) | |
| # Convert the predictions to monetary units | |
| income_test$pred.income <- exp(income_test$logpred) | |
| summary(income_test$pred.income) | |
| # Plot predicted income (x axis) vs income | |
| ggplot(income_test, aes(x = pred.income, y = Income2005)) + | |
| geom_point() + | |
| geom_abline(color = "blue") | |
| # Add predictions to the test set | |
| income_test <- income_test %>% | |
| mutate(pred.absmodel = predict(model.abs, income_test),#model.abs | |
| pred.logmodel = exp(predict(model.log, income_test)))#model.log | |
| # Gather the predictions and calculate residuals and relative error | |
| income_long <- income_test %>% | |
| gather(key = modeltype, value = pred, pred.absmodel, pred.logmodel) %>% | |
| mutate(residual = pred - Income2005, # residuals | |
| relerr = residual/Income2005 ) # relative error | |
| # Calculate RMSE and relative RMSE and compare | |
| income_long %>% | |
| group_by(modeltype) %>% # group by modeltype | |
| summarize(rmse = sqrt(mean(residual^2)), # RMSE | |
| rmse.rel = sqrt(mean((relerr)^2))) # Root mean squared relative error | |
| ######################################################################## | |
| #Transforming the inputs before modelling | |
| ######################################################################## | |
| #Why to Transform Input Variables | |
| #domain knowledge/synthetic variables | |
| #intelligenceAnimals ~ mass.brains/mass.body^2.3 | |
| #pragmatic reasons | |
| #log transform to reduce dynamic range | |
| #log transform because meaningful changes in variable are multiplicative | |
| #y approximately linear in f(x) rather than in x | |
| #anx ~ I(hassles^3) | |
| #I(): treat an expression literally (not as an interaction) | |
| #Linear, Quadratic, and Cubic Models | |
| mod_lin <- lm(anx ~ hassles, hassleframe) | |
| summary(mod_lin)$r.squared #0.5334847 | |
| mod_quad <- lm(anx ~ I(hassles^2), hassleframe) | |
| summary(mod_quad)$r.squared #0.6241029 | |
| mod_tritic <- lm(anx ~ I(hassles^3), hassleframe) | |
| summary(mod_tritic)$r.squared #0.6474421 - clearly the best | |
| ######################################################################## | |
| #EX. Transforming inputs | |
| # houseprice is in the workspace | |
| summary(houseprice) | |
| # Create the formula for price as a function of squared size | |
| (fmla_sqr <- (price ~ I(size^2))) | |
| # Fit a model of price as a function of squared size (use fmla_sqr) | |
| model_sqr <- lm(fmla_sqr, data = houseprice) | |
| # Fit a model of price as a linear function of size | |
| model_lin <- lm(price ~ size, data = houseprice) | |
| # Make predictions and compare | |
| houseprice %>% | |
| mutate(pred_lin = predict(model_lin), # predictions from linear model | |
| pred_sqr = predict(model_sqr)) %>% # predictions from quadratic model | |
| gather(key = modeltype, value = pred, pred_lin, pred_sqr) %>% # gather the predictions | |
| ggplot(aes(x = size)) + | |
| geom_point(aes(y = price)) + # actual prices | |
| geom_line(aes(y = pred, color = modeltype)) + # the predictions | |
| scale_color_brewer(palette = "Dark2") | |
| ######################################################################## | |
| #Logistic Regression to Predict Probabilities https://www.stat.wisc.edu/courses/st572-larget/handouts08-4.pdf | |
| ######################################################################## | |
| #Generalized Linear Model | |
| glm(formula, data = train, family = binomial(link = "logit")) | |
| #assumes inputs additive, linear in log-odds: log(p/(1-p)) | |
| #family: describes error distribution of model | |
| #logistic regresssion: family = binomial | |
| #outcome: two classes, e.g. a and b | |
| #model returns Prob(b) | |
| #Recommend: 0/1 or FALSE/TRUE | |
| predict(model, newdata = test, type = "response") | |
| #newdat: by default, trainng data | |
| #to get probabilities: use type = "response" | |
| #by default: returns log-odds of the event, not the probability | |
| #Methods of Evaluating Logistic Regression Model: pseudo-R^2 | |
| #R^2 = 1 - RSS/TSS | |
| #pseudR^2 = 1 - deviance/null.deviance | |
| #Deviance: analogous to variance (RSS) | |
| #Null Deviance: Similar to TSS | |
| #Pseudo R^2: Deviance explained | |
| #Pseudo-R^2 on Training Data | |
| #Using broom::glance() | |
| glance(model) %>% summarize(pR2 = 1 - deviance/null.deviance) | |
| ## pseudoR2 | |
| ## 1 0.5922402 | |
| #Using sigr::wrapChiSqTest() | |
| wrapChiSqTest(model) | |
| ## "... pseudo-R2=0.59 ..." | |
| #Pseudo R^2 on Test Data | |
| test %>% mutate(pred = predict(model, newdata = test, type = "response")) %>% | |
| wrapChiSqTest("predColName", "trueValuesColName of 0 or 1", TRUE) #the "TRUE" is the target value/target event | |
| #GainCurvePlot great for logistic regression | |
| GainCurvePlot(test, "pred", "has_dmd", "DMd model on test") | |
| ######################################################################## | |
| #EX. Logistic Regresssion on Sparrow Survival Probability | |
| #predict status (survived vs perished) based on toatl_length, weight, and humerus | |
| # sparrow is in the workspace | |
| summary(sparrow) | |
| # Create the survived column based on values TRUE/FALSE rather than Survived/Perished | |
| sparrow$survived <- sparrow$status == "Survived" | |
| # Create the formula | |
| (fmla <- as.formula("survived ~ total_length + weight + humerus")) | |
| # Fit the logistic regression model | |
| sparrow_model <- glm(fmla, data = sparrow, family = binomial(link = "logit")) | |
| # Call summary | |
| summary(sparrow_model) | |
| # Call glance | |
| (perf <- glance(sparrow_model)) | |
| # Calculate pseudo-R-squared | |
| (pseudoR2 <- 1 - (sparrow_model$deviance)/(sparrow_model$null.deviance)) | |
| # Make predictions | |
| sparrow$pred <- predict(sparrow_model, type = "response") | |
| # Look at gain curve | |
| GainCurvePlot(sparrow, "pred", "survived", "sparrow survival model") | |
| ######################################################################## | |
| #Poisson and Quasipoisson Regression to Predict Counts | |
| ######################################################################## | |
| #Predicting Counts | |
| #linear regression: predict values in [-infinity, infinity] | |
| #counts: integers in range [0, infinity] | |
| glm(formula, data, family = poisson_or_quasipoisson) | |
| #inputs are additive and linear in log(count) | |
| #outcome: integers | |
| #counts: e.g. number of traffic tickets a driver gets | |
| #rates: e.g. number of website hits/day | |
| #prediction: expected rate or intensity (not integral) | |
| #expected # traffic tickets; expected hits/day | |
| #Poisson vs Quasipoisson | |
| #poisson assumes that mean(y) = var(y) ("the same" as in, should be of a similar order of magnitude***) | |
| #if vary(y) much different from mean(y) - use quasipoisson | |
| #generally required a large sample size | |
| #if rates/counts >> 0 - regular regression is fine | |
| #use same pseudo R^2 | |
| #use same predict method as logistic regression, along with type = "response" | |
| #usage of paste function: | |
| paste(outcome, "~", paste(vars, collapse = " + ")) | |
| #"cnt ~ hr + holiday + workingday + weathersit + temp + atemp + hum + windspeed" | |
| ######################################################################## | |
| #EX. Poisson/Quasipoisson | |
| # The outcome column | |
| outcome #"cnt" | |
| # The inputs to use | |
| vars ["hr", "holiday", "workingday", "weathersit", "temp", "atemp", "hum", "windspeed"] | |
| # Create the formula string for bikes rented as a function of the inputs | |
| (fmla <- paste(outcome, "~", paste(vars, collapse = " + "))) | |
| # Calculate the mean and variance of the outcome | |
| (mean_bikes <- mean(bikesJuly$cnt)) | |
| (var_bikes <- var(bikesJuly$cnt)) | |
| # Fit the model | |
| bike_model <- glm(fmla, data = bikesJuly, family = quasipoisson) | |
| # Call glance | |
| (perf <- glance(bike_model)) | |
| # Calculate pseudo-R-squared | |
| (pseudoR2 <- 1 - perf$deviance/perf$null.deviance) | |
| # bikesAugust is in the workspace | |
| str(bikesAugust) | |
| # bike_model is in the workspace | |
| summary(bike_model) | |
| # Make predictions on August data | |
| bikesAugust$pred <- predict(bike_model, newdata = bikesAugust, type = "response") | |
| # Calculate the RMSE (CAN USE RMSE AS LONG AS YOU DON'T TRANSFORM THE OUTPUT RESPONSES) | |
| bikesAugust %>% | |
| mutate(residual = pred - cnt) %>% | |
| summarize(rmse = sqrt(mean(residual^2))) | |
| ######################################################################## | |
| #EX. Visualize the Bike Rental Predictions as a Function of Time | |
| # Plot predictions and cnt by date/time | |
| bikesAugust %>% | |
| # set start to 0, convert unit to days | |
| mutate(instant = (instant - min(instant))/24) %>% | |
| # gather cnt and pred into a value column | |
| gather(key = valuetype, value = value, cnt, pred) %>% | |
| filter(instant < 14) %>% # restric to first 14 days | |
| # plot value by instant | |
| ggplot(aes(x = instant, y = value, color = valuetype, linetype = valuetype)) + | |
| geom_point() + | |
| geom_line() + | |
| scale_x_continuous("Day", breaks = 0:14, labels = 0:14) + | |
| scale_color_brewer(palette = "Dark2") + | |
| ggtitle("Predicted August bike rentals, Quasipoisson model") | |
| ######################################################################## | |
| #Generalized Additive Models (GAM) to learn Non-Linear Transformations | |
| ######################################################################## | |
| #GAM: output depends additively on unknown smooth functions of input variables | |
| #e.g. y ~ b0 + s1(x1) + s2(x2) | |
| #a GAM learns the intercept b0 and functions si's of the input | |
| #Use GAM to learn the relationship: quadratic? cubic? quartic? | |
| mgcv::gam(formula, family, data) | |
| #family: | |
| #gaussian (default): "regular" expression | |
| #binomial: probabilities | |
| #poisson/quassipoisson: counts | |
| #Best for larger datasets, as they are complex and more likely to overfit | |
| #s() function (fits a spline between input and outcome) | |
| anxiety ~ s(hassles) | |
| #to specify that an input has a non-linear relationship with the outcome | |
| #use s() with continuous variables NOT CATEGORICAL****************** | |
| #more than about 10 unique values | |
| model <- gam(anxiety ~ s(hassles), data = hassleFrame, family = gaussian) | |
| summary(model) | |
| ## | |
| ## ... | |
| ## | |
| ## R-sq.(adj) = 0.619 Deviance explained = 64.1% (pseudo R-squared) | |
| ## GCV = 49.132 Scale est. = 45.153 n = 40 | |
| #The "deviance explained" reports the model's unadjusted R^2 | |
| plot(model) #gives plots of variable transformations that the algorithm learned | |
| #y-values of these plots | |
| predict(model, type = "terms") | |
| #make predictions with model. use as.numeric to convert the matrix from predict(gam) to a vector | |
| as.numeric(predict(model, newdata = hassleFrame, type = "response")) | |
| ######################################################################## | |
| #EX. GAM Example | |
| # soybean_train is in the workspace | |
| summary(soybean_train) | |
| # Plot weight vs Time (Time on x axis) | |
| ggplot(soybean_train, aes(x = Time, y = weight)) + | |
| geom_point() | |
| # Load the package mgcv | |
| library(mgcv) | |
| # Create the formula | |
| (fmla.gam <- weight ~ s(Time)) | |
| # Fit the GAM Model | |
| model.gam <- gam(fmla.gam, data = soybean_train, family = gaussian) | |
| # Call summary() on model.lin and look for R-squared | |
| summary(model.lin) | |
| # Call summary() on model.gam and look for R-squared | |
| summary(model.gam) | |
| # Call plot() on model.gam | |
| plot(model.gam) | |
| # soybean_test is in the workspace | |
| summary(soybean_test) | |
| # Get predictions from linear model | |
| soybean_test$pred.lin <- predict(model.lin, newdata = soybean_test) | |
| # Get predictions from gam model | |
| soybean_test$pred.gam <- as.numeric(predict(model.gam, newdata = soybean_test)) | |
| # Gather the predictions into a "long" dataset | |
| soybean_long <- soybean_test %>% | |
| gather(key = modeltype, value = pred, pred.lin, pred.gam) | |
| # Calculate the rmse | |
| soybean_long %>% | |
| mutate(residual = weight - pred) %>% # residuals | |
| group_by(modeltype) %>% # group by modeltype | |
| summarize(rmse = sqrt(mean(residual^2))) # calculate the RMSE | |
| # Compare the predictions against actual weights on the test data | |
| soybean_long %>% | |
| ggplot(aes(x = Time)) + # the column for the x axis | |
| geom_point(aes(y = weight)) + # the y-column for the scatterplot | |
| geom_point(aes(y = pred, color = modeltype)) + # the y-column for the point-and-line plot | |
| geom_line(aes(y = pred, color = modeltype, linetype = modeltype)) + # the y-column for the point-and-line plot | |
| scale_color_brewer(palette = "Dark2") | |
| ######################################################################## | |
| #Decision Trees | |
| ######################################################################## | |
| #Rules of form: | |
| #if A and B and C then Y | |
| #non-linear concepts | |
| #intervals | |
| #non-monotonic relationships | |
| #non-additive interactions | |
| #AND: similar to multiplication | |
| #Pros: | |
| #trees have an expressive concept space (lower RMSE) | |
| #Cons: | |
| #coarse-grained predictions (can give six possible values, whereas linear can give continuous value predictions) | |
| #Ensembles of Trees (Several Trees) (Random Forest & Gradient-Boosted Trees) | |
| #ensemble model fits animal intelligence data better than single tree | |
| #ensembles give finer-grained predictions than single trees | |
| ######################################################################## | |
| #Random Forests | |
| ######################################################################## | |
| #Multiple diverse decision trees averaged together | |
| #reduces overfit | |
| #increases model expressiveness | |
| #finer grain predictions | |
| #Building a Random Forest Model | |
| #1. Draw bootstrapped sample from training data | |
| #2. For each sample grow a tree | |
| #at each node, pick best variable to split on (from a random subset of all variables) | |
| #continue until tree is grown | |
| #3. To score a datum, evaluate it with all the trees and average the results | |
| #CAN BE USED FOR BOTH CLASSIFICATION AND REGRESSION PROBLEMS | |
| model <- ranger::ranger(formula, trainngData, num.trees = 500, respect.unordered.factors = "order") | |
| #use at least 200 trees | |
| #mtry - number of variables to try at each node | |
| #by default, square root of the total number of variables | |
| #respect.unordered.factors | |
| #Specifies how to treat unordered factor variables. We recommend setting this to "order" for regression. | |
| ## Ranger result | |
| ## ... | |
| ## OOB (OUT OF BAG) estimate, AKA prediction error (MSE): 3103.623 | |
| ## R squared (OOB): 0.7837386 | |
| #NOTE: Random forest algorithm returns estimates of out-of-sample performance | |
| predict(model, test_data)$predictions | |
| #predict() inputs: model and data | |
| #predictions can be accessed in the element $predictions | |
| ######################################################################## | |
| #EX. Random Forests | |
| # bikesJuly is in the workspace | |
| str(bikesJuly) | |
| # Random seed to reproduce results | |
| seed | |
| # The outcome column | |
| (outcome <- "cnt") | |
| # The input variables | |
| (vars <- c("hr", "holiday", "workingday", "weathersit", "temp", "atemp", "hum", "windspeed")) | |
| # Create the formula string for bikes rented as a function of the inputs | |
| (fmla <- paste(outcome, "~", paste(vars, collapse = " + "))) | |
| # Load the package ranger | |
| library(ranger) | |
| # Fit and print the random forest model | |
| (bike_model_rf <- ranger(fmla, # formula | |
| bikesJuly, # data | |
| num.trees = 500, | |
| respect.unordered.factors = "order", | |
| seed = seed)) | |
| # bikesAugust is in the workspace | |
| str(bikesAugust) | |
| # bike_model_rf is in the workspace | |
| bike_model_rf | |
| # Make predictions on the August data | |
| bikesAugust$pred <- predict(bike_model_rf, newdata = bikesAugust)$predictions | |
| # Calculate the RMSE of the predictions | |
| bikesAugust %>% | |
| mutate(residual = pred - cnt) %>% # calculate the residual | |
| summarize(rmse = sqrt(mean(residual^2))) # calculate rmse | |
| # Plot actual outcome vs predictions (predictions on x-axis) | |
| ggplot(bikesAugust, aes(x = pred, y = cnt)) + | |
| geom_point() + | |
| geom_abline() | |
| ######################################################################## | |
| #Categorical Inputs (One-hot encoding) MANUALLY | |
| ######################################################################## | |
| #Most R functions manage the conversion for you i.e. model.matrix as we learned before | |
| #however, some do not e.g xgboost() | |
| #must convert categorical variables to numeric representation | |
| #conversion to indicators: one-hot encoding | |
| #vtreat to one-hot encode categorical variables (by transforming it into indicator variables | |
| #coded "lev" & to clean bad values out of numerical variables coded "clean") | |
| #Basic Idea: | |
| #designTreatmentsZ() to design a treatment plan from the training data | |
| #prepare() to create "clean" data | |
| #all numerical | |
| #no missing values | |
| #use prepare() with treatment plan for all future data | |
| #DESIGN TREATMENT PLAN | |
| vars <- c("x", "u") | |
| treatplan <- designTreatmentsZ(dframe, varslist, verbose = FALSE) | |
| #varslist: list of input variable names as strings | |
| #Set verbose to FALSE to suppress progress messages | |
| #designTreatmentsZ has property scoreFrame that describe the var-mapping & the types of each new variable. | |
| scoreFrame <- treatplan %>% | |
| magrittr::use_series(scoreFrame) %>% | |
| select(varName, origName, code) | |
| newvars <- scoreFrame %>% | |
| filter(code %in% c("clean", "lev")) %>% | |
| magrittr::use_series(varName) | |
| #PREPARE | |
| training.treat <- prepare(treatmentplan, dframe, varRestriction = newvars) | |
| #inputs to prepare(): | |
| #treatmentplan: treatment plan | |
| #dframe: data frame | |
| #varestriction: list of variables to prepare (optional) | |
| #default: prepare all variables | |
| ######################################################################## | |
| #EX. Categorical Inputs | |
| # dframe is in the workspace | |
| dframe | |
| # Create and print a vector of variable names | |
| (vars <- c("color", "size")) | |
| # Load the package vtreat | |
| library(vtreat) | |
| # Create the treatment plan | |
| treatplan <- designTreatmentsZ(dframe, vars) | |
| # Examine the scoreFrame | |
| (scoreFrame <- treatplan %>% | |
| use_series(scoreFrame) %>% | |
| select(varName, origName, code)) | |
| # We only want the rows with codes "clean" or "lev" | |
| (newvars <- scoreFrame %>% | |
| filter(code %in% c("clean", "lev")) %>% | |
| use_series(varName)) | |
| # Create the treated training data | |
| (dframe.treat <- prepare(treatplan, dframe, varRestriction = newvars)) | |
| ######################################################################## | |
| #EX. Novel Levels that e.g. model.matrix can't handle but vtreat can | |
| # newvars is in the workspace | |
| newvars | |
| # Print dframe and testframe | |
| dframe #has 'r', 'g', and 'b' | |
| testframe #has 'y' | |
| # Use prepare() to one-hot-encode testframe | |
| (testframe.treat <- prepare(treatplan, testframe, varRestriction = newvars)) | |
| ######################################################################## | |
| #Gradient Boosting | |
| ######################################################################## | |
| #ensemble method that builds up a model by incrementally improving the existing one | |
| #1. Fit a shallow tree T1 to the data. M1 = T1 | |
| #2. Fit a tree T2 to the residuals. Find gamma such that M2 = M1 + gamma*T2 is the best fit to data. | |
| #For regularized boosting, decrease learning by a factor eta in range (0,1). | |
| #M2 = M1 + eta*gamma*T2. A larger eta = faster learning, smaller eta = less risk of overfit | |
| #3. Repeat (2) until stopping condition is met. | |
| #Final Model: M = M1 + eta*sum(gamma_i * Ti) | |
| #Because gradient boosting optimizes error on the training data, it's very easy to overfit the model. | |
| #best practice is to estimate out of sample error via cross validation for each incremental model | |
| #then retroactively decide how many trees to use | |
| #xgboost() | |
| #1. Run xgb.cv() with a large number of rounds (trees). Fits the model & calculates cross-validated error as well. | |
| #2. xgb.cv()$evaluation_log: records estimated RMSE for each round. | |
| #find the number of trees that minimizes estimated RMSE: n_best | |
| #3. Run xgboost(), setting nrounds = n_best | |
| #EX. | |
| treatplan <- designTreatmentsZ(bikesJan, vars) | |
| newvars <- treatplan$scoreFrame %>% | |
| filter(code %in% c("clean", "lev")) %>% | |
| use_series(varName) | |
| bikesJan.treat <- prepare(treatplan, bikesJan, varRestriction = newvars) | |
| cv <- xgb.cv(data = as.matrix(bikesJan.treat), #a numeric matrix | |
| label = bikesJan$cnt, #vector of outcomes (numeric) | |
| objective = "reg:linear", | |
| nrounds = 100, nfold = 5, eta = 0.3, max_depth = 6) | |
| #Key inputs to xgb.cv() and xgboost() | |
| #data: input data as matrix; label: outcome | |
| #objective: for regression - "reg:linear" | |
| #nrounds: maximum number of trees to fit | |
| #eta: learning rate | |
| #max_depth: maximum depth of individual trees | |
| #nfold: (xgb.cv() only): number of folds for cross validation | |
| #early_stopping_rounds: after this many rounds without improvement, stop. | |
| elog <- as.data.frame(cv$evaluation_log) | |
| nrounds <- which.min(elog$test_rmse_mean) #out of sample error | |
| ## 78 #the index of the minimum value corresponds to best number of trees | |
| #run cgboost() with right number of trees to get final model | |
| nrounds <- 78 | |
| model <- xgboost(data = as.matrix(bikesJan.treat), | |
| label = bikesJan$cnt, | |
| nrounds = nrounds, | |
| objective = "reg:linear", | |
| eta = 0.3, | |
| depth = 6) | |
| #Predict with xgboost() model | |
| bikesFeb.treat <- prepare(treatplan, bikesFeb, varRestriction = newvars) | |
| bikesFeb$pred <- predict(model, as.matrix(bikesFeb.treat)) | |
| ######################################################################## | |
| #EX. Gradient Boosting Machines | |
| # bikesJuly & bikesJuly.treat are in the workspace | |
| # Load the package xgboost | |
| library(xgboost) | |
| # Run xgb.cv | |
| cv <- xgb.cv(data = as.matrix(bikesJuly.treat), | |
| label = bikesJuly$cnt, | |
| nrounds = 100, | |
| nfold = 5, | |
| objective = "reg:linear", | |
| eta = 0.3, | |
| max_depth = 6, | |
| early_stopping_rounds = 10, | |
| verbose = 0 # silent | |
| ) | |
| # Get the evaluation log | |
| elog <- as.data.frame(cv$evaluation_log) | |
| # Determine and print how many trees minimize training and test error | |
| elog %>% | |
| summarize(ntrees.train = which.min(train_rmse_mean), # find the index of min(train_rmse_mean) | |
| ntrees.test = which.min(test_rmse_mean)) # find the index of min(test_rmse_mean) | |
| #ntrees.train = 67, ntrees.test = 57 | |
| #In most cases, ntrees.test is less than ntrees.train. The training error keeps decreasing | |
| #even after the test error starts to increase. It's important to use cross-validation to find | |
| #the right number of trees (as determined by ntrees.test) and avoid an overfit model. | |
| # Examine the workspace | |
| ls() | |
| # The number of trees to use, as determined by xgb.cv | |
| ntrees | |
| # Run xgboost | |
| bike_model_xgb <- xgboost(data = as.matrix(bikesJuly.treat), # training data as matrix | |
| label = bikesJuly$cnt, # column of outcomes | |
| nrounds = ntrees, # number of trees to build | |
| objective = "reg:linear", # objective | |
| eta = 0.3, | |
| depth = 6, | |
| verbose = 0 # silent | |
| ) | |
| # Make predictions | |
| bikesAugust$pred <- predict(bike_model_xgb, as.matrix(bikesAugust.treat)) | |
| # Plot predictions (on x axis) vs actual bike rental count | |
| ggplot(bikesAugust, aes(x = pred, y = cnt)) + | |
| geom_point() + | |
| geom_abline() | |
| #Sometimes it might make negative prediction. However if you look at RMSE it will be smaller than other models. | |
| #perhaps rounding negative predictions up to zero is a reasonable tradeoff | |
| ######################################################################## | |
| #RIDGE AND LASSO | |
| ######################################################################## | |
| #REFERENCE: https://www.r-bloggers.com/ridge-regression-and-the-lasso/ | |
| swiss <- datasets::swiss | |
| #remove the Fertility column and remove the intercept column | |
| x <- model.matrix(Fertility~., data=swiss)[,-1] | |
| y <- swiss$Fertility | |
| lambda <- 10^seq(10, -2, length=100) | |
| #When lambda = 0, we get same coefficents as OLS model | |
| #create test and training set | |
| library(glmnet) | |
| #OLS | |
| swisslm <- lm(Fertility~., data = swiss[train,]) | |
| s.pred <- predict(swisslm, newdata = swiss[test,]) | |
| #Ridge (alpha = 0) | |
| cv.out <- cv.glmnet(x[train,], y[train], alpha = 0, lambda = vectorOfPossibleLambdas) | |
| bestlamRidge <- cv.out$lambda.min | |
| ridge.mod <- glmnet(x[train,], y[train], alpha = 0, lambda = bestlamRidge) | |
| ridge.pred <- predict(ridge.mod, s = bestlamRidge, newx = x[test,]) | |
| #Lasso (alpha = 1) | |
| cv.out <- cv.glmnet(x[train,], y[train], alpha = 1, lambda = vectorOfPossibleLambdas) | |
| bestlamLasso <- cv.out$lambda.min | |
| lasso.mod <- glmnet(x[train,], y[train], alpha = 1, lambda = bestlamLasso) | |
| lasso.pred <- predict(lasso.mod, s = bestlamLasso, newx = x[test,]) | |
| #Check RMSE LINEAR | |
| sqrt(mean((s.pred-ytest)^2)) | |
| #Check RMSE RIDGE | |
| sqrt(mean((ridge.pred - ytest)^2)) | |
| #Check RMSE LASSO | |
| sqrt(mean((lasso.pred-ytest)^2)) | |
| ######################################################################## | |
| #REFERENCE:https://www.youtube.com/watch?v=fAPCaue8UKQ | |
| ######################################################################## | |
| #Elastic net: mix of lasso and ridge regression | |
| #Cross-validation to find optimal lambda for lasso regression | |
| CV = cv.glmnet(x=trainX, y=trainY, family="binomial/gaussian", type.measure="deviance AKA mse", alpha = 1, nlambda = 100) | |
| #Following plot shows the models (with different lambda values) that glmnet has fit, along with the error type for each of the models | |
| plot(CV) | |
| #Fit model | |
| fit = glmnet(x=trainX, y=trainY, family = "binomial", alpha=1, lambda=CV$lambda.1se) | |
| #???lambda.1se vs lambda.min??? | |
| # This is a question about parsimony. The lambda.min option refers to value of λ at the lowest CV error. | |
| # The error at this value of λ is the average of the errors over the k folds and hence this estimate of the error is uncertain. | |
| # The lambda.1se represents the value of λ in the search that was simpler than the best model (lambda.min), but which has error | |
| # within 1 standard error of the best model. In other words, using the value of lambda.1se as the selected value for λ results | |
| # in a model that is slightly simpler than the best model but which cannot be distinguished from the best model in terms of error | |
| # given the uncertainty in the kk-fold CV estimate of the error of the best model. | |
| # | |
| # The choice is yours: | |
| # | |
| # The best model that may be too complex or slightly overfitted: lambda.min | |
| # The simplest model that has comparable error to the best model given the uncertainty: lambda.1se | |
| #List nonzero coefficients: | |
| fit$beta[,1] |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment