Skip to content

Instantly share code, notes, and snippets.

@jangsean
Forked from cherryyingzizhang/Regression.R
Created September 19, 2023 16:09
Show Gist options
  • Select an option

  • Save jangsean/4d390ee684deebd109d8c1757e42e42a to your computer and use it in GitHub Desktop.

Select an option

Save jangsean/4d390ee684deebd109d8c1757e42e42a to your computer and use it in GitHub Desktop.
Regression.R
########################################################################
#1-var Linear Regression
########################################################################
#lm(formula you wanna fit, dataFrame)
cmodel <- lm(temperature ~ chirps_per_sec, data = cricket)
#Formula
fmla <- temperature ~ chirps_per_sec
fmla <- blood_pressure ~ age + weight
#LHS: outcome, RHS: inputs.
fmla <- as.formula("temperature ~ chirps_per_sec") #converts string to formula
#conveniently package diagnostics from summary(cmodel) in a dataframe
broom::glance(cmodel)
sigr:wrapFTest(cmodel) #R^2 test
#by default returns training data predictions
predict(model)
ggplot(cricket, aes(x = prediction, y = temperature)) + geom_point() + geom_abline(color = "darkblue") + ggtitle("temperature vs linear model prediction")
#apply model to new data
predict(model, newdata = newChirps)
########################################################################
#Predicting once you fit a model
########################################################################
# unemployment is in your workspace
summary(unemployment)
# newrates is in your workspace
newrates
# Predict female unemployment in the unemployment data set
unemployment$prediction <- predict(unemployment_model)
# load the ggplot2 package
library(ggplot2)
# Make a plot to compare predictions to actual (prediction on x axis).
ggplot(unemployment, aes(x = prediction, y = female_unemployment)) +
geom_point() +
geom_abline(color = "blue")
# Predict female unemployment rate when male unemployment is 5%
pred <- predict(unemployment_model, newdata = newrates)
# Print it
pred
########################################################################
#Multivariate Linear Regression
########################################################################
# bloodpressure is in the workspace
summary(bloodpressure)
# Create the formula and print it
fmla <- blood_pressure ~ age + weight
fmla
# Fit the model: bloodpressure_model
bloodpressure_model <- lm(fmla, data = bloodpressure)
# Print bloodpressure_model and call summary()
bloodpressure_model
summary(bloodpressure_model)
# predict blood pressure using bloodpressure_model :prediction
bloodpressure$prediction <- predict(bloodpressure_model)
# plot the prediction vs actual
ggplot(bloodpressure, aes(x = prediction, y = blood_pressure)) +
geom_point() +
geom_abline(color = "blue")
#Pros of Linear Regression
#easy to fit and to apply, concise, less prone to overfitting, interpretable
#Cons of Linear Regression
#can only express linear and additive relationships
#collinearity -- when input variables are partially correlated.
#when variables are highly correlated, signs of coefficients might not be what you expect, but does not affect model accuracy
#model may be unstable (e.g. unusually high coefficients when variables are highly correlated)
########################################################################
#Evaluating Regression Models (graphically, RMSE, R^2, (and F-test from link))
########################################################################
#http://facweb.cs.depaul.edu/sjost/csc423/documents/f-test-reg.htm
#https://feliperego.github.io/blog/2015/10/23/Interpreting-Model-Output-In-R
#Evaluating model graphically
#(by looking at plots of ground truth vs predictions aka y vs y_hat)
#a well-fitted model would have:
#x=y line runs through center of points
#line of perfect prediction
#a poorly fitting model would have:
#regions where points are all on one side of x=y line
#systematic errors - errors that are correlated w/ value of outcome
#either you don't have all the variables in the model or
#you need an algorithm that can find more complex relationships in data
#Residual Plot
#residual plot: difference between outcome & prediction vs prediction
#good fit: no systematic errors (all the errors evenly distributed between positive and negative and same magnitude above and below)
#bad fit: systematic errors (clusters of all positives or all negatives of residuals)
#Gain Curve/Plot (http://www2.cs.uregina.ca/~dbd/cs831/notes/lift_chart/lift_chart.html)
#useful when sorting instances is more important than predicting exact outcome values
#measures how well model sorts the outcome
#x-axis: fraction of items in model-sorted order (decreasing)
#y-axis: fraction of total accumulated home sales
#Wizard curve: perfect model
GainCurvePlot(frame, xvar, truthvar, title)
#frame is a data frame
#xvar and truthvar are strings naming the prediction and actual outcome columns of frame
#title is the title of the plot
#When the predictions sort in exactly the same order, the relative Gini coefficient is 1.
#When the model sorts poorly, the relative Gini coefficient is close to zero, or even negative.
#Residuals
unemployment$residuals <- unemployment$female_unemployment - unemployment$predictions
ggplot(unemployment, aes(x = predictions, y = residuals)) +
geom_pointrange(aes(ymin = 0, ymax = residuals)) +
geom_hline(yintercept = 0, linetype = 3) +
ggtitle("residuals vs. linear model prediction")
#RMSE
error <- houseprices$predictedPrices - houseprices$actualPrices #residuals
err <- err^2
rmse <- sqrt(mean(err))
#is RMSE large or small? one way to tell is to compare it to the standard deviation of outcome
#R^2
#near 1: model fits well
#near 0: no better than guessing the average value
#R^2 = 1 - RSS (variance from model)/SST (variance of data)
error <- houseprices$predictedPrices - houseprices$actualPrices #residuals
rss <- sum(err^2)
totalerr <- houseprices$actualPrices - mean(houseprices$actualPrices)
sstot <- sum(toterr^2)
s_squared <- 1 - (rss/sstot)
#Correlation and R^2
p = cor(prediction, actual) = 0.8995709
p^2 = 0.8092278 = R^2
#true for models that minimize squared error: linear regression & GAM regression
#can get R^2 from summary() or glance()$r.squared or adj.r.squared
cor(pred_y, y) #= sqrt(R^2) ONLY for trainng data & models that minimize square error
#FAST RMSE/R_SQUARED FUNCTIONS
rmse(actual, predicted)
r_squared(actual, predicted)
########################################################################
#Training a Model
########################################################################
#if we don't have enough data to split training vs test sets, use cross validation
#N-fold Cross Validation
#partition data into n subsets
#e.g. 3-fold cross validation
#first, train a model using A and B and use that model to make pred. on C
#then B and C to predict A
#then A and C to predict B
#splitPlan = vtreat::kWayCrossValidation(nRows in training data, nSplits aka n-way, dframe, y)
#dframe and y aren't technically used so can set them both to be NULL
#The resulting splitPlan is list of nSplits elements; each element contains two vectors:
#train: the indices of dframe that will form the training set
#app: the indices of dframe that will form the test (or application) set
#split <- splitPlan[[1]], model <- lm(fmla, data = df[split$train,])
#df$pred.cv[split$app] <- predict(model, newdata = df[split$app,])
#if the n-fold cross validation tests look good enough, create the final model via all data
#unfortunately you can't evaluate final model for future data because training/test
#cross validation only tests the modelling process whereas training/test tests for future value prediction
########################################################################
#EX. Generating a random split of training/test set
#If you have a dataset dframe of size N & want random subset of X% of N, where 0 < X < 1,
gp = runif(N) #vector of uniform random numbers
dframe[gp < X,] #will be what you want
dframe[gp >= X,] #will be complement
#note: (close to x/1-x split, but not exactly due to uniform random distribution)
#train the model
fmla = cty ~ hwy
mpg_model = lm(fmla, data = mpg_train)
#test model using rmse(predCol, ycol) & r_squared(predcol, ycol)
mpg_train$pred <- predict(mpg_model, mpg_train)
mpg_test$pred <- predict(mpg_model, mpg_test)
rmse_train <- rmse(mpg_train$pred, mpg_train$cty)
rmse_test <- rmse(mpg_test$pred, mpg_test$cty)
rsq_train <- r_squared(mpg_train$pred, mpg_train$cty)
rsq_test <- r_squared(mpg_test$pred, mpg_test$cty)
ggplot(mpg_test, aes(x = pred, y = cty)) +
geom_point() +
geom_abline()
########################################################################
#EX. Create a Cross Validation Plan
k <- 3 # Number of folds
splitPlan <- kWayCrossValidation(nRows, k, NULL, NULL)
# Run the 3-fold cross validation plan from splitPlan
mpg$pred.cv <- 0
for(i in 1:k) {
split <- splitPlan[[i]]
model <- lm(cty ~ hwy, data = mpg[split$train,])
mpg$pred.cv[split$app] <- predict(model, newdata = mpg[split$app,])
}
# Predict from a full model
mpg$pred <- predict(lm(cty ~ hwy, data = mpg))
# Get the rmse of the full model's predictions
rmse(mpg$cty, mpg$pred)
# Get the rmse of the cross-validation predictions
rmse(mpg$cty, mpg$pred.cv)
########################################################################
#Categorical Inputs (One-hot encoding)
########################################################################
model.matrix(WtLoss24 ~ Diet + age + BMI, data = diet)
#all numerical values, converts categorical var. with N levels into N-1 indicator var
#use unique() to see how many possible values Time takes (two: "Late" and "Early")
unique(flowers$Time)
fmla <- as.formula("Flowers ~ Intensity + Time")
mmat <- model.matrix(fmla, data = flowers) #Time column changes to boolean: TimeLate (0 or 1)
#Partial F-Test on a Factor Coded as a Set of Dummies
full <- lm(y~x+as.factor(color), data=dat)
red <- lm(y~x, data=dat)
anova(red,full)
#Look at the F value & Pr(>F) this anova prints out.
########################################################################
#EX. Categorical Inputs
# Call str on flowers to see the types of each column
str(flowers)
# Use unique() to see how many possible values Time takes
unique(flowers$Time)
# Build a formula to express Flowers as a function of Intensity and Time: fmla. Print it
(fmla <- as.formula("Flowers ~ Intensity + Time"))
# Use fmla and model.matrix to see how the data is represented for modeling
mmat <- model.matrix(fmla, data = flowers)
# Examine the first 20 lines of flowers
head(flowers, n = 20)
# Examine the first 20 lines of mmat
head(mmat, n = 20)
# flowers in is the workspace
str(flowers)
# fmla is in the workspace
fmla
# Fit a model to predict Flowers from Intensity and Time : flower_model
flower_model <- lm(fmla, data = flowers)
# Use summary on mmat to remind yourself of its structure
summary(mmat)
# Use summary to examine flower_model
summary(flower_model)
# Predict the number of flowers on each plant
flowers$predictions <- predict(flower_model, newdata = flowers)
# Plot predictions vs actual flowers (predictions on x-axis)
ggplot(flowers, aes(x = predictions, y = Flowers)) +
geom_point() +
geom_abline(color = "blue")
########################################################################
#Dummy Code Categorical Variables
#https://stats.stackexchange.com/questions/115049/why-do-we-need-to-dummy-code-categorical-variables
########################################################################
#Variable Interactions that are bad for Additive/Linear Regression
########################################################################
#Additive relationship: e.g. plant_height ~ bacteria + sun
#increase in sun causes increase in plant_height regardless of level of bacteria level
#plant_head ~ bacteria + sun + bacteria:sun (interaction between bacteria and sunlight)
#change in height is more (or less) the sum of the effets due to sun/bacteria
#at higher levels of sunlight, 1 unit change in bacteria causes more change in height
#Expressing Interactions in R
#Interaction - Colon (:)
y ~ a:b
#Main Effects and Interaction:
y ~ a*b === y ~ a + b + a:b
#Expressing product of two variables: I
y ~ I(a*b)
########################################################################
#Transforming the response before modelling
########################################################################
#Sometimes get better models by transforming output rather than predicting output directly
#e.g. log transform
#e.g. Log Transform for Monetary Data.
#monetary values tend to be lognormally distributed
#long tail wide dynamic range (60-700k)
#in such a case, mean > median (50k vs 39k)
#predicting the mean will overpredict typical values (as what regression typically does)
#if you take the log of monetary data, it will be transformed into a normal distribution
#AKA mean ~= median again, more reasonable dynamic range (1.8 - 5.8)
#The Procedure
model <- lm(log(y) ~ x, data = train)
logpred <- predict(model, data = test)
pred <- exp(logpred) #transform predictions to outcome space
#Consequences
#prediction errors are multiplicative in outcome space (size of error relative to size of outcome)
#multiplicative error: pred/y
#relative error: (pred - y)/y = pred/y - 1
#reducing multiplicative error reduces relative error
#log transformations are useful when you want to reduce relative error rather than additive error
#good example: monetary. Being off by $100 is less important for $100000 but more for $200 income
#Root Mean Squared Relative Error
#RMSRE = sqrt(mean((pred-y)/y)^2))
#predicting log-outcome reduces RMS-relative error
#but model will often have larger RMSE
########################################################################
#Ex. Log Transform
# Examine Income2005 in the training set
summary(income_train$Income2005)
# Write the formula for log income as a function of the tests and print it
(fmla.log <- log(Income2005) ~ Arith + Word + Parag + Math + AFQT)
# Fit the linear model
model.log <- lm(fmla.log, data=income_train)
# Make predictions on income_test
income_test$logpred <- predict(model.log, newdata= income_test)
summary(income_test$logpred)
# Convert the predictions to monetary units
income_test$pred.income <- exp(income_test$logpred)
summary(income_test$pred.income)
# Plot predicted income (x axis) vs income
ggplot(income_test, aes(x = pred.income, y = Income2005)) +
geom_point() +
geom_abline(color = "blue")
# Add predictions to the test set
income_test <- income_test %>%
mutate(pred.absmodel = predict(model.abs, income_test),#model.abs
pred.logmodel = exp(predict(model.log, income_test)))#model.log
# Gather the predictions and calculate residuals and relative error
income_long <- income_test %>%
gather(key = modeltype, value = pred, pred.absmodel, pred.logmodel) %>%
mutate(residual = pred - Income2005, # residuals
relerr = residual/Income2005 ) # relative error
# Calculate RMSE and relative RMSE and compare
income_long %>%
group_by(modeltype) %>% # group by modeltype
summarize(rmse = sqrt(mean(residual^2)), # RMSE
rmse.rel = sqrt(mean((relerr)^2))) # Root mean squared relative error
########################################################################
#Transforming the inputs before modelling
########################################################################
#Why to Transform Input Variables
#domain knowledge/synthetic variables
#intelligenceAnimals ~ mass.brains/mass.body^2.3
#pragmatic reasons
#log transform to reduce dynamic range
#log transform because meaningful changes in variable are multiplicative
#y approximately linear in f(x) rather than in x
#anx ~ I(hassles^3)
#I(): treat an expression literally (not as an interaction)
#Linear, Quadratic, and Cubic Models
mod_lin <- lm(anx ~ hassles, hassleframe)
summary(mod_lin)$r.squared #0.5334847
mod_quad <- lm(anx ~ I(hassles^2), hassleframe)
summary(mod_quad)$r.squared #0.6241029
mod_tritic <- lm(anx ~ I(hassles^3), hassleframe)
summary(mod_tritic)$r.squared #0.6474421 - clearly the best
########################################################################
#EX. Transforming inputs
# houseprice is in the workspace
summary(houseprice)
# Create the formula for price as a function of squared size
(fmla_sqr <- (price ~ I(size^2)))
# Fit a model of price as a function of squared size (use fmla_sqr)
model_sqr <- lm(fmla_sqr, data = houseprice)
# Fit a model of price as a linear function of size
model_lin <- lm(price ~ size, data = houseprice)
# Make predictions and compare
houseprice %>%
mutate(pred_lin = predict(model_lin), # predictions from linear model
pred_sqr = predict(model_sqr)) %>% # predictions from quadratic model
gather(key = modeltype, value = pred, pred_lin, pred_sqr) %>% # gather the predictions
ggplot(aes(x = size)) +
geom_point(aes(y = price)) + # actual prices
geom_line(aes(y = pred, color = modeltype)) + # the predictions
scale_color_brewer(palette = "Dark2")
########################################################################
#Logistic Regression to Predict Probabilities https://www.stat.wisc.edu/courses/st572-larget/handouts08-4.pdf
########################################################################
#Generalized Linear Model
glm(formula, data = train, family = binomial(link = "logit"))
#assumes inputs additive, linear in log-odds: log(p/(1-p))
#family: describes error distribution of model
#logistic regresssion: family = binomial
#outcome: two classes, e.g. a and b
#model returns Prob(b)
#Recommend: 0/1 or FALSE/TRUE
predict(model, newdata = test, type = "response")
#newdat: by default, trainng data
#to get probabilities: use type = "response"
#by default: returns log-odds of the event, not the probability
#Methods of Evaluating Logistic Regression Model: pseudo-R^2
#R^2 = 1 - RSS/TSS
#pseudR^2 = 1 - deviance/null.deviance
#Deviance: analogous to variance (RSS)
#Null Deviance: Similar to TSS
#Pseudo R^2: Deviance explained
#Pseudo-R^2 on Training Data
#Using broom::glance()
glance(model) %>% summarize(pR2 = 1 - deviance/null.deviance)
## pseudoR2
## 1 0.5922402
#Using sigr::wrapChiSqTest()
wrapChiSqTest(model)
## "... pseudo-R2=0.59 ..."
#Pseudo R^2 on Test Data
test %>% mutate(pred = predict(model, newdata = test, type = "response")) %>%
wrapChiSqTest("predColName", "trueValuesColName of 0 or 1", TRUE) #the "TRUE" is the target value/target event
#GainCurvePlot great for logistic regression
GainCurvePlot(test, "pred", "has_dmd", "DMd model on test")
########################################################################
#EX. Logistic Regresssion on Sparrow Survival Probability
#predict status (survived vs perished) based on toatl_length, weight, and humerus
# sparrow is in the workspace
summary(sparrow)
# Create the survived column based on values TRUE/FALSE rather than Survived/Perished
sparrow$survived <- sparrow$status == "Survived"
# Create the formula
(fmla <- as.formula("survived ~ total_length + weight + humerus"))
# Fit the logistic regression model
sparrow_model <- glm(fmla, data = sparrow, family = binomial(link = "logit"))
# Call summary
summary(sparrow_model)
# Call glance
(perf <- glance(sparrow_model))
# Calculate pseudo-R-squared
(pseudoR2 <- 1 - (sparrow_model$deviance)/(sparrow_model$null.deviance))
# Make predictions
sparrow$pred <- predict(sparrow_model, type = "response")
# Look at gain curve
GainCurvePlot(sparrow, "pred", "survived", "sparrow survival model")
########################################################################
#Poisson and Quasipoisson Regression to Predict Counts
########################################################################
#Predicting Counts
#linear regression: predict values in [-infinity, infinity]
#counts: integers in range [0, infinity]
glm(formula, data, family = poisson_or_quasipoisson)
#inputs are additive and linear in log(count)
#outcome: integers
#counts: e.g. number of traffic tickets a driver gets
#rates: e.g. number of website hits/day
#prediction: expected rate or intensity (not integral)
#expected # traffic tickets; expected hits/day
#Poisson vs Quasipoisson
#poisson assumes that mean(y) = var(y) ("the same" as in, should be of a similar order of magnitude***)
#if vary(y) much different from mean(y) - use quasipoisson
#generally required a large sample size
#if rates/counts >> 0 - regular regression is fine
#use same pseudo R^2
#use same predict method as logistic regression, along with type = "response"
#usage of paste function:
paste(outcome, "~", paste(vars, collapse = " + "))
#"cnt ~ hr + holiday + workingday + weathersit + temp + atemp + hum + windspeed"
########################################################################
#EX. Poisson/Quasipoisson
# The outcome column
outcome #"cnt"
# The inputs to use
vars ["hr", "holiday", "workingday", "weathersit", "temp", "atemp", "hum", "windspeed"]
# Create the formula string for bikes rented as a function of the inputs
(fmla <- paste(outcome, "~", paste(vars, collapse = " + ")))
# Calculate the mean and variance of the outcome
(mean_bikes <- mean(bikesJuly$cnt))
(var_bikes <- var(bikesJuly$cnt))
# Fit the model
bike_model <- glm(fmla, data = bikesJuly, family = quasipoisson)
# Call glance
(perf <- glance(bike_model))
# Calculate pseudo-R-squared
(pseudoR2 <- 1 - perf$deviance/perf$null.deviance)
# bikesAugust is in the workspace
str(bikesAugust)
# bike_model is in the workspace
summary(bike_model)
# Make predictions on August data
bikesAugust$pred <- predict(bike_model, newdata = bikesAugust, type = "response")
# Calculate the RMSE (CAN USE RMSE AS LONG AS YOU DON'T TRANSFORM THE OUTPUT RESPONSES)
bikesAugust %>%
mutate(residual = pred - cnt) %>%
summarize(rmse = sqrt(mean(residual^2)))
########################################################################
#EX. Visualize the Bike Rental Predictions as a Function of Time
# Plot predictions and cnt by date/time
bikesAugust %>%
# set start to 0, convert unit to days
mutate(instant = (instant - min(instant))/24) %>%
# gather cnt and pred into a value column
gather(key = valuetype, value = value, cnt, pred) %>%
filter(instant < 14) %>% # restric to first 14 days
# plot value by instant
ggplot(aes(x = instant, y = value, color = valuetype, linetype = valuetype)) +
geom_point() +
geom_line() +
scale_x_continuous("Day", breaks = 0:14, labels = 0:14) +
scale_color_brewer(palette = "Dark2") +
ggtitle("Predicted August bike rentals, Quasipoisson model")
########################################################################
#Generalized Additive Models (GAM) to learn Non-Linear Transformations
########################################################################
#GAM: output depends additively on unknown smooth functions of input variables
#e.g. y ~ b0 + s1(x1) + s2(x2)
#a GAM learns the intercept b0 and functions si's of the input
#Use GAM to learn the relationship: quadratic? cubic? quartic?
mgcv::gam(formula, family, data)
#family:
#gaussian (default): "regular" expression
#binomial: probabilities
#poisson/quassipoisson: counts
#Best for larger datasets, as they are complex and more likely to overfit
#s() function (fits a spline between input and outcome)
anxiety ~ s(hassles)
#to specify that an input has a non-linear relationship with the outcome
#use s() with continuous variables NOT CATEGORICAL******************
#more than about 10 unique values
model <- gam(anxiety ~ s(hassles), data = hassleFrame, family = gaussian)
summary(model)
##
## ...
##
## R-sq.(adj) = 0.619 Deviance explained = 64.1% (pseudo R-squared)
## GCV = 49.132 Scale est. = 45.153 n = 40
#The "deviance explained" reports the model's unadjusted R^2
plot(model) #gives plots of variable transformations that the algorithm learned
#y-values of these plots
predict(model, type = "terms")
#make predictions with model. use as.numeric to convert the matrix from predict(gam) to a vector
as.numeric(predict(model, newdata = hassleFrame, type = "response"))
########################################################################
#EX. GAM Example
# soybean_train is in the workspace
summary(soybean_train)
# Plot weight vs Time (Time on x axis)
ggplot(soybean_train, aes(x = Time, y = weight)) +
geom_point()
# Load the package mgcv
library(mgcv)
# Create the formula
(fmla.gam <- weight ~ s(Time))
# Fit the GAM Model
model.gam <- gam(fmla.gam, data = soybean_train, family = gaussian)
# Call summary() on model.lin and look for R-squared
summary(model.lin)
# Call summary() on model.gam and look for R-squared
summary(model.gam)
# Call plot() on model.gam
plot(model.gam)
# soybean_test is in the workspace
summary(soybean_test)
# Get predictions from linear model
soybean_test$pred.lin <- predict(model.lin, newdata = soybean_test)
# Get predictions from gam model
soybean_test$pred.gam <- as.numeric(predict(model.gam, newdata = soybean_test))
# Gather the predictions into a "long" dataset
soybean_long <- soybean_test %>%
gather(key = modeltype, value = pred, pred.lin, pred.gam)
# Calculate the rmse
soybean_long %>%
mutate(residual = weight - pred) %>% # residuals
group_by(modeltype) %>% # group by modeltype
summarize(rmse = sqrt(mean(residual^2))) # calculate the RMSE
# Compare the predictions against actual weights on the test data
soybean_long %>%
ggplot(aes(x = Time)) + # the column for the x axis
geom_point(aes(y = weight)) + # the y-column for the scatterplot
geom_point(aes(y = pred, color = modeltype)) + # the y-column for the point-and-line plot
geom_line(aes(y = pred, color = modeltype, linetype = modeltype)) + # the y-column for the point-and-line plot
scale_color_brewer(palette = "Dark2")
########################################################################
#Decision Trees
########################################################################
#Rules of form:
#if A and B and C then Y
#non-linear concepts
#intervals
#non-monotonic relationships
#non-additive interactions
#AND: similar to multiplication
#Pros:
#trees have an expressive concept space (lower RMSE)
#Cons:
#coarse-grained predictions (can give six possible values, whereas linear can give continuous value predictions)
#Ensembles of Trees (Several Trees) (Random Forest & Gradient-Boosted Trees)
#ensemble model fits animal intelligence data better than single tree
#ensembles give finer-grained predictions than single trees
########################################################################
#Random Forests
########################################################################
#Multiple diverse decision trees averaged together
#reduces overfit
#increases model expressiveness
#finer grain predictions
#Building a Random Forest Model
#1. Draw bootstrapped sample from training data
#2. For each sample grow a tree
#at each node, pick best variable to split on (from a random subset of all variables)
#continue until tree is grown
#3. To score a datum, evaluate it with all the trees and average the results
#CAN BE USED FOR BOTH CLASSIFICATION AND REGRESSION PROBLEMS
model <- ranger::ranger(formula, trainngData, num.trees = 500, respect.unordered.factors = "order")
#use at least 200 trees
#mtry - number of variables to try at each node
#by default, square root of the total number of variables
#respect.unordered.factors
#Specifies how to treat unordered factor variables. We recommend setting this to "order" for regression.
## Ranger result
## ...
## OOB (OUT OF BAG) estimate, AKA prediction error (MSE): 3103.623
## R squared (OOB): 0.7837386
#NOTE: Random forest algorithm returns estimates of out-of-sample performance
predict(model, test_data)$predictions
#predict() inputs: model and data
#predictions can be accessed in the element $predictions
########################################################################
#EX. Random Forests
# bikesJuly is in the workspace
str(bikesJuly)
# Random seed to reproduce results
seed
# The outcome column
(outcome <- "cnt")
# The input variables
(vars <- c("hr", "holiday", "workingday", "weathersit", "temp", "atemp", "hum", "windspeed"))
# Create the formula string for bikes rented as a function of the inputs
(fmla <- paste(outcome, "~", paste(vars, collapse = " + ")))
# Load the package ranger
library(ranger)
# Fit and print the random forest model
(bike_model_rf <- ranger(fmla, # formula
bikesJuly, # data
num.trees = 500,
respect.unordered.factors = "order",
seed = seed))
# bikesAugust is in the workspace
str(bikesAugust)
# bike_model_rf is in the workspace
bike_model_rf
# Make predictions on the August data
bikesAugust$pred <- predict(bike_model_rf, newdata = bikesAugust)$predictions
# Calculate the RMSE of the predictions
bikesAugust %>%
mutate(residual = pred - cnt) %>% # calculate the residual
summarize(rmse = sqrt(mean(residual^2))) # calculate rmse
# Plot actual outcome vs predictions (predictions on x-axis)
ggplot(bikesAugust, aes(x = pred, y = cnt)) +
geom_point() +
geom_abline()
########################################################################
#Categorical Inputs (One-hot encoding) MANUALLY
########################################################################
#Most R functions manage the conversion for you i.e. model.matrix as we learned before
#however, some do not e.g xgboost()
#must convert categorical variables to numeric representation
#conversion to indicators: one-hot encoding
#vtreat to one-hot encode categorical variables (by transforming it into indicator variables
#coded "lev" & to clean bad values out of numerical variables coded "clean")
#Basic Idea:
#designTreatmentsZ() to design a treatment plan from the training data
#prepare() to create "clean" data
#all numerical
#no missing values
#use prepare() with treatment plan for all future data
#DESIGN TREATMENT PLAN
vars <- c("x", "u")
treatplan <- designTreatmentsZ(dframe, varslist, verbose = FALSE)
#varslist: list of input variable names as strings
#Set verbose to FALSE to suppress progress messages
#designTreatmentsZ has property scoreFrame that describe the var-mapping & the types of each new variable.
scoreFrame <- treatplan %>%
magrittr::use_series(scoreFrame) %>%
select(varName, origName, code)
newvars <- scoreFrame %>%
filter(code %in% c("clean", "lev")) %>%
magrittr::use_series(varName)
#PREPARE
training.treat <- prepare(treatmentplan, dframe, varRestriction = newvars)
#inputs to prepare():
#treatmentplan: treatment plan
#dframe: data frame
#varestriction: list of variables to prepare (optional)
#default: prepare all variables
########################################################################
#EX. Categorical Inputs
# dframe is in the workspace
dframe
# Create and print a vector of variable names
(vars <- c("color", "size"))
# Load the package vtreat
library(vtreat)
# Create the treatment plan
treatplan <- designTreatmentsZ(dframe, vars)
# Examine the scoreFrame
(scoreFrame <- treatplan %>%
use_series(scoreFrame) %>%
select(varName, origName, code))
# We only want the rows with codes "clean" or "lev"
(newvars <- scoreFrame %>%
filter(code %in% c("clean", "lev")) %>%
use_series(varName))
# Create the treated training data
(dframe.treat <- prepare(treatplan, dframe, varRestriction = newvars))
########################################################################
#EX. Novel Levels that e.g. model.matrix can't handle but vtreat can
# newvars is in the workspace
newvars
# Print dframe and testframe
dframe #has 'r', 'g', and 'b'
testframe #has 'y'
# Use prepare() to one-hot-encode testframe
(testframe.treat <- prepare(treatplan, testframe, varRestriction = newvars))
########################################################################
#Gradient Boosting
########################################################################
#ensemble method that builds up a model by incrementally improving the existing one
#1. Fit a shallow tree T1 to the data. M1 = T1
#2. Fit a tree T2 to the residuals. Find gamma such that M2 = M1 + gamma*T2 is the best fit to data.
#For regularized boosting, decrease learning by a factor eta in range (0,1).
#M2 = M1 + eta*gamma*T2. A larger eta = faster learning, smaller eta = less risk of overfit
#3. Repeat (2) until stopping condition is met.
#Final Model: M = M1 + eta*sum(gamma_i * Ti)
#Because gradient boosting optimizes error on the training data, it's very easy to overfit the model.
#best practice is to estimate out of sample error via cross validation for each incremental model
#then retroactively decide how many trees to use
#xgboost()
#1. Run xgb.cv() with a large number of rounds (trees). Fits the model & calculates cross-validated error as well.
#2. xgb.cv()$evaluation_log: records estimated RMSE for each round.
#find the number of trees that minimizes estimated RMSE: n_best
#3. Run xgboost(), setting nrounds = n_best
#EX.
treatplan <- designTreatmentsZ(bikesJan, vars)
newvars <- treatplan$scoreFrame %>%
filter(code %in% c("clean", "lev")) %>%
use_series(varName)
bikesJan.treat <- prepare(treatplan, bikesJan, varRestriction = newvars)
cv <- xgb.cv(data = as.matrix(bikesJan.treat), #a numeric matrix
label = bikesJan$cnt, #vector of outcomes (numeric)
objective = "reg:linear",
nrounds = 100, nfold = 5, eta = 0.3, max_depth = 6)
#Key inputs to xgb.cv() and xgboost()
#data: input data as matrix; label: outcome
#objective: for regression - "reg:linear"
#nrounds: maximum number of trees to fit
#eta: learning rate
#max_depth: maximum depth of individual trees
#nfold: (xgb.cv() only): number of folds for cross validation
#early_stopping_rounds: after this many rounds without improvement, stop.
elog <- as.data.frame(cv$evaluation_log)
nrounds <- which.min(elog$test_rmse_mean) #out of sample error
## 78 #the index of the minimum value corresponds to best number of trees
#run cgboost() with right number of trees to get final model
nrounds <- 78
model <- xgboost(data = as.matrix(bikesJan.treat),
label = bikesJan$cnt,
nrounds = nrounds,
objective = "reg:linear",
eta = 0.3,
depth = 6)
#Predict with xgboost() model
bikesFeb.treat <- prepare(treatplan, bikesFeb, varRestriction = newvars)
bikesFeb$pred <- predict(model, as.matrix(bikesFeb.treat))
########################################################################
#EX. Gradient Boosting Machines
# bikesJuly & bikesJuly.treat are in the workspace
# Load the package xgboost
library(xgboost)
# Run xgb.cv
cv <- xgb.cv(data = as.matrix(bikesJuly.treat),
label = bikesJuly$cnt,
nrounds = 100,
nfold = 5,
objective = "reg:linear",
eta = 0.3,
max_depth = 6,
early_stopping_rounds = 10,
verbose = 0 # silent
)
# Get the evaluation log
elog <- as.data.frame(cv$evaluation_log)
# Determine and print how many trees minimize training and test error
elog %>%
summarize(ntrees.train = which.min(train_rmse_mean), # find the index of min(train_rmse_mean)
ntrees.test = which.min(test_rmse_mean)) # find the index of min(test_rmse_mean)
#ntrees.train = 67, ntrees.test = 57
#In most cases, ntrees.test is less than ntrees.train. The training error keeps decreasing
#even after the test error starts to increase. It's important to use cross-validation to find
#the right number of trees (as determined by ntrees.test) and avoid an overfit model.
# Examine the workspace
ls()
# The number of trees to use, as determined by xgb.cv
ntrees
# Run xgboost
bike_model_xgb <- xgboost(data = as.matrix(bikesJuly.treat), # training data as matrix
label = bikesJuly$cnt, # column of outcomes
nrounds = ntrees, # number of trees to build
objective = "reg:linear", # objective
eta = 0.3,
depth = 6,
verbose = 0 # silent
)
# Make predictions
bikesAugust$pred <- predict(bike_model_xgb, as.matrix(bikesAugust.treat))
# Plot predictions (on x axis) vs actual bike rental count
ggplot(bikesAugust, aes(x = pred, y = cnt)) +
geom_point() +
geom_abline()
#Sometimes it might make negative prediction. However if you look at RMSE it will be smaller than other models.
#perhaps rounding negative predictions up to zero is a reasonable tradeoff
########################################################################
#RIDGE AND LASSO
########################################################################
#REFERENCE: https://www.r-bloggers.com/ridge-regression-and-the-lasso/
swiss <- datasets::swiss
#remove the Fertility column and remove the intercept column
x <- model.matrix(Fertility~., data=swiss)[,-1]
y <- swiss$Fertility
lambda <- 10^seq(10, -2, length=100)
#When lambda = 0, we get same coefficents as OLS model
#create test and training set
library(glmnet)
#OLS
swisslm <- lm(Fertility~., data = swiss[train,])
s.pred <- predict(swisslm, newdata = swiss[test,])
#Ridge (alpha = 0)
cv.out <- cv.glmnet(x[train,], y[train], alpha = 0, lambda = vectorOfPossibleLambdas)
bestlamRidge <- cv.out$lambda.min
ridge.mod <- glmnet(x[train,], y[train], alpha = 0, lambda = bestlamRidge)
ridge.pred <- predict(ridge.mod, s = bestlamRidge, newx = x[test,])
#Lasso (alpha = 1)
cv.out <- cv.glmnet(x[train,], y[train], alpha = 1, lambda = vectorOfPossibleLambdas)
bestlamLasso <- cv.out$lambda.min
lasso.mod <- glmnet(x[train,], y[train], alpha = 1, lambda = bestlamLasso)
lasso.pred <- predict(lasso.mod, s = bestlamLasso, newx = x[test,])
#Check RMSE LINEAR
sqrt(mean((s.pred-ytest)^2))
#Check RMSE RIDGE
sqrt(mean((ridge.pred - ytest)^2))
#Check RMSE LASSO
sqrt(mean((lasso.pred-ytest)^2))
########################################################################
#REFERENCE:https://www.youtube.com/watch?v=fAPCaue8UKQ
########################################################################
#Elastic net: mix of lasso and ridge regression
#Cross-validation to find optimal lambda for lasso regression
CV = cv.glmnet(x=trainX, y=trainY, family="binomial/gaussian", type.measure="deviance AKA mse", alpha = 1, nlambda = 100)
#Following plot shows the models (with different lambda values) that glmnet has fit, along with the error type for each of the models
plot(CV)
#Fit model
fit = glmnet(x=trainX, y=trainY, family = "binomial", alpha=1, lambda=CV$lambda.1se)
#???lambda.1se vs lambda.min???
# This is a question about parsimony. The lambda.min option refers to value of λ at the lowest CV error.
# The error at this value of λ is the average of the errors over the k folds and hence this estimate of the error is uncertain.
# The lambda.1se represents the value of λ in the search that was simpler than the best model (lambda.min), but which has error
# within 1 standard error of the best model. In other words, using the value of lambda.1se as the selected value for λ results
# in a model that is slightly simpler than the best model but which cannot be distinguished from the best model in terms of error
# given the uncertainty in the kk-fold CV estimate of the error of the best model.
#
# The choice is yours:
#
# The best model that may be too complex or slightly overfitted: lambda.min
# The simplest model that has comparable error to the best model given the uncertainty: lambda.1se
#List nonzero coefficients:
fit$beta[,1]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment