Created
January 13, 2021 23:42
-
-
Save tejseth/50ef14c04e9a3663045701171310592c to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| library(tidyverse) | |
| library(xgboost) | |
| library(magrittr) | |
| library(dplyr) | |
| library(Matrix) | |
| library(na.tools) | |
| library(ggimage) | |
| library(nflfastR) | |
| library(gt) | |
| library(mgcv) | |
| library(scales) | |
| library(ggforce) | |
| library(remotes) | |
| library(ggtext) | |
| source("https://raw.githubusercontent.com/mrcaseb/nflfastR/master/R/helper_add_nflscrapr_mutations.R") | |
| source("https://raw.githubusercontent.com/mrcaseb/nflfastR/master/R/helper_add_ep_wp.R") | |
| source("https://raw.githubusercontent.com/mrcaseb/nflfastR/master/R/helper_add_cp_cpoe.R") | |
| seasons <- 2010:2020 | |
| pbp <- purrr::map_df(seasons, function(x) { | |
| readRDS( | |
| url( | |
| glue::glue("https://raw.githubusercontent.com/guga31bb/nflfastR-data/master/data/play_by_play_{x}.rds") | |
| ) | |
| ) | |
| }) | |
| pbp_rp <- pbp %>% | |
| filter(!is_na(epa), play_type=="no_play" | play_type=="pass" | play_type=="run") | |
| pbp_rp <- pbp_rp %>% | |
| mutate( | |
| pass = if_else(str_detect(desc, "( pass)|(sacked)|(scramble)"), 1, 0), | |
| rush = if_else(str_detect(desc, "(left end)|(left tackle)|(left guard)|(up the middle)|(right guard)|(right tackle)|(right end)") & pass == 0, 1, 0), | |
| success = ifelse(epa>0, 1 , 0) | |
| ) | |
| pbp_rp <- pbp_rp %>% filter(pass==1 | rush==1) | |
| pbp_rp <- pbp_rp %>% | |
| mutate(season = substr(old_game_id, 1, 4)) | |
| pbp_rp <- pbp_rp %>% | |
| mutate( | |
| posteam = case_when( | |
| posteam == 'OAK' ~ 'LV', | |
| posteam == 'SD' ~ 'LAC', | |
| posteam == 'STL' ~ 'LA', | |
| TRUE ~ posteam | |
| ) | |
| ) | |
| pbp_rp <- pbp_rp %>% | |
| mutate( | |
| defteam = case_when( | |
| defteam == 'OAK' ~ 'LV', | |
| defteam == 'SD' ~ 'LAC', | |
| defteam == 'STL' ~ 'LA', | |
| TRUE ~ defteam | |
| ) | |
| ) | |
| rush_attempts <- pbp_rp %>% | |
| filter(rush_attempt == 1, qb_scramble == 0, qb_dropback == 0) | |
| rush_attempts %>% | |
| group_by(season, defteam) %>% | |
| summarize(def_ypc = mean(yards_gained), | |
| count = n()) %>% | |
| filter(count >= 100) %>% | |
| select(-count) -> def_ypc | |
| rush_attempts <- rush_attempts %>% | |
| left_join(def_ypc, by = c("season", "defteam")) | |
| rush_attempts2 <- rush_attempts %>% | |
| mutate(yards_rushed = case_when(yards_gained > 20 ~ 20L, | |
| yards_gained < -5 ~ -5L, | |
| TRUE ~ as.integer(yards_gained)), | |
| label = yards_rushed + 5L) | |
| rush_attempts3 <- rush_attempts2 %>% | |
| mutate(run_left_end = if_else((run_gap == "end" & run_location == "left"), 1, 0), | |
| run_left_guard = if_else((run_gap == "guard" & run_location == "left"), 1, 0), | |
| run_left_tackle = if_else((run_gap == "tackle" & run_location == "left"), 1, 0), | |
| run_right_end = if_else((run_gap == "end" & run_location == "right"), 1, 0), | |
| run_right_guard = if_else((run_gap == "guard" & run_location == "right"), 1, 0), | |
| run_right_tackle = if_else((run_gap == "tackle" & run_location == "right"), 1, 0), | |
| run_middle = if_else((run_location == "middle"), 1, 0)) | |
| rush_attempts4 <- rush_attempts3 %>% | |
| select(yardline_100, quarter_seconds_remaining, half_seconds_remaining, | |
| game_seconds_remaining, qtr, down, goal_to_go, ydstogo, shotgun, no_huddle, | |
| no_score_prob, ep, wp, def_ypc, label) %>% | |
| filter(!is.na(label)) %>% | |
| filter(!is.na(down)) | |
| nrounds <- 100 | |
| params <- | |
| list( | |
| booster = "gbtree", | |
| objective = "multi:softprob", | |
| eval_metric = c("mlogloss"), | |
| num_class = 26, | |
| eta = .025, | |
| gamma = 2, | |
| subsample=0.8, | |
| colsample_bytree=0.8, | |
| max_depth = 4, | |
| min_child_weight = 1 | |
| ) | |
| smp_size <- floor(0.80 * nrow(rush_attempts4)) | |
| set.seed(123) | |
| ind <- sample(seq_len(nrow(rush_attempts4)), size = smp_size) | |
| ind_train <- rush_attempts4[ind, ] | |
| ind_test <- rush_attempts4[-ind, ] | |
| full_train <- xgboost::xgb.DMatrix(as.matrix(ind_train %>% select(-label)), label = as.integer(ind_train$label)) | |
| ryoe_model <- xgboost::xgboost(params = params, data = full_train, nrounds = nrounds, verbose = 2) | |
| imp <- xgb.importance(colnames(ind_train), model = ryoe_model) | |
| xgb.plot.importance(imp) | |
| rushes_2020 <- rush_attempts3 %>% | |
| filter(season == 2020) %>% | |
| select(yardline_100, quarter_seconds_remaining, half_seconds_remaining, | |
| game_seconds_remaining, qtr, down, goal_to_go, ydstogo, shotgun, no_huddle, | |
| no_score_prob, ep, wp, def_ypc) %>% | |
| mutate(index = 1:n()) | |
| ryoe_2020 <- stats::predict(ryoe_model, | |
| as.matrix(rushes_2020 %>% | |
| select(yardline_100, quarter_seconds_remaining, half_seconds_remaining, | |
| game_seconds_remaining, qtr, down, goal_to_go, ydstogo, shotgun, no_huddle, | |
| no_score_prob, ep, wp, def_ypc))) %>% | |
| tibble::as_tibble() %>% | |
| dplyr::rename(prob = "value") %>% | |
| dplyr::bind_cols(purrr::map_dfr(seq_along(rushes_2020$index), function(x) { | |
| tibble::tibble("xyds_rushed" = -5:20, | |
| "down" = rushes_2020$down[[x]], | |
| "yardline_100" = rushes_2020$yardline_100[[x]], | |
| "quarter_seconds_remaining" = rushes_2020$quarter_seconds_remaining[[x]], | |
| "half_seconds_remaining" = rushes_2020$half_seconds_remaining[[x]], | |
| "game_seconds_remaining" = rushes_2020$game_seconds_remaining[[x]], | |
| "qtr" = rushes_2020$qtr[[x]], | |
| "goal_to_go" = rushes_2020$goal_to_go[[x]], | |
| "ydstogo" = rushes_2020$ydstogo[[x]], | |
| "shotgun" = rushes_2020$shotgun[[x]], | |
| "no_huddle" = rushes_2020$no_huddle[[x]], | |
| "no_score_prob" = rushes_2020$no_score_prob[[x]], | |
| "ep" = rushes_2020$ep[[x]], | |
| "wp" = rushes_2020$wp[[x]], | |
| "index" = rushes_2020$index[[x]]) | |
| })) %>% | |
| dplyr::group_by(.data$index) %>% | |
| dplyr::mutate(max_loss = dplyr::if_else(.data$yardline_100 < 95, -5L, as.integer(.data$yardline_100 - 99L)), | |
| max_gain = dplyr::if_else(.data$yardline_100 > 20, 20L, as.integer(.data$yardline_100)), | |
| cum_prob = cumsum(.data$prob), | |
| prob = dplyr::case_when(.data$xyds_rushed == .data$max_loss ~ .data$prob, | |
| .data$xyds_rushed == .data$max_gain ~ 1 - dplyr::lag(.data$cum_prob, 1), | |
| TRUE ~ .data$prob), | |
| yardline_100 = .data$yardline_100 - .data$xyds_rushed) %>% | |
| dplyr::filter(.data$xyds_rushed >= .data$max_loss, .data$xyds_rushed <= .data$max_gain) %>% | |
| dplyr::select(-.data$cum_prob) %>% | |
| dplyr::summarise(x_rush_yards = sum(.data$prob * .data$xyds_rushed)) %>% | |
| ungroup() | |
| rushes_2020_2 <- rushes_2020 %>% | |
| inner_join(ryoe_2020) | |
| pbp_2020 <- pbp_rp %>% | |
| inner_join(rushes_2020_2) %>% | |
| select(posteam, defteam, rusher_player_name, yards_gained, x_rush_yards, epa) %>% | |
| mutate(ryoe = yards_gained - x_rush_yards) | |
| rushers_2020 <- pbp_2020 %>% | |
| filter(!is.na(rusher_player_name)) %>% | |
| group_by(rusher_player_name, posteam) %>% | |
| summarize(rushes = n(), | |
| sum_ryoe = sum(ryoe, na.rm = T), | |
| avg_ryoe = mean(ryoe, na.rm =T), | |
| mean_epa = mean(epa, na.rm = T)) %>% | |
| filter(rushes > 107) %>% | |
| arrange(desc(avg_ryoe)) | |
| ############################################################################ | |
| rushers_2020 <- rushers_2020 %>% | |
| left_join(teams_colors_logos, by = c('posteam' = 'team_abbr')) | |
| scatter_plot <- rushers_2020 %>% | |
| ggplot() + | |
| geom_smooth(aes(x = mean_epa, y = avg_ryoe), method = "lm", color = "grey") + | |
| ggrepel::geom_text_repel( | |
| aes(x = mean_epa, y = avg_ryoe, label = rusher_player_name), | |
| box.padding = 0.3, size = 5 | |
| ) + | |
| geom_point( | |
| aes(x = mean_epa, y = avg_ryoe, size = rushes, fill = team_color, color = team_color2), | |
| shape = 21 | |
| ) + | |
| scale_color_identity(aesthetics = c("fill", "color")) + | |
| scale_size(name = "Designed Rushes") + | |
| theme_minimal() + | |
| scale_x_continuous(breaks = scales::pretty_breaks(n = 10)) + | |
| scale_y_continuous(breaks = scales::pretty_breaks(n = 10)) + | |
| labs(x = "EPA Per Rush", | |
| y = "Rushing Yards Over Expected Per Rush", | |
| title = "RYOE and EPA Are Correlated", | |
| subtitle = "RYOE is a xgboost model, min. of 105 designed rushes", | |
| caption = "By Tej Seth | @mfbanalytics | @deceptivespeed_") + | |
| theme( | |
| panel.grid.minor = element_blank(), | |
| plot.title = element_text(face = "bold", size = 20, hjust = 0.5), | |
| plot.subtitle = element_text(size = 10, hjust = 0.5), | |
| axis.text = element_text(size = 14), | |
| axis.title.y = element_text(size = 14) | |
| ) | |
| scatter_plot | |
| ggsave( | |
| "ryoe-1.png", scatter_plot, | |
| height = 10, width = 16, dpi = "retina" | |
| ) | |
| rusher_faces<- pbp_2020 %>% | |
| filter(!is.na(rusher_player_name)) %>% | |
| group_by(rusher_player_name, posteam) %>% | |
| summarize(rushes = n(), | |
| sum_ryoe = sum(ryoe, na.rm = T), | |
| avg_ryoe = mean(ryoe, na.rm =T), | |
| mean_epa = mean(epa, na.rm = T)) %>% | |
| filter(rushes > 75) %>% | |
| arrange(desc(avg_ryoe)) | |
| write.csv(rusher_faces, "rusher_faces.csv") | |
| rusher_faces <- read.csv("~/RYOE/rusher_faces.csv") | |
| tab_data <- rusher_faces %>% | |
| mutate(RK = as.integer(rank)) %>% | |
| select(RK, rusher, headshot, mean_epa, avg_ryoe) %>% | |
| arrange(RK) | |
| tab_function <- function(data, ...){ | |
| data %>% | |
| gt() %>% | |
| text_transform( | |
| locations = cells_body(vars(headshot)), | |
| fn = function(x){ | |
| web_image( | |
| url = x, | |
| height = px(25) | |
| ) | |
| } | |
| ) %>% | |
| cols_label( | |
| RK = "Rank", | |
| rusher = "Rusher", | |
| headshot = "", | |
| mean_epa = "EPA/Rush", | |
| avg_ryoe = "RYOE") %>% | |
| data_color( | |
| columns = vars(avg_ryoe), | |
| colors = scales::col_numeric( | |
| palette = c("#af8dc3", "#f7f7f7", "#7fbf7b"), | |
| domain = c(-3, 2) | |
| ) | |
| ) %>% | |
| tab_style( | |
| style = cell_text(weight = "bold"), | |
| locations = cells_body( | |
| columns = vars(RK, rusher) | |
| ) | |
| ) %>% | |
| tab_header( | |
| title = "Top 30 RYOE Rushers", | |
| subtitle = "RYOE = Rushing Yards Over Expected" | |
| ) %>% | |
| tab_options( | |
| column_labels.background.color = "white", | |
| column_labels.font.weight = "bold", | |
| table.border.top.width = px(3), | |
| table.border.top.color = "transparent", | |
| table.border.bottom.color = "transparent", | |
| table.border.bottom.width = px(3), | |
| column_labels.border.top.width = px(3), | |
| column_labels.border.top.color = "transparent", | |
| column_labels.border.bottom.width = px(3), | |
| column_labels.border.bottom.color = "black", | |
| data_row.padding = px(3), | |
| source_notes.font.size = 12, | |
| table.font.size = 16, | |
| heading.align = "middle", | |
| ... | |
| ) %>% | |
| opt_table_font( | |
| font = list( | |
| default_fonts() | |
| ) | |
| ) | |
| } | |
| gt_tab1 <- tab_data %>% | |
| filter(RK < 31) %>% | |
| tab_function() | |
| gt_tab1 | |
| gtsave(gt_tab1, "gt-tab1.png") | |
| gt_tab2 <- tab_data %>% | |
| filter(RK >= 31) %>% | |
| tab_function() %>% | |
| tab_header( | |
| title = "Rushers 31-60", | |
| subtitle = "RYOE is a xgboost model made by Tej Seth (@mfbanalytics)" | |
| ) %>% | |
| tab_style( | |
| style = cell_borders( | |
| sides = "left", | |
| color = "black", | |
| weight = px(3) | |
| ), | |
| locations = | |
| list( | |
| cells_body( | |
| columns = 1 | |
| ), | |
| cells_column_labels(1) | |
| ) | |
| ) | |
| gt_tab2 | |
| gtsave(gt_tab2, "gt-tab2.png") | |
| img1 <- magick::image_read("gt-tab1.png") | |
| img2 <- magick::image_read("gt-tab2.png") | |
| img3 <- magick::image_append(c(img1, img2)) | |
| img3 | |
| ggsave(img3, "ryoe-2.png") | |
| rusher_faces2 <- read.csv("~/RYOE/rusher_faces.csv") | |
| rusher_faces2 <- rusher_faces2 %>% | |
| left_join(teams_colors_logos, by = c('posteam' = 'team_abbr')) | |
| rusher_faces2 <- rusher_faces2 %>% | |
| arrange(rank) | |
| teams_xp <- pbp_2020 %>% | |
| group_by(posteam) %>% | |
| summarize(team_avg = mean(x_rush_yards, na.rm = T)) | |
| rusher_faces2 <- rusher_faces2 %>% | |
| left_join(teams_xp) | |
| rusher_faces2 <- rusher_faces2 %>% | |
| mutate(yards_gained = avg_ryoe + team_avg, | |
| new_rank = row_number()) | |
| rusher_faces2 %>% | |
| ggplot() + | |
| geom_link( | |
| mapping = aes(x = team_avg, y = new_rank, xend = yards_gained, yend = new_rank, size = 2, color = team_color) | |
| ) + | |
| theme_bw() + | |
| scale_colour_identity() + | |
| geom_image(aes(x = team_avg, y = new_rank, image = team_logo_espn), size = 0.04, asp = 16/9) + | |
| geom_image(aes(x = yards_gained, y = new_rank, image = headshot), size = 0.04, asp = 16/9) + | |
| labs( | |
| x = "Rushing Yards Average", | |
| y = "", | |
| title = "Each Team's Most Used Rusher's RYOE", | |
| subtitle = "RYOE = Rushing Yards Over Expected, if a player's face is to the right of their logo they have a positive RYOE", | |
| caption = "By Tej Seth | @mfbanalytics | @deceptivespeed_" | |
| ) + | |
| theme( | |
| plot.title = element_markdown(hjust = 0.5, size = 20, face = "bold"), | |
| plot.subtitle = element_markdown(hjust = 0.5, size = 12), | |
| axis.title.y=element_blank(), | |
| axis.text.y=element_blank(), | |
| axis.ticks.y=element_blank(), | |
| legend.position = "none", | |
| panel.grid.major.y = element_blank(), | |
| panel.grid.minor.y = element_blank(), | |
| panel.background = element_blank(), | |
| panel.border= element_blank() | |
| ) + | |
| scale_x_continuous(breaks = scales::pretty_breaks(n = 10)) + | |
| scale_y_reverse(breaks = scales::pretty_breaks(n = 10)) | |
| ggsave('ryoe-3.png', dpi=300, height=9*.8, width=16*.8) | |
| top_rushers <- pbp_2020 %>% | |
| filter(!is.na(rusher_player_name)) %>% | |
| group_by(rusher_player_name) %>% | |
| summarize(rushes = n(), | |
| sum_ryoe = sum(ryoe, na.rm = T)) %>% | |
| filter(rushes > 107) %>% | |
| arrange(desc(sum_ryoe)) %>% | |
| filter(sum_ryoe > 0) | |
| top_rushers <- top_rushers %>% | |
| left_join(rusher_faces) | |
| top_rushers <- top_rushers %>% | |
| left_join(teams_colors_logos, by = c("posteam" = "team_abbr")) %>% | |
| filter(!is.na(rank)) %>% | |
| mutate(rank = row_number()) | |
| link_to_img <- function(x, width = 50) { | |
| glue::glue("<img src='{x}' width='{width}'/>") | |
| } | |
| bar_plot <- top_rushers %>% | |
| mutate(label = link_to_img(headshot), | |
| rank = as.integer(rank)) %>% | |
| ggplot() + | |
| geom_col( | |
| aes( | |
| x = rank, y = sum_ryoe, | |
| fill = team_color, color = team_color2 | |
| ), | |
| width = 0.4 | |
| ) + | |
| geom_image(aes(x = rank, y = sum_ryoe + 5 , image = headshot), asp = 16/9, size = 0.06) + | |
| scale_color_identity(aesthetics = c("fill", "color")) + | |
| theme_minimal() + | |
| scale_x_continuous(breaks = scales::pretty_breaks(n = 10)) + | |
| scale_y_continuous(breaks = scales::pretty_breaks(n = 10)) + | |
| labs(x = NULL, | |
| y = "Total RYOE\n", | |
| title = "The Running Backs 10 in Rushing Yards Over Expected", | |
| caption = "By Tej Seth | @mfbanalytics") + | |
| theme( | |
| panel.grid.minor = element_blank(), | |
| plot.title = element_text(face = "bold", size = 20, hjust = 0.5), | |
| plot.subtitle = element_text(size = 10, hjust = 0.5), | |
| axis.text = element_text(size = 14, face = "bold"), | |
| axis.title.y = element_text(size = 16, face = "bold") | |
| ) | |
| bar_plot | |
| ggsave( | |
| "ryoe-4.png", bar_plot, | |
| height = 10, width = 16, dpi = "retina" | |
| ) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment