Created
November 19, 2020 01:20
-
-
Save eamcvey/540191bda4f6fa4f09db44492d04975c to your computer and use it in GitHub Desktop.
reprex_string_parsing_speed
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| ``` r | |
| # Setup ------------------------------------------------------------------- | |
| library(tidyverse) | |
| library(reprex) | |
| library(tictoc) | |
| #> Warning: package 'tictoc' was built under R version 4.0.2 | |
| ## Optionally, for furrr approach | |
| library(furrr) | |
| #> Warning: package 'furrr' was built under R version 4.0.2 | |
| #> Loading required package: future | |
| #> Warning: package 'future' was built under R version 4.0.2 | |
| future::plan(multisession) | |
| ## | |
| # Test data --------------------------------------------------------------- | |
| # A tiny sample of the log lines | |
| mini_lines <- list("[15:35:52] [Render thread/INFO]: Reloading ResourceManager: Default", | |
| "[15:44:25] [Render thread/INFO]: [CHAT] You don't have Tenn1s's permission to build here.", | |
| "[15:45:23] [Render thread/INFO]: [CHAT] ", "[15:45:45] [Render thread/INFO]: [CHAT] You don't have Olivia193's permission to build here.", | |
| "[15:45:56] [Render thread/INFO]: [CHAT] [Legendz] Vugoe : trix poop", | |
| "[16:29:14] [Render thread/INFO]: [CHAT] You will respawn in 4 seconds!", | |
| "[16:29:17] [Render thread/WARN]: Unknown custom packed identifier: badlion:timers", | |
| "[16:29:18] [Render thread/INFO]: [CHAT] Gooce slipped in BBQ sauce off the edge spilled by Adst3r.", | |
| "[16:29:18] [Render thread/INFO]: [CHAT] You have respawned!", | |
| "[16:29:19] [Render thread/INFO]: [CHAT] ") | |
| # Parsing function -------------------------------------------------------- | |
| # Extract the timestamp from a log line | |
| # If the line starts with a timestamp, return it as a character, otherwise return NA | |
| extract_ts <- function(line) { | |
| # logical whether the line starts with an opening square bracket | |
| bracket_start <- str_detect(line, '^\\[') | |
| if(bracket_start) { | |
| open_bracket_pos <- str_locate(line, '\\[') %>% as_tibble() %>% pull(start) | |
| close_bracket_pos <- str_locate(line, '\\]') %>% as_tibble() %>% pull(start) | |
| ts <- str_sub(line, (open_bracket_pos + 1), (close_bracket_pos - 1)) | |
| } else { | |
| ts <- NA | |
| } | |
| return(ts) | |
| } | |
| # Parse the data ---------------------------------------------------------- | |
| # Extract the timestamp portion of the log lines | |
| all_ts <- purrr::map_chr(mini_lines, ~extract_ts(.x)) | |
| # Now do this on a larger list of lines to demonstrate processing time | |
| lotta_lines <- rep(mini_lines, times = 1000) | |
| # Takes about 12 seconds on my machine | |
| tic() | |
| lotta_ts <- purrr::map_chr(lotta_lines, ~extract_ts(.x)) | |
| toc() | |
| #> 11.274 sec elapsed | |
| # Try furrr --------------------------------------------------------------- | |
| # Takes about 4 seconds on my machine | |
| tic() | |
| lotta_ts <- furrr::future_map(lotta_lines, ~extract_ts(.x)) | |
| toc() | |
| #> 4.262 sec elapsed | |
| ``` | |
| <sup>Created on 2020-11-18 by the [reprex package](https://reprex.tidyverse.org) (v0.3.0)</sup> |
With a well-formed regular expression, stringi will parse the 10 million timestamps in less than four seconds:
library(stringi)
library(tictoc)
# A tiny sample of the log lines
mini_lines <- list("[15:35:52] [Render thread/INFO]: Reloading ResourceManager: Default",
"[15:44:25] [Render thread/INFO]: [CHAT] You don't have Tenn1s's permission to build here.",
"[15:45:23] [Render thread/INFO]: [CHAT] ", "[15:45:45] [Render thread/INFO]: [CHAT] You don't have Olivia193's permission to build here.",
"[15:45:56] [Render thread/INFO]: [CHAT] [Legendz] Vugoe : trix poop",
"[16:29:14] [Render thread/INFO]: [CHAT] You will respawn in 4 seconds!",
"[16:29:17] [Render thread/WARN]: Unknown custom packed identifier: badlion:timers",
"[16:29:18] [Render thread/INFO]: [CHAT] Gooce slipped in BBQ sauce off the edge spilled by Adst3r.",
"[16:29:18] [Render thread/INFO]: [CHAT] You have respawned!",
"[16:29:19] [Render thread/INFO]: [CHAT] ")
ts <- stri_extract_first_regex(mini_lines, "\\d\\d:\\d\\d:\\d\\d")
ts
#> [1] "15:35:52" "15:44:25" "15:45:23" "15:45:45" "15:45:56" "16:29:14"
#> [7] "16:29:17" "16:29:18" "16:29:18" "16:29:19"
lotta_lines <- rep(mini_lines, times = 1000)
tic()
lotta_ts <- stri_extract_first_regex(lotta_lines, "\\d\\d:\\d\\d:\\d\\d")
toc()
#> 0.005 sec elapsed
lotta_lines <- rep(mini_lines, times = 1000000)
length(lotta_lines)
#> [1] 10000000
tic()
lotta_ts <- stri_extract_first_regex(lotta_lines, "\\d\\d:\\d\\d:\\d\\d")
toc()
#> 3.742 sec elapsedCreated on 2020-11-21 by the reprex package (v0.3.0)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Created on 2020-11-20 by the reprex package (v0.3.0)