Skip to content

Instantly share code, notes, and snippets.

@eamcvey
Created November 19, 2020 01:20
Show Gist options
  • Select an option

  • Save eamcvey/540191bda4f6fa4f09db44492d04975c to your computer and use it in GitHub Desktop.

Select an option

Save eamcvey/540191bda4f6fa4f09db44492d04975c to your computer and use it in GitHub Desktop.
reprex_string_parsing_speed
``` r
# Setup -------------------------------------------------------------------
library(tidyverse)
library(reprex)
library(tictoc)
#> Warning: package 'tictoc' was built under R version 4.0.2
## Optionally, for furrr approach
library(furrr)
#> Warning: package 'furrr' was built under R version 4.0.2
#> Loading required package: future
#> Warning: package 'future' was built under R version 4.0.2
future::plan(multisession)
##
# Test data ---------------------------------------------------------------
# A tiny sample of the log lines
# One log line per element. Note that two entries end in "[CHAT] " with no
# message text after it.
mini_lines <- list(
  "[15:35:52] [Render thread/INFO]: Reloading ResourceManager: Default",
  "[15:44:25] [Render thread/INFO]: [CHAT] You don't have Tenn1s's permission to build here.",
  "[15:45:23] [Render thread/INFO]: [CHAT] ",
  "[15:45:45] [Render thread/INFO]: [CHAT] You don't have Olivia193's permission to build here.",
  "[15:45:56] [Render thread/INFO]: [CHAT] [Legendz] Vugoe : trix poop",
  "[16:29:14] [Render thread/INFO]: [CHAT] You will respawn in 4 seconds!",
  "[16:29:17] [Render thread/WARN]: Unknown custom packed identifier: badlion:timers",
  "[16:29:18] [Render thread/INFO]: [CHAT] Gooce slipped in BBQ sauce off the edge spilled by Adst3r.",
  "[16:29:18] [Render thread/INFO]: [CHAT] You have respawned!",
  "[16:29:19] [Render thread/INFO]: [CHAT] "
)
# Parsing function --------------------------------------------------------
# Extract the leading bracketed field (the timestamp) from log lines.
#
# Vectorised: `line` may be a single string or a whole character vector, so
# the function can be called once on an entire log instead of once per line.
# It still works unchanged when called per line (e.g. from `purrr::map_chr()`).
#
# Args:
#   line: log line(s); coerced with `as.character()`.
# Returns:
#   A character vector the same length as `line` holding the text between the
#   leading "[" and the first "]". Elements that are NA, do not start with
#   "[", or contain no "]" yield NA_character_.
extract_ts <- function(line) {
  line <- as.character(line)
  # Position of the first "]" in each line (-1 when absent, NA for NA input)
  close_bracket_pos <- regexpr("]", line, fixed = TRUE)
  # `!is.na(line)` is FALSE for NA input, which keeps the mask free of NAs
  has_ts <- !is.na(line) & startsWith(line, "[") & close_bracket_pos > 0L
  # Typed NA so the result is always character, whatever the input
  ts <- rep(NA_character_, length(line))
  # The opening bracket is at position 1, so the timestamp starts at 2
  ts[has_ts] <- substr(line[has_ts], 2L, close_bracket_pos[has_ts] - 1L)
  ts
}
# Parse the data ----------------------------------------------------------
# Extract the timestamp portion of the log lines
all_ts <- purrr::map_chr(mini_lines, extract_ts)
# Repeat the sample 1000 times to get a list large enough to time
lotta_lines <- rep(mini_lines, 1000)
# Same per-line mapping, wrapped in a tictoc timer
# (takes about 12 seconds on my machine)
tic()
lotta_ts <- purrr::map_chr(lotta_lines, extract_ts)
toc()
#> 11.274 sec elapsed
# Try furrr ---------------------------------------------------------------
# Takes about 4 seconds on my machine
# `future_map_chr()` (not `future_map()`) so that `lotta_ts` is a character
# vector, the same shape as the `purrr::map_chr()` result it is compared with,
# rather than a list.
tic()
lotta_ts <- furrr::future_map_chr(lotta_lines, ~extract_ts(.x))
toc()
#> 4.262 sec elapsed
```
<sup>Created on 2020-11-18 by the [reprex package](https://reprex.tidyverse.org) (v0.3.0)</sup>
@dantonnoriega
Copy link

# Test data ---------------------------------------------------------------

# A tiny sample of the log lines
# (`writeLines()` needs a character vector, not a list, so this uses `c()`)
# One log line per element. Note that two entries end in "[CHAT] " with no
# message text after it.
mini_lines <- c(
  "[15:35:52] [Render thread/INFO]: Reloading ResourceManager: Default",
  "[15:44:25] [Render thread/INFO]: [CHAT] You don't have Tenn1s's permission to build here.",
  "[15:45:23] [Render thread/INFO]: [CHAT] ",
  "[15:45:45] [Render thread/INFO]: [CHAT] You don't have Olivia193's permission to build here.",
  "[15:45:56] [Render thread/INFO]: [CHAT] [Legendz] Vugoe : trix poop",
  "[16:29:14] [Render thread/INFO]: [CHAT] You will respawn in 4 seconds!",
  "[16:29:17] [Render thread/WARN]: Unknown custom packed identifier: badlion:timers",
  "[16:29:18] [Render thread/INFO]: [CHAT] Gooce slipped in BBQ sauce off the edge spilled by Adst3r.",
  "[16:29:18] [Render thread/INFO]: [CHAT] You have respawned!",
  "[16:29:19] [Render thread/INFO]: [CHAT] "
)

# Repeat the ten sample lines 1000 times (10,000 lines in total)
lotta_lines <- rep(mini_lines, 1000)

# using bash tools ---------------------------------------------------------
# write the `lotta_lines` to a file, so it acts like a text log
ff <- tempfile(fileext = ".txt")
writeLines(lotta_lines, ff)

# Check the file on disk. The path goes through `shQuote()` so that a temp
# directory containing spaces or shell metacharacters cannot break (or be
# interpreted by) the shell command.
system(sprintf("head %s", shQuote(ff)), intern = TRUE)
#>  [1] "[15:35:52] [Render thread/INFO]: Reloading ResourceManager: Default"                               
#>  [2] "[15:44:25] [Render thread/INFO]: [CHAT] You don't have Tenn1s's permission to build here."         
#>  [3] "[15:45:23] [Render thread/INFO]: [CHAT] "                                                          
#>  [4] "[15:45:45] [Render thread/INFO]: [CHAT] You don't have Olivia193's permission to build here."      
#>  [5] "[15:45:56] [Render thread/INFO]: [CHAT] [Legendz] Vugoe : trix poop"                               
#>  [6] "[16:29:14] [Render thread/INFO]: [CHAT] You will respawn in 4 seconds!"                            
#>  [7] "[16:29:17] [Render thread/WARN]: Unknown custom packed identifier: badlion:timers"                 
#>  [8] "[16:29:18] [Render thread/INFO]: [CHAT] Gooce slipped in BBQ sauce off the edge spilled by Adst3r."
#>  [9] "[16:29:18] [Render thread/INFO]: [CHAT] You have respawned!"                                       
#> [10] "[16:29:19] [Render thread/INFO]: [CHAT] "

# The log lines have a regular structure, so I would use `cut`:
# - `cut`
#   : -d" " splits/delimits on a SINGLE space
#   : -f1 takes the first "field" (time stamp)
# - `tr` (operates per character)
#   : -d "[]" deletes all instances of "[" OR "]" (per character operation)
tictoc::tic()
# Build the shell pipeline. `shQuote()` protects the file path in case the
# temp directory contains spaces or shell metacharacters.
cmd <- sprintf('cut -d" " -f1 %s | tr -d "[]"', shQuote(ff))
# Run the pipeline once with its output going to the console.
# NOTE(review): `fread(cmd = )` below runs the same pipeline again, so the
# timed region covers two runs -- confirm whether this call is still wanted.
system(cmd)

# read the results of the command
data.table::fread(cmd = cmd, header = FALSE, col.names = "timestamp")
#>        timestamp
#>     1:  15:35:52
#>     2:  15:44:25
#>     3:  15:45:23
#>     4:  15:45:45
#>     5:  15:45:56
#>    ---          
#>  9996:  16:29:14
#>  9997:  16:29:17
#>  9998:  16:29:18
#>  9999:  16:29:18
#> 10000:  16:29:19
tictoc::toc()
#> 0.133 sec elapsed

Created on 2020-11-20 by the reprex package (v0.3.0)

@noamross
Copy link

With a well-formed regular expression, stringi will parse the 10 million timestamps in less than four seconds:

library(stringi)
library(tictoc)

# A tiny sample of the log lines
# One log line per element. Note that two entries end in "[CHAT] " with no
# message text after it.
mini_lines <- list(
  "[15:35:52] [Render thread/INFO]: Reloading ResourceManager: Default",
  "[15:44:25] [Render thread/INFO]: [CHAT] You don't have Tenn1s's permission to build here.",
  "[15:45:23] [Render thread/INFO]: [CHAT] ",
  "[15:45:45] [Render thread/INFO]: [CHAT] You don't have Olivia193's permission to build here.",
  "[15:45:56] [Render thread/INFO]: [CHAT] [Legendz] Vugoe : trix poop",
  "[16:29:14] [Render thread/INFO]: [CHAT] You will respawn in 4 seconds!",
  "[16:29:17] [Render thread/WARN]: Unknown custom packed identifier: badlion:timers",
  "[16:29:18] [Render thread/INFO]: [CHAT] Gooce slipped in BBQ sauce off the edge spilled by Adst3r.",
  "[16:29:18] [Render thread/INFO]: [CHAT] You have respawned!",
  "[16:29:19] [Render thread/INFO]: [CHAT] "
)



# Anchored pattern: only match a `dd:dd:dd` field that sits inside square
# brackets at the very start of the line, so a time-like string later in the
# line (for example inside a chat message) is never taken for the timestamp.
# Defined once so all three calls below use the same expression.
# NOTE: the `#>` timings shown below were recorded with the earlier,
# unanchored pattern "\\d\\d:\\d\\d:\\d\\d".
ts_pattern <- "(?<=^\\[)\\d{2}:\\d{2}:\\d{2}(?=\\])"

ts <- stri_extract_first_regex(mini_lines, ts_pattern)
ts
#>  [1] "15:35:52" "15:44:25" "15:45:23" "15:45:45" "15:45:56" "16:29:14"
#>  [7] "16:29:17" "16:29:18" "16:29:18" "16:29:19"

# 10,000 lines
lotta_lines <- rep(mini_lines, times = 1000)

tic()
lotta_ts <- stri_extract_first_regex(lotta_lines, ts_pattern)
toc()
#> 0.005 sec elapsed

# 10 million lines
lotta_lines <- rep(mini_lines, times = 1000000)
length(lotta_lines)
#> [1] 10000000
tic()
lotta_ts <- stri_extract_first_regex(lotta_lines, ts_pattern)
toc()
#> 3.742 sec elapsed

Created on 2020-11-21 by the reprex package (v0.3.0)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment