burch-cm · December 8, 2020 19:27
diff --git a/extract_weights.R b/extract_weights.R
 # Extract gross vehicle weight ratings (GVWR) from GSA vehicle descriptions.
 #
 # The GVWR is stored inside the free-text vehicle description with
 # inconsistent thousands separators (sometimes a space, sometimes a comma,
 # sometimes nothing). The one consistent thing is that the weight is followed
 # by 'LBS' (optionally after spaces), so a regex anchored on that suffix can
 # pull the number out of the text.

 # Sample GSA vehicle descriptions
 sample_text <- c(
   "4x4 SUV 10,000 LBS GVWR",
   "2x4 PICKUP 7 001 LBS GVWR",
   "SEDAN 6000LBS"
 )

 # using {stringr} and regex:
 library(stringr)
 library(magrittr)

 # Pattern for the weight, defined once so every step uses the same regex:
 #   \\d{1,3}           a leading group of one to three digits
 #   (?:[ ,]?\\d{3})*   any number of three-digit groups, each optionally
 #                      preceded by a single space or comma
 #   (?= *LBS)          lookahead: optional spaces and then 'LBS' must follow
 #
 # Why not a character class such as "[\\d(,?) ]+": inside [...] the
 # characters '(', '?' and ')' are literals, so "VAN (8,500 LBS)" would yield
 # "(8,500 " and end up as NA in as.numeric(). Allowing free-standing spaces
 # would also glue an unrelated preceding number onto the weight
 # ("SEATS 5 10,000 LBS" -> 510000).
 #
 # Known limitation: a short number followed by a space and exactly three
 # digits (e.g. "5 850 LBS") cannot be told apart from a space-separated
 # thousands group and is read as 5850.
 gvwr_pattern <- regex("\\d{1,3}(?:[ ,]?\\d{3})*(?= *LBS)")

 # Step 1: extract the weight text that precedes 'LBS'.
 # Descriptions without a weight give NA, which flows through the later steps.
 gvwr_text <- sample_text %>%
   str_extract(gvwr_pattern)
 gvwr_text
 # c("10,000", "7 001", "6000")

 # Step 2: remove the space / comma separators.
 gvwr_digits <- gvwr_text %>%
   str_remove_all("[ ,]")
 gvwr_digits
 # c("10000", "7001", "6000")

 # Step 3: convert to numeric so that it can be used in calculations.
 gvwr_lbs <- gvwr_digits %>%
   as.numeric()
 gvwr_lbs
 # c(10000, 7001, 6000)
 # Done!
	# Extract gross vehicle weight ratings (GVWR) from GSA vehicle descriptions.
	#
	# The GVWR is stored inside the free-text vehicle description with
	# inconsistent thousands separators (sometimes a space, sometimes a comma,
	# sometimes nothing). The one consistent thing is that the weight is followed
	# by 'LBS' (optionally after spaces), so a regex anchored on that suffix can
	# pull the number out of the text.

	# Sample GSA vehicle descriptions
	sample_text <- c(
	  "4x4 SUV 10,000 LBS GVWR",
	  "2x4 PICKUP 7 001 LBS GVWR",
	  "SEDAN 6000LBS"
	)

	# using {stringr} and regex:
	library(stringr)
	library(magrittr)

	# Pattern for the weight, defined once so every step uses the same regex:
	#   \\d{1,3}           a leading group of one to three digits
	#   (?:[ ,]?\\d{3})*   any number of three-digit groups, each optionally
	#                      preceded by a single space or comma
	#   (?= *LBS)          lookahead: optional spaces and then 'LBS' must follow
	#
	# Why not a character class such as "[\\d(,?) ]+": inside [...] the
	# characters '(', '?' and ')' are literals, so "VAN (8,500 LBS)" would yield
	# "(8,500 " and end up as NA in as.numeric(). Allowing free-standing spaces
	# would also glue an unrelated preceding number onto the weight
	# ("SEATS 5 10,000 LBS" -> 510000).
	#
	# Known limitation: a short number followed by a space and exactly three
	# digits (e.g. "5 850 LBS") cannot be told apart from a space-separated
	# thousands group and is read as 5850.
	gvwr_pattern <- regex("\\d{1,3}(?:[ ,]?\\d{3})*(?= *LBS)")

	# Step 1: extract the weight text that precedes 'LBS'.
	# Descriptions without a weight give NA, which flows through the later steps.
	gvwr_text <- sample_text %>%
	  str_extract(gvwr_pattern)
	gvwr_text
	# c("10,000", "7 001", "6000")

	# Step 2: remove the space / comma separators.
	gvwr_digits <- gvwr_text %>%
	  str_remove_all("[ ,]")
	gvwr_digits
	# c("10000", "7001", "6000")

	# Step 3: convert to numeric so that it can be used in calculations.
	gvwr_lbs <- gvwr_digits %>%
	  as.numeric()
	gvwr_lbs
	# c(10000, 7001, 6000)
	# Done!
No results found