Skip to content

Instantly share code, notes, and snippets.

@vi3k6i5
Last active May 28, 2023 21:05
Show Gist options
  • Select an option

  • Save vi3k6i5/604eefd92866d081cfa19f862224e4a0 to your computer and use it in GitHub Desktop.

Select an option

Save vi3k6i5/604eefd92866d081cfa19f862224e4a0 to your computer and use it in GitHub Desktop.
Benchmarking timing performance Keyword Extraction between regex and flashtext
#!/bin/python
from flashtext.keyword import KeywordProcessor
import random
import string
import re
import time
def get_word_of_length(str_length):
    """Generate a random word of the given length.

    Each character is drawn independently with ``random.choice`` from
    ``string.ascii_lowercase``.

    Args:
        str_length: Number of characters in the generated word.

    Returns:
        A string of ``str_length`` lowercase ASCII letters; the empty
        string when ``str_length`` is 0.
    """
    letters = [random.choice(string.ascii_lowercase) for _ in range(str_length)]
    return ''.join(letters)
# Pool of 100K random words; each word's length is picked at random from 3..8.
all_words = [
    get_word_of_length(random.choice((3, 4, 5, 6, 7, 8)))
    for _ in range(100000)
]
# Benchmark results: one (keyword count, FlashText seconds, regex seconds)
# tuple is appended per run.
string_length_to_time_map = []
for keywords_length in range(0, 20000, 1000):
    # Run the benchmark for keyword counts from 0 to 19K at 1K intervals
    # (range() excludes the 20000 upper bound).

    # choose 1000 terms and create a string to search in.
    all_words_chosen = random.sample(all_words, 1000)
    story = ' '.join(all_words_chosen)

    # get unique keywords from the list of words generated.
    unique_keywords_sublist = list(set(random.sample(all_words, keywords_length)))

    # compile regex: one word-bounded alternative per keyword. re.escape keeps
    # every keyword a literal even if it ever contains regex metacharacters.
    if unique_keywords_sublist:
        pattern = '|'.join(
            r'\b' + re.escape(keyword) + r'\b'
            for keyword in unique_keywords_sublist
        )
    else:
        # Joining zero keywords yields '', which matches the empty string at
        # every position of the story; use a never-matching pattern instead so
        # the 0-keyword row does not time thousands of empty matches.
        pattern = r'(?!)'
    compiled_re = re.compile(pattern)

    # add keywords to flashtext
    keyword_processor = KeywordProcessor()
    keyword_processor.add_keywords_from_list(unique_keywords_sublist)

    # time the modules; perf_counter is the high-resolution clock meant for
    # measuring short durations (time.time is wall-clock and can jump).
    start = time.perf_counter()
    _ = keyword_processor.extract_keywords(story)
    mid = time.perf_counter()
    _ = compiled_re.findall(story)
    end = time.perf_counter()

    # add results to the list as (Count, FlashText time, Regex time)
    string_length_to_time_map.append(
        (len(unique_keywords_sublist), mid - start, end - mid)
    )
# Print the collected timings as a pipe-separated table: the keyword count is
# left-justified to 6 characters, each timing is formatted to 5 decimal places
# and left-justified to 9 characters.
print('Count | FlashText | Regex ')
print('------------------------------')
row_template = '{0:<6} | {1:<9.5f} | {2:<9.5f} |'
for count, flashtext_seconds, regex_seconds in string_length_to_time_map:
    print(row_template.format(count, flashtext_seconds, regex_seconds))
# Sample Output: Count, Timing in Seconds.
# Count | FlashText | Regex
# ------------------------------
# 0 | 0.00118 | 0.00079 |
# 1000 | 0.00149 | 0.00944 |
# 10000 | 0.00214 | 0.11071 |
# 19000 | 0.00223 | 0.19117 |
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment