Skip to content

Instantly share code, notes, and snippets.

@vi3k6i5
Last active May 28, 2023 21:05
Show Gist options
  • Select an option

  • Save vi3k6i5/604eefd92866d081cfa19f862224e4a0 to your computer and use it in GitHub Desktop.

Select an option

Save vi3k6i5/604eefd92866d081cfa19f862224e4a0 to your computer and use it in GitHub Desktop.
Benchmarking keyword-extraction timing performance: regex vs. FlashText
#!/bin/python
from flashtext.keyword import KeywordProcessor
import random
import string
import re
import time
def get_word_of_length(str_length: int) -> str:
    """Generate a random word of the given length.

    Each character is drawn independently and uniformly from the lowercase
    ASCII letters.

    Args:
        str_length: Number of characters in the generated word. Zero (or a
            negative value) yields an empty string, because ``range`` is
            then empty.

    Returns:
        A string of ``str_length`` random lowercase ASCII letters.
    """
    return ''.join(random.choice(string.ascii_lowercase) for _ in range(str_length))
# Build a vocabulary of 100K random words; the length of each word is picked
# at random from 3 to 8 characters.
all_words = [
    get_word_of_length(random.choice([3, 4, 5, 6, 7, 8]))
    for _ in range(100000)
]
# Collects the timing results of the benchmark, in the order they are measured.
string_length_to_time_map = []
for keywords_length in range(0, 20000, 1000):
    # Run the benchmark for 0 to 20K keywords (exclusive) at 1K intervals.

    # Choose 1000 terms and join them into the text to search in.
    all_words_chosen = random.sample(all_words, 1000)
    story = ' '.join(all_words_chosen)

    # Sample the keywords and de-duplicate them; if the sample contains
    # repeated words, the recorded count ends up below keywords_length.
    unique_keywords_sublist = list(set(random.sample(all_words, keywords_length)))

    # Compile a single alternation of word-bounded keywords. re.escape keeps
    # every keyword literal even if it contains regex metacharacters.
    # NOTE: with zero keywords the pattern is empty and matches the empty
    # string at every position of the story.
    compiled_re = re.compile(
        '|'.join(r'\b' + re.escape(keyword) + r'\b' for keyword in unique_keywords_sublist)
    )

    # Load the same keywords into FlashText.
    keyword_processor = KeywordProcessor()
    keyword_processor.add_keywords_from_list(unique_keywords_sublist)

    # Time only the two extraction calls; the setup above is excluded.
    # perf_counter is used instead of time.time because it is the
    # highest-resolution clock for short intervals and is unaffected by
    # system clock adjustments.
    start = time.perf_counter()
    _ = keyword_processor.extract_keywords(story)
    mid = time.perf_counter()
    _ = compiled_re.findall(story)
    end = time.perf_counter()

    # Record the round as (keyword count, FlashText seconds, regex seconds).
    string_length_to_time_map.append((len(unique_keywords_sublist), mid - start, end - mid))
# Print the collected timings as a pipe-separated table, one row per entry.
print('Count | FlashText | Regex ')
print('------------------------------')
for keyword_count, flashtext_seconds, regex_seconds in string_length_to_time_map:
    # Left-justify every cell to a fixed width; times are shown with five decimals.
    count_cell = str(keyword_count).ljust(6)
    flashtext_cell = "{0:.5f}".format(flashtext_seconds).ljust(9)
    regex_cell = "{0:.5f}".format(regex_seconds).ljust(9)
    print(count_cell, '|', flashtext_cell, '|', regex_cell, '|')
# Output: keyword count, then timings in seconds.
# Count | FlashText | Regex
# ------------------------------
# 0 | 0.00118 | 0.00079 |
# 1000 | 0.00149 | 0.00944 |
# 2000 | 0.00186 | 0.02233 |
# 3000 | 0.00160 | 0.02909 |
# 4000 | 0.00171 | 0.03714 |
# 5000 | 0.00166 | 0.05882 |
# 6000 | 0.00212 | 0.06964 |
# 7000 | 0.00174 | 0.09775 |
# 8000 | 0.00169 | 0.08188 |
# 9000 | 0.00206 | 0.09672 |
# 10000 | 0.00214 | 0.11071 |
# 11000 | 0.00242 | 0.10869 |
# 12000 | 0.00221 | 0.12309 |
# 13000 | 0.00246 | 0.14295 |
# 14000 | 0.00188 | 0.15157 |
# 15000 | 0.00198 | 0.15419 |
# 16000 | 0.00207 | 0.17049 |
# 17000 | 0.00236 | 0.17179 |
# 18000 | 0.00253 | 0.20256 |
# 19000 | 0.00223 | 0.19117 |
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment