Skip to content

Instantly share code, notes, and snippets.

@vi3k6i5
Last active May 28, 2023 21:05
Show Gist options
  • Select an option

  • Save vi3k6i5/604eefd92866d081cfa19f862224e4a0 to your computer and use it in GitHub Desktop.

Select an option

Save vi3k6i5/604eefd92866d081cfa19f862224e4a0 to your computer and use it in GitHub Desktop.
Benchmarking keyword-extraction timing performance: regex vs. FlashText
#!/bin/python
from flashtext.keyword import KeywordProcessor
import random
import string
import re
import time
def get_word_of_length(str_length):
    """Generate a random word of the given length.

    Args:
        str_length: Number of characters in the generated word.

    Returns:
        A string of ``str_length`` characters, each drawn independently
        from ``string.ascii_lowercase``. The result is empty when
        ``str_length`` is zero or negative, because ``range`` yields nothing.
    """
    return ''.join(random.choice(string.ascii_lowercase) for _ in range(str_length))
# Generate a pool of 100K random words, each 3 to 8 characters long.
all_words = [get_word_of_length(random.choice([3, 4, 5, 6, 7, 8])) for _ in range(100000)]

# One (keyword count, FlashText seconds, regex seconds) tuple per benchmark run.
string_length_to_time_map = []

# Run the benchmark for 0 to 20K requested keywords at 1K intervals.
for keywords_length in range(0, 20000, 1000):
    # Choose 1000 words and join them into the text to search in.
    all_words_chosen = random.sample(all_words, 1000)
    story = ' '.join(all_words_chosen)

    # De-duplicate the sampled keywords; the pool may contain repeats, so the
    # resulting count can be slightly below keywords_length.
    unique_keywords_sublist = list(set(random.sample(all_words, keywords_length)))

    # Compile one alternation of whole-word patterns. re.escape leaves the
    # lowercase ASCII words unchanged but keeps the pattern literal if the
    # word generator ever emits regex metacharacters.
    compiled_re = re.compile(
        '|'.join([r'\b' + re.escape(keyword) + r'\b' for keyword in unique_keywords_sublist])
    )

    # Load the same keywords into FlashText.
    keyword_processor = KeywordProcessor()
    keyword_processor.add_keywords_from_list(unique_keywords_sublist)

    # Time only the extraction calls. perf_counter is used instead of
    # time.time because it is monotonic and intended for measuring short
    # durations.
    start = time.perf_counter()
    _ = keyword_processor.extract_keywords(story)
    mid = time.perf_counter()
    _ = compiled_re.findall(story)
    end = time.perf_counter()

    # Record the result as (count, FlashText time, regex time).
    string_length_to_time_map.append((len(unique_keywords_sublist), mid - start, end - mid))

# Print the results as a table.
print('Count | FlashText | Regex ')
print('------------------------------')
for count, flashtext_seconds, regex_seconds in string_length_to_time_map:
    print(str(count).ljust(6), '|',
          "{0:.5f}".format(flashtext_seconds).ljust(9), '|',
          "{0:.5f}".format(regex_seconds).ljust(9), '|',)
## output:
Count | FlashText | Regex
------------------------------
0 | 0.00117 | 0.00074 |
1000 | 0.00154 | 0.00922 |
1993 | 0.00151 | 0.01851 |
2989 | 0.00155 | 0.03045 |
3990 | 0.00158 | 0.03659 |
4981 | 0.00165 | 0.05268 |
5984 | 0.00172 | 0.05781 |
6964 | 0.00175 | 0.06641 |
7955 | 0.00246 | 0.08042 |
8940 | 0.00178 | 0.08879 |
9927 | 0.00182 | 0.09536 |
10911 | 0.00174 | 0.10961 |
11888 | 0.00180 | 0.11821 |
12880 | 0.00181 | 0.12703 |
13871 | 0.00191 | 0.13219 |
14855 | 0.00232 | 0.14235 |
15808 | 0.00191 | 0.14962 |
16786 | 0.00188 | 0.16094 |
17749 | 0.00219 | 0.17573 |
18728 | 0.00190 | 0.17997 |
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment