Last active
May 28, 2023 21:05
-
-
Save vi3k6i5/604eefd92866d081cfa19f862224e4a0 to your computer and use it in GitHub Desktop.
Revisions
-
vi3k6i5 revised this gist
Oct 18, 2017 . 1 changed file with 10 additions and 10 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -14,9 +14,9 @@ def get_word_of_length(str_length): print('Count | FlashText | Regex ') print('-------------------------------') for keywords_length in range(0, 20001, 1000): # chose 5000 terms and create a string to search in. all_words_chosen = random.sample(all_words, 5000) story = ' '.join(all_words_chosen) # get unique keywords from the list of words generated. @@ -39,12 +39,12 @@ def get_word_of_length(str_length): print(str(keywords_length).ljust(6), '|', "{0:.5f}".format(mid - start).ljust(9), '|', "{0:.5f}".format(end - mid).ljust(9), '|',) # Count | FlashText | Regex # ------------------------------- # 0 | 0.01668 | 0.00418 | # 1000 | 0.02040 | 0.04781 | # 5000 | 0.02180 | 0.26495 | # 10000 | 0.02282 | 0.50019 | # 15000 | 0.02140 | 0.73295 | # 20000 | 0.02270 | 0.92905 | -
vi3k6i5 revised this gist
Oct 17, 2017 . No changes.There are no files selected for viewing
-
vi3k6i5 revised this gist
Oct 17, 2017 . No changes.There are no files selected for viewing
-
vi3k6i5 revised this gist
Oct 17, 2017 . 1 changed file with 17 additions and 28 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -5,22 +5,16 @@ import re import time def get_word_of_length(str_length): # generate a random word of given length return ''.join(random.choice(string.ascii_lowercase) for _ in range(str_length)) # generate a list of 100K words of randomly chosen size all_words = [get_word_of_length(random.choice([3, 4, 5, 6, 7, 8])) for i in range(100000)] print('Count | FlashText | Regex ') print('-------------------------------') for keywords_length in [0, 1000, 5000, 10000, 15000]: # chose 1000 terms and create a string to search in. all_words_chosen = random.sample(all_words, 1000) story = ' '.join(all_words_chosen) @@ -41,21 +35,16 @@ def get_word_of_length(str_length): mid = time.time() _ = compiled_re.findall(story) end = time.time() # print output print(str(keywords_length).ljust(6), '|', "{0:.5f}".format(mid - start).ljust(9), '|', "{0:.5f}".format(end - mid).ljust(9), '|',) # output: # Count | FlashText | Regex # ------------------------------- # 0 | 0.00320 | 0.00072 | # 1000 | 0.00422 | 0.01112 | # 5000 | 0.00410 | 0.04966 | # 10000 | 0.00438 | 0.10228 | # 15000 | 0.00424 | 0.18067 | -
vi3k6i5 revised this gist
Oct 12, 2017 . 1 changed file with 1 addition and 17 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -52,26 +52,10 @@ def get_word_of_length(str_length): "{0:.5f}".format(value[1]).ljust(9), '|', "{0:.5f}".format(value[2]).ljust(9), '|',) # Sample Output: Count, Timing in Seconds. # Count | FlashText | Regex # ------------------------------ # 0 | 0.00118 | 0.00079 | # 1000 | 0.00149 | 0.00944 | # 10000 | 0.00214 | 0.11071 | # 19000 | 0.00223 | 0.19117 | -
vi3k6i5 revised this gist
Oct 3, 2017 . 1 changed file with 1 addition and 1 deletion.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -52,7 +52,7 @@ def get_word_of_length(str_length): "{0:.5f}".format(value[1]).ljust(9), '|', "{0:.5f}".format(value[2]).ljust(9), '|',) # output: Count, Timing in Seconds. # Count | FlashText | Regex # ------------------------------ # 0 | 0.00118 | 0.00079 | -
vi3k6i5 revised this gist
Oct 3, 2017 . 1 changed file with 20 additions and 20 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -55,23 +55,23 @@ def get_word_of_length(str_length): ## output: # Count | FlashText | Regex # ------------------------------ # 0 | 0.00118 | 0.00079 | # 1000 | 0.00149 | 0.00944 | # 2000 | 0.00186 | 0.02233 | # 3000 | 0.00160 | 0.02909 | # 4000 | 0.00171 | 0.03714 | # 5000 | 0.00166 | 0.05882 | # 6000 | 0.00212 | 0.06964 | # 7000 | 0.00174 | 0.09775 | # 8000 | 0.00169 | 0.08188 | # 9000 | 0.00206 | 0.09672 | # 10000 | 0.00214 | 0.11071 | # 11000 | 0.00242 | 0.10869 | # 12000 | 0.00221 | 0.12309 | # 13000 | 0.00246 | 0.14295 | # 14000 | 0.00188 | 0.15157 | # 15000 | 0.00198 | 0.15419 | # 16000 | 0.00207 | 0.17049 | # 17000 | 0.00236 | 0.17179 | # 18000 | 0.00253 | 0.20256 | # 19000 | 0.00223 | 0.19117 | -
vi3k6i5 revised this gist
Oct 3, 2017 . 1 changed file with 22 additions and 22 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -53,25 +53,25 @@ def get_word_of_length(str_length): "{0:.5f}".format(value[2]).ljust(9), '|',) ## output: # Count | FlashText | Regex # ------------------------------ # 0 | 0.00117 | 0.00074 | # 1000 | 0.00154 | 0.00922 | # 1993 | 0.00151 | 0.01851 | # 2989 | 0.00155 | 0.03045 | # 3990 | 0.00158 | 0.03659 | # 4981 | 0.00165 | 0.05268 | # 5984 | 0.00172 | 0.05781 | # 6964 | 0.00175 | 0.06641 | # 7955 | 0.00246 | 0.08042 | # 8940 | 0.00178 | 0.08879 | # 9927 | 0.00182 | 0.09536 | # 10911 | 0.00174 | 0.10961 | # 11888 | 0.00180 | 0.11821 | # 12880 | 0.00181 | 0.12703 | # 13871 | 0.00191 | 0.13219 | # 14855 | 0.00232 | 0.14235 | # 15808 | 0.00191 | 0.14962 | # 16786 | 0.00188 | 0.16094 | # 17749 | 0.00219 | 0.17573 | # 18728 | 0.00190 | 0.17997 | -
vi3k6i5 created this gist
Oct 3, 2017 .There are no files selected for viewing
#!/bin/python
"""Benchmark FlashText keyword extraction against a compiled regex alternation.

For keyword counts from 0 to 19K (in steps of 1K) the script builds a random
"story" of 1000 words, extracts the keywords once with FlashText and once with
a compiled regex, and prints the wall-clock time each approach took.
"""
import random
import re
import string
import time


def get_word_of_length(str_length):
    """Generate a random word of the given length.

    Args:
        str_length: number of characters in the generated word.

    Returns:
        A string of ``str_length`` random lowercase ASCII letters
        (the empty string when ``str_length`` is 0).
    """
    return ''.join(random.choice(string.ascii_lowercase) for _ in range(str_length))


def build_keyword_regex(keywords):
    """Compile a regex that matches any of ``keywords`` as a whole word.

    Args:
        keywords: iterable of literal keyword strings.

    Returns:
        A compiled pattern of the form ``\\bkw1\\b|\\bkw2\\b|...``. When
        ``keywords`` is empty the pattern never matches.
    """
    keywords = list(keywords)
    if not keywords:
        # An empty alternation would compile to the empty pattern, which
        # matches at every position of the text; "(?!)" matches nothing.
        return re.compile(r'(?!)')
    # Escape each keyword so it is always treated as literal text.
    return re.compile('|'.join(r'\b' + re.escape(keyword) + r'\b' for keyword in keywords))


def run_benchmark(all_words, keyword_counts=range(0, 20000, 1000), story_size=1000):
    """Time FlashText and regex keyword extraction for each keyword count.

    Args:
        all_words: list of candidate words to sample stories and keywords from.
        keyword_counts: iterable with the number of keywords to sample per run.
        story_size: number of words joined into the text that is searched.

    Returns:
        A list of ``(unique_keyword_count, flashtext_seconds, regex_seconds)``
        tuples, one per entry of ``keyword_counts``.
    """
    # Imported here so the helpers above stay usable without flashtext installed.
    from flashtext.keyword import KeywordProcessor

    string_length_to_time_map = []
    for keywords_length in keyword_counts:
        # choose story_size terms and create a string to search in.
        all_words_chosen = random.sample(all_words, story_size)
        story = ' '.join(all_words_chosen)

        # get unique keywords from the list of words generated.
        unique_keywords_sublist = list(set(random.sample(all_words, keywords_length)))

        # compile regex
        compiled_re = build_keyword_regex(unique_keywords_sublist)

        # add keywords to flashtext
        keyword_processor = KeywordProcessor()
        keyword_processor.add_keywords_from_list(unique_keywords_sublist)

        # time the modules; perf_counter is the clock intended for measuring
        # short durations, unlike time.time which follows the system clock.
        start = time.perf_counter()
        _ = keyword_processor.extract_keywords(story)
        mid = time.perf_counter()
        _ = compiled_re.findall(story)
        end = time.perf_counter()

        # record results as Count, FlashText time, Regex time
        string_length_to_time_map.append((len(unique_keywords_sublist), mid - start, end - mid))
    return string_length_to_time_map


def main():
    """Run the benchmark from 0 to 20K keywords at 1K interval and print a table."""
    # generate a list of 100K words of randomly chosen size
    all_words = [get_word_of_length(random.choice([3, 4, 5, 6, 7, 8])) for i in range(100000)]

    string_length_to_time_map = run_benchmark(all_words)

    print('Count | FlashText | Regex ')
    print('------------------------------')
    for value in string_length_to_time_map:
        print(str(value[0]).ljust(6), '|',
              "{0:.5f}".format(value[1]).ljust(9), '|',
              "{0:.5f}".format(value[2]).ljust(9), '|',)


if __name__ == '__main__':
    main()

## output (sample run, timings in seconds):
# Count | FlashText | Regex
# ------------------------------
# 0 | 0.00117 | 0.00074 |
# 1000 | 0.00154 | 0.00922 |
# 1993 | 0.00151 | 0.01851 |
# 2989 | 0.00155 | 0.03045 |
# 3990 | 0.00158 | 0.03659 |
# 4981 | 0.00165 | 0.05268 |
# 5984 | 0.00172 | 0.05781 |
# 6964 | 0.00175 | 0.06641 |
# 7955 | 0.00246 | 0.08042 |
# 8940 | 0.00178 | 0.08879 |
# 9927 | 0.00182 | 0.09536 |
# 10911 | 0.00174 | 0.10961 |
# 11888 | 0.00180 | 0.11821 |
# 12880 | 0.00181 | 0.12703 |
# 13871 | 0.00191 | 0.13219 |
# 14855 | 0.00232 | 0.14235 |
# 15808 | 0.00191 | 0.14962 |
# 16786 | 0.00188 | 0.16094 |
# 17749 | 0.00219 | 0.17573 |
# 18728 | 0.00190 | 0.17997 |