Skip to content

Instantly share code, notes, and snippets.

@vi3k6i5
Last active May 28, 2023 21:05
Show Gist options
  • Select an option

  • Save vi3k6i5/604eefd92866d081cfa19f862224e4a0 to your computer and use it in GitHub Desktop.

Select an option

Save vi3k6i5/604eefd92866d081cfa19f862224e4a0 to your computer and use it in GitHub Desktop.

Revisions

  1. vi3k6i5 revised this gist Oct 18, 2017. 1 changed file with 10 additions and 10 deletions.
    20 changes: 10 additions & 10 deletions flashtext_regex_timing_keyword_extraction.py
    Original file line number Diff line number Diff line change
    @@ -14,9 +14,9 @@ def get_word_of_length(str_length):

    print('Count | FlashText | Regex ')
    print('-------------------------------')
    for keywords_length in [0, 1000, 5000, 10000, 15000]:
    # choose 1000 terms and create a string to search in.
    all_words_chosen = random.sample(all_words, 1000)
    for keywords_length in range(0, 20001, 1000):
    # choose 5000 terms and create a string to search in.
    all_words_chosen = random.sample(all_words, 5000)
    story = ' '.join(all_words_chosen)

    # get unique keywords from the list of words generated.
    @@ -39,12 +39,12 @@ def get_word_of_length(str_length):
    print(str(keywords_length).ljust(6), '|',
    "{0:.5f}".format(mid - start).ljust(9), '|',
    "{0:.5f}".format(end - mid).ljust(9), '|',)

    # output:

    # Count | FlashText | Regex
    # -------------------------------
    # 0 | 0.00320 | 0.00072 |
    # 1000 | 0.00422 | 0.01112 |
    # 5000 | 0.00410 | 0.04966 |
    # 10000 | 0.00438 | 0.10228 |
    # 15000 | 0.00424 | 0.18067 |
    # 0 | 0.01668 | 0.00418 |
    # 1000 | 0.02040 | 0.04781 |
    # 5000 | 0.02180 | 0.26495 |
    # 10000 | 0.02282 | 0.50019 |
    # 15000 | 0.02140 | 0.73295 |
    # 20000 | 0.02270 | 0.92905 |
  2. vi3k6i5 revised this gist Oct 17, 2017. No changes.
  3. vi3k6i5 revised this gist Oct 17, 2017. No changes.
  4. vi3k6i5 revised this gist Oct 17, 2017. 1 changed file with 17 additions and 28 deletions.
    45 changes: 17 additions & 28 deletions flashtext_regex_timing_keyword_extraction.py
    Original file line number Diff line number Diff line change
    @@ -5,22 +5,16 @@
    import re
    import time


    def get_word_of_length(str_length):
    """
    generate a random word of given length
    """
    # generate a random word of given length
    return ''.join(random.choice(string.ascii_lowercase) for _ in range(str_length))

    # generate a list of 100K words of randomly chosen size
    all_words = [get_word_of_length(random.choice([3, 4, 5, 6, 7, 8])) for i in range(100000)]

    string_length_to_time_map = []

    for keywords_length in range(0, 20000, 1000):
    """
    Run the benchmark from 0 to 20K at 1K interval.
    """
    print('Count | FlashText | Regex ')
    print('-------------------------------')
    for keywords_length in [0, 1000, 5000, 10000, 15000]:
    # choose 1000 terms and create a string to search in.
    all_words_chosen = random.sample(all_words, 1000)
    story = ' '.join(all_words_chosen)
    @@ -41,21 +35,16 @@ def get_word_of_length(str_length):
    mid = time.time()
    _ = compiled_re.findall(story)
    end = time.time()

    # add results to dictionary as Count, FlashText time, Regex time
    string_length_to_time_map.append((len(unique_keywords_sublist), mid - start, end - mid))

    print('Count | FlashText | Regex ')
    print('------------------------------')
    for value in string_length_to_time_map:
    print(str(value[0]).ljust(6), '|',
    "{0:.5f}".format(value[1]).ljust(9), '|',
    "{0:.5f}".format(value[2]).ljust(9), '|',)

    # Sample Output: Count, Timing in Seconds.
    # Count | FlashText | Regex
    # ------------------------------
    # 0 | 0.00118 | 0.00079 |
    # 1000 | 0.00149 | 0.00944 |
    # 10000 | 0.00214 | 0.11071 |
    # 19000 | 0.00223 | 0.19117 |
    # print output
    print(str(keywords_length).ljust(6), '|',
    "{0:.5f}".format(mid - start).ljust(9), '|',
    "{0:.5f}".format(end - mid).ljust(9), '|',)

    # output:
    # Count | FlashText | Regex
    # -------------------------------
    # 0 | 0.00320 | 0.00072 |
    # 1000 | 0.00422 | 0.01112 |
    # 5000 | 0.00410 | 0.04966 |
    # 10000 | 0.00438 | 0.10228 |
    # 15000 | 0.00424 | 0.18067 |
  5. vi3k6i5 revised this gist Oct 12, 2017. 1 changed file with 1 addition and 17 deletions.
    18 changes: 1 addition & 17 deletions flashtext_regex_timing_keyword_extraction.py
    Original file line number Diff line number Diff line change
    @@ -52,26 +52,10 @@ def get_word_of_length(str_length):
    "{0:.5f}".format(value[1]).ljust(9), '|',
    "{0:.5f}".format(value[2]).ljust(9), '|',)

    # output: Count, Timing in Seconds.
    # Sample Output: Count, Timing in Seconds.
    # Count | FlashText | Regex
    # ------------------------------
    # 0 | 0.00118 | 0.00079 |
    # 1000 | 0.00149 | 0.00944 |
    # 2000 | 0.00186 | 0.02233 |
    # 3000 | 0.00160 | 0.02909 |
    # 4000 | 0.00171 | 0.03714 |
    # 5000 | 0.00166 | 0.05882 |
    # 6000 | 0.00212 | 0.06964 |
    # 7000 | 0.00174 | 0.09775 |
    # 8000 | 0.00169 | 0.08188 |
    # 9000 | 0.00206 | 0.09672 |
    # 10000 | 0.00214 | 0.11071 |
    # 11000 | 0.00242 | 0.10869 |
    # 12000 | 0.00221 | 0.12309 |
    # 13000 | 0.00246 | 0.14295 |
    # 14000 | 0.00188 | 0.15157 |
    # 15000 | 0.00198 | 0.15419 |
    # 16000 | 0.00207 | 0.17049 |
    # 17000 | 0.00236 | 0.17179 |
    # 18000 | 0.00253 | 0.20256 |
    # 19000 | 0.00223 | 0.19117 |
  6. vi3k6i5 revised this gist Oct 3, 2017. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion flashtext_regex_timing_keyword_extraction.py
    Original file line number Diff line number Diff line change
    @@ -52,7 +52,7 @@ def get_word_of_length(str_length):
    "{0:.5f}".format(value[1]).ljust(9), '|',
    "{0:.5f}".format(value[2]).ljust(9), '|',)

    ## output:
    # output: Count, Timing in Seconds.
    # Count | FlashText | Regex
    # ------------------------------
    # 0 | 0.00118 | 0.00079 |
  7. vi3k6i5 revised this gist Oct 3, 2017. 1 changed file with 20 additions and 20 deletions.
    40 changes: 20 additions & 20 deletions flashtext_regex_timing_keyword_extraction.py
    Original file line number Diff line number Diff line change
    @@ -55,23 +55,23 @@ def get_word_of_length(str_length):
    ## output:
    # Count | FlashText | Regex
    # ------------------------------
    # 0 | 0.00117 | 0.00074 |
    # 1000 | 0.00154 | 0.00922 |
    # 1993 | 0.00151 | 0.01851 |
    # 2989 | 0.00155 | 0.03045 |
    # 3990 | 0.00158 | 0.03659 |
    # 4981 | 0.00165 | 0.05268 |
    # 5984 | 0.00172 | 0.05781 |
    # 6964 | 0.00175 | 0.06641 |
    # 7955 | 0.00246 | 0.08042 |
    # 8940 | 0.00178 | 0.08879 |
    # 9927 | 0.00182 | 0.09536 |
    # 10911 | 0.00174 | 0.10961 |
    # 11888 | 0.00180 | 0.11821 |
    # 12880 | 0.00181 | 0.12703 |
    # 13871 | 0.00191 | 0.13219 |
    # 14855 | 0.00232 | 0.14235 |
    # 15808 | 0.00191 | 0.14962 |
    # 16786 | 0.00188 | 0.16094 |
    # 17749 | 0.00219 | 0.17573 |
    # 18728 | 0.00190 | 0.17997 |
    # 0 | 0.00118 | 0.00079 |
    # 1000 | 0.00149 | 0.00944 |
    # 2000 | 0.00186 | 0.02233 |
    # 3000 | 0.00160 | 0.02909 |
    # 4000 | 0.00171 | 0.03714 |
    # 5000 | 0.00166 | 0.05882 |
    # 6000 | 0.00212 | 0.06964 |
    # 7000 | 0.00174 | 0.09775 |
    # 8000 | 0.00169 | 0.08188 |
    # 9000 | 0.00206 | 0.09672 |
    # 10000 | 0.00214 | 0.11071 |
    # 11000 | 0.00242 | 0.10869 |
    # 12000 | 0.00221 | 0.12309 |
    # 13000 | 0.00246 | 0.14295 |
    # 14000 | 0.00188 | 0.15157 |
    # 15000 | 0.00198 | 0.15419 |
    # 16000 | 0.00207 | 0.17049 |
    # 17000 | 0.00236 | 0.17179 |
    # 18000 | 0.00253 | 0.20256 |
    # 19000 | 0.00223 | 0.19117 |
  8. vi3k6i5 revised this gist Oct 3, 2017. 1 changed file with 22 additions and 22 deletions.
    44 changes: 22 additions & 22 deletions flashtext_regex_timing_keyword_extraction.py
    Original file line number Diff line number Diff line change
    @@ -53,25 +53,25 @@ def get_word_of_length(str_length):
    "{0:.5f}".format(value[2]).ljust(9), '|',)

    ## output:
    Count | FlashText | Regex
    ------------------------------
    0 | 0.00117 | 0.00074 |
    1000 | 0.00154 | 0.00922 |
    1993 | 0.00151 | 0.01851 |
    2989 | 0.00155 | 0.03045 |
    3990 | 0.00158 | 0.03659 |
    4981 | 0.00165 | 0.05268 |
    5984 | 0.00172 | 0.05781 |
    6964 | 0.00175 | 0.06641 |
    7955 | 0.00246 | 0.08042 |
    8940 | 0.00178 | 0.08879 |
    9927 | 0.00182 | 0.09536 |
    10911 | 0.00174 | 0.10961 |
    11888 | 0.00180 | 0.11821 |
    12880 | 0.00181 | 0.12703 |
    13871 | 0.00191 | 0.13219 |
    14855 | 0.00232 | 0.14235 |
    15808 | 0.00191 | 0.14962 |
    16786 | 0.00188 | 0.16094 |
    17749 | 0.00219 | 0.17573 |
    18728 | 0.00190 | 0.17997 |
    # Count | FlashText | Regex
    # ------------------------------
    # 0 | 0.00117 | 0.00074 |
    # 1000 | 0.00154 | 0.00922 |
    # 1993 | 0.00151 | 0.01851 |
    # 2989 | 0.00155 | 0.03045 |
    # 3990 | 0.00158 | 0.03659 |
    # 4981 | 0.00165 | 0.05268 |
    # 5984 | 0.00172 | 0.05781 |
    # 6964 | 0.00175 | 0.06641 |
    # 7955 | 0.00246 | 0.08042 |
    # 8940 | 0.00178 | 0.08879 |
    # 9927 | 0.00182 | 0.09536 |
    # 10911 | 0.00174 | 0.10961 |
    # 11888 | 0.00180 | 0.11821 |
    # 12880 | 0.00181 | 0.12703 |
    # 13871 | 0.00191 | 0.13219 |
    # 14855 | 0.00232 | 0.14235 |
    # 15808 | 0.00191 | 0.14962 |
    # 16786 | 0.00188 | 0.16094 |
    # 17749 | 0.00219 | 0.17573 |
    # 18728 | 0.00190 | 0.17997 |
  9. vi3k6i5 created this gist Oct 3, 2017.
    77 changes: 77 additions & 0 deletions flashtext_regex_timing_keyword_extraction.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,77 @@
    #!/bin/python
    from flashtext.keyword import KeywordProcessor
    import random
    import string
    import re
    import time


    def get_word_of_length(str_length, alphabet=string.ascii_lowercase):
        """Generate a random word of the given length.

        Args:
            str_length: Number of characters in the generated word. Zero or a
                negative value yields an empty string.
            alphabet: Non-empty sequence of characters to draw from. Defaults
                to the lowercase ASCII letters.

        Returns:
            A string of ``str_length`` characters, each picked independently
            from ``alphabet`` with ``random.choice``.
        """
        return ''.join(random.choice(alphabet) for _ in range(str_length))

    # generate a list of 100K words of randomly chosen size
    all_words = [get_word_of_length(random.choice([3, 4, 5, 6, 7, 8])) for _ in range(100000)]

    # one (keyword count, FlashText seconds, Regex seconds) tuple per run
    string_length_to_time_map = []

    # Run the benchmark from 0 to 20K keywords at 1K interval.
    for keywords_length in range(0, 20000, 1000):
        # choose 1000 terms and create a string to search in.
        all_words_chosen = random.sample(all_words, 1000)
        story = ' '.join(all_words_chosen)

        # get unique keywords from the list of words generated; set() drops
        # duplicates, so this can hold fewer than keywords_length entries.
        unique_keywords_sublist = list(set(random.sample(all_words, keywords_length)))

        # compile regex: one word-bounded alternative per keyword.
        if unique_keywords_sublist:
            pattern = '|'.join(r'\b' + keyword + r'\b' for keyword in unique_keywords_sublist)
        else:
            # Joining zero keywords gives the empty pattern, which matches at
            # every position; use a never-matching pattern so the zero-keyword
            # run finds nothing.
            pattern = r'(?!)'
        compiled_re = re.compile(pattern)

        # add keywords to flashtext
        keyword_processor = KeywordProcessor()
        keyword_processor.add_keywords_from_list(unique_keywords_sublist)

        # time the modules; perf_counter is a monotonic, high-resolution clock
        # intended for measuring short durations, unlike wall-clock time.time().
        start = time.perf_counter()
        _ = keyword_processor.extract_keywords(story)
        mid = time.perf_counter()
        _ = compiled_re.findall(story)
        end = time.perf_counter()

        # add results to the list as (Count, FlashText time, Regex time)
        string_length_to_time_map.append((len(unique_keywords_sublist), mid - start, end - mid))

    # print the collected timings as a table, one row per benchmark run
    print('Count | FlashText | Regex ')
    print('------------------------------')
    for value in string_length_to_time_map:
        print(str(value[0]).ljust(6), '|',
              "{0:.5f}".format(value[1]).ljust(9), '|',
              "{0:.5f}".format(value[2]).ljust(9), '|',)

    ## output:
    Count | FlashText | Regex
    ------------------------------
    0 | 0.00117 | 0.00074 |
    1000 | 0.00154 | 0.00922 |
    1993 | 0.00151 | 0.01851 |
    2989 | 0.00155 | 0.03045 |
    3990 | 0.00158 | 0.03659 |
    4981 | 0.00165 | 0.05268 |
    5984 | 0.00172 | 0.05781 |
    6964 | 0.00175 | 0.06641 |
    7955 | 0.00246 | 0.08042 |
    8940 | 0.00178 | 0.08879 |
    9927 | 0.00182 | 0.09536 |
    10911 | 0.00174 | 0.10961 |
    11888 | 0.00180 | 0.11821 |
    12880 | 0.00181 | 0.12703 |
    13871 | 0.00191 | 0.13219 |
    14855 | 0.00232 | 0.14235 |
    15808 | 0.00191 | 0.14962 |
    16786 | 0.00188 | 0.16094 |
    17749 | 0.00219 | 0.17573 |
    18728 | 0.00190 | 0.17997 |