kphetrungnapha · April 6, 2019 11:17 · Apr 6, 2019 · Apr 6, 2019
diff --git a/find_similarity_threshold.md b/find_similarity_threshold.md
@@ -1,5 +1,3 @@
-
-
 ```python
 import pandas as pd
 import numpy as np
@@ -385,6 +383,4 @@ for threshold in thresholds:
     false positive: 0
     accuracy: 0.9867021276595744
 
-    ======================================
-
-
+    ======================================
diff --git a/find_similarity_threshold.md b/find_similarity_threshold.md
@@ -0,0 +1,390 @@
+
+
+```python
+import pandas as pd
+import numpy as np
+import json
+import spacy
+import nltk
+import string
+import re
+```
+
+
+```python
+# Load the manually labelled ground truth: sheet 'Sheet1', with the first column used as the row index
+data_xlsx = pd.read_excel('./similarity_manually_label.xlsx', 'Sheet1', index_col=0)  # path is relative to the working directory
+actual_matrix = np.array(data_xlsx.values)  # copied into an ndarray; read later as actual_matrix[i][j] (i = base row, j = test column)
+```
+
+
+```python
+# Import base and test data (two JSON files, paths relative to the working directory)
+with open('./text_similarity_base.json') as data_file:  # NOTE(review): no encoding= is passed, so the platform default is used
+    text_similarity_base = json.load(data_file)  # parsed JSON for the base set
+
+with open('./text_similarity_test.json') as data_file:  # same pattern for the test set
+    text_similarity_test = json.load(data_file)  # parsed JSON for the test set
+```
+
+
+```python
+# Create base and test data frames (a 'text' column is expected: it is read when the text is normalised)
+base_df = pd.DataFrame.from_dict(text_similarity_base, orient='columns')  # orient='columns': dict keys become column names
+test_df = pd.DataFrame.from_dict(text_similarity_test, orient='columns')  # built the same way so both frames share a layout
+```
+
+
+```python
+nltk.download('punkt')  # fetch the 'punkt' tokenizer data; NOTE(review): the 'stopwords' corpus read further down is not downloaded here - presumably already installed, confirm
+```
+
+    [nltk_data] Downloading package punkt to /Users/kittisakp/nltk_data...
+    [nltk_data]   Package punkt is already up-to-date!
+
+
+
+
+
+    True
+
+
+
+
+```python
+# Shared state for the text pre-processing functions below
+stemmer = nltk.stem.porter.PorterStemmer()  # single stemmer instance reused by stem_tokens
+remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)  # code point -> None table; str.translate deletes characters mapped to None
+stopwords = nltk.corpus.stopwords.words('english')  # NOTE(review): this is a list, so each `in` test in remove_stopwords scans it linearly
+
+def tokenize(text):  # split a text string into a list of word tokens
+    return nltk.word_tokenize(text)  # thin wrapper over NLTK's word tokenizer
+
+def stem_tokens(tokens):  # return a new list with every token stemmed
+    return [stemmer.stem(item) for item in tokens]  # uses the module-level PorterStemmer; order and length are preserved
+
+def remove_stopwords(tokens):  # return a new list without the tokens found in the stop-word list
+    return [item for item in tokens if item not in stopwords]  # exact, case-sensitive match against the module-level `stopwords`
+
+def keep_alphabetic(tokens):  # return a new list holding only purely alphabetic tokens
+    return [item for item in tokens if item.isalpha()]  # str.isalpha() is False for empty strings and for tokens containing digits or symbols
+
+def reduce_lengthening(tokens):  # squeeze runs of a repeated character in each token down to two
+    pattern = re.compile(r"(.)\1{2,}")  # any character followed by two or more copies of itself, i.e. a run of 3+
+    return [pattern.sub(r"\1\1", item) for item in tokens]  # replace each such run with exactly two copies of the character
+
+'''lowercase, punctuation, remove stopwords, only alphabetic, reduce lengthening, stem'''  # NOTE: bare string placed before the def, so it is not normalize's docstring
+def normalize(text):  # return the cleaned text as a single space-joined string
+    lower_text_without_punctuation = text.lower().translate(remove_punctuation_map)  # lowercase, then delete every string.punctuation character
+    return ' '.join(  # the calls are nested, so they run innermost-first (bottom line to top)
+                stem_tokens(  # 5. Porter-stem each token
+                reduce_lengthening(  # 4. cap repeated characters at two
+                keep_alphabetic(  # 3. drop tokens that are not purely alphabetic
+                remove_stopwords(  # 2. drop stop words; NOTE(review): punctuation is already gone here, so contractions such as "dont" presumably no longer match the list
+                tokenize(  # 1. split into word tokens
+                lower_text_without_punctuation))))))
+```
+
+
+```python
+# Text cleansing: add a 'normalized_text' column to both frames, computed from their 'text' column
+base_df['normalized_text'] = base_df['text'].apply(lambda text: normalize(text))  # normalize() is applied row by row
+test_df['normalized_text'] = test_df['text'].apply(lambda text: normalize(text))  # same pipeline, so both sides are comparable
+```
+
+
+```python
+nlp = spacy.load('en_vectors_web_lg')  # spaCy pipeline used to build the docs compared with .similarity() in calculate_similarity; the package must be installed separately
+```
+
+
+```python
+# Define constants: candidate similarity cut-offs plus the sentence arrays and their sizes
+thresholds = [  # each value is tested with `similarity >= threshold`; the grid gets finer towards 1
+    0,
+    0.125,
+    0.25,
+    0.375,
+    0.5,
+    0.625,
+    0.75,
+    0.8,
+    0.825,
+    0.85,
+    0.875,
+    0.9,
+    0.91,
+    0.92,
+    0.93,
+    0.94,
+    0.95,
+    0.96,
+    0.97,
+    0.98,
+    0.99,
+    1
+]
+
+base_sentences = base_df['normalized_text'].values  # normalised base texts as an array
+test_sentences = test_df['normalized_text'].values  # normalised test texts as an array
+base_count = len(base_sentences)  # number of rows in the prediction matrix
+test_count = len(test_sentences)  # number of columns in the prediction matrix
+```
+
+
+```python
+def calculate_similarity(threshold):  # score one threshold against the manual labels and print the confusion counts; returns None
+    predict_matrix = np.array([[None for j in range(test_count)] for i in range(base_count)])  # base_count x test_count grid of None placeholders, filled below
+    tp_count = 0  # actual 1, predicted 1
+    tn_count = 0  # actual 0, predicted 0
+    fp_count = 0  # actual 0, predicted 1
+    fn_count = 0  # actual 1, predicted 0
+
+    # Prepare predict data: one 0/1 prediction per (base, test) sentence pair
+    for base_index, base_value in enumerate(base_sentences):
+        base_doc = nlp(base_value)  # built once per base sentence
+
+        for test_index, test_value in enumerate(test_sentences):
+            test_doc = nlp(test_value)  # NOTE(review): rebuilt for every base sentence and on every call, although it depends on neither
+            similarity = test_doc.similarity(base_doc)  # similarity score between the two docs, as reported by spaCy
+
+            if similarity >= threshold:  # inclusive comparison: a score equal to the threshold counts as duplicate
+                predict_matrix[base_index][test_index] = 1 # 1 means duplicate
+            else:
+                predict_matrix[base_index][test_index] = 0 # 0 means non-duplicate
+
+    # Calculate result: compare every prediction with the manual label at the same position
+    for i in range(base_count):
+        for j in range(test_count):
+            actual = actual_matrix[i][j]  # manual label for this pair
+            predict = predict_matrix[i][j]  # prediction made above
+
+            if actual == 0 and predict == 0: # true negative
+                tn_count += 1
+            elif actual == 1 and predict == 1: # true positive
+                tp_count += 1
+            elif actual == 1 and predict == 0: # false negative
+                fn_count += 1
+            elif actual == 0 and predict == 1: # false positive
+                fp_count += 1
+
+    accuracy = (tn_count + tp_count) / (tn_count + tp_count + fn_count + fp_count)  # labels other than 0/1 match no branch above and are left out; ZeroDivisionError if nothing was counted
+
+    print("threshold:", threshold)
+    print("true negative:", tn_count)
+    print("true position:", tp_count)  # NOTE(review): the label reads "position"; the value is the true-positive count
+    print("false negative:", fn_count)
+    print("false positive:", fp_count)
+    print("accuracy:", accuracy)
+    print("\n======================================\n")  # separator between per-threshold reports
+```
+
+
+```python
+print("Base count: %d, Test count: %d, Total = %d\n" % (base_count, test_count, base_count * test_count))  # Total = number of base/test pairs scored per threshold
+
+for threshold in thresholds:  # one printed report per candidate threshold, in list order
+    calculate_similarity(threshold)  # prints its results; nothing is returned or collected
+```
+
+    Base count: 94, Test count: 20, Total = 1880
+
+    threshold: 0
+    true negative: 0
+    true position: 25
+    false negative: 0
+    false positive: 1855
+    accuracy: 0.013297872340425532
+
+    ======================================
+
+    threshold: 0.125
+    true negative: 20
+    true position: 25
+    false negative: 0
+    false positive: 1835
+    accuracy: 0.023936170212765957
+
+    ======================================
+
+    threshold: 0.25
+    true negative: 22
+    true position: 25
+    false negative: 0
+    false positive: 1833
+    accuracy: 0.025
+
+    ======================================
+
+    threshold: 0.375
+    true negative: 44
+    true position: 24
+    false negative: 1
+    false positive: 1811
+    accuracy: 0.036170212765957444
+
+    ======================================
+
+    threshold: 0.5
+    true negative: 138
+    true position: 23
+    false negative: 2
+    false positive: 1717
+    accuracy: 0.08563829787234042
+
+    ======================================
+
+    threshold: 0.625
+    true negative: 452
+    true position: 20
+    false negative: 5
+    false positive: 1403
+    accuracy: 0.251063829787234
+
+    ======================================
+
+    threshold: 0.75
+    true negative: 1198
+    true position: 14
+    false negative: 11
+    false positive: 657
+    accuracy: 0.6446808510638298
+
+    ======================================
+
+    threshold: 0.8
+    true negative: 1514
+    true position: 10
+    false negative: 15
+    false positive: 341
+    accuracy: 0.8106382978723404
+
+    ======================================
+
+    threshold: 0.825
+    true negative: 1633
+    true position: 6
+    false negative: 19
+    false positive: 222
+    accuracy: 0.8718085106382979
+
+    ======================================
+
+    threshold: 0.85
+    true negative: 1735
+    true position: 3
+    false negative: 22
+    false positive: 120
+    accuracy: 0.924468085106383
+
+    ======================================
+
+    threshold: 0.875
+    true negative: 1806
+    true position: 0
+    false negative: 25
+    false positive: 49
+    accuracy: 0.9606382978723405
+
+    ======================================
+
+    threshold: 0.9
+    true negative: 1843
+    true position: 0
+    false negative: 25
+    false positive: 12
+    accuracy: 0.9803191489361702
+
+    ======================================
+
+    threshold: 0.91
+    true negative: 1850
+    true position: 0
+    false negative: 25
+    false positive: 5
+    accuracy: 0.9840425531914894
+
+    ======================================
+
+    threshold: 0.92
+    true negative: 1851
+    true position: 0
+    false negative: 25
+    false positive: 4
+    accuracy: 0.9845744680851064
+
+    ======================================
+
+    threshold: 0.93
+    true negative: 1855
+    true position: 0
+    false negative: 25
+    false positive: 0
+    accuracy: 0.9867021276595744
+
+    ======================================
+
+    threshold: 0.94
+    true negative: 1855
+    true position: 0
+    false negative: 25
+    false positive: 0
+    accuracy: 0.9867021276595744
+
+    ======================================
+
+    threshold: 0.95
+    true negative: 1855
+    true position: 0
+    false negative: 25
+    false positive: 0
+    accuracy: 0.9867021276595744
+
+    ======================================
+
+    threshold: 0.96
+    true negative: 1855
+    true position: 0
+    false negative: 25
+    false positive: 0
+    accuracy: 0.9867021276595744
+
+    ======================================
+
+    threshold: 0.97
+    true negative: 1855
+    true position: 0
+    false negative: 25
+    false positive: 0
+    accuracy: 0.9867021276595744
+
+    ======================================
+
+    threshold: 0.98
+    true negative: 1855
+    true position: 0
+    false negative: 25
+    false positive: 0
+    accuracy: 0.9867021276595744
+
+    ======================================
+
+    threshold: 0.99
+    true negative: 1855
+    true position: 0
+    false negative: 25
+    false positive: 0
+    accuracy: 0.9867021276595744
+
+    ======================================
+
+    threshold: 1
+    true negative: 1855
+    true position: 0
+    false negative: 25
+    false positive: 0
+    accuracy: 0.9867021276595744
+
+    ======================================
+
+
No results found