Created
April 6, 2019 11:17
-
-
Save kphetrungnapha/c8e3bf3e508146a43898e80010a28247 to your computer and use it in GitHub Desktop.
Revisions
-
Kittisak Phetrungnapha revised this gist
Apr 6, 2019 . 1 changed file with 1 addition and 5 deletions. There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters. Original file line number Diff line number Diff line change @@ -1,5 +1,3 @@ ```python import pandas as pd import numpy as np @@ -385,6 +383,4 @@ for threshold in thresholds: false positive: 0 accuracy: 0.9867021276595744 ====================================== -
Kittisak Phetrungnapha created this gist
Apr 6, 2019 . There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,390 @@ ```python import pandas as pd import numpy as np import json import spacy import nltk import string import re ``` ```python # Prepare actual similarity data data_xlsx = pd.read_excel('./similarity_manually_label.xlsx', 'Sheet1', index_col=0) actual_matrix = np.array(data_xlsx.values) ``` ```python # Import base and test data with open('./text_similarity_base.json') as data_file: text_similarity_base = json.load(data_file) with open('./text_similarity_test.json') as data_file: text_similarity_test = json.load(data_file) ``` ```python # Create base and test data frames base_df = pd.DataFrame.from_dict(text_similarity_base, orient='columns') test_df = pd.DataFrame.from_dict(text_similarity_test, orient='columns') ``` ```python nltk.download('punkt') ``` [nltk_data] Downloading package punkt to /Users/kittisakp/nltk_data... [nltk_data] Package punkt is already up-to-date! 
True ```python # Text pre-processing functions stemmer = nltk.stem.porter.PorterStemmer() remove_punctuation_map = dict((ord(char), None) for char in string.punctuation) stopwords = nltk.corpus.stopwords.words('english') def tokenize(text): return nltk.word_tokenize(text) def stem_tokens(tokens): return [stemmer.stem(item) for item in tokens] def remove_stopwords(tokens): return [item for item in tokens if item not in stopwords] def keep_alphabetic(tokens): return [item for item in tokens if item.isalpha()] def reduce_lengthening(tokens): pattern = re.compile(r"(.)\1{2,}") return [pattern.sub(r"\1\1", item) for item in tokens] '''lowercase, punctuation, remove stopwords, only alphabetic, reduce lengthening, stem''' def normalize(text): lower_text_without_punctuation = text.lower().translate(remove_punctuation_map) return ' '.join( stem_tokens( reduce_lengthening( keep_alphabetic( remove_stopwords( tokenize( lower_text_without_punctuation)))))) ``` ```python # Text cleansing base_df['normalized_text'] = base_df['text'].apply(lambda text: normalize(text)) test_df['normalized_text'] = test_df['text'].apply(lambda text: normalize(text)) ``` ```python nlp = spacy.load('en_vectors_web_lg') ``` ```python # Define constants thresholds = [ 0, 0.125, 0.25, 0.375, 0.5, 0.625, 0.75, 0.8, 0.825, 0.85, 0.875, 0.9, 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98, 0.99, 1 ] base_sentences = base_df['normalized_text'].values test_sentences = test_df['normalized_text'].values base_count = len(base_sentences) test_count = len(test_sentences) ``` ```python def calculate_similarity(threshold): predict_matrix = np.array([[None for j in range(test_count)] for i in range(base_count)]) tp_count = 0 tn_count = 0 fp_count = 0 fn_count = 0 # Prepare predict data for base_index, base_value in enumerate(base_sentences): base_doc = nlp(base_value) for test_index, test_value in enumerate(test_sentences): test_doc = nlp(test_value) similarity = test_doc.similarity(base_doc) if similarity >= 
threshold: predict_matrix[base_index][test_index] = 1 # 1 means duplicate else: predict_matrix[base_index][test_index] = 0 # 0 means non-duplicate # Calculate result for i in range(base_count): for j in range(test_count): actual = actual_matrix[i][j] predict = predict_matrix[i][j] if actual == 0 and predict == 0: # true negative tn_count += 1 elif actual == 1 and predict == 1: # true position tp_count += 1 elif actual == 1 and predict == 0: # false negative fn_count += 1 elif actual == 0 and predict == 1: # false positive fp_count += 1 accuracy = (tn_count + tp_count) / (tn_count + tp_count + fn_count + fp_count) print("threshold:", threshold) print("true negative:", tn_count) print("true position:", tp_count) print("false negative:", fn_count) print("false positive:", fp_count) print("accuracy:", accuracy) print("\n======================================\n") ``` ```python print("Base count: %d, Test count: %d, Total = %d\n" % (base_count, test_count, base_count * test_count)) for threshold in thresholds: calculate_similarity(threshold) ``` Base count: 94, Test count: 20, Total = 1880 threshold: 0 true negative: 0 true position: 25 false negative: 0 false positive: 1855 accuracy: 0.013297872340425532 ====================================== threshold: 0.125 true negative: 20 true position: 25 false negative: 0 false positive: 1835 accuracy: 0.023936170212765957 ====================================== threshold: 0.25 true negative: 22 true position: 25 false negative: 0 false positive: 1833 accuracy: 0.025 ====================================== threshold: 0.375 true negative: 44 true position: 24 false negative: 1 false positive: 1811 accuracy: 0.036170212765957444 ====================================== threshold: 0.5 true negative: 138 true position: 23 false negative: 2 false positive: 1717 accuracy: 0.08563829787234042 ====================================== threshold: 0.625 true negative: 452 true position: 20 false negative: 5 false positive: 1403 accuracy: 
0.251063829787234 ====================================== threshold: 0.75 true negative: 1198 true position: 14 false negative: 11 false positive: 657 accuracy: 0.6446808510638298 ====================================== threshold: 0.8 true negative: 1514 true position: 10 false negative: 15 false positive: 341 accuracy: 0.8106382978723404 ====================================== threshold: 0.825 true negative: 1633 true position: 6 false negative: 19 false positive: 222 accuracy: 0.8718085106382979 ====================================== threshold: 0.85 true negative: 1735 true position: 3 false negative: 22 false positive: 120 accuracy: 0.924468085106383 ====================================== threshold: 0.875 true negative: 1806 true position: 0 false negative: 25 false positive: 49 accuracy: 0.9606382978723405 ====================================== threshold: 0.9 true negative: 1843 true position: 0 false negative: 25 false positive: 12 accuracy: 0.9803191489361702 ====================================== threshold: 0.91 true negative: 1850 true position: 0 false negative: 25 false positive: 5 accuracy: 0.9840425531914894 ====================================== threshold: 0.92 true negative: 1851 true position: 0 false negative: 25 false positive: 4 accuracy: 0.9845744680851064 ====================================== threshold: 0.93 true negative: 1855 true position: 0 false negative: 25 false positive: 0 accuracy: 0.9867021276595744 ====================================== threshold: 0.94 true negative: 1855 true position: 0 false negative: 25 false positive: 0 accuracy: 0.9867021276595744 ====================================== threshold: 0.95 true negative: 1855 true position: 0 false negative: 25 false positive: 0 accuracy: 0.9867021276595744 ====================================== threshold: 0.96 true negative: 1855 true position: 0 false negative: 25 false positive: 0 accuracy: 0.9867021276595744 ====================================== threshold: 0.97 true negative: 1855 
true position: 0 false negative: 25 false positive: 0 accuracy: 0.9867021276595744 ====================================== threshold: 0.98 true negative: 1855 true position: 0 false negative: 25 false positive: 0 accuracy: 0.9867021276595744 ====================================== threshold: 0.99 true negative: 1855 true position: 0 false negative: 25 false positive: 0 accuracy: 0.9867021276595744 ====================================== threshold: 1 true negative: 1855 true position: 0 false negative: 25 false positive: 0 accuracy: 0.9867021276595744 ======================================