Created
June 26, 2016 10:52
-
-
Save benjaminalt/b87fd68e542a68e02877e1e833014444 to your computer and use it in GitHub Desktop.
Edit distance (dynamic programming)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python | |
| # -*- coding: utf-8 -*- | |
| """ | |
| Author: Benjamin Alt (benjamin_alt@outlook.com) | |
| Calculate the character-level edit distance between a reference string and one or multiple hypotheses. | |
| """ | |
| from __future__ import print_function | |
| REFERENCE = "wenn es im Juni viel donnert kommt ein trüber Sommer" | |
| HYPOTHESES = ["im Juni viel Sonne kommt einen trüberen Sommer", | |
| "viel Donner im Juni einen trüben Sommer bringt", | |
| "Juni Donner einen Sommer", | |
| "im Juni viel Donner bringt einen trüben Sommer", | |
| "wenns im Juno viel Donner gibts einen trüben Sommer"] | |
| SUBSTITUTION_PENALTY = 1 | |
| INSERTION_PENALTY = 1 | |
| DELETION_PENALTY = 1 | |
| def calculate_char_edit_distance(hypothesis, reference): | |
| """Return the word edit distance between the hypothesis and the reference.""" | |
| M = len(reference) | |
| N = len(hypothesis) | |
| # Initialize the matrix | |
| matrix = [[0 for x in range(M + 1)] for y in range(N + 1)] # M + 1 columns, N + 1 rows | |
| for index, row in enumerate(matrix): | |
| row[0] = index | |
| for index, col in enumerate(matrix[0]): | |
| matrix[0][index] = index | |
| # Fill the matrix fields with the appropriate score | |
| for row in range(1, len(matrix)): | |
| for col in range(1, len(matrix[row])): | |
| diagnonal_score = matrix[row - 1][col - 1] | |
| left_score = matrix[row][col - 1] | |
| top_score = matrix[row - 1][col] | |
| if reference[col - 1] == hypothesis[row - 1]: | |
| matrix[row][col] = min([diagnonal_score, left_score, top_score]) | |
| else: | |
| matrix[row][col] = min([diagnonal_score + SUBSTITUTION_PENALTY, left_score + DELETION_PENALTY, top_score + INSERTION_PENALTY]) | |
| print_matrix(matrix, hypothesis, reference) | |
| # The bottom right entry is the edit distance | |
| return matrix[N][M] | |
| def print_matrix(matrix, hypothesis, reference): | |
| """Print the dynamic programming matrix.""" | |
| header_string = " " | |
| for ref_char in reference: | |
| header_string += "%4s" % ref_char | |
| print(header_string) | |
| hypothesis = " " + hypothesis | |
| for index, row in enumerate(matrix): | |
| row_string = "%4s" % hypothesis[index] | |
| for col in row: | |
| row_string += "%4d" % col | |
| print(row_string) | |
| if __name__ == '__main__': | |
| for hypothesis in HYPOTHESES: | |
| edit_distance = calculate_char_edit_distance(hypothesis, REFERENCE) | |
| print("\nEdit distance: {}\n".format(edit_distance)) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python | |
| # -*- coding: utf-8 -*- | |
| """ | |
| Author: Benjamin Alt (benjamin_alt@outlook.com) | |
| Calculate the word-level edit distance between a reference string and one or multiple hypotheses. | |
| """ | |
| from __future__ import print_function | |
| REFERENCE = "wenn es im Juni viel donnert kommt ein trüber Sommer" | |
| HYPOTHESES = ["im Juni viel Sonne kommt einen trüberen Sommer", | |
| "viel Donner im Juni einen trüben Sommer bringt", | |
| "Juni Donner einen Sommer", | |
| "im Juni viel Donner bringt einen trüben Sommer", | |
| "wenns im Juno viel Donner gibts einen trüben Sommer"] | |
| SUBSTITUTION_PENALTY = 1 | |
| INSERTION_PENALTY = 1 | |
| DELETION_PENALTY = 1 | |
| def calculate_word_edit_distance(hypothesis, reference): | |
| """Return the word edit distance between the hypothesis and the reference.""" | |
| hyp_words = hypothesis.split() | |
| ref_words = reference.split() | |
| M = len(ref_words) | |
| N = len(hyp_words) | |
| # Initialize the matrix | |
| matrix = [[0 for x in range(M + 1)] for y in range(N + 1)] # M + 1 columns, N + 1 rows | |
| for index, row in enumerate(matrix): | |
| row[0] = index | |
| for index, col in enumerate(matrix[0]): | |
| matrix[0][index] = index | |
| # Fill the matrix fields with the appropriate score | |
| for row in range(1, len(matrix)): | |
| for col in range(1, len(matrix[row])): | |
| diagnonal_score = matrix[row - 1][col - 1] | |
| left_score = matrix[row][col - 1] | |
| top_score = matrix[row - 1][col] | |
| if ref_words[col - 1] == hyp_words[row - 1]: | |
| matrix[row][col] = min([diagnonal_score, left_score, top_score]) | |
| else: | |
| matrix[row][col] = min([diagnonal_score + SUBSTITUTION_PENALTY, left_score + DELETION_PENALTY, top_score + INSERTION_PENALTY]) | |
| print_matrix(matrix, hyp_words, ref_words) | |
| # The bottom right entry is the edit distance | |
| return matrix[N][M] | |
| def print_matrix(matrix, hyp_words, ref_words): | |
| """Print the dynamic programming matrix with hypothesis hyp_words and reference ref_words.""" | |
| header_string = " " | |
| for ref_word in ref_words: | |
| header_string += "%10s" % ref_word | |
| print(header_string) | |
| hyp_words = [""] + hyp_words | |
| for index, row in enumerate(matrix): | |
| row_string = "%10s" % hyp_words[index] | |
| for col in row: | |
| row_string += "%10d" % col | |
| print(row_string) | |
| if __name__ == '__main__': | |
| for hypothesis in HYPOTHESES: | |
| edit_distance = calculate_word_edit_distance(hypothesis, REFERENCE) | |
| print("\nEdit distance: {}\n".format(edit_distance)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment