Skip to content

Instantly share code, notes, and snippets.

@benjaminalt
Created June 26, 2016 10:52
Show Gist options
  • Select an option

  • Save benjaminalt/b87fd68e542a68e02877e1e833014444 to your computer and use it in GitHub Desktop.

Select an option

Save benjaminalt/b87fd68e542a68e02877e1e833014444 to your computer and use it in GitHub Desktop.
Edit distance (dynamic programming)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Author: Benjamin Alt (benjamin_alt@outlook.com)
Calculate the character-level edit distance between a reference string and one or multiple hypotheses.
"""
from __future__ import print_function
REFERENCE = "wenn es im Juni viel donnert kommt ein trüber Sommer"
HYPOTHESES = ["im Juni viel Sonne kommt einen trüberen Sommer",
"viel Donner im Juni einen trüben Sommer bringt",
"Juni Donner einen Sommer",
"im Juni viel Donner bringt einen trüben Sommer",
"wenns im Juno viel Donner gibts einen trüben Sommer"]
SUBSTITUTION_PENALTY = 1
INSERTION_PENALTY = 1
DELETION_PENALTY = 1
def calculate_char_edit_distance(hypothesis, reference):
"""Return the word edit distance between the hypothesis and the reference."""
M = len(reference)
N = len(hypothesis)
# Initialize the matrix
matrix = [[0 for x in range(M + 1)] for y in range(N + 1)] # M + 1 columns, N + 1 rows
for index, row in enumerate(matrix):
row[0] = index
for index, col in enumerate(matrix[0]):
matrix[0][index] = index
# Fill the matrix fields with the appropriate score
for row in range(1, len(matrix)):
for col in range(1, len(matrix[row])):
diagnonal_score = matrix[row - 1][col - 1]
left_score = matrix[row][col - 1]
top_score = matrix[row - 1][col]
if reference[col - 1] == hypothesis[row - 1]:
matrix[row][col] = min([diagnonal_score, left_score, top_score])
else:
matrix[row][col] = min([diagnonal_score + SUBSTITUTION_PENALTY, left_score + DELETION_PENALTY, top_score + INSERTION_PENALTY])
print_matrix(matrix, hypothesis, reference)
# The bottom right entry is the edit distance
return matrix[N][M]
def print_matrix(matrix, hypothesis, reference):
"""Print the dynamic programming matrix."""
header_string = " "
for ref_char in reference:
header_string += "%4s" % ref_char
print(header_string)
hypothesis = " " + hypothesis
for index, row in enumerate(matrix):
row_string = "%4s" % hypothesis[index]
for col in row:
row_string += "%4d" % col
print(row_string)
if __name__ == '__main__':
for hypothesis in HYPOTHESES:
edit_distance = calculate_char_edit_distance(hypothesis, REFERENCE)
print("\nEdit distance: {}\n".format(edit_distance))
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Author: Benjamin Alt (benjamin_alt@outlook.com)
Calculate the word-level edit distance between a reference string and one or multiple hypotheses.
"""
from __future__ import print_function
REFERENCE = "wenn es im Juni viel donnert kommt ein trüber Sommer"
HYPOTHESES = ["im Juni viel Sonne kommt einen trüberen Sommer",
"viel Donner im Juni einen trüben Sommer bringt",
"Juni Donner einen Sommer",
"im Juni viel Donner bringt einen trüben Sommer",
"wenns im Juno viel Donner gibts einen trüben Sommer"]
SUBSTITUTION_PENALTY = 1
INSERTION_PENALTY = 1
DELETION_PENALTY = 1
def calculate_word_edit_distance(hypothesis, reference):
"""Return the word edit distance between the hypothesis and the reference."""
hyp_words = hypothesis.split()
ref_words = reference.split()
M = len(ref_words)
N = len(hyp_words)
# Initialize the matrix
matrix = [[0 for x in range(M + 1)] for y in range(N + 1)] # M + 1 columns, N + 1 rows
for index, row in enumerate(matrix):
row[0] = index
for index, col in enumerate(matrix[0]):
matrix[0][index] = index
# Fill the matrix fields with the appropriate score
for row in range(1, len(matrix)):
for col in range(1, len(matrix[row])):
diagnonal_score = matrix[row - 1][col - 1]
left_score = matrix[row][col - 1]
top_score = matrix[row - 1][col]
if ref_words[col - 1] == hyp_words[row - 1]:
matrix[row][col] = min([diagnonal_score, left_score, top_score])
else:
matrix[row][col] = min([diagnonal_score + SUBSTITUTION_PENALTY, left_score + DELETION_PENALTY, top_score + INSERTION_PENALTY])
print_matrix(matrix, hyp_words, ref_words)
# The bottom right entry is the edit distance
return matrix[N][M]
def print_matrix(matrix, hyp_words, ref_words):
"""Print the dynamic programming matrix with hypothesis hyp_words and reference ref_words."""
header_string = " "
for ref_word in ref_words:
header_string += "%10s" % ref_word
print(header_string)
hyp_words = [""] + hyp_words
for index, row in enumerate(matrix):
row_string = "%10s" % hyp_words[index]
for col in row:
row_string += "%10d" % col
print(row_string)
if __name__ == '__main__':
for hypothesis in HYPOTHESES:
edit_distance = calculate_word_edit_distance(hypothesis, REFERENCE)
print("\nEdit distance: {}\n".format(edit_distance))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment