benjaminalt · June 26, 2016 10:52
diff --git a/edit_distance_chars.py b/edit_distance_chars.py
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-

 """
 Author: Benjamin Alt (benjamin_alt@outlook.com)

 Calculate the character-level edit distance between a reference string and one or multiple hypotheses.
 """

 from __future__ import print_function

 REFERENCE = "wenn es im Juni viel donnert kommt ein trüber Sommer"
 HYPOTHESES = ["im Juni viel Sonne kommt einen trüberen Sommer",
              "viel Donner im Juni einen trüben Sommer bringt",
              "Juni Donner einen Sommer",
              "im Juni viel Donner bringt einen trüben Sommer",
              "wenns im Juno viel Donner gibts einen trüben Sommer"]
 SUBSTITUTION_PENALTY = 1
 INSERTION_PENALTY = 1
 DELETION_PENALTY = 1

 def calculate_char_edit_distance(hypothesis, reference):
    """Return the word edit distance between the hypothesis and the reference."""
    M = len(reference)
    N = len(hypothesis)
    
    # Initialize the matrix
    matrix = [[0 for x in range(M + 1)] for y in range(N + 1)] # M + 1 columns, N + 1 rows
    for index, row in enumerate(matrix):
        row[0] = index
    for index, col in enumerate(matrix[0]):
        matrix[0][index] = index
    
    # Fill the matrix fields with the appropriate score
    for row in range(1, len(matrix)):
        for col in range(1, len(matrix[row])):
            diagnonal_score = matrix[row - 1][col - 1]
            left_score = matrix[row][col - 1]
            top_score = matrix[row - 1][col]
            if reference[col - 1] == hypothesis[row - 1]:
                matrix[row][col] = min([diagnonal_score, left_score, top_score])
            else:
                matrix[row][col] = min([diagnonal_score + SUBSTITUTION_PENALTY, left_score + DELETION_PENALTY, top_score + INSERTION_PENALTY])
    print_matrix(matrix, hypothesis, reference)
    
    # The bottom right entry is the edit distance
    return matrix[N][M]
    
 def print_matrix(matrix, hypothesis, reference):
    """Print the dynamic programming matrix."""
    header_string = "            "
    for ref_char in reference:
        header_string += "%4s" % ref_char
    print(header_string)
    hypothesis = " " + hypothesis
    for index, row in enumerate(matrix):
        row_string = "%4s" % hypothesis[index]
        for col in row:
            row_string += "%4d" % col
        print(row_string)
        

 if __name__ == '__main__':
    for hypothesis in HYPOTHESES:
        edit_distance = calculate_char_edit_distance(hypothesis, REFERENCE)
        print("\nEdit distance: {}\n".format(edit_distance))
diff --git a/edit_distance_words.py b/edit_distance_words.py
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-

 """
 Author: Benjamin Alt (benjamin_alt@outlook.com)

 Calculate the word-level edit distance between a reference string and one or multiple hypotheses.
 """

 from __future__ import print_function

 REFERENCE = "wenn es im Juni viel donnert kommt ein trüber Sommer"
 HYPOTHESES = ["im Juni viel Sonne kommt einen trüberen Sommer",
              "viel Donner im Juni einen trüben Sommer bringt",
              "Juni Donner einen Sommer",
              "im Juni viel Donner bringt einen trüben Sommer",
              "wenns im Juno viel Donner gibts einen trüben Sommer"]
 SUBSTITUTION_PENALTY = 1
 INSERTION_PENALTY = 1
 DELETION_PENALTY = 1

 def calculate_word_edit_distance(hypothesis, reference):
    """Return the word edit distance between the hypothesis and the reference."""
    hyp_words = hypothesis.split()
    ref_words = reference.split()
    M = len(ref_words)
    N = len(hyp_words)
    
    # Initialize the matrix
    matrix = [[0 for x in range(M + 1)] for y in range(N + 1)] # M + 1 columns, N + 1 rows
    for index, row in enumerate(matrix):
        row[0] = index
    for index, col in enumerate(matrix[0]):
        matrix[0][index] = index
    
    # Fill the matrix fields with the appropriate score
    for row in range(1, len(matrix)):
        for col in range(1, len(matrix[row])):
            diagnonal_score = matrix[row - 1][col - 1]
            left_score = matrix[row][col - 1]
            top_score = matrix[row - 1][col]
            if ref_words[col - 1] == hyp_words[row - 1]:
                matrix[row][col] = min([diagnonal_score, left_score, top_score])
            else:
                matrix[row][col] = min([diagnonal_score + SUBSTITUTION_PENALTY, left_score + DELETION_PENALTY, top_score + INSERTION_PENALTY])
    print_matrix(matrix, hyp_words, ref_words)
    
    # The bottom right entry is the edit distance
    return matrix[N][M]
    
 def print_matrix(matrix, hyp_words, ref_words):
    """Print the dynamic programming matrix with hypothesis hyp_words and reference ref_words."""
    header_string = "                    "
    for ref_word in ref_words:
        header_string += "%10s" % ref_word
    print(header_string)
    hyp_words = [""] + hyp_words
    for index, row in enumerate(matrix):
        row_string = "%10s" % hyp_words[index]
        for col in row:
            row_string += "%10d" % col
        print(row_string)
        

 if __name__ == '__main__':
    for hypothesis in HYPOTHESES:
        edit_distance = calculate_word_edit_distance(hypothesis, REFERENCE)
        print("\nEdit distance: {}\n".format(edit_distance))
	#!/usr/bin/env python
	# -- coding: utf-8 --

	"""
	Author: Benjamin Alt (benjamin_alt@outlook.com)

	Calculate the character-level edit distance between a reference string and one or multiple hypotheses.
	"""

	from __future__ import print_function

	REFERENCE = "wenn es im Juni viel donnert kommt ein trüber Sommer"
	HYPOTHESES = ["im Juni viel Sonne kommt einen trüberen Sommer",
	"viel Donner im Juni einen trüben Sommer bringt",
	"Juni Donner einen Sommer",
	"im Juni viel Donner bringt einen trüben Sommer",
	"wenns im Juno viel Donner gibts einen trüben Sommer"]
	SUBSTITUTION_PENALTY = 1
	INSERTION_PENALTY = 1
	DELETION_PENALTY = 1

	def calculate_char_edit_distance(hypothesis, reference):
	"""Return the word edit distance between the hypothesis and the reference."""
	M = len(reference)
	N = len(hypothesis)

	# Initialize the matrix
	matrix = [[0 for x in range(M + 1)] for y in range(N + 1)] # M + 1 columns, N + 1 rows
	for index, row in enumerate(matrix):
	row[0] = index
	for index, col in enumerate(matrix[0]):
	matrix[0][index] = index

	# Fill the matrix fields with the appropriate score
	for row in range(1, len(matrix)):
	for col in range(1, len(matrix[row])):
	diagnonal_score = matrix[row - 1][col - 1]
	left_score = matrix[row][col - 1]
	top_score = matrix[row - 1][col]
	if reference[col - 1] == hypothesis[row - 1]:
	matrix[row][col] = min([diagnonal_score, left_score, top_score])
	else:
	matrix[row][col] = min([diagnonal_score + SUBSTITUTION_PENALTY, left_score + DELETION_PENALTY, top_score + INSERTION_PENALTY])
	print_matrix(matrix, hypothesis, reference)

	# The bottom right entry is the edit distance
	return matrix[N][M]

	def print_matrix(matrix, hypothesis, reference):
	"""Print the dynamic programming matrix."""
	header_string = " "
	for ref_char in reference:
	header_string += "%4s" % ref_char
	print(header_string)
	hypothesis = " " + hypothesis
	for index, row in enumerate(matrix):
	row_string = "%4s" % hypothesis[index]
	for col in row:
	row_string += "%4d" % col
	print(row_string)


	if __name__ == '__main__':
	for hypothesis in HYPOTHESES:
	edit_distance = calculate_char_edit_distance(hypothesis, REFERENCE)
	print("\nEdit distance: {}\n".format(edit_distance))
No results found