"""
Compute minimum edit distance between two strings, C{source}
and C{target}, using Levenshtein distance. Return a list of pairs representing
the alignment of the two strings resulting from the minimal
editing path from C{source} to C{target}.

We build two tables:

  - viterbi: a table in which each cell (i,j) contains the
    cheapest edit cost, an int, for aligning target[:i+1]
    with source[:j+1].

  - paths: a table in which each cell (i,j) contains the (row,col)
    pair for the predecessor cell in the cheapest edit path to (i,j).


We also compute C{pairs}, an alignment of characters in the two
strings that corresponds to executing the minimum distance edits that
produce C{target} from C{source}; thus, C{pairs} is a least cost alignment.

@var initial_cost: the absurd cost, used to initialize values
                   in the Viterbi table
@type initial_cost: int
@var initial_predecessor: the absurd predecessor, a non-existent
                          row-column pair used to initialize values
                          in the C{paths} table.
@type initial_predecessor: tuple
@var source0: example value for source word.
@type source0: string
@var target0: example value for target word.
@type target0: string
@var eps: the string value representing the empty string in printing
          alignments.
@var edit_costs: Change this to alter the cost of the three possible
                 editing operations, substitution, deletion, and
                 insertion.
@type edit_costs: dictionary
"""
39
40 import sys
# Sentinel "absurd" cost for cells that have not been computed yet.
# sys.maxsize exists on Python 2.6+ and on Python 3; sys.maxint was removed
# in Python 3.
initial_cost = sys.maxsize
# Sentinel "absurd" predecessor: a (row, col) pair that is never a real cell.
initial_predecessor = (-1, -1)
# Cost of each elementary edit; change these to re-weight the alignment.
edit_costs = {'insertion': 1,
              'deletion': 1,
              'substitution': 2}
# Printable stand-in for the empty string inside alignment pairs.
eps = '0'


def align(target, source):
    """Compute best alignment using Levenshtein distance. Return a
    Viterbi table, a paths table, and a list of pairs.

    Both strings are prefixed with C{'#'} before the tables are built, so
    index 0 of either string is that start marker.

    The two tables:

      - viterbi: a table in which each cell (i,j) contains the
        cheapest edit cost for the alignment of
        target[:i+1] with source[:j+1] (first index is the target
        position, second index is the source position).

      - paths: a table in which each cell (i,j) contains the (i,j)
        pair for the predecessor cell in the cheapest edit path to (i,j).

    Each interior cell takes the cheapest of three moves:

      - diagonal, from (i-1,j-1): match (cost 0) or substitution
      - from (i-1,j): insertion of target[i]
      - from (i,j-1): deletion of source[j]

    On a tie the first of these in the order listed is kept.

    We also compute C{pairs}, an alignment of characters in the two
    strings that corresponds to executing the minimum distance edits
    that produce C{target} from C{source}. This least cost alignment
    is represented as a sequence of pairs C{p}, such that C{p[0]} (from
    target) is aligned with C{p[1]} (from source).

    @param target: the string to be produced.
    @type target: string
    @param source: the string being edited.
    @type source: string
    @return: the triple C{(viterbi, paths, pairs)}.
    @rtype: tuple
    """
    viterbi = []
    paths = []
    target = '#' + target
    source = '#' + source
    target_len = len(target)
    source_len = len(source)
    initialize_tables(viterbi, paths, target_len, source_len)

    insertion = edit_costs['insertion']
    deletion = edit_costs['deletion']
    substitution = edit_costs['substitution']
    # Row 0 and column 0 were filled by initialize_tables; fill the rest.
    # Every cell read here (i-1 or j-1) has already been computed because
    # both loops run in increasing order.
    for i in range(1, target_len):
        for j in range(1, source_len):
            diagonal = viterbi[i - 1][j - 1]
            if target[i] != source[j]:
                diagonal += substitution
            candidates = [
                (diagonal, (i - 1, j - 1)),
                (viterbi[i - 1][j] + insertion, (i - 1, j)),
                (viterbi[i][j - 1] + deletion, (i, j - 1)),
            ]
            # min() returns the first minimal item, so ties go to the
            # diagonal move, then insertion, then deletion.
            best_cost, best_predecessor = min(candidates,
                                              key=lambda c: c[0])
            viterbi[i][j] = best_cost
            paths[i][j] = best_predecessor

    last = (target_len - 1, source_len - 1)
    pairs = follow_path(last, paths, target, source)

    return (viterbi, paths, pairs)


def initialize_tables(viterbi, paths, target_len, source_len):
    """Fill C{viterbi} and C{paths} (both modified in place) with their
    starting values.

    The cells with first index 0 and the cells with second index 0 are
    computed at init time since each of those cells has only one possible
    predecessor cell.

      - viterbi[0][j] = viterbi[0][j-1] + deletion cost,
        predecessor (0,j-1)
      - viterbi[i][0] = viterbi[i-1][0] + insertion cost,
        predecessor (i-1,0)

    For all other cells, we proceed as follows:

      - in C{paths} we enter the absurd predecessor C{initial_predecessor}.
      - in C{viterbi} we enter the absurd cost C{initial_cost}.

    Cell (0,0) gets cost 0 and the out-of-table predecessor (0,-1).

    @param viterbi: empty list that receives C{target_len} lists of costs.
    @type viterbi: list
    @param paths: empty list that receives C{target_len} lists of
                  predecessor pairs.
    @type paths: list
    @param target_len: number of target positions (first index).
    @type target_len: int
    @param source_len: number of source positions (second index).
    @type source_len: int
    """
    deletion_total = 0
    insertion_total = 0
    for i in range(target_len):
        costs = []
        predecessors = []
        viterbi.append(costs)
        paths.append(predecessors)
        for j in range(source_len):
            if i == 0:
                # Only source characters consumed so far: pure deletions.
                costs.append(deletion_total)
                predecessors.append((i, j - 1))
                deletion_total += edit_costs['deletion']
            elif j == 0:
                # Only target characters consumed so far: pure insertions.
                insertion_total += edit_costs['insertion']
                costs.append(insertion_total)
                predecessors.append((i - 1, j))
            else:
                costs.append(initial_cost)
                predecessors.append(initial_predecessor)


def follow_path(last, paths, target, source):
    """Walk the C{paths} table backwards from C{last} and return the
    alignment as a list of character pairs, ordered from first to last.

    C{last} is a pair of integers (i,j) corresponding to positions i in
    target and j in source; it is the last cell visited in the best edit
    path. That cell contains the coordinates of the second-to-last
    step in the best edit path. Going to the previous cell yields the
    coordinates of the third-to-last cell, and so on, until we are led
    back to the first step, which is always (0,0).

    Each visited cell contributes one pair C{(t, s)}:

      - reached from (i-1,j-1): C{(target[i], source[j])}
      - reached from (i-1,j):   C{(target[i], eps)}  (insertion)
      - reached from (i,j-1):   C{(eps, source[j])}  (deletion)

    Cell (0,0) contributes C{(target[0], source[0])}, which is
    C{('#', '#')} when called from C{align}.

    @param last: coordinates (i,j) of the final cell of the path.
    @type last: tuple
    @param paths: table of predecessor pairs, indexed C{paths[i][j]}.
    @type paths: list
    @param target: the target string the table was built for.
    @type target: string
    @param source: the source string the table was built for.
    @type source: string
    @return: list of (target character, source character) pairs.
    @rtype: list
    @raise ValueError: if a cell on the path has no adjacent predecessor
                       (for example, the table was never filled in).
    """
    pairs = []
    i, j = last
    while (i, j) != (0, 0):
        predecessor = paths[i][j]
        if predecessor == (i - 1, j - 1):
            pairs.append((target[i], source[j]))
        elif predecessor == (i - 1, j):
            pairs.append((target[i], eps))
        elif predecessor == (i, j - 1):
            pairs.append((eps, source[j]))
        else:
            # Refusing anything else guarantees the walk moves strictly
            # towards (0,0) and cannot loop or wrap to negative indices.
            raise ValueError('cell (%d,%d) has invalid predecessor %r'
                             % (i, j, predecessor))
        i, j = predecessor
    pairs.append((target[0], source[0]))
    pairs.reverse()
    return pairs
136
137
138
139
140
141
142
144 print ' ',
145 for c in target:
146 print '%s ' % (c,),
147 print
148 for i in range(len(source)-1,-1,-1):
149 print source[i],
150 for j in range(len(target)):
151 print '%.*d' % (2,viterbi[j][i]),
152 print
153 print ' ',
154 for c in target:
155 print '%s ' % (c,),
156
158 """
159 C{pairs} is a sequence of character pairs, such
160 that p[0] is a character from the target string aligned
161 with p[1] from the source string.
162 """
163 print
164 for p in pairs:
165 print p[0],
166 print
167 for p in pairs:
168 print p[1],
169 print
170 for p in pairs:
171 print '-',
172 print
173 score = 0
174 for p in pairs:
175 if p[0] == p[1]:
176 cost = 0
177 elif p[0] == eps:
178 cost = edit_costs['deletion']
179 elif p[1] == eps:
180 cost = edit_costs['insertion']
181 else:
182 cost = edit_costs['substitution']
183 print cost,
184 score += cost
185 print
186 print
187 print 'Total: %d' % (score,)
188
# Example (target, source) word pairs for trying out the aligner.
(target0, source0) = ('execution', 'intention')
# The first example pair again, with a leading '#' already attached to each
# word (the same marker align() prepends to its arguments).
(a_target0, a_source0) = ('#execution', '#intention')
(target1, source1) = ('spat', 'at')
(target2, source2) = ('faltluence', 'flatulence')
(target3, source3) = ('fluency', 'flatulence')
(target4, source4) = ('drive', 'brief')
(target5, source5) = ('drive', 'divers')
196