Package aligner :: Module align
[hide private]
[frames] | no frames]

Source Code for Module aligner.align

  1  """ 
  2  Compute minimum edit distance between two strings, C{source} 
  3  and C{target}, using Levenshtein distance. Return a list of pairs representing 
  4  the alignment of the two strings resulting from the minimal 
  5  editing path from C{source} to C{target}. 
  6   
  7  We build two tables: 
  8   
  9         - viterbi: a table in which each cell (i,j) contains the 
 10         cheapest edit cost, an int, for aligning target[:i+1] 
 11         with source[:j+1]. 
 12   
 13         - paths: a table in which each cell (i,j) contains the (row,col) 
 14         pair for the predecessor cell in the cheapest edit path to (i,j). 
 15   
 16   
 17  We also compute C{pairs}, an alignment of characters in the two 
 18  strings that corresponds to executing the minimum distance edits that 
 19  produce C{target} from C{source}; thus, C{pairs} is a least cost alignment. 
 20   
 21  @var initial_cost: the absurd cost, used to initialize values 
 22                     in the Viterbi table 
 23  @type initial_cost: int 
 24  @var initial_predecessor: the absurd  predecessor, a non-existent 
 25                            row-column pair used to initialize values 
 26                            in the C{paths} table. 
 27  @type initial_predecessor: tuple 
 28  @var source0: example value for source word. 
 29  @type source0: string 
 30  @var target0: example value for target word. 
 31  @type target0: string 
 32  @var eps: the string value representing the empty string in printing 
 33            alignments. 
 34  @var edit_costs: Change this to alter the cost of the three possible 
 35                   editing operations, substitution, deletion, and 
 36                   insertion. 
 37  @type edit_costs: dictionary 
 38  """ 
 39   
 40  import sys 
 41  initial_cost = sys.maxint 
 42  initial_predecessor = (-1,-1) 
 43  edit_costs = {'insertion': 1, 
 44                'deletion': 1, 
 45                'substitution': 2} 
 46  eps = '0' 
 47   
def align(target, source):
    """Compute best alignment using Levenshtein distance.  Return a
    Viterbi table, a paths table, and a list of pairs.

    Both strings are prefixed with the start symbol C{'#'} so that index 0
    stands for "nothing consumed yet".  The two tables are indexed
    C{table[i][j]}, with C{i} a position in the (prefixed) target and
    C{j} a position in the (prefixed) source:

       - viterbi: each cell (i,j) contains the cheapest edit cost, an
         int, for aligning target[:i+1] with source[:j+1].

       - paths: each cell (i,j) contains the (i,j) pair of the
         predecessor cell on the cheapest edit path to (i,j).

    Columns in the table cover target characters,
    rows cover source characters.

    Each interior cell is reached from one of three neighbours:

       - (i-1,j-1): substitution, costing C{edit_costs['substitution']},
         or nothing when the two characters are identical;
       - (i-1,j): insertion of target[i], costing C{edit_costs['insertion']};
       - (i,j-1): deletion of source[j], costing C{edit_costs['deletion']}.

    On ties the diagonal move is preferred, then insertion, then deletion.

    We also compute C{pairs}, an alignment of characters in the two
    strings that corresponds to executing the minimum distance edits
    that produce C{target} from C{source}.  C{pairs} is whatever
    C{follow_path} returns for the filled C{paths} table; each pair
    C{p} is meant to align C{p[0]} (from target) with C{p[1]} (from source).

    @param target: the string to be produced.
    @type target: string
    @param source: the string to be edited.
    @type source: string
    @return: the triple C{(viterbi, paths, pairs)}.
    @rtype: tuple
    """
    viterbi = []
    paths = []
    target = '#' + target
    source = '#' + source
    target_len = len(target)
    source_len = len(source)
    # Fills the first row and first column; every other cell gets sentinels.
    initialize_tables(viterbi, paths, target_len, source_len)

    insertion = edit_costs['insertion']
    deletion = edit_costs['deletion']
    substitution = edit_costs['substitution']

    # Visit cells so that all three neighbours of (i,j) are already final.
    for i in range(1, target_len):
        for j in range(1, source_len):
            # Diagonal move: identical characters align for free.
            best_cost = viterbi[i - 1][j - 1]
            if target[i] != source[j]:
                best_cost += substitution
            best_pred = (i - 1, j - 1)

            # Strict comparisons keep the earlier candidate on ties.
            insert_cost = viterbi[i - 1][j] + insertion
            if insert_cost < best_cost:
                best_cost = insert_cost
                best_pred = (i - 1, j)

            delete_cost = viterbi[i][j - 1] + deletion
            if delete_cost < best_cost:
                best_cost = delete_cost
                best_pred = (i, j - 1)

            viterbi[i][j] = best_cost
            paths[i][j] = best_pred

    # The final cell holds the total distance; trace back from it to
    # recover the alignment itself.
    last = (target_len - 1, source_len - 1)
    pairs = follow_path(last, paths, target, source)
    return (viterbi, paths, pairs)
def initialize_tables(viterbi, paths, target_len, source_len):
    """
    Append C{target_len} columns of C{source_len} cells each to both
    C{viterbi} and C{paths}, in place.  Nothing is returned.

    Cells in the first column and first row are given their final values
    here, since each of them has only one possible predecessor cell:

       - column 0, cell (0,j): cost is the running total of deletion
         costs (0 for j == 0); predecessor is (0,j-1).
       - row 0, cell (i,0) with i > 0: cost is the running total of
         insertion costs; predecessor is (i-1,0).

    Note that the origin cell (0,0) therefore records the predecessor
    (0,-1), which lies outside the table.

    Every other cell is filled with placeholders:

       - in C{paths}, the absurd predecessor C{initial_predecessor};
       - in C{viterbi}, the absurd cost C{initial_cost}.

    @param viterbi: list to which the cost columns are appended.
    @param paths: list to which the predecessor columns are appended.
    @param target_len: number of columns to create.
    @param source_len: number of cells per column.
    """
    deletion_total = 0
    insertion_total = 0
    for col in range(target_len):
        costs = []
        preds = []
        viterbi.append(costs)
        paths.append(preds)
        for row in range(source_len):
            if col > 0 and row > 0:
                # Interior cell: placeholder that any real cost will beat.
                costs.append(initial_cost)
                preds.append(initial_predecessor)
                continue
            if col == 0:
                # Column 0: one more deletion per step along the column.
                costs.append(deletion_total)
                preds.append((col, row - 1))
                deletion_total += edit_costs['deletion']
            else:
                # Row 0 (past the origin): one more insertion per column.
                insertion_total += edit_costs['insertion']
                costs.append(insertion_total)
                preds.append((col - 1, row))
119 120
def follow_path(last, paths, target, source):
    """
    Trace the cheapest edit path backwards through C{paths} and return
    the alignment it encodes.

    This function is called with C{last} set to the coordinate of the
    final cell of the paths table: a pair of integers (i,j) giving
    position i in target and position j in source.  That cell contains
    the coordinates of the second-to-last step on the best edit path;
    going to that cell yields the third-to-last, and so on, until the
    trace arrives at the first cell, (0,0).

    Each step from a predecessor to a cell (i,j) contributes one pair,
    target character first:

       - from (i-1,j-1): C{(target[i], source[j])}, a match or substitution;
       - from (i-1,j): C{(target[i], eps)}, an insertion;
       - from (i,j-1): C{(eps, source[j])}, a deletion.

    The origin cell (0,0) itself contributes no pair, so the start
    symbols at index 0 of C{target} and C{source} never appear in the
    result.

    If the trace meets a predecessor with a negative coordinate (a cell
    that was never filled in), it stops there and returns the pairs
    collected so far.

    @param last: (i,j) coordinate of the cell to start tracing from.
    @type last: tuple
    @param paths: table of predecessor coordinates, indexed C{paths[i][j]}.
    @param target: target string, including its leading start symbol.
    @param source: source string, including its leading start symbol.
    @return: the list of character pairs, ordered from first to last.
    @rtype: list
    @raise ValueError: if a predecessor is not an adjacent cell.
    """
    pairs = []
    i, j = last
    while (i, j) != (0, 0):
        prev_i, prev_j = paths[i][j]
        if prev_i < 0 or prev_j < 0:
            # Placeholder predecessor: no path was recorded for this cell.
            break
        step = (i - prev_i, j - prev_j)
        if step == (1, 1):
            pairs.append((target[i], source[j]))
        elif step == (1, 0):
            pairs.append((target[i], eps))
        elif step == (0, 1):
            pairs.append((eps, source[j]))
        else:
            raise ValueError('cell %r has non-adjacent predecessor %r'
                             % ((i, j), (prev_i, prev_j)))
        i, j = prev_i, prev_j
    # Pairs were collected from the end of the path back to its start.
    pairs.reverse()
    return pairs
###################################################################
##
##   P r i n t i n g   R o u t i n e s
##
###################################################################

# Example (target, source) word pairs for exercising the aligner.
# a_target0 / a_source0 are the first pair with the '#' start symbol
# already attached.
target0, source0 = 'execution', 'intention'
a_target0, a_source0 = '#execution', '#intention'
target1, source1 = 'spat', 'at'
target2, source2 = 'faltluence', 'flatulence'
target3, source3 = 'fluency', 'flatulence'
target4, source4 = 'drive', 'brief'
target5, source5 = 'drive', 'divers'