# Ling 581: Top Down Parser
#
# Copyright (C) 2006 San Diego State University
# Author: Mark Gawron


class Parser:
    """
    Class for top down recursive descent parser.
    """

    def __init__(self, grammar, string=''):
        """
        A parser instance must have a grammar at instance creation time.
        The terminals, productions, and start_cat for the grammar are
        stored upon creation.

        If a string is supplied it is converted to a list of words and
        stored in self.input.  A string may also be supplied when the
        recognizer- or parser- method is called.

        @param grammar: A C{Grammar} instance.
        @param string: Optional input string.
        """
        self.productions = grammar.productions
        self.start_cat = grammar.start_cat
        if not grammar.terminals:
            grammar.compute_terminals()
        self.terminals = grammar.terminals
        self.input = self.string_to_wordlist(string)
        self.recursion_limit = 100
        self.recursion_depth = 0
        # Tracing is off until trace() is called.  Without this default the
        # trace_* helpers raise AttributeError on an untraced parser.
        self._trace = False

    ## Main body of recognizer code starts here!

    def recognize_string(self, string=''):
        """
        @param string: A string.  The string to be parsed.  Immediately
            converted to a [WordList] by self.legal_input.  If empty,
            the input already stored in self.input is used.
        @return: The result of calling recurse_recognize on the start-cat,
            the empty agenda, and the wordlist generated from string.
            False if there is no input at all.
        """
        if not self.legal_input(string):
            return False
        self.recursion_depth = 0
        return self.recurse_recognize([self.start_cat], [], self.input)

    def recurse_recognize(self, goals, agenda, wordlist):
        """
        Parse wordlist using C{goals} (derivation).  Return True if current
        grammar accepts C{wordlist}.  Else False.

        Suppose C{goals} is non-empty:
        If C{goals}[0] is a nonterminal, call C{expand} (expand it with
        the grammar and continue parsing recursively).
        If C{goals}[0] is a terminal then call C{match_word} (try to
        match the next word and continue parsing).
        Both C{expand} and C{match_word} contain recursive calls to
        recurse_recognize.

        Suppose C{goals} is empty:
        Then if C{wordlist} is empty, return True.
        Else backtrack (if C{agenda} is non-empty continue parsing with
        the next state on the agenda; else return False).  C{backtrack}
        also contains a recursive call to C{recurse_recognize}.

        @param goals: A list of categories and/or words proposed to
            cover wordlist, a partial derivation from the grammar.
        @param agenda: A list of C{ParserState}s: each a pair of
            ([GoalList],[WordList])
        @param wordlist: a list of words.
        @rtype: boolean
        """
        if goals:
            next_goal = goals[0]
            if next_goal not in self.terminals:
                return self.expand(next_goal, goals[1:], agenda, wordlist)
            return self.match_word(next_goal, wordlist, goals[1:], agenda)
        if not wordlist:
            # Success state!  No goals, no words!
            return True
        # Fail state: goals exhausted but words remain.
        return self.backtrack(agenda)

    def expand(self, next_goal, rest_goals, agenda, wordlist):
        """
        Expand C{next_goal} using grammar (C{self.productions}).
        Generate new C{GoalList} using the first production and add the
        unused productions to C{agenda}.  Keep parsing with new
        C{GoalList}, new C{agenda}, old C{WordList}.

        @param next_goal: grammar nonterminal
        @param rest_goals: other grammar elements from the same grammar
            production
        @param agenda: list of parser states.
        @param wordlist: list of words (strings).
        @raise Exception: if the recursion limit is exceeded
            (see C{recursion_check}).
        """
        productions = self.productions[next_goal]
        next_production = productions[0]
        # Each alternative is pushed on the FRONT of the agenda in turn, so
        # the last alternative in the grammar is the first one tried when
        # backtracking.  This order is kept deliberately.
        for alternative in productions[1:]:
            agenda = [ParserState(alternative + rest_goals, wordlist)] + agenda
        self.trace_expand(next_goal, next_production, rest_goals,
                          agenda, wordlist)
        self.recursion_check()  ## a bandaid for some recursion issues.
        return self.recurse_recognize(next_production + rest_goals,
                                      agenda, wordlist)

    def match_word(self, next_goal, wordlist, rest_goals, agenda):
        """
        C{next_goal} is a terminal.

        Suppose C{wordlist} is non-empty:
        If next_goal matches C{wordlist}[0], keep parsing with
        C{rest_goals}, C{agenda} and C{wordlist}[1:].
        Else this parse path fails; call backtrack (if C{agenda} is
        non-empty, continue parsing with the next parser state on
        C{agenda}, and if C{agenda} is empty, return False).

        Suppose C{wordlist} is empty.
        This parse path fails.  Call C{backtrack}.

        @param next_goal: grammar terminal
        @param wordlist: list of words (strings).
        @param rest_goals: other grammar elements from the same grammar
            production
        @param agenda: list of parser states.
        """
        if not wordlist:
            self.trace_match(False, next_goal, '**empty**', rest_goals, [])
            return self.backtrack(agenda)
        word, rest_words = wordlist[0], wordlist[1:]
        if next_goal == word:
            ## We just matched a word!  Discard and keep parsing rest of wordlist
            self.trace_match(True, next_goal, word, rest_goals, rest_words)
            return self.recurse_recognize(rest_goals, agenda, rest_words)
        self.trace_match(False, next_goal, word, rest_goals, rest_words)
        return self.backtrack(agenda)

    def backtrack(self, agenda):
        """
        Suppose C{agenda} is non-empty:
        Then::
            agenda[0].GoalList = new goals
            agenda[0].WordList = new wordlist
        Keep parsing with new goals and new wordlist and popped agenda.

        Suppose C{agenda} is empty:
        return False
        """
        if not agenda:
            return False
        state = agenda[0]
        self.trace_backtrack(state.GoalList, state.WordList)
        return self.recurse_recognize(state.GoalList, agenda[1:],
                                      state.WordList)

    ### Main body of recognizer code ends here.

    def parse_string(self, string=''):
        """
        Not yet implemented.  Always returns False.
        """
        if self.legal_input(string):
            print('Not yet implemented!')
        return False

    ### Utility methods, tracing and type conversion below here

    def trace(self, boolean=True):
        """
        Turn tracing on (the default) or off.  The requested setting is
        echoed to standard output.
        """
        print(boolean)
        self._trace = bool(boolean)

    def trace_match(self, boolean, next_goal, word, rest_goals, rest_wordlist):
        """
        If tracing, report the outcome (C{boolean}) of matching
        C{next_goal} against C{word}.
        """
        if not self._trace:
            return
        if boolean:
            print('Match succeeded: %s %s %s %s'
                  % (next_goal, word, rest_goals, rest_wordlist))
        else:
            print('Match failed: %s %s %s %s'
                  % (next_goal, word, rest_goals, rest_wordlist))

    def trace_expand(self, next_goal, next_production, goals, agenda, wordlist):
        """
        If tracing, report the production chosen for C{next_goal}, the
        resulting goal list and the current agenda.
        """
        if not self._trace:
            return
        print('Expanding %s as %s' % (next_goal, next_production))
        print(' Goals: %s' % (next_production + goals))
        print(' Agenda: ')
        for pstate in agenda:
            print(' %s' % (pstate,))

    def trace_backtrack(self, goals, wordlist):
        """
        If tracing, report the parser state being backtracked to.
        """
        if not self._trace:
            return
        print('Backtracking to ')
        print(' Goals: %s' % goals)
        print(' Wordlist: %s' % wordlist)

    def string_to_wordlist(self, string):
        """
        Lower-case C{string} and split it on whitespace.
        """
        return string.lower().split()

    def legal_input(self, string=''):
        """
        Store the wordlist for C{string} in self.input; an empty
        C{string} leaves the previously stored input in place.

        @return: True if there is some input to work on; otherwise
            print an error message and return False.
        """
        if string:
            self.input = self.string_to_wordlist(string)
        if self.input:
            return True
        print('Recognizer Error: No input given!')
        return False

    def recursion_check(self):
        """
        Count one more expansion and raise once the count has passed
        self.recursion_limit.  A false recursion_limit disables the check.
        The counter is reset only by C{recognize_string}.

        @raise Exception: 'Recursion Depth exceeded!'
        """
        if not self.recursion_limit:
            return
        if self.recursion_depth > self.recursion_limit:
            raise Exception('Recursion Depth exceeded!')
        self.recursion_depth += 1


class Grammar:
    """
    Class for Context free grammars

    Grammar rules stored in self.productions
    Start Cat stored in self.start_cat
    Provided: a method for computing the terminals of the grammar.

    Unchecked assumption: terminals never use upper case (built into parser)
    """

    def __init__(self, start_cat, trace=False):
        """
        C{self.productions}: a dictionary whose keys are categories
        and whose values are lists of productions
        """
        self._trace = trace
        self.start_cat = start_cat
        self.productions = {}
        self.terminals = []

    def __str__(self):
        """
        If g1 is a grammar object run 'print g1' to trigger
        this code, which builds the string representation of the
        grammar that is printed
        """
        pieces = []
        for cat in self.productions:
            pieces.append(cat)
            for rhs in self.productions[cat]:
                pieces.append('\t=> ')
                pieces.extend('%s ' % dtr for dtr in rhs)
                pieces.append('\n')
            pieces.append('\t ------\n')
        return ''.join(pieces)

    def add_production(self, cat, production):
        """
        Append C{production} (a list of symbols) to the productions for
        C{cat}.  If the terminals have already been computed they are
        recomputed.

        @return: C{production}
        """
        self.productions.setdefault(cat, []).append(production)
        if self.terminals:
            self.compute_terminals()
        return production

    def compute_terminals(self):
        """
        Place the grammar terminals in self.terminals, each listed once,
        in order of first occurrence.

        Assumption: No symbol that ever occurs in the LHS of
        a production ever needs to occur in an input string.
        Counterexample: If 's' is the start symbol of the grammar
        it is also the possession-marking 'word' in English.
        """
        seen = set()
        terminals = []
        for cat in self.productions:
            for rhs in self.productions[cat]:
                for symbol in rhs:
                    if symbol not in self.productions and symbol not in seen:
                        seen.add(symbol)
                        terminals.append(symbol)
        # Update in place: a Parser built earlier holds a reference to this
        # list and must see terminals added by later productions.
        self.terminals[:] = terminals


class ParserState:
    """
    A parser state is basically a pair of a goallist (derivation)
    and a wordlist, with a Pythonic class instance wrapper.
    """

    def __init__(self, GoalList, WordList):
        self.GoalList = GoalList
        self.WordList = WordList

    def __str__(self):
        """
        Return nice string rep.  Look like a pair!
        Because that's what you are!

        @rtype: string
        """
        return "(%s, %s)" % (self.GoalList, self.WordList)

    def __getitem__(self, i):
        """
        This is a little silly!  Purely for illustration!
        Allows ParserState instances to be accessed using the usual
        Python sequence syntax

        >>> ps = ParserState(['S'],['a', 'dog', 'walks'])
        >>> ps[0]
        ['S']
        >>> ps[1]
        ['a', 'dog', 'walks']

        @raise IndexError: for any index other than 0 or 1.
        """
        if i == 0:
            return self.GoalList
        if i == 1:
            return self.WordList
        # IndexError (still an Exception) is what the sequence protocol
        # expects, so iteration and unpacking stop cleanly after two items.
        raise IndexError('Illegal index for parse state!')


###########################################################################
###  Load time execution code starts here
###########################################################################

def demo(strings):
    """
    Build two demo grammars and parsers, stored in the globals g1, p1,
    g2, p2 so they can be inspected afterwards, and run the first
    parser's recognizer, with tracing on, over each string in C{strings}.

    @param strings: list of input strings.
    """
    global g1, p1, g2, p2

    # Productions shared by both grammars, in the order they are added.
    shared_rules = [
        ('s', ['np', 'vp']),
        ('s', ['vp']),
        ('vp', ['v']),
        ('vp', ['v', 'np']),
        ('np', ['pname']),
        ('pname', ['john']),
        ('pname', ['mary']),
        ('np', ['d', 'n']),
        ('d', ['the']),
        ('n', ['boy']),
        ('n', ['girl']),
    ]
    verb_rules = [
        ('v', ['likes']),
        ('v', ['run']),
        ('v', ['eat']),
    ]

    ## The first grammar
    g1 = Grammar('s')
    for cat, rhs in shared_rules + [('n', ['beans'])] + verb_rules:
        g1.add_production(cat, list(rhs))
    p1 = Parser(g1)
    p1.trace()

    ## The second grammar: more nouns, plus PPs via a left-recursive np rule
    g2 = Grammar('s')
    extra_nouns = [('n', ['tree']), ('n', ['flowers']), ('n', ['beans'])]
    pp_rules = [
        ('p', ['with']),
        ('pp', ['p', 'np']),
        ('np', ['np', 'pp']),
    ]
    for cat, rhs in shared_rules + extra_nouns + verb_rules + pp_rules:
        g2.add_production(cat, list(rhs))
    p2 = Parser(g2)
    p2.trace()

    for s in strings:
        print(s)
        print('---------')
        print(p1.recognize_string(s))
        print('')


if __name__ == '__main__':
    strings = ['John likes Mary',
               'The boy likes the girl',
               'Eat beans']
    demo(strings)