# Ling 581: Top Down Parser
#
# Copyright (C) 2006 San Diego State University
# Author: Mark Gawron <gawron@mail.sdsu.edu>

class Parser:
    """
    Class for top down recursive descent parser.

    The recognizer works on three pieces of state that are threaded through
    the mutually recursive methods C{recurse_recognize}, C{expand},
    C{match_word} and C{backtrack}:

      - C{goals}: the symbols still to be found (a partial derivation),
      - C{agenda}: untried alternatives, a list of C{ParserState}s,
      - C{wordlist}: the words not yet consumed.
    """

    def __init__(self, grammar, string=''):
        """
        A parser instance must have a grammar at instance creation time.
        The terminals, productions, and start_cat for the grammar
        are stored upon creation.

        If a string is supplied it is converted to a list of words
        and stored in self.input.  A string may also be supplied
        when the recognizer- or parser- method is called.

        @param grammar: object providing C{productions}, C{start_cat},
                        C{terminals} and C{compute_terminals()}.
        @param string: optional input string.
        """
        self.productions = grammar.productions
        self.start_cat = grammar.start_cat
        # Compute the terminals lazily if the grammar has not done so yet.
        if not grammar.terminals:
            grammar.compute_terminals()
        self.terminals = grammar.terminals
        self.input = self.string_to_wordlist(string)
        self.recursion_limit = 100
        self.recursion_depth = 0
        # Tracing is off until trace() is called.  Without this default the
        # trace_* helpers raise AttributeError on a parser that was never
        # passed through trace().
        self._trace = False

    ## Main body of recognizer code starts here!

    def recognize_string(self, string=''):
        """
        @param string: A string.  The string to be parsed.
                      Immediately converted to a [WordList] by
                      self.legal_input.  If empty, the input stored on
                      the instance is used instead.

        @return: The result of calling recurse_recognize on the start-cat,
        the empty agenda, and the wordlist generated from string; False
        if there is no input at all.
        @rtype: bool
        """
        if not self.legal_input(string):
            return False
        # Every top-level call starts with a fresh expansion count.
        self.recursion_depth = 0
        return self.recurse_recognize([self.start_cat], [], self.input)

    def recurse_recognize(self, goals, agenda, wordlist):
        """
        Parse wordlist using C{goals} (derivation).  Return True
        if current grammar accepts C{wordlist}. Else False.

        Return True whenever C{goals} and C{wordlist} are empty.

        Suppose C{goals} is non-empty:

        If C{goals}[0] is a nonterminal, call C{expand} (expand it with the grammar
        and continue parsing recursively).  If C{goals}[0] is a terminal
        then call C{match_word} (try to match the next word and continue parsing).
        Both C{expand} and C{match_word} contain recursive calls to recurse_recognize.

        Suppose C{goals} is empty:

        Then if C{wordlist} is empty, return True. Else backtrack (if C{agenda} is non-empty
        continue parsing with the next state on the agenda; else return False).
        C{backtrack} also contains a recursive call to C{recurse_recognize}.

        @param goals: A list of categories and/or words proposed to cover
                      wordlist, a partial derivation from the grammar.
        @param agenda: A list of C{ParserState}s: each a pair of ([GoalList],[WordList])
        @param wordlist: a list of words.
        @rtype: bool
        """
        if goals:
            next_goal = goals[0]
            rest_goals = goals[1:]
            if next_goal in self.terminals:
                return self.match_word(next_goal, wordlist, rest_goals, agenda)
            return self.expand(next_goal, rest_goals, agenda, wordlist)
        if not wordlist:
            # Success state! No goals, no words!
            return True
        # Fail state: goals are used up but words remain.
        return self.backtrack(agenda)

    def expand(self, next_goal, rest_goals, agenda, wordlist):
        """
        Expand C{next_goal} using grammar (C{self.productions})
        Generate new C{GoalList} using one production and add unused productions
        to C{agenda}. Keep parsing with new C{GoalList}, new C{agenda}, old C{WordList}.

        @param next_goal: grammar nonterminal
        @param rest_goals: other grammar elements from the same grammar production
        @param agenda: list of parser states.
        @param wordlist: list of words (strings).
        @return: result of continuing the parse with the first production.
        @raise KeyError: if C{next_goal} has no entry in C{self.productions}.
        """
        productions = self.productions[next_goal]
        next_production = productions[0]
        # Each unused alternative is pushed on the FRONT of the agenda, so
        # the last-listed production is the first one tried on backtracking.
        for p in productions[1:]:
            agenda = [ParserState(p + rest_goals, wordlist)] + agenda
        self.trace_expand(next_goal, next_production, rest_goals, agenda, wordlist)
        self.recursion_check()  ## a bandaid for some recursion issues.
        return self.recurse_recognize(next_production + rest_goals, agenda, wordlist)

    def match_word(self, next_goal, wordlist, rest_goals, agenda):
        """
        C{next_goal} is a terminal.

        Suppose C{wordlist} is non-empty:

        If next_goal matches C{wordlist}[0],  keep parsing with C{rest_goals}, C{agenda}
        and C{wordlist}[1:].

        Else this parse path fails;  call backtrack (if C{agenda}
        is non-empty, continue parsing with the next parser state
        on C{agenda}, and if C{agenda} is empty, <Return>: False)

        Suppose C{wordlist} is empty.

        This parse path fails.  Call C{backtrack}.

        @param next_goal: grammar terminal
        @param wordlist: list of words (strings).
        @param rest_goals: other grammar elements from the same grammar production
        @param agenda: list of parser states.
        @rtype: bool
        """
        if not wordlist:
            # Out of words while a terminal is still expected.
            self.trace_match(False, next_goal, '**empty**', rest_goals, [])
            return self.backtrack(agenda)

        word = wordlist[0]
        rest_words = wordlist[1:]
        if next_goal == word:
            ## We just matched a word! Discard and keep parsing rest of wordlist
            self.trace_match(True, next_goal, word, rest_goals, rest_words)
            return self.recurse_recognize(rest_goals, agenda, rest_words)

        self.trace_match(False, next_goal, word, rest_goals, rest_words)
        return self.backtrack(agenda)

    def backtrack(self, agenda):
        """
        Suppose C{agenda} is non-empty:

        Then::
          agenda[0].GoalList = new goals
          agenda[0].WordList = new wordlist
        Keep parsing with new goals and new wordlist and popped agenda.

        Suppose C{agenda} is empty

        return False

        @param agenda: list of parser states.
        @rtype: bool
        """
        if not agenda:
            return False
        state = agenda[0]
        self.trace_backtrack(state.GoalList, state.WordList)
        return self.recurse_recognize(state.GoalList, agenda[1:], state.WordList)

    ### Main body of recognizer code ends here.

    def parse_string(self, string=''):
        """
        Not yet implemented.

        Validates the input the same way C{recognize_string} does and
        prints a notice when there is input to parse.

        @param string: the string to be parsed.
        @return: always False (previously None when no input was given).
        """
        if self.legal_input(string):
            print('Not yet implemented!')
        return False

    ### Utility methods, tracing and type conversion below here

    def trace(self, boolean=True):
        """
        Switch tracing on or off and echo the requested setting.

        @param boolean: any value; its truth value becomes the trace flag.
        """
        print(boolean)
        self._trace = bool(boolean)

    def trace_match(self, boolean, next_goal, word, rest_goals, rest_wordlist):
        """
        When tracing, report the outcome of one terminal/word comparison.

        @param boolean: True if the match succeeded.
        """
        if not self._trace:
            return
        if boolean:
            print('Match succeeded: %s %s %s %s' % (next_goal, word, rest_goals, rest_wordlist))
        else:
            print('Match failed: %s %s %s %s' % (next_goal, word, rest_goals, rest_wordlist))

    def trace_expand(self, next_goal, next_production, goals, agenda, wordlist):
        """
        When tracing, report an expansion, the resulting goals and the agenda.

        C{wordlist} is accepted for symmetry with the caller but not printed.
        """
        if not self._trace:
            return
        print('Expanding %s as %s' % (next_goal, next_production))
        print('  Goals:  %s' % (next_production + goals))
        print('  Agenda:  ')
        for pstate in agenda:
            print('     %s' % (pstate,))

    def trace_backtrack(self, goals, wordlist):
        """
        When tracing, report the parser state being resumed.
        """
        if not self._trace:
            return
        print('Backtracking to ')
        print('   Goals: %s' % goals)
        print('   Wordlist: %s' % wordlist)

    def string_to_wordlist(self, string):
        """
        Lower-case C{string} and split it on whitespace.

        @rtype: list of strings
        """
        return string.lower().split()

    def legal_input(self, string=''):
        """
        Install C{string} as the current input and check that there is some.

        A non-empty C{string} replaces C{self.input}; an empty one leaves
        the previously stored input in place.

        @return: True if C{self.input} is non-empty; otherwise prints an
                 error message and returns False.
        """
        if string:
            self.input = self.string_to_wordlist(string)
        if self.input:
            return True
        print('Recognizer Error: No input given!')
        return False

    def recursion_check(self):
        """
        Count one more expansion and abort when the limit is passed.

        The counter is only reset by C{recognize_string}, so it measures
        the total number of expansions in one run rather than the depth
        of the call stack.  A false C{recursion_limit} (0 or None)
        disables the check.

        @raise RuntimeError: when more than C{recursion_limit} expansions
                             have already been counted.
        """
        if not self.recursion_limit:
            return
        if self.recursion_depth > self.recursion_limit:
            raise RuntimeError('Recursion Depth exceeded!')
        self.recursion_depth += 1

class Grammar:
    """
    Class for Context free grammars

    Grammar rules stored in self.productions
    Start Cat stored in self.start_cat
    Provided: a method for computing the terminals of the grammar.
    Unchecked assumption: terminals never use upper case (built into parser)
    """

    def __init__(self, start_cat, trace=False):
        """
        C{self.productions}: a dictionary whose keys are categories
        and whose values are lists of productions

        @param start_cat: the start category of the grammar.
        @param trace: stored in C{self._trace}; not otherwise used by
                      this class.
        """
        self._trace = trace
        self.start_cat = start_cat
        self.productions = {}
        self.terminals = []

    def __str__(self):
        """
        If g1 is a grammar object run 'print g1' to
        trigger this code, which builds the
        string representation of the grammar that is printed

        Each category is followed by one tab-indented '=> ...' line per
        production and closed by a dashed separator line.

        @rtype: string
        """
        # Collect the pieces and join once instead of growing a string
        # (and avoid shadowing the builtin ``str``).
        pieces = []
        for cat, rules in self.productions.items():
            pieces.append(cat)
            for rule in rules:
                daughters = ''.join('%s ' % dtr for dtr in rule)
                pieces.append('\t=> ' + daughters + '\n')
            pieces.append('\t ------\n')
        return ''.join(pieces)

    def add_production(self, cat, production):
        """
        Append C{production} to the alternatives for C{cat}.

        If terminals have already been computed (C{self.terminals} is
        non-empty) they are recomputed so they stay consistent with the
        new rule.

        @param cat: left-hand-side category.
        @param production: right-hand side, a list of symbols.
        @return: C{production}, unchanged.
        """
        self.productions.setdefault(cat, []).append(production)
        if self.terminals:
            self.compute_terminals()
        return production

    def compute_terminals(self):
        """
        Place the grammar terminals in self.terminals, as a list in
        order of first occurrence with no duplicates.

        A terminal is any right-hand-side symbol that is not itself a
        key of C{self.productions}.

        Assumption: No symbol that ever occurs in the LHS
        of a production ever needs to occur in an input string.

        Counterexample: If 's' is the start symbol of the grammar
        it is also the possession-marking 'word' in English.
        """
        terminals = []
        seen = set()
        for rules in self.productions.values():
            for rule in rules:
                for symbol in rule:
                    if symbol in self.productions or symbol in seen:
                        continue
                    seen.add(symbol)
                    terminals.append(symbol)
        self.terminals = terminals

class ParserState:
    """
    A parser state is basically a pair of a goallist (derivation)
    and a wordlist, with a Pythonic class instance wrapper.
    """

    def __init__(self, GoalList, WordList):
        """
        @param GoalList: list of grammar symbols still to be found.
        @param WordList: list of words still to be consumed.
        """
        self.GoalList = GoalList
        self.WordList = WordList

    def __str__(self):
        """
        Return nice string rep.

        Look like a pair!
        Because that's what you are!
        @rtype: string
        """
        return "(%s, %s)" % (self.GoalList, self.WordList)

    def __getitem__(self, i):
        """
        This is a little silly! Purely for illustration!
        Allows ParserState instances to be accessed using
        the usual Python sequence syntax

           >>> ps = ParserState(['S'],['a', 'dog', 'walks'])
           >>> ps[0]
           ['S']
           >>> ps[1]
           ['a', 'dog', 'walks']

        @raise IndexError: for any index other than 0 or 1.  IndexError
            (rather than a bare Exception) is what lets iteration and
            unpacking, e.g. C{goals, words = ps}, terminate normally.
        """
        if i == 0:
            return self.GoalList
        if i == 1:
            return self.WordList
        raise IndexError('Illegal index for parse state!')
        
###########################################################################
###  Load time execution code starts here
###########################################################################
        
def demo (strings):
    """
    Build two sample grammars and parsers, then run the first parser
    over each sentence in C{strings}, printing the sentence, a rule and
    the recognition result.

    The grammars and parsers are published as the module globals
    C{g1}, C{p1}, C{g2}, C{p2}.  Tracing is switched on for both
    parsers; only C{p1} is exercised by the loop below.

    @param strings: iterable of sentences (strings) to recognize.
    """
    global g1, p1, g2, p2

    # Rules are listed in the order they are added; the order of the
    # alternatives for one category determines the order the recognizer
    # tries them in.
    first_rules = [
        ('s', ['np', 'vp']),
        ('s', ['vp']),
        ('vp', ['v']),
        ('vp', ['v', 'np']),
        ('np', ['pname']),
        ('pname', ['john']),
        ('pname', ['mary']),
        ('np', ['d', 'n']),
        ('d', ['the']),
        ('n', ['boy']),
        ('n', ['girl']),
        ('n', ['beans']),
        ('v', ['likes']),
        ('v', ['run']),
        ('v', ['eat']),
    ]
    second_rules = [
        ('s', ['np', 'vp']),
        ('s', ['vp']),
        ('vp', ['v']),
        ('vp', ['v', 'np']),
        ('np', ['pname']),
        ('pname', ['john']),
        ('pname', ['mary']),
        ('np', ['d', 'n']),
        ('d', ['the']),
        ('n', ['boy']),
        ('n', ['girl']),
        ('n', ['tree']),
        ('n', ['flowers']),
        ('n', ['beans']),
        ('v', ['likes']),
        ('v', ['run']),
        ('v', ['eat']),
        ('p', ['with']),
        ('pp', ['p', 'np']),
        ('np', ['np', 'pp']),
    ]

    ## The first grammar
    g1 = Grammar('s')
    for cat, rhs in first_rules:
        g1.add_production(cat, rhs)
    p1 = Parser(g1)
    p1.trace()

    ## The second grammar
    g2 = Grammar('s')
    for cat, rhs in second_rules:
        g2.add_production(cat, rhs)
    p2 = Parser(g2)
    p2.trace()

    # Single-argument print(...) gives the same output on Python 2 and 3.
    for s in strings:
        print(s)
        print('---------')
        print(p1.recognize_string(s))
        print('')
    

if __name__ == '__main__':
    # Sample sentences fed to demo() when the file is run as a script.
    strings = [
        'John likes Mary',
        'The boy likes the girl',
        'Eat beans',
    ]
    demo(strings)