import xml.dom.minidom
import re

sre = re.compile('&')


class SensevalInst (object):
    """
    Plain record for one C{senseval_instance} element.

    @ivar sense: value of the element's C{sense} attribute.
    @ivar position: value of the element's C{position} attribute, as an int.
    @ivar id: value of the element's C{id} attribute.
    @ivar context: list of lists, one per whitespace-separated token of the
        element text, each token split on C{'_'}.
    """

    def __init__(self, sense, position, inst_id, context):
        self.sense = sense
        self.position = position
        self.id = inst_id
        self.context = context


def parse_file_and_extract_events(xml_file, event_list, indent=0):
    """
    Parse C{xml_file} with minidom and append its events to C{event_list}.

    @param xml_file: path of the senseval xml file to read.
    @param event_list: list that is extended in place with
        (L{SensevalInst}, sense) pairs.
    @param indent: passed straight through to L{extract_events}.
    """
    # Binary mode lets the XML parser honour the document's own encoding
    # declaration; the with-block closes the file even if parsing fails.
    with open(xml_file, 'rb') as g:
        g_doc = xml.dom.minidom.parse(g)
    extract_events(g_doc, event_list, indent, True)


def extract_events (node, event_list, indent=0, tagging=False):
    """
    C{node} is the root of an xml.dom.minidom tree for a senseval xml file.

    Walk C{node}, its following siblings and all of their descendants.  For
    every C{senseval_instances} element found, append one
    (L{SensevalInst}, sense) pair to C{event_list} for each
    C{senseval_instance} element below it.

    @param node: minidom node to start from (may be C{None}).
    @param event_list: list that is extended in place.
    @param indent: not used here except that C{indent + 4} is handed to the
        recursive call.
    @param tagging: not used here except that it is handed to the recursive
        call.
    @raise ValueError: if a C{position} attribute is not an integer.
    """
    while node is not None:
        if (node.nodeType == node.ELEMENT_NODE
                and node.nodeName == 'senseval_instances'):
            for inst_elem in node.getElementsByTagName('senseval_instance'):
                sense = inst_elem.getAttribute('sense')
                position = int(inst_elem.getAttribute('position'))
                inst_id = inst_elem.getAttribute('id')
                # An empty element has no first child (and a non-text first
                # child has no wholeText); treat both as an empty sentence
                # instead of failing with AttributeError.
                sentence = getattr(inst_elem.firstChild, 'wholeText', '')
                ## Now make sentence a list of word tag pairs
                context = [wordtag.split('_') for wordtag in sentence.split()]
                ## C{context[position]} is meant to return our word of
                ## interest.
                instance = SensevalInst(sense, position, inst_id, context)
                event_list.append((instance, sense))
        # Descend into the children, then move on to the next sibling.
        extract_events(node.firstChild, event_list, indent + 4, tagging)
        node = node.nextSibling


def wsd_features (context, position, context_id, vocab):
    """
    Not implemented.

    Return a dictionary of the features extracted.  None of the arguments
    is read yet, so the result is always an empty dictionary.
    """
    features = {}
    ### Code here!
    return features


def extract_vocab(event_list, n=300):
    """
    Not implemented; currently always returns an empty dictionary.

    Intended to return a dictionary of the C{n} most frequently occurring
    words in THESE contexts, excluding stop words.  Should probably compare
    stemmed forms.  Not done yet.

    @return vocab: vocab[{word}] returns a list of the context ids
        C{word} occurs in.
    """
    # Google's stoplist with most preps removed.  "and" added, word added.
    # Not consulted yet; kept for the implementation that goes below.
    stopwords = [
        'I', 'a', 'an', 'are', 'as', 'and', 'be', 'com', 'how', 'is', 'it',
        'of', 'or', 'that', 'the', 'this', 'was', 'what', 'when', 'where',
        'who', 'will', 'with', 'www']
    vocab = {}
    ## Code goes here.
    return vocab


def print_event (fh, feature_dict, sense):
    """
    Write one event record to the open file C{fh}.

    The record is a C{BEGIN EVENT} line, the sense, one
    C{key<TAB>value} line per entry of C{feature_dict}, and an
    C{END EVENT} line.
    """
    # fh.write produces the same text as the former C{print >> fh}
    # statements and is valid on both Python 2 and Python 3.
    fh.write('BEGIN EVENT\n')
    fh.write('%s\n' % (sense,))
    for k in feature_dict:
        fh.write('%s\t%s\n' % (k, feature_dict[k]))
    fh.write('END EVENT\n')


def main():
    """
    Read C{senseval-hard.xml} and write one event per instance to
    C{senseval-hard.evt}.
    """
    data_file = 'senseval-hard.xml'
    event_file = 'senseval-hard.evt'
    event_list = []
    parse_file_and_extract_events(data_file, event_list)
    ## Extract the 300 most frequent vocab words from the events
    ## Store as a dictionary.
    vocab = extract_vocab(event_list, 300)
    # The with-block closes the event file even if feature extraction fails.
    with open(event_file, 'w') as e_fh:
        for (s_inst, sense) in event_list:
            feature_dict = wsd_features(s_inst.context, s_inst.position,
                                        s_inst.id, vocab)
            print_event(e_fh, feature_dict, sense)


if __name__ == '__main__':
    main()