#!/usr/bin/env python
import sys
import math

## Handy for log probabilities.
neg_infinity = float("-infinity")

## When True, probability models are written with an extra column holding
## the plain probability (2**logprob) next to the log probability.
print_probs_flag = True


def _log2(x):
    """
    Return the base-2 log of C{x}, mapping zero to C{neg_infinity}
    instead of raising C{ValueError} the way C{math.log} does.
    """
    if x == 0:
        return neg_infinity
    return math.log(x, 2)


def word_and_tag_counts(training_file):
    """
    Collect tag, tag-bigram and word/tag counts from a tagged corpus.

    Each non-empty line of C{training_file} is one sentence made of
    whitespace-separated C{word_tag} tokens.  The tag is whatever follows
    the LAST underscore, so words that themselves contain underscores
    (C{New_York_NNP}) are accepted.  Tokens without any underscore are
    reported on stderr and skipped.

    Every sentence that contributes at least one well-formed token is
    bracketed by the pseudo-tags C{'START'} and C{'END'}; sentences with
    no usable token are ignored entirely, so the START and END counts
    always agree.

    @param training_file: path of the tagged training corpus.
    @return: C{(tagtag, tag_count, wordtag_count)} where
        C{tagtag[prev_tag][tag]} is a tag-bigram count,
        C{tag_count[tag]} is a tag count (including START and END) and
        C{wordtag_count[word][tag]} is a word/tag pair count.
    @rtype: a tuple of three dictionaries.
    """
    sys.stderr.write('Reading %s\n' % training_file)
    wordtag_count = {}  ## word tag pair counts
    tag_count = {}      ## tag counts
    tagtag = {}         ## tag bigrams

    # No explicit buffering argument: unbuffered text mode is rejected by
    # Python 3 and buys nothing when reading.
    with open(training_file, 'r') as fsock_train:
        for line in fsock_train:
            # First pass over the line: keep only well-formed pairs.
            pairs = []
            for elem in line.split():
                wt_pair = elem.rsplit('_', 1)
                if len(wt_pair) == 2:
                    pairs.append(wt_pair)
                else:
                    sys.stderr.write(
                        'Ill-formed wd/tg pair: %s\n' % wt_pair)
            if not pairs:
                continue

            oldtag = 'START'
            tag_count[oldtag] = tag_count.get(oldtag, 0) + 1
            for word, tag in pairs:
                word_row = wordtag_count.setdefault(word, {})
                word_row[tag] = word_row.get(tag, 0) + 1
                trans_row = tagtag.setdefault(oldtag, {})
                trans_row[tag] = trans_row.get(tag, 0) + 1
                tag_count[tag] = tag_count.get(tag, 0) + 1
                oldtag = tag

            # Close the sentence from the last well-formed tag, even when
            # trailing tokens on the line were ill-formed.
            trans_row = tagtag.setdefault(oldtag, {})
            trans_row['END'] = trans_row.get('END', 0) + 1
            tag_count['END'] = tag_count.get('END', 0) + 1
    return (tagtag, tag_count, wordtag_count)


def smoothed_conditional_probs(a_b_count, a_count, laplace=0.5):
    """
    Estimate log2 p(b | a) using Laplacian (add-C{laplace}, default 0.5)
    smoothing over every pair of types found in C{a_count}:

        p(b | a) = (count(a,b) + laplace) / (count(a) + laplace * ntypes)

    Side effect: C{a_b_count} is padded in place with explicit zero
    entries so that C{a_b_count[a][b]} exists for every pair of types in
    C{a_count}.  Callers that print the table afterwards will see those
    zero rows.

    @param a_b_count: pair counts indexed as C{a_b_count[a][b]}, both
        drawn from a single set of types (a set of tags, etc.).
    @param a_count: counts of the a's; its keys define the set of types.
    @param laplace: the non-negative pseudo-count added to every pair.
        With C{laplace=0} unseen pairs get C{neg_infinity}.
    @return: smoothed log probabilities, indexed OUTCOME FIRST:
        >>> log_prob[b][a]      # log2 p(b | a)
    @rtype: a dictionary of dictionaries.
    """
    types = list(a_count.keys())
    ntypes = len(types)
    log_prob = {}  # Smoothed transition log probs.
    for t1 in types:
        row = a_b_count.setdefault(t1, {})  # Init if necessary
        denominator = a_count[t1] + (laplace * ntypes)
        for t2 in types:
            pair_count = row.setdefault(t2, 0)  # Init if necessary
            if denominator == 0:
                # Only reachable with laplace=0 and an unseen context;
                # avoid the meaningless -inf - -inf = nan.
                value = neg_infinity
            else:
                value = _log2(pair_count + laplace) - _log2(denominator)
            log_prob.setdefault(t2, {})[t1] = value
    return log_prob


def unsmoothed_conditional_probs(a_b_count, a_count):
    """
    Estimate log2 p(b | a) by maximum likelihood (no smoothing):

        p(b | a) = count(b,a) / count(a)

    Note the index order, which differs from
    C{smoothed_conditional_probs}: here the OUTCOME is the outer key of
    the input table.

    @param a_b_count: pair counts indexed as C{a_b_count[b][a]}
        (e.g. C{wordtag_count[word][tag]}).
    @param a_count: counts of the conditioning items a.  Must contain
        every a that occurs in C{a_b_count} (C{KeyError} otherwise).
    @return: unsmoothed log probabilities, indexed like the input:
        >>> log_prob[b][a]      # log2 p(b | a)
        Pairs stored with a zero count map to C{neg_infinity}.
    @rtype: a dictionary of dictionaries.
    """
    cond_probs = {}
    for b in a_b_count:
        this_b_dic = a_b_count[b]
        for a in this_b_dic:
            # Zero counts appear once smoothed_conditional_probs has
            # padded a table; _log2 turns them into -infinity.
            logprob = _log2(float(this_b_dic[a]) / a_count[a])
            cond_probs.setdefault(b, {})[a] = logprob
    return cond_probs


def print_bigram_model(model_file, bigram_model, desc, counts=False,
                       read_mode='a'):
    """
    Write a two-level table to C{model_file}, one tab-separated row per
    (outer key, inner key) pair, between 'Start'/'End' marker lines.

    @param model_file: path of the model file.
    @param bigram_model: dictionary of dictionaries to dump.
    @param desc: section name used in the marker lines.
    @param counts: if true the values are integer counts (C{%d});
        otherwise they are log probabilities (C{%.3f}), followed by the
        plain probability C{2**q} when C{print_probs_flag} is set.
    @param read_mode: mode string handed to C{open} ('w' to start a new
        file, 'a' to append a section).
    """
    fsock_model = start_model_write(model_file, desc, read_mode)
    try:
        for ti in bigram_model:
            this_dic = bigram_model[ti]
            for tj in this_dic:
                q = this_dic[tj]
                if counts:
                    row = '%s\t%s\t%d' % (ti, tj, q)
                elif print_probs_flag:
                    row = '%s\t%s\t%.3f\t%.6f' % (ti, tj, q, 2**q)
                else:
                    # Was '%.s', whose zero precision printed tj as ''.
                    row = '%s\t%s\t%.3f' % (ti, tj, q)
                fsock_model.write(row + '\n')
        end_model_write(fsock_model, desc)
    finally:
        # close() is idempotent; this only matters if a write failed.
        fsock_model.close()


def print_unigram_model(model_file, unigram_model, desc, counts=False,
                        read_mode='a'):
    """
    Write a flat table to C{model_file}, one tab-separated row per key,
    between 'Start'/'End' marker lines.

    @param model_file: path of the model file.
    @param unigram_model: dictionary to dump.
    @param desc: section name used in the marker lines.
    @param counts: if true the values are integer counts (C{%d});
        otherwise they are log probabilities (C{%.3f}), followed by the
        plain probability C{2**q} when C{print_probs_flag} is set.
    @param read_mode: mode string handed to C{open} ('w' to start a new
        file, 'a' to append a section).
    """
    fsock_model = start_model_write(model_file, desc, read_mode)
    try:
        for ti in unigram_model:
            q = unigram_model[ti]
            if counts:
                row = '%s\t%d' % (ti, q)
            elif print_probs_flag:
                row = '%s\t%.3f\t%.6f' % (ti, q, 2**q)
            else:
                row = '%s\t%.3f' % (ti, q)
            fsock_model.write(row + '\n')
        end_model_write(fsock_model, desc)
    finally:
        # close() is idempotent; this only matters if a write failed.
        fsock_model.close()


def start_model_write(model_file, desc, read_mode):
    """
    Open C{model_file} with mode C{read_mode}, write the
    'Start <desc> model' marker line and announce the write on stderr.

    @return: the open file object; the caller must close it (normally
        through C{end_model_write}).
    """
    # No buffering argument: unbuffered text mode is rejected by Python 3,
    # and the file is flushed when it is closed anyway.
    fsock_model = open(model_file, read_mode)
    try:
        fsock_model.write('Start %s model\n' % desc)
    except Exception:
        fsock_model.close()
        raise
    sys.stderr.write('Writing %s model to %s\n' % (desc, model_file))
    return fsock_model
def end_model_write(fsock_model, desc):
    """
    Write the 'End <desc> model' marker line to C{fsock_model} and close
    it.  The file is closed even if writing the marker fails.
    """
    try:
        fsock_model.write('End %s model\n' % desc)
    finally:
        fsock_model.close()


def parse_smooth_flag(text):
    """
    Translate the command-line smooth flag into a boolean.

    @return: C{True} / C{False} for the accepted spellings, C{None} for
        anything else.
    """
    if text in ['true', 'True', 'T', 't']:
        return True
    if text in ['false', 'False', 'F', 'f']:
        return False
    return None


def transpose_counts(pair_count):
    """
    Return a new table with the two index levels of C{pair_count}
    swapped: C{result[b][a] == pair_count[a][b]}.  The input is left
    untouched.
    """
    flipped = {}
    for outer in pair_count:
        inner_dic = pair_count[outer]
        for inner in inner_dic:
            flipped.setdefault(inner, {})[outer] = inner_dic[inner]
    return flipped


def main(argv):
    """
    Train a tag-bigram / word-tag model and write it to a model file.

    @param argv: C{[program, training_file, model_file, smooth_flag]};
        C{smooth_flag} is optional and defaults to smoothing on.
        Exits with status 1 on any usage error.
    """
    # NOTE(review): the placeholders in the original usage strings were
    # lost ('Usage: ngram_model.py ()'); this text is a best guess —
    # confirm the wording.
    usage = ('Usage: ngram_model.py <training_file> <model_file> '
             '(<smooth_flag>)')

    if len(argv) > 3:
        smooth_flag = parse_smooth_flag(argv[3])
        if smooth_flag is None:
            sys.stderr.write(usage + '\n')
            sys.stderr.write('Smooth flag = True or False\n')
            sys.exit(1)
        if smooth_flag:
            sys.stderr.write('Smoothing turned on\n')
        else:
            sys.stderr.write('Smoothing turned off\n')
    else:
        smooth_flag = True

    if len(argv) < 2:
        sys.stderr.write(usage + '\n')
        sys.exit(1)
    if len(argv) < 3:
        # Previously execution continued and died later with a NameError
        # on model_file; stop here instead.
        sys.stderr.write('No model file!\n\n')
        sys.stderr.write(usage + '\n')
        sys.exit(1)
    training_file = argv[1]
    model_file = argv[2]

    (tagtag, tag_count, wordtag_count) = word_and_tag_counts(training_file)

    ## Smooth tag tag counts and do log probs
    sys.stderr.write('Computing tag tag model.\n')
    if smooth_flag:
        sys.stderr.write('Smoothing tag tag probs!\n')
        A = smoothed_conditional_probs(tagtag, tag_count)
    else:
        # unsmoothed_conditional_probs divides table[b][a] by a_count[a],
        # i.e. it conditions on the INNER key, while tagtag is indexed
        # [prev_tag][tag].  Transpose first so the result is
        # A[tag][prev_tag] = log p(tag | prev_tag), the same layout and
        # meaning the smoothed branch produces.
        A = unsmoothed_conditional_probs(transpose_counts(tagtag),
                                         tag_count)

    sys.stderr.write('Computing word tag model.\n')
    # Note: Not smoothing these!  Left as exercise for you!
    B = unsmoothed_conditional_probs(wordtag_count, tag_count)

    # The first section truncates the file ('w'); the rest are appended.
    print_bigram_model(model_file, A, 'tag tag', False, 'w')
    print_bigram_model(model_file, B, 'word tag', False)
    print_unigram_model(model_file, tag_count, 'Tag Counts', True)
    print_bigram_model(model_file, tagtag, 'tag tag Counts', True)
    print_bigram_model(model_file, wordtag_count, 'Word Tag Counts', True)


############################################################
################      Main Program          ################
############################################################

if __name__ == '__main__':
    main(sys.argv)