#!/usr/bin/perl -w
## tagger - a baseline tagger
##
## Usage: tagger TRAIN_FILE TEST_FILE
##
## TRAIN_FILE holds whitespace-separated tokens of the form word_tag.
## TEST_FILE holds whitespace-separated bare words.  Every test word is
## printed to STDOUT as word_tag, using the tag seen most often with that
## word in the training data, or the most frequent tag overall when the
## word never occurred in training.  Progress messages go to STDERR.

use strict;
use warnings;

# Run as a script only; when the file is loaded with require/do the helper
# subs below can be called directly without touching @ARGV.
main(@ARGV) unless caller;

# Entry point: train on the first file, then tag the second one.
# Dies with a usage message when an argument is missing and with the
# system error text when a file cannot be opened.
sub main {
    my ($train_path, $test_path) = @_;

    die "Usage: $0 TRAIN_FILE TEST_FILE\n"
        unless defined $train_path && defined $test_path;

    # Read training data.  Low-precedence "or" is required here: with "||"
    # the die would attach to the file name and never fire.
    open my $train_fh, '<', $train_path
        or die "Can't open training file $train_path: $!\n";
    print STDERR "Reading $train_path\n";
    my ($word_counts, $tag_counts) = count_tags($train_fh);
    close $train_fh;

    # Find the most common tag per word
    print STDERR "Finding most common tags\n";
    my %tags;
    for my $w (keys %{$word_counts}) {
        $tags{$w} = most_common($word_counts->{$w});
    }

    # Find the most common tag overall
    my $default_tag = most_common($tag_counts);

    # Add tags to the test data
    open my $test_fh, '<', $test_path
        or die "Can't open test file $test_path: $!\n";
    print STDERR "Reading $test_path\n";
    while (my $line = <$test_fh>) {
        print tag_line($line, \%tags, $default_tag), "\n";
    }
    close $test_fh;

    return;
}

# Split a "word_tag" token at its LAST underscore, so that words which
# themselves contain underscores keep them.
# Returns (word, tag), or the empty list when the token has no underscore
# or nothing after the final underscore.
sub split_token {
    my ($token) = @_;

    my $sep = rindex $token, '_';
    return if $sep < 0 || $sep == length($token) - 1;

    return (substr($token, 0, $sep), substr($token, $sep + 1));
}

# Read word_tag tokens from the filehandle $fh until end of file.
# Returns two hash references:
#   { word => { tag => count } }  and  { tag => count }.
# Tokens that split_token rejects are skipped.
sub count_tags {
    my ($fh) = @_;

    my (%word_counts, %tag_counts);
    while (my $line = <$fh>) {
        # split ' ' ignores leading whitespace and the trailing newline,
        # so no chomp is needed and no empty tokens are produced.
        for my $token (split ' ', $line) {
            my ($w, $t) = split_token($token);
            next unless defined $t;
            $word_counts{$w}{$t}++;
            $tag_counts{$t}++;
        }
    }

    return (\%word_counts, \%tag_counts);
}

# Return the key with the highest count in the hash reference $count_of,
# or "" when the hash is empty.  Keys are visited in sorted order, so a
# tie always resolves to the lexically smallest key instead of depending
# on hash iteration order.
sub most_common {
    my ($count_of) = @_;

    my $best       = "";
    my $best_count = 0;
    for my $key (sort keys %{$count_of}) {
        if ($count_of->{$key} > $best_count) {
            $best_count = $count_of->{$key};
            $best       = $key;
        }
    }

    return $best;
}

# Tag every whitespace-separated word of $line.
# $tag_of is a hash reference mapping word => tag; words missing from it
# receive $default_tag.  Returns the tagged words as one string, each
# followed by a single space, without a trailing newline.
sub tag_line {
    my ($line, $tag_of, $default_tag) = @_;

    my $tagged = '';
    for my $w (split ' ', $line) {
        my $tag = exists $tag_of->{$w} ? $tag_of->{$w} : $default_tag;
        $tagged .= "${w}_${tag} ";
    }

    return $tagged;
}