# Build GIZA++ (including the snt2cooc.out helper) and mkcls, then collect
# the resulting binaries in ./bin.
#
# Each build runs in a subshell so the working directory stays at the
# workspace root: the mkdir/cp lines below use workspace-relative paths and
# would not resolve from inside mkcls-v2/.
(cd GIZA++-v2/ && make && make snt2cooc.out)
(cd mkcls-v2/ && make)

mkdir -p bin
cp GIZA++-v2/GIZA++ bin/
cp GIZA++-v2/snt2cooc.out bin/
cp mkcls-v2/mkcls bin/
# Check out the Moses trunk into ./moses and build it against SRILM.
mkdir -p moses
svn co https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk moses

# Build inside a subshell so the working directory returns to the workspace
# root afterwards; otherwise bin/moses-scripts would be created under moses/
# instead of next to the workspace-level bin/ directory.
(
  cd moses &&
    ./regenerate-makefiles.sh &&
    ./configure --with-srilm=/path-to-srilm &&
    make -j 4
)

# Target directory for the released Moses scripts.
mkdir -p bin/moses-scripts
### moses/scripts/Makefile must use the two settings below. Plain shell
### assignments are not visible to make, so they are passed on the make
### command line, where they take precedence over the Makefile's own
### assignments (editing the Makefile by hand to the same values also works).
TARGETDIR=/full-path-to-workspace/bin/moses-scripts
BINDIR=/full-path-to-workspace/bin
###
# Run the release target from a subshell so the working directory stays at
# the workspace root for the workspace-relative paths used afterwards.
(cd moses/scripts/ && make release TARGETDIR="$TARGETDIR" BINDIR="$BINDIR")
# This produces bin/moses-scripts/scripts-YYYYMMDD-HHMM with released versions of all the scripts. You will call these versions when training/tuning Moses. The output of "make release" should indicate this.
# Export the location of the released Moses scripts.
export SCRIPTS_ROOTDIR=/full-path-to-workspace/bin/moses-scripts/scripts-YYYYMMDD-HHMM

# Unpack the additional scripts archive (expected in the current directory).
tar xzf scripts.tgz
# NOTE(review): the three paths below read as a listing of scripts the
# archive provides, not as commands. Every later use feeds these scripts via
# stdin redirection, so running them bare would just wait on terminal input.
# They are kept here as a reference list - confirm.
#   scripts/tokenizer.perl
#   scripts/lowercase.perl
#   scripts/wrap-xml.perl

# Download the NIST scoring script into the current directory.
wget ftp://jaguar.ncsl.nist.gov/mt/resources/mteval-v11b.pl
# As these instructions indicate, vocab creation and *.snt file creation are unnecessary with Moses (though they were with GIZA++): Moses internally runs scripts that do this. Start Moses with a lowercased, tokenized, sentence-aligned training corpus.
# Prepare the parallel training corpus: tokenize both sides, run the
# corpus-cleaning script, then lowercase both sides.
corpus_dir=working-dir/corpus
mkdir -p "$corpus_dir"

for lang in fr en; do
  scripts/tokenizer.perl -l "$lang" \
    < "wmt07/training/europarl-v3.fr-en.$lang" \
    > "$corpus_dir/europarl.tok.$lang"
done

# The trailing "1 40" are presumably the min/max sentence lengths to keep;
# confirm against clean-corpus-n.perl.
bin/moses-scripts/scripts-YYYYMMDD-HHMM/training/clean-corpus-n.perl \
  "$corpus_dir/europarl.tok" fr en "$corpus_dir/europarl.clean" 1 40

for lang in fr en; do
  scripts/lowercase.perl \
    < "$corpus_dir/europarl.clean.$lang" \
    > "$corpus_dir/europarl.lowercased.$lang"
done
# Build the English language model: tokenize and lowercase the monolingual
# text, then run SRILM's ngram-count on the lowercased result.
lm_dir=working-dir/lm
mkdir -p "$lm_dir"

scripts/tokenizer.perl -l en \
  < wmt07/training/europarl-v3.en \
  > "$lm_dir/europarl.tok"
scripts/lowercase.perl \
  < "$lm_dir/europarl.tok" \
  > "$lm_dir/europarl.lowercased"

/path-to-srilm/bin/i686/ngram-count \
  -order 5 -interpolate -kndiscount \
  -text "$lm_dir/europarl.lowercased" \
  -lm "$lm_dir/europarl.lm"
# Train the fr->en translation model on the lowercased corpus.
# This is a single command; the backslash continuations are required so the
# option lines are not executed as separate commands.
bin/moses-scripts/scripts-YYYYMMDD-HHMM/training/train-factored-phrase-model.perl \
  -scripts-root-dir bin/moses-scripts/scripts-YYYYMMDD-HHMM \
  -root-dir working-dir \
  -corpus working-dir/corpus/europarl.lowercased \
  -f fr -e en \
  -alignment grow-diag-final-and \
  -reordering msd-bidirectional-fe \
  -lm 0:5:working-dir/lm/europarl.lm:0
# JMG note: my compiled version of GIZA++ issued an "unknown option"
# error message about the "coocurrenceFile" option it was given.
# It was compiled from Chris Dyer's patched sources.
# Prepare the tuning set: tokenize the dev2006 source (fr) and reference (en),
# then lowercase each tokenized file.
tuning_dir=working-dir/tuning
mkdir -p "$tuning_dir"

scripts/tokenizer.perl -l fr < wmt07/dev/dev2006.fr > "$tuning_dir/input.tok"
scripts/tokenizer.perl -l en < wmt07/dev/dev2006.en > "$tuning_dir/reference.tok"

for side in input reference; do
  scripts/lowercase.perl < "$tuning_dir/$side.tok" > "$tuning_dir/$side"
done
# Tune the model weights with mert-moses.pl on the prepared tuning set.
# Positional arguments, in order: tuning input, tuning reference, decoder
# binary, model configuration. The continuations keep this one command.
bin/moses-scripts/scripts-YYYYMMDD-HHMM/training/mert-moses.pl \
  working-dir/tuning/input \
  working-dir/tuning/reference \
  moses/moses-cmd/src/moses \
  working-dir/model/moses.ini \
  --working-dir working-dir/tuning \
  --rootdir bin/moses-scripts/scripts-YYYYMMDD-HHMM
# Combine the tuning-run moses.ini (argument) with the trained model's
# moses.ini (stdin) and write the result to moses.weight-reused.ini.
# Each redirection operator sits on the same line as its target, since a
# line ending in a bare "<" or ">" is a shell syntax error.
scripts/reuse-weights.perl working-dir/tuning/moses.ini \
  < working-dir/model/moses.ini \
  > working-dir/tuning/moses.weight-reused.ini
# Prepare the evaluation set: tokenize the devtest2006 source (fr) and
# reference (en), then lowercase each tokenized file.
eval_dir=working-dir/evaluation
mkdir -p "$eval_dir"

scripts/tokenizer.perl -l fr \
  < wmt07/devtest/devtest2006.fr \
  > "$eval_dir/devtest2006.input.tok"
scripts/tokenizer.perl -l en \
  < wmt07/devtest/devtest2006.en \
  > "$eval_dir/devtest2006.reference.tok"

for side in input reference; do
  scripts/lowercase.perl \
    < "$eval_dir/devtest2006.$side.tok" \
    > "$eval_dir/devtest2006.$side"
done
# Filter the tuned model for the evaluation input.
# Arguments, in order: output directory, configuration to filter, input text.
# The continuations keep this one command.
bin/moses-scripts/scripts-YYYYMMDD-HHMM/training/filter-model-given-input.pl \
  working-dir/evaluation/filtered.devtest2006 \
  working-dir/tuning/moses.weight-reused.ini \
  working-dir/evaluation/devtest2006.input
# Decode the evaluation input with the filtered model and capture the
# decoder's stdout as the translation output.
# The continuations keep this one command, with ">" beside its target file.
moses/moses-cmd/src/moses \
  -config working-dir/evaluation/filtered.devtest2006/moses.ini \
  -input-file working-dir/evaluation/devtest2006.input \
  > working-dir/evaluation/devtest2006.output
# Train the recaser on the tokenized (not lowercased) language-model text,
# writing the recasing model into ./recaser.
moses_scripts=bin/moses-scripts/scripts-YYYYMMDD-HHMM
"$moses_scripts/recaser/train-recaser.perl" \
  -train-script "$moses_scripts/training/train-factored-phrase-model.perl" \
  -ngram-count /path-to-srilm/bin/i686/ngram-count \
  -corpus working-dir/lm/europarl.tok \
  -dir recaser
# Recase the decoder output using the recaser model in ./recaser.
# The continuations keep this one command, with ">" beside its target file.
bin/moses-scripts/scripts-YYYYMMDD-HHMM/recaser/recase.perl \
  -model recaser/moses.ini \
  -in working-dir/evaluation/devtest2006.output \
  -moses moses/moses-cmd/src/moses \
  > working-dir/evaluation/devtest2006.output.recased
# Detokenize the recased English output.
# Each redirection operator sits on the same line as its target, since a
# line ending in a bare "<" or ">" is a shell syntax error.
scripts/detokenizer.perl -l en \
  < working-dir/evaluation/devtest2006.output.recased \
  > working-dir/evaluation/devtest2006.output.detokenized
# Wrap the detokenized output in SGML for scoring, with "baseline" as the
# system id.
# NOTE(review): the reference SGML was test2006-ref.en.sgm; it is changed to
# devtest2006-ref.en.sgm so it matches the devtest2006 data used everywhere
# else, including the reference passed to mteval - confirm.
scripts/wrap-xml.perl wmt07/devtest/devtest2006-ref.en.sgm en baseline \
  < working-dir/evaluation/devtest2006.output.detokenized \
  > working-dir/evaluation/devtest2006.output.sgm
# JMG: added the system ID argument to this command (presumably the "baseline" argument of the wrap-xml.perl call above).
# Score the wrapped system output against the reference with the NIST
# mteval script (-r reference, -t system output, -s source).
# The continuations keep this one command.
# NOTE(review): the script is invoked by bare name, so it must be executable
# and on PATH; a freshly downloaded copy in the current directory may need
# "perl ./mteval-v11b.pl" instead - confirm.
mteval-v11b.pl \
  -r wmt07/devtest/devtest2006-ref.en.sgm \
  -t working-dir/evaluation/devtest2006.output.sgm \
  -s wmt07/devtest/devtest2006-src.fr.sgm \
  -c