/[svn]/OpenMaTrEx/trunk/Makefile
ViewVC logotype

Contents of /OpenMaTrEx/trunk/Makefile

Parent Directory Parent Directory | Revision Log Revision Log


Revision 268 - (show annotations)
Fri Mar 25 05:19:28 2011 UTC (6 years, 9 months ago) by mikel
File size: 38996 byte(s)
correct a bash error
1 # $Id:$
2 #-------------------------------------------------------------------------------
3 # This file is part of OpenMaTrEx: a marker-driven corpus-based machine
4 # translation system.
5 #
6 # Copyright (c) 2004-2010 Dublin City University
7 # (c) 2004-2007 Steve Armstrong, Yvette Graham, Nano Gough, Declan Groves,
8 # Yanjun Ma, Nicolas Stroppa, John Tinsley, Andy Way, Bart Mellebeek
9 # (c) 2010-2011 Pratyush Banerjee, Sandipan Dandapat, Mikel L. Forcada,
10 # Declan Groves, Sergio Penkale, John Tinsley, Antonio Toral, Yanjun Ma
11 #
12 # This program is free software: you can redistribute it and/or modify
13 # it under the terms of the GNU General Public License as published by
14 # the Free Software Foundation, either version 3 of the License, or
15 # (at your option) any later version.
16 #
17 # This program is distributed in the hope that it will be useful,
18 # but WITHOUT ANY WARRANTY; without even the implied warranty of
19 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 # GNU General Public License for more details.
21 #
22 # You should have received a copy of the GNU General Public License
23 # along with this program. If not, see <http://www.gnu.org/licenses/>
24 #-------------------------------------------------------------------------------
25
26
# ------------------------------------------------------------------------------
# Default parameters.
#
# Every value in this section is only a default and can be overridden on the
# command line, for example:
#     OpenMaTrEx SL=fr TL=en init
# ------------------------------------------------------------------------------

# --- General ------------------------------------------------------------------
# Character encoding that init records in orig/encoding.
ENCODING = UTF-8
# Debugging is turned off; init selects the debug trace mode only for YES.
DEBUG = NO
# Directory under which the Moses training, MERT and decoding output is placed.
SUBDIR = .

# --- Installation paths -------------------------------------------------------
# Main location of OpenMaTrEx.
TOOLDIR = $(OPENMATREX_DIR)
OPENMATREX_ROOT = $(OPENMATREX_DIR)
# Value handed to the merge scripts as their PATH=... argument.
MERGE_FILES = $(OPENMATREX_DIR)/tools/merge

MAKEFILES=$(OPENMATREX_DIR)/Makefile

# --- Corpus filtering and chunking --------------------------------------------
# Parameters to filter the parallel corpus:
# no line longer than 100 words, no sentence length ratio above 9.0.
FILTER_PARAMS = 100_9.0_9.0
# By default, the training data is what gets chunked.
CHUNKING_INPUT = train

# Currently disabled:
#TTABLE = final_tables/smt-ttable

# --- Language model (IRSTLM) --------------------------------------------------
# Options given to IRSTLM when training a language model.
LM_OPTIONS = -s improved-kneser-ney
# Base name (under orig/) of the language model training file.
LM_INPUT = lmtrain
# Order of the language model.
LM_ORDER = 5

# --- Merging ------------------------------------------------------------------
# Parameters for the merge script, e.g.
#     OpenMaTrEx merge_ebmt_with_moses MERGE_PARAMS="MAX_PHRASE_LENGTH=8"
MERGE_PARAMS =

# --- Moses --------------------------------------------------------------------
# Location of the Moses decoder executable.
DECODER=$(MOSES)
# Parameters passed to Moses. For example, to use cube pruning:
#     OpenMaTrEx moses_decode DECODER_PARAMS="-s 2000 -cube-pruning-pop-limit 2000 -search-algorithm 1"
DECODER_PARAMS = -s 500
# Base name of the file that is decoded when PREFIX is not given.
DECODER_OUTPUT = testset
# Extra arguments for mert-moses.pl.
MERT_PARAMS =

# Moses+IRSTLM training options. $$PWD is left for the shell to expand when
# the recipe runs.
TRAINING_OPTIONS = -reordering msd-bidirectional-fe -lm 0:$(LM_ORDER):$$PWD/l_models/lm.irstlm.gz:1

# --- Word packing options -----------------------------------------------------
STEP = 1
K = 3
COF = 5
AC = 0.6
NO_GS_SENT = 502
GS_FILE = orig/gold.zh.en


# export LC_ALL=POSIX
############################################
# init: record the language pair and the settings of this experiment.
#
# Needs SL and TL (given on the command line) and the non-empty corpus files
# orig/train.$(SL) and orig/train.$(TL).  Writes orig/source, orig/target,
# orig/encoding and orig/debugmethod, then touches .OpenMaTrEx_timestamp.
# If orig/lmtrain.$(TL) is missing or empty, orig/train.$(TL) is hard-linked
# in its place so that it is used to train the language model; a failure of
# that link now aborts the target instead of being ignored.
#
# printf replaces "echo -n" and "echo '\n...'": how echo treats -n and
# backslash escapes differs between /bin/sh implementations.
.PHONY: init
init:
	printf '%s' "=====> Initializing source and target..."
	{ [ -n "$(SL)" ] && [ -n "$(TL)" ]; } || { printf '\nError: SL and TL must both be set, e.g. SL=fr TL=en\n'; false; }
	([ -d orig ] && [ -s "orig/train.$(SL)" ]) || { printf '\nError: orig/train.%s nonexistent or empty\n' "$(SL)"; false; }
	echo "$(SL)" > orig/source
	([ -d orig ] && [ -s "orig/train.$(TL)" ]) || { printf '\nError: orig/train.%s nonexistent or empty\n' "$(TL)"; false; }
	echo "$(TL)" > orig/target
	[ -s "orig/lmtrain.$(TL)" ] || { printf '\nWarning: using orig/train.%s to train target language model\n' "$(TL)"; ln -f "orig/train.$(TL)" "orig/lmtrain.$(TL)"; }
	echo "$(ENCODING)" > orig/encoding
	# DEBUG is quoted so that an empty value does not break the test.
	if [ "$(DEBUG)" = "YES" ]; then \
		echo "DebugTraceMode" > orig/debugmethod; \
	else \
		echo "NoTraceMode" > orig/debugmethod; \
	fi
	touch .OpenMaTrEx_timestamp
	echo " OK."
100
101
# check_files: verify both sides of the training corpus by running
# check_source and check_target.  Declared phony so that a file called
# "check_files" can never make the check look up to date.
.PHONY: check_files
check_files: check_source check_target
103
# check_source: fail unless orig/source exists and is non-empty and the
# corpus file orig/train.<SL> it names is non-empty.
# printf is used because "echo -n" and "echo '\n'" are not portable.
.PHONY: check_source
check_source:
	printf '%s' "=====> Verifying source..."
	([ -d orig ] && [ -s orig/source ]) || { printf '\nError: orig/source not found. Please run init\n'; false; }
	SL=`cat orig/source`; \
	([ -d orig ] && [ -s "orig/train.$$SL" ]) || { printf '\nError: orig/train.%s nonexistent or empty\n' "$$SL"; false; }
	echo " OK."
110
# check_target: fail unless orig/target exists and is non-empty and the
# corpus file orig/train.<TL> it names is non-empty.
# printf is used because "echo -n" and "echo '\n'" are not portable.
.PHONY: check_target
check_target:
	printf '%s' "=====> Verifying target..."
	([ -d orig ] && [ -s orig/target ]) || { printf '\nError: orig/target not found. Please run init\n'; false; }
	TL=`cat orig/target`; \
	([ -d orig ] && [ -s "orig/train.$$TL" ]) || { printf '\nError: orig/train.%s nonexistent or empty\n' "$$TL"; false; }
	echo " OK."
117
# check_lmtrain: fail unless orig/target exists and the language model
# training file orig/lmtrain.<TL> is non-empty.
# printf is used because "echo -n" and "echo '\n'" are not portable.
.PHONY: check_lmtrain
check_lmtrain:
	printf '%s' "=====> Verifying LM training file..."
	([ -d orig ] && [ -s orig/target ]) || { printf '\nError: orig/target not found. Please run init\n'; false; }
	TL=`cat orig/target`; \
	([ -d orig ] && [ -s "orig/lmtrain.$$TL" ]) || { printf '\nError: orig/lmtrain.%s nonexistent or empty. Please provide this file or run init.\n' "$$TL"; false; }
	echo " OK."
124
# check_baseline: fail unless decoding/testset.<TL> exists.
# The check previously failed without saying why; it now prints a message
# in the same style as the other check_* targets.
# NOTE(review): the name "testset" and the directory "decoding" are fixed
# here, whereas moses_decode honours PREFIX/DECODER_OUTPUT and SUBDIR --
# confirm this is intended.
.PHONY: check_baseline
check_baseline:
	TL=`cat orig/target`; \
	printf '%s' "=====> Checking baseline..."; \
	([ -d decoding ] && [ -f "decoding/testset.$$TL" ]) || { printf '\nError: decoding/testset.%s not found\n' "$$TL"; false; }
	echo " OK."
130
131 ############################################
# filter: filter the parallel corpus with tools/filter_corpus and make the
# filtered files the active training corpus.
#
# The filtered files are written to filtered/ and hard-linked over
# orig/train.<SL|TL>; the unfiltered originals are kept by ln as
# orig/train.<SL|TL>.no_filter.
#
# Fixes over the previous recipe:
#  * steps are chained with &&, so a failed filter no longer links bogus
#    files, touches filtered/done and prints " OK.";
#  * re-running is safe: an existing .no_filter backup is first moved back
#    to orig/train.*, and stale filtered/train.* entries are removed.
#    Without this the second run reads and writes the same inode, because
#    orig/train.* is a hard link to filtered/train.* (or, after no_filter,
#    filtered/train.* is a symlink to orig/train.*), and ln would overwrite
#    the backup;
#  * the tool path uses $(OPENMATREX_DIR) like the rest of the file.
.PHONY: filter
filter: check_files
	SL=`cat orig/source`; TL=`cat orig/target`; \
	printf '%s' "=====> Filtering corpus..."; \
	mkdir -p filtered || exit 1; \
	rm -f filtered/done; \
	for l in "$$SL" "$$TL"; do \
		if [ -e "orig/train.$$l.no_filter" ]; then \
			mv -f "orig/train.$$l.no_filter" "orig/train.$$l" || exit 1; \
		fi; \
		rm -f "filtered/train.$$l" || exit 1; \
	done; \
	echo '$(FILTER_PARAMS)' > filtered/filter_params \
	&& $(OPENMATREX_DIR)/tools/filter_corpus filtered "orig/train.$$SL" "orig/train.$$TL" '$(FILTER_PARAMS)' \
	&& ln -fb --suffix .no_filter "filtered/train.$$SL" "orig/train.$$SL" \
	&& ln -fb --suffix .no_filter "filtered/train.$$TL" "orig/train.$$TL" \
	&& touch filtered/done \
	&& echo " OK."
142
# no_filter: use the corpus unfiltered.  filtered/train.<SL|TL> become
# symbolic links to ../orig/train.<SL|TL> and filtered/filter_params records
# "no_filtering".
#
# Fixes over the previous recipe:
#  * "ln -s" without -f failed when filtered/train.* already existed (a
#    re-run, or a previous "filter"), yet " OK." was still printed; the links
#    are now replaced with -f and any failure aborts the target;
#  * if a previous "filter" left orig/train.*.no_filter backups, they are
#    moved back first so the links really point at the unfiltered corpus.
.PHONY: no_filter
no_filter: check_files
	SL=`cat orig/source`; TL=`cat orig/target`; \
	printf '%s' "=====> Filtering corpus..."; \
	mkdir -p filtered || exit 1; \
	rm -f filtered/done; \
	for l in "$$SL" "$$TL"; do \
		if [ -e "orig/train.$$l.no_filter" ]; then \
			mv -f "orig/train.$$l.no_filter" "orig/train.$$l" || exit 1; \
		fi; \
		ln -sf "../orig/train.$$l" "filtered/train.$$l" || exit 1; \
	done; \
	echo 'no_filtering' > filtered/filter_params \
	&& touch filtered/done \
	&& echo " OK."
152
# check_filter: fail unless filtered/ contains train.<SL> and train.<TL>.
# Fixes the "Veryfing" typo in the banner and uses printf instead of the
# non-portable "echo -n".
.PHONY: check_filter
check_filter:
	SL=`cat orig/source`; TL=`cat orig/target`; \
	printf '%s' "=====> Verifying filtering..."; \
	[ -d filtered ] && \
	[ -f "filtered/train.$$SL" ] && [ -f "filtered/train.$$TL" ]
	echo " OK."
159
160 ###########################################
161
# marker-based_chunking_source: run the marker-based chunker on
# orig/$(CHUNKING_INPUT).<SL>, writing chunked/$(CHUNKING_INPUT).<SL> and the
# log chunked/source_chunking.log.
#
# A temporary configuration file is generated, passed to the Java entry point
# and removed afterwards.  The exit status of java is now kept: previously
# the trailing "rm" decided the status, so a failed run still printed " OK.".
.PHONY: marker-based_chunking_source
marker-based_chunking_source:
	SL=`cat orig/source`; \
	ENCODING=`cat orig/encoding`; \
	DEBUGMETHOD=`cat orig/debugmethod`; \
	printf '%s' "=====> Marker-based chunking of the source..."; \
	mkdir -p chunked || exit 1; \
	CONFIG_FILE=`mktemp OpenMaTrEx.ini.XXXX` || exit 1; \
	{ \
		echo '<OpenMaTrEx_configuration>'; \
		echo '<chunker baseclass="org.dcu.matrex.openmatrex.chunkers.Chunker" class="org.dcu.matrex.openmatrex.chunkers.MarkerBasedChunker">'; \
		echo '<markersFile class="java.lang.String" value="'"$(OPENMATREX_ROOT)/marker_files/marker_words.$$SL"'" />'; \
		echo '</chunker>'; \
		echo '<inputDataFile class="java.lang.String" value="orig/'"$(CHUNKING_INPUT)"'.'"$$SL"'" />'; \
		echo '<outputDataFile class="java.lang.String" value="chunked/'"$(CHUNKING_INPUT)"'.'"$$SL"'" />'; \
		echo '<encoding class="java.lang.String" value="'"$$ENCODING"'" />'; \
		echo '<traceMode class="org.dcu.matrex.openmatrex.utils.'"$$DEBUGMETHOD"'" />'; \
		echo '</OpenMaTrEx_configuration>'; \
	} > "$$CONFIG_FILE"; \
	java $(JAVA_OPTIONS) -cp $(OPENMATREX_ROOT)/OpenMaTrEx.jar org.dcu.matrex.openmatrex.main.Chunk "$$CONFIG_FILE" > chunked/source_chunking.log; \
	EXIT_CODE=$$?; \
	rm -f "$$CONFIG_FILE"; \
	if [ $$EXIT_CODE -ne 0 ]; then \
		printf '\n=====X(!) source chunking failed. Check chunked/source_chunking.log for details\n'; \
		exit $$EXIT_CODE; \
	fi; \
	echo " OK."
181
182
183 ############################################
# marker-based_chunking_target: run the marker-based chunker on
# orig/$(CHUNKING_INPUT).<TL>, writing chunked/$(CHUNKING_INPUT).<TL> and the
# log chunked/target_chunking.log.
#
# A temporary configuration file is generated, passed to the Java entry point
# and removed afterwards.  The exit status of java is now kept: previously
# the trailing "rm" decided the status, so a failed run still printed " OK.".
.PHONY: marker-based_chunking_target
marker-based_chunking_target:
	TL=`cat orig/target`; \
	ENCODING=`cat orig/encoding`; \
	DEBUGMETHOD=`cat orig/debugmethod`; \
	printf '%s' "=====> Marker-based chunking of the target..."; \
	mkdir -p chunked || exit 1; \
	CONFIG_FILE=`mktemp OpenMaTrEx.ini.XXXX` || exit 1; \
	{ \
		echo '<OpenMaTrEx_configuration>'; \
		echo '<chunker baseclass="org.dcu.matrex.openmatrex.chunkers.Chunker" class="org.dcu.matrex.openmatrex.chunkers.MarkerBasedChunker">'; \
		echo '<markersFile class="java.lang.String" value="'"$(OPENMATREX_ROOT)/marker_files/marker_words.$$TL"'" />'; \
		echo '</chunker>'; \
		echo '<inputDataFile class="java.lang.String" value="orig/'"$(CHUNKING_INPUT)"'.'"$$TL"'" />'; \
		echo '<outputDataFile class="java.lang.String" value="chunked/'"$(CHUNKING_INPUT)"'.'"$$TL"'" />'; \
		echo '<encoding class="java.lang.String" value="'"$$ENCODING"'" />'; \
		echo '<traceMode class="org.dcu.matrex.openmatrex.utils.'"$$DEBUGMETHOD"'" />'; \
		echo '</OpenMaTrEx_configuration>'; \
	} > "$$CONFIG_FILE"; \
	java $(JAVA_OPTIONS) -cp $(OPENMATREX_ROOT)/OpenMaTrEx.jar org.dcu.matrex.openmatrex.main.Chunk "$$CONFIG_FILE" > chunked/target_chunking.log; \
	EXIT_CODE=$$?; \
	rm -f "$$CONFIG_FILE"; \
	if [ $$EXIT_CODE -ne 0 ]; then \
		printf '\n=====X(!) target chunking failed. Check chunked/target_chunking.log for details\n'; \
		exit $$EXIT_CODE; \
	fi; \
	echo " OK."
203
204
205
206 ############################################
207 # check_chunking
# ebmt_alignments: align the chunked training data (chunked/train.<SL|TL>)
# with org.dcu.matrex.openmatrex.main.Align; results use the prefix
# ebmt-alignments/train and the log goes to ebmt-alignments/alignments.log.
#
# Steps:
#  1. Build final_tables/st_words.probs and ts_words.probs from the Moses
#     lexical tables, dropping NULL entries.
#  2. Swap the first two columns of ts_words.probs.  This is needed because
#     its lines have the structure "t s p(t|s)" but were being interpreted as
#     "s t p(t|s)" by aligners/WordProbsChunkDistance.java.  The unswapped
#     copy is kept as ts_words.probs_save.
#  3. Write OpenMaTrEx.ini (no path is passed to java, so it is presumably
#     read from the current directory -- confirm) and run the aligner.  The
#     st_words.probs <filename> entry is not used, but is needed in the call.
#
# Fixes: final_tables is created if missing; "set -e" stops at the first
# failing step instead of carrying on with partial tables (an empty table
# also counts as a failure, since grep then returns non-zero); the trap
# removes OpenMaTrEx.ini even when a step fails; explanatory comments were
# moved out of the continued shell line.
.PHONY: ebmt_alignments
ebmt_alignments:
	SL=`cat orig/source`; TL=`cat orig/target`; ENCODING=`cat orig/encoding`; \
	DEBUGMETHOD=`cat orig/debugmethod`; \
	printf '%s' "=====> EBMT chunk alignment..."; \
	set -e; \
	trap 'rm -f OpenMaTrEx.ini' EXIT; \
	mkdir -p ebmt-alignments final_tables; \
	sed -r -e 's/ / ||| /g' < moses_training/model/lex.e2f | grep -v NULL > final_tables/st_words.probs; \
	sed -r -e 's/ / ||| /g' < moses_training/model/lex.f2e | grep -v NULL > final_tables/ts_words.probs; \
	cp final_tables/ts_words.probs final_tables/ts_words.probs_save; \
	perl -F/'\|\|\|'/ -lane 'print $$F[1] . " ||| " . $$F[0] . " ||| " . $$F[2]' final_tables/ts_words.probs_save >final_tables/ts_words.probs; \
	{ \
		echo '<OpenMaTrEx_configuration>'; \
		echo '<sourceTrainingDataFile class="java.lang.String" value="chunked/train.'"$$SL"'" />'; \
		echo '<targetTrainingDataFile class="java.lang.String" value="chunked/train.'"$$TL"'" />'; \
		echo '<chunksAligner baseclass="org.dcu.matrex.openmatrex.aligners.ChunksAligner" class="org.dcu.matrex.openmatrex.aligners.EditDistanceChunksAligner" />'; \
		echo '<alignedDataFactory baseclass="org.dcu.matrex.openmatrex.data.AlignedDataFactory" class="org.dcu.matrex.openmatrex.data.MapAlignedDataFactory" />'; \
		echo '<chunkDistanceList class="java.util.Vector" list="yes" >'; \
		echo '<chunkDistance1 baseclass="org.dcu.matrex.openmatrex.aligners.ChunkDistance" class="org.dcu.matrex.openmatrex.aligners.CognateChunkDistance" />'; \
		echo '<chunkDistance2 baseclass="org.dcu.matrex.openmatrex.aligners.ChunkDistance" class="org.dcu.matrex.openmatrex.aligners.WordProbsChunkDistance" />'; \
		echo '</chunkDistanceList>'; \
		echo '<trainingResultsPrefix class="java.lang.String" value="ebmt-alignments/train" />'; \
		echo '<wordsAligner baseclass="org.dcu.matrex.openmatrex.aligners.WordsAligner" class="org.dcu.matrex.openmatrex.aligners.GizaWordsAligner" >'; \
		echo '<filename class="java.lang.String" value="final_tables/ts_words.probs" />'; \
		echo '<filename class="java.lang.String" value="final_tables/st_words.probs" />'; \
		echo '</wordsAligner>'; \
		echo '<encoding class="java.lang.String" value="'"$$ENCODING"'" />'; \
		echo '<traceMode class="org.dcu.matrex.openmatrex.utils.'"$$DEBUGMETHOD"'" />'; \
		echo '</OpenMaTrEx_configuration>'; \
	} > OpenMaTrEx.ini; \
	java $(JAVA_OPTIONS) -cp $(OPENMATREX_ROOT)/OpenMaTrEx.jar org.dcu.matrex.openmatrex.main.Align > ebmt-alignments/alignments.log; \
	echo " OK."
241
242
# ebmt_alignments_on_disk: like ebmt_alignments, but runs
# org.dcu.matrex.openmatrex.main.AlignNoCounts and derives the counts
# afterwards: ebmt-alignments/train.ac is sorted, identical lines are counted
# with "uniq -c", and each line is rewritten as "<line> ||| <count>.0".
#
# The ts_words.probs columns are swapped because its lines have the structure
# "t s p(t|s)" but were being interpreted as "s t p(t|s)" by
# aligners/WordProbsChunkDistance.java.  The st_words.probs <filename> entry
# is not used, but is needed in the call.
#
# Fixes: final_tables is created if missing; "set -e" stops at the first
# failing step (an empty table counts as a failure, since grep then returns
# non-zero); the trap removes OpenMaTrEx.ini even when a step fails;
# explanatory comments were moved out of the continued shell line.
.PHONY: ebmt_alignments_on_disk
ebmt_alignments_on_disk:
	SL=`cat orig/source`; TL=`cat orig/target`; ENCODING=`cat orig/encoding`; \
	DEBUGMETHOD=`cat orig/debugmethod`; \
	printf '%s' "=====> EBMT chunk alignment..."; \
	set -e; \
	trap 'rm -f OpenMaTrEx.ini' EXIT; \
	mkdir -p ebmt-alignments final_tables; \
	sed -r -e 's/ / ||| /g' < moses_training/model/lex.e2f | grep -v NULL > final_tables/st_words.probs; \
	sed -r -e 's/ / ||| /g' < moses_training/model/lex.f2e | grep -v NULL > final_tables/ts_words.probs; \
	cp final_tables/ts_words.probs final_tables/ts_words.probs_save; \
	perl -F/'\|\|\|'/ -lane 'print $$F[1] . " ||| " . $$F[0] . " ||| " . $$F[2]' final_tables/ts_words.probs_save >final_tables/ts_words.probs; \
	{ \
		echo '<OpenMaTrEx_configuration>'; \
		echo '<sourceTrainingDataFile class="java.lang.String" value="chunked/train.'"$$SL"'" />'; \
		echo '<targetTrainingDataFile class="java.lang.String" value="chunked/train.'"$$TL"'" />'; \
		echo '<chunksAligner baseclass="org.dcu.matrex.openmatrex.aligners.ChunksAligner" class="org.dcu.matrex.openmatrex.aligners.EditDistanceChunksAligner" />'; \
		echo '<alignedDataFactory baseclass="org.dcu.matrex.openmatrex.data.AlignedDataFactory" class="org.dcu.matrex.openmatrex.data.MapAlignedDataFactory" />'; \
		echo '<chunkDistanceList class="java.util.Vector" list="yes" >'; \
		echo '<chunkDistance1 baseclass="org.dcu.matrex.openmatrex.aligners.ChunkDistance" class="org.dcu.matrex.openmatrex.aligners.CognateChunkDistance" />'; \
		echo '<chunkDistance2 baseclass="org.dcu.matrex.openmatrex.aligners.ChunkDistance" class="org.dcu.matrex.openmatrex.aligners.WordProbsChunkDistance" />'; \
		echo '</chunkDistanceList>'; \
		echo '<trainingResultsPrefix class="java.lang.String" value="ebmt-alignments/train" />'; \
		echo '<wordsAligner baseclass="org.dcu.matrex.openmatrex.aligners.WordsAligner" class="org.dcu.matrex.openmatrex.aligners.GizaWordsAligner" >'; \
		echo '<filename class="java.lang.String" value="final_tables/ts_words.probs" />'; \
		echo '<filename class="java.lang.String" value="final_tables/st_words.probs" />'; \
		echo '</wordsAligner>'; \
		echo '<encoding class="java.lang.String" value="'"$$ENCODING"'" />'; \
		echo '<traceMode class="org.dcu.matrex.openmatrex.utils.'"$$DEBUGMETHOD"'" />'; \
		echo '</OpenMaTrEx_configuration>'; \
	} > OpenMaTrEx.ini; \
	java $(JAVA_OPTIONS) -cp $(OPENMATREX_ROOT)/OpenMaTrEx.jar org.dcu.matrex.openmatrex.main.AlignNoCounts > ebmt-alignments/alignments.log; \
	sort ebmt-alignments/train.ac | uniq -c | perl -pane 's/^ *([0-9]+) (.*)/$$2 ||| $$1.0/' > ebmt-alignments/train.ac.counts; \
	mv ebmt-alignments/train.ac.counts ebmt-alignments/train.ac; \
	echo " OK."
278
# ebmt_alignments_on_disk_with_tags: like ebmt_alignments_on_disk, but reads
# the tagged chunk files chunked/train.<SL|TL>.tagged and adds
# TagBasedChunkDistance as a third chunk distance.
#
# The ts_words.probs columns are swapped because its lines have the structure
# "t s p(t|s)" but were being interpreted as "s t p(t|s)" by
# aligners/WordProbsChunkDistance.java.  The st_words.probs <filename> entry
# is not used, but is needed in the call.  After alignment,
# ebmt-alignments/train.ac is replaced by its de-duplicated form with counts
# ("<line> ||| <count>.0").
#
# Fixes: final_tables is created if missing; "set -e" stops at the first
# failing step (an empty table counts as a failure, since grep then returns
# non-zero); the trap removes OpenMaTrEx.ini even when a step fails;
# explanatory comments were moved out of the continued shell line.
.PHONY: ebmt_alignments_on_disk_with_tags
ebmt_alignments_on_disk_with_tags:
	SL=`cat orig/source`; TL=`cat orig/target`; ENCODING=`cat orig/encoding`; \
	DEBUGMETHOD=`cat orig/debugmethod`; \
	printf '%s' "=====> EBMT chunk alignment..."; \
	set -e; \
	trap 'rm -f OpenMaTrEx.ini' EXIT; \
	mkdir -p ebmt-alignments final_tables; \
	sed -r -e 's/ / ||| /g' < moses_training/model/lex.e2f | grep -v NULL > final_tables/st_words.probs; \
	sed -r -e 's/ / ||| /g' < moses_training/model/lex.f2e | grep -v NULL > final_tables/ts_words.probs; \
	cp final_tables/ts_words.probs final_tables/ts_words.probs_save; \
	perl -F/'\|\|\|'/ -lane 'print $$F[1] . " ||| " . $$F[0] . " ||| " . $$F[2]' final_tables/ts_words.probs_save >final_tables/ts_words.probs; \
	{ \
		echo '<OpenMaTrEx_configuration>'; \
		echo '<sourceTrainingDataFile class="java.lang.String" value="chunked/train.'"$$SL"'.tagged" />'; \
		echo '<targetTrainingDataFile class="java.lang.String" value="chunked/train.'"$$TL"'.tagged" />'; \
		echo '<chunksAligner baseclass="org.dcu.matrex.openmatrex.aligners.ChunksAligner" class="org.dcu.matrex.openmatrex.aligners.EditDistanceChunksAligner" />'; \
		echo '<alignedDataFactory baseclass="org.dcu.matrex.openmatrex.data.AlignedDataFactory" class="org.dcu.matrex.openmatrex.data.MapAlignedDataFactory" />'; \
		echo '<chunkDistanceList class="java.util.Vector" list="yes" >'; \
		echo '<chunkDistance1 baseclass="org.dcu.matrex.openmatrex.aligners.ChunkDistance" class="org.dcu.matrex.openmatrex.aligners.CognateChunkDistance" />'; \
		echo '<chunkDistance2 baseclass="org.dcu.matrex.openmatrex.aligners.ChunkDistance" class="org.dcu.matrex.openmatrex.aligners.WordProbsChunkDistance" />'; \
		echo '<chunkDistance3 baseclass="org.dcu.matrex.openmatrex.aligners.ChunkDistance" class="org.dcu.matrex.openmatrex.aligners.TagBasedChunkDistance" />'; \
		echo '</chunkDistanceList>'; \
		echo '<trainingResultsPrefix class="java.lang.String" value="ebmt-alignments/train" />'; \
		echo '<wordsAligner baseclass="org.dcu.matrex.openmatrex.aligners.WordsAligner" class="org.dcu.matrex.openmatrex.aligners.GizaWordsAligner" >'; \
		echo '<filename class="java.lang.String" value="final_tables/ts_words.probs" />'; \
		echo '<filename class="java.lang.String" value="final_tables/st_words.probs" />'; \
		echo '</wordsAligner>'; \
		echo '<encoding class="java.lang.String" value="'"$$ENCODING"'" />'; \
		echo '<traceMode class="org.dcu.matrex.openmatrex.utils.'"$$DEBUGMETHOD"'" />'; \
		echo '</OpenMaTrEx_configuration>'; \
	} > OpenMaTrEx.ini; \
	java $(JAVA_OPTIONS) -cp $(OPENMATREX_ROOT)/OpenMaTrEx.jar org.dcu.matrex.openmatrex.main.AlignNoCounts > ebmt-alignments/alignments.log; \
	sort ebmt-alignments/train.ac | uniq -c | perl -pane 's/^ *([0-9]+) (.*)/$$2 ||| $$1.0/' > ebmt-alignments/train.ac.counts; \
	mv ebmt-alignments/train.ac.counts ebmt-alignments/train.ac; \
	echo " OK."
315
316
# ebmt_alignments_on_disk_with_id: like ebmt_alignments_on_disk, but passes
# --with-ids to org.dcu.matrex.openmatrex.main.AlignNoCounts.
#
# The ts_words.probs columns are swapped because its lines have the structure
# "t s p(t|s)" but were being interpreted as "s t p(t|s)" by
# aligners/WordProbsChunkDistance.java.  The st_words.probs <filename> entry
# is not used, but is needed in the call.  After alignment,
# ebmt-alignments/train.ac is replaced by its de-duplicated form with counts
# ("<line> ||| <count>.0").
#
# Fixes: final_tables is created if missing; "set -e" stops at the first
# failing step (an empty table counts as a failure, since grep then returns
# non-zero); the trap removes OpenMaTrEx.ini even when a step fails;
# explanatory comments were moved out of the continued shell line.
.PHONY: ebmt_alignments_on_disk_with_id
ebmt_alignments_on_disk_with_id:
	SL=`cat orig/source`; TL=`cat orig/target`; ENCODING=`cat orig/encoding`; \
	DEBUGMETHOD=`cat orig/debugmethod`; \
	printf '%s' "=====> EBMT chunk alignment..."; \
	set -e; \
	trap 'rm -f OpenMaTrEx.ini' EXIT; \
	mkdir -p ebmt-alignments final_tables; \
	sed -r -e 's/ / ||| /g' < moses_training/model/lex.e2f | grep -v NULL > final_tables/st_words.probs; \
	sed -r -e 's/ / ||| /g' < moses_training/model/lex.f2e | grep -v NULL > final_tables/ts_words.probs; \
	cp final_tables/ts_words.probs final_tables/ts_words.probs_save; \
	perl -F/'\|\|\|'/ -lane 'print $$F[1] . " ||| " . $$F[0] . " ||| " . $$F[2]' final_tables/ts_words.probs_save >final_tables/ts_words.probs; \
	{ \
		echo '<OpenMaTrEx_configuration>'; \
		echo '<sourceTrainingDataFile class="java.lang.String" value="chunked/train.'"$$SL"'" />'; \
		echo '<targetTrainingDataFile class="java.lang.String" value="chunked/train.'"$$TL"'" />'; \
		echo '<chunksAligner baseclass="org.dcu.matrex.openmatrex.aligners.ChunksAligner" class="org.dcu.matrex.openmatrex.aligners.EditDistanceChunksAligner" />'; \
		echo '<alignedDataFactory baseclass="org.dcu.matrex.openmatrex.data.AlignedDataFactory" class="org.dcu.matrex.openmatrex.data.MapAlignedDataFactory" />'; \
		echo '<chunkDistanceList class="java.util.Vector" list="yes" >'; \
		echo '<chunkDistance1 baseclass="org.dcu.matrex.openmatrex.aligners.ChunkDistance" class="org.dcu.matrex.openmatrex.aligners.CognateChunkDistance" />'; \
		echo '<chunkDistance2 baseclass="org.dcu.matrex.openmatrex.aligners.ChunkDistance" class="org.dcu.matrex.openmatrex.aligners.WordProbsChunkDistance" />'; \
		echo '</chunkDistanceList>'; \
		echo '<trainingResultsPrefix class="java.lang.String" value="ebmt-alignments/train" />'; \
		echo '<wordsAligner baseclass="org.dcu.matrex.openmatrex.aligners.WordsAligner" class="org.dcu.matrex.openmatrex.aligners.GizaWordsAligner" >'; \
		echo '<filename class="java.lang.String" value="final_tables/ts_words.probs" />'; \
		echo '<filename class="java.lang.String" value="final_tables/st_words.probs" />'; \
		echo '</wordsAligner>'; \
		echo '<encoding class="java.lang.String" value="'"$$ENCODING"'" />'; \
		echo '<traceMode class="org.dcu.matrex.openmatrex.utils.'"$$DEBUGMETHOD"'" />'; \
		echo '</OpenMaTrEx_configuration>'; \
	} > OpenMaTrEx.ini; \
	java $(JAVA_OPTIONS) -cp $(OPENMATREX_ROOT)/OpenMaTrEx.jar org.dcu.matrex.openmatrex.main.AlignNoCounts --with-ids > ebmt-alignments/alignments.log; \
	sort ebmt-alignments/train.ac | uniq -c | perl -pane 's/^ *([0-9]+) (.*)/$$2 ||| $$1.0/' > ebmt-alignments/train.ac.counts; \
	mv ebmt-alignments/train.ac.counts ebmt-alignments/train.ac; \
	echo " OK."
352
# ebmt_alignments_on_disk_with_tags_with_id: combines the two variants
# above: reads chunked/train.<SL|TL>.tagged, adds TagBasedChunkDistance as a
# third chunk distance, and passes --with-ids to AlignNoCounts.
#
# The ts_words.probs columns are swapped because its lines have the structure
# "t s p(t|s)" but were being interpreted as "s t p(t|s)" by
# aligners/WordProbsChunkDistance.java.  The st_words.probs <filename> entry
# is not used, but is needed in the call.  After alignment,
# ebmt-alignments/train.ac is replaced by its de-duplicated form with counts
# ("<line> ||| <count>.0").
#
# Fixes: final_tables is created if missing; "set -e" stops at the first
# failing step (an empty table counts as a failure, since grep then returns
# non-zero); the trap removes OpenMaTrEx.ini even when a step fails;
# explanatory comments were moved out of the continued shell line.
.PHONY: ebmt_alignments_on_disk_with_tags_with_id
ebmt_alignments_on_disk_with_tags_with_id:
	SL=`cat orig/source`; TL=`cat orig/target`; ENCODING=`cat orig/encoding`; \
	DEBUGMETHOD=`cat orig/debugmethod`; \
	printf '%s' "=====> EBMT chunk alignment..."; \
	set -e; \
	trap 'rm -f OpenMaTrEx.ini' EXIT; \
	mkdir -p ebmt-alignments final_tables; \
	sed -r -e 's/ / ||| /g' < moses_training/model/lex.e2f | grep -v NULL > final_tables/st_words.probs; \
	sed -r -e 's/ / ||| /g' < moses_training/model/lex.f2e | grep -v NULL > final_tables/ts_words.probs; \
	cp final_tables/ts_words.probs final_tables/ts_words.probs_save; \
	perl -F/'\|\|\|'/ -lane 'print $$F[1] . " ||| " . $$F[0] . " ||| " . $$F[2]' final_tables/ts_words.probs_save >final_tables/ts_words.probs; \
	{ \
		echo '<OpenMaTrEx_configuration>'; \
		echo '<sourceTrainingDataFile class="java.lang.String" value="chunked/train.'"$$SL"'.tagged" />'; \
		echo '<targetTrainingDataFile class="java.lang.String" value="chunked/train.'"$$TL"'.tagged" />'; \
		echo '<chunksAligner baseclass="org.dcu.matrex.openmatrex.aligners.ChunksAligner" class="org.dcu.matrex.openmatrex.aligners.EditDistanceChunksAligner" />'; \
		echo '<alignedDataFactory baseclass="org.dcu.matrex.openmatrex.data.AlignedDataFactory" class="org.dcu.matrex.openmatrex.data.MapAlignedDataFactory" />'; \
		echo '<chunkDistanceList class="java.util.Vector" list="yes" >'; \
		echo '<chunkDistance1 baseclass="org.dcu.matrex.openmatrex.aligners.ChunkDistance" class="org.dcu.matrex.openmatrex.aligners.CognateChunkDistance" />'; \
		echo '<chunkDistance2 baseclass="org.dcu.matrex.openmatrex.aligners.ChunkDistance" class="org.dcu.matrex.openmatrex.aligners.WordProbsChunkDistance" />'; \
		echo '<chunkDistance3 baseclass="org.dcu.matrex.openmatrex.aligners.ChunkDistance" class="org.dcu.matrex.openmatrex.aligners.TagBasedChunkDistance" />'; \
		echo '</chunkDistanceList>'; \
		echo '<trainingResultsPrefix class="java.lang.String" value="ebmt-alignments/train" />'; \
		echo '<wordsAligner baseclass="org.dcu.matrex.openmatrex.aligners.WordsAligner" class="org.dcu.matrex.openmatrex.aligners.GizaWordsAligner" >'; \
		echo '<filename class="java.lang.String" value="final_tables/ts_words.probs" />'; \
		echo '<filename class="java.lang.String" value="final_tables/st_words.probs" />'; \
		echo '</wordsAligner>'; \
		echo '<encoding class="java.lang.String" value="'"$$ENCODING"'" />'; \
		echo '<traceMode class="org.dcu.matrex.openmatrex.utils.'"$$DEBUGMETHOD"'" />'; \
		echo '</OpenMaTrEx_configuration>'; \
	} > OpenMaTrEx.ini; \
	java $(JAVA_OPTIONS) -cp $(OPENMATREX_ROOT)/OpenMaTrEx.jar org.dcu.matrex.openmatrex.main.AlignNoCounts --with-ids > ebmt-alignments/alignments.log; \
	sort ebmt-alignments/train.ac | uniq -c | perl -pane 's/^ *([0-9]+) (.*)/$$2 ||| $$1.0/' > ebmt-alignments/train.ac.counts; \
	mv ebmt-alignments/train.ac.counts ebmt-alignments/train.ac; \
	echo " OK."
389
# ebmt_alignments_with_context: align chunked/train.<SL|TL> with
# org.dcu.matrex.openmatrex.main.AlignWithContext.  The configuration names
# ebmt-alignments/alignments.with_context as output file, uses
# MostProbableChunksAligner, and lists WordProbsChunkDistance as the only
# chunk distance.  The log goes to
# ebmt-alignments/alignments_with_context.log.
#
# The ts_words.probs columns are swapped because its lines have the structure
# "t s p(t|s)" but were being interpreted as "s t p(t|s)" by
# aligners/WordProbsChunkDistance.java.  The st_words.probs <filename> entry
# is not used, but is needed in the call.
#
# Fixes: final_tables is created if missing; "set -e" stops at the first
# failing step (an empty table counts as a failure, since grep then returns
# non-zero); the trap removes OpenMaTrEx.ini even when a step fails;
# explanatory comments were moved out of the continued shell line.
.PHONY: ebmt_alignments_with_context
ebmt_alignments_with_context:
	SL=`cat orig/source`; TL=`cat orig/target`; ENCODING=`cat orig/encoding`; \
	DEBUGMETHOD=`cat orig/debugmethod`; \
	printf '%s' "=====> EBMT chunk alignment with context..."; \
	set -e; \
	trap 'rm -f OpenMaTrEx.ini' EXIT; \
	mkdir -p ebmt-alignments final_tables; \
	sed -r -e 's/ / ||| /g' < moses_training/model/lex.e2f | grep -v NULL > final_tables/st_words.probs; \
	sed -r -e 's/ / ||| /g' < moses_training/model/lex.f2e | grep -v NULL > final_tables/ts_words.probs; \
	cp final_tables/ts_words.probs final_tables/ts_words.probs_save; \
	perl -F/'\|\|\|'/ -lane 'print $$F[1] . " ||| " . $$F[0] . " ||| " . $$F[2]' final_tables/ts_words.probs_save >final_tables/ts_words.probs; \
	{ \
		echo '<OpenMaTrEx_configuration>'; \
		echo '<sourceTrainingDataFile class="java.lang.String" value="chunked/train.'"$$SL"'" />'; \
		echo '<targetTrainingDataFile class="java.lang.String" value="chunked/train.'"$$TL"'" />'; \
		echo '<outputFile class="java.lang.String" value="ebmt-alignments/alignments.with_context" />'; \
		echo '<chunksAligner baseclass="org.dcu.matrex.openmatrex.aligners.ChunksAligner" class="org.dcu.matrex.openmatrex.aligners.MostProbableChunksAligner" />'; \
		echo '<alignedDataFactory baseclass="org.dcu.matrex.openmatrex.data.AlignedDataFactory" class="org.dcu.matrex.openmatrex.data.MapAlignedDataFactory" />'; \
		echo '<chunkDistanceList class="java.util.Vector" list="yes" >'; \
		echo '<chunkDistance2 baseclass="org.dcu.matrex.openmatrex.aligners.ChunkDistance" class="org.dcu.matrex.openmatrex.aligners.WordProbsChunkDistance" />'; \
		echo '</chunkDistanceList>'; \
		echo '<wordsAligner baseclass="org.dcu.matrex.openmatrex.aligners.WordsAligner" class="org.dcu.matrex.openmatrex.aligners.GizaWordsAligner" >'; \
		echo '<filename class="java.lang.String" value="final_tables/ts_words.probs" />'; \
		echo '<filename class="java.lang.String" value="final_tables/st_words.probs" />'; \
		echo '</wordsAligner>'; \
		echo '<encoding class="java.lang.String" value="'"$$ENCODING"'" />'; \
		echo '<traceMode class="org.dcu.matrex.openmatrex.utils.'"$$DEBUGMETHOD"'" />'; \
		echo '</OpenMaTrEx_configuration>'; \
	} > OpenMaTrEx.ini; \
	java $(JAVA_OPTIONS) -cp $(OPENMATREX_ROOT)/OpenMaTrEx.jar org.dcu.matrex.openmatrex.main.AlignWithContext > ebmt-alignments/alignments_with_context.log; \
	echo " OK."
422
423 #word_alignments_with_context:
424 #
425
426
427
428 #######################################################
# moses_training_steps: run Moses' train-model.perl on orig/train.<SL|TL>,
# writing into $(SUBDIR)/moses_training (log: moses_log), then point
# $(SUBDIR)/final_tables/smt-ttable at the resulting phrase table.
#
# FIRST_STEP / LAST_STEP select the range of training steps.  Both are now
# optional: when one is not given, the matching option is simply not passed
# to train-model.perl, and the "does this run rebuild the phrase table
# (step 6)?" test treats the missing bound as not excluding step 6.
# Previously an unset value produced a malformed "[ ... ]" test and a
# valueless --first-step/--last-step option.
#
# Other fixes: the failure message names the log under $(SUBDIR); the
# symlink falls back to phrase-table.gz when phrase-table.0-0.gz does not
# exist.
# NOTE(review): other targets in this file read model/phrase-table.gz while
# the link used phrase-table.0-0.gz -- confirm which name the installed
# Moses produces.
.PHONY: moses_training_steps
moses_training_steps:
	SL=`cat orig/source`; TL=`cat orig/target`; \
	FIRST_STEP='$(FIRST_STEP)'; LAST_STEP='$(LAST_STEP)'; \
	printf '%s' "=====> Performing training with Moses..."; \
	mkdir -p $(SUBDIR)/moses_training; \
	mkdir -p $(SUBDIR)/final_tables; \
	if [ -f $(SUBDIR)/moses_training/model/phrase-table.gz ] \
	   && [ "$${LAST_STEP:-6}" -ge 6 ] && [ "$${FIRST_STEP:-6}" -le 6 ]; then \
		printf '%s' " Erasing previous phrase table.. "; \
		rm -f $(SUBDIR)/moses_training/model/phrase-table.gz; \
	fi; \
	$(MOSES_SCRIPTS_DIR)/training/train-model.perl --scripts-root-dir $(MOSES_SCRIPTS_DIR) \
		--root-dir $(SUBDIR)/moses_training \
		--corpus orig/train --f $$SL --e $$TL \
		$${FIRST_STEP:+--first-step $$FIRST_STEP} \
		$${LAST_STEP:+--last-step $$LAST_STEP} \
		$(TRAINING_OPTIONS) \
		>$(SUBDIR)/moses_training/moses_log 2>&1; \
	EXIT_CODE=$$?; \
	if [ $$EXIT_CODE -ne 0 ]; then \
		echo "=====X(!) moses training failed. Check $(SUBDIR)/moses_training/moses_log for details"; \
		exit $$EXIT_CODE; \
	fi; \
	rm -f $(SUBDIR)/final_tables/smt-ttable
	rm -f $(SUBDIR)/moses_training/model/*half*
	rm -f $(SUBDIR)/moses_training/model/*sorted*
	TABLE=phrase-table.0-0.gz; \
	if [ ! -e $(SUBDIR)/moses_training/model/$$TABLE ] && [ -e $(SUBDIR)/moses_training/model/phrase-table.gz ]; then \
		TABLE=phrase-table.gz; \
	fi; \
	ln -sf ../moses_training/model/$$TABLE $(SUBDIR)/final_tables/smt-ttable; \
	echo " OK."
452
453 ###################################################
# merge_ebmt_with_moses: run tools/merge/merge.sh with --ebmt.
# $(MERGE_FILES) is handed over as a PATH=... argument and $(MERGE_PARAMS)
# as one single-quoted argument.  Declared phony; the stray trailing ";"
# was dropped.
.PHONY: merge_ebmt_with_moses
merge_ebmt_with_moses:
	$(OPENMATREX_DIR)/tools/merge/merge.sh PATH=$(MERGE_FILES) --ebmt '$(MERGE_PARAMS)'
456
# merge_other_with_moses: run tools/merge/merge.sh with LEX_PARAMS="-e"
# instead of --ebmt.  $(MERGE_FILES) is handed over as a PATH=... argument
# and $(MERGE_PARAMS) as one single-quoted argument.
# The recipe used to end in "; \", a line continuation with nothing after
# it; that dangling continuation is removed.
.PHONY: merge_other_with_moses
merge_other_with_moses:
	$(OPENMATREX_DIR)/tools/merge/merge.sh PATH=$(MERGE_FILES) LEX_PARAMS="-e" '$(MERGE_PARAMS)'
459
# merge_ebmt_only: run tools/merge/merge_ebmt_only.sh, logging to
# merged/merge.log.
#
# Fixes: merged/ is created first, because the shell opens the log before
# the script starts; the script's exit status is checked instead of always
# printing " OK."; the dangling trailing "\" is removed.
.PHONY: merge_ebmt_only
merge_ebmt_only:
	printf '%s' "=====> Preparing EBMT chunks only for decoding with Moses..."; \
	mkdir -p merged || exit 1; \
	$(OPENMATREX_DIR)/tools/merge/merge_ebmt_only.sh PATH=$(MERGE_FILES) '$(MERGE_PARAMS)' > merged/merge.log 2>&1; \
	EXIT_CODE=$$?; \
	if [ $$EXIT_CODE -ne 0 ]; then \
		printf '\n=====X(!) merge_ebmt_only failed. Check merged/merge.log for details\n'; \
		exit $$EXIT_CODE; \
	fi; \
	echo " OK."
464
# add_ebmt_feature: add one extra feature to the Moses phrase table, computed
# by tools/addEBMTFeature.py from the EBMT phrases.
#
# Steps:
#  1. Require merged/extract.aligner.gz and a phrase table (either the
#     current phrase-table.gz or the saved sorted copy).
#  2. Under LC_ALL=C, reduce extract.aligner.gz to its unique
#     "src ||| tgt |||" prefixes (merged/ebmt-phrases.gz).
#  3. On the first run only, save a sorted copy of the phrase table as
#     phrase-table-noebmtfeature.sorted.gz; later runs start again from that
#     copy, so the feature is never added twice.
#  4. Rewrite phrase-table.gz through addEBMTFeature.py.
#  5. On the first run only, keep moses.ini as moses.noebmtfeature.ini; then
#     regenerate moses.ini from it, incrementing the number in the line after
#     [ttable-file] and inserting a weight of 0.2 after [weight-t].
#
# Fixes: the "already run" message was also printed when sorting the phrase
# table failed (A && B || C); it is now an explicit if/else and a failure
# aborts.  A failing mv of moses.ini is no longer masked by "; true".
# printf replaces the non-portable echo "\n...".
# NOTE(review): a failure of addEBMTFeature.py is still hidden by the pipe
# into gzip.
.PHONY: add_ebmt_feature
add_ebmt_feature:
	printf '%s' "=====> Incorporating EBMT feature to phrase table..."
	[ -s merged/extract.aligner.gz ] || { printf '\nmerged/extract.aligner.gz not found or empty. Please merge ebmt phrases with moses phrases before continuing.\n'; false; }
	{ [ -s moses_training/model/phrase-table.gz ] || [ -s moses_training/model/phrase-table-noebmtfeature.sorted.gz ]; } || { printf '\nphrase-table not found or empty. Please complete the moses training before continuing.\n'; false; }
	export LC_ALL=C ; \
	zcat merged/extract.aligner.gz | perl -pane 's/^([^|]+ \|\|\| [^|]+ \|\|\|).*/$$1/' | sort | uniq | gzip > merged/ebmt-phrases.gz ; \
	if [ -f moses_training/model/phrase-table-noebmtfeature.sorted.gz ]; then \
		printf '\nadd_ebmt_feature has already been run. Using original phrase table.\n'; \
	else \
		zcat moses_training/model/phrase-table.gz | sort | gzip > moses_training/model/phrase-table-noebmtfeature.sorted.gz || exit 1; \
	fi; \
	$(OPENMATREX_DIR)/tools/addEBMTFeature.py merged/ebmt-phrases.gz moses_training/model/phrase-table-noebmtfeature.sorted.gz | gzip > moses_training/model/phrase-table.gz
	if [ ! -f moses_training/model/moses.noebmtfeature.ini ]; then \
		mv moses_training/model/moses.ini moses_training/model/moses.noebmtfeature.ini; \
	fi
	cat moses_training/model/moses.noebmtfeature.ini | perl -ne 'if (/^\[ttable-file\]/) {print;$$l=<>; $$l =~ s/^(.*) ([0-9]+) ([^ ]+)$$/$$1." ".($$2+1)." ".$$3/e; print $$l;} elsif (/^\[weight-t\]/) {print; print "0.2\n"} else {print;}' > moses_training/model/moses.ini
	echo " OK."
476
# Runs mert-moses.pl on orig/devset.<SL> and orig/devset.<TL> with $(DECODER)
# and moses_training/model/moses.ini.  SL and TL are read from orig/source and
# orig/target.  stdout goes to $(SUBDIR)/mert-work/mert-out.log and stderr to
# $(SUBDIR)/mert-work/mert-err.log.
moses_mert:
	SL=$$(cat orig/source); \
	TL=$$(cat orig/target); \
	echo -n "=====> Performing MERT with Moses..."; \
	mkdir -p $(SUBDIR)/mert-work; \
	$(MOSES_SCRIPTS_DIR)/training/mert-moses.pl \
		--rootdir $(MOSES_SCRIPTS_DIR) \
		--mertdir=$(MOSES_SRC)/mert \
		--nbest=200 \
		--decoder-flags="$(DECODER_PARAMS)" \
		$(MERT_PARAMS) \
		orig/devset.$$SL \
		orig/devset.$$TL \
		$(DECODER) \
		moses_training/model/moses.ini \
		>$(SUBDIR)/mert-work/mert-out.log \
		2>$(SUBDIR)/mert-work/mert-err.log; \
	echo " OK."
489
# Decodes orig/<PREFIX>.<SL> with $(DECODER) into
# $(SUBDIR)/decoding/<PREFIX>.<TL>; decoder stderr goes to
# $(SUBDIR)/decoding/moses.log.
#  * SL and TL are read from orig/source and orig/target.
#  * PREFIX defaults to $(DECODER_OUTPUT) when the make variable PREFIX is
#    empty.
#  * The configuration is the first existing moses.ini among the filtered
#    test-set model, the MERT output and the plain training model, looked up
#    under $(SUBDIR) first and then under the current directory.
#  * Marker files decoding/starting and decoding/done are touched before and
#    after the decoder run.
moses_decode:
	SL=`cat orig/source`; TL=`cat orig/target`; \
	echo -n "=====> Performing decoding with Moses..."; \
	mkdir -p $(SUBDIR)/decoding; \
	if [ -z "$(PREFIX)" ]; then PREFIX=$(DECODER_OUTPUT); else PREFIX=$(PREFIX); fi; \
	touch $(SUBDIR)/decoding/starting; \
	for file in $(SUBDIR)/moses_training/model/filtered-testset/moses.ini \
		$(SUBDIR)/mert-work/moses.ini \
		$(SUBDIR)/moses_training/model/moses.ini \
		./moses_training/model/filtered-testset/moses.ini \
		./mert-work/moses.ini \
		./moses_training/model/moses.ini; do \
		if [ -f $$file ]; then \
			CONFIG=$$file; \
			break; \
		fi \
	done; \
	if [ -z "$$CONFIG" ]; then \
		echo " no moses.ini found; run the Moses training (and optionally MERT/filtering) first." >&2; \
		exit 1; \
	fi; \
	$(DECODER) -config $$CONFIG $(DECODER_PARAMS) \
		< orig/$$PREFIX.$$SL > $(SUBDIR)/decoding/$$PREFIX.$$TL \
		2>$(SUBDIR)/decoding/moses.log; \
	touch $(SUBDIR)/decoding/done
	echo " OK."
# Why the CONFIG check: without it a missing model makes the decoder run with
# a bare "-config" and the recipe still reports success.
512
513 ####################################################
# Builds l_models/lm.irstlm from orig/<PREFIX>.<TL> with IRSTLM's build-lm.sh
# (order $(LM_ORDER), extra options $(LM_OPTIONS)).  TL is read from
# orig/target; PREFIX falls back to $(LM_INPUT) when the make variable PREFIX
# is empty.  build-lm.sh works in a fresh mktemp directory that is removed
# afterwards; its output is logged to l_models/irstlm.log and l_models/done is
# touched at the end.  Depends on check_lmtrain.
language_model: check_lmtrain
	TL=$$(cat orig/target); \
	echo "=====> Creating language model using IRSTLM..."; \
	if [ -z $(PREFIX) ]; then \
		PREFIX=$(LM_INPUT); \
	else \
		PREFIX=$(PREFIX); \
	fi; \
	mkdir -p l_models; \
	TMP=$$(mktemp -d); \
	$(IRSTLM)/bin/build-lm.sh $(LM_OPTIONS) \
		-t $$TMP \
		-i orig/$$PREFIX.$$TL \
		-n $(LM_ORDER) \
		-o l_models/lm.irstlm \
		> l_models/irstlm.log 2>&1; \
	rm -rf $$TMP; \
	touch l_models/done
	echo " OK."
524
525 ######################################################################
# Runs the OpenMaTrEx EBMT decoder (org.dcu.matrex.openmatrex.main.Decode).
#  * SL, TL, ENCODING and DEBUGMETHOD are read from orig/source, orig/target,
#    orig/encoding and orig/debugmethod.
#  * final_tables/ts_words.probs is copied to ebmt-alignments/train.aw.
#  * A temporary OpenMaTrEx.ini is generated in the current directory and
#    removed after a successful run.  It names chunked/train.<SL|TL> as
#    training data, orig/testset.<SL> as input and
#    ebmt-results/testset.<SL>.translated.<TL> as output.
#  * Decoder stdout goes to ebmt-results/decoding.log.
# The explanatory comment lives here rather than inside the continued shell
# command, where its fate depends on how make and the shell treat
# backslash-newline after a '#'.
ebmt_decode:
	SL=`cat orig/source`; TL=`cat orig/target`; ENCODING=`cat orig/encoding`; \
	DEBUGMETHOD=`cat orig/debugmethod`; \
	echo "'$$SL' '$$TL' '$$ENCODING'"; \
	echo -n "=====> EBMT decoding..."; \
	cp final_tables/ts_words.probs ebmt-alignments/train.aw; \
	mkdir -p ebmt-results; \
	{ \
		echo '<OpenMaTrEx_configuration>'; \
		echo '<sourceTrainingDataFile class="java.lang.String" value="chunked/train.'$$SL'" />'; \
		echo '<targetTrainingDataFile class="java.lang.String" value="chunked/train.'$$TL'" />'; \
		echo '<encoding class="java.lang.String" value="'$$ENCODING'" />'; \
		echo '<decoder baseclass="org.dcu.matrex.openmatrex.decoders.Decoder" class="org.dcu.matrex.openmatrex.decoders.NaiveDecoder" />'; \
		echo '<sourceChunker baseclass="org.dcu.matrex.openmatrex.chunkers.Chunker" class="org.dcu.matrex.openmatrex.chunkers.MarkerBasedChunker">'; \
		echo '<markersFile class="java.lang.String" value="$(OPENMATREX_ROOT)/marker_files/marker_words.'$$SL'" />'; \
		echo '</sourceChunker>'; \
		echo '<trainingResultsPrefix class="java.lang.String" value="ebmt-alignments/train"/>'; \
		echo '<testDataFile class="java.lang.String" value="orig/testset.'$$SL'" />'; \
		echo '<outputFile class="java.lang.String" value="ebmt-results/testset.'$$SL'.translated.'$$TL'" />'; \
		echo '<traceMode class="org.dcu.matrex.openmatrex.utils.'$$DEBUGMETHOD'" />'; \
		echo '</OpenMaTrEx_configuration>'; \
	} > OpenMaTrEx.ini; \
	java $(JAVA_OPTIONS) -cp $(OPENMATREX_ROOT)/OpenMaTrEx.jar org.dcu.matrex.openmatrex.main.Decode > ebmt-results/decoding.log
	rm OpenMaTrEx.ini
	echo " OK."
550
# Full training pipeline, run as a fixed sequence of sub-makes (each step is a
# separate recipe line, so make stops at the first failing step):
#   filter, language_model,
#   moses_training_steps 1-5,
#   marker-based chunking of source and target,
#   ebmt_alignments_on_disk_with_id, merge_ebmt_with_moses,
#   moses_training_steps 6-9,
#   add_ebmt_feature, moses_mert.
train_ebmt_moses:
	$(MAKE) filter
	$(MAKE) language_model
	$(MAKE) moses_training_steps FIRST_STEP=1 LAST_STEP=5
	$(MAKE) marker-based_chunking_source
	$(MAKE) marker-based_chunking_target
	$(MAKE) ebmt_alignments_on_disk_with_id
	$(MAKE) merge_ebmt_with_moses
	$(MAKE) moses_training_steps FIRST_STEP=6 LAST_STEP=9
	$(MAKE) add_ebmt_feature
	$(MAKE) moses_mert
562
# Scores the decoder output with tools/mt_metric.sh.
#  * SL and TL are read from orig/source and orig/target.
#  * The hypothesis file is decoding/testset.<TL>, or
#    $(SUBDIR)/decoding/testset.<TL> when SUBDIR is not "." and that file
#    exists.
#  * mt_metric.sh receives orig/testset.<SL>, orig/testset.<TL>, the
#    hypothesis file and <TL>; its stdout is stored in $(SUBDIR)/eval/results.
# A former step that moved *.sgm into $(SUBDIR)/eval/ was commented out in the
# middle of the continued command; it is dropped here so the final echo no
# longer depends on how the shell treats a comment ending in a backslash.
eval_output:
	SL=`cat orig/source`; TL=`cat orig/target`; \
	echo -n "=====> Evaluating MT output..."; \
	INPUT_TEST_SET=decoding/testset.$$TL; \
	if [ "$(SUBDIR)" != "." ] && [ -f "$(SUBDIR)/decoding/testset.$$TL" ]; then \
		INPUT_TEST_SET=$(SUBDIR)/decoding/testset.$$TL; \
	fi; \
	mkdir -p $(SUBDIR)/eval; \
	$(OPENMATREX_DIR)/tools/mt_metric.sh orig/testset.$$SL orig/testset.$$TL $$INPUT_TEST_SET $$TL > $(SUBDIR)/eval/results; \
	echo " OK."
574
# Scores the EBMT decoder output with tools/mt_metric.sh.
#  * SL and TL are read from orig/source and orig/target.
#  * The hypothesis file is ebmt-results/testset.<TL>, or
#    $(SUBDIR)/ebmt-results/testset.<TL> when SUBDIR is not "." and that file
#    exists.
#  * If the chosen file does not exist, the name written by the ebmt_decode
#    target (ebmt-results/testset.<SL>.translated.<TL>) is used when present.
#  * mt_metric.sh receives orig/testset.<SL>, orig/testset.<TL>, the
#    hypothesis file and <TL>; its stdout is stored in $(SUBDIR)/eval/results.
# A former step that moved *.sgm into $(SUBDIR)/eval/ was commented out and is
# not carried over.
eval_ebmt_output:
	SL=`cat orig/source`; TL=`cat orig/target`; \
	echo -n "=====> Evaluating MT output..."; \
	INPUT_TEST_SET=ebmt-results/testset.$$TL; \
	if [ "$(SUBDIR)" != "." ] && [ -f "$(SUBDIR)/ebmt-results/testset.$$TL" ]; then \
		INPUT_TEST_SET=$(SUBDIR)/ebmt-results/testset.$$TL; \
	fi; \
	if [ ! -f $$INPUT_TEST_SET ] && [ -f ebmt-results/testset.$$SL.translated.$$TL ]; then \
		INPUT_TEST_SET=ebmt-results/testset.$$SL.translated.$$TL; \
	fi; \
	mkdir -p $(SUBDIR)/eval; \
	$(OPENMATREX_DIR)/tools/mt_metric.sh orig/testset.$$SL orig/testset.$$TL $$INPUT_TEST_SET $$TL > $(SUBDIR)/eval/results
	echo " OK."
586
# Filters the Moses model for the test set with filter-model-given-input.pl.
#  * SL and TL are read from orig/source and orig/target.
#  * The configuration is mert-work/moses.ini if it exists, otherwise
#    moses_training/model/moses.ini.
#  * The filtered model is written to moses_training/model/filtered-testset
#    for the input orig/testset.<SL>; script output is logged to
#    moses_training/filtering-testset.log.
# The recipe aborts when neither configuration file exists, instead of calling
# the filter script with a missing argument and still printing " OK.".
filter_model:
	echo -n "=====> Filtering model given input..."; \
	SL=`cat orig/source`; TL=`cat orig/target`; \
	for file in mert-work/moses.ini \
		moses_training/model/moses.ini ; do \
		if [ -f $$file ]; then \
			CONFIG=$$file; \
			break; \
		fi \
	done; \
	if [ -z "$$CONFIG" ]; then \
		echo " no moses.ini found; run the Moses training first." >&2; \
		exit 1; \
	fi; \
	$(MOSES_SCRIPTS_DIR)/training/filter-model-given-input.pl \
		moses_training/model/filtered-testset \
		$$CONFIG \
		orig/testset.$$SL \
		> moses_training/filtering-testset.log 2>&1 ; \
	echo " OK."
603
# Binarizes the phrase table and the reordering table of a Moses model.
#  * The configuration is the first existing file among
#    moses_training/model/filtered-testset/moses.ini, mert-work/moses.ini and
#    moses_training/model/moses.ini.
#  * TTABLE / RTABLE are the last space-separated field of the line following
#    [ttable-file] / [distortion-file] in that configuration.
#  * processPhraseTable (-nscores 5) and processLexicalTable write
#    phrase-table.binary and reordering-table.binary next to the originals.
#  * The configuration is saved as <config>.no-binary and rewritten so that
#    both entries point to the binary tables (the [ttable-file] entry gets a
#    leading "1" in place of "0").
# The rewriting perl script is kept on one physical line: a backslash-newline
# inside the single-quoted script would reach perl literally.
# The recipe aborts when no configuration exists; otherwise "cat" with an
# empty $$CONFIG would wait on standard input.
binarize:
	echo -n "=====> Binarizing tables..."; \
	SL=`cat orig/source`; TL=`cat orig/target`; \
	for file in moses_training/model/filtered-testset/moses.ini \
		mert-work/moses.ini \
		moses_training/model/moses.ini ; do \
		if [ -f $$file ]; then \
			CONFIG=$$file; \
			break; \
		fi \
	done; \
	if [ -z "$$CONFIG" ]; then \
		echo " no moses.ini found; run the Moses training first." >&2; \
		exit 1; \
	fi; \
	if [ $$CONFIG = "moses_training/model/filtered-testset/moses.ini" ] ; then \
		echo -n " Note we are binarizing the FILTERED tables..."; \
	fi; \
	export LC_ALL=C; \
	TTABLE=`cat $$CONFIG | perl -ne 'if (/\\[ttable-file\\]/) {$$l = <>; $$l =~ /([^ ]+)$$/; print $$1}'` ;\
	RTABLE=`cat $$CONFIG | perl -ne 'if (/\\[distortion-file\\]/) {$$l = <>; $$l =~ /([^ ]+)$$/; print $$1}'` ;\
	zcat -f $$TTABLE | sort | $(MOSES_SRC)/misc/processPhraseTable -ttable 0 0 - -nscores 5 -out `dirname $$TTABLE`/phrase-table.binary ; \
	zcat -f $$RTABLE | $(MOSES_SRC)/misc/processLexicalTable -out `dirname $$RTABLE`/reordering-table.binary ;\
	mv $$CONFIG $$CONFIG.no-binary ;\
	cat $$CONFIG.no-binary | perl -ne 'if (/^\[ttable-file\]/) {print;$$l=<>; $$l =~ /^0 (.*) ([^ ]+)$$/; $$f=$$1; $$t=$$2; $$t=~s/^(.*)\/[^\/]+$$/$$1/; print "1 ".$$f." ".$$t."/phrase-table.binary\n"} elsif (/^\[distortion-file\]/) {print;$$l=<>; $$l =~ /^(.*) ([^ ]+)$$/; $$f=$$1; $$t=$$2; $$t=~s/^(.*)\/[^\/]+$$/$$1/; print $$f." ".$$t."/reordering-table.binary\n"} else {print}' > $$CONFIG ;\
	echo " OK."
627
# Word packing tuned for MT.  SL and TL are read from orig/source and
# orig/target.  Runs word_packing.sh in direction "e" and then "f" (last
# argument 0), each followed by find_best_bleu.sh, then
# word_packing_combined.sh and find_best_combined_bleu.sh.
# STEP, COF, AC and K are forwarded from the make variables of the same name.
# Every command ends in ';' before the line continuation; without it the
# final echo would be passed as extra arguments to the preceding script.
word_packing_4_mt:
	echo "=====> Performing word packing for mt..."; \
	SL=`cat orig/source`; TL=`cat orig/target`; \
	$(OPENMATREX_DIR)/tools/wordpacking/word_packing.sh e $(STEP) $$SL $$TL $(COF) $(AC) $(K) 0; \
	$(OPENMATREX_DIR)/tools/wordpacking/find_best_bleu.sh word_packing $(STEP) e; \
	$(OPENMATREX_DIR)/tools/wordpacking/word_packing.sh f $(STEP) $$SL $$TL $(COF) $(AC) $(K) 0; \
	$(OPENMATREX_DIR)/tools/wordpacking/find_best_bleu.sh word_packing $(STEP) f; \
	$(OPENMATREX_DIR)/tools/wordpacking/word_packing_combined.sh $(STEP) $$SL $$TL 0; \
	$(OPENMATREX_DIR)/tools/wordpacking/find_best_combined_bleu.sh word_packing $(STEP); \
	echo " OK."
638
# Word packing tuned for alignment.  SL and TL are read from orig/source and
# orig/target.  Runs word_packing.sh in direction "e" and then "f" (mode
# argument 1, plus $(NO_GS_SENT) and $(GS_FILE)), each followed by
# find_best_aer.sh, then word_packing_combined.sh and
# find_best_combined_aer.sh.
# STEP, COF, AC and K are forwarded from the make variables of the same name.
# Every command ends in ';' before the line continuation; without it the
# final echo would be passed as extra arguments to the preceding script.
word_packing_4_alignment:
	echo "=====> Performing word packing for alignment..."; \
	SL=`cat orig/source`; TL=`cat orig/target`; \
	$(OPENMATREX_DIR)/tools/wordpacking/word_packing.sh e $(STEP) $$SL $$TL $(COF) $(AC) $(K) 1 $(NO_GS_SENT) $(GS_FILE); \
	$(OPENMATREX_DIR)/tools/wordpacking/find_best_aer.sh word_packing $(STEP) e; \
	$(OPENMATREX_DIR)/tools/wordpacking/word_packing.sh f $(STEP) $$SL $$TL $(COF) $(AC) $(K) 1 $(NO_GS_SENT) $(GS_FILE); \
	$(OPENMATREX_DIR)/tools/wordpacking/find_best_aer.sh word_packing $(STEP) f; \
	$(OPENMATREX_DIR)/tools/wordpacking/word_packing_combined.sh $(STEP) $$SL $$TL 1 $(NO_GS_SENT) $(GS_FILE); \
	$(OPENMATREX_DIR)/tools/wordpacking/find_best_combined_aer.sh word_packing $(STEP); \
	echo " OK."
649
650

Properties

Name Value
svn:executable *

Mikel L. Forcada
ViewVC Help
Powered by ViewVC 1.1.5