00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041
00042
00043 #include "ckd_alloc.h"
00044 #include "ngram_model_dmp.h"
00045 #include "pio.h"
00046 #include "err.h"
00047 #include "byteorder.h"
00048 #include "listelem_alloc.h"
00049
00050 #include <assert.h>
00051 #include <stdio.h>
00052 #include <string.h>
00053 #include <stdlib.h>
00054 #include <limits.h>
00055
00056 static const char darpa_hdr[] = "Darpa Trigram LM";
00057 static ngram_funcs_t ngram_model_dmp_funcs;
00058
00059 #define TSEG_BASE(m,b) ((m)->lm3g.tseg_base[(b)>>LOG_BG_SEG_SZ])
00060 #define FIRST_BG(m,u) ((m)->lm3g.unigrams[u].bigrams)
00061 #define FIRST_TG(m,b) (TSEG_BASE((m),(b))+((m)->lm3g.bigrams[b].trigrams))
00062
00063 static unigram_t *
00064 new_unigram_table(int32 n_ug)
00065 {
00066 unigram_t *table;
00067 int32 i;
00068
00069 table = ckd_calloc(n_ug, sizeof(unigram_t));
00070 for (i = 0; i < n_ug; i++) {
00071 table[i].prob1.f = -99.0;
00072 table[i].bo_wt1.f = -99.0;
00073 }
00074 return table;
00075 }
00076
00077 ngram_model_t *
00078 ngram_model_dmp_read(cmd_ln_t *config,
00079 const char *file_name,
00080 logmath_t *lmath)
00081 {
00082 ngram_model_t *base;
00083 ngram_model_dmp_t *model;
00084 FILE *fp;
00085 int do_mmap, do_swap;
00086 int32 is_pipe;
00087 int32 i, j, k, vn, n, ts;
00088 int32 n_unigram;
00089 int32 n_bigram;
00090 int32 n_trigram;
00091 char str[1024];
00092 unigram_t *ugptr;
00093 bigram_t *bgptr;
00094 trigram_t *tgptr;
00095 char *tmp_word_str;
00096 char *map_base = NULL;
00097 size_t offset = 0, filesize;
00098
00099 base = NULL;
00100 do_mmap = FALSE;
00101 if (config)
00102 do_mmap = cmd_ln_boolean_r(config, "-mmap");
00103
00104 if ((fp = fopen_comp(file_name, "rb", &is_pipe)) == NULL) {
00105 E_ERROR("Dump file %s not found\n", file_name);
00106 goto error_out;
00107 }
00108
00109 if (is_pipe && do_mmap) {
00110 E_WARN("Dump file is compressed, will not use memory-mapped I/O\n");
00111 do_mmap = 0;
00112 }
00113
00114 do_swap = FALSE;
00115 if (fread(&k, sizeof(k), 1, fp) != 1)
00116 goto error_out;
00117 if (k != strlen(darpa_hdr)+1) {
00118 SWAP_INT32(&k);
00119 if (k != strlen(darpa_hdr)+1) {
00120 E_ERROR("Wrong magic header size number %x: %s is not a dump file\n", k, file_name);
00121 goto error_out;
00122 }
00123 do_swap = 1;
00124 }
00125 if (fread(str, 1, k, fp) != (size_t) k) {
00126 E_ERROR("Cannot read header\n");
00127 goto error_out;
00128 }
00129 if (strncmp(str, darpa_hdr, k) != 0) {
00130 E_ERROR("Wrong header %s: %s is not a dump file\n", darpa_hdr);
00131 goto error_out;
00132 }
00133
00134 if (do_mmap) {
00135 if (do_swap) {
00136 E_INFO
00137 ("Byteswapping required, will not use memory-mapped I/O for LM file\n");
00138 do_mmap = 0;
00139 }
00140 else {
00141 E_INFO("Will use memory-mapped I/O for LM file\n");
00142 #ifdef __ADSPBLACKFIN__
00143 E_FATAL("memory mapping is not supported at the moment.");
00144 #else
00145 #endif
00146 }
00147 }
00148
00149 if (fread(&k, sizeof(k), 1, fp) != 1)
00150 goto error_out;
00151 if (do_swap) SWAP_INT32(&k);
00152 if (fread(str, 1, k, fp) != (size_t) k) {
00153 E_ERROR("Cannot read LM filename in header\n");
00154 goto error_out;
00155 }
00156
00157
00158 if (fread(&vn, sizeof(vn), 1, fp) != 1)
00159 goto error_out;
00160 if (do_swap) SWAP_INT32(&vn);
00161 if (vn <= 0) {
00162
00163 if (fread(&ts, sizeof(ts), 1, fp) != 1)
00164 goto error_out;
00165 if (do_swap) SWAP_INT32(&ts);
00166
00167
00168 for (;;) {
00169 if (fread(&k, sizeof(k), 1, fp) != 1)
00170 goto error_out;
00171 if (do_swap) SWAP_INT32(&k);
00172 if (k == 0)
00173 break;
00174 if (fread(str, 1, k, fp) != (size_t) k) {
00175 E_ERROR("fread(word) failed\n");
00176 goto error_out;
00177 }
00178 }
00179
00180 if (fread(&n_unigram, sizeof(n_unigram), 1, fp) != 1)
00181 goto error_out;
00182 if (do_swap) SWAP_INT32(&n_unigram);
00183 }
00184 else {
00185 n_unigram = vn;
00186 }
00187
00188
00189 if (fread(&n_bigram, sizeof(n_bigram), 1, fp) != 1)
00190 goto error_out;
00191 if (do_swap) SWAP_INT32(&n_bigram);
00192 if (fread(&n_trigram, sizeof(n_trigram), 1, fp) != 1)
00193 goto error_out;
00194 if (do_swap) SWAP_INT32(&n_trigram);
00195 E_INFO("ngrams 1=%d, 2=%d, 3=%d\n", n_unigram, n_bigram, n_trigram);
00196
00197
00198 model = ckd_calloc(1, sizeof(*model));
00199 base = &model->base;
00200 if (n_trigram > 0)
00201 n = 3;
00202 else if (n_bigram > 0)
00203 n = 2;
00204 else
00205 n = 1;
00206 ngram_model_init(base, &ngram_model_dmp_funcs, lmath, n, n_unigram);
00207 base->n_counts[0] = n_unigram;
00208 base->n_counts[1] = n_bigram;
00209 base->n_counts[2] = n_trigram;
00210
00211
00212
00213 model->lm3g.unigrams = new_unigram_table(n_unigram + 1);
00214 ugptr = model->lm3g.unigrams;
00215 for (i = 0; i <= n_unigram; ++i) {
00216
00217 if (fread(ugptr, sizeof(int32), 1, fp) != 1) {
00218 E_ERROR("fread(mapid[%d]) failed\n", i);
00219 goto error_out;
00220 }
00221
00222 if (fread(ugptr, sizeof(unigram_t), 1, fp) != 1) {
00223 E_ERROR("fread(unigrams) failed\n");
00224 ngram_model_free(base);
00225 fclose_comp(fp, is_pipe);
00226 return NULL;
00227 }
00228
00229 if (do_swap) {
00230 SWAP_INT32(&ugptr->prob1.l);
00231 SWAP_INT32(&ugptr->bo_wt1.l);
00232 SWAP_INT32(&ugptr->bigrams);
00233 }
00234
00235 ugptr->prob1.l = logmath_log10_to_log(lmath, ugptr->prob1.f);
00236 ugptr->bo_wt1.l = logmath_log10_to_log(lmath, ugptr->bo_wt1.f);
00237 E_DEBUG(2, ("ug %d: prob %d bo %d bigrams %d\n",
00238 i, ugptr->prob1.l, ugptr->bo_wt1.l, ugptr->bigrams));
00239 ++ugptr;
00240 }
00241 E_INFO("%8d = LM.unigrams(+trailer) read\n", n_unigram);
00242
00243
00244 if (do_mmap) {
00245 offset = ftell(fp);
00246 fseek(fp, 0, SEEK_END);
00247 filesize = ftell(fp);
00248 fseek(fp, offset, SEEK_SET);
00249
00250
00251 if (offset & 0x3) {
00252 E_WARN("-mmap specified, but tseg_base is not word-aligned. Will not memory-map.\n");
00253 do_mmap = FALSE;
00254 }
00255 else {
00256 model->dump_mmap = mmio_file_read(file_name);
00257 if (model->dump_mmap == NULL) {
00258 do_mmap = FALSE;
00259 }
00260 else {
00261 map_base = mmio_file_ptr(model->dump_mmap);
00262 }
00263 }
00264 }
00265
00266
00267 if (do_mmap) {
00268 model->lm3g.bigrams = (bigram_t *) (map_base + offset);
00269 offset += (n_bigram + 1) * sizeof(bigram_t);
00270 }
00271 else {
00272 model->lm3g.bigrams =
00273 ckd_calloc(n_bigram + 1, sizeof(bigram_t));
00274 if (fread(model->lm3g.bigrams, sizeof(bigram_t), n_bigram + 1, fp)
00275 != (size_t) n_bigram + 1) {
00276 E_ERROR("fread(bigrams) failed\n");
00277 goto error_out;
00278 }
00279 if (do_swap) {
00280 for (i = 0, bgptr = model->lm3g.bigrams; i <= n_bigram;
00281 i++, bgptr++) {
00282 SWAP_INT16(&bgptr->wid);
00283 SWAP_INT16(&bgptr->prob2);
00284 SWAP_INT16(&bgptr->bo_wt2);
00285 SWAP_INT16(&bgptr->trigrams);
00286 }
00287 }
00288 }
00289 E_INFO("%8d = LM.bigrams(+trailer) read\n", n_bigram);
00290
00291
00292 if (n_trigram > 0) {
00293 if (do_mmap) {
00294 model->lm3g.trigrams = (trigram_t *) (map_base + offset);
00295 offset += n_trigram * sizeof(trigram_t);
00296 }
00297 else {
00298 model->lm3g.trigrams =
00299 ckd_calloc(n_trigram, sizeof(trigram_t));
00300 if (fread
00301 (model->lm3g.trigrams, sizeof(trigram_t), n_trigram, fp)
00302 != (size_t) n_trigram) {
00303 E_ERROR("fread(trigrams) failed\n");
00304 goto error_out;
00305 }
00306 if (do_swap) {
00307 for (i = 0, tgptr = model->lm3g.trigrams; i < n_trigram;
00308 i++, tgptr++) {
00309 SWAP_INT16(&tgptr->wid);
00310 SWAP_INT16(&tgptr->prob3);
00311 }
00312 }
00313 }
00314 E_INFO("%8d = LM.trigrams read\n", n_trigram);
00315
00316 model->lm3g.tginfo = ckd_calloc(n_unigram, sizeof(tginfo_t *));
00317 model->lm3g.le = listelem_alloc_init(sizeof(tginfo_t));
00318 }
00319
00320
00321 if (do_mmap)
00322 fseek(fp, offset, SEEK_SET);
00323 if (fread(&k, sizeof(k), 1, fp) != 1)
00324 goto error_out;
00325 if (do_swap) SWAP_INT32(&k);
00326 model->lm3g.n_prob2 = k;
00327 model->lm3g.prob2 = ckd_calloc(k, sizeof(*model->lm3g.prob2));
00328 if (fread(model->lm3g.prob2, sizeof(*model->lm3g.prob2), k, fp) != (size_t) k) {
00329 E_ERROR("fread(prob2) failed\n");
00330 goto error_out;
00331 }
00332 for (i = 0; i < k; i++) {
00333 if (do_swap)
00334 SWAP_INT32(&model->lm3g.prob2[i].l);
00335
00336 model->lm3g.prob2[i].l = logmath_log10_to_log(lmath, model->lm3g.prob2[i].f);
00337 }
00338 E_INFO("%8d = LM.prob2 entries read\n", k);
00339
00340
00341 if (base->n > 2) {
00342 if (fread(&k, sizeof(k), 1, fp) != 1)
00343 goto error_out;
00344 if (do_swap) SWAP_INT32(&k);
00345 model->lm3g.n_bo_wt2 = k;
00346 model->lm3g.bo_wt2 = ckd_calloc(k, sizeof(*model->lm3g.bo_wt2));
00347 if (fread(model->lm3g.bo_wt2, sizeof(*model->lm3g.bo_wt2), k, fp) != (size_t) k) {
00348 E_ERROR("fread(bo_wt2) failed\n");
00349 goto error_out;
00350 }
00351 for (i = 0; i < k; i++) {
00352 if (do_swap)
00353 SWAP_INT32(&model->lm3g.bo_wt2[i].l);
00354
00355 model->lm3g.bo_wt2[i].l = logmath_log10_to_log(lmath, model->lm3g.bo_wt2[i].f);
00356 }
00357 E_INFO("%8d = LM.bo_wt2 entries read\n", k);
00358 }
00359
00360
00361 if (base->n > 2) {
00362 if (fread(&k, sizeof(k), 1, fp) != 1)
00363 goto error_out;
00364 if (do_swap) SWAP_INT32(&k);
00365 model->lm3g.n_prob3 = k;
00366 model->lm3g.prob3 = ckd_calloc(k, sizeof(*model->lm3g.prob3));
00367 if (fread(model->lm3g.prob3, sizeof(*model->lm3g.prob3), k, fp) != (size_t) k) {
00368 E_ERROR("fread(prob3) failed\n");
00369 goto error_out;
00370 }
00371 for (i = 0; i < k; i++) {
00372 if (do_swap)
00373 SWAP_INT32(&model->lm3g.prob3[i].l);
00374
00375 model->lm3g.prob3[i].l = logmath_log10_to_log(lmath, model->lm3g.prob3[i].f);
00376 }
00377 E_INFO("%8d = LM.prob3 entries read\n", k);
00378 }
00379
00380
00381 if (do_mmap)
00382 offset = ftell(fp);
00383 if (n_trigram > 0) {
00384 if (do_mmap) {
00385 memcpy(&k, map_base + offset, sizeof(k));
00386 offset += sizeof(int32);
00387 model->lm3g.tseg_base = (int32 *) (map_base + offset);
00388 offset += k * sizeof(int32);
00389 }
00390 else {
00391 k = (n_bigram + 1) / BG_SEG_SZ + 1;
00392 if (fread(&k, sizeof(k), 1, fp) != 1)
00393 goto error_out;
00394 if (do_swap) SWAP_INT32(&k);
00395 model->lm3g.tseg_base = ckd_calloc(k, sizeof(int32));
00396 if (fread(model->lm3g.tseg_base, sizeof(int32), k, fp) !=
00397 (size_t) k) {
00398 E_ERROR("fread(tseg_base) failed\n");
00399 goto error_out;
00400 }
00401 if (do_swap)
00402 for (i = 0; i < k; i++)
00403 SWAP_INT32(&model->lm3g.tseg_base[i]);
00404 }
00405 E_INFO("%8d = LM.tseg_base entries read\n", k);
00406 }
00407
00408
00409 if (do_mmap) {
00410 memcpy(&k, map_base + offset, sizeof(k));
00411 offset += sizeof(int32);
00412 tmp_word_str = (char *) (map_base + offset);
00413 offset += k;
00414 }
00415 else {
00416 base->writable = TRUE;
00417 if (fread(&k, sizeof(k), 1, fp) != 1)
00418 goto error_out;
00419 if (do_swap) SWAP_INT32(&k);
00420 tmp_word_str = ckd_calloc(k, 1);
00421 if (fread(tmp_word_str, 1, k, fp) != (size_t) k) {
00422 E_ERROR("fread(word-string) failed\n");
00423 goto error_out;
00424 }
00425 }
00426
00427
00428 for (i = 0, j = 0; i < k; i++)
00429 if (tmp_word_str[i] == '\0')
00430 j++;
00431 if (j != n_unigram) {
00432 E_ERROR("Error reading word strings (%d doesn't match n_unigrams %d)\n",
00433 j, n_unigram);
00434 goto error_out;
00435 }
00436
00437
00438 if (do_mmap) {
00439 j = 0;
00440 for (i = 0; i < n_unigram; i++) {
00441 base->word_str[i] = tmp_word_str + j;
00442 if (hash_table_enter(base->wid, base->word_str[i],
00443 (void *)(long)i) != (void *)(long)i) {
00444 E_WARN("Duplicate word in dictionary: %s\n", base->word_str[i]);
00445 }
00446 j += strlen(base->word_str[i]) + 1;
00447 }
00448 }
00449 else {
00450 j = 0;
00451 for (i = 0; i < n_unigram; i++) {
00452 base->word_str[i] = ckd_salloc(tmp_word_str + j);
00453 if (hash_table_enter(base->wid, base->word_str[i],
00454 (void *)(long)i) != (void *)(long)i) {
00455 E_WARN("Duplicate word in dictionary: %s\n", base->word_str[i]);
00456 }
00457 j += strlen(base->word_str[i]) + 1;
00458 }
00459 free(tmp_word_str);
00460 }
00461 E_INFO("%8d = ascii word strings read\n", i);
00462
00463 fclose_comp(fp, is_pipe);
00464 return base;
00465
00466 error_out:
00467 if (fp)
00468 fclose_comp(fp, is_pipe);
00469 ngram_model_free(base);
00470 return NULL;
00471 }
00472
00473 ngram_model_dmp_t *
00474 ngram_model_dmp_build(ngram_model_t *base)
00475 {
00476 ngram_model_dmp_t *model;
00477 ngram_model_t *newbase;
00478 ngram_iter_t *itor;
00479 sorted_list_t sorted_prob2;
00480 sorted_list_t sorted_bo_wt2;
00481 sorted_list_t sorted_prob3;
00482 bigram_t *bgptr;
00483 trigram_t *tgptr;
00484 int i, bgcount, tgcount, seg;
00485
00486 if (base->funcs == &ngram_model_dmp_funcs) {
00487 E_INFO("Using existing DMP model.\n");
00488 return (ngram_model_dmp_t *)ngram_model_retain(base);
00489 }
00490
00491
00492 E_INFO("Building DMP model...\n");
00493 model = ckd_calloc(1, sizeof(*model));
00494 newbase = &model->base;
00495 ngram_model_init(newbase, &ngram_model_dmp_funcs,
00496 logmath_retain(base->lmath),
00497 base->n, base->n_counts[0]);
00498
00499 memcpy(newbase->n_counts, base->n_counts,
00500 base->n * sizeof(*base->n_counts));
00501
00502 newbase->writable = TRUE;
00503
00504 model->lm3g.unigrams = new_unigram_table(newbase->n_counts[0] + 1);
00505 for (itor = ngram_model_mgrams(base, 0); itor;
00506 itor = ngram_iter_next(itor)) {
00507 int32 prob1, bo_wt1;
00508 int32 const *wids;
00509
00510
00511
00512 wids = ngram_iter_get(itor, &prob1, &bo_wt1);
00513 model->lm3g.unigrams[wids[0]].prob1.l = prob1;
00514 model->lm3g.unigrams[wids[0]].bo_wt1.l = bo_wt1;
00515 newbase->word_str[wids[0]] = ckd_salloc(ngram_word(base, wids[0]));
00516 if ((hash_table_enter_int32(newbase->wid,
00517 newbase->word_str[wids[0]], wids[0]))
00518 != wids[0]) {
00519 E_WARN("Duplicate word in dictionary: %s\n", newbase->word_str[wids[0]]);
00520 }
00521 }
00522 E_INFO("%8d = #unigrams created\n", newbase->n_counts[0]);
00523
00524
00525
00526
00527 init_sorted_list(&sorted_prob2);
00528 if (newbase->n > 2) {
00529 init_sorted_list(&sorted_bo_wt2);
00530 init_sorted_list(&sorted_prob3);
00531 }
00532
00533 bgptr = model->lm3g.bigrams = ckd_calloc(newbase->n_counts[1] + 1, sizeof(bigram_t));
00534 if (newbase->n > 2) {
00535 tgptr = model->lm3g.trigrams = ckd_calloc(newbase->n_counts[2], sizeof(trigram_t));
00536 model->lm3g.tseg_base =
00537 ckd_calloc((newbase->n_counts[1] + 1) / BG_SEG_SZ + 1, sizeof(int32));
00538 }
00539 else
00540 tgptr = NULL;
00541
00542
00543
00544 for (i = 0; i < newbase->n_counts[0]; ++i) {
00545 ngram_iter_t *uitor;
00546 bgcount = bgptr - model->lm3g.bigrams;
00547
00548 model->lm3g.unigrams[i].bigrams = bgcount;
00549 E_DEBUG(2, ("unigram %d: %s => bigram %d\n", i, newbase->word_str[i], bgcount));
00550
00551 uitor = ngram_ng_iter(base, i, NULL, 0);
00552 for (itor = ngram_iter_successors(uitor);
00553 itor; ++bgptr, itor = ngram_iter_next(itor)) {
00554 int32 prob2, bo_wt2;
00555 int32 const *wids;
00556 ngram_iter_t *titor;
00557
00558 wids = ngram_iter_get(itor, &prob2, &bo_wt2);
00559
00560 if (bgptr - model->lm3g.bigrams >= newbase->n_counts[1]) {
00561 ngram_iter_free(itor);
00562 break;
00563 }
00564
00565 bgptr->wid = wids[1];
00566 bgptr->prob2 = sorted_id(&sorted_prob2, &prob2);
00567 if (newbase->n > 2) {
00568 tgcount = (tgptr - model->lm3g.trigrams);
00569 bgcount = (bgptr - model->lm3g.bigrams);
00570
00571
00572 bgptr->bo_wt2 = sorted_id(&sorted_bo_wt2, &bo_wt2);
00573
00574
00575
00576 seg = bgcount >> LOG_BG_SEG_SZ;
00577
00578
00579
00580 if (seg != (bgcount - 1) >> LOG_BG_SEG_SZ)
00581 model->lm3g.tseg_base[seg] = tgcount;
00582
00583 bgptr->trigrams = tgcount - model->lm3g.tseg_base[seg];
00584 E_DEBUG(2, ("bigram %d %s %s => trigram %d:%d\n",
00585 bgcount,
00586 newbase->word_str[wids[0]],
00587 newbase->word_str[wids[1]],
00588 seg, bgptr->trigrams));
00589
00590
00591 for (titor = ngram_iter_successors(itor);
00592 titor; ++tgptr, titor = ngram_iter_next(titor)) {
00593 int32 prob3, dummy;
00594
00595 assert(tgptr - model->lm3g.trigrams < newbase->n_counts[2]);
00596 wids = ngram_iter_get(titor, &prob3, &dummy);
00597 tgptr->wid = wids[2];
00598 tgptr->prob3 = sorted_id(&sorted_prob3, &prob3);
00599 E_DEBUG(2, ("trigram %d %s %s %s => prob %d\n",
00600 tgcount,
00601 newbase->word_str[wids[0]],
00602 newbase->word_str[wids[1]],
00603 newbase->word_str[wids[2]],
00604 tgptr->prob3));
00605 }
00606 }
00607 }
00608 ngram_iter_free(uitor);
00609 }
00610
00611 bgcount = bgptr - model->lm3g.bigrams;
00612 tgcount = tgptr - model->lm3g.trigrams;
00613 seg = bgcount >> LOG_BG_SEG_SZ;
00614 if (seg != (bgcount - 1) >> LOG_BG_SEG_SZ)
00615 model->lm3g.tseg_base[seg] = tgcount;
00616 model->lm3g.unigrams[i].bigrams = bgcount;
00617 bgptr->trigrams = tgcount - model->lm3g.tseg_base[seg];
00618
00619
00620 model->lm3g.n_prob2 = sorted_prob2.free;
00621 model->lm3g.prob2 = vals_in_sorted_list(&sorted_prob2);
00622 E_INFO("%8d = #bigrams created\n", newbase->n_counts[1]);
00623 E_INFO("%8d = #prob2 entries\n", model->lm3g.n_prob2);
00624 free_sorted_list(&sorted_prob2);
00625 if (newbase->n > 2) {
00626
00627 model->lm3g.n_bo_wt2 = sorted_bo_wt2.free;
00628 model->lm3g.bo_wt2 = vals_in_sorted_list(&sorted_bo_wt2);
00629 free_sorted_list(&sorted_bo_wt2);
00630 E_INFO("%8d = #bo_wt2 entries\n", model->lm3g.n_bo_wt2);
00631
00632 model->lm3g.n_prob3 = sorted_prob3.free;
00633 model->lm3g.prob3 = vals_in_sorted_list(&sorted_prob3);
00634 E_INFO("%8d = #trigrams created\n", newbase->n_counts[2]);
00635 E_INFO("%8d = #prob3 entries\n", model->lm3g.n_prob3);
00636 free_sorted_list(&sorted_prob3);
00637
00638 model->lm3g.tginfo = ckd_calloc(newbase->n_counts[0], sizeof(tginfo_t *));
00639 model->lm3g.le = listelem_alloc_init(sizeof(tginfo_t));
00640 }
00641
00642 return model;
00643 }
00644
00645 static void
00646 fwrite_int32(FILE *fh, int32 val)
00647 {
00648 fwrite(&val, 4, 1, fh);
00649 }
00650
00651 static void
00652 fwrite_ug(FILE *fh, unigram_t *ug, logmath_t *lmath)
00653 {
00654 int32 bogus = -1;
00655 float32 log10val;
00656
00657
00658 fwrite(&bogus, 4, 1, fh);
00659
00660 log10val = logmath_log_to_log10(lmath, ug->prob1.l);
00661 fwrite(&log10val, 4, 1, fh);
00662 log10val = logmath_log_to_log10(lmath, ug->bo_wt1.l);
00663 fwrite(&log10val, 4, 1, fh);
00664 fwrite_int32(fh, ug->bigrams);
00665 }
00666
00667 static void
00668 fwrite_bg(FILE *fh, bigram_t *bg)
00669 {
00670 fwrite(bg, sizeof(*bg), 1, fh);
00671 }
00672
00673 static void
00674 fwrite_tg(FILE *fh, trigram_t *tg)
00675 {
00676 fwrite(tg, sizeof(*tg), 1, fh);
00677 }
00678
/**
 * Human-readable description of the dump file layout, one string per
 * entry, terminated by a NULL pointer.  These strings are written into
 * the file itself, so their text is part of the output.
 */
static const char *fmtdesc[] = {
    "BEGIN FILE FORMAT DESCRIPTION",
    "Header string length (int32) and string (including trailing 0)",
    "Original LM filename string-length (int32) and filename (including trailing 0)",
    "(int32) version number (present iff value <= 0)",
    "(int32) original LM file modification timestamp (iff version# present)",
    "(int32) string-length and string (including trailing 0) (iff version# present)",
    "... previous entry continued any number of times (iff version# present)",
    "(int32) 0 (terminating sequence of strings) (iff version# present)",
    "(int32) log_bg_seg_sz (present iff different from default value of LOG2_BG_SEG_SZ)",
    "(int32) lm_t.ucount (must be > 0)",
    "(int32) lm_t.bcount",
    "(int32) lm_t.tcount",
    "lm_t.ucount+1 unigrams (including sentinel)",
    "lm_t.bcount+1 bigrams (including sentinel 64 bits (bg_t) each if version=-1/-2, 128 bits (bg32_t) each if version=-3",
    "lm_t.tcount trigrams (present iff lm_t.tcount > 0 32 bits (tg_t) each if version=-1/-2, 64 bits (tg32_t) each if version=-3)",
    "(int32) lm_t.n_prob2",
    "(int32) lm_t.prob2[]",
    "(int32) lm_t.n_bo_wt2 (present iff lm_t.tcount > 0)",
    "(int32) lm_t.bo_wt2[] (present iff lm_t.tcount > 0)",
    "(int32) lm_t.n_prob3 (present iff lm_t.tcount > 0)",
    "(int32) lm_t.prob3[] (present iff lm_t.tcount > 0)",
    "(int32) (lm_t.bcount+1)/BG_SEG_SZ+1 (present iff lm_t.tcount > 0)",
    "(int32) lm_t.tseg_base[] (present iff lm_t.tcount > 0)",
    "(int32) Sum(all word string-lengths, including trailing 0 for each)",
    "All word strings (including trailing 0 for each)",
    "END FILE FORMAT DESCRIPTION",
    NULL
};
00710
00711 static void
00712 ngram_model_dmp_write_header(FILE * fh)
00713 {
00714 int32 k;
00715 k = strlen(darpa_hdr) + 1;
00716 fwrite_int32(fh, k);
00717 fwrite(darpa_hdr, 1, k, fh);
00718 }
00719
00720 static void
00721 ngram_model_dmp_write_lm_filename(FILE * fh, const char *lmfile)
00722 {
00723 int32 k;
00724
00725 k = strlen(lmfile) + 1;
00726 fwrite_int32(fh, k);
00727 fwrite(lmfile, 1, k, fh);
00728 }
00729
00730 #define LMDMP_VERSION_TG_16BIT -1
00734 static void
00735 ngram_model_dmp_write_version(FILE * fh, int32 mtime)
00736 {
00737 fwrite_int32(fh, LMDMP_VERSION_TG_16BIT);
00738 fwrite_int32(fh, mtime);
00739 }
00740
00741 static void
00742 ngram_model_dmp_write_ngram_counts(FILE * fh, ngram_model_t *model)
00743 {
00744 fwrite_int32(fh, model->n_counts[0]);
00745 fwrite_int32(fh, model->n_counts[1]);
00746 fwrite_int32(fh, model->n_counts[2]);
00747 }
00748
00749 static void
00750 ngram_model_dmp_write_fmtdesc(FILE * fh)
00751 {
00752 int32 i, k;
00753 long pos;
00754
00755
00756 for (i = 0; fmtdesc[i] != NULL; i++) {
00757 k = strlen(fmtdesc[i]) + 1;
00758 fwrite_int32(fh, k);
00759 fwrite(fmtdesc[i], 1, k, fh);
00760 }
00761
00762 pos = ftell(fh);
00763 k = pos & 3;
00764 if (k) {
00765 fwrite_int32(fh, 4-k);
00766 fwrite("!!!!", 1, 4-k, fh);
00767 }
00768 fwrite_int32(fh, 0);
00769 }
00770
00771 static void
00772 ngram_model_dmp_write_unigram(FILE *fh, ngram_model_t *model)
00773 {
00774 ngram_model_dmp_t *lm = (ngram_model_dmp_t *)model;
00775 int32 i;
00776
00777 for (i = 0; i <= model->n_counts[0]; i++) {
00778 fwrite_ug(fh, &(lm->lm3g.unigrams[i]), model->lmath);
00779 }
00780 }
00781
00782
00783 static void
00784 ngram_model_dmp_write_bigram(FILE *fh, ngram_model_t *model)
00785 {
00786 ngram_model_dmp_t *lm = (ngram_model_dmp_t *)model;
00787 int32 i;
00788
00789 for (i = 0; i <= model->n_counts[1]; i++) {
00790 fwrite_bg(fh, &(lm->lm3g.bigrams[i]));
00791 }
00792
00793 }
00794
00795 static void
00796 ngram_model_dmp_write_trigram(FILE *fh, ngram_model_t *model)
00797 {
00798 ngram_model_dmp_t *lm = (ngram_model_dmp_t *)model;
00799 int32 i;
00800
00801 for (i = 0; i < model->n_counts[2]; i++) {
00802 fwrite_tg(fh, &(lm->lm3g.trigrams[i]));
00803 }
00804 }
00805
00806 static void
00807 ngram_model_dmp_write_bgprob(FILE *fh, ngram_model_t *model)
00808 {
00809 ngram_model_dmp_t *lm = (ngram_model_dmp_t *)model;
00810 int32 i;
00811
00812 fwrite_int32(fh, lm->lm3g.n_prob2);
00813 for (i = 0; i < lm->lm3g.n_prob2; i++) {
00814 float32 log10val = logmath_log_to_log10(model->lmath, lm->lm3g.prob2[i].l);
00815 fwrite(&log10val, 4, 1, fh);
00816 }
00817 }
00818
00819 static void
00820 ngram_model_dmp_write_tgbowt(FILE *fh, ngram_model_t *model)
00821 {
00822 ngram_model_dmp_t *lm = (ngram_model_dmp_t *)model;
00823 int32 i;
00824
00825 fwrite_int32(fh, lm->lm3g.n_bo_wt2);
00826 for (i = 0; i < lm->lm3g.n_bo_wt2; i++) {
00827 float32 log10val = logmath_log_to_log10(model->lmath, lm->lm3g.bo_wt2[i].l);
00828 fwrite(&log10val, 4, 1, fh);
00829 }
00830 }
00831
00832 static void
00833 ngram_model_dmp_write_tgprob(FILE *fh, ngram_model_t *model)
00834 {
00835 ngram_model_dmp_t *lm = (ngram_model_dmp_t *)model;
00836 int32 i;
00837
00838 fwrite_int32(fh, lm->lm3g.n_prob3);
00839 for (i = 0; i < lm->lm3g.n_prob3; i++) {
00840 float32 log10val = logmath_log_to_log10(model->lmath, lm->lm3g.prob3[i].l);
00841 fwrite(&log10val, 4, 1, fh);
00842 }
00843 }
00844
00845 static void
00846 ngram_model_dmp_write_tg_segbase(FILE *fh, ngram_model_t *model)
00847 {
00848 ngram_model_dmp_t *lm = (ngram_model_dmp_t *)model;
00849 int32 i, k;
00850
00851 k = (model->n_counts[1] + 1) / BG_SEG_SZ + 1;
00852 fwrite_int32(fh, k);
00853 for (i = 0; i < k; i++)
00854 fwrite_int32(fh, lm->lm3g.tseg_base[i]);
00855 }
00856
00857 static void
00858 ngram_model_dmp_write_wordstr(FILE *fh, ngram_model_t *model)
00859 {
00860 int32 i, k;
00861
00862 k = 0;
00863 for (i = 0; i < model->n_counts[0]; i++)
00864 k += strlen(model->word_str[i]) + 1;
00865 fwrite_int32(fh, k);
00866 for (i = 0; i < model->n_counts[0]; i++)
00867 fwrite(model->word_str[i], 1,
00868 strlen(model->word_str[i]) + 1, fh);
00869 }
00870
00871 int
00872 ngram_model_dmp_write(ngram_model_t *base,
00873 const char *file_name)
00874 {
00875 ngram_model_dmp_t *model;
00876 ngram_model_t *newbase;
00877 FILE *fh;
00878
00879
00880 model = ngram_model_dmp_build(base);
00881 newbase = &model->base;
00882
00883
00884
00885 if ((fh = fopen(file_name, "wb")) == NULL) {
00886 E_ERROR("Cannot create file %s\n", file_name);
00887 return -1;
00888 }
00889 ngram_model_dmp_write_header(fh);
00890 ngram_model_dmp_write_lm_filename(fh, file_name);
00891 ngram_model_dmp_write_version(fh, 0);
00892 ngram_model_dmp_write_fmtdesc(fh);
00893 ngram_model_dmp_write_ngram_counts(fh, newbase);
00894 ngram_model_dmp_write_unigram(fh, newbase);
00895 ngram_model_dmp_write_bigram(fh, newbase);
00896 ngram_model_dmp_write_trigram(fh, newbase);
00897 ngram_model_dmp_write_bgprob(fh, newbase);
00898 if (newbase->n > 2) {
00899 ngram_model_dmp_write_tgbowt(fh, newbase);
00900 ngram_model_dmp_write_tgprob(fh, newbase);
00901 ngram_model_dmp_write_tg_segbase(fh, newbase);
00902 }
00903 ngram_model_dmp_write_wordstr(fh, newbase);
00904 ngram_model_free(newbase);
00905
00906 return fclose(fh);
00907 }
00908
00909 static int
00910 ngram_model_dmp_apply_weights(ngram_model_t *base, float32 lw,
00911 float32 wip, float32 uw)
00912 {
00913 ngram_model_dmp_t *model = (ngram_model_dmp_t *)base;
00914 lm3g_apply_weights(base, &model->lm3g, lw, wip, uw);
00915 return 0;
00916 }
00917
00918
00919
00920
00921 #define NGRAM_MODEL_TYPE ngram_model_dmp_t
00922 #include "lm3g_templates.c"
00923
00924 static void
00925 ngram_model_dmp_free(ngram_model_t *base)
00926 {
00927 ngram_model_dmp_t *model = (ngram_model_dmp_t *)base;
00928
00929 ckd_free(model->lm3g.unigrams);
00930 ckd_free(model->lm3g.prob2);
00931 if (model->dump_mmap) {
00932 mmio_file_unmap(model->dump_mmap);
00933 }
00934 else {
00935 ckd_free(model->lm3g.bigrams);
00936 if (base->n > 2) {
00937 ckd_free(model->lm3g.trigrams);
00938 ckd_free(model->lm3g.tseg_base);
00939 }
00940 }
00941 if (base->n > 2) {
00942 ckd_free(model->lm3g.bo_wt2);
00943 ckd_free(model->lm3g.prob3);
00944 }
00945
00946 lm3g_tginfo_free(base, &model->lm3g);
00947 }
00948
00949 static ngram_funcs_t ngram_model_dmp_funcs = {
00950 ngram_model_dmp_free,
00951 ngram_model_dmp_apply_weights,
00952 lm3g_template_score,
00953 lm3g_template_raw_score,
00954 lm3g_template_add_ug,
00955 lm3g_template_flush,
00956 lm3g_template_iter,
00957 lm3g_template_mgrams,
00958 lm3g_template_successors,
00959 lm3g_template_iter_get,
00960 lm3g_template_iter_next,
00961 lm3g_template_iter_free
00962 };