00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036 #ifdef _WIN32_WCE
00037
00038
00039 #include <windows.h>
00040 #else
00041 #include <time.h>
00042 #endif
00043 #include <stdio.h>
00044 #include <string.h>
00045 #include <assert.h>
00046
00047
00048 #include "err.h"
00049 #include "pio.h"
00050 #include "ckd_alloc.h"
00051 #include "prim_type.h"
00052 #include "strfuncs.h"
00053 #include "hash_table.h"
00054
00055 #include "fsg_model.h"
00056
00057
00058 #define FSG_MODEL_BEGIN_DECL "FSG_BEGIN"
00059 #define FSG_MODEL_END_DECL "FSG_END"
00060 #define FSG_MODEL_N_DECL "N"
00061 #define FSG_MODEL_NUM_STATES_DECL "NUM_STATES"
00062 #define FSG_MODEL_S_DECL "S"
00063 #define FSG_MODEL_START_STATE_DECL "START_STATE"
00064 #define FSG_MODEL_F_DECL "F"
00065 #define FSG_MODEL_FINAL_STATE_DECL "FINAL_STATE"
00066 #define FSG_MODEL_T_DECL "T"
00067 #define FSG_MODEL_TRANSITION_DECL "TRANSITION"
00068 #define FSG_MODEL_COMMENT_CHAR '#'
00069
00070
00071 static int32
00072 nextline_str2words(FILE * fp, int32 * lineno,
00073 char **lineptr, char ***wordptr)
00074 {
00075 for (;;) {
00076 size_t len;
00077 int32 n;
00078
00079 ckd_free(*lineptr);
00080 if ((*lineptr = fread_line(fp, &len)) == NULL)
00081 return -1;
00082
00083 (*lineno)++;
00084
00085 if ((*lineptr)[0] == FSG_MODEL_COMMENT_CHAR)
00086 continue;
00087
00088 n = str2words(*lineptr, NULL, 0);
00089 if (n == 0)
00090 continue;
00091
00092
00093 if (*wordptr == NULL)
00094 *wordptr = ckd_calloc(n, sizeof(**wordptr));
00095 else
00096 *wordptr = ckd_realloc(*wordptr, n * sizeof(**wordptr));
00097 return str2words(*lineptr, *wordptr, n);
00098 }
00099 }
00100
00101 void
00102 fsg_model_trans_add(fsg_model_t * fsg,
00103 int32 from, int32 to, int32 logp, int32 wid)
00104 {
00105 fsg_link_t *link;
00106 gnode_t *gn;
00107
00108
00109 for (gn = fsg->trans[from][to]; gn; gn = gnode_next(gn)) {
00110 link = (fsg_link_t *) gnode_ptr(gn);
00111
00112 if (link->wid == wid) {
00113 if (link->logs2prob < logp)
00114 link->logs2prob = logp;
00115 return;
00116 }
00117 }
00118
00119
00120 link = listelem_malloc(fsg->link_alloc);
00121 link->from_state = from;
00122 link->to_state = to;
00123 link->logs2prob = logp;
00124 link->wid = wid;
00125
00126 fsg->trans[from][to] =
00127 glist_add_ptr(fsg->trans[from][to], (void *) link);
00128 }
00129
00130 int32
00131 fsg_model_null_trans_add(fsg_model_t * fsg, int32 from, int32 to, int32 logp)
00132 {
00133 fsg_link_t *link;
00134
00135
00136 if (logp > 0) {
00137 E_FATAL("Null transition prob must be <= 1.0 (state %d -> %d)\n",
00138 from, to);
00139 }
00140
00141
00142 if (from == to)
00143 return -1;
00144
00145
00146 link = fsg->null_trans[from][to];
00147 if (link) {
00148 assert(link->wid < 0);
00149 if (link->logs2prob < logp) {
00150 link->logs2prob = logp;
00151 return 0;
00152 }
00153 else
00154 return -1;
00155 }
00156
00157
00158 link = listelem_malloc(fsg->link_alloc);
00159 link->from_state = from;
00160 link->to_state = to;
00161 link->logs2prob = logp;
00162 link->wid = -1;
00163
00164 fsg->null_trans[from][to] = link;
00165
00166 return 1;
00167 }
00168
00169 glist_t
00170 fsg_model_null_trans_closure(fsg_model_t * fsg, glist_t nulls)
00171 {
00172 gnode_t *gn1, *gn2;
00173 int updated;
00174 fsg_link_t *tl1, *tl2;
00175 int32 k, n;
00176
00177 E_INFO("Computing transitive closure for null transitions\n");
00178
00179 if (nulls == NULL) {
00180 int i, j;
00181
00182 for (i = 0; i < fsg->n_state; ++i) {
00183 for (j = 0; j < fsg->n_state; ++j) {
00184 if (fsg->null_trans[i][j])
00185 nulls = glist_add_ptr(nulls, fsg->null_trans[i][j]);
00186 }
00187 }
00188 }
00189
00190
00191
00192
00193
00194 n = 0;
00195 do {
00196 updated = FALSE;
00197
00198 for (gn1 = nulls; gn1; gn1 = gnode_next(gn1)) {
00199 tl1 = (fsg_link_t *) gnode_ptr(gn1);
00200 assert(tl1->wid < 0);
00201
00202 for (gn2 = nulls; gn2; gn2 = gnode_next(gn2)) {
00203 tl2 = (fsg_link_t *) gnode_ptr(gn2);
00204
00205 if (tl1->to_state == tl2->from_state) {
00206 k = fsg_model_null_trans_add(fsg,
00207 tl1->from_state,
00208 tl2->to_state,
00209 tl1->logs2prob +
00210 tl2->logs2prob);
00211 if (k >= 0) {
00212 updated = TRUE;
00213 if (k > 0) {
00214 nulls =
00215 glist_add_ptr(nulls,
00216 (void *) fsg->
00217 null_trans[tl1->
00218 from_state][tl2->
00219 to_state]);
00220 n++;
00221 }
00222 }
00223 }
00224 }
00225 }
00226 } while (updated);
00227
00228 E_INFO("%d null transitions added\n", n);
00229
00230 return nulls;
00231 }
00232
00233 int
00234 fsg_model_word_id(fsg_model_t *fsg, char const *word)
00235 {
00236 int wid;
00237
00238
00239 for (wid = 0; wid < fsg->n_word; ++wid) {
00240 if (0 == strcmp(fsg->vocab[wid], word))
00241 break;
00242 }
00243
00244 if (wid == fsg->n_word)
00245 return -1;
00246 return wid;
00247 }
00248
00249 int
00250 fsg_model_word_add(fsg_model_t *fsg, char const *word)
00251 {
00252 int wid;
00253
00254
00255 wid = fsg_model_word_id(fsg, word);
00256
00257 if (wid == -1) {
00258 wid = fsg->n_word;
00259 if (fsg->n_word == fsg->n_word_alloc) {
00260 fsg->n_word_alloc += 10;
00261 fsg->vocab = ckd_realloc(fsg->vocab,
00262 fsg->n_word_alloc * sizeof(*fsg->vocab));
00263 if (fsg->silwords)
00264 fsg->silwords = bitvec_realloc(fsg->silwords, fsg->n_word_alloc);
00265 if (fsg->altwords)
00266 fsg->altwords = bitvec_realloc(fsg->altwords, fsg->n_word_alloc);
00267 }
00268 ++fsg->n_word;
00269 fsg->vocab[wid] = ckd_salloc(word);
00270 }
00271 return wid;
00272 }
00273
00274 int
00275 fsg_model_add_silence(fsg_model_t * fsg, char const *silword,
00276 int state, float32 silprob)
00277 {
00278 int32 logsilp;
00279 int n_trans, silwid, src;
00280
00281 E_INFO("Adding silence transitions for %s to FSG\n", silword);
00282
00283 silwid = fsg_model_word_add(fsg, silword);
00284 logsilp = (int32) (logmath_log(fsg->lmath, silprob) * fsg->lw);
00285 if (fsg->silwords == NULL)
00286 fsg->silwords = bitvec_alloc(fsg->n_word_alloc);
00287 bitvec_set(fsg->silwords, silwid);
00288
00289 n_trans = 0;
00290 if (state == -1) {
00291 for (src = 0; src < fsg->n_state; src++) {
00292 fsg_model_trans_add(fsg, src, src, logsilp, silwid);
00293 ++n_trans;
00294 }
00295 }
00296 else {
00297 fsg_model_trans_add(fsg, state, state, logsilp, silwid);
00298 ++n_trans;
00299 }
00300
00301 E_INFO("Added %d silence word transitions\n", n_trans);
00302 return n_trans;
00303 }
00304
00305 int
00306 fsg_model_add_alt(fsg_model_t * fsg, char const *baseword,
00307 char const *altword)
00308 {
00309 int i, j, basewid, altwid;
00310 int ntrans;
00311
00312
00313 for (basewid = 0; basewid < fsg->n_word; ++basewid)
00314 if (0 == strcmp(fsg->vocab[basewid], baseword))
00315 break;
00316 if (basewid == fsg->n_word) {
00317 E_ERROR("Base word %s not present in FSG vocabulary!\n", baseword);
00318 return -1;
00319 }
00320 altwid = fsg_model_word_add(fsg, altword);
00321 if (fsg->altwords == NULL)
00322 fsg->altwords = bitvec_alloc(fsg->n_word_alloc);
00323 bitvec_set(fsg->altwords, altwid);
00324
00325 E_INFO("Adding alternate word transitions (%s,%s) to FSG\n",
00326 baseword, altword);
00327
00328
00329
00330 ntrans = 0;
00331 for (i = 0; i < fsg->n_state; ++i) {
00332 for (j = 0; j < fsg->n_state; ++j) {
00333 glist_t trans;
00334 gnode_t *gn;
00335
00336 trans = fsg->trans[i][j];
00337 for (gn = trans; gn; gn = gnode_next(gn)) {
00338 fsg_link_t *fl = gnode_ptr(gn);
00339 if (fl->wid == basewid) {
00340 fsg_link_t *link;
00341
00342
00343 link = listelem_malloc(fsg->link_alloc);
00344 link->from_state = i;
00345 link->to_state = j;
00346 link->logs2prob = fl->logs2prob;
00347 link->wid = altwid;
00348
00349 trans =
00350 glist_add_ptr(trans, (void *) link);
00351 ++ntrans;
00352 }
00353 }
00354 fsg->trans[i][j] = trans;
00355 }
00356 }
00357
00358 E_INFO("Added %d alternate word transitions\n", ntrans);
00359 return ntrans;
00360 }
00361
00362
00363 fsg_model_t *
00364 fsg_model_init(char const *name, logmath_t *lmath, float32 lw, int32 n_state)
00365 {
00366 fsg_model_t *fsg;
00367
00368
00369 fsg = ckd_calloc(1, sizeof(*fsg));
00370 fsg->refcount = 1;
00371 fsg->link_alloc = listelem_alloc_init(sizeof(fsg_link_t));
00372 fsg->lmath = lmath;
00373 fsg->name = name ? ckd_salloc(name) : NULL;
00374 fsg->n_state = n_state;
00375 fsg->lw = lw;
00376
00377
00378 fsg->trans = ckd_calloc_2d(fsg->n_state, fsg->n_state,
00379 sizeof(glist_t));
00380
00381 fsg->null_trans = ckd_calloc_2d(fsg->n_state, fsg->n_state,
00382 sizeof(fsg_link_t *));
00383 return fsg;
00384 }
00385
00386 fsg_model_t *
00387 fsg_model_read(FILE * fp, logmath_t *lmath, float32 lw)
00388 {
00389 fsg_model_t *fsg;
00390 hash_table_t *vocab;
00391 hash_iter_t *itor;
00392 int32 lastwid;
00393 char **wordptr;
00394 char *lineptr;
00395 char *fsgname;
00396 int32 lineno;
00397 int32 n, i, j;
00398 int n_state, n_trans, n_null_trans;
00399 glist_t nulls;
00400 float32 p;
00401
00402 lineno = 0;
00403 vocab = hash_table_new(32, FALSE);
00404 wordptr = NULL;
00405 lineptr = NULL;
00406 nulls = NULL;
00407 fsgname = NULL;
00408 fsg = NULL;
00409
00410
00411 for (;;) {
00412 n = nextline_str2words(fp, &lineno, &lineptr, &wordptr);
00413 if (n < 0) {
00414 E_ERROR("%s declaration missing\n", FSG_MODEL_BEGIN_DECL);
00415 goto parse_error;
00416 }
00417
00418 if ((strcmp(wordptr[0], FSG_MODEL_BEGIN_DECL) == 0)) {
00419 if (n > 2) {
00420 E_ERROR("Line[%d]: malformed FSG_BEGIN delcaration\n",
00421 lineno);
00422 goto parse_error;
00423 }
00424 break;
00425 }
00426 }
00427
00428 if (n == 2)
00429 fsgname = ckd_salloc(wordptr[1]);
00430
00431
00432 n = nextline_str2words(fp, &lineno, &lineptr, &wordptr);
00433 if ((n != 2)
00434 || ((strcmp(wordptr[0], FSG_MODEL_N_DECL) != 0)
00435 && (strcmp(wordptr[0], FSG_MODEL_NUM_STATES_DECL) != 0))
00436 || (sscanf(wordptr[1], "%d", &n_state) != 1)
00437 || (n_state <= 0)) {
00438 E_ERROR
00439 ("Line[%d]: #states declaration line missing or malformed\n",
00440 lineno);
00441 goto parse_error;
00442 }
00443
00444
00445 fsg = fsg_model_init(fsgname, lmath, lw, n_state);
00446 ckd_free(fsgname);
00447 fsgname = NULL;
00448
00449
00450 n = nextline_str2words(fp, &lineno, &lineptr, &wordptr);
00451 if ((n != 2)
00452 || ((strcmp(wordptr[0], FSG_MODEL_S_DECL) != 0)
00453 && (strcmp(wordptr[0], FSG_MODEL_START_STATE_DECL) != 0))
00454 || (sscanf(wordptr[1], "%d", &(fsg->start_state)) != 1)
00455 || (fsg->start_state < 0)
00456 || (fsg->start_state >= fsg->n_state)) {
00457 E_ERROR
00458 ("Line[%d]: start state declaration line missing or malformed\n",
00459 lineno);
00460 goto parse_error;
00461 }
00462
00463
00464 n = nextline_str2words(fp, &lineno, &lineptr, &wordptr);
00465 if ((n != 2)
00466 || ((strcmp(wordptr[0], FSG_MODEL_F_DECL) != 0)
00467 && (strcmp(wordptr[0], FSG_MODEL_FINAL_STATE_DECL) != 0))
00468 || (sscanf(wordptr[1], "%d", &(fsg->final_state)) != 1)
00469 || (fsg->final_state < 0)
00470 || (fsg->final_state >= fsg->n_state)) {
00471 E_ERROR
00472 ("Line[%d]: final state declaration line missing or malformed\n",
00473 lineno);
00474 goto parse_error;
00475 }
00476
00477
00478 lastwid = 0;
00479 n_trans = n_null_trans = 0;
00480 for (;;) {
00481 int32 wid, tprob;
00482
00483 n = nextline_str2words(fp, &lineno, &lineptr, &wordptr);
00484 if (n <= 0) {
00485 E_ERROR("Line[%d]: transition or FSG_END statement expected\n",
00486 lineno);
00487 goto parse_error;
00488 }
00489
00490 if ((strcmp(wordptr[0], FSG_MODEL_END_DECL) == 0)) {
00491 break;
00492 }
00493
00494 if ((strcmp(wordptr[0], FSG_MODEL_T_DECL) == 0)
00495 || (strcmp(wordptr[0], FSG_MODEL_TRANSITION_DECL) == 0)) {
00496 if (((n != 4) && (n != 5))
00497 || (sscanf(wordptr[1], "%d", &i) != 1)
00498 || (sscanf(wordptr[2], "%d", &j) != 1)
00499 || (sscanf(wordptr[3], "%f", &p) != 1)
00500 || (i < 0) || (i >= fsg->n_state)
00501 || (j < 0) || (j >= fsg->n_state)
00502 || (p <= 0.0) || (p > 1.0)) {
00503 E_ERROR
00504 ("Line[%d]: transition spec malformed; Expecting: from-state to-state trans-prob [word]\n",
00505 lineno);
00506 goto parse_error;
00507 }
00508 }
00509 else {
00510 E_ERROR("Line[%d]: transition or FSG_END statement expected\n",
00511 lineno);
00512 goto parse_error;
00513 }
00514
00515 tprob = (int32)(logmath_log(lmath, p) * fsg->lw);
00516
00517 if (n > 4) {
00518 if (hash_table_lookup_int32(vocab, wordptr[4], &wid) < 0) {
00519 (void)hash_table_enter_int32(vocab, ckd_salloc(wordptr[4]), lastwid);
00520 wid = lastwid;
00521 ++lastwid;
00522 }
00523 fsg_model_trans_add(fsg, i, j, tprob, wid);
00524 ++n_trans;
00525 }
00526 else {
00527 if (fsg_model_null_trans_add(fsg, i, j, tprob) == 1) {
00528 ++n_null_trans;
00529 nulls = glist_add_ptr(nulls, fsg->null_trans[i][j]);
00530 }
00531 }
00532 }
00533
00534 E_INFO("FSG: %d states, %d unique words, %d transitions (%d null)\n",
00535 fsg->n_state, hash_table_inuse(vocab), n_trans, n_null_trans);
00536
00537
00538 nulls = fsg_model_null_trans_closure(fsg, nulls);
00539 glist_free(nulls);
00540
00541
00542 fsg->n_word = hash_table_inuse(vocab);
00543 fsg->n_word_alloc = fsg->n_word + 10;
00544 fsg->vocab = ckd_calloc(fsg->n_word_alloc, sizeof(*fsg->vocab));
00545 for (itor = hash_table_iter(vocab); itor; itor = hash_table_iter_next(itor)) {
00546 char const *word = hash_entry_key(itor->ent);
00547 int32 wid = (int32)(long)hash_entry_val(itor->ent);
00548 fsg->vocab[wid] = (char *)word;
00549 }
00550 hash_table_free(vocab);
00551 ckd_free(lineptr);
00552 ckd_free(wordptr);
00553
00554 return fsg;
00555
00556 parse_error:
00557 for (itor = hash_table_iter(vocab); itor; itor = hash_table_iter_next(itor))
00558 ckd_free((char *)hash_entry_key(itor->ent));
00559 glist_free(nulls);
00560 hash_table_free(vocab);
00561 ckd_free(fsgname);
00562 ckd_free(lineptr);
00563 ckd_free(wordptr);
00564 fsg_model_free(fsg);
00565 return NULL;
00566 }
00567
00568
00569 fsg_model_t *
00570 fsg_model_readfile(const char *file, logmath_t *lmath, float32 lw)
00571 {
00572 FILE *fp;
00573 fsg_model_t *fsg;
00574
00575 if ((fp = fopen(file, "r")) == NULL) {
00576 E_ERROR("fopen(%s,r) failed\n", file);
00577 return NULL;
00578 }
00579 fsg = fsg_model_read(fp, lmath, lw);
00580 fclose(fp);
00581 return fsg;
00582 }
00583
00584 fsg_model_t *
00585 fsg_model_retain(fsg_model_t *fsg)
00586 {
00587 ++fsg->refcount;
00588 return fsg;
00589 }
00590
00591 int
00592 fsg_model_free(fsg_model_t * fsg)
00593 {
00594 int i, j;
00595
00596 if (fsg == NULL)
00597 return 0;
00598
00599 if (--fsg->refcount > 0)
00600 return fsg->refcount;
00601
00602 for (i = 0; i < fsg->n_word; ++i)
00603 ckd_free(fsg->vocab[i]);
00604 for (i = 0; i < fsg->n_state; ++i)
00605 for (j = 0; j < fsg->n_state; ++j)
00606 glist_free(fsg->trans[i][j]);
00607 ckd_free(fsg->vocab);
00608 listelem_alloc_free(fsg->link_alloc);
00609 bitvec_free(fsg->silwords);
00610 bitvec_free(fsg->altwords);
00611 ckd_free_2d(fsg->trans);
00612 ckd_free_2d(fsg->null_trans);
00613 ckd_free(fsg->name);
00614 ckd_free(fsg);
00615 return 0;
00616 }
00617
00618
00619 void
00620 fsg_model_write(fsg_model_t * fsg, FILE * fp)
00621 {
00622 int32 i, j;
00623 gnode_t *gn;
00624 fsg_link_t *tl;
00625
00626 fprintf(fp, "%s\n", FSG_MODEL_BEGIN_DECL);
00627 fprintf(fp, "%s %d\n", FSG_MODEL_NUM_STATES_DECL, fsg->n_state);
00628 fprintf(fp, "%s %d\n", FSG_MODEL_START_STATE_DECL, fsg->start_state);
00629 fprintf(fp, "%s %d\n", FSG_MODEL_FINAL_STATE_DECL, fsg->final_state);
00630
00631 for (i = 0; i < fsg->n_state; i++) {
00632 for (j = 0; j < fsg->n_state; j++) {
00633
00634 for (gn = fsg->trans[i][j]; gn; gn = gnode_next(gn)) {
00635 tl = (fsg_link_t *) gnode_ptr(gn);
00636
00637 fprintf(fp, "%s %d %d %f %s\n", FSG_MODEL_TRANSITION_DECL,
00638 tl->from_state, tl->to_state,
00639 logmath_exp(fsg->lmath, (int32)(tl->logs2prob / fsg->lw)),
00640 (tl->wid < 0) ? "" : fsg_model_word_str(fsg, tl->wid));
00641 }
00642
00643
00644 tl = fsg->null_trans[i][j];
00645 if (tl) {
00646 fprintf(fp, "%s %d %d %f\n",
00647 FSG_MODEL_TRANSITION_DECL,
00648 tl->from_state, tl->to_state,
00649 logmath_exp(fsg->lmath, (int32)(tl->logs2prob / fsg->lw)));
00650 }
00651 }
00652 }
00653
00654 fprintf(fp, "%c\n", FSG_MODEL_COMMENT_CHAR);
00655 fprintf(fp, "%s\n", FSG_MODEL_END_DECL);
00656
00657 fflush(fp);
00658 }
00659
00660 void
00661 fsg_model_writefile(fsg_model_t *fsg, char const *file)
00662 {
00663 FILE *fp;
00664
00665 assert(fsg);
00666
00667 E_INFO("Writing FSG file '%s'\n", file);
00668
00669 if ((fp = fopen(file, "w")) == NULL) {
00670 E_ERROR("fopen(%s,r) failed\n", file);
00671 return;
00672 }
00673
00674 fsg_model_write(fsg, fp);
00675
00676 fclose(fp);
00677 }