src/libsphinxbase/lm/ngram_model.c

00001 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
00002 /* ====================================================================
00003  * Copyright (c) 1999-2007 Carnegie Mellon University.  All rights
00004  * reserved.
00005  *
00006  * Redistribution and use in source and binary forms, with or without
00007  * modification, are permitted provided that the following conditions
00008  * are met:
00009  *
00010  * 1. Redistributions of source code must retain the above copyright
00011  *    notice, this list of conditions and the following disclaimer. 
00012  *
00013  * 2. Redistributions in binary form must reproduce the above copyright
00014  *    notice, this list of conditions and the following disclaimer in
00015  *    the documentation and/or other materials provided with the
00016  *    distribution.
00017  *
00018  * This work was supported in part by funding from the Defense Advanced 
00019  * Research Projects Agency and the National Science Foundation of the 
00020  * United States of America, and the CMU Sphinx Speech Consortium.
00021  *
00022  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND 
00023  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 
00024  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
00025  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
00026  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
00027  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 
00028  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 
00029  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 
00030  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
00031  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
00032  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00033  *
00034  * ====================================================================
00035  *
00036  */
00037 /**
00038  * \file ngram_model.c N-Gram language models.
00039  *
00040  * Author: David Huggins-Daines, much code taken from sphinx3/src/libs3decoder/liblm
00041  */
00042 
00043 #include "config.h"
00044 #include "ngram_model.h"
00045 #include "ngram_model_internal.h"
00046 #include "ckd_alloc.h"
00047 #include "filename.h"
00048 #include "pio.h"
00049 #include "err.h"
00050 #include "logmath.h"
00051 #include "strfuncs.h"
00052 #include "case.h"
00053 
00054 #include <string.h>
00055 #include <assert.h>
00056 #include <errno.h>
00057 #ifdef HAVE_ICONV
00058 #include <iconv.h>
00059 #endif
00060 
00060 ngram_file_type_t
00061 ngram_file_name_to_type(const char *file_name)
00062 {
00063     const char *ext;
00064 
00065     ext = strrchr(file_name, '.');
00066     if (ext == NULL) {
00067         return NGRAM_INVALID;
00068     }
00069     if (0 == strcmp_nocase(ext, ".gz")) {
00070         while (--ext >= file_name) {
00071             if (*ext == '.') break;
00072         }
00073         if (ext < file_name) {
00074             return NGRAM_INVALID;
00075         }
00076     }
00077     else if (0 == strcmp_nocase(ext, ".bz2")) {
00078         while (--ext >= file_name) {
00079             if (*ext == '.') break;
00080         }
00081         if (ext < file_name) {
00082             return NGRAM_INVALID;
00083         }
00084     }
00085     /* Use a length-limited comparison because there may be a .gz or .bz2 on the end. */
00086     if (0 == strncmp_nocase(ext, ".ARPA", 5))
00087         return NGRAM_ARPA;
00088     if (0 == strncmp_nocase(ext, ".DMP", 4))
00089         return NGRAM_DMP;
00090     return NGRAM_INVALID;
00091 }
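/*
 * Illustrative sketch (hypothetical helper, not part of the original
 * file) of how the extension check above behaves: a .gz or .bz2 suffix
 * is skipped, so the extension before it decides the type.
 */
static void
example_file_name_to_type(void)
{
    assert(ngram_file_name_to_type("swb.arpa") == NGRAM_ARPA);
    assert(ngram_file_name_to_type("swb.DMP") == NGRAM_DMP);
    /* The .gz suffix is skipped; ".arpa" decides. */
    assert(ngram_file_name_to_type("swb.arpa.gz") == NGRAM_ARPA);
    /* No extension at all. */
    assert(ngram_file_name_to_type("README") == NGRAM_INVALID);
}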
00092 
00093 ngram_file_type_t
00094 ngram_str_to_type(const char *str_name)
00095 {
00096     if (0 == strcmp_nocase(str_name, "arpa"))
00097         return NGRAM_ARPA;
00098     if (0 == strcmp_nocase(str_name, "dmp"))
00099         return NGRAM_DMP;
00100     return NGRAM_INVALID;
00101 }
00102 
00103 char const *
00104 ngram_type_to_str(int type)
00105 {
00106     switch (type) {
00107     case NGRAM_ARPA:
00108         return "arpa";
00109     case NGRAM_DMP:
00110         return "dmp";
00111     default:
00112         return NULL;
00113     }
00114 }
00115 
00116 
00117 ngram_model_t *
00118 ngram_model_read(cmd_ln_t *config,
00119                  const char *file_name,
00120                  ngram_file_type_t file_type,
00121                  logmath_t *lmath)
00122 {
00123     ngram_model_t *model = NULL;
00124 
00125     switch (file_type) {
00126     case NGRAM_AUTO: {
00127         if ((model = ngram_model_arpa_read(config, file_name, lmath)) != NULL)
00128             break;
00129         if ((model = ngram_model_dmp_read(config, file_name, lmath)) != NULL)
00130             break;
00131         return NULL;
00132     }
00133     case NGRAM_ARPA:
00134         model = ngram_model_arpa_read(config, file_name, lmath);
00135         break;
00136     case NGRAM_DMP:
00137         model = ngram_model_dmp_read(config, file_name, lmath);
00138         break;
00139     default:
00140         E_ERROR("language model file type not supported\n");
00141         return NULL;
00142     }
00143 
00144     /* Now set weights based on config if present. */
00145     if (model && config) {
00146         float32 lw = 1.0;
00147         float32 wip = 1.0;
00148         float32 uw = 1.0;
00149 
00150         if (cmd_ln_exists_r(config, "-lw"))
00151             lw = cmd_ln_float32_r(config, "-lw");
00152         if (cmd_ln_exists_r(config, "-wip"))
00153             wip = cmd_ln_float32_r(config, "-wip");
00154         if (cmd_ln_exists_r(config, "-uw"))
00155             uw = cmd_ln_float32_r(config, "-uw");
00156 
00157         ngram_model_apply_weights(model, lw, wip, uw);
00158     }
00159 
00160     return model;
00161 }
00162 
00163 int
00164 ngram_model_write(ngram_model_t *model, const char *file_name,
00165                   ngram_file_type_t file_type)
00166 {
00167     switch (file_type) {
00168     case NGRAM_AUTO: {
00169         file_type = ngram_file_name_to_type(file_name);
00170         /* Default to ARPA (catches .lm and other things) */
00171         if (file_type == NGRAM_INVALID)
00172             file_type = NGRAM_ARPA;
00173         return ngram_model_write(model, file_name, file_type);
00174     }
00175     case NGRAM_ARPA:
00176         return ngram_model_arpa_write(model, file_name);
00177     case NGRAM_DMP:
00178         return ngram_model_dmp_write(model, file_name);
00179     default:
00180         E_ERROR("language model file type not supported\n");
00181         return -1;
00182     }
00183     /* Not reached: every case above returns. */
00184     return -1;
00185 }
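/*
 * Usage sketch (hypothetical helper, not part of the original file):
 * convert an ARPA-format model to binary DMP format.  Assumes "en.arpa"
 * exists on disk and lmath was created with logmath_init().
 */
static int
example_convert_arpa_to_dmp(logmath_t *lmath)
{
    ngram_model_t *lm;

    lm = ngram_model_read(NULL, "en.arpa", NGRAM_ARPA, lmath);
    if (lm == NULL)
        return -1;
    /* NGRAM_AUTO infers the output format from the ".DMP" extension. */
    if (ngram_model_write(lm, "en.DMP", NGRAM_AUTO) < 0) {
        ngram_model_free(lm);
        return -1;
    }
    return ngram_model_free(lm);
}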
00186 
00187 int32
00188 ngram_model_init(ngram_model_t *base,
00189                  ngram_funcs_t *funcs,
00190                  logmath_t *lmath,
00191                  int32 n, int32 n_unigram)
00192 {
00193     base->refcount = 1;
00194     base->funcs = funcs;
00195     base->n = n;
00196     /* Allocate the count array only if this was not previously initialized. */
00197     if (base->n_counts == NULL)
00198         base->n_counts = ckd_calloc(3, sizeof(*base->n_counts));
00199     /* Don't reset weights if logmath object hasn't changed. */
00200     if (base->lmath != lmath) {
00201         /* Set default values for weights. */
00202         base->lw = 1.0;
00203         base->log_wip = 0; /* i.e. 1.0 */
00204         base->log_uw = 0;  /* i.e. 1.0 */
00205         base->log_uniform = logmath_log(lmath, 1.0 / n_unigram);
00206         base->log_uniform_weight = logmath_get_zero(lmath);
00207         base->log_zero = logmath_get_zero(lmath);
00208         base->lmath = lmath;
00209     }
00210     /* Allocate or reallocate space for word strings. */
00211     if (base->word_str) {
00212         /* Free all previous word strings if they were allocated. */
00213         if (base->writable) {
00214             int32 i;
00215             for (i = 0; i < base->n_words; ++i) {
00216                 ckd_free(base->word_str[i]);
00217                 base->word_str[i] = NULL;
00218             }
00219         }
00220         base->word_str = ckd_realloc(base->word_str, n_unigram * sizeof(char *));
00221     }
00222     else
00223         base->word_str = ckd_calloc(n_unigram, sizeof(char *));
00224     /* NOTE: word lookups are no longer case-insensitive, since we
00225      * allow other encodings for word strings.  Beware. */
00226     if (base->wid)
00227         hash_table_empty(base->wid);
00228     else
00229         base->wid = hash_table_new(n_unigram, FALSE);
00230     base->n_counts[0] = base->n_1g_alloc = base->n_words = n_unigram;
00231 
00232     return 0;
00233 }
00234 
00235 ngram_model_t *
00236 ngram_model_retain(ngram_model_t *model)
00237 {
00238     ++model->refcount;
00239     return model;
00240 }
00241 
00242 
00243 void
00244 ngram_model_flush(ngram_model_t *model)
00245 {
00246     if (model->funcs && model->funcs->flush)
00247         (*model->funcs->flush)(model);
00248 }
00249 
00250 int
00251 ngram_model_free(ngram_model_t *model)
00252 {
00253     int i;
00254 
00255     if (model == NULL)
00256         return 0;
00257     if (--model->refcount > 0)
00258         return model->refcount;
00259     if (model->funcs && model->funcs->free)
00260         (*model->funcs->free)(model);
00261     if (model->writable) {
00262         /* Free all words. */
00263         for (i = 0; i < model->n_words; ++i) {
00264             ckd_free(model->word_str[i]);
00265         }
00266     }
00267     else {
00268         /* Free all class words. */
00269         for (i = 0; i < model->n_classes; ++i) {
00270             ngram_class_t *lmclass;
00271             int32 j;
00272 
00273             lmclass = model->classes[i];
00274             for (j = 0; j < lmclass->n_words; ++j) {
00275                 ckd_free(model->word_str[lmclass->start_wid + j]);
00276             }
00277             for (j = 0; j < lmclass->n_hash; ++j) {
00278                 if (lmclass->nword_hash[j].wid != -1) {
00279                     ckd_free(model->word_str[lmclass->nword_hash[j].wid]);
00280                 }
00281             }
00282         }
00283     }
00284     for (i = 0; i < model->n_classes; ++i) {
00285         ngram_class_free(model->classes[i]);
00286     }
00287     ckd_free(model->classes);
00288     hash_table_free(model->wid);
00289     ckd_free(model->word_str);
00290     ckd_free(model->n_counts);
00291     ckd_free(model);
00292     return 0;
00293 }
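/*
 * Illustrative sketch of the reference-counting contract (hypothetical
 * helper, not part of the original file): retain() and free() calls
 * must balance, and the model is destroyed only when the last
 * reference is released.
 */
static void
example_refcounting(ngram_model_t *lm)
{
    ngram_model_t *ref = ngram_model_retain(lm); /* refcount is now 2 */
    ngram_model_free(ref);                       /* back down to 1 */
    ngram_model_free(lm);                        /* actually freed here */
}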
00294 
00295 int
00296 ngram_model_casefold(ngram_model_t *model, int kase)
00297 {
00298     int writable, i;
00299     hash_table_t *new_wid;
00300 
00301     /* Were word strings already allocated? */
00302     writable = model->writable;
00303     /* Either way, we are going to allocate some word strings. */
00304     model->writable = TRUE;
00305 
00306     /* And, don't forget, we need to rebuild the word to unigram ID
00307      * mapping. */
00308     new_wid = hash_table_new(model->n_words, FALSE);
00309     for (i = 0; i < model->n_words; ++i) {
00310         char *outstr;
00311         if (writable) {
00312             outstr = model->word_str[i];
00313         }
00314         else {
00315             outstr = ckd_salloc(model->word_str[i]);
00316         }
00317         /* Don't case-fold <tags> or [classes] */
00318         if (outstr[0] != '<' && outstr[0] != '[') {
00319             switch (kase) {
00320             case NGRAM_UPPER:
00321                 ucase(outstr);
00322                 break;
00323             case NGRAM_LOWER:
00324                 lcase(outstr);
00325                 break;
00326             default:
00327                 break;
00328             }
00329         }
00332         model->word_str[i] = outstr;
00333 
00334         /* Now update the hash table.  We might have terrible
00335          * collisions here, so warn about them. */
00336         if (hash_table_enter_int32(new_wid, model->word_str[i], i) != i) {
00337             E_WARN("Duplicate word in dictionary after conversion: %s\n",
00338                    model->word_str[i]);
00339         }
00340     }
00341     /* Swap out the hash table. */
00342     hash_table_free(model->wid);
00343     model->wid = new_wid;
00344     return 0;
00345 }
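/*
 * Usage sketch (hypothetical helper, not part of the original file):
 * fold the whole vocabulary to upper case; <tags> and [classes] keep
 * their original spelling.
 */
static int
example_casefold(ngram_model_t *lm)
{
    return ngram_model_casefold(lm, NGRAM_UPPER);
}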
00346 
00347 #ifdef HAVE_ICONV
00348 int
00349 ngram_model_recode(ngram_model_t *model, const char *from, const char *to)
00350 {
00351     iconv_t ic;
00352     char *outbuf;
00353     size_t maxlen;
00354     int i, writable;
00355     hash_table_t *new_wid;
00356 
00357     /* FIXME: Need to do a special case thing for the GB-HEX encoding
00358      * used in Sphinx3 Mandarin models. */
00359     if ((ic = iconv_open(to, from)) == (iconv_t)-1) {
00360         E_ERROR_SYSTEM("iconv_open() failed");
00361         return -1;
00362     }
00363     /* iconv(3), unlike wcstombs(3), will not accept a NULL output
00364      * buffer, so we either have to call it repeatedly until our
00365      * buffer is big enough, or call it with a huge buffer and then
00366      * copy things back to the output.  We use a mix of these two
00367      * approaches here: keep a single big buffer around, and expand
00368      * it as necessary.
00369      */
00370     maxlen = 0;
00371     for (i = 0; i < model->n_words; ++i) {
00372         if (strlen(model->word_str[i]) > maxlen)
00373             maxlen = strlen(model->word_str[i]);
00374     }
00375     /* Were word strings already allocated? */
00376     writable = model->writable;
00377     /* Either way, we are going to allocate some word strings. */
00378     model->writable = TRUE;
00379     /* Really should be big enough except for pathological cases. */
00380     maxlen = maxlen * sizeof(int) + 15;
00381     outbuf = ckd_calloc(maxlen, 1);
00382     /* And, don't forget, we need to rebuild the word to unigram ID
00383      * mapping. */
00384     new_wid = hash_table_new(model->n_words, FALSE);
00385     for (i = 0; i < model->n_words; ++i) {
00386         ICONV_CONST char *in;
00387         char *out;
00388         size_t inleft, outleft, result;
00389 
00390     start_conversion:
00391         in = (ICONV_CONST char *)model->word_str[i];
00392         /* Yes, this assumes that we don't have any NUL bytes. */
00393         inleft = strlen(in);
00394         out = outbuf;
00395         outleft = maxlen;
00396 
00397         while ((result = iconv(ic, &in, &inleft, &out, &outleft)) == (size_t)-1) {
00398             if (errno != E2BIG) {
00399                 /* FIXME: if we already converted any words, then they
00400                  * are going to be in an inconsistent state. */
00401                 E_ERROR_SYSTEM("iconv() failed");
00402                 ckd_free(outbuf);
00403                 hash_table_free(new_wid);
00404                 return -1;
00405             }
00406             /* Reset the internal state of conversion. */
00407             iconv(ic, NULL, NULL, NULL, NULL);
00408             /* Make everything bigger. */
00409             maxlen *= 2;
00410             out = outbuf = ckd_realloc(outbuf, maxlen);
00411             /* Reset the input pointers. */
00412             in = (ICONV_CONST char *)model->word_str[i];
00413             inleft = strlen(in);
00414         }
00415 
00416         /* Now flush a shift-out sequence, if any. */
00417         if ((result = iconv(ic, NULL, NULL, &out, &outleft)) == (size_t)-1) {
00418             if (errno != E2BIG) {
00419                 /* FIXME: if we already converted any words, then they
00420                  * are going to be in an inconsistent state. */
00421                 E_ERROR_SYSTEM("iconv() failed (state reset sequence)");
00422                 ckd_free(outbuf);
00423                 hash_table_free(new_wid);
00424                 return -1;
00425             }
00426             /* Reset the internal state of conversion. */
00427             iconv(ic, NULL, NULL, NULL, NULL);
00428             /* Make everything bigger. */
00429             maxlen *= 2;
00430             outbuf = ckd_realloc(outbuf, maxlen);
00431             /* Restart conversion of this word from the beginning. */
00432             goto start_conversion;
00433         }
00434 
00435         result = maxlen - outleft;
00436         /* Store the converted string back into word_str. */
00437         if (writable) {
00438             /* Grow or shrink the output string as necessary. */
00439             model->word_str[i] = ckd_realloc(model->word_str[i], result + 1);
00440             model->word_str[i][result] = '\0';
00441         }
00442         else {
00443             /* It actually was not allocated previously, so do that now. */
00444             model->word_str[i] = ckd_calloc(result + 1, 1);
00445         }
00446         /* Copy the new thing in. */
00447         memcpy(model->word_str[i], outbuf, result);
00448 
00449         /* Now update the hash table.  We might have terrible
00450          * collisions if a non-reversible conversion was requested,
00451          * so warn about them. */
00452         if (hash_table_enter_int32(new_wid, model->word_str[i], i) != i) {
00453             E_WARN("Duplicate word in dictionary after conversion: %s\n",
00454                    model->word_str[i]);
00455         }
00456     }
00457     ckd_free(outbuf);
00458     iconv_close(ic);
00459     /* Swap out the hash table. */
00460     hash_table_free(model->wid);
00461     model->wid = new_wid;
00462 
00463     return 0;
00464 }
00465 #else /* !HAVE_ICONV */
00466 int
00467 ngram_model_recode(ngram_model_t *model, const char *from, const char *to)
00468 {
00469     return -1; /* No iconv(3) support was compiled in. */
00470 }
00471 #endif /* !HAVE_ICONV */
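/*
 * Usage sketch (hypothetical helper, not part of the original file):
 * convert the word strings of a Mandarin model from GB2312 to UTF-8.
 * The encoding names are passed straight to iconv_open(), so any pair
 * the system iconv supports will work; without iconv this simply
 * returns -1.
 */
static int
example_recode(ngram_model_t *lm)
{
    return ngram_model_recode(lm, "GB2312", "UTF-8");
}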
00472 
00473 int
00474 ngram_model_apply_weights(ngram_model_t *model,
00475                           float32 lw, float32 wip, float32 uw)
00476 {
00477     return (*model->funcs->apply_weights)(model, lw, wip, uw);
00478 }
00479 
00480 float32
00481 ngram_model_get_weights(ngram_model_t *model, int32 *out_log_wip,
00482                         int32 *out_log_uw)
00483 {
00484     if (out_log_wip) *out_log_wip = model->log_wip;
00485     if (out_log_uw) *out_log_uw = model->log_uw;
00486     return model->lw;
00487 }
00488 
00489 
00490 int32
00491 ngram_ng_score(ngram_model_t *model, int32 wid, int32 *history,
00492                int32 n_hist, int32 *n_used)
00493 {
00494     int32 score, class_weight = 0;
00495     int i;
00496 
00497     /* Closed vocabulary, OOV word probability is zero */
00498     if (wid == NGRAM_INVALID_WID)
00499         return model->log_zero;
00500 
00501     /* "Declassify" wid and history */
00502     if (NGRAM_IS_CLASSWID(wid)) {
00503         ngram_class_t *lmclass = model->classes[NGRAM_CLASSID(wid)];
00504 
00505         class_weight = ngram_class_prob(lmclass, wid);
00506         if (class_weight == 1) /* Meaning, not found in class. */
00507             return model->log_zero;
00508         wid = lmclass->tag_wid;
00509     }
00510     for (i = 0; i < n_hist; ++i) {
00511         if (history[i] != NGRAM_INVALID_WID && NGRAM_IS_CLASSWID(history[i]))
00512             history[i] = model->classes[NGRAM_CLASSID(history[i])]->tag_wid;
00513     }
00514     score = (*model->funcs->score)(model, wid, history, n_hist, n_used);
00515 
00516     /* Multiply by the in-class weight (addition in log domain). */
00517     return score + class_weight;
00518 }
00519 
00520 int32
00521 ngram_score(ngram_model_t *model, const char *word, ...)
00522 {
00523     va_list history;
00524     const char *hword;
00525     int32 *histid;
00526     int32 n_hist;
00527     int32 n_used;
00528     int32 prob;
00529 
00530     va_start(history, word);
00531     n_hist = 0;
00532     while ((hword = va_arg(history, const char *)) != NULL)
00533         ++n_hist;
00534     va_end(history);
00535 
00536     histid = ckd_calloc(n_hist, sizeof(*histid));
00537     va_start(history, word);
00538     n_hist = 0;
00539     while ((hword = va_arg(history, const char *)) != NULL) {
00540         histid[n_hist] = ngram_wid(model, hword);
00541         ++n_hist;
00542     }
00543     va_end(history);
00544 
00545     prob = ngram_ng_score(model, ngram_wid(model, word),
00546                           histid, n_hist, &n_used);
00547     ckd_free(histid);
00548     return prob;
00549 }
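/*
 * Usage sketch (hypothetical helper, not part of the original file):
 * the varargs history is given in reverse order, most recent word
 * first, and must be NULL-terminated.  This requests the trigram score
 * of "three" following "one two".
 */
static int32
example_trigram_score(ngram_model_t *lm)
{
    return ngram_score(lm, "three", "two", "one", NULL);
}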
00550 
00551 int32
00552 ngram_tg_score(ngram_model_t *model, int32 w3, int32 w2, int32 w1, int32 *n_used)
00553 {
00554     int32 hist[2];
00555     hist[0] = w2;
00556     hist[1] = w1;
00557     return ngram_ng_score(model, w3, hist, 2, n_used);
00558 }
00559 
00560 int32
00561 ngram_bg_score(ngram_model_t *model, int32 w2, int32 w1, int32 *n_used)
00562 {
00563     return ngram_ng_score(model, w2, &w1, 1, n_used);
00564 }
00565 
00566 int32
00567 ngram_ng_prob(ngram_model_t *model, int32 wid, int32 *history,
00568               int32 n_hist, int32 *n_used)
00569 {
00570     int32 prob, class_weight = 0;
00571     int i;
00572 
00573     /* Closed vocabulary, OOV word probability is zero */
00574     if (wid == NGRAM_INVALID_WID)
00575         return model->log_zero;
00576 
00577     /* "Declassify" wid and history */
00578     if (NGRAM_IS_CLASSWID(wid)) {
00579         ngram_class_t *lmclass = model->classes[NGRAM_CLASSID(wid)];
00580 
00581         class_weight = ngram_class_prob(lmclass, wid);
00582         if (class_weight == 1) /* Meaning, not found in class. */
00583             return model->log_zero;
00584         wid = lmclass->tag_wid;
00585     }
00586     for (i = 0; i < n_hist; ++i) {
00587         if (history[i] != NGRAM_INVALID_WID && NGRAM_IS_CLASSWID(history[i]))
00588             history[i] = model->classes[NGRAM_CLASSID(history[i])]->tag_wid;
00589     }
00590     prob = (*model->funcs->raw_score)(model, wid, history,
00591                                       n_hist, n_used);
00592     /* Multiply by the in-class weight (addition in log domain). */
00593     return prob + class_weight;
00594 }
00595 
00596 int32
00597 ngram_prob(ngram_model_t *model, const char *word, ...)
00598 {
00599     va_list history;
00600     const char *hword;
00601     int32 *histid;
00602     int32 n_hist;
00603     int32 n_used;
00604     int32 prob;
00605 
00606     va_start(history, word);
00607     n_hist = 0;
00608     while ((hword = va_arg(history, const char *)) != NULL)
00609         ++n_hist;
00610     va_end(history);
00611 
00612     histid = ckd_calloc(n_hist, sizeof(*histid));
00613     va_start(history, word);
00614     n_hist = 0;
00615     while ((hword = va_arg(history, const char *)) != NULL) {
00616         histid[n_hist] = ngram_wid(model, hword);
00617         ++n_hist;
00618     }
00619     va_end(history);
00620 
00621     prob = ngram_ng_prob(model, ngram_wid(model, word),
00622                          histid, n_hist, &n_used);
00623     ckd_free(histid);
00624     return prob;
00625 }
00626 
00627 int32
00628 ngram_score_to_prob(ngram_model_t *base, int32 score)
00629 {
00630     int32 prob;
00631 
00632     /* Undo insertion penalty. */
00633     prob = score - base->log_wip;
00634     /* Undo language weight. */
00635     prob = (int32)(prob / base->lw);
00636 
00637     return prob;
00638 }
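/*
 * For reference, the conversion above assumes the relationship that
 * ngram_model_apply_weights() establishes:
 *
 *     score = prob * lw + log(wip)
 *
 * so the raw log-probability is recovered as (score - log_wip) / lw.
 * The unigram weight uw is not undone here.
 */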
00639 
00640 int32
00641 ngram_unknown_wid(ngram_model_t *model)
00642 {
00643     int32 val;
00644 
00645     /* FIXME: This could be memoized for speed if necessary. */
00646     /* Look up <UNK>, if not found return NGRAM_INVALID_WID. */
00647     if (hash_table_lookup_int32(model->wid, "<UNK>", &val) == -1)
00648         return NGRAM_INVALID_WID;
00649     else
00650         return val;
00651 }
00652 
00653 int32
00654 ngram_zero(ngram_model_t *model)
00655 {
00656     return model->log_zero;
00657 }
00658 
00659 int32
00660 ngram_model_get_size(ngram_model_t *model)
00661 {
00662     if (model != NULL)
00663         return model->n;
00664     return 0;
00665 }
00666 
00667 int32 const *
00668 ngram_model_get_counts(ngram_model_t *model)
00669 {
00670     if (model != NULL)
00671         return model->n_counts;
00672     return NULL;
00673 }
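/*
 * Illustrative sketch (hypothetical helper, not part of the original
 * file): query the model order and per-order N-gram counts.  For a
 * trigram model n == 3 and counts[0..2] hold the numbers of 1-, 2- and
 * 3-grams.
 */
static void
example_counts(ngram_model_t *lm)
{
    int32 n = ngram_model_get_size(lm);
    int32 const *counts = ngram_model_get_counts(lm);
    int32 i;

    for (i = 0; i < n; ++i)
        E_INFO("%d-grams: %d\n", i + 1, counts[i]);
}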
00674 
00675 void
00676 ngram_iter_init(ngram_iter_t *itor, ngram_model_t *model,
00677                 int m, int successor)
00678 {
00679     itor->model = model;
00680     itor->wids = ckd_calloc(model->n, sizeof(*itor->wids));
00681     itor->m = m;
00682     itor->successor = successor;
00683 }
00684 
00685 ngram_iter_t *
00686 ngram_model_mgrams(ngram_model_t *model, int m)
00687 {
00688     ngram_iter_t *itor;
00689     /* m is zero-based (m == n-1 is the highest order), which is not obvious, so prevent accidents. */
00690     if (m >= model->n)
00691         return NULL;
00692     if (model->funcs->mgrams == NULL)
00693         return NULL;
00694     itor = (*model->funcs->mgrams)(model, m);
00695     return itor;
00696 }
00697 
00698 ngram_iter_t *
00699 ngram_iter(ngram_model_t *model, const char *word, ...)
00700 {
00701     va_list history;
00702     const char *hword;
00703     int32 *histid;
00704     int32 n_hist;
00705     ngram_iter_t *itor;
00706 
00707     va_start(history, word);
00708     n_hist = 0;
00709     while ((hword = va_arg(history, const char *)) != NULL)
00710         ++n_hist;
00711     va_end(history);
00712 
00713     histid = ckd_calloc(n_hist, sizeof(*histid));
00714     va_start(history, word);
00715     n_hist = 0;
00716     while ((hword = va_arg(history, const char *)) != NULL) {
00717         histid[n_hist] = ngram_wid(model, hword);
00718         ++n_hist;
00719     }
00720     va_end(history);
00721 
00722     itor = ngram_ng_iter(model, ngram_wid(model, word), histid, n_hist);
00723     ckd_free(histid);
00724     return itor;
00725 }
00726 
00727 ngram_iter_t *
00728 ngram_ng_iter(ngram_model_t *model, int32 wid, int32 *history, int32 n_hist)
00729 {
00730     if (n_hist >= model->n)
00731         return NULL;
00732     if (model->funcs->iter == NULL)
00733         return NULL;
00734     return (*model->funcs->iter)(model, wid, history, n_hist);
00735 }
00736 
00737 ngram_iter_t *
00738 ngram_iter_successors(ngram_iter_t *itor)
00739 {
00740     /* Stop when we are at the highest order N-Gram. */
00741     if (itor->m == itor->model->n - 1)
00742         return NULL;
00743     return (*itor->model->funcs->successors)(itor);
00744 }
00745 
00746 int32 const *
00747 ngram_iter_get(ngram_iter_t *itor,
00748                int32 *out_score,
00749                int32 *out_bowt)
00750 {
00751     return (*itor->model->funcs->iter_get)(itor, out_score, out_bowt);
00752 }
00753 
00754 ngram_iter_t *
00755 ngram_iter_next(ngram_iter_t *itor)
00756 {
00757     return (*itor->model->funcs->iter_next)(itor);
00758 }
00759 
00760 void
00761 ngram_iter_free(ngram_iter_t *itor)
00762 {
00763     ckd_free(itor->wids);
00764     (*itor->model->funcs->iter_free)(itor);
00765 }
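/*
 * Usage sketch (hypothetical helper, not part of the original file):
 * enumerate all unigrams with their scores and backoff weights.  This
 * assumes, as with other SphinxBase iterators, that ngram_iter_next()
 * releases the iterator when it returns NULL at the end of iteration.
 */
static void
example_list_unigrams(ngram_model_t *lm)
{
    ngram_iter_t *itor;

    for (itor = ngram_model_mgrams(lm, 0); itor; itor = ngram_iter_next(itor)) {
        int32 score, bowt;
        int32 const *wids;

        wids = ngram_iter_get(itor, &score, &bowt);
        E_INFO("%s %d %d\n", ngram_word(lm, wids[0]), score, bowt);
    }
}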
00766 
00767 int32
00768 ngram_wid(ngram_model_t *model, const char *word)
00769 {
00770     int32 val;
00771 
00772     if (hash_table_lookup_int32(model->wid, word, &val) == -1)
00773         return ngram_unknown_wid(model);
00774     else
00775         return val;
00776 }
00777 
00778 const char *
00779 ngram_word(ngram_model_t *model, int32 wid)
00780 {
00781     /* Remove any class tag */
00782     wid = NGRAM_BASEWID(wid);
00783     if (wid >= model->n_words)
00784         return NULL;
00785     return model->word_str[wid];
00786 }
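/*
 * Illustrative round trip between word strings and word IDs
 * (hypothetical helper, not part of the original file).  Unknown words
 * map to the ID of <UNK> if the model has one, or to NGRAM_INVALID_WID
 * in a closed-vocabulary model.
 */
static void
example_wid_roundtrip(ngram_model_t *lm)
{
    int32 wid = ngram_wid(lm, "hello");

    if (wid != NGRAM_INVALID_WID)
        E_INFO("hello => %d => %s\n", wid, ngram_word(lm, wid));
}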
00787 
00791 int32
00792 ngram_add_word_internal(ngram_model_t *model,
00793                         const char *word,
00794                         int32 classid)
00795 {
00796     void *dummy;
00797     int32 wid;
00798 
00799     /* Take the next available word ID */
00800     wid = model->n_words;
00801     if (classid >= 0) {
00802         wid = NGRAM_CLASSWID(wid, classid);
00803     }
00804     /* Check for hash collisions. */
00805     if (hash_table_lookup(model->wid, word, &dummy) == 0) {
00806         E_ERROR("Duplicate definition of word %s\n", word);
00807         return NGRAM_INVALID_WID;
00808     }
00809     /* Reallocate word_str if necessary. */
00810     if (model->n_words >= model->n_1g_alloc) {
00811         model->n_1g_alloc += UG_ALLOC_STEP;
00812         model->word_str = ckd_realloc(model->word_str,
00813                                       sizeof(*model->word_str) * model->n_1g_alloc);
00814     }
00815     /* Add the word string in the appropriate manner. */
00816     /* Class words are always dynamically allocated. */
00817     model->word_str[model->n_words] = ckd_salloc(word);
00818     /* Now enter it into the hash table. */
00819     if (hash_table_enter_int32(model->wid, model->word_str[model->n_words], wid) != wid) {
00820         E_ERROR("Hash insertion failed for word %s => %p (should not happen)\n",
00821                 model->word_str[model->n_words], (void *)(long)(wid));
00822     }
00823     /* Increment number of words. */
00824     ++model->n_words;
00825     return wid;
00826 }
00827 
00828 int32
00829 ngram_model_add_word(ngram_model_t *model,
00830                      const char *word, float32 weight)
00831 {
00832     int32 wid, prob = model->log_zero;
00833 
00834     wid = ngram_add_word_internal(model, word, -1);
00835     if (wid == NGRAM_INVALID_WID)
00836         return wid;
00837 
00838     /* Let the model implementation add the unigram and compute its score. */
00839     if (model->funcs && model->funcs->add_ug)
00840         prob = (*model->funcs->add_ug)(model, wid, logmath_log(model->lmath, weight));
00841     if (prob == 0) {
00842         if (model->writable)
00843             ckd_free(model->word_str[wid]);
00844         return -1;
00845     }
00846     return wid;
00847 }
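/*
 * Usage sketch (hypothetical helper, not part of the original file):
 * add an out-of-vocabulary word at runtime.  The weight is converted
 * to the log domain and handed to the model's add_ug implementation,
 * which decides its exact interpretation.
 */
static int32
example_add_word(ngram_model_t *lm)
{
    return ngram_model_add_word(lm, "sphinxbase", 0.5f);
}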
00848 
00849 ngram_class_t *
00850 ngram_class_new(ngram_model_t *model, int32 tag_wid, int32 start_wid, glist_t classwords)
00851 {
00852     ngram_class_t *lmclass;
00853     gnode_t *gn;
00854     float32 tprob;
00855     int i;
00856 
00857     lmclass = ckd_calloc(1, sizeof(*lmclass));
00858     lmclass->tag_wid = tag_wid;
00859     /* start_wid is the base wid (minus class tag) of the first word in the list. */
00860     lmclass->start_wid = start_wid;
00861     lmclass->n_words = glist_count(classwords);
00862     lmclass->prob1 = ckd_calloc(lmclass->n_words, sizeof(*lmclass->prob1));
00863     lmclass->nword_hash = NULL;
00864     lmclass->n_hash = 0;
00865     tprob = 0.0;
00866     for (gn = classwords; gn; gn = gnode_next(gn)) {
00867         tprob += gnode_float32(gn);
00868     }
00869     if (tprob > 1.1 || tprob < 0.9) {
00870         E_WARN("Total class probability is %f, will normalize\n", tprob);
00871         for (gn = classwords; gn; gn = gnode_next(gn)) {
00872             gn->data.fl /= tprob;
00873         }
00874     }
00875     for (i = 0, gn = classwords; gn; ++i, gn = gnode_next(gn)) {
00876         lmclass->prob1[i] = logmath_log(model->lmath, gnode_float32(gn));
00877     }
00878 
00879     return lmclass;
00880 }
00881 
00882 int32
00883 ngram_class_add_word(ngram_class_t *lmclass, int32 wid, int32 lweight)
00884 {
00885     int32 hash;
00886 
00887     if (lmclass->nword_hash == NULL) {
00888         /* Initialize everything in it to -1 */
00889         lmclass->nword_hash = ckd_malloc(NGRAM_HASH_SIZE * sizeof(*lmclass->nword_hash));
00890         memset(lmclass->nword_hash, 0xff, NGRAM_HASH_SIZE * sizeof(*lmclass->nword_hash));
00891         lmclass->n_hash = NGRAM_HASH_SIZE;
00892         lmclass->n_hash_inuse = 0;
00893     }
00894     /* The simplest possible hash function.  It nonetheless works
00895      * quite well when this function is called repeatedly with
00896      * contiguous word IDs. */
00897     hash = wid & (lmclass->n_hash - 1);
00898     if (lmclass->nword_hash[hash].wid == -1) {
00899         /* Good, no collision. */
00900         lmclass->nword_hash[hash].wid = wid;
00901         lmclass->nword_hash[hash].prob1 = lweight;
00902         ++lmclass->n_hash_inuse;
00903         return hash;
00904     }
00905     else {
00906         int32 next; 
00907         /* Collision... Find the end of the hash chain. */
00908         while (lmclass->nword_hash[hash].next != -1)
00909             hash = lmclass->nword_hash[hash].next;
00910         assert(hash != -1);
00911         /* Are there any free buckets left? */
00912         if (lmclass->n_hash_inuse == lmclass->n_hash) {
00913             /* No: double the size of the hash table. */
00914             lmclass->nword_hash = ckd_realloc(lmclass->nword_hash, 
00915                                               lmclass->n_hash * 2 * sizeof(*lmclass->nword_hash));
00916             memset(lmclass->nword_hash + lmclass->n_hash,
00917                    0xff, lmclass->n_hash * sizeof(*lmclass->nword_hash));
00918             /* Just use the next allocated one (easy) */
00919             next = lmclass->n_hash;
00920             lmclass->n_hash *= 2;
00921         }
00922         else {
00923             /* Look for any available bucket.  We hope this doesn't happen. */
00924             for (next = 0; next < lmclass->n_hash; ++next)
00925                 if (lmclass->nword_hash[next].wid == -1)
00926                     break;
00927             /* This should absolutely not happen. */
00928             assert(next != lmclass->n_hash);
00929         }
00930         lmclass->nword_hash[next].wid = wid;
00931         lmclass->nword_hash[next].prob1 = lweight;
00932         lmclass->nword_hash[hash].next = next;
00933         ++lmclass->n_hash_inuse;
00934         return next;
00935     }
00936 }
00937 
00938 void
00939 ngram_class_free(ngram_class_t *lmclass)
00940 {
00941     ckd_free(lmclass->nword_hash);
00942     ckd_free(lmclass->prob1);
00943     ckd_free(lmclass);
00944 }
00945 
00946 int32
00947 ngram_model_add_class_word(ngram_model_t *model,
00948                            const char *classname,
00949                            const char *word,
00950                            float32 weight)
00951 {
00952     ngram_class_t *lmclass;
00953     int32 classid, tag_wid, wid, i, scale;
00954     float32 fprob;
00955 
00956     /* Find the class corresponding to classname.  Linear search
00957      * probably okay here since there won't be very many classes, and
00958      * this doesn't have to be fast. */
00959     tag_wid = ngram_wid(model, classname);
00960     if (tag_wid == NGRAM_INVALID_WID) {
00961         E_ERROR("No such word or class tag: %s\n", classname);
00962         return tag_wid;
00963     }
00964     for (classid = 0; classid < model->n_classes; ++classid) {
00965         if (model->classes[classid]->tag_wid == tag_wid)
00966             break;
00967     }
00968     /* No such class.  It's probably not a good idea to create one implicitly. */
00969     if (classid == model->n_classes) {
00970         E_ERROR("Word %s is not a class tag (call ngram_model_add_class() first)\n", classname);
00971         return NGRAM_INVALID_WID;
00972     }
00973     lmclass = model->classes[classid];
00974 
00975     /* Add this word to the model's set of words. */
00976     wid = ngram_add_word_internal(model, word, classid);
00977     if (wid == NGRAM_INVALID_WID)
00978         return wid;
00979 
00980     /* This is the fixed probability of the new word. */
00981     fprob = weight * 1.0f / (lmclass->n_words + lmclass->n_hash_inuse + 1);
00982     /* Now normalize everything else to fit it in.  This is
00983      * accomplished by simply scaling all the other probabilities
00984      * by (1-fprob). */
00985     scale = logmath_log(model->lmath, 1.0 - fprob);
00986     for (i = 0; i < lmclass->n_words; ++i)
00987         lmclass->prob1[i] += scale;
00988     for (i = 0; i < lmclass->n_hash; ++i)
00989         if (lmclass->nword_hash[i].wid != -1)
00990             lmclass->nword_hash[i].prob1 += scale;
00991 
00992     /* Now add it to the class hash table. */
00993     return ngram_class_add_word(lmclass, wid, logmath_log(model->lmath, fprob));
00994 }
00995 
00996 int32
00997 ngram_model_add_class(ngram_model_t *model,
00998                       const char *classname,
00999                       float32 classweight,
01000                       char **words,
01001                       const float32 *weights,
01002                       int32 n_words)
01003 {
01004     ngram_class_t *lmclass;
01005     glist_t classwords = NULL;
01006     int32 i, start_wid = -1;
01007     int32 classid, tag_wid;
01008 
01009     /* Check if classname already exists in model.  If not, add it.*/
01010     if ((tag_wid = ngram_wid(model, classname)) == ngram_unknown_wid(model)) {
01011         tag_wid = ngram_model_add_word(model, classname, classweight);
01012         if (tag_wid == NGRAM_INVALID_WID)
01013             return -1;
01014     }
01015 
01016     if (model->n_classes == 128) {
01017         E_ERROR("Number of classes cannot exceed 128 (sorry)\n");
01018         return -1;
01019     }
01020     classid = model->n_classes;
01021     for (i = 0; i < n_words; ++i) {
01022         int32 wid;
01023 
01024         wid = ngram_add_word_internal(model, words[i], classid);
01025         if (wid == NGRAM_INVALID_WID)
01026             return -1;
01027         if (start_wid == -1)
01028             start_wid = NGRAM_BASEWID(wid);
01029         classwords = glist_add_float32(classwords, weights[i]);
01030     }
01031     classwords = glist_reverse(classwords);
01032     lmclass = ngram_class_new(model, tag_wid, start_wid, classwords);
01033     glist_free(classwords);
01034     if (lmclass == NULL)
01035         return -1;
01036 
01037     ++model->n_classes;
01038     if (model->classes == NULL)
01039         model->classes = ckd_calloc(1, sizeof(*model->classes));
01040     else
01041         model->classes = ckd_realloc(model->classes,
01042                                      model->n_classes * sizeof(*model->classes));
01043     model->classes[classid] = lmclass;
01044     return classid;
01045 }
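/*
 * Usage sketch (hypothetical helper, not part of the original file):
 * create a class [NAMES] holding two words with equal in-class
 * probability, then grow it at runtime.  Assumes "[NAMES]" is not
 * already a unigram in the model.
 */
static int32
example_add_class(ngram_model_t *lm)
{
    const char *words[] = { "alice", "bob" };
    const float32 weights[] = { 0.5f, 0.5f };

    if (ngram_model_add_class(lm, "[NAMES]", 1.0f,
                              (char **)words, weights, 2) < 0)
        return -1;
    return ngram_model_add_class_word(lm, "[NAMES]", "charlie", 1.0f);
}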
01046 
01047 int32
01048 ngram_class_prob(ngram_class_t *lmclass, int32 wid)
01049 {
01050     int32 base_wid = NGRAM_BASEWID(wid);
01051 
01052     if (base_wid < lmclass->start_wid
01053         || base_wid >= lmclass->start_wid + lmclass->n_words) {
01054         int32 hash;
01055 
01056         /* Look it up in the hash table. */
01057         hash = wid & (lmclass->n_hash - 1);
01058         while (hash != -1 && lmclass->nword_hash[hash].wid != wid)
01059             hash = lmclass->nword_hash[hash].next;
01060         if (hash == -1)
01061             return 1;
01062         return lmclass->nword_hash[hash].prob1;
01063     }
01064     else {
01065         return lmclass->prob1[base_wid - lmclass->start_wid];
01066     }
01067 }
01068 
01069 int32
01070 read_classdef_file(hash_table_t *classes, const char *file_name)
01071 {
01072     FILE *fp;
01073     int32 is_pipe;
01074     int inclass;  
01075     int32 rv = -1;
01076     gnode_t *gn;
01077     glist_t classwords = NULL;
01078     glist_t classprobs = NULL;
01079     char *classname = NULL;
01080 
01081     if ((fp = fopen_comp(file_name, "r", &is_pipe)) == NULL) {
01082         E_ERROR_SYSTEM("Failed to open %s", file_name);
01083         return -1;
01084     }
01085 
01086     inclass = FALSE;
01087     while (!feof(fp)) {
01088         char line[512];
01089         char *wptr[2];
01090         int n_words;
01091 
01092         if (fgets(line, sizeof(line), fp) == NULL)
01093             break;
01094 
01095         n_words = str2words(line, wptr, 2);
01096         if (n_words <= 0)
01097             continue;
01098 
01099         if (inclass) {
01100             /* Look for an end of class marker. */
01101             if (n_words == 2 && 0 == strcmp(wptr[0], "END")) {
01102                 classdef_t *classdef;
01103                 gnode_t *word, *weight;
01104                 int32 i;
01105 
01106                 if (classname == NULL || 0 != strcmp(wptr[1], classname))
01107                     goto error_out;
01108                 inclass = FALSE;
01109 
01110                 /* Construct a class from the list of words collected. */
01111                 classdef = ckd_calloc(1, sizeof(*classdef));
01112                 classwords = glist_reverse(classwords);
01113                 classprobs = glist_reverse(classprobs);
01114                 classdef->n_words = glist_count(classwords);
01115                 classdef->words = ckd_calloc(classdef->n_words,
01116                                              sizeof(*classdef->words));
01117                 classdef->weights = ckd_calloc(classdef->n_words,
01118                                                sizeof(*classdef->weights));
01119                 word = classwords;
01120                 weight = classprobs;
01121                 for (i = 0; i < classdef->n_words; ++i) {
01122                     classdef->words[i] = gnode_ptr(word);
01123                     classdef->weights[i] = gnode_float32(weight);
01124                     word = gnode_next(word);
01125                     weight = gnode_next(weight);
01126                 }
01127                 
01128                 /* Add this class to the hash table. */
01129                 if (hash_table_enter(classes, classname, classdef) != classdef) {
01130                     classdef_free(classdef);
01131                     goto error_out;
01132                 }
01133 
01134                 /* Reset everything. */
01135                 glist_free(classwords);
01136                 glist_free(classprobs);
01137                 classwords = NULL;
01138                 classprobs = NULL;
01139                 classname = NULL;
01140             }
01141             else {
01142                 float32 fprob;
01143 
01144                 if (n_words == 2)
01145                     fprob = (float32)atof_c(wptr[1]);
01146                 else
01147                     fprob = 1.0f;
01148                 /* Add it to the list of words for this class. */
01149                 classwords = glist_add_ptr(classwords, ckd_salloc(wptr[0]));
01150                 classprobs = glist_add_float32(classprobs, fprob);
01151             }
01152         }
01153         else {
01154             /* Start a new LM class if the LMCLASS marker is seen */
01155             if (n_words == 2 && 0 == strcmp(wptr[0], "LMCLASS")) {
01156                 inclass = TRUE;
01157                 classname = ckd_salloc(wptr[1]);
01160             }
01161             /* Otherwise, just ignore whatever junk we got */
01162         }
01163     }
01164     rv = 0; /* Success. */
01165 
01166 error_out:
01167     /* Free all the stuff we might have allocated. */
01168     fclose_comp(fp, is_pipe);
01169     for (gn = classwords; gn; gn = gnode_next(gn))
01170         ckd_free(gnode_ptr(gn));
01171     glist_free(classwords);
01172     glist_free(classprobs);
01173     ckd_free(classname);
01174 
01175     return rv;
01176 }
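/*
 * For reference, the class definition syntax parsed above (the weight
 * column is optional; ngram_class_new() normalizes weights that do not
 * sum to roughly one):
 *
 *     LMCLASS [NAMES]
 *     alice 0.5
 *     bob 0.5
 *     END [NAMES]
 *
 * Lines outside a LMCLASS...END block are ignored.
 */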
01177 
01178 void
01179 classdef_free(classdef_t *classdef)
01180 {
01181     int32 i;
01182     for (i = 0; i < classdef->n_words; ++i)
01183         ckd_free(classdef->words[i]);
01184     ckd_free(classdef->words);
01185     ckd_free(classdef->weights);
01186     ckd_free(classdef);
01187 }
01188 
01189 
01190 int32
01191 ngram_model_read_classdef(ngram_model_t *model,
01192                           const char *file_name)
01193 {
01194     hash_table_t *classes;
01195     glist_t hl = NULL;
01196     gnode_t *gn;
01197     int32 rv = -1;
01198 
01199     classes = hash_table_new(0, FALSE);
01200     if (read_classdef_file(classes, file_name) < 0) {
01201         hash_table_free(classes);
01202         return -1;
01203     }
01204     
01205     /* Create a new class in the language model for each classdef. */
01206     hl = hash_table_tolist(classes, NULL);
01207     for (gn = hl; gn; gn = gnode_next(gn)) {
01208         hash_entry_t *he = gnode_ptr(gn);
01209         classdef_t *classdef = he->val;
01210 
01211         if (ngram_model_add_class(model, he->key, 1.0,
01212                                   classdef->words,
01213                                   classdef->weights,
01214                                   classdef->n_words) < 0)
01215             goto error_out;
01216     }
01217     rv = 0;
01218 
01219 error_out:
01220     for (gn = hl; gn; gn = gnode_next(gn)) {
01221         hash_entry_t *he = gnode_ptr(gn);
01222         ckd_free((char *)he->key);
01223         classdef_free(he->val);
01224     }
01225     glist_free(hl);
01226     hash_table_free(classes);
01227     return rv;
01228 }
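/*
 * Usage sketch (illustrative): load class definitions in the format
 * above into an existing model, assuming a file "names.lmclass" exists.
 *
 *     if (ngram_model_read_classdef(lm, "names.lmclass") < 0)
 *         E_ERROR("Failed to read class definitions\n");
 */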
