• Main Page
  • Related Pages
  • Data Structures
  • Files
  • File List
  • Globals

src/sphinx_fe/sphinx_fe.c

00001 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
00002 /* ====================================================================
00003  * Copyright (c) 1996-2004 Carnegie Mellon University.  All rights 
00004  * reserved.
00005  *
00006  * Redistribution and use in source and binary forms, with or without
00007  * modification, are permitted provided that the following conditions
00008  * are met:
00009  *
00010  * 1. Redistributions of source code must retain the above copyright
00011  *    notice, this list of conditions and the following disclaimer. 
00012  *
00013  * 2. Redistributions in binary form must reproduce the above copyright
00014  *    notice, this list of conditions and the following disclaimer in
00015  *    the documentation and/or other materials provided with the
00016  *    distribution.
00017  *
00018  * This work was supported in part by funding from the Defense Advanced 
00019  * Research Projects Agency and the National Science Foundation of the 
00020  * United States of America, and the CMU Sphinx Speech Consortium.
00021  *
00022  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND 
00023  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 
00024  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
00025  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
00026  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
00027  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 
00028  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 
00029  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 
00030  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
00031  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
00032  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00033  *
00034  * ====================================================================
00035  *
00036  */
00037 #include <stdio.h>
00038 #include <stdlib.h>
00039 #include <string.h>
00040 #include <time.h>
00041 #include <assert.h>
00042 
00043 #ifdef HAVE_CONFIG_H
00044 #include <config.h>
00045 #endif
00046 
00047 #include "fe.h"
00048 #include "strfuncs.h"
00049 #include "pio.h"
00050 #include "filename.h"
00051 #include "cmd_ln.h"
00052 #include "err.h"
00053 #include "ckd_alloc.h"
00054 #include "byteorder.h"
00055 
00056 #include "sphinx_wave2feat.h"
00057 #include "cmd_ln_defn.h"
00058 
/**
 * Descriptor for one supported kind of input file: a name plus the
 * pair of callbacks used to recognise and to convert it.
 */
typedef struct audio_type_s {
    char const *name; /* Type name; entries of types[] are looked up as boolean command-line flags by this name. */
    int (*detect)(sphinx_wave2feat_t *wtf, char const *infile); /* Probe/open infile: TRUE, FALSE, or -1 on error. */
    int (*decode)(sphinx_wave2feat_t *wtf); /* Convert the opened input; returns the number of values written, or -1. */
} audio_type_t;
00064 
/**
 * Descriptor for one supported output format: a name (matched against
 * the "-ofmt" option) plus the callbacks that write it.
 */
typedef struct output_type_s {
    char const *name; /* Format name compared with the "-ofmt" option value. */
    int (*output_header)(sphinx_wave2feat_t *wtf, int nfloat); /* Write the file header; may be NULL (no header). Returns 0, or -1 on error. */
    int (*output_frames)(sphinx_wave2feat_t *wtf, mfcc_t **frames, int nfr); /* Write nfr frames; returns the number of values written, or -1. */
} output_type_t;
00070 
/**
 * State of one waveform-to-feature converter.
 */
struct sphinx_wave2feat_s {
    int refcount;     /* Reference count (see sphinx_wave2feat_retain/free). */
    cmd_ln_t *config; /* Retained command-line configuration. */
    fe_t *fe;         /* Front-end object created from config. */
    char *infile;     /* Copy of the current input file name. */
    char *outfile;    /* Copy of the current output file name. */
    FILE *infh;       /* Open input file, positioned at the start of the data. */
    FILE *outfh;      /* Open output file. */
    short *audio;     /* Buffer of blocksize input samples. */
    mfcc_t **feat;    /* Frame buffer: featsize rows of max(veclen, in_veclen) values. */
    int blocksize;    /* Number of samples read from the input per block. */
    int featsize;     /* Number of frames that fit in feat. */
    int veclen;       /* Length of each output vector. */
    int in_veclen;    /* Length of each input vector (feature-file input only). */
    int byteswap;     /* Non-zero when "-mach_endian" and "-input_endian" differ. */
    output_type_t const *ot; /* Selected output format. */
};
00088 
/**
 * Fixed-layout RIFF/WAVE header, read from the start of the file with a
 * single fread() by detect_riff().
 *
 * NOTE(review): reading the struct directly assumes the compiler inserts
 * no padding and that the "fmt " and "data" chunks appear exactly in this
 * order with no extra chunks -- confirm for the files being processed.
 */
typedef struct RIFFHeader{
    char rifftag[4];      /* "RIFF" string */
    int32 TotalLength;      /* Total length */
    char wavefmttag[8];   /* "WAVEfmt " string (note space after 't') */
    int32 RemainingLength;  /* Remaining length */
    int16 data_format;    /* data format tag, 1 = PCM */
    int16 numchannels;    /* Number of channels in file */
    int32 SamplingFreq;     /* Sampling frequency */
    int32 BytesPerSec;      /* Average bytes/sec */
    int16 BlockAlign;     /* Block align */
    int16 BitsPerSample;  /* 8 or 16 bit */
    char datatag[4];      /* "data" string */
    int32 datalength;       /* Raw data length */
} MSWAV_hdr;
00104 
00110 static int
00111 detect_riff(sphinx_wave2feat_t *wtf, char const *infile)
00112 {
00113     FILE *fh;
00114     MSWAV_hdr hdr;
00115 
00116     if ((fh = fopen(infile, "rb")) == NULL) {
00117         E_ERROR_SYSTEM("Failed to open %s", infile);
00118         return -1;
00119     }
00120     if (fread(&hdr, sizeof(hdr), 1, fh) != 1) {
00121         E_ERROR_SYSTEM("Failed to read RIFF header");
00122         fclose(fh);
00123         return -1;
00124     }
00125     /* Make sure it is actually a RIFF file. */
00126     if (0 != memcmp(hdr.rifftag, "RIFF", 4))
00127         return FALSE;
00128 
00129     /* Get relevant information. */
00130     cmd_ln_set_int32_r(wtf->config, "-nchans", hdr.numchannels);
00131     cmd_ln_set_float32_r(wtf->config, "-samprate", hdr.SamplingFreq);
00132     wtf->infile = ckd_salloc(infile);
00133     wtf->infh = fh;
00134 
00135     return TRUE;
00136 }
00137 
00143 static int
00144 detect_nist(sphinx_wave2feat_t *wtf, char const *infile)
00145 {
00146     char nist[7];
00147     lineiter_t *li;
00148     FILE *fh;
00149 
00150     if ((fh = fopen(infile, "rb")) == NULL) {
00151         E_ERROR_SYSTEM("Failed to open %s", infile);
00152         return -1;
00153     }
00154     if (fread(&nist, 1, 7, fh) != 7) {
00155         E_ERROR_SYSTEM("Failed to read NIST header");
00156         fclose(fh);
00157         return -1;
00158     }
00159     /* Is this actually a NIST file? */
00160     if (0 != strncmp(nist, "NIST_1A", 7)) {
00161         fclose(fh);
00162         return FALSE;
00163     }
00164     /* Rewind, parse lines. */
00165     fseek(fh, 0, SEEK_SET);
00166     for (li = lineiter_start(fh); li; li = lineiter_next(li)) {
00167         char **words;
00168         int nword;
00169 
00170         string_trim(li->buf, STRING_BOTH);
00171         if (strlen(li->buf) == 0)
00172             break;
00173         nword = str2words(li->buf, NULL, 0);
00174         if (nword != 3)
00175             continue;
00176         words = ckd_calloc(nword, sizeof(*words));
00177         str2words(li->buf, words, nword);
00178         if (0 == strcmp(words[0], "sample_rate")) {
00179             cmd_ln_set_float32_r(wtf->config, "-samprate", atof_c(words[2]));
00180         }
00181         if (0 == strcmp(words[0], "channel_count")) {
00182             cmd_ln_set_float32_r(wtf->config, "-nchans", atoi(words[2]));
00183         }
00184         if (0 == strcmp(words[0], "sample_byte_format")) {
00185             cmd_ln_set_str_r(wtf->config, "-input_endian",
00186                              (0 == strcmp(words[2], "10")) ? "big" : "little");
00187         }
00188         /* FIMXE: Warn about shorten-compressed data. */
00189         ckd_free(words);
00190     }
00191     fseek(fh, 1024, SEEK_SET);
00192     wtf->infile = ckd_salloc(infile);
00193     wtf->infh = fh;
00194     return TRUE;
00195 }
00196 
00197 
00204 static int
00205 detect_raw(sphinx_wave2feat_t *wtf, char const *infile)
00206 {
00207     FILE *fh;
00208 
00209     if ((fh = fopen(infile, "rb")) == NULL) {
00210         E_ERROR_SYSTEM("Failed to open %s", infile);
00211         return -1;
00212     }
00213     wtf->infile = ckd_salloc(infile);
00214     wtf->infh = fh;
00215     return TRUE;
00216 }
00217 
00224 static int
00225 detect_sphinx_mfc(sphinx_wave2feat_t *wtf, char const *infile)
00226 {
00227     FILE *fh;
00228     int32 len;
00229     long flen;
00230 
00231     if ((fh = fopen(infile, "rb")) == NULL) {
00232         E_ERROR_SYSTEM("Failed to open %s", infile);
00233         return -1;
00234     }
00235     if (fread(&len, 4, 1, fh) != 1) {
00236         E_ERROR_SYSTEM("Failed to read header from %s\n", infile);
00237         return -1;
00238     }
00239     fseek(fh, 0, SEEK_END);
00240     flen = ftell(fh);
00241 
00242     /* figure out whether to byteswap */
00243     flen = (flen / 4) - 1;
00244     if (flen != len) {
00245         /* First make sure this is an endianness problem, otherwise fail. */
00246         SWAP_INT32(&len);
00247         if (flen != len) {
00248             SWAP_INT32(&len);
00249             E_ERROR("Mismatch in header/file lengths: 0x%08x vs 0x%08x\n",
00250                     len, flen);
00251             return -1;
00252         }
00253         /* Set the input endianness to the opposite of the machine endianness... */
00254         cmd_ln_set_str_r(wtf->config, "-input_endian",
00255                          (0 == strcmp("big", cmd_ln_str_r(wtf->config, "-mach_endian"))
00256                           ? "little" : "big"));
00257     }
00258     
00259     fseek(fh, 4, SEEK_SET);
00260     wtf->infile = ckd_salloc(infile);
00261     wtf->infh = fh;
00262     if (cmd_ln_boolean_r(wtf->config, "-spec2cep")) {
00263         wtf->in_veclen = cmd_ln_int32_r(wtf->config, "-nfilt");
00264     }
00265     else if (cmd_ln_boolean_r(wtf->config, "-cep2spec")) {
00266         wtf->in_veclen = cmd_ln_int32_r(wtf->config, "-ncep");
00267         wtf->veclen = cmd_ln_int32_r(wtf->config, "-nfilt");
00268     }
00269     else {
00270         /* Should not happen. */
00271         E_ERROR("Sphinx MFCC file reading requested but -spec2cep/-cep2spec not given\n");
00272         assert(FALSE);
00273     }
00274             
00275     return TRUE;
00276 }
00277 
00282 static int
00283 decode_pcm(sphinx_wave2feat_t *wtf)
00284 {
00285     size_t nsamp;
00286     int32 nfr;
00287     int nfloat, n;
00288 
00289     fe_start_utt(wtf->fe);
00290     nfloat = 0;
00291     while ((nsamp = fread(wtf->audio, 2, wtf->blocksize, wtf->infh)) != 0) {
00292         size_t nvec;
00293         int16 const *inspeech;
00294 
00295         /* Byteswap stuff here if necessary. */
00296         if (wtf->byteswap) {
00297             for (n = 0; n < nsamp; ++n)
00298                 SWAP_INT16(wtf->audio + n);
00299         }
00300             
00301         inspeech = wtf->audio;
00302         nvec = wtf->featsize;
00303         /* Consume all samples. */
00304         while (nsamp) {
00305             nfr = nvec;
00306             fe_process_frames(wtf->fe, &inspeech, &nsamp, wtf->feat, &nfr);
00307             if (nfr) {
00308                 if ((n = (*wtf->ot->output_frames)(wtf, wtf->feat, nfr)) < 0)
00309                     return -1;
00310                 nfloat += n;
00311             }
00312         }
00313         inspeech = wtf->audio;
00314     }
00315     /* Now process any leftover audio frames. */
00316     fe_end_utt(wtf->fe, wtf->feat[0], &nfr);
00317     if (nfr) {
00318         if ((n = (*wtf->ot->output_frames)(wtf, wtf->feat, nfr)) < 0)
00319             return -1;
00320         nfloat += n;
00321     }
00322 
00323     fclose(wtf->infh);
00324     wtf->infh = NULL;
00325     return nfloat;
00326 }
00327 
00332 static int
00333 decode_sphinx_mfc(sphinx_wave2feat_t *wtf)
00334 {
00335     int nfloat = 0, n;
00336     int featsize = wtf->featsize;
00337 
00338     /* If the input vector length is less than the output length, we
00339      * need to do this one frame at a time, because there's empty
00340      * space at the end of each vector in wtf->feat. */
00341     if (wtf->in_veclen < wtf->veclen)
00342         featsize = 1;
00343     while ((n = fread(wtf->feat[0], sizeof(**wtf->feat),
00344                       featsize * wtf->in_veclen, wtf->infh)) != 0) {
00345         int i, nfr = n / wtf->in_veclen;
00346         if (n % wtf->in_veclen) {
00347             E_ERROR("Size of file %d not a multiple of veclen %d\n",
00348                     n, wtf->in_veclen);
00349             return -1;
00350         }
00351         /* Byteswap stuff here if necessary. */
00352         if (wtf->byteswap) {
00353             for (i = 0; i < n; ++i)
00354                 SWAP_FLOAT32(wtf->feat[0] + i);
00355         }
00356         fe_float_to_mfcc(wtf->fe, (float32 **)wtf->feat, wtf->feat, nfr);
00357         for (i = 0; i < nfr; ++i) {
00358             if (cmd_ln_boolean_r(wtf->config, "-spec2cep")) {
00359                 if (0 == strcmp(cmd_ln_str_r(wtf->config, "-transform"), "legacy"))
00360                     fe_logspec_to_mfcc(wtf->fe, wtf->feat[i], wtf->feat[i]);
00361                 else
00362                     fe_logspec_dct2(wtf->fe, wtf->feat[i], wtf->feat[i]);
00363             }
00364             else if (cmd_ln_boolean_r(wtf->config, "-cep2spec")) {
00365                 fe_mfcc_dct3(wtf->fe, wtf->feat[i], wtf->feat[i]);
00366             }
00367         }
00368         if ((n = (*wtf->ot->output_frames)(wtf, wtf->feat, nfr)) < 0)
00369             return -1;
00370         nfloat += n;
00371     }
00372 
00373     fclose(wtf->infh);
00374     wtf->infh = NULL;
00375     return nfloat;
00376 }
00377 
00378 static const audio_type_t types[] = {
00379     { "-mswav", &detect_riff, &decode_pcm },
00380     { "-nist", &detect_nist, &decode_pcm },
00381 #ifdef HAVE_SNDFILE
00382     { "-sndfile", &detect_sndfile, &decode_sndfile },
00383 #endif
00384     { "-raw", &detect_raw, &decode_pcm }
00385 };
00386 static const int ntypes = sizeof(types)/sizeof(types[0]);
00387 static const audio_type_t mfcc_type = {
00388     "sphinx_mfc", &detect_sphinx_mfc, &decode_sphinx_mfc
00389 };
00390 
00396 static int
00397 output_header_sphinx(sphinx_wave2feat_t *wtf, int32 nfloat)
00398 {
00399     if (fwrite(&nfloat, 4, 1, wtf->outfh) != 1) {
00400         E_ERROR_SYSTEM("Failed to write to %s", wtf->outfile);
00401         return -1;
00402     }
00403     return 0;
00404 }
00405 
00411 static int
00412 output_frames_sphinx(sphinx_wave2feat_t *wtf, mfcc_t **frames, int nfr)
00413 {
00414     int i, nfloat = 0;
00415 
00416     fe_mfcc_to_float(wtf->fe, frames, (float32 **)frames, nfr);
00417     for (i = 0; i < nfr; ++i) {
00418         if (fwrite(frames[i], sizeof(float32), wtf->veclen, wtf->outfh) != wtf->veclen) {
00419             E_ERROR_SYSTEM("Writing %d values to %s failed",
00420                            wtf->veclen, wtf->outfile);
00421             return -1;
00422         }
00423         nfloat += wtf->veclen;
00424     }
00425     return nfloat;
00426 }
00427 
/* Base parameter kinds for the HTK header's parameter-kind field.
 * output_header_htk() uses FBANK and MFCC; the other values are listed
 * for reference only and are not used in the code visible here. */
typedef enum htk_feature_kind_e {
    WAVEFORM = 0,   /* PCM audio (rarely used) */
    LPC = 1,        /* LPC filter coefficients */
    LPCREFC = 2,    /* LPC reflection coefficients */
    LPCEPSTRA = 3,  /* LPC-based cepstral coefficients */
    LPCDELCEP = 4,  /* LPCC plus deltas */
    IREFC = 5,      /* 16-bit integer LPC reflection coefficients */
    MFCC = 6,       /* MFCCs */
    FBANK = 7,      /* Log mel spectrum */
    MELSPEC = 8,    /* Linear mel spectrum */
    USER = 9,       /* User defined */
    DISCRETE = 10,  /* Vector quantized data */
    PLP = 11        /* PLP coefficients */
} htk_feature_kind_t;
00442 
/* Qualifier bits OR-ed into the HTK parameter kind (output_header_htk()
 * combines MFCC with _O).  The values are octal literals.
 * NOTE(review): identifiers beginning with an underscore followed by an
 * upper-case letter are reserved in C; renaming would need a matching
 * change in output_header_htk(). */
typedef enum htk_feature_flag_e {
    _E = 0000100, /* has energy */
    _N = 0000200, /* absolute energy suppressed */
    _D = 0000400, /* has delta coefficients */
    _A = 0001000, /* has acceleration (delta-delta) coefficients */
    _C = 0002000, /* is compressed */
    _Z = 0004000, /* has zero mean static coefficients (i.e. CMN) */
    _K = 0010000, /* has CRC checksum */
    _O = 0020000, /* has 0th cepstral coefficient */
    _V = 0040000, /* has VQ data */
    _T = 0100000  /* has third differential coefficients */
} htk_feature_flag_t;
00455 
00459 static int
00460 output_header_htk(sphinx_wave2feat_t *wtf, int32 nfloat)
00461 {
00462     int32 samp_period;
00463     int16 samp_size;
00464     int16 param_kind;
00465     int swap = FALSE;
00466 
00467     /* HTK files are big-endian. */
00468     if (0 == strcmp("little", cmd_ln_str_r(wtf->config, "-mach_endian")))
00469         swap = TRUE;
00470     /* Same file size thing as in Sphinx files (I think) */
00471     if (swap) SWAP_INT32(&nfloat);
00472     if (fwrite(&nfloat, 4, 1, wtf->outfh) != 1)
00473         return -1;
00474     /* Sample period in 100ns units. */
00475     samp_period = (int32)(1e+7 / cmd_ln_float32_r(wtf->config, "-frate"));
00476     if (swap) SWAP_INT32(&samp_period);
00477     if (fwrite(&samp_period, 4, 1, wtf->outfh) != 1)
00478         return -1;
00479     /* Sample size - veclen * sizeof each sample. */
00480     samp_size = wtf->veclen * 4;
00481     if (swap) SWAP_INT16(&samp_size);
00482     if (fwrite(&samp_size, 2, 1, wtf->outfh) != 1)
00483         return -1;
00484     /* Format and flags. */
00485     if (cmd_ln_boolean_r(wtf->config, "-logspec")
00486         || cmd_ln_boolean_r(wtf->config, "-cep2spec"))
00487         param_kind = FBANK; /* log mel-filter bank outputs */
00488     else
00489         param_kind = MFCC | _O; /* MFCC + CEP0 (note reordering...) */
00490     if (swap) SWAP_INT16(&param_kind);
00491     if (fwrite(&param_kind, 2, 1, wtf->outfh) != 1)
00492         return -1;
00493 
00494     return 0;
00495 }
00496 
00500 static int
00501 output_frames_htk(sphinx_wave2feat_t *wtf, mfcc_t **frames, int nfr)
00502 {
00503     int i, j, swap, htk_reorder, nfloat = 0;
00504 
00505     fe_mfcc_to_float(wtf->fe, frames, (float32 **)frames, nfr);
00506     /* This is possibly inefficient, but probably not a big deal. */
00507     swap = (0 == strcmp("little", cmd_ln_str_r(wtf->config, "-mach_endian")));
00508     htk_reorder = (0 == strcmp("htk", wtf->ot->name)
00509                    && !(cmd_ln_boolean_r(wtf->config, "-logspec")
00510                         || cmd_ln_boolean_r(wtf->config, "-cep2spec")));
00511     for (i = 0; i < nfr; ++i) {
00512         if (htk_reorder) {
00513             mfcc_t c0 = frames[i][0];
00514             memmove(frames[i] + 1, frames[i], (wtf->veclen - 1) * 4);
00515             frames[i][wtf->veclen - 1] = c0;
00516         }
00517         if (swap)
00518             for (j = 0; j < wtf->veclen; ++j)
00519                 SWAP_FLOAT32(frames[i] + j);
00520         if (fwrite(frames[i], sizeof(float32), wtf->veclen, wtf->outfh) != wtf->veclen) {
00521             E_ERROR_SYSTEM("Writing %d values to %s failed",
00522                            wtf->veclen, wtf->outfile);
00523             return -1;
00524         }
00525         nfloat += wtf->veclen;
00526     }
00527     return nfloat;
00528 }
00529 
00533 static int
00534 output_frames_text(sphinx_wave2feat_t *wtf, mfcc_t **frames, int nfr)
00535 {
00536     int i, j, nfloat = 0;
00537 
00538     fe_mfcc_to_float(wtf->fe, frames, (float32 **)frames, nfr);
00539     for (i = 0; i < nfr; ++i) {
00540         for (j = 0; j < wtf->veclen; ++j) {
00541             fprintf(wtf->outfh, "%.5g", frames[i][j]);
00542             if (j == wtf->veclen - 1)
00543                 fprintf(wtf->outfh, "\n");
00544             else
00545                 fprintf(wtf->outfh, " ");
00546         }
00547         nfloat += wtf->veclen;
00548     }
00549     return nfloat;
00550 }
00551 
00552 static const output_type_t outtypes[] = {
00553     { "sphinx", &output_header_sphinx, &output_frames_sphinx },
00554     { "htk", &output_header_htk, &output_frames_htk },
00555     { "text", NULL, &output_frames_text }
00556 };
00557 static const int nouttypes = sizeof(outtypes)/sizeof(outtypes[0]);
00558 
00559 sphinx_wave2feat_t *
00560 sphinx_wave2feat_init(cmd_ln_t *config)
00561 {
00562     sphinx_wave2feat_t *wtf;
00563     int i;
00564 
00565     wtf = ckd_calloc(1, sizeof(*wtf));
00566     wtf->refcount = 1;
00567     wtf->config = cmd_ln_retain(config);
00568     wtf->fe = fe_init_auto_r(wtf->config);
00569     wtf->ot = outtypes; /* Default (sphinx) type. */
00570     for (i = 0; i < nouttypes; ++i) {
00571         output_type_t const *otype = &outtypes[i];
00572         if (0 == strcmp(cmd_ln_str_r(config, "-ofmt"), otype->name)) {
00573             wtf->ot = otype;
00574             break;
00575         }
00576     }
00577     if (i == nouttypes) {
00578         E_ERROR("Unknown output type: '%s'\n",
00579                 cmd_ln_str_r(config, "-ofmt"));
00580         sphinx_wave2feat_free(wtf);
00581         return NULL;
00582     }
00583 
00584     return wtf;
00585 }
00586 
00587 int
00588 sphinx_wave2feat_free(sphinx_wave2feat_t *wtf)
00589 {
00590     if (wtf == NULL)
00591         return 0;
00592     if (--wtf->refcount > 0)
00593         return wtf->refcount;
00594 
00595     ckd_free(wtf->audio);
00596     ckd_free_2d(wtf->feat);
00597     ckd_free(wtf->infile);
00598     ckd_free(wtf->outfile);
00599     if (wtf->infh)
00600         fclose(wtf->infh);
00601     if (wtf->outfh)
00602         fclose(wtf->outfh);
00603     cmd_ln_free_r(wtf->config);
00604     fe_free(wtf->fe);
00605     ckd_free(wtf);
00606 
00607     return 0;
00608 }
00609 
00610 sphinx_wave2feat_t *
00611 sphinx_wave2feat_retain(sphinx_wave2feat_t *wtf)
00612 {
00613     ++wtf->refcount;
00614     return wtf;
00615 }
00616 
00617 static audio_type_t const *
00618 detect_audio_type(sphinx_wave2feat_t *wtf, char const *infile)
00619 {
00620     audio_type_t const *atype;
00621     int i;
00622 
00623     /* Special case audio type for Sphinx MFCC inputs. */
00624     if (cmd_ln_boolean_r(wtf->config, "-spec2cep")
00625         || cmd_ln_boolean_r(wtf->config, "-cep2spec")) {
00626         int rv = mfcc_type.detect(wtf, infile);
00627         if (rv == -1)
00628             return NULL;
00629         return &mfcc_type;
00630     }
00631 
00632     /* Try to use the type of infile given on the command line. */
00633     for (i = 0; i < ntypes; ++i) {
00634         int rv;
00635         atype = &types[i];
00636         if (cmd_ln_boolean_r(wtf->config, atype->name)) {
00637             rv = (*atype->detect)(wtf, infile);
00638             if (rv == -1)
00639                 return NULL;
00640             break;
00641         }
00642     }
00643     if (i == ntypes) {
00644         /* Detect file type of infile and get parameters. */
00645         for (i = 0; i < ntypes; ++i) {
00646             int rv;
00647             atype = &types[i];
00648             rv = (*atype->detect)(wtf, infile);
00649             if (rv == -1)
00650                 return NULL;
00651             else if (rv == TRUE)
00652             break;
00653         }
00654         if (i == ntypes)
00655             atype = NULL;
00656     }
00657     return atype;
00658 }
00659 
00660 int
00661 sphinx_wave2feat_convert_file(sphinx_wave2feat_t *wtf,
00662                               char const *infile, char const *outfile)
00663 {
00664     int minfft, nfft, nfloat, veclen;
00665     audio_type_t const *atype;
00666     int fshift, fsize;
00667 
00668     if (cmd_ln_boolean_r(wtf->config, "-verbose"))
00669         E_INFO("Converting %s to %s\n", infile, outfile);
00670 
00671     /* Detect input file type. */
00672     if ((atype = detect_audio_type(wtf, infile)) == NULL)
00673         return -1;
00674 
00675     /* Determine whether to byteswap input. */
00676     wtf->byteswap = strcmp(cmd_ln_str_r(wtf->config, "-mach_endian"),
00677                            cmd_ln_str_r(wtf->config, "-input_endian"));
00678 
00679     /* Make sure the FFT size is sufficiently large. */
00680     minfft = (int)(cmd_ln_float32_r(wtf->config, "-samprate")
00681                    * cmd_ln_float32_r(wtf->config, "-wlen") + 0.5);
00682     for (nfft = 1; nfft < minfft; nfft <<= 1)
00683         ;
00684     if (nfft > cmd_ln_int32_r(wtf->config, "-nfft")) {
00685         E_WARN("Value of -nfft = %d is too small, increasing to %d\n",
00686                cmd_ln_int32_r(wtf->config, "-nfft"), nfft);
00687         cmd_ln_set_int32_r(wtf->config, "-nfft", nfft);
00688         fe_free(wtf->fe);
00689         wtf->fe = fe_init_auto_r(wtf->config);
00690     }
00691 
00692     /* Get the output frame size (if not already set). */
00693     if (wtf->veclen == 0)
00694         wtf->veclen = fe_get_output_size(wtf->fe);
00695 
00696     /* Set up the input and output buffers. */
00697     fe_get_input_size(wtf->fe, &fshift, &fsize);
00698     /* Want to get at least a whole frame plus shift in here. */
00699     wtf->blocksize = cmd_ln_int32_r(wtf->config, "-blocksize");
00700     if (wtf->blocksize < fsize + fshift) {
00701         E_INFO("Block size of %d too small, increasing to %d\n",
00702                wtf->blocksize, fsize + fshift);
00703         wtf->blocksize = fsize + fshift;
00704     }
00705     wtf->audio = ckd_calloc(wtf->blocksize, sizeof(*wtf->audio));
00706     wtf->featsize = (wtf->blocksize - fsize) / fshift;
00707 
00708     /* Use the maximum of the input and output frame sizes to allocate this. */
00709     veclen = wtf->veclen;
00710     if (wtf->in_veclen > veclen) veclen = wtf->in_veclen;
00711     wtf->feat = ckd_calloc_2d(wtf->featsize, veclen, sizeof(**wtf->feat));
00712 
00713     /* Let's go! */
00714     if ((wtf->outfh = fopen(outfile, "wb")) == NULL) {
00715         E_ERROR_SYSTEM("Failed to open %s for writing", outfile);
00716         return -1;
00717     }
00718     /* Write an empty header, which we'll fill in later. */
00719     if (wtf->ot->output_header &&
00720         (*wtf->ot->output_header)(wtf, 0) < 0) {
00721         E_ERROR_SYSTEM("Failed to write empty header to %s\n", outfile);
00722         goto error_out;
00723     }
00724     wtf->outfile = ckd_salloc(outfile);
00725 
00726     if ((nfloat = (*atype->decode)(wtf)) < 0)
00727         return -1;
00728 
00729     if (wtf->ot->output_header) {
00730         if (fseek(wtf->outfh, 0, SEEK_SET) < 0) {
00731             E_ERROR_SYSTEM("Failed to seek to beginning of %s\n", outfile);
00732             goto error_out;
00733         }
00734         if ((*wtf->ot->output_header)(wtf, nfloat) < 0) {
00735             E_ERROR_SYSTEM("Failed to write header to %s\n", outfile);
00736             goto error_out;
00737         }
00738     }
00739     fclose(wtf->outfh);
00740     wtf->outfh = NULL;
00741 
00742     return 0;
00743 error_out:
00744     if (wtf->outfh) {
00745         fclose(wtf->outfh);
00746         wtf->outfh = NULL;
00747     }
00748     return -1;
00749 }
00750 
00751 void
00752 build_filenames(cmd_ln_t *config, char const *basename,
00753                 char **out_infile, char **out_outfile)
00754 {
00755     char const *di, *do_, *ei, *eo;
00756 
00757     di = cmd_ln_str_r(config, "-di");
00758     do_ = cmd_ln_str_r(config, "-do");
00759     ei = cmd_ln_str_r(config, "-ei");
00760     eo = cmd_ln_str_r(config, "-eo");
00761 
00762     *out_infile = string_join(di ? di : "",
00763                               di ? "/" : "",
00764                               basename,
00765                               ei ? "." : "",
00766                               ei ? ei : "",
00767                               NULL);
00768     *out_outfile = string_join(do_ ? do_ : "",
00769                                do_ ? "/" : "",
00770                                basename,
00771                                eo ? "." : "",
00772                                eo ? eo : "",
00773                               NULL);
00774     /* Build output directory structure if possible/requested (it is
00775      * by default). */
00776     if (cmd_ln_boolean_r(config, "-build_outdirs")) {
00777         char *dirname = ckd_salloc(*out_outfile);
00778         path2dirname(*out_outfile, dirname);
00779         build_directory(dirname);
00780         ckd_free(dirname);
00781     }
00782 }
00783 
00784 static int
00785 run_control_file(sphinx_wave2feat_t *wtf, char const *ctlfile)
00786 {
00787     lineiter_t *li;
00788     FILE *ctlfh;
00789     int nskip, runlen, npart;
00790 
00791     if ((ctlfh = fopen(ctlfile, "r")) == NULL) {
00792         E_ERROR_SYSTEM("Failed to open control file %s", ctlfile);
00793         return -1;
00794     }
00795     nskip = cmd_ln_int32_r(wtf->config, "-nskip");
00796     runlen = cmd_ln_int32_r(wtf->config, "-runlen");
00797     if ((npart = cmd_ln_int32_r(wtf->config, "-npart"))) {
00798         /* Count lines in the file. */
00799         int nlines, partlen, part;
00800         part = cmd_ln_int32_r(wtf->config, "-part");
00801         for (nlines = 0, li = lineiter_start(ctlfh); li; li = lineiter_next(li))
00802             ++nlines;
00803         fseek(ctlfh, 0, SEEK_SET);
00804         partlen = nlines / npart;
00805         nskip = partlen * (part - 1);
00806         if (part == npart)
00807             runlen = -1;
00808         else
00809             runlen = partlen;
00810     }
00811     if (runlen != -1)
00812         E_INFO("Processing %d utterances at position %d\n", runlen, nskip);
00813     else
00814         E_INFO("Processing all remaining utterances at position %d\n", nskip);
00815     for (li = lineiter_start(ctlfh); li; li = lineiter_next(li)) {
00816         char *infile, *outfile;
00817         int rv;
00818 
00819         if (nskip-- > 0)
00820             continue;
00821         if (runlen == 0)
00822             break;
00823         --runlen;
00824 
00825         string_trim(li->buf, STRING_BOTH);
00826         build_filenames(wtf->config, li->buf, &infile, &outfile);
00827         rv = sphinx_wave2feat_convert_file(wtf, infile, outfile);
00828         ckd_free(infile);
00829         ckd_free(outfile);
00830         if (rv != 0) {
00831             lineiter_free(li);
00832             fclose(ctlfh);
00833             return rv;
00834         }
00835     }
00836     return 0;
00837 }
00838 
00839 int
00840 main(int argc, char *argv[])
00841 {
00842     sphinx_wave2feat_t *wtf;
00843     cmd_ln_t *config;
00844     int rv;
00845 
00846     /* Initialize config. */
00847     if ((config = cmd_ln_parse_r(NULL, defn, argc, argv, TRUE)) == NULL)
00848         return 2;
00849 
00850     /* Parse an argument file if there's one in there. */
00851     if (cmd_ln_str_r(config, "-argfile"))
00852         config = cmd_ln_parse_file_r(config, defn,
00853                                      cmd_ln_str_r(config, "-argfile"), FALSE);
00854     if (config == NULL) {
00855         E_ERROR("Command line parsing failed\n");
00856         return 1;
00857     }
00858     if ((wtf = sphinx_wave2feat_init(config)) == NULL) {
00859         E_ERROR("Failed to initialize wave2feat object\n");
00860         return 1;
00861     }
00862 
00863     /* If there's a control file run through it, otherwise we will do
00864      * a single file (which is what run_control_file will do
00865      * internally too) */
00866     if (cmd_ln_str_r(config, "-c"))
00867         rv = run_control_file(wtf, cmd_ln_str_r(config, "-c"));
00868     else
00869         rv = sphinx_wave2feat_convert_file(wtf, cmd_ln_str_r(config, "-i"),
00870                                            cmd_ln_str_r(config, "-o"));
00871 
00872     sphinx_wave2feat_free(wtf);
00873     return rv;
00874 }

Generated on Tue Aug 17 2010 for SphinxBase by  doxygen 1.7.1