SphinxBase 5prealpha
sphinx_lm_eval.c
Go to the documentation of this file.
1/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
2/* ====================================================================
3 * Copyright (c) 2008 Carnegie Mellon University. All rights
4 * reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 *
18 * This work was supported in part by funding from the Defense Advanced
19 * Research Projects Agency and the National Science Foundation of the
20 * United States of America, and the CMU Sphinx Speech Consortium.
21 *
22 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
23 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
24 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
26 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 *
34 * ====================================================================
35 *
36 */
#include <sphinxbase/logmath.h>
#include <sphinxbase/ngram_model.h>
#include <sphinxbase/cmd_ln.h>
#include <sphinxbase/ckd_alloc.h>
#include <sphinxbase/err.h>
#include <sphinxbase/pio.h>
#include <sphinxbase/strfuncs.h>

#include <stdio.h>
#include <string.h>
#include <math.h>
51#include <math.h>
52
53static const arg_t defn[] = {
54 { "-help",
56 "no",
57 "Shows the usage of the tool"},
58
59 { "-logbase",
61 "1.0001",
62 "Base in which all log-likelihoods calculated" },
63
64 { "-lm",
66 NULL,
67 "Language model file"},
68
69 { "-probdef",
71 NULL,
72 "Probability definition file for classes in LM"},
73
74 { "-lmctlfn",
76 NULL,
77 "Control file listing a set of language models"},
78
79 { "-lmname",
81 NULL,
82 "Name of language model in -lmctlfn to use for all utterances" },
83
84 { "-lsn",
86 NULL,
87 "Transcription file to evaluate"},
88
89 { "-text",
91 "Text string to evaluate"},
92
93 { "-mmap",
95 "no",
96 "Use memory-mapped I/O for reading binary LM files"},
97
98 { "-lw",
100 "1.0",
101 "Language model weight" },
102
103 { "-wip",
105 "1.0",
106 "Word insertion probability" },
107
108 { "-verbose",
110 "no",
111 "Print details of perplexity calculation" },
112
113 /* FIXME: Support -lmstartsym, -lmendsym, -lmctlfn, -ctl_lm */
114 { NULL, 0, NULL, NULL }
115};
116
117static int verbose;
118
119static int
120calc_entropy(ngram_model_t *lm, char **words, int32 n,
121 int32 *out_n_ccs, int32 *out_n_oovs, int32 *out_lm_score)
122{
123 int32 *wids;
124 int32 startwid;
125 int32 i, ch, nccs, noovs, unk;
126
127 if (n == 0)
128 return 0;
129
130 unk = ngram_unknown_wid(lm);
131
132 /* Reverse this array into an array of word IDs. */
133 wids = ckd_calloc(n, sizeof(*wids));
134 for (i = 0; i < n; ++i)
135 wids[n-i-1] = ngram_wid(lm, words[i]);
136 /* Skip <s> as it's a context cue (HACK, this should be configurable). */
137 startwid = ngram_wid(lm, "<s>");
138
139 /* Now evaluate the list of words in reverse using the
140 * remainder of the array as the history. */
141 ch = noovs = nccs = 0;
142 for (i = 0; i < n; ++i) {
143 int32 n_used;
144 int32 prob;
145
146 /* Skip <s> as it's a context cue (HACK, this should be configurable). */
147 if (wids[i] == startwid) {
148 ++nccs;
149 continue;
150 }
151 /* Skip and count OOVs. */
152 if (wids[i] == NGRAM_INVALID_WID || wids[i] == unk) {
153 ++noovs;
154 continue;
155 }
156 /* Sum up information for each N-gram */
157 prob = ngram_ng_score(lm,
158 wids[i], wids + i + 1,
159 n - i - 1, &n_used);
160 if (verbose) {
161 int m;
162 printf("log P(%s|", ngram_word(lm, wids[i]));
163 m = i + ngram_model_get_size(lm) - 1;
164 if (m >= n)
165 m = n - 1;
166 while (m > i) {
167 printf("%s ", ngram_word(lm, wids[m--]));
168 }
169 printf(") = %d\n", prob);
170 }
171 ch -= prob;
172 }
173
174 if (out_n_ccs) *out_n_ccs = nccs;
175 if (out_n_oovs) *out_n_oovs = noovs;
176
177 /* Calculate cross-entropy CH = - 1/N sum log P(W|H) */
178 n -= (nccs + noovs);
179 if (n <= 0)
180 return 0;
181 if (out_lm_score)
182 *out_lm_score = -ch;
183 return ch / n;
184}
185
186static void
187evaluate_file(ngram_model_t *lm, logmath_t *lmath, const char *lsnfn)
188{
189 FILE *fh;
190 lineiter_t *litor;
191 int32 nccs, noovs, nwords, lscr;
192 float64 ch, log_to_log2;;
193
194 if ((fh = fopen(lsnfn, "r")) == NULL)
195 E_FATAL_SYSTEM("failed to open transcript file %s", lsnfn);
196
197 /* We have to keep ch in floating-point to avoid overflows, so
198 * we might as well use log2. */
199 log_to_log2 = log(logmath_get_base(lmath)) / log(2);
200 lscr = nccs = noovs = nwords = 0;
201 ch = 0.0;
202 for (litor = lineiter_start(fh); litor; litor = lineiter_next(litor)) {
203 char **words;
204 int32 n, tmp_ch, tmp_noovs, tmp_nccs, tmp_lscr;
205
206 n = str2words(litor->buf, NULL, 0);
207 if (n < 0)
208 E_FATAL("str2words(line, NULL, 0) = %d, should not happen\n", n);
209 if (n == 0) /* Do nothing! */
210 continue;
211 words = ckd_calloc(n, sizeof(*words));
212 str2words(litor->buf, words, n);
213
214 /* Remove any utterance ID (FIXME: has to be a single "word") */
215 if (words[n-1][0] == '('
216 && words[n-1][strlen(words[n-1])-1] == ')')
217 n = n - 1;
218
219 tmp_ch = calc_entropy(lm, words, n, &tmp_nccs,
220 &tmp_noovs, &tmp_lscr);
221
222 ch += (float64) tmp_ch * (n - tmp_nccs - tmp_noovs) * log_to_log2;
223 nccs += tmp_nccs;
224 noovs += tmp_noovs;
225 lscr += tmp_lscr;
226 nwords += n;
227
228 ckd_free(words);
229 }
230
231 ch /= (nwords - nccs - noovs);
232 printf("cross-entropy: %f bits\n", ch);
233
234 /* Calculate perplexity pplx = exp CH */
235 printf("perplexity: %f\n", pow(2.0, ch));
236 printf("lm score: %d\n", lscr);
237
238 /* Report OOVs and CCs */
239 printf("%d words evaluated\n", nwords);
240 printf("%d OOVs (%.2f%%), %d context cues removed\n",
241 noovs, (double)noovs / nwords * 100, nccs);
242}
243
244static void
245evaluate_string(ngram_model_t *lm, logmath_t *lmath, const char *text)
246{
247 char *textfoo;
248 char **words;
249 int32 n, ch, noovs, nccs, lscr;
250
251 /* Split it into an array of strings. */
252 textfoo = ckd_salloc(text);
253 n = str2words(textfoo, NULL, 0);
254 if (n < 0)
255 E_FATAL("str2words(textfoo, NULL, 0) = %d, should not happen\n", n);
256 if (n == 0) /* Do nothing! */
257 return;
258 words = ckd_calloc(n, sizeof(*words));
259 str2words(textfoo, words, n);
260
261 ch = calc_entropy(lm, words, n, &nccs, &noovs, &lscr);
262
263 printf("input: %s\n", text);
264 printf("cross-entropy: %f bits\n",
265 ch * log(logmath_get_base(lmath)) / log(2));
266
267 /* Calculate perplexity pplx = exp CH */
268 printf("perplexity: %f\n", logmath_exp(lmath, ch));
269 printf("lm score: %d\n", lscr);
270
271 /* Report OOVs and CCs */
272 printf("%d words evaluated\n", n);
273 printf("%d OOVs, %d context cues removed\n",
274 noovs, nccs);
275
276 ckd_free(textfoo);
277 ckd_free(words);
278}
279
280int
281main(int argc, char *argv[])
282{
283 cmd_ln_t *config;
284 ngram_model_t *lm = NULL;
285 logmath_t *lmath;
286 const char *lmfn, *probdefn, *lsnfn, *text;
287
288 if ((config = cmd_ln_parse_r(NULL, defn, argc, argv, TRUE)) == NULL)
289 return 1;
290
291 verbose = cmd_ln_boolean_r(config, "-verbose");
292
293 /* Create log math object. */
294 if ((lmath = logmath_init
295 (cmd_ln_float64_r(config, "-logbase"), 0, 0)) == NULL) {
296 E_FATAL("Failed to initialize log math\n");
297 }
298
299 /* Load the language model. */
300 lmfn = cmd_ln_str_r(config, "-lm");
301 if (lmfn == NULL
302 || (lm = ngram_model_read(config, lmfn,
303 NGRAM_AUTO, lmath)) == NULL) {
304 E_FATAL("Failed to load language model from %s\n",
305 cmd_ln_str_r(config, "-lm"));
306 }
307 if ((probdefn = cmd_ln_str_r(config, "-probdef")) != NULL)
308 ngram_model_read_classdef(lm, probdefn);
310 cmd_ln_float32_r(config, "-lw"),
311 cmd_ln_float32_r(config, "-wip"));
312
313 /* Now evaluate some text. */
314 lsnfn = cmd_ln_str_r(config, "-lsn");
315 text = cmd_ln_str_r(config, "-text");
316 if (lsnfn) {
317 evaluate_file(lm, lmath, lsnfn);
318 }
319 else if (text) {
320 evaluate_string(lm, lmath, text);
321 }
322
323 return 0;
324}
Sphinx's memory allocation/deallocation routines.
SPHINXBASE_EXPORT void ckd_free(void *ptr)
Test and free a 1-D array.
Definition ckd_alloc.c:244
#define ckd_calloc(n, sz)
Macros to simplify the use of above functions.
Definition ckd_alloc.h:248
#define ckd_salloc(ptr)
Macro for ckd_salloc
Definition ckd_alloc.h:264
Command-line and other configuration parsing and handling.
#define cmd_ln_boolean_r(c, n)
Retrieve a boolean value from a command-line object.
Definition cmd_ln.h:334
#define ARG_STRING
String argument (optional).
Definition cmd_ln.h:114
SPHINXBASE_EXPORT char const * cmd_ln_str_r(cmd_ln_t *cmdln, char const *name)
Retrieve a string from a command-line object.
Definition cmd_ln.c:949
#define ARG_BOOLEAN
Boolean (true/false) argument (optional).
Definition cmd_ln.h:118
#define ARG_FLOAT64
Definition cmd_ln.h:152
#define ARG_FLOAT32
Definition cmd_ln.h:148
SPHINXBASE_EXPORT cmd_ln_t * cmd_ln_parse_r(cmd_ln_t *inout_cmdln, arg_t const *defn, int32 argc, char *argv[], int32 strict)
Parse a list of strings into arguments.
Definition cmd_ln.c:556
Implementation of logging routines.
#define E_FATAL(...)
Exit with non-zero status after error message.
Definition err.h:81
#define E_FATAL_SYSTEM(...)
Print error text; Call perror(""); exit(errno);.
Definition err.h:90
Fast integer logarithmic addition operations.
SPHINXBASE_EXPORT float64 logmath_get_base(logmath_t *lmath)
Get the log base.
Definition logmath.c:368
SPHINXBASE_EXPORT logmath_t * logmath_init(float64 base, int shift, int use_table)
Initialize a log math computation table.
Definition logmath.c:62
SPHINXBASE_EXPORT float64 logmath_exp(logmath_t *lmath, int logb_p)
Convert integer log in base B to linear floating point.
Definition logmath.c:456
N-Gram language models.
SPHINXBASE_EXPORT int32 ngram_unknown_wid(ngram_model_t *model)
Get the unknown word ID for a language model.
SPHINXBASE_EXPORT const char * ngram_word(ngram_model_t *model, int32 wid)
Look up word string for numerical word ID.
#define NGRAM_INVALID_WID
Impossible word ID.
Definition ngram_model.h:83
@ NGRAM_AUTO
Determine file type automatically.
Definition ngram_model.h:78
SPHINXBASE_EXPORT int32 ngram_model_get_size(ngram_model_t *model)
Get the order of the N-gram model (i.e. the "N" in "N-gram").
SPHINXBASE_EXPORT int ngram_model_apply_weights(ngram_model_t *model, float32 lw, float32 wip)
Apply a language weight, insertion penalty, and unigram weight to a language model.
SPHINXBASE_EXPORT int32 ngram_ng_score(ngram_model_t *model, int32 wid, int32 *history, int32 n_hist, int32 *n_used)
Quick general N-Gram score lookup.
SPHINXBASE_EXPORT ngram_model_t * ngram_model_read(cmd_ln_t *config, const char *file_name, ngram_file_type_t file_type, logmath_t *lmath)
Read an N-Gram model from a file on disk.
SPHINXBASE_EXPORT int32 ngram_model_read_classdef(ngram_model_t *model, const char *file_name)
Read a class definition file and add classes to a language model.
SPHINXBASE_EXPORT int32 ngram_wid(ngram_model_t *model, const char *word)
Look up numerical word ID.
file IO related operations.
SPHINXBASE_EXPORT lineiter_t * lineiter_start(FILE *fh)
Start reading lines from a file.
Definition pio.c:264
SPHINXBASE_EXPORT lineiter_t * lineiter_next(lineiter_t *li)
Move to the next line in the file.
Definition pio.c:347
Miscellaneous useful string functions.
SPHINXBASE_EXPORT int32 str2words(char *line, char **wptr, int32 n_wptr)
Convert a line to an array of "words", based on whitespace separators.
Definition strfuncs.c:123
Argument definition structure.
Opaque structure used to hold the results of command-line parsing.
Line iterator for files.
Definition pio.h:177
Common implementation of ngram_model_t.