SphinxBase 5prealpha
sphinx_pitch.c
1/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
2/* ====================================================================
3 * Copyright (c) 2008 Carnegie Mellon University. All rights
4 * reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 *
18 * This work was supported in part by funding from the Defense Advanced
19 * Research Projects Agency and the National Science Foundation of the
20 * United States of America, and the CMU Sphinx Speech Consortium.
21 *
22 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
23 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
24 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
26 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 *
34 * ====================================================================
35 *
36 */
37
38#ifdef HAVE_CONFIG_H
39#include <config.h>
40#endif
41
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <sphinxbase/cmd_ln.h>
#include <sphinxbase/yin.h>
#include <sphinxbase/ckd_alloc.h>
#include <sphinxbase/byteorder.h>
#include <sphinxbase/strfuncs.h>
#include <sphinxbase/err.h>
#include <sphinxbase/pio.h>
52
53static arg_t defn[] = {
54 { "-i",
56 NULL,
57 "Single audio input file" },
58
59 { "-o",
61 NULL,
62 "Single text output file (standard output will be used if not given)" },
63
64 { "-c",
66 NULL,
67 "Control file for batch processing" },
68
69 { "-nskip",
71 "0",
72 "If a control file was specified, the number of utterances to skip at the head of the file" },
73
74 { "-runlen",
76 "-1",
77 "If a control file was specified, the number of utterances to process (see -nskip too)" },
78
79 { "-di",
81 NULL,
82 "Input directory, input file names are relative to this, if defined" },
83
84 { "-ei",
86 NULL,
87 "Input extension to be applied to all input files" },
88
89 { "-do",
91 NULL,
92 "Output directory, output files are relative to this" },
93
94 { "-eo",
96 NULL,
97 "Output extension to be applied to all output files" },
98
99 { "-nist",
101 "no",
102 "Defines input format as NIST sphere" },
103
104 { "-raw",
106 "no",
107 "Defines input format as raw binary data" },
108
109 { "-mswav",
111 "no",
112 "Defines input format as Microsoft Wav (RIFF)" },
113
114 { "-samprate",
115 ARG_INT32,
116 "0",
117 "Sampling rate of audio data (will be determined automatically if 0)" },
118
119 { "-input_endian",
121 NULL,
122 "Endianness of audio data (will be determined automatically if not given)" },
123
124 { "-fshift",
126 "0.01",
127 "Frame shift: number of seconds between each analysis frame." },
128
129 { "-flen",
131 "0.025",
132 "Number of seconds in each analysis frame (needs to be greater than twice the longest period you wish to detect - to detect down to 80Hz you need a frame length of 2.0/80 = 0.025)." },
133
134 { "-smooth_window",
135 ARG_INT32,
136 "2",
137 "Number of frames on either side of the current frame to use for smoothing." },
138
139 { "-voice_thresh",
141 "0.1",
142 "Threshold of normalized difference under which to search for the fundamental period." },
143
144 { "-search_range",
146 "0.2",
147 "Fraction of the best local estimate to use as a search range for smoothing." },
148
149 { NULL, 0, NULL, NULL }
150};
151
152static int extract_pitch(const char *in, const char *out);
153static int run_control_file(const char *ctl);
154
155int
156main(int argc, char *argv[])
157{
158 cmd_ln_parse(defn, argc, argv, TRUE);
159
160 /* Run a control file if requested. */
161 if (cmd_ln_str("-c")) {
162 if (run_control_file(cmd_ln_str("-c")) < 0)
163 return 1;
164 }
165 else {
166 if (extract_pitch(cmd_ln_str("-i"), cmd_ln_str("-o")) < 0)
167 return 1;
168 }
169
170 cmd_ln_free();
171 return 0;
172}
173
174static int
175guess_file_type(char const *file, FILE *infh)
176{
177 char header[4];
178
179 fseek(infh, 0, SEEK_SET);
180 if (fread(header, 1, 4, infh) != 4) {
181 E_ERROR_SYSTEM("Failed to read 4 byte header");
182 return -1;
183 }
184 if (0 == memcmp(header, "RIFF", 4)) {
185 E_INFO("%s appears to be a WAV file\n", file);
186 cmd_ln_set_boolean("-mswav", TRUE);
187 cmd_ln_set_boolean("-nist", FALSE);
188 cmd_ln_set_boolean("-raw", FALSE);
189 }
190 else if (0 == memcmp(header, "NIST", 4)) {
191 E_INFO("%s appears to be a NIST SPHERE file\n", file);
192 cmd_ln_set_boolean("-mswav", FALSE);
193 cmd_ln_set_boolean("-nist", TRUE);
194 cmd_ln_set_boolean("-raw", FALSE);
195 }
196 else {
197 E_INFO("%s appears to be raw data\n", file);
198 cmd_ln_set_boolean("-mswav", FALSE);
199 cmd_ln_set_boolean("-nist", FALSE);
200 cmd_ln_set_boolean("-raw", TRUE);
201 }
202 fseek(infh, 0, SEEK_SET);
203 return 0;
204}
205
/*
 * Read exactly nmemb items of size bytes each from stream into ptr.  On a
 * short read, log the failure and jump to the enclosing function's
 * error_out label.
 *
 * The body is wrapped in do { } while (0) so that the macro behaves as a
 * single statement (for instance inside an unbraced if/else), and every
 * argument is parenthesized so that expressions can be passed safely.
 */
#define TRY_FREAD(ptr, size, nmemb, stream)                                 \
    do {                                                                    \
        if (fread((ptr), (size), (nmemb), (stream)) != (size_t)(nmemb)) {   \
            E_ERROR_SYSTEM("Failed to read %d bytes",                       \
                           (int)((size) * (nmemb)));                        \
            goto error_out;                                                 \
        }                                                                   \
    } while (0)
211
212static int
213read_riff_header(FILE *infh)
214{
215 char id[4];
216 int32 intval, header_len;
217 int16 shortval;
218
219 /* RIFF files are little-endian by definition. */
220 cmd_ln_set_str("-input_endian", "little");
221
222 /* Read in all the header chunks and etcetera. */
223 TRY_FREAD(id, 1, 4, infh);
224 /* Total file length (we don't care) */
225 TRY_FREAD(&intval, 4, 1, infh);
226 /* 'WAVE' */
227 TRY_FREAD(id, 1, 4, infh);
228 if (0 != memcmp(id, "WAVE", 4)) {
229 E_ERROR("This is not a WAVE file\n");
230 goto error_out;
231 }
232 /* 'fmt ' */
233 TRY_FREAD(id, 1, 4, infh);
234 if (0 != memcmp(id, "fmt ", 4)) {
235 E_ERROR("Format chunk missing\n");
236 goto error_out;
237 }
238 /* Length of 'fmt ' chunk */
239 TRY_FREAD(&intval, 4, 1, infh);
240 SWAP_LE_32(&intval);
241 header_len = intval;
242
243 /* Data format. */
244 TRY_FREAD(&shortval, 2, 1, infh);
245 SWAP_LE_16(&shortval);
246 if (shortval != 1) { /* PCM */
247 E_ERROR("WAVE file is not in PCM format\n");
248 goto error_out;
249 }
250
251 /* Number of channels. */
252 TRY_FREAD(&shortval, 2, 1, infh);
253 SWAP_LE_16(&shortval);
254 if (shortval != 1) { /* PCM */
255 E_ERROR("WAVE file is not single channel\n");
256 goto error_out;
257 }
258
259 /* Sampling rate (finally!) */
260 TRY_FREAD(&intval, 4, 1, infh);
261 SWAP_LE_32(&intval);
262 if (cmd_ln_int32("-samprate") == 0)
263 cmd_ln_set_int32("-samprate", intval);
264 else if (cmd_ln_int32("-samprate") != intval) {
265 E_WARN("WAVE file sampling rate %d != -samprate %d\n",
266 intval, cmd_ln_int32("-samprate"));
267 }
268
269 /* Average bytes per second (we don't care) */
270 TRY_FREAD(&intval, 4, 1, infh);
271
272 /* Block alignment (we don't care) */
273 TRY_FREAD(&shortval, 2, 1, infh);
274
275 /* Bits per sample (must be 16) */
276 TRY_FREAD(&shortval, 2, 1, infh);
277 SWAP_LE_16(&shortval);
278 if (shortval != 16) {
279 E_ERROR("WAVE file is not 16-bit\n");
280 goto error_out;
281 }
282
283 /* Any extra parameters. */
284 if (header_len > 16)
285 fseek(infh, header_len - 16, SEEK_CUR);
286
287 /* Now skip to the 'data' chunk. */
288 while (1) {
289 TRY_FREAD(id, 1, 4, infh);
290 if (0 == memcmp(id, "data", 4)) {
291 /* Total number of bytes of data (we don't care). */
292 TRY_FREAD(&intval, 4, 1, infh);
293 break;
294 }
295 else {
296 /* Some other stuff... */
297 /* Number of bytes of ... whatever */
298 TRY_FREAD(&intval, 4, 1, infh);
299 SWAP_LE_32(&intval);
300 fseek(infh, intval, SEEK_CUR);
301 }
302 }
303
304 /* We are ready to rumble. */
305 return 0;
306error_out:
307 return -1;
308}
309
310static int
311read_nist_header(FILE *infh)
312{
313 char hdr[1024];
314 char *line, *c;
315
316 TRY_FREAD(hdr, 1, 1024, infh);
317 hdr[1023] = '\0';
318
319 /* Roughly parse it to find the sampling rate and byte order
320 * (don't bother with other stuff) */
321 if ((line = strstr(hdr, "sample_rate")) == NULL) {
322 E_ERROR("No sampling rate in NIST header!\n");
323 goto error_out;
324 }
325 c = strchr(line, '\n');
326 if (c) *c = '\0';
327 c = strrchr(line, ' ');
328 if (c == NULL) {
329 E_ERROR("Could not find sampling rate!\n");
330 goto error_out;
331 }
332 ++c;
333 if (cmd_ln_int32("-samprate") == 0)
334 cmd_ln_set_int32("-samprate", atoi(c));
335 else if (cmd_ln_int32("-samprate") != atoi(c)) {
336 E_WARN("NIST file sampling rate %d != -samprate %d\n",
337 atoi(c), cmd_ln_int32("-samprate"));
338 }
339
340 if (line + strlen(line) < hdr + 1023)
341 line[strlen(line)] = ' ';
342 if ((line = strstr(hdr, "sample_byte_format")) == NULL) {
343 E_ERROR("No sample byte format in NIST header!\n");
344 goto error_out;
345 }
346 c = strchr(line, '\n');
347 if (c) *c = '\0';
348 c = strrchr(line, ' ');
349 if (c == NULL) {
350 E_ERROR("Could not find sample byte order!\n");
351 goto error_out;
352 }
353 ++c;
354 if (0 == memcmp(c, "01", 2)) {
355 cmd_ln_set_str("-input_endian", "little");
356 }
357 else if (0 == memcmp(c, "10", 2)) {
358 cmd_ln_set_str("-input_endian", "big");
359 }
360 else {
361 E_ERROR("Unknown byte order %s\n", c);
362 goto error_out;
363 }
364
365 /* We are ready to rumble. */
366 return 0;
367error_out:
368 return -1;
369}
370
371static int
372extract_pitch(const char *in, const char *out)
373{
374 FILE *infh = NULL, *outfh = NULL;
375 size_t flen, fshift, nsamps;
376 int16 *buf = NULL;
377 yin_t *yin = NULL;
378 uint16 period, bestdiff;
379 int32 sps;
380
381 if (out) {
382 if ((outfh = fopen(out, "w")) == NULL) {
383 E_ERROR_SYSTEM("Failed to open %s for writing", out);
384 goto error_out;
385 }
386 }
387 else {
388 outfh = stdout;
389 }
390 if ((infh = fopen(in, "rb")) == NULL) {
391 E_ERROR_SYSTEM("Failed to open %s for reading", in);
392 goto error_out;
393 }
394
395 /* If we weren't told what the file type is, weakly try to
396 * determine it (actually it's pretty obvious) */
397 if (!(cmd_ln_boolean("-raw")
398 || cmd_ln_boolean("-mswav")
399 || cmd_ln_boolean("-nist"))) {
400 if (guess_file_type(in, infh) < 0)
401 goto error_out;
402 }
403
404 /* Grab the sampling rate and byte order from the header and also
405 * make sure this is 16-bit linear PCM. */
406 if (cmd_ln_boolean("-mswav")) {
407 if (read_riff_header(infh) < 0)
408 goto error_out;
409 }
410 else if (cmd_ln_boolean("-nist")) {
411 if (read_nist_header(infh) < 0)
412 goto error_out;
413 }
414 else if (cmd_ln_boolean("-raw")) {
415 /* Just use some defaults for sampling rate and endian. */
416 if (cmd_ln_str("-input_endian") == NULL) {
417 cmd_ln_set_str("-input_endian", "little");
418 }
419 if (cmd_ln_int32("-samprate") == 0)
420 cmd_ln_set_int32("-samprate", 16000);
421 }
422
423 /* Now read frames and write pitch estimates. */
424 sps = cmd_ln_int32("-samprate");
425 flen = (size_t)(0.5 + sps * cmd_ln_float32("-flen"));
426 fshift = (size_t)(0.5 + sps * cmd_ln_float32("-fshift"));
427 yin = yin_init(flen, cmd_ln_float32("-voice_thresh"),
428 cmd_ln_float32("-search_range"),
429 cmd_ln_int32("-smooth_window"));
430 if (yin == NULL) {
431 E_ERROR("Failed to initialize YIN\n");
432 goto error_out;
433 }
434 buf = ckd_calloc(flen, sizeof(*buf));
435 /* Read the first full frame of data. */
436 if (fread(buf, sizeof(*buf), flen, infh) != flen) {
437 /* Fail silently, which is probably okay. */
438 }
439 yin_start(yin);
440 nsamps = 0;
441 while (!feof(infh)) {
442 /* Process a frame of data. */
443 yin_write(yin, buf);
444 if (yin_read(yin, &period, &bestdiff)) {
445 fprintf(outfh, "%.3f %.2f %.2f\n",
446 /* Time point. */
447 (double)nsamps/sps,
448 /* "Probability" of voicing. */
449 bestdiff > 32768 ? 0.0 : 1.0 - (double)bestdiff / 32768,
450 /* Pitch (possibly bogus) */
451 period == 0 ? sps : (double)sps / period);
452 nsamps += fshift;
453 }
454 /* Shift it back and get the next frame's overlap. */
455 memmove(buf, buf + fshift, (flen - fshift) * sizeof(*buf));
456 if (fread(buf + flen - fshift, sizeof(*buf), fshift, infh) != fshift) {
457 /* Fail silently (FIXME: really?) */
458 }
459 }
460 yin_end(yin);
461 /* Process trailing frames of data. */
462 while (yin_read(yin, &period, &bestdiff)) {
463 fprintf(outfh, "%.3f %.2f %.2f\n",
464 /* Time point. */
465 (double)nsamps/sps,
466 /* "Probability" of voicing. */
467 bestdiff > 32768 ? 0.0 : 1.0 - (double)bestdiff / 32768,
468 /* Pitch (possibly bogus) */
469 period == 0 ? sps : (double)sps / period);
470 }
471
472 if (yin)
473 yin_free(yin);
474 ckd_free(buf);
475 fclose(infh);
476 if (outfh && outfh != stdout)
477 fclose(outfh);
478 return 0;
479
480error_out:
481 if (yin)
482 yin_free(yin);
483 ckd_free(buf);
484 if (infh) fclose(infh);
485 if (outfh && outfh != stdout)
486 fclose(outfh);
487 return -1;
488}
489
490static int
491run_control_file(const char *ctl)
492{
493 FILE *ctlfh;
494 char *line;
495 char *di, *dout, *ei, *eio;
496 size_t len;
497 int rv, guess_type, guess_sps, guess_endian;
498 int32 skip, runlen;
499
500 skip = cmd_ln_int32("-nskip");
501 runlen = cmd_ln_int32("-runlen");
502
503 /* Whether to guess file types */
504 guess_type = !(cmd_ln_boolean("-raw")
505 || cmd_ln_boolean("-mswav")
506 || cmd_ln_boolean("-nist"));
507 /* Whether to guess sampling rate */
508 guess_sps = (cmd_ln_int32("-samprate") == 0);
509 /* Whether to guess endian */
510 guess_endian = (cmd_ln_str("-input_endian") == NULL);
511
512 if ((ctlfh = fopen(ctl, "r")) == NULL) {
513 E_ERROR_SYSTEM("Failed to open control file %s", ctl);
514 return -1;
515 }
516 if (cmd_ln_str("-di"))
517 di = string_join(cmd_ln_str("-di"), "/", NULL);
518 else
519 di = ckd_salloc("");
520 if (cmd_ln_str("-do"))
521 dout = string_join(cmd_ln_str("-do"), "/", NULL);
522 else
523 dout = ckd_salloc("");
524 if (cmd_ln_str("-ei"))
525 ei = string_join(".", cmd_ln_str("-ei"), NULL);
526 else
527 ei = ckd_salloc("");
528 if (cmd_ln_str("-eo"))
529 eio = string_join(".", cmd_ln_str("-eo"), NULL);
530 else
531 eio = ckd_salloc("");
532 rv = 0;
533 while ((line = fread_line(ctlfh, &len)) != NULL) {
534 char *infile, *outfile;
535
536 if (skip-- > 0) {
537 ckd_free(line);
538 continue;
539 }
540 if (runlen == 0) {
541 ckd_free(line);
542 break;
543 }
544 --runlen;
545
546 if (line[len-1] == '\n')
547 line[len-1] = '\0';
548
549 infile = string_join(di, line, ei, NULL);
550 outfile = string_join(dout, line, eio, NULL);
551
552 /* Reset various guessed information */
553 if (guess_type) {
554 cmd_ln_set_boolean("-nist", FALSE);
555 cmd_ln_set_boolean("-mswav", FALSE);
556 cmd_ln_set_boolean("-raw", FALSE);
557 }
558 if (guess_sps)
559 cmd_ln_set_int32("-samprate", 0);
560 if (guess_endian)
561 cmd_ln_set_str("-input_endian", NULL);
562
563 rv = extract_pitch(infile, outfile);
564
565 ckd_free(infile);
566 ckd_free(outfile);
567 ckd_free(line);
568
569 if (rv != 0)
570 break;
571 }
572 ckd_free(di);
573 ckd_free(dout);
574 ckd_free(ei);
575 ckd_free(eio);
576 fclose(ctlfh);
577 return rv;
578}
Sphinx's memory allocation/deallocation routines.
SPHINXBASE_EXPORT void ckd_free(void *ptr)
Test and free a 1-D array.
Definition ckd_alloc.c:244
#define ckd_calloc(n, sz)
Macros to simplify the use of above functions.
Definition ckd_alloc.h:248
#define ckd_salloc(ptr)
Macro for ckd_salloc
Definition ckd_alloc.h:264
Command-line and other configuration parsing and handling.
#define cmd_ln_boolean(name)
Retrieve a boolean from the global command line.
Definition cmd_ln.h:550
SPHINXBASE_EXPORT int32 cmd_ln_parse(const arg_t *defn, int32 argc, char *argv[], int32 strict)
Non-reentrant version of cmd_ln_parse().
Definition cmd_ln.c:746
SPHINXBASE_EXPORT void cmd_ln_free(void)
Free the global command line, if any exists.
Definition cmd_ln.c:1082
#define ARG_STRING
String argument (optional).
Definition cmd_ln.h:114
#define ARG_INT32
Definition cmd_ln.h:144
#define cmd_ln_float32(name)
Retrieve a 32-bit float from the global command line.
Definition cmd_ln.h:536
#define cmd_ln_set_boolean(n, b)
Set a boolean value in the global command line.
Definition cmd_ln.h:586
#define cmd_ln_str(name)
Retrieve a string from the global command line.
Definition cmd_ln.h:513
#define cmd_ln_set_int32(n, i)
Set a 32-bit integer value in the global command line.
Definition cmd_ln.h:565
#define ARG_BOOLEAN
Boolean (true/false) argument (optional).
Definition cmd_ln.h:118
#define cmd_ln_int32(name)
Retrieve a 32-bit integer from the global command line.
Definition cmd_ln.h:529
#define ARG_FLOAT32
Definition cmd_ln.h:148
#define cmd_ln_set_str(n, s)
Set a string in the global command line.
Definition cmd_ln.h:558
Implementation of logging routines.
#define E_ERROR(...)
Print error message to error log.
Definition err.h:104
#define E_INFO(...)
Print logging information to standard error stream.
Definition err.h:114
#define E_ERROR_SYSTEM(...)
Print error text; Call perror("");.
Definition err.h:99
#define E_WARN(...)
Print warning message to error log.
Definition err.h:109
file IO related operations.
SPHINXBASE_EXPORT char * fread_line(FILE *stream, size_t *out_len)
Read a line of arbitrary length from a file and return it as a newly allocated string.
Definition pio.c:377
Miscellaneous useful string functions.
SPHINXBASE_EXPORT char * string_join(const char *base,...)
Concatenate a NULL-terminated argument list of strings, returning a newly allocated string.
Definition strfuncs.c:70
Argument definition structure.
Definition yin.c:51
Implementation of pitch estimation.
SPHINXBASE_EXPORT int yin_read(yin_t *pe, uint16 *out_period, uint16 *out_bestdiff)
Read a raw estimated pitch value from the pitch estimator.
Definition yin.c:222
SPHINXBASE_EXPORT void yin_end(yin_t *pe)
Mark the end of an utterance.
Definition yin.c:166
SPHINXBASE_EXPORT yin_t * yin_init(int frame_size, float search_threshold, float search_range, int smooth_window)
Initialize moving-window pitch estimation.
Definition yin.c:131
SPHINXBASE_EXPORT void yin_start(yin_t *pe)
Start processing an utterance.
Definition yin.c:158
SPHINXBASE_EXPORT void yin_write(yin_t *pe, int16 const *frame)
Feed a frame of data to the pitch estimator.
Definition yin.c:195
SPHINXBASE_EXPORT void yin_free(yin_t *pe)
Free a moving-window pitch estimator.
Definition yin.c:150