From 77bcb6418ea37c06426daa4d1b1d20fb734f9f32 Mon Sep 17 00:00:00 2001 From: Kevin Lenzo Date: Wed, 14 Jan 2026 09:57:50 -0500 Subject: [PATCH 1/3] Document JSON output format in CLI help Add output format section to general help describing JSON fields (b, d, p, t, w). Add documentation to align help explaining when each level (words, phones, states) is produced based on options. --- programs/pocketsphinx_main.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/programs/pocketsphinx_main.c b/programs/pocketsphinx_main.c index 9fe8de8d..1c7a6c0c 100644 --- a/programs/pocketsphinx_main.c +++ b/programs/pocketsphinx_main.c @@ -730,6 +730,13 @@ usage(char *name, int help_config) fprintf(stderr, "\tsox -qd $(%s soxflags) | %s live -\n", name, name); fprintf(stderr, "\t%s single INPUT\n", name); fprintf(stderr, "\t%s align INPUT WORDS...\n", name); + fprintf(stderr, "\nOutput format:\n"); + fprintf(stderr, " JSON with the following fields:\n"); + fprintf(stderr, " b Begin time in seconds\n"); + fprintf(stderr, " d Duration in seconds\n"); + fprintf(stderr, " p Probability (acoustic model score)\n"); + fprintf(stderr, " t Text of utterance or segment\n"); + fprintf(stderr, " w Array of word segments\n"); fprintf(stderr, "\nFor detailed PARAMS values, run %s help-config\n", name); if (help_config) { err_set_loglevel(ERR_INFO); @@ -750,6 +757,9 @@ usage_align(char *name) fprintf(stderr, " (default: no)\n"); fprintf(stderr, " -state_align yes/no Run a second pass to align phones and states and print their\n"); fprintf(stderr, " durations. This implies -phone_align yes (default: no)\n"); + fprintf(stderr, "\nBy default, output contains words only. With -phone_align, each\n"); + fprintf(stderr, "word in \"w\" contains a nested \"w\" array of phones. 
With\n"); + fprintf(stderr, "-state_align, each phone also contains a nested \"w\" of HMM states.\n"); fprintf(stderr, "\nExamples:\n"); fprintf(stderr, " # Basic word alignment:\n"); fprintf(stderr, " %s align audio.wav \"hello world\"\n", name); From e3e5fe02c47ff11874718bcb0f2451d32de8abac Mon Sep 17 00:00:00 2001 From: Kevin Lenzo Date: Wed, 14 Jan 2026 09:57:56 -0500 Subject: [PATCH 2/3] Document C alignment API Add documentation to ps_alignment_t explaining the three-level hierarchy (words, phones, states), iteration methods, and entry fields. Clarify that the score is an acoustic log probability, where higher (less negative) is better. Addresses: https://github.com/cmusphinx/pocketsphinx/issues/370 --- include/pocketsphinx/alignment.h | 16 ++++++++++++++++ src/ps_alignment_internal.h | 3 ++- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/include/pocketsphinx/alignment.h b/include/pocketsphinx/alignment.h index bcc6e97b..65b84a74 100644 --- a/include/pocketsphinx/alignment.h +++ b/include/pocketsphinx/alignment.h @@ -64,6 +64,22 @@ extern "C" { /** * @struct ps_alignment_t pocketsphinx/alignment.h * @brief Multi-level alignment (words, phones, states) over an utterance. + * + * Alignments are organized hierarchically: words contain phones, and + * phones contain HMM states. Use ps_alignment_words(), + * ps_alignment_phones(), or ps_alignment_states() to iterate at each + * level, and ps_alignment_iter_children() to descend into children. + * + * Each entry has the following fields, accessible via + * ps_alignment_iter_seg() and ps_alignment_iter_name(): + * + * - name: Text (word string, phone symbol, or state ID as string) + * - start: Start frame index + * - duration: Duration in frames + * - score: Acoustic score (log probability, higher is better) + * + * To convert frames to seconds, divide by the frame rate (default + * 100, i.e. 10ms per frame). 
*/ typedef struct ps_alignment_s ps_alignment_t; diff --git a/src/ps_alignment_internal.h b/src/ps_alignment_internal.h index 72e2d6a5..7f6e67ed 100644 --- a/src/ps_alignment_internal.h +++ b/src/ps_alignment_internal.h @@ -57,7 +57,8 @@ extern "C" { typedef struct ps_alignment_entry_s { int32 start; /**< Start frame index. */ int32 duration; /**< Duration in frames. */ - int32 score; /**< Alignment score (fairly meaningless). */ + int32 score; /**< Acoustic score (log probability). Higher + (less negative) is better. */ /** * Index of parent node. * From 4912aa502cfddf7aceac799e116d40395bb5566f Mon Sep 17 00:00:00 2001 From: Kevin Lenzo Date: Wed, 14 Jan 2026 09:58:03 -0500 Subject: [PATCH 3/3] Document Python alignment API Improve Alignment class docstring to explain three-level hierarchy and two iteration methods (flat and hierarchical). Improve AlignmentEntry docstring with clearer attribute documentation, frame-to-seconds conversion notes, and example showing output in seconds. --- cython/_pocketsphinx.pyx | 40 ++++++++++++++++++++++++++-------------- 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/cython/_pocketsphinx.pyx b/cython/_pocketsphinx.pyx index d94f4a3c..d5ef2da6 100644 --- a/cython/_pocketsphinx.pyx +++ b/cython/_pocketsphinx.pyx @@ -1990,23 +1990,28 @@ cdef class Endpointer: return (&outbuf[0])[:out_n_samples * 2] cdef class AlignmentEntry: - """Entry (word, phone, state) in an alignment. + """Entry (word, phone, or state) in an alignment. - Iterating over this will iterate over its children (i.e. the - phones in a word or the states in a phone) if any. For example:: + Iterating over this will iterate over its children (phones in a + word, or states in a phone) if any. 
For example, to print + word and phone timings in seconds:: for word in decoder.get_alignment(): - print("%s from %.2f to %.2f" % (word.name, word.start, - word.start + word.duration)) + print("%s from %.3f to %.3f seconds" % (word.name, + word.start / 100, + (word.start + word.duration) / 100)) for phone in word: - print("%s at %.2f duration %.2f" % - (phone.name, phone.start, phone.duration)) + print(" %s at %.3f for %.3f seconds" % (phone.name, + phone.start / 100, + phone.duration / 100)) Attributes: - name(str): Name of segment (word, phone name, state id) - start(int): Index of start frame. - duration(int): Duration in frames. - score(float): Acoustic score (density). + name(str): Text of this entry (word string, phone symbol, or + state ID as string). + start(int): Start frame index. Divide by frame rate for seconds + (default 100, i.e. 10ms per frame). + duration(int): Duration in frames. Divide by frame rate for seconds. + score(int): Acoustic score (log probability, higher is better). """ cdef public int start cdef public int duration @@ -2034,9 +2039,16 @@ cdef class AlignmentEntry: cdef class Alignment: """Sub-word alignment as returned by `get_alignment`. - For the moment this is read-only. You are able to iterate over - the words, phones, or states in it, as well as sub-iterating over - each of their children, as described in `AlignmentEntry`. + Alignments have three levels: words, phones, and HMM states. + Words contain phones, and phones contain states. + + There are two ways to iterate: + + Flat iteration over a single level using `words()`, `phones()`, + or `states()`. + + Hierarchical iteration by iterating over an `AlignmentEntry` to + get its children (phones of a word, or states of a phone). """ cdef ps_alignment_t *_al