diff --git a/basic_pitch/constants.py b/basic_pitch/constants.py index bf4f632..c2b717a 100644 --- a/basic_pitch/constants.py +++ b/basic_pitch/constants.py @@ -23,7 +23,6 @@ SEMITONES_PER_OCTAVE = 12 # for frequency bin calculations FFT_HOP = 256 -N_FFT = 8 * FFT_HOP NOTES_BINS_PER_SEMITONE = 1 CONTOURS_BINS_PER_SEMITONE = 3 diff --git a/basic_pitch/inference.py b/basic_pitch/inference.py index 24f02ea..06fa54f 100644 --- a/basic_pitch/inference.py +++ b/basic_pitch/inference.py @@ -57,6 +57,7 @@ AUDIO_N_SAMPLES, ANNOTATIONS_FPS, FFT_HOP, + AUDIO_WINDOW_LENGTH, ) from basic_pitch.commandline_printing import ( generating_file_message, @@ -247,6 +248,7 @@ def unwrap_output( output: npt.NDArray[np.float32], audio_original_length: int, n_overlapping_frames: int, + hop_size: int, ) -> np.array: """Unwrap batched model predictions to a single matrix. @@ -254,6 +256,7 @@ def unwrap_output( output: array (n_batches, n_times_short, n_freqs) audio_original_length: length of original audio signal (in samples) n_overlapping_frames: number of overlapping frames in the output + hop_size: size of the hop used when scanning the input audio Returns: array (n_times, n_freqs) @@ -266,10 +269,14 @@ def unwrap_output( # remove half of the overlapping frames from beginning and end output = output[:, n_olap:-n_olap, :] + # Concatenate the frames outputs (overlapping frames removed) into a single dimension output_shape = output.shape - n_output_frames_original = int(np.floor(audio_original_length * (ANNOTATIONS_FPS / AUDIO_SAMPLE_RATE))) unwrapped_output = output.reshape(output_shape[0] * output_shape[1], output_shape[2]) - return unwrapped_output[:n_output_frames_original, :] # trim to original audio length + + # trim to number of expected windows in output + n_expected_windows = audio_original_length / hop_size + n_frames_per_window = (AUDIO_WINDOW_LENGTH * ANNOTATIONS_FPS) - n_overlapping_frames + return unwrapped_output[: int(n_expected_windows * n_frames_per_window), :] def 
run_inference( @@ -303,7 +310,8 @@ def run_inference( output[k].append(v) unwrapped_output = { - k: unwrap_output(np.concatenate(output[k]), audio_original_length, n_overlapping_frames) for k in output + k: unwrap_output(np.concatenate(output[k]), audio_original_length, n_overlapping_frames, hop_size) + for k in output } if debug_file: diff --git a/tests/resources/vocadito_10/model_output.npz b/tests/resources/vocadito_10/model_output.npz index 9b2a404..55e24a8 100644 Binary files a/tests/resources/vocadito_10/model_output.npz and b/tests/resources/vocadito_10/model_output.npz differ diff --git a/tests/test_inference.py b/tests/test_inference.py index 3a71038..eb3a96d 100644 --- a/tests/test_inference.py +++ b/tests/test_inference.py @@ -26,12 +26,13 @@ import numpy as np import numpy.typing as npt -from basic_pitch import ICASSP_2022_MODEL_PATH, inference +from basic_pitch import ICASSP_2022_MODEL_PATH, inference, note_creation from basic_pitch.constants import ( AUDIO_SAMPLE_RATE, AUDIO_N_SAMPLES, ANNOTATIONS_N_SEMITONES, FFT_HOP, + ANNOTATION_HOP, ) RESOURCES_PATH = pathlib.Path(__file__).parent / "resources" @@ -55,6 +56,13 @@ def test_predict() -> None: assert all(note_pitch_max) assert isinstance(note_events, list) + # Check that model output has the expected length according to the last frame second computed downstream + # (via model_frames_to_time) to within a few frames of tolerance + audio_length_s = librosa.get_duration(filename=test_audio_path) + n_model_output_frames = model_output["note"].shape[0] + last_frame_s = note_creation.model_frames_to_time(n_model_output_frames)[-1] + np.testing.assert_allclose(last_frame_s, audio_length_s, atol=2 * ANNOTATION_HOP) + expected_model_output = np.load(RESOURCES_PATH / "vocadito_10" / "model_output.npz", allow_pickle=True)[ "arr_0" ].item()