/**
 * Splits an audio buffer into consecutive, non-overlapping chunks of at most `n_samples` samples.
 * Each chunk is a `subarray` view onto `audio` (no samples are copied). The final chunk holds
 * whatever remains, so it may be shorter than `n_samples`.
 * @param {Float32Array} audio The samples to split.
 * @param {number} n_samples Maximum number of samples per chunk. Must be a positive integer.
 * @returns {Float32Array[]} The chunks, in order. Empty if `audio` is empty.
 * @throws {Error} If `n_samples` is not a positive integer.
 */
function chunk(audio, n_samples) {
    // A zero or negative step never advances the loop below (it would keep pushing chunks until
    // memory is exhausted), and a NaN/undefined step silently yields a single empty chunk.
    // Fail loudly instead, so a missing or invalid config value is caught at the source.
    if (!Number.isInteger(n_samples) || n_samples <= 0) {
        throw new Error(`n_samples must be a positive integer, but got ${n_samples}.`);
    }

    const chunks = [];
    for (let i = 0; i < audio.length; i += n_samples) {
        // `subarray` clamps the end index to the buffer length, so the last chunk is just shorter.
        chunks.push(audio.subarray(i, i + n_samples));
    }
    return chunks;
}
*/ @@ -32,16 +46,36 @@ export class VoxtralProcessor extends Processor { if (!Array.isArray(audio)) { audio = [audio]; } - const num_audio_tokens = text.split(AUDIO_TOKEN).length - 1; + const text_parts = text.split(AUDIO_TOKEN); + const num_audio_tokens = text_parts.length - 1; if (num_audio_tokens !== audio.length) { throw new Error(`The number of audio inputs (${audio.length}) does not match the number of audio tokens in the text (${num_audio_tokens}).`); } + + const n_samples = this.feature_extractor.config.n_samples; + + // Split each audio input into chunks and keep track of chunk counts + const audio_chunks = audio.map(a => chunk(a, n_samples)); + const chunk_counts = audio_chunks.map(chunks => chunks.length); + + // Flatten all chunks for feature extraction + const all_chunks = audio_chunks.flat(); const features = (await Promise.all( - audio.map((audio_input) => this.feature_extractor(audio_input, kwargs)) + all_chunks.map((audio_input) => this.feature_extractor(audio_input, kwargs)) )).map(x => x.input_features); + audio_inputs["audio_values"] = features.length > 1 ? cat(features, 0) : features[0]; - text = text.replaceAll(AUDIO_TOKEN, BEGIN_AUDIO_TOKEN + AUDIO_TOKEN.repeat(NUM_AUDIO_TOKENS)); + // Replace text tokens for each audio input, expanding for chunk count + let new_text = text_parts[0]; + for (let i = 0; i < chunk_counts.length; ++i) { + new_text += BEGIN_AUDIO_TOKEN; + for (let j = 0; j < chunk_counts[i]; ++j) { + new_text += AUDIO_TOKEN.repeat(NUM_AUDIO_TOKENS); + } + new_text += text_parts[i + 1]; + } + text = new_text; } const text_inputs = this.tokenizer(text, {