From ca4f38bb49ed3b2ff7ae04b170a392b9416bbd3d Mon Sep 17 00:00:00 2001
From: Joshua Lochner <26504141+xenova@users.noreply.github.com>
Date: Tue, 22 Jul 2025 17:37:14 -0400
Subject: [PATCH] Support longform voxtral processing

---
 src/models/voxtral/processing_voxtral.js | 40 ++++++++++++++++++++++--
 1 file changed, 37 insertions(+), 3 deletions(-)

diff --git a/src/models/voxtral/processing_voxtral.js b/src/models/voxtral/processing_voxtral.js
index 33db9f500..4fb7eb2f1 100644
--- a/src/models/voxtral/processing_voxtral.js
+++ b/src/models/voxtral/processing_voxtral.js
@@ -7,6 +7,20 @@ const AUDIO_TOKEN = "[AUDIO]";
 const BEGIN_AUDIO_TOKEN = "[BEGIN_AUDIO]";
 const NUM_AUDIO_TOKENS = 375;
 
+/**
+ * Splits audio into consecutive, non-overlapping chunks of at most `n_samples` samples; the final chunk may be shorter, and empty audio yields no chunks.
+ * @param {Float32Array} audio Samples to split. Each chunk is a `subarray` view sharing this array's buffer, not a copy.
+ * @param {number} n_samples Chunk length in samples. Must be positive, since the loop index advances by this amount on every iteration.
+ * @returns {Float32Array[]} The chunks, in their original order.
+ */
+function chunk(audio, n_samples) {
+    const chunks = [];
+    for (let i = 0; i < audio.length; i += n_samples) {
+        chunks.push(audio.subarray(i, Math.min(i + n_samples, audio.length))); // end is clamped so the last chunk stops at the end of the audio
+    }
+    return chunks;
+}
+
 /**
  * Represents a VoxtralProcessor that extracts features from an audio input.
  */
@@ -32,16 +46,36 @@ export class VoxtralProcessor extends Processor {
             if (!Array.isArray(audio)) {
                 audio = [audio];
             }
-            const num_audio_tokens = text.split(AUDIO_TOKEN).length - 1;
+            const text_parts = text.split(AUDIO_TOKEN);
+            const num_audio_tokens = text_parts.length - 1;
             if (num_audio_tokens !== audio.length) {
                 throw new Error(`The number of audio inputs (${audio.length}) does not match the number of audio tokens in the text (${num_audio_tokens}).`);
             }
+
+            const n_samples = this.feature_extractor.config.n_samples;
+
+            // Split each audio input into chunks and keep track of chunk counts
+            const audio_chunks = audio.map(a => chunk(a, n_samples));
+            const chunk_counts = audio_chunks.map(chunks => chunks.length);
+
+            // Flatten all chunks for feature extraction
+            const all_chunks = audio_chunks.flat();
             const features = (await Promise.all(
-                audio.map((audio_input) => this.feature_extractor(audio_input, kwargs))
+                all_chunks.map((audio_input) => this.feature_extractor(audio_input, kwargs))
             )).map(x => x.input_features);
+
             audio_inputs["audio_values"] = features.length > 1 ? cat(features, 0) : features[0];
 
-            text = text.replaceAll(AUDIO_TOKEN, BEGIN_AUDIO_TOKEN + AUDIO_TOKEN.repeat(NUM_AUDIO_TOKENS));
+            // Replace text tokens for each audio input, expanding for chunk count
+            let new_text = text_parts[0];
+            for (let i = 0; i < chunk_counts.length; ++i) {
+                new_text += BEGIN_AUDIO_TOKEN;
+                for (let j = 0; j < chunk_counts[i]; ++j) {
+                    new_text += AUDIO_TOKEN.repeat(NUM_AUDIO_TOKENS);
+                }
+                new_text += text_parts[i + 1];
+            }
+            text = new_text;
         }
 
         const text_inputs = this.tokenizer(text, {