{
  "timestamp": "2026-04-15T03:05:00+05:30",
  "verdict": "voxtral_works_on_dgx_spark_aarch64_samantha_clone_blocked_by_missing_encoder_weights",
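  "headline": "First verified deployment of Voxtral-4B-TTS-2603 on DGX Spark aarch64 via vllm-omni v0.18.0. 10/10 utterances synthesized, RTF ~0.79x (faster than real-time). Samantha voice clone is NOT possible on the self-hosted checkpoint: Mistral withheld the encoder weights from the open-source release. Open PR #2790 only fixes uploaded-voice name routing in /speech (upload itself works); it does not address the missing encoder.",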

  "runtime": {
    "base_docker_image": "vllm/vllm-openai:v0.18.0-aarch64-cu130",
    "vllm_version": "0.18.2rc1.dev73+gdb7a17ecc (paired with v0.18.0 API)",
    "vllm_omni_version": "v0.18.0 (tag, 2026-03-28) — paired base image required for API compat",
    "vllm_omni_commit": "git checkout v0.18.0 (detached HEAD)",
    "install_method": "pip install -e . --no-deps inside ephemeral container (with SETUPTOOLS_SCM_PRETEND_VERSION=0.18.0); dependencies installed separately from requirements/common.txt",
    "launch_cmd": "vllm-omni serve mistralai/Voxtral-4B-TTS-2603 --tokenizer-mode mistral --omni --stage-configs-path vllm_omni/model_executor/stage_configs/voxtral_tts.yaml --host 0.0.0.0 --port 8000 --max-num-seqs 1"
  },

  "blockers_resolved": {
    "missed_in_session_105": "I tested mainline vllm + vllm nightly; missed that voxtral_tts lives in separate vllm-omni pip package",
    "docker_hub_amd64_only_red_herring": "vllm/vllm-omni Docker images are amd64-only, but pip package + paired vllm-openai:v0.18.0-aarch64-cu130 gives the runtime on aarch64",
    "api_version_skew": "vllm-omni main HEAD imports vllm APIs (pooling.score.ServingScores, pooling.pooling.OpenAIServingPooling) that don't exist in the Titan's vllm 0.18.2rc1 OR today's nightly. SOLUTION: checkout vllm-omni v0.18.0 tag + use vllm-openai:v0.18.0-aarch64-cu130 base image (paired by the project's own Dockerfile.ci).",
    "memory_budget_override": "Stage-configs YAML hardcodes gpu_memory_utilization: 0.8. Edited the YAML with sed to set it to 0.5 — the CLI flag does NOT override the stage-config value.",
    "missing_deps": "aenum, pydub, omegaconf, diffusers, accelerate==1.12.0, torchsde, x-transformers, einops, cache-dit, janus, openai-whisper, av, soundfile, resampy, sox, prettytable, imageio — not auto-installed by pip install -e . --no-deps. Must install common.txt deps separately."
  },

  "benchmark_results_preset_neutral_female": {
    "warmup_latency_s": 18.29,
    "utterances": [
      {"idx": 0, "text_first_words": "Good morning Rajesh.", "latency_s": 1.98, "audio_bytes": 111404, "audio_s_est": 2.32, "rtf": 0.85},
      {"idx": 1, "text_first_words": "I have been thinking about the way...", "latency_s": 3.84, "audio_bytes": 230444, "audio_s_est": 4.80, "rtf": 0.80},
      {"idx": 2, "text_first_words": "The weather in Bangalore...", "latency_s": 5.62, "audio_bytes": 337964, "audio_s_est": 7.04, "rtf": 0.80},
      {"idx": 3, "text_first_words": "You have three calendar events...", "latency_s": 9.43, "audio_bytes": 579884, "audio_s_est": 12.08, "rtf": 0.78},
      {"idx": 4, "text_first_words": "I am glad you are home. How was the walk?", "latency_s": 2.57, "audio_bytes": 157484, "audio_s_est": 3.28, "rtf": 0.78},
      {"idx": 5, "text_first_words": "I do not think that is quite right...", "latency_s": 4.91, "audio_bytes": 307244, "audio_s_est": 6.40, "rtf": 0.77},
      {"idx": 6, "text_first_words": "Your mom called twice while you were out.", "latency_s": 3.96, "audio_bytes": 245804, "audio_s_est": 5.12, "rtf": 0.77},
      {"idx": 7, "text_first_words": "Should I read you the article now...", "latency_s": 2.92, "audio_bytes": 180524, "audio_s_est": 3.76, "rtf": 0.78},
      {"idx": 8, "text_first_words": "I love how the light looks...", "latency_s": 4.26, "audio_bytes": 265004, "audio_s_est": 5.52, "rtf": 0.77},
      {"idx": 9, "text_first_words": "Okay, setting a reminder...", "latency_s": 4.45, "audio_bytes": 268844, "audio_s_est": 5.60, "rtf": 0.79}
    ],
    "summary": {
      "n": 10,
      "mean_latency_s": 4.39,
      "min_latency_s": 1.98,
      "max_latency_s": 9.43,
      "mean_rtf": 0.79,
      "all_http_200": true,
      "all_audio_valid": "10/10 WAVs pulled to docs/voxtral-bench-samples-20260415/voxtral_preset_neutral_female/, all 24kHz mono PCM"
    },
    "note": "This is TOTAL synthesis latency (batch), NOT streaming TTFA. For phone-call TTFA gate (<500ms), separate /v1/audio/speech/stream test needed."
  },

  "samantha_voice_clone_status": {
    "verdict": "NOT_POSSIBLE — Mistral intentionally withheld encoder weights from open-source release",
    "primary_source": "HF Discussion #17 on Voxtral-4B-TTS-2603 model card. Org-member y123456y78 on 2026-03-27: 'The voice cloning feature is not included in the current release, and we don't yet have a timeline for its availability.' URL: https://huggingface.co/mistralai/Voxtral-4B-TTS-2603/discussions/17",
    "root_cause": "The open-source checkpoint ships consolidated.safetensors (decoder/LM only) + 20 precomputed voice_embedding/*.pt tensors. NO encoder file. The 20 presets work because embeddings are pre-baked. Arbitrary audio cannot be encoded without the missing weights — produces the runtime error: 'encode_waveforms requires encoder weights which are not available in the open-source checkpoint'.",
    "session_106_attempts": {
      "upload_via_api": "SUCCESS — POST /v1/audio/voices works, voice registered with embedding_source: 'audio'",
      "synthesize_by_name_v0.18.0_unpatched": "FAIL 400 'Unknown voice' — routing bug, PR #2790 would have fixed this alone",
      "synthesize_by_name_after_hand_patch": "FAIL (deeper error) — hand-patched _build_voxtral_prompt to route the uploaded voice to ref_audio. The patch fixed the routing (equivalent to PR #2790) but then hit the ACTUAL block: 'encode_waveforms requires encoder weights which are not available in the open-source checkpoint'. This error is raised by mistral_common, so neither vllm-omni nor user code can fix it.",
      "synthesize_by_inline_ref_audio": "CRASH — same underlying encoder-missing error, surfaces as orchestrator crash instead of clean 400"
    },
    "pr_2790_does_not_fix_this": "PR #2790 only routes uploaded-voice name to ref_audio. The encode_waveforms error is raised AFTER the routing fix. Our hand-patch (session 106) reached the same point as PR #2790 and confirmed this.",
    "community_discussion": [
      "HF #17 (11 comments) — voice clone confirmed missing: https://huggingface.co/mistralai/Voxtral-4B-TTS-2603/discussions/17",
      "HF #16 (17 comments, 11 thumbs) — community frustration at closed encoder: https://huggingface.co/mistralai/Voxtral-4B-TTS-2603/discussions/16",
      "HF #11 + #5 — 'How to make new voices?', 'Finetuning code?' both hit same gap",
      "MarvinRomson/voxtral-tts-codes-for-audio (9 stars, 2026-04-13) — community reverse-engineering attempt, NOT a working clone"
    ],
    "alternative_forks_checked": "All community forks (AITRADER bf16/mxfp4/mxfp8, idontkwow, MLX variants, NVFP4) inherit the missing-encoder gap. You can't quantize weights that don't exist.",
    "paths_forward": {
      "keep_chatterbox": "RECOMMENDED — already deployed on Panda, clones Samantha zero-shot from reference audio, MIT license, no restrictions.",
      "mistral_hosted_api": "Has voice cloning (encoder runs on their servers). $0.016/1K chars. CC-BY-NC license still restricts commercial use.",
      "voxtral_preset_only": "Self-hosted Voxtral works with 20 presets (neutral_female, etc.) at RTF 0.79x — just can't clone Samantha specifically.",
      "fine_tune": "NOT VIABLE — fine-tuning requires training a speaker embedding through the missing encoder."
    }
  },

  "samantha_references": {
    "primary_30s_trim": "services/audio-pipeline/voice-references/samantha_movie_primary.wav → trimmed to 28s via ffmpeg -ss 3 -t 28 → /tmp/samantha_28s.wav (1.3 MB, 24kHz mono). Voxtral 30s max enforced at upload.",
    "saved_on_titan_at": "/tmp/samantha_28s.wav (also uploaded into vllm-omni voice registry as 'samantha_movie' during run)"
  },

  "gemma_impact": {
    "baseline_p50_ms": 190,
    "post_flight_p50_ms": 222,
    "drift_ratio": 1.17,
    "within_abort_gate": true,
    "abort_threshold_ratio": 1.2,
    "outage_duration_minutes": "~45 (includes source-build iteration cycles, 4 container restarts)"
  },

  "next_steps_CORRECTED_AFTER_VENDOR_RESEARCH": [
    "DROP VOXTRAL from the Samantha-voice path. Mistral withheld encoder weights; self-hosted Voxtral cannot clone arbitrary audio. No patch possible without those weights.",
    "Swap services/audio-pipeline/voice-references/samantha_movie_primary.wav into Chatterbox voice-clone reference path (1-line change in the Chatterbox call site). This is the immediate Samantha unlock.",
    "If Chatterbox quality falls short for phone-call use case, evaluate next TTS candidate in docs/NEXT-SESSION-TTS-ALTERNATIVES.md 20-model survey (Chatterbox-Turbo, CosyVoice 2, F5-TTS). Pick one that ships the voice-clone encoder openly.",
    "Voxtral still viable if presets suffice — RTF 0.79x + 20 presets including neutral_female/casual_female/hi_female. Could serve non-Samantha flows if needed. But Samantha-cloning is off the table.",
    "Stop monitoring vllm-omni PR #2790 as 'the unblock' — it's not. Monitor Mistral's Voxtral model card for a separate 'encoder weights' release (no timeline announced as of 2026-04-15)."
  ],

  "artifacts_produced": {
    "voxtral_preset_neutral_female_wavs": "docs/voxtral-bench-samples-20260415/voxtral_preset_neutral_female/voxtral_00..09.wav",
    "verdict_json_this_run": "docs/BENCHMARK-VOXTRAL-TITAN-20260415.json",
    "verdict_json_prior_incorrect": "docs/BENCHMARK-VOXTRAL-TITAN-20260414-2300.json (preserved, superseded)",
    "branch": "voxtral-bench-20260414"
  }
}
