{
  "schema_version": "1",
  "bench_name": "chatterbox_500m_titan_dgx_spark",
  "bench_plan": "/home/rajesh/.claude/plans/replicated-cuddling-duckling.md",
  "date_utc": "2026-04-15",
  "host": {
    "machine": "NVIDIA DGX Spark (Titan)",
    "gpu": "NVIDIA GB10 (Blackwell SM_121, unified memory 128 GB)",
    "arch": "aarch64",
    "driver": "580.142",
    "cuda_runtime": "13.0"
  },
  "install": {
    "chatterbox_tts_version": "0.1.7",
    "torch_version": "2.11.0+cu128",
    "torchaudio_version": "2.11.0+cu128",
    "python_version": "3.12.3",
    "venv_path": "~/workplace/her/her-os-chatterbox-bench/.venv-chatterbox-bench/",
    "pip_freeze_sha256": "eb299ede3811be089ddd92a294e8e2a02b7e54d778d2291c185b09e96983330c",
    "pip_freeze_file": "docs/chatterbox-titan-bench-samples-20260415-053202/pip_freeze.txt",
    "hf_model_files": {
      "conds.pt": {
        "sha256": "6552d70568833628ba019c6b03459e77fe71ca197d5c560cef9411bee9d87f4e",
        "bytes": 107374
      },
      "s3gen.safetensors": {
        "sha256": "2b78103c654207393955e4900aac14a12de8ef25f4b09424f1ef91941f161d4e",
        "bytes": 1056484620
      },
      "t3_cfg.safetensors": {
        "sha256": "914cb1696f47527fe8852ca8f1fe1fa63cb34f76f9c715e84e067b744dd0da81",
        "bytes": 2129653744
      },
      "tokenizer.json": {
        "sha256": "d71e3a44eabb1784df9a68e9f95b251ecbf1a7af6a9f50835856b2ca9d8c14a5",
        "bytes": 25470
      },
      "ve.safetensors": {
        "sha256": "f0921cab452fa278bc25cd23ffd59d36f816d7dc5181dd1bef9751a7fb61f63c",
        "bytes": 5695784
      }
    },
    "license": "MIT (code + weights) — ResembleAI/chatterbox"
  },
  "blackwell_patches_applied": [
    {
      "helper": "patch_chatterbox_xvector_cpu_fbank",
      "target": "chatterbox.models.s3gen.xvector.extract_feature",
      "rationale": "torchaudio.compliance.kaldi.fbank → torch.stft → .abs() on complex → Jiterator NVRTC fails on SM_121. Route through CPU (one-shot per voice-ref, ~5s audio)."
    },
    {
      "helper": "patch_chatterbox_s3tokenizer_log_mel",
      "target": "chatterbox.models.s3tokenizer.s3tokenizer.S3Tokenizer.log_mel_spectrogram",
      "rationale": "Original: torch.stft(return_complex=True) + .abs()**2. Replaced with return_complex=False + real**2 + imag**2 for the magnitude, all via pre-compiled ops."
    }
  ],
  "bench_window": {
    "gemma_mode": "pause-gemma",
    "gemma_paused_at_utc_approx": "2026-04-15T05:11",
    "gemma_restarted_at_utc_approx": "2026-04-15T05:48",
    "pause_duration_min_approx": 37,
    "rationale": "Plan-mandated: install-retry inherent in pip force-reinstall torch; Gemma baseline p50 (220.9ms) exceeded ±5% of long-term median (~190ms); conservative to pause to avoid CUDA-context contention."
  },
  "voice_ref": {
    "filename": "samantha_evolving.wav",
    "source": "~/.her-os/annie/voice-references/samantha_evolving.wav (Panda production)",
    "sha256": "0f350a7f8d63a010e9b20b0c334c5c5b165b097e6e9b1045757fc91a27573e0e",
    "samplerate_hz": 24000,
    "channels": 1,
    "subtype": "PCM_16",
    "duration_s": 5.0,
    "plan_deviation_note": "Plan assumed samantha_movie_primary.wav (34.7s) on both sides. But Panda's production allowlist only contains the shorter refs (samantha_hello/name/evolving.wav); samantha_evolving (5s) is the actual production default. Using the production ref for both sides is MORE production-faithful than the plan's original assumption. Also eliminates the 30s voice-embedder cap concern (CODE-11/PM-3)."
  },
  "synthesis_params": {
    "cfg_weight": 0.3,
    "exaggeration": 0.3,
    "temperature": 0.6
  },
  "phase2_synthesis": {
    "n_utterances": 10,
    "mid_batch_block": false,
    "model_load_s": 6.61,
    "warmup_peak_vram_mib_torch": 3272.7,
    "totals_ms": {
      "mean": 2136.3,
      "p50": 2055.35,
      "p95": 3506.87,
      "min": 1074.3,
      "max": 4378.7
    },
    "rtfs": {
      "mean": 0.56,
      "p50": 0.55,
      "p95": 0.629,
      "min": 0.499,
      "max": 0.632
    },
    "durations_s": {
      "mean": 3.84,
      "p50": 3.74,
      "min": 1.72,
      "max": 7.0
    },
    "vram_mib_torch_peak_across_utterances": {
      "max": 3410.6,
      "min": 3409.7,
      "mean": 3410.2
    },
    "vram_mib_nvsmi_process_observed": 3752.0,
    "vram_gap_mib_nvsmi_minus_torch": 341.7,
    "nvsmi_device_memory_query_result": "N/A (GB10 unified memory — --query-gpu=memory.used returns [N/A] on this hardware; per-process GPU Memory via --query-compute-apps works)",
    "details_json": "docs/chatterbox-titan-bench-samples-20260415-053202/BENCHMARK-CHATTERBOX-TITAN-PHASE2.json"
  },
  "phase4_identity": {
    "mode": "embedding",
    "scorer": "resemblyzer.VoiceEncoder (CPU)",
    "plan_deviation_note": "Plan specified human 1-5 MOS A/B. Substituted with speaker-embedding cosine similarity because the bench loop had no human listener in the session. The human mode is implemented in scripts/tts_identity_score.py --mode human for future audit; it reuses the same paired WAVs.",
    "mean_cosine": 0.9199,
    "min_cosine": 0.8652,
    "max_cosine": 0.95,
    "spread": 0.0848,
    "thresholds": {
      "parity_mean_min": 0.8,
      "parity_spread_max": 0.15
    },
    "n_pairs": 10,
    "details_json": "docs/chatterbox-titan-bench-samples-20260415-053202/BENCHMARK-CHATTERBOX-TITAN-IDENTITY.json"
  },
  "gemma_drift": {
    "baseline_p50_ms": 220.9,
    "baseline_p95_ms": 236.8,
    "long_term_reference_p50_ms": 190,
    "pre_bench_deviation_ratio_vs_long_term": 1.16,
    "post_flight_p50_ms": 222.1,
    "post_flight_p95_ms": 223.7,
    "drift_ratio_post_over_baseline": 1.005,
    "drift_gate_passed": true,
    "gate_threshold_ratio": 1.2
  },
  "verdicts": [
    "titan_chatterbox_synthesis_parity_with_panda"
  ],
  "verdicts_not_emitted": [
    "titan_chatterbox_failover_dry_run_clean (Phase 6 was not opted in — synthesis parity only)"
  ],
  "redundancy_state": "synthesis_parity_only",
  "followup_required_for_full_redundancy": "Phase 6 staged HTTP failover dry-run (see docs/NEXT-SESSION-CHATTERBOX-TITAN-BENCH-V2.md)"
}
