"""
Darwin-TTS-1.7B-Cross: Cross-Modal LLM→TTS FFN Blending
=========================================================
World's first cross-modal FFN transfer from LLM to TTS.
No training. 84 FFN tensors. Shape 100% match.

Usage:
    python darwin_tts_blend.py --alpha 3 --text "안녕하세요!"
    python darwin_tts_blend.py --alpha 5 --ref voice.wav --text "Hello!"

Alpha guide:
    0  = Original Qwen3-TTS (no blending)
    1  = Subtle (barely noticeable)
    3  = Recommended (emotion appears) ★
    5  = Maximum stable (emotion intensified) ★★
    10 = BROKEN (do not use)
"""

import argparse
import tempfile
import warnings
from pathlib import Path

import numpy as np
import soundfile as sf
import torch
from safetensors import safe_open

# Substrings that identify feed-forward projection weights in a tensor name.
# Shared by the LLM loader and the TTS blending loop so both filter alike.
_FFN_MARKERS = ("gate_proj", "up_proj", "down_proj")


def _is_ffn(name):
    """Return True if *name* contains one of the FFN projection markers."""
    return any(marker in name for marker in _FFN_MARKERS)


def load_llm_ffn(model_id="Qwen/Qwen3-1.7B"):
    """Load the FFN weights of an LLM checkpoint.

    Downloads a snapshot of ``model_id`` through ``huggingface_hub``
    (skipping ``*.bin``, ``*.ot`` and ``*.msgpack`` files), then reads every
    ``*.safetensors`` file found under it and keeps the tensors whose key
    contains ``gate_proj``, ``up_proj`` or ``down_proj``.

    Args:
        model_id: Model ID handed to ``snapshot_download``.

    Returns:
        Dict mapping tensor key to the loaded tensor.
    """
    # Imported lazily so the module can be imported without huggingface_hub.
    from huggingface_hub import snapshot_download

    path = snapshot_download(model_id, ignore_patterns=["*.bin", "*.ot", "*.msgpack"])
    ffn = {}
    # Sorted so shards are always visited in the same order.
    for f in sorted(Path(path).rglob("*.safetensors")):
        with safe_open(str(f), framework="pt") as s:
            for k in s.keys():
                if _is_ffn(k):
                    ffn[k] = s.get_tensor(k)
    print(f"Loaded {len(ffn)} LLM FFN tensors")
    return ffn


def blend_tts(
    alpha=0.03,
    tts_model="Qwen/Qwen3-TTS-12Hz-1.7B-Base",
    device="cuda:0",
    llm_model="Qwen/Qwen3-1.7B",
):
    """Load the TTS model and blend LLM FFN weights into its talker.

    Each selected TTS parameter ``p`` is updated in place with
    ``p.lerp_(llm_weight, alpha)``, i.e. moved a fraction ``alpha`` of the
    way towards the LLM tensor of the same name. A parameter is selected
    when its name contains ``talker``, does not contain ``code_predictor``,
    contains an FFN projection marker, and (after removing ``"talker."``
    from the name) matches a key loaded from the LLM.

    Args:
        alpha: Blend ratio (0.0 to 0.05 recommended, default 0.03). Values
            <= 0 skip blending and return the TTS model unchanged.
        tts_model: TTS model ID or path.
        device: Value passed as ``device_map`` when loading the TTS model.
        llm_model: Model ID passed to :func:`load_llm_ffn`.

    Returns:
        Blended Qwen3TTSModel ready for inference.

    Raises:
        ValueError: If a matching LLM tensor has a different shape than the
            TTS parameter it would be blended into.
    """
    # Imported lazily so the module can be imported without qwen_tts.
    from qwen_tts import Qwen3TTSModel

    print(f"Loading TTS: {tts_model}")
    model = Qwen3TTSModel.from_pretrained(
        tts_model, device_map=device, dtype=torch.bfloat16
    )

    if alpha > 0:
        llm_ffn = load_llm_ffn(llm_model)
        cnt = 0
        with torch.no_grad():
            for n, p in model.model.named_parameters():
                if "talker" not in n or "code_predictor" in n:
                    continue
                if not _is_ffn(n):
                    continue
                src = llm_ffn.get(n.replace("talker.", ""))
                if src is None:
                    continue
                # lerp_ would silently broadcast a compatible-but-different
                # shape, so verify the match the log line below claims.
                if src.shape != p.shape:
                    raise ValueError(
                        f"Shape mismatch for {n}: TTS {tuple(p.shape)} "
                        f"vs LLM {tuple(src.shape)}"
                    )
                p.lerp_(src.to(p.device, p.dtype), alpha)
                cnt += 1
        if cnt == 0:
            # Nothing matched: the returned model is the unmodified TTS model.
            warnings.warn(
                "No TTS FFN parameter matched an LLM tensor; "
                "the model was not blended.",
                stacklevel=2,
            )
        print(f"Blended {cnt} FFN tensors (alpha={alpha}, shape 100% match)")
    return model


def main():
    """Command-line entry point: blend, synthesize one utterance, save a WAV."""
    parser = argparse.ArgumentParser(description="Darwin-TTS: LLM→TTS FFN Blending")
    # Float so fractional percentages (e.g. 2.5) are accepted; integers still parse.
    parser.add_argument("--alpha", type=float, default=3.0,
                        help="Blend %% (0=original, 3=recommended, 5=max stable)")
    parser.add_argument("--text", type=str, default="안녕하세요, 저는 다윈 인공지능입니다.")
    parser.add_argument("--ref", type=str, default=None,
                        help="Reference audio for voice cloning")
    parser.add_argument("--output", type=str, default="darwin_output.wav")
    parser.add_argument("--device", type=str, default="cuda:0",
                        help="Device passed as device_map when loading the TTS model")
    args = parser.parse_args()

    ref_audio = args.ref
    if ref_audio is None:
        # No reference given: write a 200 Hz sine (72000 samples at 24 kHz)
        # into the platform temp directory and use that instead.
        ref_audio = str(Path(tempfile.gettempdir()) / "_darwin_ref.wav")
        tone = 0.1 * np.sin(2 * np.pi * 200 * np.linspace(0, 3, 72000))
        sf.write(ref_audio, tone.astype(np.float32), 24000)
        print("Using default sine reference (provide --ref for better quality)")

    # The CLI takes a percentage; blend_tts expects a ratio.
    model = blend_tts(alpha=args.alpha / 100.0, device=args.device)

    wavs, sr = model.generate_voice_clone(
        text=args.text,
        ref_audio=ref_audio,
        ref_text="ref",
        x_vector_only_mode=True,
    )

    first = wavs[0]
    if hasattr(first, "cpu"):
        # Tensor result: NumPy has no bfloat16, so upcast before converting.
        wav = first.detach().cpu().float().numpy()
    else:
        wav = np.array(first)
    sf.write(args.output, wav, sr)
    print(f"Saved: {args.output} ({len(wav)/sr:.1f}s)")


if __name__ == "__main__":
    main()