| """ |
| Darwin-TTS-1.7B-Cross: Cross-Modal LLM→TTS FFN Blending |
| ========================================================= |
| World's first cross-modal FFN transfer from LLM to TTS. |
| No training. 84 FFN tensors. Shape 100% match. |
| |
| Usage: |
| python darwin_tts_blend.py --alpha 3 --text "안녕하세요!" |
| python darwin_tts_blend.py --alpha 5 --ref voice.wav --text "Hello!" |
| |
| Alpha guide: |
| 0 = Original Qwen3-TTS (no blending) |
| 1 = Subtle (barely noticeable) |
| 3 = Recommended (emotion appears) ★ |
| 5 = Maximum stable (emotion intensified) ★★ |
| 10 = BROKEN (do not use) |
| """ |
| import argparse |
| import torch |
| import numpy as np |
| import soundfile as sf |
| from pathlib import Path |
| from safetensors import safe_open |
|
|
|
|
def load_llm_ffn(model_id="Qwen/Qwen3-1.7B"):
    """Collect the feed-forward projection tensors of an LLM checkpoint.

    Resolves a local snapshot directory for ``model_id`` through
    ``huggingface_hub.snapshot_download`` (skipping ``*.bin``, ``*.ot`` and
    ``*.msgpack`` files) and reads every ``*.safetensors`` file found beneath
    it, in sorted path order.

    Args:
        model_id: Repository ID handed to ``snapshot_download``.

    Returns:
        Dict mapping checkpoint key to tensor (loaded with ``framework="pt"``)
        for every key whose name contains ``gate_proj``, ``up_proj`` or
        ``down_proj``.
    """
    from huggingface_hub import snapshot_download

    ffn_markers = ("gate_proj", "up_proj", "down_proj")
    snapshot_dir = snapshot_download(
        model_id, ignore_patterns=["*.bin", "*.ot", "*.msgpack"]
    )

    ffn = {}
    for shard_path in sorted(Path(snapshot_dir).rglob("*.safetensors")):
        with safe_open(str(shard_path), framework="pt") as shard:
            # The generator is fully consumed inside the `with`, so tensors are
            # read while the shard is still open.
            ffn.update(
                (key, shard.get_tensor(key))
                for key in shard.keys()
                if any(marker in key for marker in ffn_markers)
            )

    print(f"Loaded {len(ffn)} LLM FFN tensors")
    return ffn
|
|
|
|
def blend_tts(alpha=0.03, tts_model="Qwen/Qwen3-TTS-12Hz-1.7B-Base"):
    """Load the TTS model and blend LLM FFN weights into its talker.

    Every talker parameter (excluding the code predictor) whose name contains
    ``gate_proj``, ``up_proj`` or ``down_proj`` is linearly interpolated in
    place towards the LLM tensor with the same name minus ``"talker."``:
    ``p <- p + alpha * (llm - p)``.

    Args:
        alpha: Interpolation weight as a fraction, NOT a percentage
            (0.0 to 0.05 recommended, default 0.03). Values ``<= 0`` skip
            blending and return the unmodified model.
        tts_model: TTS model ID or path.

    Returns:
        The (possibly blended) Qwen3TTSModel, loaded on ``cuda:0`` in bfloat16.

    Raises:
        RuntimeError: If a matching LLM tensor has a different shape than the
            TTS parameter. Without this check ``lerp_`` could silently
            broadcast a smaller tensor into the parameter.
    """
    from qwen_tts import Qwen3TTSModel

    print(f"Loading TTS: {tts_model}")
    model = Qwen3TTSModel.from_pretrained(
        tts_model, device_map="cuda:0", dtype=torch.bfloat16
    )

    if alpha > 0:
        ffn_markers = ("gate_proj", "up_proj", "down_proj")
        llm_ffn = load_llm_ffn()
        cnt = 0
        # One no_grad scope for the whole pass: the in-place update must not
        # be tracked by autograd.
        with torch.no_grad():
            for n, p in model.model.named_parameters():
                if "talker" not in n or "code_predictor" in n:
                    continue
                if not any(x in n for x in ffn_markers):
                    continue
                src = llm_ffn.get(n.replace("talker.", ""))
                if src is None:
                    continue
                # Enforce the "shape 100% match" claim printed below instead
                # of relying on lerp_'s broadcasting rules.
                if src.shape != p.shape:
                    raise RuntimeError(
                        f"Shape mismatch for {n}: TTS {tuple(p.shape)} "
                        f"vs LLM {tuple(src.shape)}"
                    )
                p.lerp_(src.to(p.device, p.dtype), alpha)
                cnt += 1
        print(f"Blended {cnt} FFN tensors (alpha={alpha}, shape 100% match)")

    return model
|
|
|
|
def build_parser():
    """Create the command-line parser for the blending demo.

    ``--alpha`` is a percentage and accepts fractional values (e.g. ``2.5``);
    the caller divides it by 100 before passing it to ``blend_tts``.
    """
    parser = argparse.ArgumentParser(description="Darwin-TTS: LLM→TTS FFN Blending")
    parser.add_argument("--alpha", type=float, default=3.0,
                        help="Blend %% (0=original, 3=recommended, 5=max stable)")
    parser.add_argument("--text", type=str,
                        default="안녕하세요, 저는 다윈 인공지능입니다.")
    parser.add_argument("--ref", type=str, default=None,
                        help="Reference audio for voice cloning")
    parser.add_argument("--output", type=str, default="darwin_output.wav")
    return parser


def main(argv=None):
    """Blend the model, synthesize ``--text`` and write it to ``--output``.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    When ``--ref`` is omitted, a 3-second 200 Hz sine tone is written to a
    private temporary file, used as the reference audio, and removed again
    once generation has finished.
    """
    import os
    import tempfile

    args = build_parser().parse_args(argv)

    ref_path = args.ref
    temp_ref = None
    if ref_path is None:
        # Unique temp file instead of a fixed /tmp path: portable across
        # platforms and not shared between concurrent runs or users.
        fd, temp_ref = tempfile.mkstemp(prefix="_darwin_ref_", suffix=".wav")
        os.close(fd)
        ref_path = temp_ref

    try:
        if temp_ref is not None:
            sample_rate, seconds, freq_hz = 24000, 3, 200
            # endpoint=False keeps the sample spacing at exactly 1/sample_rate.
            t = np.linspace(0, seconds, sample_rate * seconds, endpoint=False)
            tone = (0.1 * np.sin(2 * np.pi * freq_hz * t)).astype(np.float32)
            sf.write(temp_ref, tone, sample_rate)
            print("Using default sine reference (provide --ref for better quality)")

        # CLI alpha is a percentage; blend_tts expects a fraction.
        model = blend_tts(alpha=args.alpha / 100.0)
        wavs, sr = model.generate_voice_clone(
            text=args.text, ref_audio=ref_path,
            ref_text="ref", x_vector_only_mode=True
        )
    finally:
        if temp_ref is not None:
            try:
                os.remove(temp_ref)
            except OSError:
                # Best-effort cleanup; a leftover temp file is harmless.
                pass

    # The first result may be a tensor or an array-like; normalize to numpy.
    wav = wavs[0].cpu().numpy() if hasattr(wavs[0], "cpu") else np.array(wavs[0])
    sf.write(args.output, wav, sr)
    print(f"Saved: {args.output} ({len(wav)/sr:.1f}s)")


if __name__ == "__main__":
    main()
|
|