#!/usr/bin/env python3
"""Extract the dominant speaker's audio from each WAV file of a Hugging Face dataset.

For every ``.wav`` file in ``DATASET_ID`` the script downloads the file, runs
the pyannote speaker-diarization pipeline on it, picks the speaker with the
largest total speaking time, concatenates that speaker's turns into
``<stem>_one.wav`` under ``ONE_AUDIO_DIR`` and finally writes a JSON summary
to ``OUTPUT_DIR / "processing_report.json"``.

The ``HF_TOKEN`` environment variable must be set; the script exits with
status 1 when it is missing or when an unrecoverable error occurs.
"""
import json
import os
import sys
import time
from pathlib import Path

import numpy as np

# NumPy compatibility patch: restore the ``np.NaN`` alias when the installed
# NumPy no longer provides it. This has to run before the audio libraries are
# imported in main() (presumably pyannote.audio still references ``np.NaN``).
if not hasattr(np, 'NaN'):
    np.NaN = np.nan

DATASET_ID = "ayf3/numberblocks-audio"
MODEL_ID = "pyannote/speaker-diarization-3.1"
OUTPUT_DIR = Path("/data/output")
ONE_AUDIO_DIR = OUTPUT_DIR / "one_audio"
# Everything lives under /data (not /tmp) to avoid permission problems.
CACHE_DIR = Path("/data/hf_cache")
DOWNLOAD_DIR = Path("/data/download")
SAMPLE_RATE = 16000


def pick_main_speaker(turns):
    """Return the speaker label with the largest total duration.

    Args:
        turns: iterable of ``(start_seconds, end_seconds, speaker)`` tuples.

    Returns:
        The label whose turns add up to the longest duration (the first such
        label encountered wins a tie), or ``None`` when ``turns`` is empty.
    """
    durations = {}
    for start, end, speaker in turns:
        durations[speaker] = durations.get(speaker, 0) + (end - start)
    if not durations:
        return None
    return max(durations, key=durations.get)


def extract_speaker_audio(samples, sample_rate, turns, speaker):
    """Concatenate the slices of ``samples`` that belong to ``speaker``.

    Args:
        samples: one-dimensional sequence of audio samples.
        sample_rate: number of samples per second of ``samples``.
        turns: iterable of ``(start_seconds, end_seconds, speaker)`` tuples.
        speaker: label of the speaker whose turns are kept.

    Returns:
        A NumPy array with the selected turns in order, or ``None`` when
        ``speaker`` has no turn.
    """
    pieces = [
        samples[int(start * sample_rate):int(end * sample_rate)]
        for start, end, label in turns
        if label == speaker
    ]
    if not pieces:
        return None
    return np.concatenate(pieces)


def _load_pipeline(token, device):
    """Load the diarization pipeline and move it to ``device``."""
    from pyannote.audio import Pipeline

    # NOTE(review): the auth keyword appears to differ between pyannote.audio
    # major versions (``use_auth_token`` vs ``token``) - confirm against the
    # pinned version. Trying both keeps the script usable with either.
    try:
        pipeline = Pipeline.from_pretrained(MODEL_ID, token=token)
    except TypeError:
        pipeline = Pipeline.from_pretrained(MODEL_ID, use_auth_token=token)

    # NOTE(review): some pyannote.audio versions seem to return None instead
    # of raising when the gated model cannot be fetched - fail clearly here
    # rather than with an AttributeError on ``.to``.
    if pipeline is None:
        raise RuntimeError(
            f"模型加载失败: {MODEL_ID} (请确认 Token 有效并已接受模型使用条款)"
        )
    return pipeline.to(device)


def _process_file(audio_file, pipeline, token):
    """Download one dataset file, extract its main speaker and save the result.

    Args:
        audio_file: repo-relative path of the WAV file inside the dataset.
        pipeline: loaded diarization pipeline, called with the local path.
        token: Hugging Face access token used for the download.

    Returns:
        ``{'source': audio_file, 'duration': seconds}`` on success, ``None``
        when the file was skipped or failed. Failures are printed, never
        raised, so one bad file does not stop the batch.
    """
    from huggingface_hub import hf_hub_download
    import librosa
    import soundfile as sf

    audio_path = None
    try:
        # Use the path returned by hf_hub_download: with ``local_dir`` the
        # file keeps its repo-relative sub-path, so rebuilding the location
        # from the bare file name breaks for files in sub-directories.
        audio_path = Path(hf_hub_download(
            repo_id=DATASET_ID,
            filename=audio_file,
            repo_type="dataset",
            local_dir=str(DOWNLOAD_DIR),
            token=token,
            cache_dir=str(CACHE_DIR),
        ))

        # Diarize.
        result = pipeline(audio_path)
        # NOTE(review): newer pyannote.audio releases may wrap the annotation
        # in an output object exposing ``speaker_diarization`` - confirm.
        annotation = getattr(result, "speaker_diarization", result)
        turns = [
            (turn.start, turn.end, speaker)
            for turn, _, speaker in annotation.itertracks(yield_label=True)
        ]

        # Find the main speaker.
        main_speaker = pick_main_speaker(turns)
        if main_speaker is None:
            print(" ⚠️ 未检测到说话人,跳过", flush=True)
            return None
        print(f" 🎯 {main_speaker}", flush=True)

        # Extract and save that speaker's audio.
        samples, sr = librosa.load(audio_path, sr=SAMPLE_RATE)
        one_audio = extract_speaker_audio(samples, sr, turns, main_speaker)
        if one_audio is None:
            return None

        output_path = ONE_AUDIO_DIR / f"{Path(audio_file).stem}_one.wav"
        sf.write(output_path, one_audio, sr)

        duration = len(one_audio) / sr
        print(f" ✅ {duration:.1f}s", flush=True)
        return {'source': audio_file, 'duration': duration}
    except Exception as e:
        # Best effort per file: report a short message and move on.
        print(f" ❌ {str(e)[:50]}", flush=True)
        return None
    finally:
        # Always remove the download, including after a failure, so the
        # download directory does not fill up over a long batch.
        if audio_path is not None and audio_path.exists():
            audio_path.unlink()


def main():
    """Run the whole extraction batch; exits with status 1 on fatal errors."""
    print("=" * 70)
    print("🚀 Numberblocks One 音色提取 - Docker 版本 (已修复)")
    print("=" * 70)

    # Check the environment variable.
    hf_token = os.environ.get("HF_TOKEN")
    if not hf_token:
        print("❌ 错误: HF_TOKEN 环境变量未设置")
        sys.exit(1)

    print("✅ Token 已配置")
    print()

    try:
        # Point the Hugging Face cache at /data BEFORE importing the hub
        # libraries: they resolve their cache location at import time, so
        # setting these variables afterwards has no effect.
        CACHE_DIR.mkdir(parents=True, exist_ok=True)
        os.environ['HF_HOME'] = str(CACHE_DIR)
        os.environ['HUGGINGFACE_HUB_CACHE'] = str(CACHE_DIR / 'hub')

        OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
        ONE_AUDIO_DIR.mkdir(parents=True, exist_ok=True)
        DOWNLOAD_DIR.mkdir(parents=True, exist_ok=True)

        # Import the heavy libraries (also surfaces missing packages early).
        print("📦 导入库...")
        from huggingface_hub import login, HfApi, hf_hub_download
        import torch
        from pyannote.audio import Pipeline
        import librosa
        import soundfile as sf
        print("✅ 库导入成功")
        print()

        # Pick the device.
        if torch.cuda.is_available():
            print(f"✅ GPU: {torch.cuda.get_device_name(0)}")
            device = torch.device("cuda")
        else:
            print("⚠️ 使用 CPU")
            device = torch.device("cpu")
        print()

        # Log in.
        print("🔐 登录 Hugging Face...")
        login(token=hf_token)
        print("✅ 登录成功")
        print()

        # Load the model.
        print("📥 加载模型 (需要几分钟)...")
        pipeline = _load_pipeline(hf_token, device)
        print("✅ 模型加载完成")
        print()

        # List the dataset files.
        print("📋 获取文件列表...")
        api = HfApi(token=hf_token)
        files = api.list_repo_files(DATASET_ID, repo_type="dataset")
        audio_files = sorted(f for f in files if f.endswith('.wav'))
        print(f"✅ 找到 {len(audio_files)} 个文件")
        print()

        # Process every file.
        print("=" * 70)
        print("🎵 开始处理")
        print("=" * 70)
        print()

        all_segments = []
        for idx, audio_file in enumerate(audio_files, 1):
            print(f"[{idx}/{len(audio_files)}] {audio_file}", flush=True)
            entry = _process_file(audio_file, pipeline, hf_token)
            if entry is not None:
                all_segments.append(entry)

        # Save the report.
        print()
        print("=" * 70)
        print("📊 生成报告")
        print("=" * 70)
        print()

        total_hours = sum(s['duration'] for s in all_segments) / 3600

        report = {
            'total_files': len(audio_files),
            'processed_files': len(all_segments),
            'total_one_audio_hours': total_hours,
            'segments': all_segments,
        }

        report_path = OUTPUT_DIR / "processing_report.json"
        with open(report_path, 'w', encoding="utf-8") as f:
            json.dump(report, f, indent=2)

        print(f"✅ 完成: {len(all_segments)}/{len(audio_files)}")
        print(f"🎵 音频: {total_hours:.2f} 小时")
        print()
        print("🎉 全部完成!")

    except Exception as e:
        # Top-level boundary: report, dump the traceback, signal failure.
        print()
        print("=" * 70)
        print(f"❌ 错误: {str(e)}")
        print("=" * 70)
        import traceback
        traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    main()