#!/usr/bin/env python3
"""
LAION dataset download script.

This script helps download different versions of the LAION dataset.
The LAION dataset is large; usually both metadata files and image files
have to be downloaded.

Note: the full LAION dataset is very large (several TB). Downloading a
subset or using an existing cache is recommended.
"""

import argparse
import json
import os
import subprocess
import sys
from pathlib import Path

import pandas as pd
import requests
from tqdm import tqdm

# Supported LAION-Aesthetic subsets: metadata shard URL plus the description
# printed to the user. Shared by download_laion_aesthetic() and the CLI so the
# accepted --subset values cannot drift from the table.
_AESTHETIC_DATASETS = {
    "6.5+": {
        "metadata": "https://huggingface.co/datasets/laion/laion-aesthetic-6.5plus/resolve/main/data/00000.parquet",
        "description": "LAION-Aesthetic 6.5+ (美学评分6.5分以上)",
    },
    "7.0+": {
        "metadata": "https://huggingface.co/datasets/laion/laion-aesthetic-7.0plus/resolve/main/data/00000.parquet",
        "description": "LAION-Aesthetic 7.0+ (美学评分7.0分以上)",
    },
}


def download_file(url, output_path, chunk_size=8192, timeout=30):
    """Download *url* to *output_path* while showing a progress bar.

    The body is streamed into ``<output_path>.part`` and only moved onto
    *output_path* once the transfer has finished, so a failed or interrupted
    download never leaves a truncated file at the final location.

    Args:
        url: Address of the file to fetch.
        output_path: Destination path of the downloaded file.
        chunk_size: Number of bytes requested per streamed chunk.
        timeout: Value passed to ``requests.get(timeout=...)``.

    Returns:
        *output_path*, unchanged.

    Raises:
        requests.RequestException: If the request fails or the server answers
            with an error status.
        OSError: If the file cannot be written or moved into place.
    """
    partial_path = f"{output_path}.part"
    try:
        # The context manager releases the connection even when streaming
        # is aborted half-way.
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()
            total_size = int(response.headers.get('content-length', 0))

            with open(partial_path, 'wb') as f, tqdm(
                desc=os.path.basename(output_path),
                # No Content-Length header -> unknown total instead of "0".
                total=total_size or None,
                unit='B',
                unit_scale=True,
                unit_divisor=1024,
            ) as pbar:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    if not chunk:
                        continue
                    f.write(chunk)
                    pbar.update(len(chunk))

        os.replace(partial_path, output_path)
    except BaseException:
        # Also covers Ctrl-C: drop the incomplete temporary file, then let
        # the original error propagate.
        try:
            os.remove(partial_path)
        except OSError:
            pass
        raise

    return output_path


def _save_dataset_info(output_dir, sample_info):
    """Write *sample_info* to ``dataset_info.json`` in *output_dir* and print its path."""
    info_path = os.path.join(output_dir, "dataset_info.json")
    with open(info_path, "w", encoding="utf-8") as f:
        json.dump(sample_info, f, indent=2)
    print(f"数据集信息已保存到: {info_path}")


def download_laion_aesthetic(output_dir="./data/laion", subset="6.5+"):
    """Download the first metadata shard of a LAION-Aesthetic subset.

    The shard is stored as ``metadata.parquet`` in *output_dir*, read back
    with pandas as a sanity check, and summarised in ``dataset_info.json``.

    Args:
        output_dir: Output directory (created if missing).
        subset: Subset version, either "6.5+" (score 6.5 and above) or
            "7.0+" (score 7.0 and above).

    Returns:
        True on success; False for an unsupported subset or when the
        download / verification raised an exception.
    """
    os.makedirs(output_dir, exist_ok=True)

    if subset not in _AESTHETIC_DATASETS:
        print(f"错误: 不支持的子集 {subset}")
        print(f"可用子集: {list(_AESTHETIC_DATASETS.keys())}")
        return False

    dataset_info = _AESTHETIC_DATASETS[subset]
    print(f"下载 {dataset_info['description']}")

    # Download the metadata file.
    metadata_url = dataset_info["metadata"]
    metadata_path = os.path.join(output_dir, "metadata.parquet")

    print(f"下载元数据文件到: {metadata_path}")
    try:
        download_file(metadata_url, metadata_path)
        print(f"元数据文件下载完成: {metadata_path}")

        # Verify the file by loading it.
        df = pd.read_parquet(metadata_path)
        print(f"元数据包含 {len(df)} 条记录")
        print(f"列名: {list(df.columns)}")

        # Persist a short summary of what was downloaded.
        sample_info = {
            "total_samples": len(df),
            "columns": list(df.columns),
            "subset": subset,
            "description": dataset_info["description"],
        }
        _save_dataset_info(output_dir, sample_info)
        return True

    except Exception as e:
        # Boundary of the download step: report and signal failure to the caller.
        print(f"下载失败: {e}")
        return False


def download_laion_5b_sample(output_dir="./data/laion", num_samples=10000):
    """Download one LAION-5B metadata shard and keep a random sample of it.

    The raw shard is stored as ``metadata_sample.parquet``; the (possibly
    down-sampled) table is stored as ``metadata.parquet`` and summarised in
    ``dataset_info.json``.

    Args:
        output_dir: Output directory (created if missing).
        num_samples: Maximum number of rows to keep. Sampling uses a fixed
            ``random_state`` of 42; shards with no more rows than this are
            kept whole.

    Returns:
        True on success, False when any step raised an exception.
    """
    os.makedirs(output_dir, exist_ok=True)

    print(f"下载LAION-5B数据集样本 ({num_samples}条记录)")

    # Example shard URL of the LAION-5B dataset.
    # Note: the full dataset has tens of thousands of shards; only a single
    # sample shard is downloaded here.
    sample_shard = "https://huggingface.co/datasets/laion/laion2b-en/resolve/main/part-00000-5b54c5d5-bbcf-484d-a2ce-0d6f73df1a36-c000.snappy.parquet"

    metadata_path = os.path.join(output_dir, "metadata_sample.parquet")

    print(f"下载样本分片到: {metadata_path}")
    try:
        download_file(sample_shard, metadata_path)
        print(f"样本分片下载完成: {metadata_path}")

        # Read the shard and remember its size before sampling, so the file
        # does not have to be parsed a second time for the summary.
        df = pd.read_parquet(metadata_path)
        original_samples = len(df)
        if original_samples > num_samples:
            df = df.sample(num_samples, random_state=42)

        # Save the sampled data.
        sampled_path = os.path.join(output_dir, "metadata.parquet")
        df.to_parquet(sampled_path)

        print(f"采样数据保存到: {sampled_path}")
        print(f"采样后包含 {len(df)} 条记录")
        print(f"列名: {list(df.columns)}")

        # Persist a short summary of what was downloaded.
        sample_info = {
            "total_samples": len(df),
            "original_samples": original_samples,
            "columns": list(df.columns),
            "dataset": "LAION-5B-sample",
            "description": f"LAION-5B数据集样本 ({num_samples}条记录)",
        }
        _save_dataset_info(output_dir, sample_info)
        return True

    except Exception as e:
        # Boundary of the download step: report and signal failure to the caller.
        print(f"下载失败: {e}")
        return False


def create_dummy_dataset(output_dir="./data/laion", num_samples=100):
    """Create a dummy dataset for testing.

    Writes a synthetic ``metadata.parquet``, an empty ``images`` directory
    and a ``dataset_info.json`` summary into *output_dir*. No image files
    are produced.

    Args:
        output_dir: Output directory (created if missing).
        num_samples: Number of synthetic rows to generate.

    Returns:
        True.
    """
    os.makedirs(output_dir, exist_ok=True)

    print(f"创建虚拟数据集 ({num_samples}条记录)")

    import numpy as np

    # Build the synthetic metadata columns.
    data = {
        'url': [f'https://example.com/image_{i}.jpg' for i in range(num_samples)],
        'caption': [f'A beautiful image number {i}' for i in range(num_samples)],
        'aesthetic_score': np.random.uniform(5.0, 9.0, num_samples),
        'watermark_prob': np.random.uniform(0.0, 1.0, num_samples),
        'NSFW': ['UNLIKELY'] * num_samples,
        'image_file': [f'image_{i:06d}.jpg' for i in range(num_samples)],
    }

    df = pd.DataFrame(data)

    # Save the metadata.
    metadata_path = os.path.join(output_dir, "metadata.parquet")
    df.to_parquet(metadata_path)

    print(f"虚拟元数据创建完成: {metadata_path}")
    print(f"包含 {len(df)} 条记录")
    print(f"列名: {list(df.columns)}")

    # Create the (empty) image directory.
    images_dir = os.path.join(output_dir, "images")
    os.makedirs(images_dir, exist_ok=True)

    print(f"虚拟图像目录: {images_dir}")
    print("注意:虚拟数据集不包含实际图像文件,仅用于测试代码流程")

    # Persist a short summary of the generated dataset.
    sample_info = {
        "total_samples": len(df),
        "columns": list(df.columns),
        "dataset": "dummy",
        "description": f"虚拟测试数据集 ({num_samples}条记录)",
        "note": "不包含实际图像文件,仅用于测试代码流程",
    }
    _save_dataset_info(output_dir, sample_info)
    return True


def main():
    """Command-line entry point.

    With ``--dummy`` a synthetic dataset is generated; otherwise the selected
    LAION-Aesthetic subset is downloaded after an interactive confirmation.

    Returns:
        Process exit code: 0 on success or when the user declines the
        download, 1 when dataset creation / download failed.
    """
    parser = argparse.ArgumentParser(description="下载LAION数据集")
    parser.add_argument("--output-dir", default="./data/laion", help="输出目录")
    parser.add_argument("--subset", default="6.5+", choices=list(_AESTHETIC_DATASETS),
                        help="LAION-Aesthetic子集版本")
    # NOTE(review): --sample-size is accepted but no code path in main() calls
    # download_laion_5b_sample(), so the value is currently unused.
    parser.add_argument("--sample-size", type=int, default=10000,
                        help="LAION-5B样本大小")
    parser.add_argument("--dummy", action="store_true",
                        help="创建虚拟数据集用于测试")
    parser.add_argument("--dummy-size", type=int, default=100,
                        help="虚拟数据集大小")

    args = parser.parse_args()

    print("=" * 60)
    print("LAION数据集下载工具")
    print("=" * 60)

    if args.dummy:
        print("\n创建虚拟数据集模式...")
        success = create_dummy_dataset(args.output_dir, args.dummy_size)
    else:
        print("\n下载LAION-Aesthetic数据集...")
        print(f"输出目录: {args.output_dir}")
        print(f"子集版本: {args.subset}")

        print("\n注意:")
        print("1. LAION数据集很大,下载需要时间和存储空间")
        print("2. 元数据文件通常几百MB到几GB")
        print("3. 图像文件需要额外下载")
        print("4. 建议先使用虚拟数据集测试代码流程")

        try:
            response = input("\n是否继续? (y/n): ")
        except EOFError:
            # stdin closed (non-interactive run): treat as "no" instead of crashing.
            response = ""
        if response.lower() != 'y':
            print("取消下载")
            return 0

        success = download_laion_aesthetic(args.output_dir, args.subset)

    if success:
        print("\n" + "=" * 60)
        print("下载完成!")
        print("=" * 60)
        print("\n下一步:")
        print("1. 检查下载的文件:")
        print(f" ls -lh {args.output_dir}/")
        print("2. 测试数据集加载:")
        print(" python -c \"import pandas as pd; df=pd.read_parquet('{}'); print('记录数:', len(df))\"".format(
            os.path.join(args.output_dir, "metadata.parquet")))
        print("3. 运行测试脚本:")
        print(" python src/data/dataset.py")
        return 0

    print("\n下载失败,请检查错误信息")
    return 1


if __name__ == "__main__":
    sys.exit(main())