#!/usr/bin/env python3
"""Split giselle_60min.txt into ~10K char chunks on paragraph boundaries.

Paragraphs (blocks separated by a blank line) are packed greedily into
chunks no larger than TARGET characters, counting the blank-line separators
that join them. A single paragraph longer than TARGET is never split; it
becomes its own oversized chunk. Each chunk is written to
OUTDIR/chunk_NNN.txt and a tab-separated manifest (file name, size in
characters) is written to OUTDIR/manifest.txt.
"""
import os

SRC = "/home/runner/.openclaw/workspace/scripts/giselle_60min.txt"
OUTDIR = "/tmp/tts_chunks"
TARGET = 10000  # ~10K per chunk

# Paragraphs are split on, and re-joined with, a blank line.
PARAGRAPH_SEP = "\n\n"


def split_paragraphs(text: str) -> list:
    """Return the non-empty, whitespace-stripped paragraphs of *text*.

    Only a literal double newline is treated as a paragraph break; blocks
    that are empty after stripping (e.g. from runs of blank lines) are
    dropped.
    """
    stripped = (block.strip() for block in text.split(PARAGRAPH_SEP))
    return [block for block in stripped if block]


def chunk_paragraphs(paragraphs: list, target: int = TARGET) -> list:
    """Greedily pack *paragraphs* (strings) into chunks of at most *target* chars.

    Paragraphs within a chunk are joined with a blank line, and that
    separator is included in the size accounting so that an emitted chunk
    only exceeds *target* when a single paragraph is itself longer than
    *target*.

    Returns the list of chunk strings, in input order. Empty input yields
    an empty list.
    """
    chunks = []
    current = []
    current_size = 0
    sep_len = len(PARAGRAPH_SEP)

    for paragraph in paragraphs:
        # The first paragraph of a chunk has no separator in front of it.
        added = len(paragraph) + (sep_len if current else 0)
        if current and current_size + added > target:
            chunks.append(PARAGRAPH_SEP.join(current))
            current = [paragraph]
            current_size = len(paragraph)
        else:
            current.append(paragraph)
            current_size += added

    if current:
        chunks.append(PARAGRAPH_SEP.join(current))
    return chunks


def estimate_minutes(sizes) -> int:
    """Return the rough time estimate for chunks of the given sizes.

    Each chunk counts as one minute per 2000 characters (integer division),
    with a floor of 5 minutes per chunk.
    """
    return sum(max(5, size // 2000) for size in sizes)


def main() -> None:
    """Read SRC, write the chunk files and manifest to OUTDIR, print a summary."""
    os.makedirs(OUTDIR, exist_ok=True)

    # Explicit UTF-8 so the result does not depend on the runner's locale.
    with open(SRC, encoding="utf-8") as f:
        text = f.read()

    chunks = chunk_paragraphs(split_paragraphs(text))

    # Write chunks
    manifest = []
    for i, chunk in enumerate(chunks):
        fname = f"chunk_{i:03d}.txt"
        path = os.path.join(OUTDIR, fname)
        with open(path, "w", encoding="utf-8") as f:
            f.write(chunk)
        sz = len(chunk)
        manifest.append((fname, sz))
        print(f" {fname}: {sz} chars")

    total = sum(sz for _, sz in manifest)
    print(f"\nTotal: {len(chunks)} chunks, {total} chars")
    print(f"\nEstimated time: ~{estimate_minutes(sz for _, sz in manifest)} min total")

    # Save manifest
    with open(os.path.join(OUTDIR, "manifest.txt"), "w", encoding="utf-8") as f:
        for fname, sz in manifest:
            f.write(f"{fname}\t{sz}\n")


if __name__ == "__main__":
    main()