"""Export English documents with EuroVoc concepts to a CSV file.

Streams the ``train`` split of ``EuropeanParliament/Eurovoc_2025``, keeps the
rows whose ``language`` field is ``"EN"`` and that carry at least one entry in
``eurovoc_concepts``, and writes the selected fields to ``eurovoc_en.csv``.
"""

import csv
import time

from datasets import load_dataset

# Stream the dataset to avoid a full download.
dataset = load_dataset(
    "EuropeanParliament/Eurovoc_2025",
    split="train",
    streaming=True,
)

output_file = "eurovoc_en.csv"

# Output columns, intended for Annif's CSV corpus format.
# NOTE(review): the concepts are joined with ";" below -- confirm that this is
# the separator (and the identifier form) the consuming tool expects.
output_columns = [
    "document_id",
    "url",
    "date",
    "type",
    "title",
    "text",
    "subject_uris",
]

# Number of rows written so far (rows that passed the filter).
count = 0

# newline="" lets the csv module control line endings itself.
with open(output_file, mode="w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=output_columns)
    writer.writeheader()

    for row in dataset:
        # `or []` also covers rows where the key exists but holds None; a
        # plain .get(key, []) would return None there and make len()/join()
        # raise TypeError.
        concepts = row.get("eurovoc_concepts") or []

        # Keep only English documents that have at least one concept.
        if row.get("language") != "EN" or not concepts:
            continue

        # Map dataset fields to output columns.
        writer.writerow(
            {
                "document_id": row.get("reference", ""),
                "url": row.get("url", ""),
                "date": row.get("date", ""),
                "type": row.get("type", ""),
                "title": row.get("title", ""),
                "text": row.get("text", ""),
                "subject_uris": ";".join(concepts),
            }
        )
        count += 1

        # Pause for one second after every 100 written rows.
        # NOTE(review): presumably a throttle for the streaming backend --
        # confirm it is still needed.
        if count % 100 == 0:
            print(f"Processed {count} rows, sleeping 1 seconds...")
            time.sleep(1)

# Reported after the `with` block, i.e. once the file has been closed.
print(f"✅ Saved {count} rows to {output_file}")