import csv

from rdflib import Graph
from rdflib.namespace import SKOS

# Raise the csv module's per-field size limit above its default; presumably
# needed because the 'text' column can be very long -- confirm against the data.
csv.field_size_limit(131072 * 100)


def load_eurovoc_notations(graph_filename):
    """Build a notation -> concept URI lookup from an RDF/XML file.

    Args:
        graph_filename (str): Path to the RDF file, parsed with format='xml'.

    Returns:
        dict: Maps ``str(notation)`` to ``str(concept)`` for every
        ``skos:notation`` statement in the graph. If the same notation
        occurs more than once, the last one iterated wins.
    """
    g = Graph()
    g.parse(graph_filename, format='xml')
    print("Graph parsed")
    return {
        str(notation): str(concept)
        for concept, notation in g.subject_objects(SKOS.notation)
    }


def convert_ids_to_uris(ids):
    """Translate identifiers into URI tokens for the output file.

    Reads the module-level ``notatation_uri_map`` (assigned in the
    ``__main__`` block below), so that mapping must exist before this
    function is called.

    Args:
        ids (list): Identifier strings to translate.

    Returns:
        list: One entry per identifier that could be handled. Identifiers
        missing from the map are reported on stdout and skipped.
    """
    uris = []
    for identifier in ids:
        if len(identifier) == 6:
            # NOTE(review): this literal is empty, so 6-character identifiers
            # contribute an empty entry to the result. It looks like a URI
            # template was lost here -- confirm the intended value.
            uris.append(f'')
            continue

        # .get() instead of indexing: an unknown notation must reach the
        # "could not map" branch rather than abort the run with KeyError.
        mapped_uri = notatation_uri_map.get(identifier)
        if mapped_uri is not None:
            uris.append("<" + mapped_uri + ">")
        else:
            print(f"Could not map notation {identifier} to uri")
    return uris


def cleanup(text):
    """Replace double quotes with spaces and collapse all whitespace runs.

    Leading and trailing whitespace is dropped, and every internal run of
    whitespace (including newlines) becomes a single space, so the result
    can be wrapped in double quotes on one output line.
    """
    return " ".join(text.replace('"', ' ').split())


def convert_eurovoc_csv(input_file, output_file):
    """
    Convert CSV file with eurovoc subject URIs to proper format.

    The output starts with a fixed header line. Each input row is then
    written as one comma-joined line in which 'title' and 'text' are
    cleaned and double-quoted, and 'subject_uris' (a ';'-separated list of
    identifiers) is replaced by a double-quoted, space-separated list of
    URI tokens.

    Args:
        input_file (str): Path to input CSV file
        output_file (str): Path to output CSV file
    """
    # Open the output once and stream rows into it, rather than reopening
    # the file in append mode for every row.
    with open(output_file, 'w', newline='', encoding='utf-8') as outfile:
        # The header is written before the input is opened, so a missing
        # input file still leaves a header-only output file behind.
        print("document_id,url,date,type,title,text,subject_uris", file=outfile)

        with open(input_file, 'r', newline='', encoding='utf-8') as infile:
            for row in csv.DictReader(infile):
                processed_row = row.copy()

                # Quote the title and text fields
                processed_row['title'] = f'"{cleanup(row["title"])}"'
                processed_row['text'] = f'"{cleanup(row["text"])}"'

                # Process subject_uris - convert IDs to full URIs
                if row['subject_uris']:
                    uris = convert_ids_to_uris(row['subject_uris'].split(';'))
                    processed_row['subject_uris'] = '"' + ' '.join(uris) + '"'
                else:
                    processed_row['subject_uris'] = '""'

                # Values are emitted in the input's column order and the
                # remaining fields are written unquoted.
                # NOTE(review): this assumes the input columns match the
                # header above, in that order -- confirm against the data.
                print(",".join(processed_row.values()), file=outfile)


# Usage example:
if __name__ == "__main__":
    notatation_uri_map = load_eurovoc_notations("eurovoc-skos-ap-eu.rdf")
    convert_eurovoc_csv("european_parliament_eurovoc_2025_en.csv",
                        'european_parliament_eurovoc_2025_en_converted.csv')