import os import pandas as pd from collections import defaultdict def process_files_to_excel(file_paths, output_file): """ Convert multiple text files into a single Excel file with specified structure. Args: file_paths (list): List of paths to input text files output_file (str): Path to output Excel file """ # Prepare data for the DataFrame rows = [] max_subjects = 0 # Process each file for file_path in file_paths: # Extract filename without path and extension filename = os.path.basename(file_path) # Read file content with open(file_path, 'r', encoding='utf-8') as f: lines = f.readlines() # Process lines - extract first two columns only file_data = [] for line in lines: if line.strip(): # Skip empty lines parts = line.strip().split('\t') if len(parts) >= 2: # Remove <> brackets from URI if present uri = parts[0].replace('<', '').replace('>', '') label = parts[1] file_data.append((uri, label)) # Create row with filename and vocabulary (based on filename or content) row = {'File name': filename.replace('.annif','.txt')} row['Vocabulary'] = 'EuroVoc' # Determine vocabulary based on filename pattern # if 'eurovoc' in filename.lower(): # row['Vocabulary'] = 'EuroVoc' # elif 'det' in filename.lower(): # row['Vocabulary'] = 'DET' # else: # row['Vocabulary'] = '' # Default empty if not recognized # Add tag data for i, (uri, label) in enumerate(file_data): row[f'Tag label(en) {i+1}'] = label row[f'Tag id {i+1}'] = uri rows.append(row) max_subjects = max(max_subjects, len(file_data)) # Create DataFrame with all columns in order all_columns = ['File name', 'Vocabulary'] # Add pairs of columns for each subject for i in range(max_subjects): all_columns.extend([f'Tag label(en) {i+1}', f'Tag id {i+1}']) df = pd.DataFrame(rows, columns=all_columns) # Save to Excel df.to_excel(output_file, index=False) print(f"Excel file saved as: {output_file}") def main(): # Example usage - replace with your actual file paths # You can also pass a directory to automatically find all txt files # Option 1: Specify exact file paths # file_paths = [ # 'GS0001.txt', # 'GS0002.txt', # 'GS0003.txt', # 'GS0004.txt', # 'GS0005.txt' # ] # Option 2: Automatically find all .txt files in current directory file_paths = [f for f in os.listdir('.') if f.endswith('.annif')] output_file = 'output.xlsx' process_files_to_excel(file_paths, output_file) if __name__ == "__main__": main()