import os
import pandas as pd
from collections import defaultdict

def process_files_to_excel(file_paths, output_file):
    """
    Convert multiple text files into a single Excel file with specified structure.

    Args:
        file_paths (list): List of paths to input text files
        output_file (str): Path to output Excel file
    """

    # Prepare data for the DataFrame
    rows = []
    max_subjects = 0

    # Process each file
    for file_path in file_paths:
        # Extract filename without path and extension
        filename = os.path.basename(file_path)

        # Read file content
        with open(file_path, 'r', encoding='utf-8') as f:
            lines = f.readlines()

        # Process lines - extract first two columns only
        file_data = []
        for line in lines:
            if line.strip():  # Skip empty lines
                parts = line.strip().split('\t')
                if len(parts) >= 2:
                    # Remove <> brackets from URI if present
                    uri = parts[0].replace('<', '').replace('>', '')
                    label = parts[1]
                    file_data.append((uri, label))

        # Create row with filename and vocabulary (based on filename or content)
        row = {'File name': filename.replace('.annif','.txt')}

        row['Vocabulary'] = 'EuroVoc'
        # Determine vocabulary based on filename pattern
        # if 'eurovoc' in filename.lower():
        #     row['Vocabulary'] = 'EuroVoc'
        # elif 'det' in filename.lower():
        #     row['Vocabulary'] = 'DET'
        # else:
        #     row['Vocabulary'] = ''  # Default empty if not recognized

        # Add tag data
        for i, (uri, label) in enumerate(file_data):
            row[f'Tag label(en) {i+1}'] = label
            row[f'Tag id {i+1}'] = uri
        rows.append(row)
        max_subjects = max(max_subjects, len(file_data))

    # Create DataFrame with all columns in order
    all_columns = ['File name', 'Vocabulary']
    # Add pairs of columns for each subject
    for i in range(max_subjects):
        all_columns.extend([f'Tag label(en) {i+1}', f'Tag id {i+1}'])

    df = pd.DataFrame(rows, columns=all_columns)

    # Save to Excel
    df.to_excel(output_file, index=False)
    print(f"Excel file saved as: {output_file}")

def main():
    # Example usage - replace with your actual file paths
    # You can also pass a directory to automatically find all txt files

    # Option 1: Specify exact file paths
    # file_paths = [
    #     'GS0001.txt',
    #     'GS0002.txt',
    #     'GS0003.txt',
    #     'GS0004.txt',
    #     'GS0005.txt'
    # ]

    # Option 2: Automatically find all .txt files in current directory
    file_paths = [f for f in os.listdir('.') if f.endswith('.annif')]

    output_file = 'output.xlsx'

    process_files_to_excel(file_paths, output_file)

if __name__ == "__main__":
    main()