"""
Turkmen Word2Vec Model

This script preprocesses Turkmen text data and trains a Word2Vec model.
It's designed for open-source use and easy adaptation to other projects.

Requirements:
- Python 3.6+
- Dependencies: nltk, gensim (4.0 or newer), tqdm

Usage:
1. Prepare your Turkmen text data in a file (one sentence per line).
2. Update the CONFIG dictionary with your desired parameters.
3. Run the script: python turkmen_word2vec.py

The script will preprocess the data, train the model, and save it for future use.
"""

import re
import nltk
import logging
import multiprocessing
from pathlib import Path
from typing import List, Dict, Tuple

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from tqdm import tqdm
from concurrent.futures import ProcessPoolExecutor
from gensim.models import Word2Vec

# Configuration
# The two paths and the model name control where data is read from and where
# results are written; every other entry is forwarded to gensim's Word2Vec
# constructor under the same keyword name.
CONFIG = dict(
    input_file="path/to/your/input/file.txt",   # corpus, one sentence per line
    output_dir="path/to/output/directory",      # created if it does not exist
    model_name="turkmen_word2vec",              # base name for the saved files
    vector_size=300,    # dimensionality of the word vectors
    window=5,           # context window size
    min_count=15,       # words rarer than this are dropped from the vocabulary
    sg=1,               # 1 = skip-gram, 0 = CBOW
    epochs=10,          # passes over the corpus
    negative=15,        # number of negative samples
    sample=1e-5,        # down-sampling threshold for frequent words
)

# Setup logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Ensure required NLTK data is available
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)
# Newer NLTK releases load the word_tokenize model from 'punkt_tab' rather
# than 'punkt'; fetch both so tokenization works regardless of the installed
# version. With quiet=True a failed download is silent, as for the others.
nltk.download('punkt_tab', quiet=True)

# Character replacements for Turkmen-specific letters.
# Every key must be a single character. 'ž'/'Ž' are part of the Turkmen
# alphabet; without an entry here the ASCII-only filter applied during
# preprocessing would turn them into spaces and split the word in two.
REPLACEMENTS = {
    'ä': 'a', 'ç': 'ch', 'ö': 'o', 'ü': 'u', 'ň': 'n', 'ý': 'y', 'ğ': 'g', 'ş': 's', 'ž': 'z',
    'Ç': 'Ch', 'Ö': 'O', 'Ü': 'U', 'Ä': 'A', 'Ň': 'N', 'Ş': 'S', 'Ý': 'Y', 'Ğ': 'G', 'Ž': 'Z'
}

# Translation table equivalent to applying every REPLACEMENTS entry in turn.
_TRANSLITERATION = str.maketrans(REPLACEMENTS)

# Load stop words (using Turkish as a close approximation to Turkmen).
# Tokens are compared against this set *after* they have been transliterated
# to ASCII, so the transliterated spelling of each stop word is added too;
# otherwise any stop word containing a diacritic (e.g. 'çok') could never
# match. The original spellings are kept for backward compatibility.
STOP_WORDS = set(stopwords.words('turkish'))
STOP_WORDS |= {word.translate(_TRANSLITERATION) for word in STOP_WORDS}

def preprocess_sentence(sentence: str) -> List[str]:
    """
    Turn one raw sentence into a list of cleaned tokens.

    The steps, in order: apply every REPLACEMENTS substitution, replace any
    character that is not an ASCII letter or a space with a space, lowercase
    the text, tokenize it with NLTK, and finally drop stop words and tokens
    of two characters or fewer.

    Args:
        sentence (str): Input sentence.

    Returns:
        List[str]: The tokens that survive filtering, in their original order.
    """
    text = sentence
    for special_char in REPLACEMENTS:
        text = text.replace(special_char, REPLACEMENTS[special_char])

    # Strip everything outside [a-zA-Z ] first, then lowercase what remains.
    text = re.sub(r'[^a-zA-Z ]', ' ', text).lower()

    kept = []
    for token in word_tokenize(text):
        if len(token) <= 2 or token in STOP_WORDS:
            continue
        kept.append(token)
    return kept

def process_chunk(chunk: List[str]) -> List[List[str]]:
    """
    Preprocess every sentence of a chunk, one after another.

    This is the unit of work handed to the process pool: parallelism comes
    from several chunks being processed at once, not from within a chunk.

    Args:
        chunk (List[str]): Sentences to preprocess.

    Returns:
        List[List[str]]: One token list per input sentence, in input order.
    """
    return list(map(preprocess_sentence, chunk))

def load_and_preprocess(file_path: str) -> List[List[str]]:
    """
    Load the input file and preprocess it in parallel.

    The file is read as UTF-8, one sentence per line, split into roughly
    one chunk per CPU, and each chunk is preprocessed in a worker process.

    Args:
        file_path (str): Path to the input file.

    Returns:
        List[List[str]]: One token list per input line, in file order.
        An empty file yields an empty list.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        sentences = f.readlines()

    # Nothing to do for an empty file; avoid starting a process pool.
    if not sentences:
        return []

    # Ceiling division keeps the chunk size at least 1 even when there are
    # fewer sentences than CPUs (floor division would give 0 and make
    # range() fail), and yields at most one chunk per CPU.
    worker_count = multiprocessing.cpu_count()
    chunk_size = max(1, (len(sentences) + worker_count - 1) // worker_count)
    chunks = [sentences[i:i + chunk_size] for i in range(0, len(sentences), chunk_size)]

    processed_sentences = []
    with ProcessPoolExecutor() as executor:
        # executor.map yields results in submission order, so file order is kept.
        for chunk_result in tqdm(executor.map(process_chunk, chunks), total=len(chunks), desc="Preprocessing"):
            processed_sentences.extend(chunk_result)

    return processed_sentences

def train_word2vec(sentences: List[List[str]], params: Dict) -> Word2Vec:
    """
    Build and train a Word2Vec model on the tokenized corpus.

    Args:
        sentences (List[List[str]]): Preprocessed sentences (token lists).
        params (Dict): Must contain the keys 'vector_size', 'window',
            'min_count', 'sg', 'epochs', 'negative' and 'sample'; extra
            keys are ignored.

    Returns:
        Word2Vec: The trained model. Loss tracking is enabled, and one
        worker thread per CPU is used.
    """
    forwarded_keys = ('vector_size', 'window', 'min_count', 'sg',
                      'epochs', 'negative', 'sample')
    hyperparameters = {key: params[key] for key in forwarded_keys}

    return Word2Vec(
        sentences=sentences,
        workers=multiprocessing.cpu_count(),
        compute_loss=True,
        **hyperparameters,
    )

def save_model(model: Word2Vec, output_dir: str, model_name: str) -> None:
    """
    Write the trained model and a plain-text metadata summary to disk.

    Two files are produced inside ``output_dir`` (created if missing):
    ``<model_name>.model`` and ``<model_name>_metadata.txt``.

    Args:
        model (Word2Vec): Trained Word2Vec model.
        output_dir (str): Directory to save the model.
        model_name (str): Name of the model, used as the file name stem.
    """
    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    model_file = target_dir / f"{model_name}.model"
    model.save(str(model_file))
    logging.info(f"Model saved to {model_file}")

    # Save model metadata, one "label: value" line per entry
    metadata_file = target_dir / f"{model_name}_metadata.txt"
    with open(metadata_file, 'w', encoding='utf-8') as out:
        print(f"Model: {model_name}", file=out)
        print(f"Vocabulary size: {len(model.wv.key_to_index)}", file=out)
        print(f"Vector size: {model.vector_size}", file=out)
        print(f"Window size: {model.window}", file=out)
        print(f"Min count: {model.min_count}", file=out)
        print(f"Training epochs: {model.epochs}", file=out)
        print(f"Final training loss: {model.get_latest_training_loss()}", file=out)
    logging.info(f"Model metadata saved to {metadata_file}")

def main():
    """Run the whole pipeline: preprocess the corpus, train, then save."""
    logging.info("Starting Turkmen Word2Vec model training")

    # Step 1: read the input file named in CONFIG and tokenize it
    corpus = load_and_preprocess(CONFIG['input_file'])
    logging.info(f"Preprocessed {len(corpus)} sentences")

    # Step 2: train using the hyperparameters in CONFIG
    trained_model = train_word2vec(corpus, CONFIG)
    logging.info("Model training completed")

    # Step 3: persist the model and its metadata
    save_model(
        trained_model,
        output_dir=CONFIG['output_dir'],
        model_name=CONFIG['model_name'],
    )

    logging.info("Process completed successfully")

if __name__ == "__main__":
    main()