"""
Turkmen Word2Vec Model

This script preprocesses Turkmen text data and trains a Word2Vec model.
It's designed for open-source use and easy adaptation to other projects.

Requirements:
- Python 3.6+
- Dependencies: nltk, gensim, tqdm

Usage:
1. Prepare your Turkmen text data in a file (one sentence per line).
2. Update the CONFIG dictionary with your desired parameters.
3. Run the script: python turkmen_word2vec.py

The script will preprocess the data, train the model, and save it for
future use.
"""

import logging
import multiprocessing
import re
from concurrent.futures import ProcessPoolExecutor
from pathlib import Path
from typing import List, Dict, Tuple

import nltk
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tqdm import tqdm

# Configuration
CONFIG = {
    "input_file": "path/to/your/input/file.txt",
    "output_dir": "path/to/output/directory",
    "model_name": "turkmen_word2vec",
    "vector_size": 300,
    "window": 5,
    "min_count": 15,
    "sg": 1,
    "epochs": 10,
    "negative": 15,
    "sample": 1e-5,
}

# Setup logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

# Ensure required NLTK data is available.
# Newer NLTK releases load the tokenizer tables from 'punkt_tab' rather than
# 'punkt'; requesting both keeps word_tokenize usable on either generation.
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)

# Load stop words (using Turkish as a close approximation to Turkmen)
STOP_WORDS = set(stopwords.words('turkish'))

# Character replacements for Turkmen-specific letters
# NOTE(review): the Turkmen letter 'ž'/'Ž' is not listed here, so the letter
# filter below turns it into a space and splits the word — confirm whether it
# should be mapped (e.g. to 'z' or 'zh').
REPLACEMENTS = {
    'ä': 'a', 'ç': 'ch', 'ö': 'o', 'ü': 'u', 'ň': 'n', 'ý': 'y', 'ğ': 'g',
    'ş': 's', 'Ç': 'Ch', 'Ö': 'O', 'Ü': 'U', 'Ä': 'A', 'Ň': 'N', 'Ş': 'S',
    'Ý': 'Y', 'Ğ': 'G'
}

# Anything that is not an ASCII letter or a space becomes a space.
# Compiled once because it is applied to every sentence.
_NON_LETTER_RE = re.compile(r'[^a-zA-Z ]')


def _fold_to_ascii(text: str) -> str:
    """Apply REPLACEMENTS, blank out non-letters, and lowercase the text."""
    for original, replacement in REPLACEMENTS.items():
        text = text.replace(original, replacement)
    return _NON_LETTER_RE.sub(' ', text).lower()


# Tokens are compared against the stop list *after* ASCII folding, so the stop
# words must be available in folded form too; otherwise every stop word that
# contains a diacritic could never match. Stop words whose folded form breaks
# into several pieces are skipped, since they cannot equal a single token.
_FOLDED_STOP_WORDS = STOP_WORDS | {
    folded
    for folded in (_fold_to_ascii(word) for word in STOP_WORDS)
    if ' ' not in folded
}


def preprocess_sentence(sentence: str) -> List[str]:
    """
    Preprocess a single sentence.

    The sentence is transliterated with REPLACEMENTS, stripped of everything
    except ASCII letters and spaces, lowercased and tokenized. Stop words and
    tokens of two characters or fewer are dropped.

    Args:
        sentence (str): Input sentence.

    Returns:
        List[str]: List of preprocessed tokens.
    """
    tokens = word_tokenize(_fold_to_ascii(sentence))
    return [
        word for word in tokens
        if word not in _FOLDED_STOP_WORDS and len(word) > 2
    ]


def process_chunk(chunk: List[str]) -> List[List[str]]:
    """
    Process a chunk of sentences.

    This is the unit of work submitted to each worker process.

    Args:
        chunk (List[str]): List of sentences to process.

    Returns:
        List[List[str]]: List of preprocessed sentences (as token lists).
    """
    return [preprocess_sentence(sentence) for sentence in chunk]


def load_and_preprocess(file_path: str) -> List[List[str]]:
    """
    Load and preprocess the input file.

    The file is read line by line (one sentence per line), split into chunks
    and preprocessed in parallel worker processes. The order of the returned
    sentences matches the order of the lines in the file.

    Args:
        file_path (str): Path to the input file.

    Returns:
        List[List[str]]: List of preprocessed sentences (as token lists).
            Empty if the input file has no lines.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        sentences = f.readlines()

    if not sentences:
        logging.warning("Input file %s contains no lines", file_path)
        return []

    # Floor division yields 0 when there are fewer lines than CPU cores, and
    # range() rejects a zero step, so clamp the chunk size to at least 1.
    chunk_size = max(1, len(sentences) // multiprocessing.cpu_count())
    chunks = [
        sentences[i:i + chunk_size]
        for i in range(0, len(sentences), chunk_size)
    ]

    processed_sentences = []
    with ProcessPoolExecutor() as executor:
        results = executor.map(process_chunk, chunks)
        for chunk_result in tqdm(results, total=len(chunks),
                                 desc="Preprocessing"):
            processed_sentences.extend(chunk_result)

    return processed_sentences


def train_word2vec(sentences: List[List[str]], params: Dict) -> Word2Vec:
    """
    Train the Word2Vec model.

    Args:
        sentences (List[List[str]]): Preprocessed sentences.
        params (Dict): Model parameters. The keys 'vector_size', 'window',
            'min_count', 'sg', 'epochs', 'negative' and 'sample' are read.

    Returns:
        Word2Vec: Trained Word2Vec model.
    """
    model = Word2Vec(
        sentences=sentences,
        vector_size=params['vector_size'],
        window=params['window'],
        min_count=params['min_count'],
        workers=multiprocessing.cpu_count(),
        sg=params['sg'],
        epochs=params['epochs'],
        negative=params['negative'],
        sample=params['sample'],
        # Needed so that save_model can report the training loss.
        compute_loss=True,
    )
    return model


def save_model(model: Word2Vec, output_dir: str, model_name: str) -> None:
    """
    Save the trained model and its metadata.

    Writes "<model_name>.model" and "<model_name>_metadata.txt" into
    output_dir, creating the directory if it does not exist.

    Args:
        model (Word2Vec): Trained Word2Vec model.
        output_dir (str): Directory to save the model.
        model_name (str): Name of the model.
    """
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    model_path = output_path / f"{model_name}.model"
    model.save(str(model_path))
    logging.info("Model saved to %s", model_path)

    # Save model metadata
    metadata_path = output_path / f"{model_name}_metadata.txt"
    with open(metadata_path, 'w', encoding='utf-8') as f:
        f.write(f"Model: {model_name}\n")
        f.write(f"Vocabulary size: {len(model.wv.key_to_index)}\n")
        f.write(f"Vector size: {model.vector_size}\n")
        f.write(f"Window size: {model.window}\n")
        f.write(f"Min count: {model.min_count}\n")
        f.write(f"Training epochs: {model.epochs}\n")
        f.write(f"Final training loss: {model.get_latest_training_loss()}\n")
    logging.info("Model metadata saved to %s", metadata_path)


def main():
    """Main execution function."""
    logging.info("Starting Turkmen Word2Vec model training")

    # Load and preprocess data
    processed_sentences = load_and_preprocess(CONFIG['input_file'])
    logging.info("Preprocessed %d sentences", len(processed_sentences))

    # Train model
    model = train_word2vec(processed_sentences, CONFIG)
    logging.info("Model training completed")

    # Save model and metadata
    save_model(model, CONFIG['output_dir'], CONFIG['model_name'])
    logging.info("Process completed successfully")


if __name__ == "__main__":
    main()