# tokenizer.py
from typing import List, Tuple, Optional, Union, Dict, Any
import os, json
from transformers import PreTrainedTokenizerFast

from .syllabic_pretokenizer import (
    Preprocessor,
    preprocess_and_segment_with_alignment,
    remap_offsets_to_raw,
)

class SyllabicTokenizerWrapper(PreTrainedTokenizerFast):
    """
    A HF-compatible tokenizer that FIRST applies the syllabic segmentation,
    then delegates to the underlying fast tokenizer from tokenizer.json.

    Required files in the same directory:
      - tokenizer.json, tokenizer_config.json, special_tokens_map.json
      - preprocess_config.json (keyword arguments for ``Preprocessor``)

    Attributes set by ``__init__``:
      - pre_cfg: dict loaded from preprocess_config.json
      - preprocessor: ``Preprocessor(**pre_cfg)`` used by every segmentation call
    """
    slow_tokenizer_class = None  # required by HF when no slow version exists

    def __init__(self, *args, **kwargs):
        """
        Build the fast tokenizer, then load the preprocessing configuration.

        Raises:
            FileNotFoundError: if ``tokenizer.json`` cannot be found next to
                ``name_or_path`` (when no ``tokenizer_file`` was supplied), or
                if ``preprocess_config.json`` is missing from the artifact
                directory.
        """
        # Ensure we load the fast tokenizer directly (no slow->fast conversion).
        name_or_path = kwargs.get("name_or_path") or (
            args[0] if args and isinstance(args[0], str) else None
        )
        if "tokenizer_file" not in kwargs and name_or_path:
            tf = os.path.join(name_or_path, "tokenizer.json")
            if not os.path.isfile(tf):
                raise FileNotFoundError(f"Expected tokenizer.json at {tf}")
            kwargs["tokenizer_file"] = tf

        super().__init__(*args, **kwargs)

        # Resolve the directory where the artifacts live. Prefer name_or_path
        # when it is a local directory; otherwise fall back to the directory
        # that holds the tokenizer file actually being loaded.
        hf_dir = kwargs.get("name_or_path") or getattr(self, "name_or_path", None)
        if not hf_dir or not os.path.isdir(hf_dir):
            tokenizer_file = (
                kwargs.get("tokenizer_file")
                or getattr(self, "tokenizer_file", None)
                or ""
            )
            hf_dir = os.path.dirname(tokenizer_file) or hf_dir or "."

        # Load preprocessing flags saved during training. The config is a
        # plain file inside the artifact directory; a "revision" is not a
        # path component and must not be joined onto it.
        cfg_path = os.path.join(hf_dir, "preprocess_config.json")
        if not os.path.exists(cfg_path):
            raise FileNotFoundError(
                f"Missing preprocess_config.json in {hf_dir}. "
                f"Did you save it during tokenizer training?"
            )
        with open(cfg_path, "r", encoding="utf-8") as f:
            self.pre_cfg = json.load(f)

        self.preprocessor = Preprocessor(**self.pre_cfg)

    # --- core segmentation helpers ---
    def _segment_one(self, text: str) -> Tuple[str, List[Optional[int]]]:
        """
        Run ``preprocess_and_segment_with_alignment`` on one raw string.

        Returns the helper's result unchanged; by the annotation this is the
        segmented text plus an alignment list of optional ints.
        """
        return preprocess_and_segment_with_alignment(text, self.preprocessor)

    # --- public API overrides ---
    def __call__(self, text: Union[str, List[str]], **kwargs) -> Dict[str, Any]:
        """
        Segments -> calls the fast tokenizer (super) with segmented text.

        Args:
            text: a single string, or a list/tuple of strings (a batch).
            **kwargs: forwarded to ``PreTrainedTokenizerFast.__call__``,
                except ``return_offset_mapping`` (see note below).

        Returns:
            Whatever the parent ``__call__`` returns for the segmented input.

        Raises:
            TypeError: if ``text`` is neither a str nor a list/tuple.
        """
        # NOTE(review): return_offset_mapping is consumed here and NOT
        # forwarded, so no offsets are returned even when requested. Offsets
        # from the parent would refer to the segmented text; mapping them back
        # to the raw text presumably needs remap_offsets_to_raw together with
        # the alignment from _segment_one — TODO implement once confirmed.
        kwargs.pop("return_offset_mapping", False)

        if isinstance(text, str):
            segmented, _ = self._segment_one(text)
            return super().__call__(segmented, **kwargs)
        if isinstance(text, (list, tuple)):
            batch = [self._segment_one(item)[0] for item in text]
            return super().__call__(batch, **kwargs)
        raise TypeError("text must be str or List[str]")

    def tokenize(self, text: Union[str, List[str]], **kwargs):
        """
        Also intercept manual .tokenize() to ensure segmentation happens first.

        Args:
            text: a single string, or a list/tuple of strings.
            **kwargs: forwarded to the parent ``tokenize``.

        Returns:
            The parent's tokens for a single string; for a list/tuple, one
            flat list with the tokens of every item concatenated in order.

        Raises:
            TypeError: if ``text`` is neither a str nor a list/tuple.
        """
        if isinstance(text, str):
            segmented, _ = self._segment_one(text)
            return super().tokenize(segmented, **kwargs)
        # Accept tuples as well, matching what __call__ accepts.
        if isinstance(text, (list, tuple)):
            out: List[str] = []
            for item in text:
                segmented, _ = self._segment_one(item)
                out.extend(super().tokenize(segmented, **kwargs))
            return out
        raise TypeError("tokenize() expects str or List[str]")