# File size: 10,865 Bytes

# 064b576

# syllabic_pretokenizer.py

import re
from typing import List, Tuple, Optional

###############################################################################
# Configurable text preprocessor (spacing, lowercase) with alignment tracking
###############################################################################

class Preprocessor:
    """Configurable text normaliser that tracks character alignment.

    Optionally lowercases the input and surrounds apostrophes, digits and
    punctuation with spaces so they become stand-alone tokens. Alongside the
    normalised string it produces a per-character map back to the index of
    the originating character in the raw input.
    """

    def __init__(
        self,
        lowercase: bool = False,
        separate_apostrophes: bool = True,
        separate_digits: bool = True,
        separate_punctuation: bool = True,
    ):
        """
        Args:
            lowercase: Lowercase the text before classification.
            separate_apostrophes: Isolate ``’``, ``'`` and `````` with spaces.
            separate_digits: Isolate every digit with spaces.
            separate_punctuation: Isolate every character matched by the
                punctuation pattern with spaces.
        """
        self.lowercase = lowercase
        self.separate_apostrophes = separate_apostrophes
        self.separate_digits = separate_digits
        self.separate_punctuation = separate_punctuation

        # Precompiled single-character classifiers.
        self._apos_re = re.compile(r"[’'`]")
        # NOTE: "punctuation" is everything that is not an ASCII letter, an
        # ASCII digit, whitespace or one of the apostrophes above, so
        # non-ASCII letters are matched (and isolated) by this pattern too.
        self._punct_re = re.compile(r"[^A-Za-z0-9\s’'`]")
        self._digit_re = re.compile(r"\d")

    def _should_isolate(self, ch: str) -> bool:
        """Return True when *ch* must be surrounded by spaces."""
        return bool(
            (self.separate_apostrophes and self._apos_re.match(ch))
            or (self.separate_punctuation and self._punct_re.match(ch))
            or (self.separate_digits and self._digit_re.match(ch))
        )

    def preprocess_with_alignment(self, line: str) -> Tuple[str, List[Optional[int]]]:
        """Normalise *line* and return it with a map back to raw indices.

        Args:
            line: Raw input text.

        Returns:
            A ``(text, mapping)`` pair. ``mapping[k]`` is the index in *line*
            of the character that produced ``text[k]``, or ``None`` for a
            space. The result is passed through
            ``_collapse_whitespace_with_map``, so whitespace runs are
            collapsed and the text is stripped.
        """
        out_chars: List[str] = []
        out2raw: List[Optional[int]] = []

        def emit(ch: str, raw_idx: Optional[int]) -> None:
            out_chars.append(ch)
            out2raw.append(raw_idx)

        # Lowercase the whole string at once (this keeps str.lower()'s
        # context-sensitive rules), then consume it in step with the raw
        # characters. Lowercasing can turn one character into several
        # (e.g. U+0130), so indexing the lowered string directly would shift
        # every later alignment entry away from the true raw index.
        lowered = line.lower() if self.lowercase else line
        cursor = 0
        for raw_idx, raw_ch in enumerate(line):
            width = len(raw_ch.lower()) if self.lowercase else 1
            for ch in lowered[cursor:cursor + width]:
                if self._should_isolate(ch):
                    emit(" ", None)
                    emit(ch, raw_idx)
                    emit(" ", None)
                else:
                    emit(ch, raw_idx)
            cursor += width

        # Collapse whitespace to single spaces and strip, keeping alignment.
        return _collapse_whitespace_with_map(out_chars, out2raw)


def _collapse_whitespace_with_map(
    chars: List[str], idx_map: List[Optional[int]]
) -> Tuple[str, List[Optional[int]]]:
    """
    Collapse runs of whitespace to a single space and trim leading and trailing
    whitespace, while preserving a per-char map back to the original.
    """
    assert len(chars) == len(idx_map)
    result_chars: List[str] = []
    result_map: List[Optional[int]] = []

    def is_space(ch: str) -> bool:
        return ch.isspace()

    # First pass: collapse runs to single spaces
    prev_space = False
    for ch, m in zip(chars, idx_map):
        if is_space(ch):
            if not prev_space:
                result_chars.append(" ")
                result_map.append(None)  # inserted or collapsed space has no single origin
                prev_space = True
            # else skip extra spaces
        else:
            result_chars.append(ch)
            result_map.append(m)
            prev_space = False

    # Strip leading space
    if result_chars and result_chars[0] == " ":
        result_chars.pop(0)
        result_map.pop(0)

    # Strip trailing space
    if result_chars and result_chars[-1] == " ":
        result_chars.pop()
        result_map.pop()

    return "".join(result_chars), result_map


###############################################################################
# Syllabifier: rule-based splitting on vowel groups and consonant clusters
###############################################################################

class Syllabifier:
    """Heuristic syllable splitter driven by vowel groups and onset clusters.

    A word is scanned left to right as repeated
    ``consonants + vowels + consonants`` chunks. The returned pieces always
    concatenate back to the input word: characters are never dropped or
    reordered.
    """

    def __init__(self):
        # Case-insensitive vowel matcher (applied one character at a time).
        self._vowel_re = re.compile(r"[aeiou]+", re.I)
        # Consonant clusters allowed to open a syllable. They are compared
        # with a case-sensitive ``startswith``, so only lowercase runs match.
        self._consonant_clusters = {
            "bl", "br", "cl", "cr", "dr", "fl", "fr", "gl", "gr", "pl", "pr",
            "sc", "sk", "sl", "sm", "sn", "sp", "st", "sw", "tr", "tw", "th",
            "ch", "sh", "ph", "wh", "sch", "str", "spr", "spl", "scr", "thr"
        }

    def syllabify_word(self, word: str) -> List[str]:
        """Split *word* into syllable-like pieces.

        Args:
            word: A single token without whitespace.

        Returns:
            A list of substrings whose concatenation equals *word*. Words of
            at most two characters are returned as a single piece.
        """
        if len(word) <= 2:
            return [word]

        is_vowel = self._vowel_re.match
        # Longest clusters first so "str" wins over "st". Sorted once per
        # call rather than once per syllable.
        clusters = sorted(self._consonant_clusters, key=len, reverse=True)

        syllables: List[str] = []
        i = 0
        n = len(word)

        while i < n:
            syllable = ""

            # Consonant run in front of the next vowel group.
            consonant_start = i
            while i < n and not is_vowel(word[i]):
                i += 1

            if i > consonant_start:
                cluster = word[consonant_start:i]
                known = next((c for c in clusters if cluster.startswith(c)), None)
                if known is not None:
                    # The onset keeps as many consonants as the recognised
                    # cluster has; the surplus goes to the previous syllable.
                    surplus = len(cluster) - len(known)
                else:
                    # Unknown cluster: split roughly in half.
                    surplus = len(cluster) // 2

                if syllables:
                    # Split by position (prefix -> previous syllable, suffix
                    # -> onset) so the character order is preserved.
                    syllables[-1] += cluster[:surplus]
                    syllable += cluster[surplus:]
                else:
                    # No previous syllable to receive the surplus: keep the
                    # whole run so word-initial consonants are not lost.
                    syllable += cluster

            # Vowel group.
            vowel_start = i
            while i < n and is_vowel(word[i]):
                i += 1
            syllable += word[vowel_start:i]

            # Trailing consonants.
            trailing_start = i
            while i < n and not is_vowel(word[i]):
                i += 1
            if i > trailing_start:
                if i < n:
                    # A vowel follows: one consonant closes this syllable and
                    # the rest are re-scanned as the next syllable's onset.
                    syllable += word[trailing_start]
                    i = trailing_start + 1
                else:
                    # End of word: keep all of them.
                    syllable += word[trailing_start:i]

            if syllable:
                syllables.append(syllable)

        # Merge a one-character syllable into the syllable that follows it.
        merged: List[str] = []
        k = 0
        while k < len(syllables):
            cur = syllables[k]
            if len(cur) == 1 and k < len(syllables) - 1:
                merged.append(cur + syllables[k + 1])
                k += 2
            else:
                merged.append(cur)
                k += 1

        return merged if merged else [word]


###############################################################################
# End to end helpers: preprocessing + syllabification + alignment
###############################################################################

def preprocess_and_segment_with_alignment(
    text: str,
    preprocessor: Preprocessor,
    syllabifier: Optional[Syllabifier] = None,
) -> Tuple[str, List[Optional[int]]]:
    """
    Run the full pipeline on ``text``: preprocess it, cut every
    whitespace-delimited token into syllables, and join all pieces with
    single spaces.

    Returns the segmented string together with a per-character list giving,
    for each output character, the alignment entry reported by the
    preprocessor (``None`` for every space added here). A default
    ``Syllabifier`` is created when none is supplied.
    """
    pre, pre2raw = preprocessor.preprocess_with_alignment(text)
    if syllabifier is None:
        syllabifier = Syllabifier()

    seg_chars: List[str] = []
    seg_origins: List[Optional[int]] = []

    length = len(pre)
    cursor = 0
    seen_token = False
    while True:
        # Move to the first character of the next token, if any.
        while cursor < length and pre[cursor].isspace():
            cursor += 1
        if cursor >= length:
            break

        start = cursor
        while cursor < length and not pre[cursor].isspace():
            cursor += 1
        token = pre[start:cursor]
        origins = pre2raw[start:cursor]

        pieces = syllabifier.syllabify_word(token)

        # Only the piece lengths are used below, so they must add up to the
        # token length. If they do not, re-cut the token: honour each
        # predicted length in turn and let the final piece take what is left.
        if sum(map(len, pieces)) != len(token):
            recut = []
            consumed = 0
            for piece in pieces[:-1]:
                take = min(len(piece), len(token) - consumed)
                if take <= 0:
                    break
                recut.append(token[consumed:consumed + take])
                consumed += take
            if consumed < len(token):
                recut.append(token[consumed:])
            pieces = recut

        # One separator between tokens (never before the first one).
        if seen_token:
            seg_chars.append(" ")
            seg_origins.append(None)
        seen_token = True

        # Copy the token's characters piece by piece, with one separator
        # between consecutive pieces.
        offset = 0
        for piece_no, piece in enumerate(pieces):
            if piece_no > 0:
                seg_chars.append(" ")
                seg_origins.append(None)
            piece_end = offset + len(piece)
            while offset < piece_end:
                seg_chars.append(token[offset])
                seg_origins.append(origins[offset])
                offset += 1

    # Defensive final squeeze/strip of whitespace.
    return _collapse_whitespace_with_map(seg_chars, seg_origins)


def remap_offsets_to_raw(
    offsets: List[Tuple[int, int]],
    pre2raw: List[Optional[int]],
) -> List[Tuple[int, int]]:
    """
    Convert ``(start, end)`` spans over a preprocessed or segmented string
    into spans over the raw string, using the alignment list ``pre2raw``.

    Each span is clamped to the length of ``pre2raw``. The raw start is the
    first non-``None`` entry inside the span and the raw end is one past the
    last non-``None`` entry. A span with no non-``None`` entry (for example
    one covering only inserted spaces) becomes ``(0, 0)``.
    """
    limit = len(pre2raw)
    spans: List[Tuple[int, int]] = []
    for start, stop in offsets:
        lo = min(max(start, 0), limit)
        hi = min(max(stop, 0), limit)

        # Raw indices of every aligned character inside the clamped span.
        aligned = [origin for origin in pre2raw[lo:hi] if origin is not None]
        if aligned:
            spans.append((aligned[0], aligned[-1] + 1))  # exclusive end
        else:
            spans.append((0, 0))
    return spans