| |
|
|
| import re |
| from typing import List, Tuple, Optional |
|
|
| |
| |
| |
|
|
class Preprocessor:
    """Normalize a line of text while tracking where each character came from.

    Depending on the flags, apostrophe-like marks, punctuation and digits are
    surrounded by spaces so that they become stand-alone tokens; the result is
    then whitespace-collapsed and trimmed.

    Args:
        lowercase: Lowercase the text before any other processing.
        separate_apostrophes: Isolate the characters ``’``, ``'`` and `````.
        separate_digits: Isolate every character matched by ``\\d``.
        separate_punctuation: Isolate every character that is not an ASCII
            letter, an ASCII digit, whitespace, or an apostrophe-like mark.
    """

    def __init__(
        self,
        lowercase: bool = False,
        separate_apostrophes: bool = True,
        separate_digits: bool = True,
        separate_punctuation: bool = True,
    ):
        self.lowercase = lowercase
        self.separate_apostrophes = separate_apostrophes
        self.separate_digits = separate_digits
        self.separate_punctuation = separate_punctuation

        # Apostrophe-like marks: typographic apostrophe, ASCII quote, backtick.
        self._apos_re = re.compile(r"[’'`]")
        # NOTE(review): the negated class only whitelists ASCII letters and
        # digits, so non-ASCII letters (e.g. "é") are isolated as punctuation
        # too. Left unchanged; confirm whether that is intended.
        self._punct_re = re.compile(r"[^A-Za-z0-9\s’'`]")
        self._digit_re = re.compile(r"\d")

    @staticmethod
    def _lower_with_alignment(line: str) -> Tuple[str, List[int]]:
        """Lowercase *line* and map each lowered character to its raw index.

        ``str.lower()`` may expand one character into several (for example
        U+0130 becomes "i" followed by U+0307), so positions in the lowered
        string are not always valid positions in the raw string.
        """
        lowered = line.lower()
        if len(lowered) == len(line):
            # Lowercasing never removes characters, so equal length means the
            # mapping is one-to-one.
            return lowered, list(range(len(line)))

        pieces = [ch.lower() for ch in line]
        src = [i for i, piece in enumerate(pieces) for _ in piece]
        if len(src) != len(lowered):
            # Defensive: if whole-string and per-character lowering ever
            # disagree on length, use the text the map was built from.
            lowered = "".join(pieces)
        return lowered, src

    def _should_isolate(self, c: str) -> bool:
        """Return True when *c* must be surrounded by spaces."""
        return bool(
            (self.separate_apostrophes and self._apos_re.match(c))
            or (self.separate_punctuation and self._punct_re.match(c))
            or (self.separate_digits and self._digit_re.match(c))
        )

    def preprocess_with_alignment(self, line: str) -> Tuple[str, List[Optional[int]]]:
        """Preprocess *line* and return it with a map back to raw indices.

        Returns:
            A ``(text, index_map)`` pair. ``index_map[k]`` is the index in the
            original *line* of the character that produced ``text[k]``, or
            ``None`` for spaces (inserted separators and collapsed whitespace).
            The two sequences have the same length.
        """
        if self.lowercase:
            text, sources = self._lower_with_alignment(line)
        else:
            text, sources = line, range(len(line))

        out_chars: List[str] = []
        out2raw: List[Optional[int]] = []
        for c, raw_idx in zip(text, sources):
            if self._should_isolate(c):
                # Pad with separator spaces; surplus spaces are merged below.
                out_chars.extend((" ", c, " "))
                out2raw.extend((None, raw_idx, None))
            else:
                out_chars.append(c)
                out2raw.append(raw_idx)

        return _collapse_whitespace_with_map(out_chars, out2raw)
|
|
|
|
| def _collapse_whitespace_with_map( |
| chars: List[str], idx_map: List[Optional[int]] |
| ) -> Tuple[str, List[Optional[int]]]: |
| """ |
| Collapse runs of whitespace to a single space and trim leading and trailing |
| whitespace, while preserving a per-char map back to the original. |
| """ |
| assert len(chars) == len(idx_map) |
| result_chars: List[str] = [] |
| result_map: List[Optional[int]] = [] |
|
|
| def is_space(ch: str) -> bool: |
| return ch.isspace() |
|
|
| |
| prev_space = False |
| for ch, m in zip(chars, idx_map): |
| if is_space(ch): |
| if not prev_space: |
| result_chars.append(" ") |
| result_map.append(None) |
| prev_space = True |
| |
| else: |
| result_chars.append(ch) |
| result_map.append(m) |
| prev_space = False |
|
|
| |
| if result_chars and result_chars[0] == " ": |
| result_chars.pop(0) |
| result_map.pop(0) |
|
|
| |
| if result_chars and result_chars[-1] == " ": |
| result_chars.pop() |
| result_map.pop() |
|
|
| return "".join(result_chars), result_map |
|
|
|
|
| |
| |
| |
|
|
class Syllabifier:
    """Heuristic syllable splitter driven by the ASCII vowels ``aeiou``.

    The pieces returned by :meth:`syllabify_word` always concatenate back to
    the input word, with every character kept in its original order.
    """

    def __init__(self):
        # Vowels are matched case-insensitively; the cluster table below is
        # lowercase and is compared case-sensitively.
        self._vowel_re = re.compile(r"[aeiou]+", re.I)
        self._consonant_clusters = {
            "bl", "br", "cl", "cr", "dr", "fl", "fr", "gl", "gr", "pl", "pr",
            "sc", "sk", "sl", "sm", "sn", "sp", "st", "sw", "tr", "tw", "th",
            "ch", "sh", "ph", "wh", "sch", "str", "spr", "spl", "scr", "thr"
        }

    def syllabify_word(self, word: str) -> List[str]:
        """Split *word* into syllable-like pieces.

        Each piece is built as: consonant onset, vowel run, then a coda. When
        more letters follow, the coda takes a single consonant and the rest of
        that consonant run becomes the next onset. At word end the coda takes
        all remaining consonants.

        Args:
            word: A single token without whitespace.

        Returns:
            The pieces in order; ``"".join(result) == word`` always holds.
            Words of at most two characters are returned whole.
        """
        if len(word) <= 2:
            return [word]

        is_vowel = self._vowel_re.match
        n = len(word)
        syllables: List[str] = []
        i = 0

        while i < n:
            # Consonant run in front of the vowel nucleus.
            onset_start = i
            while i < n and not is_vowel(word[i]):
                i += 1
            cluster = word[onset_start:i]

            if cluster and syllables:
                # Length of the longest known cluster the run starts with
                # (0 if none). It decides how many consonants stay here.
                known_len = max(
                    (
                        len(known)
                        for known in self._consonant_clusters
                        if cluster.startswith(known)
                    ),
                    default=0,
                )
                if known_len:
                    split = len(cluster) - known_len
                else:
                    split = len(cluster) // 2
                # The previous syllable takes the first `split` consonants.
                # Slicing from the front keeps the letters in word order.
                syllables[-1] += cluster[:split]
                syllable = cluster[split:]
            else:
                # No previous syllable to hand consonants to: keep the whole
                # run so that no character is lost at the start of the word.
                syllable = cluster

            # Vowel nucleus.
            nucleus_start = i
            while i < n and is_vowel(word[i]):
                i += 1
            syllable += word[nucleus_start:i]

            # Consonants after the nucleus.
            coda_start = i
            while i < n and not is_vowel(word[i]):
                i += 1
            if i > coda_start:
                if i < n:
                    # More vowels follow: keep one consonant and rewind so the
                    # rest is treated as the next onset.
                    syllable += word[coda_start]
                    i = coda_start + 1
                else:
                    syllable += word[coda_start:i]

            if syllable:
                syllables.append(syllable)

        # Glue a single-character piece onto the piece that follows it.
        merged: List[str] = []
        k = 0
        while k < len(syllables):
            cur = syllables[k]
            if len(cur) == 1 and k < len(syllables) - 1:
                merged.append(cur + syllables[k + 1])
                k += 2
            else:
                merged.append(cur)
                k += 1

        return merged if merged else [word]
|
|
|
|
| |
| |
| |
|
|
def preprocess_and_segment_with_alignment(
    text: str,
    preprocessor: Preprocessor,
    syllabifier: Optional[Syllabifier] = None,
) -> Tuple[str, List[Optional[int]]]:
    """
    Preprocess *text*, break every whitespace-delimited token into syllables
    and join all syllables with single spaces.

    Returns the segmented string together with a same-length list that gives,
    for each output character, its index in the raw text (None for spaces).
    A default Syllabifier is created when none is supplied.
    """
    pre, pre2raw = preprocessor.preprocess_with_alignment(text)
    if syllabifier is None:
        syllabifier = Syllabifier()

    seg_chars: List[str] = []
    seg_map: List[Optional[int]] = []

    end = len(pre)
    cursor = 0
    first_token = True
    while cursor < end:
        if pre[cursor].isspace():
            cursor += 1
            continue

        # Find the end of the current token.
        stop = cursor
        while stop < end and not pre[stop].isspace():
            stop += 1
        token = pre[cursor:stop]
        token_src = pre2raw[cursor:stop]
        cursor = stop

        # One separator between consecutive tokens.
        if not first_token:
            seg_chars.append(" ")
            seg_map.append(None)
        first_token = False

        pieces = syllabifier.syllabify_word(token)

        # Only the piece lengths are used below. If they do not add up to the
        # token length, re-cut the token: all but the last reported length are
        # honoured (clamped to the token) and the remainder forms a final piece.
        if sum(len(p) for p in pieces) != len(token):
            recut = []
            taken = 0
            for p in pieces[:-1]:
                width = min(len(p), len(token) - taken)
                if width <= 0:
                    break
                recut.append(token[taken:taken + width])
                taken += width
            if taken < len(token):
                recut.append(token[taken:])
            pieces = recut

        # Copy characters and their raw indices piece by piece, with a space
        # between neighbouring pieces.
        offset = 0
        last = len(pieces) - 1
        for idx, piece in enumerate(pieces):
            width = len(piece)
            seg_chars.extend(token[offset:offset + width])
            seg_map.extend(token_src[offset:offset + width])
            offset += width
            if idx < last:
                seg_chars.append(" ")
                seg_map.append(None)

    # Final normalisation merges any doubled separators.
    return _collapse_whitespace_with_map(seg_chars, seg_map)
|
|
|
|
def remap_offsets_to_raw(
    offsets: List[Tuple[int, int]],
    pre2raw: List[Optional[int]],
) -> List[Tuple[int, int]]:
    """
    Convert (start, end) spans over a processed string into spans over the
    raw string, using the per-character index map *pre2raw*.

    Each span is first clamped to the bounds of the map. The raw start is the
    raw index of the first mapped character in the span and the raw end is one
    past the raw index of the last mapped character. A span containing no
    mapped character (empty, inverted, or only inserted spaces) becomes (0, 0).
    """
    limit = len(pre2raw)
    spans: List[Tuple[int, int]] = []

    for start, stop in offsets:
        lo = min(max(start, 0), limit)
        hi = min(max(stop, 0), limit)

        # An inverted span yields an empty slice, hence no anchors.
        anchors = [raw for raw in pre2raw[lo:hi] if raw is not None]
        if anchors:
            spans.append((anchors[0], anchors[-1] + 1))
        else:
            spans.append((0, 0))

    return spans
|
|