Spaces:

seanpedrickcase
/

document_redaction

Running

App Files Files Community

document_redaction / tools /word_segmenter.py

seanpedrickcase

Sync: Merge pull request #199 from seanpedrick-case/startup_optimise

a2e06b3 8 days ago

Raw

History Blame Contribute Delete

83.5 kB

	import os
	from bisect import bisect_left
	from typing import Dict, List, Optional, Tuple

	import cv2
	import numpy as np

	from tools.config import OUTPUT_FOLDER, SAVE_WORD_SEGMENTER_OUTPUT_IMAGES

	# Adaptive thresholding parameters (resolution-independent via line_height / median CC height)
	BLOCK_SIZE_FACTOR = 0.5 # Block size = line_height * this when median CC height is unavailable
	BLOCK_SIZE_MEDIAN_CC_FACTOR = 1.2 # Block size = median_cc_height * this when available
	C_VALUE = 2 # Constant subtracted from mean in adaptive thresholding
	REFERENCE_LINE_HEIGHT = 50 # Line height (px) at which NOISE_THRESHOLD is defined

	# Word segmentation search parameters
	INITIAL_KERNEL_WIDTH_FACTOR = 0.0 # Starting kernel width factor for Stage 2 search
	INITIAL_VALLEY_THRESHOLD_FACTOR = (
	0.0 # Starting valley threshold factor for Stage 1 search
	)
	MAIN_VALLEY_THRESHOLD_FACTOR = (
	0.15 # Primary valley threshold factor for word separation
	)
	MIN_SPACE_FACTOR = 0.2 # Minimum space width relative to character width
	MATCH_TOLERANCE = 0 # Tolerance for word count matching

	# Noise removal parameters (resolution-independent: derived from line_height)
	MIN_AREA_HEIGHT_FRACTION = 0.05 # MIN_AREA = (line_height * this)^2
	MIN_AREA_FLOOR = 2 # Minimum pixel area floor for very low-res lines
	DEFAULT_TRIM_PERCENTAGE = (
	0.2 # Percentage to trim from top/bottom for vertical cropping
	)

	# Skew detection parameters (angles in degrees)
	MIN_SKEW_THRESHOLD = 0.5 # Ignore angles smaller than this (likely noise)
	MAX_SKEW_THRESHOLD = 15.0 # Angles larger than this are extreme and likely errors
	# Baseline (Hough) skew: minimum bottom points to use baseline method; Hough threshold
	SKEW_BASELINE_MIN_POINTS = 20
	SKEW_HOUGH_THRESHOLD = 25 # Min votes for a line to be considered

	# Maximum allowed difference in word count between the target and the detected
	# words during the word segmentation process. If the difference is above this,
	# the fallback segmenter is used.
	ALLOWED_WORD_MISMATCH_COUNT = 0

	# Noise detection: if estimated noise (Laplacian variance) is above this (at REFERENCE_LINE_HEIGHT),
	# skip primary segmentation and use fallback. Scaled by line_height for resolution independence.
	NOISE_THRESHOLD = 800

	# Polarity: binarization assumes dark text on light background. If estimated background
	# mean is below this, the image is treated as light-on-dark and inverted before binarization.
	POLARITY_MEAN_THRESHOLD = 128
	POLARITY_CORNER_FRACTION = (
	0.15 # Fraction of width/height used for corner/edge sampling
	)

	# Step sizes for the adaptive search sweeps. Stage 1 sweeps the valley threshold
	# factor with the coarse step and then refines around the best coarse factor with
	# the fine step; Stage 2 sweeps the kernel width factor with its coarse step.
	# NOTE(review): SEARCH_STAGE2_FINE_STEP is presumably used for an analogous Stage 2
	# refinement - confirm against the rest of the Stage 2 search.
	SEARCH_STAGE1_COARSE_STEP = 0.06
	SEARCH_STAGE1_FINE_STEP = 0.02
	SEARCH_STAGE2_COARSE_STEP = 0.05
	SEARCH_STAGE2_FINE_STEP = 0.02


	def _find_widest_zero_gaps(
	vertical_projection: np.ndarray,
	n: int,
	gap_threshold: float = 0.0,
	) -> List[Tuple[int, int]]:
	"""
	Find the N widest contiguous zero-gaps (or near-zero) in the vertical projection.
	Used for justified text: anchor word cut points to the centers of these gaps.
	Returns list of (start, end) in left-to-right order, or empty if not enough gaps.
	"""
	if vertical_projection is None or n <= 0:
	return []
	w = len(vertical_projection)
	gaps = []
	in_gap = False
	start = 0
	for x in range(w):
	val = vertical_projection[x] if x < w else 0
	if val <= gap_threshold and not in_gap:
	start = x
	in_gap = True
	elif val > gap_threshold and in_gap:
	gaps.append((start, x))
	in_gap = False
	if in_gap:
	gaps.append((start, w))
	if not gaps:
	return []
	# Sort by width descending, take first n
	gaps_by_width = sorted(gaps, key=lambda g: g[1] - g[0], reverse=True)
	selected = gaps_by_width[:n]
	# Sort by position (left-to-right) for cutting
	selected.sort(key=lambda g: g[0])
	return selected


	# Trailing punctuation characters that often sit after a word with a visible gap
	# before them; used as an anchor so the mark is included in the word's box.
	TRAILING_PUNCTUATION_CHARS = frozenset(".,:;\"'!?)]}")


	def _word_ends_with_punctuation(word: str) -> bool:
	"""True if word ends with a punctuation character that may have a gap before it."""
	return bool(word and word[-1] in TRAILING_PUNCTUATION_CHARS)


	def get_weighted_length(text: str) -> float:
	    """
	    Estimate the rendered width of ``text`` in "average character" units.

	    Proportional-font heuristic: each character contributes a weight rather
	    than a flat 1.0. Characters listed in the table below use their table
	    weight; any other uppercase character counts 1.1 and everything else 1.0.
	    Used by HybridWordSegmenter.convert_line_to_word_level for better blind
	    estimation.

	    Args:
	        text: The string to measure.

	    Returns:
	        The sum of the per-character weights (0.0 for an empty string).
	    """
	    special_widths = {
	        **dict.fromkeys("il1j", 0.4),  # narrow glyphs
	        "t": 0.6,
	        **dict.fromkeys(".,!'", 0.3),  # narrow punctuation
	        **dict.fromkeys("WMm", 1.3),  # wide glyphs
	        **dict.fromkeys("w@%", 1.2),
	        " ": 0.5,  # space between words
	    }
	    total = 0.0
	    # Accumulate sequentially so the floating-point result matches a plain
	    # left-to-right sum.
	    for ch in text:
	        if ch in special_widths:
	            total += special_widths[ch]
	        elif ch.isupper():
	            total += 1.1
	        else:
	            total += 1.0
	    return total


	def _sanitize_filename(filename: str, max_length: int = 100) -> str:
	"""
	Sanitizes a string to be used as a valid filename.
	Removes or replaces invalid characters for Windows/Linux file systems.

	Args:
	filename: The string to sanitize
	max_length: Maximum length of the sanitized filename

	Returns:
	A sanitized string safe for use in file names
	"""
	if not filename:
	return "unnamed"

	# Replace spaces with underscores
	sanitized = filename.replace(" ", "_")

	# Remove or replace invalid characters for Windows/Linux
	# Invalid: < > : " / \ \| ? *
	invalid_chars = '<>:"/\\\|?*'
	for char in invalid_chars:
	sanitized = sanitized.replace(char, "_")

	# Remove control characters
	sanitized = "".join(
	char for char in sanitized if ord(char) >= 32 or char in "\n\r\t"
	)

	# Remove leading/trailing dots and spaces (Windows doesn't allow these)
	sanitized = sanitized.strip(". ")

	# Replace multiple consecutive underscores with a single one
	while "__" in sanitized:
	sanitized = sanitized.replace("__", "_")

	# Truncate if too long
	if len(sanitized) > max_length:
	sanitized = sanitized[:max_length]

	# Ensure it's not empty after sanitization
	if not sanitized:
	sanitized = "unnamed"

	return sanitized


	class AdaptiveSegmenter:
	"""
	Line to word segmentation pipeline. It features:
	1. Adaptive Thresholding.
	2. Targeted Noise Removal using Connected Component Analysis.
	3. The robust two-stage adaptive search (Valley -> Kernel).
	4. CCA for final pixel-perfect refinement.
	"""

	def __init__(self, output_folder: str = OUTPUT_FOLDER):
	    """
	    Store the output folder and create the fallback segmenter.

	    Args:
	        output_folder: Base directory under which ``segment`` writes its
	            intermediate images when SAVE_WORD_SEGMENTER_OUTPUT_IMAGES is set.
	    """
	    self.output_folder = output_folder
	    # Used by ``segment`` for single-word lines and for lines judged too noisy.
	    self.fallback_segmenter = HybridWordSegmenter()

	def _correct_orientation(
	    self, gray_image: np.ndarray
	) -> Tuple[np.ndarray, np.ndarray]:
	    """
	    Rotate a line crop by 90 degrees when its ink is taller than it is wide.

	    The crop is binarised (adaptive Gaussian threshold, or Otsu when the crop
	    is too short for a usable block size), opened with a 2x2 kernel, and the
	    bounding box of the remaining foreground pixels is measured.

	    Args:
	        gray_image: Single-channel line image.

	    Returns:
	        ``(image, matrix)``: the input image, or its 90-degree rotation when
	        the foreground bounding box is taller than wide, together with the
	        2x3 affine matrix that was applied. When fewer than 50 foreground
	        pixels are found, or the box is not taller than wide, the input is
	        returned with a zero-angle rotation matrix.
	    """
	    rows, cols = gray_image.shape
	    pivot = (cols // 2, rows // 2)

	    # Adaptive threshold needs an odd block size; shrink it for short crops.
	    window = 21
	    if rows < window:
	        window = rows if rows % 2 != 0 else rows - 1

	    if window > 3:
	        ink = cv2.adaptiveThreshold(
	            gray_image,
	            255,
	            cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
	            cv2.THRESH_BINARY_INV,
	            window,
	            4,
	        )
	    else:
	        ink = cv2.threshold(
	            gray_image, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
	        )[1]

	    ink = cv2.morphologyEx(ink, cv2.MORPH_OPEN, np.ones((2, 2), np.uint8))

	    # Rows are (y, x) positions of the foreground pixels.
	    points = np.column_stack(np.where(ink > 0))
	    if len(points) < 50:
	        return gray_image, cv2.getRotationMatrix2D(pivot, 0, 1.0)

	    top, left = points.min(axis=0)
	    bottom, right = points.max(axis=0)
	    if not (bottom - top) > (right - left):
	        # Ink is at least as wide as it is tall: leave the crop alone.
	        return gray_image, cv2.getRotationMatrix2D(pivot, 0, 1.0)

	    # Rotate by 90 degrees and shift so the result is centred on the
	    # swapped (rows x cols) canvas.
	    rotation = cv2.getRotationMatrix2D(pivot, 90.0, 1.0)
	    out_w, out_h = rows, cols
	    rotation[0, 2] += (out_w - cols) / 2
	    rotation[1, 2] += (out_h - rows) / 2

	    rotated = cv2.warpAffine(
	        gray_image,
	        rotation,
	        (out_w, out_h),
	        flags=cv2.INTER_CUBIC,
	        borderMode=cv2.BORDER_REPLICATE,
	    )
	    return rotated, rotation

	def _skew_angle_from_baseline(self, binary: np.ndarray) -> Optional[float]:
	    """
	    Estimate the skew correction angle from the text baseline.

	    For every column that contains foreground, the bottom-most foreground
	    pixel is taken as a baseline point. The points are drawn on a blank
	    image, slightly dilated, and passed to the Hough line transform. Each
	    detected line is scored by how many baseline points lie within a small
	    distance of it, and the correction angle of the best-scoring line is
	    returned. More stable than minAreaRect for short words or lines with
	    ascenders/descenders (e.g. "all").

	    Args:
	        binary: 2-D image in which foreground pixels are > 0.

	    Returns:
	        The correction angle in degrees, or None when there are fewer than
	        SKEW_BASELINE_MIN_POINTS baseline points, Hough finds no line, or no
	        line has any baseline point near it.
	    """
	    h, w = binary.shape
	    foreground = binary > 0

	    # Columns that contain ink, and for each the bottom-most ink row.
	    ink_columns = np.flatnonzero(foreground.any(axis=0))
	    if ink_columns.size < SKEW_BASELINE_MIN_POINTS:
	        return None
	    # argmax on the vertically flipped mask finds the first ink pixel
	    # counted from the bottom edge.
	    bottom_rows = (h - 1) - np.argmax(foreground[::-1, ink_columns], axis=0)

	    # Draw baseline points on a blank image for Hough
	    baseline_img = np.zeros((h, w), dtype=np.uint8)
	    baseline_img[bottom_rows, ink_columns] = 255
	    # Slight dilation so Hough sees a denser line
	    baseline_img = cv2.dilate(baseline_img, np.ones((2, 2), np.uint8))
	    lines = cv2.HoughLines(
	        baseline_img,
	        rho=1,
	        theta=np.pi / 180,
	        threshold=SKEW_HOUGH_THRESHOLD,
	    )
	    if lines is None or len(lines) == 0:
	        return None

	    # Score each line by number of bottom points near it; take best
	    xs = ink_columns.astype(np.float64)
	    ys = bottom_rows.astype(np.float64)
	    dist_thresh = max(2, h // 30)
	    best_angle = None
	    best_score = 0
	    for line in lines:
	        rho, theta = (float(value) for value in line[0])
	        # Line equation: rho = x*cos(theta) + y*sin(theta); theta is the angle
	        # of the line's normal. Baseline angle from horizontal = theta - 90 deg,
	        # so levelling it means rotating by 90 deg - theta.
	        correction_deg = 90.0 - float(np.degrees(theta))
	        # Normalize to [-90, 90] for comparison
	        if correction_deg > 90:
	            correction_deg -= 180
	        elif correction_deg < -90:
	            correction_deg += 180
	        # Perpendicular distance of every baseline point to this line.
	        distances = np.abs(xs * np.cos(theta) + ys * np.sin(theta) - rho)
	        score = int(np.count_nonzero(distances <= dist_thresh))
	        if score > best_score:
	            best_score = score
	            best_angle = correction_deg

	    if best_angle is None:
	        return None
	    return float(best_angle)

	def _skew_angle_from_min_area_rect(
	self, coords: np.ndarray, w: int, h: int
	) -> float:
	"""Fallback: skew angle from minAreaRect of all foreground pixels."""
	if len(coords) < 50:
	return 0.0
	rect = cv2.minAreaRect(coords[:, ::-1])
	rect_width, rect_height = rect[1]
	angle = rect[2]
	if rect_width < rect_height:
	angle += 90
	if angle > 45:
	angle -= 90
	elif angle < -45:
	angle += 90
	return float(angle)

	def _deskew_image(self, gray_image: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
	    """
	    Level a slightly rotated line crop.

	    The skew angle comes from the baseline method
	    (``_skew_angle_from_baseline``) when it yields a value, otherwise from
	    ``_skew_angle_from_min_area_rect``. Angles with magnitude below
	    MIN_SKEW_THRESHOLD or above MAX_SKEW_THRESHOLD are treated as zero.

	    Args:
	        gray_image: Single-channel line image.

	    Returns:
	        ``(image, matrix)``: the rotated image (same size as the input) and
	        the 2x3 rotation matrix used. When fewer than 50 foreground pixels
	        are found the input image is returned with a zero-angle matrix.
	    """
	    rows, cols = gray_image.shape
	    pivot = (cols // 2, rows // 2)

	    # Adaptive threshold needs an odd block size; shrink it for short crops.
	    window = 21
	    if rows < window:
	        window = rows if rows % 2 != 0 else rows - 1

	    if window > 3:
	        ink = cv2.adaptiveThreshold(
	            gray_image,
	            255,
	            cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
	            cv2.THRESH_BINARY_INV,
	            window,
	            4,
	        )
	    else:
	        ink = cv2.threshold(
	            gray_image, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
	        )[1]

	    ink = cv2.morphologyEx(ink, cv2.MORPH_OPEN, np.ones((2, 2), np.uint8))

	    points = np.column_stack(np.where(ink > 0))
	    if len(points) < 50:
	        return gray_image, cv2.getRotationMatrix2D(pivot, 0, 1.0)

	    # Prefer baseline-based skew (stable for short words / ascenders-descenders)
	    angle = self._skew_angle_from_baseline(ink)
	    if angle is None:
	        angle = self._skew_angle_from_min_area_rect(points, cols, rows)

	    # Tiny angles are treated as noise, extreme ones as estimation errors.
	    magnitude = abs(angle)
	    if magnitude < MIN_SKEW_THRESHOLD or magnitude > MAX_SKEW_THRESHOLD:
	        angle = 0.0

	    rotation = cv2.getRotationMatrix2D(pivot, angle, 1.0)
	    levelled = cv2.warpAffine(
	        gray_image,
	        rotation,
	        (cols, rows),
	        flags=cv2.INTER_CUBIC,
	        borderMode=cv2.BORDER_REPLICATE,
	    )
	    return levelled, rotation

	def _estimate_quick_skew_degrees(self, gray_image: np.ndarray) -> float:
	"""Cheap skew estimate used to skip expensive orientation/deskew when safe."""
	if gray_image is None or gray_image.size == 0:
	return 0.0
	h, w = gray_image.shape[:2]
	if h < 8 or w < 8:
	return 0.0
	_, quick_bin = cv2.threshold(
	gray_image, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
	)
	coords = np.column_stack(np.where(quick_bin > 0))
	if len(coords) < 30:
	return 0.0
	rect = cv2.minAreaRect(coords[:, ::-1])
	rect_w, rect_h = rect[1]
	angle = float(rect[2])
	if rect_w < rect_h:
	angle += 90.0
	if angle > 45.0:
	angle -= 90.0
	elif angle < -45.0:
	angle += 90.0
	return angle

	def _can_skip_expensive_preprocess(self, gray_image: np.ndarray) -> bool:
	"""Return True when line is already horizontal enough for direct segmentation."""
	if gray_image is None or gray_image.size == 0:
	return True
	h, w = gray_image.shape[:2]
	if h <= 0 or w <= 0:
	return True
	if w <= int(h * 1.2):
	return False
	skew = self._estimate_quick_skew_degrees(gray_image)
	return abs(skew) < 1.2

	def _get_boxes_from_profile(
	self,
	binary_image: np.ndarray,
	stable_avg_char_width: float,
	min_space_factor: float,
	valley_threshold_factor: float,
	) -> List:
	"""
	Extracts word bounding boxes from vertical projection profile.
	"""
	img_h, img_w = binary_image.shape
	vertical_projection = np.sum(binary_image, axis=0)
	peaks = vertical_projection[vertical_projection > 0]
	if len(peaks) == 0:
	return []
	avg_peak_height = np.mean(peaks)
	valley_threshold = int(avg_peak_height * valley_threshold_factor)
	min_space_width = int(stable_avg_char_width * min_space_factor)

	patched_projection = vertical_projection.copy()
	in_gap = False
	gap_start = 0

	for x, col_sum in enumerate(patched_projection):
	if col_sum <= valley_threshold and not in_gap:
	in_gap = True
	gap_start = x
	elif col_sum > valley_threshold and in_gap:
	in_gap = False
	if (x - gap_start) < min_space_width:
	patched_projection[gap_start:x] = int(avg_peak_height)

	unlabeled_boxes = []
	in_word = False
	start_x = 0
	for x, col_sum in enumerate(patched_projection):
	if col_sum > valley_threshold and not in_word:
	start_x = x
	in_word = True
	elif col_sum <= valley_threshold and in_word:
	# [NOTE] Returns full height stripe
	unlabeled_boxes.append((start_x, 0, x - start_x, img_h))
	in_word = False
	if in_word:
	unlabeled_boxes.append((start_x, 0, img_w - start_x, img_h))
	return unlabeled_boxes

	def _enforce_logical_constraints(
	self, output: Dict[str, List], image_width: int, image_height: int
	) -> Dict[str, List]:
	"""
	Enforces geometric sanity checks with 2D awareness.
	"""
	if not output or not output["text"]:
	return output

	num_items = len(output["text"])
	boxes = []
	for i in range(num_items):
	boxes.append(
	{
	"text": output["text"][i],
	"left": int(output["left"][i]),
	"top": int(output["top"][i]),
	"width": int(output["width"][i]),
	"height": int(output["height"][i]),
	"conf": output["conf"][i],
	}
	)

	valid_boxes = []
	for box in boxes:
	x0 = max(0, box["left"])
	y0 = max(0, box["top"])
	x1 = min(image_width, box["left"] + box["width"])
	y1 = min(image_height, box["top"] + box["height"])

	w = x1 - x0
	h = y1 - y0

	if w > 0 and h > 0:
	box["left"] = x0
	box["top"] = y0
	box["width"] = w
	box["height"] = h
	valid_boxes.append(box)
	boxes = valid_boxes

	is_vertical = image_height > (image_width * 1.2)
	if is_vertical:
	boxes.sort(key=lambda b: (b["top"], b["left"]))
	else:
	boxes.sort(key=lambda b: (b["left"], -b["width"]))

	final_pass_boxes = []
	if boxes:
	keep_indices = [True] * len(boxes)
	# Fast path: adjacent comparisons after sorting removes most duplicates
	# without full O(n^2) cross checks.
	for i in range(len(boxes) - 1):
	b1 = boxes[i]
	b2 = boxes[i + 1]
	x_nested = (b1["left"] >= b2["left"] - 2) and (
	b1["left"] + b1["width"] <= b2["left"] + b2["width"] + 2
	)
	y_nested = (b1["top"] >= b2["top"] - 2) and (
	b1["top"] + b1["height"] <= b2["top"] + b2["height"] + 2
	)
	if x_nested and y_nested and b1["text"] == b2["text"]:
	if b1["width"] * b1["height"] <= b2["width"] * b2["height"]:
	keep_indices[i] = False
	# Also evaluate opposite containment (b2 inside b1).
	x_nested_rev = (b2["left"] >= b1["left"] - 2) and (
	b2["left"] + b2["width"] <= b1["left"] + b1["width"] + 2
	)
	y_nested_rev = (b2["top"] >= b1["top"] - 2) and (
	b2["top"] + b2["height"] <= b1["top"] + b1["height"] + 2
	)
	if x_nested_rev and y_nested_rev and b1["text"] == b2["text"]:
	if b2["width"] * b2["height"] <= b1["width"] * b1["height"]:
	keep_indices[i + 1] = False

	for i, keep in enumerate(keep_indices):
	if keep:
	final_pass_boxes.append(boxes[i])

	boxes = final_pass_boxes

	if is_vertical:
	boxes.sort(key=lambda b: (b["top"], b["left"]))
	else:
	boxes.sort(key=lambda b: (b["left"], -b["width"]))

	for i in range(len(boxes) - 1):
	b1 = boxes[i]
	b2 = boxes[i + 1]
	x_overlap = min(b1["left"] + b1["width"], b2["left"] + b2["width"]) - max(
	b1["left"], b2["left"]
	)
	y_overlap = min(b1["top"] + b1["height"], b2["top"] + b2["height"]) - max(
	b1["top"], b2["top"]
	)

	if x_overlap > 0 and y_overlap > 0:
	if is_vertical:
	if b1["top"] < b2["top"]:
	b1["height"] = max(1, b2["top"] - b1["top"])
	else:
	if b1["left"] < b2["left"]:
	b1_right = b1["left"] + b1["width"]
	b2_right = b2["left"] + b2["width"]
	left_slice_width = max(0, b2["left"] - b1["left"])
	right_slice_width = max(0, b1_right - b2_right)
	if b1_right > b2_right and right_slice_width > left_slice_width:
	b1["left"] = b2_right
	b1["width"] = right_slice_width
	else:
	b1["width"] = max(1, left_slice_width)

	cleaned_output = {
	k: [] for k in ["text", "left", "top", "width", "height", "conf"]
	}
	if is_vertical:
	boxes.sort(key=lambda b: (b["top"], b["left"]))
	else:
	boxes.sort(key=lambda b: (b["left"], -b["width"]))

	for box in boxes:
	for key in cleaned_output.keys():
	cleaned_output[key].append(box[key])

	return cleaned_output

	def _is_geometry_valid(
	self,
	boxes: List[Tuple[int, int, int, int]],
	words: List[str],
	expected_height: float = 0,
	) -> bool:
	"""
	Validates if the detected boxes are physically plausible.
	[FIX] Improved robustness for punctuation and mixed-case text.
	"""
	if len(boxes) != len(words):
	return False

	baseline = expected_height
	# Use median only if provided expected height is unreliable
	if baseline < 5:
	heights = [b[3] for b in boxes]
	if heights:
	baseline = np.median(heights)

	if baseline < 5:
	return True

	for i, box in enumerate(boxes):
	word = words[i]

	# [FIX] Check for punctuation/symbols. They are allowed to be small.
	# If word is just punctuation, skip geometry checks
	is_punctuation = not any(c.isalnum() for c in word)
	if is_punctuation:
	continue

	# Standard checks for alphanumeric words
	num_chars = len(word)
	if num_chars < 1:
	continue

	width = box[2]
	height = box[3]

	# [FIX] Only reject height if it's REALLY small compared to baseline
	# A period might be small, but we skipped that check above.
	# This check ensures a real word like "The" isn't 2 pixels tall.
	if height < (baseline * 0.20):
	return False

	avg_char_width = width / num_chars
	min_expected = baseline * 0.20

	# Only reject if it fails BOTH absolute (4px) and relative checks
	if avg_char_width < min_expected and avg_char_width < 4:
	# Exception: If the word is 1 char long (e.g. "I", "l", "1"), allow it to be skinny.
	if num_chars == 1 and avg_char_width >= 2:
	continue
	return False

	return True

	def _estimate_noise(self, gray: np.ndarray) -> float:
	"""
	Estimate image noisiness using Laplacian variance. Noisy images tend to have
	high high-frequency content, so higher values indicate more noise (or very
	sharp edges). Used to skip the primary segmentation pipeline when above
	NOISE_THRESHOLD and use the fallback segmenter instead.
	"""
	if gray is None or gray.size == 0:
	return 0.0
	lap = cv2.Laplacian(gray, cv2.CV_64F, ksize=3)
	return float(lap.var())

	def _block_size_from_median_cc_height(
	self, gray: np.ndarray, line_height: int, fallback_block_size: int
	) -> int:
	"""
	Determine adaptive threshold block size from median height of connected components
	(resolution-independent). Uses an Otsu pre-pass to get CCs; if median height is
	valid, returns block_size = median_cc_height * BLOCK_SIZE_MEDIAN_CC_FACTOR.
	Otherwise returns fallback_block_size (e.g. from line_height).
	"""
	if gray is None or gray.size == 0 or line_height < 3:
	return fallback_block_size
	_, otsu_binary = cv2.threshold(
	gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
	)
	num_labels, _, stats, _ = cv2.connectedComponentsWithStats(
	otsu_binary, 8, cv2.CV_32S
	)
	if num_labels < 3: # background + need at least 2 components
	return fallback_block_size
	areas = stats[1:, cv2.CC_STAT_AREA]
	heights = stats[1:, cv2.CC_STAT_HEIGHT]
	min_area_cc = max(2, int((line_height * 0.02) ** 2))
	valid = areas >= min_area_cc
	if not np.any(valid):
	return fallback_block_size
	median_h = np.median(heights[valid])
	if np.isnan(median_h) or median_h < 2:
	return fallback_block_size
	block = max(3, int(median_h * BLOCK_SIZE_MEDIAN_CC_FACTOR))
	if block % 2 == 0:
	block += 1
	return block

	def _normalize_polarity_for_binarization(self, gray: np.ndarray) -> np.ndarray:
	"""
	Ensure we work with dark-text-on-light-background for binarization. If the
	image is mostly dark (light text on dark background), invert it so that
	adaptive threshold and projection profile logic behave correctly.

	Uses corner/edge regions to estimate background (typical in documents);
	falls back to global mean for very small or full-page line crops.
	"""
	if gray is None or gray.size == 0:
	return gray
	h, w = gray.shape
	frac = POLARITY_CORNER_FRACTION
	# Sample corners and edges (background is often visible there)
	margin_w = max(1, int(w * frac))
	margin_h = max(1, int(h * frac))
	corner_pixels = []
	if margin_w < w and margin_h < h:
	top_left = gray[:margin_h, :margin_w]
	top_right = gray[:margin_h, -margin_w:]
	bottom_left = gray[-margin_h:, :margin_w]
	bottom_right = gray[-margin_h:, -margin_w:]
	for region in (top_left, top_right, bottom_left, bottom_right):
	corner_pixels.append(region.ravel())
	if corner_pixels:
	corner_pixels = np.concatenate(corner_pixels)
	background_mean = float(np.mean(corner_pixels))
	else:
	background_mean = float(np.mean(gray))
	else:
	background_mean = float(np.mean(gray))
	if background_mean < POLARITY_MEAN_THRESHOLD:
	return cv2.bitwise_not(gray)
	return gray

	def segment(
	self,
	line_data: Dict[str, List],
	line_image: np.ndarray,
	min_space_factor=MIN_SPACE_FACTOR,
	match_tolerance=MATCH_TOLERANCE,
	image_name: str = None,
	) -> Tuple[Dict[str, List], bool]:
	if (
	line_image is None
	or not isinstance(line_image, np.ndarray)
	or line_image.size == 0
	):
	return ({}, False)
	# Allow grayscale (2 dims) or color (3 dims)
	if len(line_image.shape) < 2:
	return ({}, False)
	if not line_data or not line_data.get("text") or len(line_data["text"]) == 0:
	return ({}, False)

	line_text = line_data["text"][0]
	words = line_text.split()

	# Early return if 1 or fewer words
	if len(words) <= 1:
	img_h, img_w = line_image.shape[:2]
	one_word_result = self.fallback_segmenter.convert_line_to_word_level(
	line_data, img_w, img_h
	)
	return (one_word_result, False)

	# Validate that line_image is not empty before processing
	if line_image is None or line_image.size == 0 or len(line_image.shape) < 2:
	# If line_image is empty, fall back to proportional estimation
	return {}, False

	line_number = line_data["line"][0]
	safe_image_name = "image"
	safe_line_number = str(line_number)
	safe_shortened_line_text = "line"
	if SAVE_WORD_SEGMENTER_OUTPUT_IMAGES:
	safe_image_name = _sanitize_filename(image_name or "image", max_length=50)
	safe_line_number = _sanitize_filename(str(line_number), max_length=10)
	safe_shortened_line_text = _sanitize_filename(line_text, max_length=10)

	if SAVE_WORD_SEGMENTER_OUTPUT_IMAGES:
	os.makedirs(self.output_folder, exist_ok=True)
	output_path = f"{self.output_folder}/word_segmentation/{safe_image_name}_{safe_line_number}_{safe_shortened_line_text}_original.png"
	os.makedirs(f"{self.output_folder}/word_segmentation", exist_ok=True)
	# Only write if image is valid
	if line_image.size > 0 and len(line_image.shape) >= 2:
	cv2.imwrite(output_path, line_image)

	if len(line_image.shape) == 3:
	gray = cv2.cvtColor(line_image, cv2.COLOR_BGR2GRAY)
	else:
	gray = line_image.copy()

	# ========================================================================
	# IMAGE PREPROCESSING (Deskew / Rotate)
	# ========================================================================
	if self._can_skip_expensive_preprocess(gray):
	h, w = gray.shape[:2]
	deskewed_gray = gray
	deskewed_line_image = line_image.copy()
	M = np.array([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]], dtype=np.float32)
	else:
	oriented_gray, M_orient = self._correct_orientation(gray)
	deskewed_gray, M_skew = self._deskew_image(oriented_gray)

	# Combine matrices: M_total = M_skew * M_orient
	M_orient_3x3 = np.vstack([M_orient, [0, 0, 1]])
	M_skew_3x3 = np.vstack([M_skew, [0, 0, 1]])
	M_total_3x3 = M_skew_3x3 @ M_orient_3x3
	M = M_total_3x3[0:2, :] # Extract 2x3 affine matrix

	# Apply transformation to the original color image
	h, w = deskewed_gray.shape
	deskewed_line_image = cv2.warpAffine(
	line_image,
	M,
	(w, h),
	flags=cv2.INTER_CUBIC,
	borderMode=cv2.BORDER_REPLICATE,
	)

	# [FIX] Create Local Line Data that matches the deskewed/rotated image dimensions.
	# This prevents the fallback segmenter from using vertical dimensions on a horizontal image.
	local_line_data = {
	"text": line_data["text"],
	"conf": line_data["conf"],
	"left": [0], # Local coordinate system starts at 0
	"top": [0],
	"width": [w], # Use the ROTATED width
	"height": [h], # Use the ROTATED height
	"line": line_data.get("line", [0]),
	}

	if SAVE_WORD_SEGMENTER_OUTPUT_IMAGES:
	os.makedirs(self.output_folder, exist_ok=True)
	output_path = f"{self.output_folder}/word_segmentation/{safe_image_name}_{safe_line_number}_{safe_shortened_line_text}_deskewed.png"
	cv2.imwrite(output_path, deskewed_line_image)

	# ========================================================================
	# MAIN SEGMENTATION PIPELINE
	# ========================================================================
	approx_char_count = len(line_data["text"][0].replace(" ", ""))
	if approx_char_count == 0:
	return {}, False

	img_h, img_w = deskewed_gray.shape
	line_height = img_h
	estimated_char_height = img_h * 0.6
	avg_char_width_approx = img_w / approx_char_count

	# Block size from line height (resolution-independent); could be refined from median CC height in two-pass
	block_size = max(3, int(line_height * BLOCK_SIZE_FACTOR))
	if block_size % 2 == 0:
	block_size += 1

	# Noise threshold scaled by line height so behavior is resolution-independent
	effective_noise_threshold = NOISE_THRESHOLD * (
	line_height / REFERENCE_LINE_HEIGHT
	)

	# --- Noise check: skip primary pipeline if image is too noisy ---
	noise_level = self._estimate_noise(deskewed_gray)
	if noise_level > effective_noise_threshold:
	used_fallback = True
	final_output = self.fallback_segmenter.refine_words_bidirectional(
	local_line_data, deskewed_line_image
	)
	else:
	# --- Polarity: ensure dark text on light background for binarization ---
	gray_for_binary = self._normalize_polarity_for_binarization(deskewed_gray)

	# Refine block size from median CC height (Otsu pre-pass) when possible
	block_size = self._block_size_from_median_cc_height(
	gray_for_binary, line_height, block_size
	)

	# --- Binarization ---
	binary_adaptive = cv2.adaptiveThreshold(
	gray_for_binary,
	255,
	cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
	cv2.THRESH_BINARY_INV,
	block_size,
	C_VALUE,
	)
	otsu_thresh_val, _ = cv2.threshold(
	gray_for_binary, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
	)
	strict_thresh_val = otsu_thresh_val * 0.75
	_, binary_strict = cv2.threshold(
	gray_for_binary, strict_thresh_val, 255, cv2.THRESH_BINARY_INV
	)
	binary = cv2.bitwise_and(binary_adaptive, binary_strict)

	if SAVE_WORD_SEGMENTER_OUTPUT_IMAGES:
	output_path = f"{self.output_folder}/word_segmentation/{safe_image_name}_{safe_line_number}_{safe_shortened_line_text}_binary.png"
	cv2.imwrite(output_path, binary)

	# --- Morphological Closing ---
	morph_width = max(3, int(avg_char_width_approx * 0.40))
	morph_height = max(2, int(avg_char_width_approx * 0.1))
	kernel = cv2.getStructuringElement(
	cv2.MORPH_RECT, (morph_width, morph_height)
	)
	closed_binary = cv2.morphologyEx(
	binary, cv2.MORPH_CLOSE, kernel, iterations=1
	)

	# --- Noise Removal ---
	num_labels, labels, stats, _ = cv2.connectedComponentsWithStats(
	closed_binary, 8, cv2.CV_32S
	)
	clean_binary = np.zeros_like(binary)

	force_fallback = False
	significant_labels = 0
	if num_labels > 1:
	# Only count components with area > 3 pixels
	significant_labels = np.sum(stats[1:, cv2.CC_STAT_AREA] > 3)

	if approx_char_count > 0 and significant_labels > (approx_char_count * 12):
	force_fallback = True

	if num_labels > 1:
	areas = stats[1:, cv2.CC_STAT_AREA]
	if len(areas) == 0:
	clean_binary = binary
	areas = np.array([0])
	else:
	p1 = np.percentile(areas, 1)
	img_h, img_w = binary.shape
	line_h = img_h
	estimated_char_height = img_h * 0.7
	# Resolution-independent min area: (line_height * 0.05)^2 with floor
	min_area_threshold = max(
	MIN_AREA_FLOOR,
	int((line_h * MIN_AREA_HEIGHT_FRACTION) ** 2),
	)
	estimated_min_letter_area = max(
	2,
	int(estimated_char_height * 0.2 * estimated_char_height * 0.15),
	)
	area_threshold = max(
	min_area_threshold, min(p1, estimated_min_letter_area)
	)

	# Gap detection logic...
	sorted_areas = np.sort(areas)
	area_diffs = np.diff(sorted_areas)
	if len(sorted_areas) > 10 and len(area_diffs) > 0:
	jump_threshold = np.percentile(area_diffs, 95)
	significant_jump_thresh = max(10, jump_threshold * 3)
	jump_indices = np.where(area_diffs > significant_jump_thresh)[0]
	if len(jump_indices) > 0:
	gap_idx = jump_indices[0]
	area_before_gap = sorted_areas[gap_idx]
	final_threshold = max(area_before_gap + 1, area_threshold)
	final_threshold = min(final_threshold, 15)
	area_threshold = final_threshold

	for i in range(1, num_labels):
	if stats[i, cv2.CC_STAT_AREA] >= area_threshold:
	clean_binary[labels == i] = 255
	else:
	clean_binary = binary

	# Validate clean_binary is not empty before proceeding
	if (
	clean_binary is None
	or clean_binary.size == 0
	or len(clean_binary.shape) < 2
	):
	# If clean_binary is empty, fall back to proportional estimation
	return {}, False

	# --- Vertical Cropping ---
	horizontal_projection = np.sum(clean_binary, axis=1)
	y_start = 0
	non_zero_rows = np.where(horizontal_projection > 0)[0]
	if len(non_zero_rows) > 0:
	p_top = int(np.percentile(non_zero_rows, 5))
	p_bottom = int(np.percentile(non_zero_rows, 95))
	core_height = p_bottom - p_top
	trim_pixels = int(core_height * 0.1)
	y_start = max(0, p_top + trim_pixels)
	y_end = min(clean_binary.shape[0], p_bottom - trim_pixels)
	if y_end - y_start < 5:
	y_start = p_top
	y_end = p_bottom
	# Ensure y_end > y_start to avoid empty slice
	if y_end > y_start:
	analysis_image = clean_binary[y_start:y_end, :]
	else:
	# If slice would be empty, use the full image
	analysis_image = clean_binary
	else:
	analysis_image = clean_binary

	# Validate that analysis_image is not empty before proceeding
	if (
	analysis_image is None
	or analysis_image.size == 0
	or len(analysis_image.shape) < 2
	):
	# If analysis_image is empty, fall back to proportional estimation
	return {}, False

	if SAVE_WORD_SEGMENTER_OUTPUT_IMAGES:
	# Validate that analysis_image is not empty before writing
	if analysis_image.size > 0 and len(analysis_image.shape) >= 2:
	output_path = f"{self.output_folder}/word_segmentation/{safe_image_name}_{safe_line_number}_{safe_shortened_line_text}_clean_binary.png"
	cv2.imwrite(output_path, analysis_image)

	# --- Adaptive Search ---
	best_boxes = None
	successful_binary_image = None

	if not force_fallback:
	words = line_data["text"][0].split()
	target = len(words)
	backup_boxes_s1 = None
	best_stage1_diff = float("inf")
	best_stage1_factor = INITIAL_VALLEY_THRESHOLD_FACTOR

	# STAGE 1
	stage1_coarse = np.arange(
	INITIAL_VALLEY_THRESHOLD_FACTOR, 0.60, SEARCH_STAGE1_COARSE_STEP
	)
	for v_factor in stage1_coarse:
	curr_boxes = self._get_boxes_from_profile(
	analysis_image,
	avg_char_width_approx,
	min_space_factor,
	v_factor,
	)
	diff = abs(target - len(curr_boxes))
	is_geom_valid = self._is_geometry_valid(
	curr_boxes, words, estimated_char_height
	)
	if diff < best_stage1_diff:
	best_stage1_diff = diff
	best_stage1_factor = float(v_factor)

	if diff == 0:
	if is_geom_valid:
	best_boxes = curr_boxes
	successful_binary_image = analysis_image
	break
	else:
	if backup_boxes_s1 is None:
	backup_boxes_s1 = curr_boxes
	if (
	diff <= ALLOWED_WORD_MISMATCH_COUNT
	and backup_boxes_s1 is None
	and is_geom_valid
	):
	backup_boxes_s1 = curr_boxes

	# Refine around best coarse factor only when needed.
	if best_boxes is None:
	lower = max(
	INITIAL_VALLEY_THRESHOLD_FACTOR,
	best_stage1_factor - SEARCH_STAGE1_COARSE_STEP,
	)
	upper = min(0.60, best_stage1_factor + SEARCH_STAGE1_COARSE_STEP)
	for v_factor in np.arange(
	lower, upper + 1e-9, SEARCH_STAGE1_FINE_STEP
	):
	curr_boxes = self._get_boxes_from_profile(
	analysis_image,
	avg_char_width_approx,
	min_space_factor,
	v_factor,
	)
	diff = abs(target - len(curr_boxes))
	is_geom_valid = self._is_geometry_valid(
	curr_boxes, words, estimated_char_height
	)
	if diff == 0 and is_geom_valid:
	best_boxes = curr_boxes
	successful_binary_image = analysis_image
	break
	if (
	diff <= ALLOWED_WORD_MISMATCH_COUNT
	and backup_boxes_s1 is None
	and is_geom_valid
	):
	backup_boxes_s1 = curr_boxes

	# STAGE 2 (if needed)
	if best_boxes is None:
	backup_boxes_s2 = None
	best_stage2_diff = float("inf")
	best_stage2_factor = INITIAL_KERNEL_WIDTH_FACTOR
	for k_factor in np.arange(
	INITIAL_KERNEL_WIDTH_FACTOR, 0.5, SEARCH_STAGE2_COARSE_STEP
	):
	k_w = max(1, int(avg_char_width_approx * k_factor))
	s2_bin = cv2.morphologyEx(
	clean_binary, cv2.MORPH_CLOSE, np.ones((1, k_w), np.uint8)
	)
	s2_img = (
	s2_bin[y_start:y_end, :]
	if len(non_zero_rows) > 0
	else s2_bin
	)

	if s2_img is None or s2_img.size == 0:
	continue

	curr_boxes = self._get_boxes_from_profile(
	s2_img,
	avg_char_width_approx,
	min_space_factor,
	MAIN_VALLEY_THRESHOLD_FACTOR,
	)
	diff = abs(target - len(curr_boxes))
	if diff < best_stage2_diff:
	best_stage2_diff = diff
	best_stage2_factor = float(k_factor)
	is_geom_valid = self._is_geometry_valid(
	curr_boxes, words, estimated_char_height
	)

	if diff == 0 and is_geom_valid:
	best_boxes = curr_boxes
	successful_binary_image = s2_bin
	break

	if (
	diff <= ALLOWED_WORD_MISMATCH_COUNT
	and backup_boxes_s2 is None
	and is_geom_valid
	):
	backup_boxes_s2 = curr_boxes

	if best_boxes is None:
	lower = max(
	INITIAL_KERNEL_WIDTH_FACTOR,
	best_stage2_factor - SEARCH_STAGE2_COARSE_STEP,
	)
	upper = min(0.5, best_stage2_factor + SEARCH_STAGE2_COARSE_STEP)
	for k_factor in np.arange(
	lower, upper + 1e-9, SEARCH_STAGE2_FINE_STEP
	):
	k_w = max(1, int(avg_char_width_approx * k_factor))
	s2_bin = cv2.morphologyEx(
	clean_binary,
	cv2.MORPH_CLOSE,
	np.ones((1, k_w), np.uint8),
	)
	s2_img = (
	s2_bin[y_start:y_end, :]
	if len(non_zero_rows) > 0
	else s2_bin
	)
	if s2_img is None or s2_img.size == 0:
	continue
	curr_boxes = self._get_boxes_from_profile(
	s2_img,
	avg_char_width_approx,
	min_space_factor,
	MAIN_VALLEY_THRESHOLD_FACTOR,
	)
	diff = abs(target - len(curr_boxes))
	is_geom_valid = self._is_geometry_valid(
	curr_boxes, words, estimated_char_height
	)
	if diff == 0 and is_geom_valid:
	best_boxes = curr_boxes
	successful_binary_image = s2_bin
	break
	if (
	diff <= ALLOWED_WORD_MISMATCH_COUNT
	and backup_boxes_s2 is None
	and is_geom_valid
	):
	backup_boxes_s2 = curr_boxes

	if best_boxes is None:
	if backup_boxes_s1 is not None:
	best_boxes = backup_boxes_s1
	successful_binary_image = analysis_image
	elif backup_boxes_s2 is not None:
	best_boxes = backup_boxes_s2
	successful_binary_image = clean_binary

	final_output = None
	used_fallback = False

	if best_boxes is None:
	# --- FALLBACK WITH ROTATED DATA ---
	used_fallback = True
	# [FIX] Use local_line_data (rotated dims) instead of line_data (original dims)
	final_output = self.fallback_segmenter.refine_words_bidirectional(
	local_line_data, deskewed_line_image
	)
	else:
	# --- CCA Refinement ---
	unlabeled_boxes = best_boxes
	if successful_binary_image is analysis_image:
	cca_source_image = clean_binary
	else:
	cca_source_image = successful_binary_image

	num_labels, _, stats, _ = cv2.connectedComponentsWithStats(
	cca_source_image, 8, cv2.CV_32S
	)
	cca_img_h, cca_img_w = cca_source_image.shape[:2]

	component_assignments = {}
	num_proc = min(len(words), len(unlabeled_boxes))
	min_valid_component_area = estimated_char_height * 2
	box_meta = []
	for i in range(num_proc):
	box_x, box_y, box_w, box_h = unlabeled_boxes[i]
	box_r = box_x + box_w
	box_center_x = box_x + box_w / 2
	box_meta.append((i, box_x, box_r, box_center_x, box_w))
	box_starts = [meta[1] for meta in box_meta]

	for j in range(1, num_labels):
	comp_x = stats[j, cv2.CC_STAT_LEFT]
	comp_w = stats[j, cv2.CC_STAT_WIDTH]
	comp_area = stats[j, cv2.CC_STAT_AREA]
	comp_r = comp_x + comp_w
	comp_center_x = comp_x + comp_w / 2
	comp_y = stats[j, cv2.CC_STAT_TOP]
	comp_h = stats[j, cv2.CC_STAT_HEIGHT]
	comp_center_y = comp_y + comp_h / 2

	if (
	comp_center_y < cca_img_h * 0.1
	or comp_center_y > cca_img_h * 0.9
	):
	continue
	if comp_area < min_valid_component_area:
	continue

	best_box_idx = None
	max_overlap = 0
	best_center_distance = float("inf")
	component_center_in_box = False

	# Assign components to boxes...
	# Candidate pruning: only evaluate boxes near this component.
	left_search = max(0, comp_x - comp_w)
	right_search = comp_r + comp_w
	start_idx = bisect_left(box_starts, left_search)
	idx = start_idx
	while idx < len(box_meta) and box_meta[idx][1] <= right_search:
	i, box_x, box_r, box_center_x, box_w = box_meta[idx]
	idx += 1
	if comp_w > box_w * 1.5:
	continue

	if comp_x < box_r and box_x < comp_r:
	overlap_start = max(comp_x, box_x)
	overlap_end = min(comp_r, box_r)
	overlap = overlap_end - overlap_start

	if overlap > 0:
	center_in_box = box_x <= comp_center_x < box_r
	center_distance = abs(comp_center_x - box_center_x)

	if center_in_box:
	if (
	not component_center_in_box
	or overlap > max_overlap
	):
	component_center_in_box = True
	best_center_distance = center_distance
	max_overlap = overlap
	best_box_idx = i
	elif not component_center_in_box:
	if center_distance < best_center_distance or (
	center_distance == best_center_distance
	and overlap > max_overlap
	):
	best_center_distance = center_distance
	max_overlap = overlap
	best_box_idx = i

	if best_box_idx is not None:
	component_assignments[j] = best_box_idx

	refined_boxes_list = []
	for i in range(num_proc):
	word_label = words[i]
	components_in_box = [
	stats[j] for j, b in component_assignments.items() if b == i
	]

	use_original_box = False
	if not components_in_box:
	use_original_box = True
	else:
	min_x = min(c[cv2.CC_STAT_LEFT] for c in components_in_box)
	min_y = min(c[cv2.CC_STAT_TOP] for c in components_in_box)
	max_r = max(
	c[cv2.CC_STAT_LEFT] + c[cv2.CC_STAT_WIDTH]
	for c in components_in_box
	)
	max_b = max(
	c[cv2.CC_STAT_TOP] + c[cv2.CC_STAT_HEIGHT]
	for c in components_in_box
	)
	cca_h = max(1, max_b - min_y)
	if cca_h < (estimated_char_height * 0.35):
	use_original_box = True

	if use_original_box:
	box_x, box_y, box_w, box_h = unlabeled_boxes[i]
	adjusted_box_y = y_start + box_y
	refined_boxes_list.append(
	{
	"text": word_label,
	"left": box_x,
	"top": adjusted_box_y,
	"width": box_w,
	"height": box_h,
	"conf": line_data["conf"][0],
	}
	)
	else:
	refined_boxes_list.append(
	{
	"text": word_label,
	"left": min_x,
	"top": min_y,
	"width": max(1, max_r - min_x),
	"height": cca_h,
	"conf": line_data["conf"][0],
	}
	)

	# Check validity
	cca_check_list = [
	(b["left"], b["top"], b["width"], b["height"])
	for b in refined_boxes_list
	]
	if not self._is_geometry_valid(
	cca_check_list, words, estimated_char_height
	):
	if abs(len(refined_boxes_list) - len(words)) > 1:
	best_boxes = None # Trigger fallback
	else:
	final_output = {
	k: []
	for k in ["text", "left", "top", "width", "height", "conf"]
	}
	for box in refined_boxes_list:
	for key in final_output.keys():
	final_output[key].append(box[key])
	else:
	final_output = {
	k: []
	for k in ["text", "left", "top", "width", "height", "conf"]
	}
	for box in refined_boxes_list:
	for key in final_output.keys():
	final_output[key].append(box[key])

	# --- REPEAT FALLBACK IF VALIDATION FAILED ---
	if best_boxes is None and not used_fallback:
	used_fallback = True
	# [FIX] Use local_line_data here too
	final_output = self.fallback_segmenter.refine_words_bidirectional(
	local_line_data, deskewed_line_image
	)

	# ========================================================================
	# COORDINATE TRANSFORMATION (Map back to Original)
	# ========================================================================
	M_inv = cv2.invertAffineTransform(M)
	remapped_boxes_list = []
	for i in range(len(final_output["text"])):
	left, top = final_output["left"][i], final_output["top"][i]
	width, height = final_output["width"][i], final_output["height"][i]

	# Map the 4 corners
	corners = np.array(
	[
	[left, top],
	[left + width, top],
	[left + width, top + height],
	[left, top + height],
	],
	dtype="float32",
	)
	corners_expanded = np.expand_dims(corners, axis=1)
	original_corners = cv2.transform(corners_expanded, M_inv)
	squeezed_corners = original_corners.squeeze(axis=1)

	# Get axis aligned bounding box in original space
	min_x = int(np.min(squeezed_corners[:, 0]))
	max_x = int(np.max(squeezed_corners[:, 0]))
	min_y = int(np.min(squeezed_corners[:, 1]))
	max_y = int(np.max(squeezed_corners[:, 1]))

	remapped_boxes_list.append(
	{
	"text": final_output["text"][i],
	"left": min_x,
	"top": min_y,
	"width": max_x - min_x,
	"height": max_y - min_y,
	"conf": final_output["conf"][i],
	}
	)

	remapped_output = {k: [] for k in final_output.keys()}
	for box in remapped_boxes_list:
	for key in remapped_output.keys():
	remapped_output[key].append(box[key])

	img_h, img_w = line_image.shape[:2]
	remapped_output = self._enforce_logical_constraints(
	remapped_output, img_w, img_h
	)

	# ========================================================================
	# FINAL SAFETY NET
	# ========================================================================
	words = line_data["text"][0].split()
	target_count = len(words)
	current_count = len(remapped_output["text"])
	has_collapsed_boxes = any(w < 3 for w in remapped_output["width"])

	if current_count > 0:
	total_text_len = sum(len(t) for t in remapped_output["text"])
	total_box_width = sum(remapped_output["width"])
	avg_width_pixels = total_box_width / max(1, total_text_len)
	else:
	avg_width_pixels = 0
	is_suspiciously_thin = avg_width_pixels < 4

	if current_count != target_count or is_suspiciously_thin or has_collapsed_boxes:
	used_fallback = True

	# [FIX] Do NOT use original line_image/line_data here.
	# Use the local_line_data + deskewed_line_image pipeline,
	# then transform back using M_inv (same as above).

	# 1. Run fallback on rotated data
	temp_local_output = self.fallback_segmenter.refine_words_bidirectional(
	local_line_data, deskewed_line_image
	)

	# 2. If bidirectional failed to split correctly, use purely mathematical split on rotated data
	if len(temp_local_output["text"]) != target_count:
	h, w = deskewed_line_image.shape[:2]
	temp_local_output = self.fallback_segmenter.convert_line_to_word_level(
	local_line_data, w, h
	)

	# 3. Transform the result back to original coordinates (M_inv)
	# (Repeating the transformation logic for the safety net result)
	remapped_boxes_list = []
	for i in range(len(temp_local_output["text"])):
	left, top = temp_local_output["left"][i], temp_local_output["top"][i]
	width, height = (
	temp_local_output["width"][i],
	temp_local_output["height"][i],
	)

	corners = np.array(
	[
	[left, top],
	[left + width, top],
	[left + width, top + height],
	[left, top + height],
	],
	dtype="float32",
	)
	corners_expanded = np.expand_dims(corners, axis=1)
	original_corners = cv2.transform(corners_expanded, M_inv)
	squeezed_corners = original_corners.squeeze(axis=1)

	min_x = int(np.min(squeezed_corners[:, 0]))
	max_x = int(np.max(squeezed_corners[:, 0]))
	min_y = int(np.min(squeezed_corners[:, 1]))
	max_y = int(np.max(squeezed_corners[:, 1]))

	remapped_boxes_list.append(
	{
	"text": temp_local_output["text"][i],
	"left": min_x,
	"top": min_y,
	"width": max_x - min_x,
	"height": max_y - min_y,
	"conf": temp_local_output["conf"][i],
	}
	)

	remapped_output = {k: [] for k in temp_local_output.keys()}
	for box in remapped_boxes_list:
	for key in remapped_output.keys():
	remapped_output[key].append(box[key])

	if SAVE_WORD_SEGMENTER_OUTPUT_IMAGES:
	output_path = f"{self.output_folder}/word_segmentation/{safe_image_name}_{safe_shortened_line_text}_final_boxes.png"
	os.makedirs(f"{self.output_folder}/word_segmentation", exist_ok=True)
	output_image_vis = line_image.copy()
	for i in range(len(remapped_output["text"])):
	x, y, w, h = (
	int(remapped_output["left"][i]),
	int(remapped_output["top"][i]),
	int(remapped_output["width"][i]),
	int(remapped_output["height"][i]),
	)
	cv2.rectangle(output_image_vis, (x, y), (x + w, y + h), (0, 255, 0), 2)
	cv2.imwrite(output_path, output_image_vis)

	return remapped_output, used_fallback


	class HybridWordSegmenter:
	"""
	Implements a two-step approach for word segmentation:
	1. Proportional estimation based on text (primary; avoids image noise).
	2. Image-based refinement with a "Bounded Scan" that cannot shrink boxes
	beyond a fraction of the text-based width.

	Design: Relies more on expected character spacing from the text than on
	image analysis, so noisy images are less likely to produce tiny or
	missing boxes.

	Situations that could otherwise cause very small boxes (and how we mitigate):
	- False gaps in the vertical projection (noise/speckle) -> refinement is
	bounded by shrink_limit_fraction; initial boxes use proportional only.
	- Image-based "justified" gap anchoring picking wrong cuts -> we do not
	use vertical_projection for initial segmentation here; only proportional.
	- Bidirectional scan snapping to a thin low-density strip inside a word ->
	same shrink bound; fallback "thinnest point" also clamped.
	- De-overlapping stealing space from the next word -> shrink bound keeps
	each box at least (1 - shrink_limit_fraction) of initial width.

	ROBUSTNESS UPGRADES:
	- Uses Horizontal Smearing to prevent cutting inside noisy characters.
	- Uses Gaussian Blur to suppress speckle noise.
	- Implements 'Noise Floors' for gap detection (never assumes perfect 0).
	"""

	def convert_line_to_word_level(
	    self,
	    line_data: Dict[str, List],
	    image_width: int,
	    image_height: int,
	    vertical_projection: np.ndarray = None,
	) -> Dict[str, List]:
	    """
	    Turn a single OCR line into one box per whitespace-separated word.

	    Only the first entry of every list in ``line_data`` is read. Two
	    strategies are tried in order:

	    1. Gap anchoring - used when ``vertical_projection`` is given, its
	       length equals ``image_width`` and the line has more than one word.
	       ``_find_widest_zero_gaps`` is asked for one gap per inter-word space;
	       if it returns exactly that many, word boundaries are placed at the
	       gap midpoints (offset by the line's ``left``).
	    2. Proportional estimation - the line width is shared between words
	       according to ``get_weighted_length`` of each word, with a minimum
	       word width of ``max(5, 0.5 * max(3, line_width / char_count))``.

	    Args:
	        line_data: Line-level OCR dict with "text", "left", "top", "width",
	            "height" and "conf" lists.
	        image_width: Width used to validate the projection and to clamp
	            each word's left edge in the proportional branch.
	        image_height: Height used to scale the gap noise threshold.
	        vertical_projection: Optional per-column ink sums.

	    Returns:
	        Dict with "text", "left", "top", "width", "height", "conf" lists,
	        one entry per word; all lists are empty when there is no usable text.
	    """
	    columns = ("text", "left", "top", "width", "height", "conf")
	    word_level = {name: [] for name in columns}

	    if not line_data or not line_data.get("text"):
	        return word_level

	    text = line_data["text"][0]
	    origin_x = float(line_data["left"][0])
	    origin_y = float(line_data["top"][0])
	    span = float(line_data["width"][0])
	    tall = float(line_data["height"][0])
	    confidence = line_data["conf"][0]

	    # A blank / whitespace-only line splits into no tokens at all.
	    tokens = text.split()
	    glyph_count = sum(len(token) for token in tokens)
	    if not tokens or glyph_count == 0:
	        return word_level

	    def record(token, x, w):
	        # Every word inherits the line's top, height and confidence.
	        row = (token, x, origin_y, w, tall, confidence)
	        for name, value in zip(columns, row):
	            word_level[name].append(value)

	    # --- Justified text: anchor cut points to the widest near-empty gaps ---
	    space_count = len(tokens) - 1
	    projection_usable = (
	        vertical_projection is not None
	        and len(vertical_projection) == image_width
	        and space_count > 0
	    )
	    if projection_usable:
	        # Up to 3% of the column height (never less than two full-intensity
	        # pixels) may be ink and the column still counts as part of a gap.
	        noise_allowance = max(255.0 * 0.03 * image_height, 255.0 * 2)
	        gaps = _find_widest_zero_gaps(
	            vertical_projection, n=space_count, gap_threshold=noise_allowance
	        )
	        if len(gaps) == space_count:
	            edges = [0, *((lo + hi) // 2 for lo, hi in gaps), image_width]
	            for token, begin, finish in zip(tokens, edges, edges[1:]):
	                record(token, origin_x + begin, max(1, finish - begin))
	            return word_level

	    # --- Proportional estimation ---
	    line_weight = get_weighted_length(text)
	    if line_weight <= 0:
	        line_weight = 1.0
	    px_per_weight = span / line_weight
	    space_px = get_weighted_length(" ") * px_per_weight

	    # glyph_count is known to be positive here, so the division is safe.
	    narrowest = max(5.0, max(3.0, span / glyph_count) * 0.5)

	    cursor = origin_x
	    for token in tokens:
	        token_px = max(narrowest, get_weighted_length(token) * px_per_weight)
	        record(token, max(0, min(cursor, image_width)), token_px)
	        cursor += token_px + space_px

	    return word_level

	def _run_single_pass(
	self,
	initial_boxes: List[Dict],
	vertical_projection: np.ndarray,
	max_scan_distance: int,
	img_w: int,
	img_h: int,
	direction: str = "ltr",
	trailing_punctuation: List[bool] = None,
	shrink_limit_fraction: float = 0.5,
	) -> List[Dict]:
	"""
	Helper function to run one pass of refinement.
	ROBUSTNESS UPGRADE:
	- Uses a 'gap_noise_floor' instead of looking for 0.
	- Enforces 'safety_density_limit': if the "thinnest" point is still thick (ink),
	it refuses to cut there (prevents cutting bold letters).
	- shrink_limit_fraction: Refinement cannot shrink a box by more than this fraction
	of its initial (text-based) width from either edge. Prevents noise from creating
	tiny boxes; keeps segmentation anchored to expected character spacing.
	"""

	refined_boxes = [box.copy() for box in initial_boxes]
	if trailing_punctuation is None:
	trailing_punctuation = [False] * len(initial_boxes)

	# ROBUSTNESS: Define what constitutes a "gap" vs "ink"
	# 1. Gap Floor: Anything below 5% of image height is treated as empty space (noise tolerance)
	gap_noise_floor = 255.0 * (img_h * 0.05)

	# 2. Ink Safety Limit: If the "thinnest" point has > 25% ink density, it is NOT a gap.
	# It's a character. Do not cut.
	safety_density_limit = 255.0 * (img_h * 0.25)

	if direction == "ltr":
	last_corrected_right_edge = 0
	indices = range(len(refined_boxes))
	else: # rtl
	next_corrected_left_edge = img_w
	indices = range(len(refined_boxes) - 1, -1, -1)

	for i in indices:
	box = refined_boxes[i]
	left = int(box["left"])
	right = int(box["left"] + box["width"])
	init_width = max(1, int(box["width"]))
	# Bounds from initial (text-based) box: don't let image refinement shrink too much
	min_right = right - int(shrink_limit_fraction * init_width)
	max_left = left + int(shrink_limit_fraction * init_width)

	left = max(0, min(left, img_w - 1))
	right = max(0, min(right, img_w - 1))

	new_left, new_right = left, right

	if direction == "ltr" or direction == "both": # Scan right
	if right < img_w:
	scan_limit = min(img_w, right + max_scan_distance)
	search_range = range(right, scan_limit)

	best_x = right
	min_density = float("inf")
	found_gap = False
	first_gap_x = None

	for x in search_range:
	density = vertical_projection[x]

	# Check for Gap
	if density <= gap_noise_floor:
	first_gap_x = x
	found_gap = True
	break

	# Track minimum density for fallback
	if density < min_density:
	min_density = density
	best_x = x

	if found_gap and first_gap_x is not None:
	if trailing_punctuation[i]:
	# Logic to jump over the gap and include the punctuation blob
	# ... (same safety limits as before) ...
	proj_len = len(vertical_projection)
	x_pos = first_gap_x

	# 1. Cross the gap
	gap_safety_limit = x_pos + (max_scan_distance // 2)
	while (
	x_pos < scan_limit
	and x_pos < proj_len
	and vertical_projection[x_pos] <= gap_noise_floor
	):
	if x_pos >= gap_safety_limit:
	break
	x_pos += 1

	# 2. Consume blob
	blob_start = x_pos
	blob_safety_limit = blob_start + max(1, int(img_h * 0.5))
	while (
	x_pos < scan_limit
	and x_pos < proj_len
	and vertical_projection[x_pos] > gap_noise_floor
	):
	if x_pos >= blob_safety_limit:
	x_pos = first_gap_x # Revert
	break
	x_pos += 1
	new_right = min(x_pos, scan_limit)
	else:
	new_right = first_gap_x

	elif not found_gap:
	# Fallback: No clear gap found.
	# ROBUSTNESS CHECK: Is the "thinnest" point actually thin?
	if min_density < safety_density_limit:
	new_right = best_x
	else:
	# The thinnest point is still very dark (ink).
	# Don't cut through a letter. Keep original guess or limit.
	new_right = right

	if direction == "rtl" or direction == "both": # Scan left
	if left > 0:
	scan_limit = max(0, left - max_scan_distance)
	search_range = range(left, scan_limit, -1)

	best_x = left
	min_density = float("inf")
	found_gap = False

	for x in search_range:
	density = vertical_projection[x]

	if density <= gap_noise_floor:
	new_left = x
	found_gap = True
	break

	if density < min_density:
	min_density = density
	best_x = x

	if not found_gap:
	# ROBUSTNESS CHECK
	if min_density < safety_density_limit:
	new_left = best_x
	else:
	# Refuse to cut through dense ink
	new_left = left

	# --- Anchor to text: don't shrink past allowed fraction of initial width ---
	new_right = max(new_right, min_right)
	new_left = min(new_left, max_left)

	# --- Directional de-overlapping ---
	if direction == "ltr":
	if new_left < last_corrected_right_edge:
	new_left = last_corrected_right_edge
	if new_right <= new_left:
	new_right = new_left + 1
	last_corrected_right_edge = new_right
	else: # rtl
	if new_right > next_corrected_left_edge:
	new_right = next_corrected_left_edge
	if new_left >= new_right:
	new_left = new_right - 1
	next_corrected_left_edge = new_left

	box["left"] = new_left
	box["width"] = max(1, new_right - new_left)

	return refined_boxes

	def refine_words_bidirectional(
	self,
	line_data: Dict[str, List],
	line_image: np.ndarray,
	) -> Dict[str, List]:
	"""
	Refines boxes using a robust bidirectional scan.
	DIFFERENCE FROM MAIN SEGMENTER: Uses aggressive smoothing and horizontal
	smearing to force-merge characters, prioritizing word separation over
	character detail.
	"""
	if line_image is None:
	return line_data

	# Handle grayscale (2D) or BGR (3D) line images
	if len(line_image.shape) == 2:
	gray = np.ascontiguousarray(line_image)
	else:
	gray = cv2.cvtColor(line_image, cv2.COLOR_BGR2GRAY)
	img_h, img_w = gray.shape[:2]

	# OpenCV GaussianBlur(5,5) and later adaptiveThreshold need minimum dimensions.
	# Avoid "Unknown C++ exception" on very small line crops (e.g. 1–4 px).
	if img_h < 5 or img_w < 5:
	return self.convert_line_to_word_level(line_data, img_w, img_h)

	if line_data and line_data.get("text"):
	words = line_data["text"][0].split()
	if len(words) <= 1:
	return self.convert_line_to_word_level(line_data, img_w, img_h)

	# --- PRE-PROCESSING: The "Bulldozer" Approach ---
	# 1. Gaussian Blur: Suppress high-frequency speckle noise that confuses the main segmenter
	# We accept slight edge blurring for the sake of noise reduction.
	# OpenCV can intermittently throw low-information C++ exceptions on some
	# page crops (often due to dtype/range/nan/inf issues). If that happens,
	# fall back to the non-image-based word conversion to keep OCR flowing.
	try:
	# Guard against NaN/Inf propagating into OpenCV internals.
	if gray.dtype.kind in ("f", "c"):
	gray = np.nan_to_num(gray, nan=0.0, posinf=255.0, neginf=0.0)

	# GaussianBlur is most stable on uint8 or float32. If we have another
	# dtype (e.g. int16/float64/object), normalize and cast.
	if gray.dtype != np.uint8 and gray.dtype != np.float32:
	# Normalize to 0..255 if range looks unusual.
	gmin = float(np.min(gray)) if gray.size else 0.0
	gmax = float(np.max(gray)) if gray.size else 255.0
	if gmax > 255.0 or gmin < 0.0:
	gray = cv2.normalize(gray, None, 0, 255, cv2.NORM_MINMAX)
	gray = np.clip(gray, 0, 255).astype(np.uint8)

	blurred_gray = cv2.GaussianBlur(gray, (5, 5), 0)
	except Exception:
	return self.convert_line_to_word_level(line_data, img_w, img_h)

	# 2. Aggressive Thresholding
	# We use a larger block size here to be less sensitive to local texture variations
	block_size = max(25, int(img_h * 0.5))
	if block_size % 2 == 0:
	block_size += 1

	binary = cv2.adaptiveThreshold(
	blurred_gray,
	255,
	cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
	cv2.THRESH_BINARY_INV,
	block_size,
	10,
	)

	# 3. Horizontal Smearing (The critical difference)
	# We intentionally smear mostly horizontally to bridge gaps inside noisy letters.
	# Kernel width: ~15-20% of line height.
	smear_w = max(3, int(img_h * 0.20))
	smear_h = max(1, int(img_h * 0.05))
	kernel_smear = cv2.getStructuringElement(cv2.MORPH_RECT, (smear_w, smear_h))

	# Apply Morphological Closing
	binary_smeared = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel_smear)

	# Calculate projection on the SMEARED image
	vertical_projection = np.sum(binary_smeared, axis=0)

	# --- Setup for Scan ---
	# Detect blobs to estimate character width for scan limiting
	char_blobs = []
	in_blob = False
	blob_start = 0
	for x, col_sum in enumerate(vertical_projection):
	if col_sum > 0 and not in_blob:
	blob_start = x
	in_blob = True
	elif col_sum == 0 and in_blob:
	char_blobs.append((blob_start, x))
	in_blob = False
	if in_blob:
	char_blobs.append((blob_start, img_w))

	if not char_blobs:
	return self.convert_line_to_word_level(line_data, img_w, img_h)

	total_chars = len("".join(words))
	if total_chars > 0:
	geom_avg_char_width = img_w / total_chars
	else:
	geom_avg_char_width = 10

	blob_avg_char_width = np.mean([end - start for start, end in char_blobs])
	safe_avg_char_width = min(blob_avg_char_width, geom_avg_char_width * 1.5)

	# Scan distance parameters
	max_scan_distance = max(int(safe_avg_char_width * 2.5), int(img_h * 0.6))
	min_safe_box_width = max(4, int(safe_avg_char_width * 0.5))

	# --- Standard Logic Continues ---
	# Use proportional estimation only (no vertical_projection) so initial boxes
	# are driven by text/character spacing. Image-based gap anchoring on noisy
	# images can produce tiny slices; refinement will still run but is bounded.
	estimated_data = self.convert_line_to_word_level(line_data, img_w, img_h)
	if not estimated_data["text"]:
	return estimated_data

	initial_boxes = []
	for i in range(len(estimated_data["text"])):
	initial_boxes.append(
	{
	"text": estimated_data["text"][i],
	"left": estimated_data["left"][i],
	"top": estimated_data["top"][i],
	"width": estimated_data["width"][i],
	"height": estimated_data["height"][i],
	"conf": estimated_data["conf"][i],
	}
	)

	trailing_punctuation = [
	_word_ends_with_punctuation(estimated_data["text"][j])
	for j in range(len(estimated_data["text"]))
	]

	# Run passes (ensure _run_single_pass uses the robust gap logic)
	ltr_boxes = self._run_single_pass(
	initial_boxes,
	vertical_projection,
	max_scan_distance,
	img_w,
	img_h,
	"ltr",
	trailing_punctuation,
	)
	rtl_boxes = self._run_single_pass(
	initial_boxes,
	vertical_projection,
	max_scan_distance,
	img_w,
	img_h,
	"rtl",
	trailing_punctuation,
	)

	# [Re-use stitching logic from previous code...]
	combined_boxes = [box.copy() for box in initial_boxes]
	for i in range(len(combined_boxes)):
	final_left = ltr_boxes[i]["left"]
	rtl_right = rtl_boxes[i]["left"] + rtl_boxes[i]["width"]
	combined_boxes[i]["left"] = final_left
	combined_boxes[i]["width"] = max(min_safe_box_width, rtl_right - final_left)

	for i in range(len(combined_boxes) - 1):
	if combined_boxes[i + 1]["left"] <= combined_boxes[i]["left"]:
	combined_boxes[i + 1]["left"] = (
	combined_boxes[i]["left"] + min_safe_box_width
	)

	for i in range(len(combined_boxes) - 1):
	curr = combined_boxes[i]
	nxt = combined_boxes[i + 1]
	gap_width = nxt["left"] - curr["left"]
	curr["width"] = max(min_safe_box_width, gap_width)

	final_output = {k: [] for k in estimated_data.keys()}
	for box in combined_boxes:
	# Always keep one box per word; enforce minimum width 1 for valid geometry
	box_width = max(1, box["width"])
	box["width"] = box_width
	for key in final_output.keys():
	final_output[key].append(box[key])

	return final_output