#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Enhanced Text Extractor Module with CJK Support
Provides superior text extraction from HTML with proper Unicode handling
Optimized for Korean, Japanese, and Chinese content extraction
"""
import os
import re
import html
import unicodedata
from typing import Tuple, Optional
import chardet
# BEAUTIFUL SOUP IMPORT MONKEY FIX - Import BeautifulSoup BEFORE html2text
# This prevents certain parser initialization issues
try:
from bs4 import BeautifulSoup
# Force BeautifulSoup to initialize its parsers
_ = BeautifulSoup("", 'html.parser')
except ImportError:
BeautifulSoup = None
raise ImportError("BeautifulSoup is required. Install with: pip install beautifulsoup4")
# Now import html2text AFTER BeautifulSoup
try:
import html2text
except ImportError:
html2text = None
raise ImportError("html2text is required. Install with: pip install html2text")
class EnhancedTextExtractor:
"""Enhanced text extraction with proper Unicode and CJK handling"""
# Unicode preservation mappings
UNICODE_QUOTES = {
# Western quotes
'“': '\u201c', # Left double quotation mark
'”': '\u201d', # Right double quotation mark
'‘': '\u2018', # Left single quotation mark
'’': '\u2019', # Right single quotation mark
'"': '"', # Standard double quote
''': "'", # Standard apostrophe
# CJK quotes and punctuation
'「': '「', # Japanese left corner bracket
'」': '」', # Japanese right corner bracket
'『': '『', # Japanese left white corner bracket
'』': '』', # Japanese right white corner bracket
'(': '(', # Fullwidth left parenthesis
')': ')', # Fullwidth right parenthesis
'【': '【', # Left black lenticular bracket
'】': '】', # Right black lenticular bracket
'《': '《', # Left double angle bracket
'》': '》', # Right double angle bracket
';': ';', # Fullwidth semicolon
':': ':', # Fullwidth colon
'。': '。', # Ideographic full stop
'?': '?', # Fullwidth question mark
'!': '!', # Fullwidth exclamation mark
'、': '、', # Ideographic comma
# Numeric entities
'“': '\u201c', # Left double quote (numeric)
'”': '\u201d', # Right double quote (numeric)
'‘': '\u2018', # Left single quote (numeric)
'’': '\u2019', # Right single quote (numeric)
# Common CJK entities
'…': '…', # Horizontal ellipsis
'—': '—', # Em dash
'–': '–', # En dash
' ': '\u00A0', # Non-breaking space
}
# CJK-specific punctuation to preserve
CJK_PUNCTUATION = {
'。', '、', '!', '?', '…', '—', '~', '・',
'「', '」', '『', '』', '(', ')', '【', '】',
'《', '》', '〈', '〉', '〔', '〕', '[', ']',
':', ';', '"', '"', ''', ''',
',', '.', '?', '!', ':', ';',
'"', '"', '‚', '„', '«', '»',
}
# Quote protection markers
QUOTE_MARKERS = {
'"': '␥', # Opening double quote marker
'"': '␦', # Closing double quote marker
'"': '␦', # Alternative closing quote
"'": '␣', # Opening single quote marker
"'": '', # Closing single quote marker
"'": '', # Alternative closing quote
}
# Angle bracket protection markers for invalid tags
ANGLE_BRACKET_MARKERS = {
'<': '‹', # Single left-pointing angle quotation mark (U+2039)
'>': '›', # Single right-pointing angle quotation mark (U+203A)
}
def __init__(self, filtering_mode: str = "smart", preserve_structure: bool = True):
    """Create an extractor and build its html2text converter.

    Args:
        filtering_mode: Stored unchanged on ``self.filtering_mode``.
        preserve_structure: Stored on ``self.preserve_structure``; read by
            ``_configure_html2text`` to pick the structure-related options.

    Raises:
        ImportError: If the module-level ``html2text`` or ``BeautifulSoup``
            name is falsy (html2text is checked first).
    """
    # Each dependency is paired with the message raised when it is missing;
    # the order of the table is the order of the checks.
    dependency_errors = (
        (html2text, "html2text is required for enhanced extraction"),
        (BeautifulSoup, "BeautifulSoup is required for enhanced extraction"),
    )
    for dependency, message in dependency_errors:
        if not dependency:
            raise ImportError(message)

    self.filtering_mode = filtering_mode
    self.preserve_structure = preserve_structure
    self.h2t = None  # replaced by _configure_html2text() just below
    self.detected_language = None
    self._configure_html2text()
def _detect_encoding(self, content: bytes) -> str:
"""Detect the encoding of the content"""
try:
# Try chardet detection
detected = chardet.detect(content)
if detected['confidence'] > 0.7:
return detected['encoding']
except Exception:
pass
# Try common CJK encodings in order
for encoding in ['utf-8', 'gb2312', 'gbk', 'gb18030', 'big5', 'shift_jis', 'euc-kr', 'euc-jp']:
try:
content.decode(encoding)
return encoding
except Exception:
continue
return 'utf-8' # Default fallback
def _detect_content_language(self, text: str) -> str:
"""Detect the primary language of content"""
if not text:
return 'unknown'
# Take a sample of the text
sample = text[:5000]
# Count characters by script
korean_chars = sum(1 for char in sample if 0xAC00 <= ord(char) <= 0xD7AF)
japanese_kana = sum(1 for char in sample if (0x3040 <= ord(char) <= 0x309F) or (0x30A0 <= ord(char) <= 0x30FF))
chinese_chars = sum(1 for char in sample if 0x4E00 <= ord(char) <= 0x9FFF)
latin_chars = sum(1 for char in sample if 0x0041 <= ord(char) <= 0x007A)
# Determine primary language
if korean_chars > 50:
return 'korean'
elif japanese_kana > 20:
return 'japanese'
elif chinese_chars > 50 and japanese_kana < 10:
return 'chinese'
elif latin_chars > 100:
return 'english'
else:
return 'unknown'
def _configure_html2text(self):
    """Build ``self.h2t`` and apply this extractor's html2text settings.

    Two environment variables are read, each enabled only by the exact
    value ``'1'``: ``HTML2TEXT_ESCAPE_SNOB`` (sets ``escape_snob``) and
    ``ENHANCED_SINGLE_LINE_BREAK`` (sets ``single_line_break``).
    ``self.preserve_structure`` selects the table/emphasis/code options.
    """
    converter = html2text.HTML2Text()
    self.h2t = converter

    def env_flag(name):
        # An unset variable defaults to '0', i.e. disabled.
        return os.getenv(name, '0') == '1'

    fixed_options = {
        # Core settings for Unicode preservation
        'unicode_snob': True,
        'escape_snob': env_flag('HTML2TEXT_ESCAPE_SNOB'),
        'use_automatic_links': False,
        # Layout settings
        'body_width': 0,
        'single_line_break': env_flag('ENHANCED_SINGLE_LINE_BREAK'),
        # Content filtering: nothing is ignored
        'ignore_links': False,
        'ignore_images': False,
        'ignore_anchors': False,
        'skip_internal_links': False,
        'ignore_tables': False,
        # Image handling: keep images as HTML tags, with size attributes,
        # rather than Markdown image syntax or alt text only
        'images_as_html': True,
        'images_to_alt': False,
        'images_with_size': True,
        # Additional settings
        'wrap_links': False,
        'wrap_list_items': False,
        'protect_links': True,
    }

    # Structure preservation settings; the bullet mark is only overridden
    # when structure is preserved.
    if self.preserve_structure:
        structure_options = {
            'bypass_tables': False,
            'ignore_emphasis': False,
            'mark_code': True,
            'ul_item_mark': '•',
        }
    else:
        structure_options = {
            'bypass_tables': True,
            'ignore_emphasis': True,
            'mark_code': False,
        }

    for option_table in (fixed_options, structure_options):
        for option_name, option_value in option_table.items():
            setattr(converter, option_name, option_value)
def _decode_entities(self, text: str) -> str:
"""Decode HTML entities to Unicode characters with CJK support"""
if not text:
return text
# First pass: Apply known CJK-aware replacements
for entity, unicode_char in self.UNICODE_QUOTES.items():
text = text.replace(entity, unicode_char)
# Second pass: standard HTML unescape
text = html.unescape(text)
# Third pass: handle numeric entities
def decode_decimal(match):
try:
code = int(match.group(1))
if code < 0x110000:
return chr(code)
except Exception:
pass
return match.group(0)
def decode_hex(match):
try:
code = int(match.group(1), 16)
if code < 0x110000:
return chr(code)
except Exception:
pass
return match.group(0)
text = re.sub(r'(\d+);?', decode_decimal, text)
text = re.sub(r'([0-9a-fA-F]+);?', decode_hex, text)
# Fourth pass: handle special CJK entities
cjk_special_entities = {
'〈': '〈', '〉': '〉',
'⌈': '⌈', '⌉': '⌉',
'⌊': '⌊', '⌋': '⌋',
}
for entity, char in cjk_special_entities.items():
text = text.replace(entity, char)
return text
def _normalize_unicode(self, text: str) -> str:
"""Normalize Unicode with CJK awareness"""
if self.detected_language in ['korean', 'japanese', 'chinese']:
return text
else:
return unicodedata.normalize('NFC', text)
def _protect_quotes(self, text: str) -> str:
"""Protect quotes by replacing with special markers"""
for original, marker in self.QUOTE_MARKERS.items():
text = text.replace(original, marker)
return text
def _restore_quotes(self, text: str) -> str:
"""Restore quotes from special markers"""
for original, marker in self.QUOTE_MARKERS.items():
text = text.replace(marker, original)
return text
def _protect_cjk_angle_brackets(self, text: str) -> str:
"""Protect angle brackets containing CJK text from being treated as HTML tags."""
import re
# Pattern to match angle brackets containing CJK characters
cjk_pattern = r'[\u3000-\u303f\u3040-\u309f\u30a0-\u30ff\u4e00-\u9fff\uac00-\ud7af]'
bracket_pattern = rf'<([^<>]*{cjk_pattern}[^<>]*)>'
def replace_brackets(match):
content = match.group(1)
# Use special Unicode markers instead of HTML entities
return f"‹{content}›"
return re.sub(bracket_pattern, replace_brackets, text)
def _restore_cjk_angle_brackets(self, text: str) -> str:
"""Restore angle brackets from special markers."""
# Convert markers back to HTML entities for proper display
text = text.replace('‹', '<')
text = text.replace('›', '>')
return text
def _protect_empty_attr_tags(self, text: str) -> str:
"""Protect tags with empty attributes from being stripped by html2text"""
# Known HTML tags to preserve
known_tags = {
'html','head','body','title','meta','link','style','script','noscript',
'p','div','span','br','hr','img','a','h1','h2','h3','h4','h5','h6',
'ul','ol','li','dl','dt','dd',
'pre','code','em','strong','b','i','u','s','strike','del','ins','mark','small','sub','sup',
'table','thead','tbody','tr','td','th','caption','col','colgroup',
'blockquote','q','cite',
'section','article','header','footer','nav','main','aside','details','summary',
'figure','figcaption',
'form','input','button','select','option','textarea','label','fieldset','legend',
'iframe','canvas','svg','math',
'video','audio','source','track','embed','object','param',
'map','area',
'center', 'font', 'base'
}
# Transform:
and
"왜 이러는 겁니까? 우리가 무슨 잘못을 했다고!"
"......"
"한 번만 살려주시오! 가족을 지키려면 어쩔 수 없었소!"
"응애! 응애! 응애!"
"미안하구나. 모든 죄는 내가 짊어지고 사마."
''' # Japanese test with quotes '''「こんにちは!これは日本語のテストです。」
彼は言った。「これで全部ですか?」
「はい、そうです」と答えた。
''', # Chinese test with quotes '''"你好!这是中文测试。"
他说:"这就是全部吗?"
"是的,"她回答道。
''', ] extractor = EnhancedTextExtractor() print("=== CJK and Quote Preservation Test ===\n") for i, test_html in enumerate(test_cases, 1): print(f"--- Test Case {i} ---") try: content, _, title = extractor.extract_chapter_content(test_html) print(f"Title: {title}") print(f"Content:\n{content}\n") # Check for quotes preservation quote_checks = [ ('"', 'Western double quotes'), ('「', 'Japanese left bracket'), ('」', 'Japanese right bracket'), ('“', 'Chinese double quote'), ] print("Quote preservation check:") quote_found = False for quote_char, desc in quote_checks: if quote_char in content: print(f" ✓ Found {desc}: {quote_char}") quote_found = True if not quote_found: print(" ❌ No quotes found!") else: print(" ✅ Quotes preserved successfully!") # Check for image tag preservation (html2text now preserves them natively) img_count = content.count('