Text Generation
Transformers
English
custom
tokenizer
symbolic-ai
mathematics
llm
reasoning
ast
compiler
nlp
deep-learning
machine-learning
mathematical-reasoning
symbolic-reasoning
tokenization
parser
artificial-intelligence
Eval Results (legacy)
Instructions to use SurweeshSP/mathtok with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use SurweeshSP/mathtok with Transformers:
# Use a pipeline as a high-level helper from transformers import pipeline pipe = pipeline("text-generation", model="SurweeshSP/mathtok")# Load model directly from transformers import AutoModel model = AutoModel.from_pretrained("SurweeshSP/mathtok", dtype="auto") - Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use SurweeshSP/mathtok with vLLM:
Install from pip and serve model
# Install vLLM from pip: pip install vllm # Start the vLLM server: vllm serve "SurweeshSP/mathtok" # Call the server using curl (OpenAI-compatible API): curl -X POST "http://localhost:8000/v1/completions" \ -H "Content-Type: application/json" \ --data '{ "model": "SurweeshSP/mathtok", "prompt": "Once upon a time,", "max_tokens": 512, "temperature": 0.5 }'Use Docker
docker model run hf.co/SurweeshSP/mathtok
- SGLang
How to use SurweeshSP/mathtok with SGLang:
Install from pip and serve model
# Install SGLang from pip: pip install sglang # Start the SGLang server: python3 -m sglang.launch_server \ --model-path "SurweeshSP/mathtok" \ --host 0.0.0.0 \ --port 30000 # Call the server using curl (OpenAI-compatible API): curl -X POST "http://localhost:30000/v1/completions" \ -H "Content-Type: application/json" \ --data '{ "model": "SurweeshSP/mathtok", "prompt": "Once upon a time,", "max_tokens": 512, "temperature": 0.5 }'Use Docker images
docker run --gpus all \ --shm-size 32g \ -p 30000:30000 \ -v ~/.cache/huggingface:/root/.cache/huggingface \ --env "HF_TOKEN=<secret>" \ --ipc=host \ lmsysorg/sglang:latest \ python3 -m sglang.launch_server \ --model-path "SurweeshSP/mathtok" \ --host 0.0.0.0 \ --port 30000 # Call the server using curl (OpenAI-compatible API): curl -X POST "http://localhost:30000/v1/completions" \ -H "Content-Type: application/json" \ --data '{ "model": "SurweeshSP/mathtok", "prompt": "Once upon a time,", "max_tokens": 512, "temperature": 0.5 }' - Docker Model Runner
How to use SurweeshSP/mathtok with Docker Model Runner:
docker model run hf.co/SurweeshSP/mathtok
Commit ·
edede4c
0
Parent(s):
Initial clean MathTok release
Browse files- .gitattributes +35 -0
- .gitignore +16 -0
- README.md +178 -0
- assets/mathtok_architecture_improvements.svg +124 -0
- evaluation/__init__.py +1 -0
- evaluation/benchmark.py +201 -0
- evaluation/comparison.py +920 -0
- evaluation/datasets/sample_problems.json +115 -0
- evaluation/metrics.py +367 -0
- evaluation/results/comparison_results.jsonl +70 -0
- evaluation/visualize.py +371 -0
- mathtok/__init__.py +42 -0
- mathtok/ast_generator.py +334 -0
- mathtok/canonicalizer.py +320 -0
- mathtok/lexer.py +315 -0
- mathtok/metadata.py +307 -0
- mathtok/operator_registry.py +429 -0
- mathtok/pipeline.py +301 -0
- mathtok/serializer.py +239 -0
- mathtok/streaming.py +73 -0
- mathtok/validator.py +137 -0
- mathtok/vocabulary.py +408 -0
- model.md +168 -0
- pyproject.toml +14 -0
- requirements.txt +31 -0
- review.md +243 -0
- setup.py +46 -0
- tests/__init__.py +1 -0
- tests/test_ast_generator.py +166 -0
- tests/test_canonicalizer.py +125 -0
- tests/test_comparison.py +180 -0
- tests/test_lexer.py +81 -0
- tests/test_pipeline.py +124 -0
- tests/test_serializer.py +125 -0
.gitattributes
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__pycache__/
|
| 2 |
+
*.pyc
|
| 3 |
+
*.pyo
|
| 4 |
+
*.pyd
|
| 5 |
+
|
| 6 |
+
build/
|
| 7 |
+
dist/
|
| 8 |
+
*.egg-info/
|
| 9 |
+
|
| 10 |
+
.env
|
| 11 |
+
venv/
|
| 12 |
+
|
| 13 |
+
.ipynb_checkpoints/
|
| 14 |
+
|
| 15 |
+
evaluation/results/*.json
|
| 16 |
+
evaluation/results/*.png
|
README.md
ADDED
|
@@ -0,0 +1,178 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# MathTok
|
| 2 |
+
|
| 3 |
+
**A Hybrid Canonicalized AST-Based Tokenization Framework for Mathematical Language Modeling**
|
| 4 |
+
|
| 5 |
+
---
|
| 6 |
+
|
| 7 |
+
## Overview
|
| 8 |
+
|
| 9 |
+
MathTok is a research-grade tokenizer pipeline that converts raw mathematical expressions (LaTeX or ASCII) into a structured, semantically-rich token stream. Unlike standard BPE or SentencePiece tokenizers, MathTok is *structure-aware*: it builds an Abstract Syntax Tree (AST) from each expression and serializes it via DFS preorder traversal, preserving full mathematical structure.
|
| 10 |
+
|
| 11 |
+
```
|
| 12 |
+
Raw Mathematical Expression
|
| 13 |
+
↓
|
| 14 |
+
Canonicalization Layer (sympy: simplify, expand, normalize)
|
| 15 |
+
↓
|
| 16 |
+
Hybrid Mathematical Lexer (split TEXT / MATH spans)
|
| 17 |
+
↓
|
| 18 |
+
AST Generator (SymPy tree → typed ASTNode tree)
|
| 19 |
+
↓
|
| 20 |
+
Operator-Aware Semantic Encoder (rich metadata per operator)
|
| 21 |
+
↓
|
| 22 |
+
Structural Serialization (DFS preorder → flat token stream)
|
| 23 |
+
↓
|
| 24 |
+
Structural Attention Metadata (per-token tree context)
|
| 25 |
+
↓
|
| 26 |
+
Vocabulary Mapping + BPE (fixed math vocab + HF BPE for text)
|
| 27 |
+
↓
|
| 28 |
+
Compressed Token Stream
|
| 29 |
+
```
|
| 30 |
+
|
| 31 |
+
---
|
| 32 |
+
|
| 33 |
+
## Quick Start
|
| 34 |
+
|
| 35 |
+
```bash
|
| 36 |
+
# Install dependencies and package in editable mode
|
| 37 |
+
pip install -e ".[eval,dev]"
|
| 38 |
+
|
| 39 |
+
# Tokenize an expression using the CLI pipeline
|
| 40 |
+
python -m mathtok.pipeline "The derivative of sin(x^2) + 3x"
|
| 41 |
+
|
| 42 |
+
# Run the comprehensive 110+ test suite
|
| 43 |
+
pytest tests/ -v
|
| 44 |
+
|
| 45 |
+
# Run the 4-way comparative tokenizer evaluation benchmark
|
| 46 |
+
# (MathTok vs GPT-2 BPE vs SentencePiece Unigram vs Char-level)
|
| 47 |
+
python -m evaluation.comparison
|
| 48 |
+
|
| 49 |
+
# Generate visual plots and the unified metrics dashboard
|
| 50 |
+
python -m evaluation.visualize
|
| 51 |
+
```
|
| 52 |
+
|
| 53 |
+
---
|
| 54 |
+
|
| 55 |
+
## Python API
|
| 56 |
+
|
| 57 |
+
```python
|
| 58 |
+
from mathtok import MathTokPipeline
|
| 59 |
+
|
| 60 |
+
pipeline = MathTokPipeline()
|
| 61 |
+
|
| 62 |
+
# Encode mixed text + math (supporting LaTeX or ASCII syntax)
|
| 63 |
+
out = pipeline.encode("The derivative of $\\sin(x^2)$ is $2x\\cos(x^2)$.")
|
| 64 |
+
print(out.tokens) # ['[MATH_START]', 'FUNC_SIN', 'OP_POW', 'VAR_X', 'CONST_2', '[MATH_END]', ...]
|
| 65 |
+
print(out.sexp) # (FUNC_SIN (OP_POW VAR_X CONST_2))
|
| 66 |
+
print(out.input_ids) # [4, 27, 10, 45, 12, 5, ...]
|
| 67 |
+
|
| 68 |
+
# Access structural metadata (for tree-aware attention masking)
|
| 69 |
+
for meta in out.metadata:
|
| 70 |
+
print(meta.token, meta.depth, meta.tree_position_key)
|
| 71 |
+
|
| 72 |
+
# Pure math expression serialization
|
| 73 |
+
out = pipeline.encode_math_only("(x+1)^2")
|
| 74 |
+
print(out.sexp) # (OP_POW (OP_ADD VAR_X CONST_1) CONST_2)
|
| 75 |
+
|
| 76 |
+
# HuggingFace-compatible tokenizer export
|
| 77 |
+
hf_tok = pipeline.get_hf_tokenizer()
|
| 78 |
+
hf_tok.save_pretrained("./mathtok-tokenizer")
|
| 79 |
+
result = hf_tok("x^2 + 2*x + 1", return_tensors="pt")
|
| 80 |
+
```
|
| 81 |
+
|
| 82 |
+
---
|
| 83 |
+
|
| 84 |
+
## Research Contributions
|
| 85 |
+
|
| 86 |
+
### 1. Hybrid Lexer
|
| 87 |
+
Separates natural language from mathematical content using LaTeX delimiter detection (`$...$`, `\(...\)`, `\[...\]`) and ASCII math heuristics.
|
| 88 |
+
|
| 89 |
+
### 2. Canonicalization Engine
|
| 90 |
+
Normalizes mathematically equivalent expressions via SymPy's `simplify()`, `expand()`, and internal representation (subtraction → addition + negation, division → multiplication + reciprocal).
|
| 91 |
+
|
| 92 |
+
### 3. AST-Based Structural Serialization
|
| 93 |
+
Maps SymPy's expression tree to a typed token vocabulary with semantic metadata per operator. Serializes via DFS preorder traversal.
|
| 94 |
+
|
| 95 |
+
### 4. Operator Semantic Registry
|
| 96 |
+
Every operator and function carries an explicit metadata record: `arity`, `precedence`, `associativity`, `semantic_role`. This is the primary novelty over standard tokenization.
|
| 97 |
+
|
| 98 |
+
### 5. Structural Attention Metadata
|
| 99 |
+
Per-token records encoding `depth`, `parent_id`, `children_ids`, `tree_position_key`, and `sibling_count` — enabling future structure-aware attention.
|
| 100 |
+
|
| 101 |
+
### 6. Two-Tier Vocabulary
|
| 102 |
+
- **Fixed math vocabulary**: deterministic IDs for all operators, functions, variables, constants.
|
| 103 |
+
- **BPE text vocabulary**: HuggingFace `tokenizers` BPE for natural language spans.
|
| 104 |
+
|
| 105 |
+
---
|
| 106 |
+
|
| 107 |
+
## Evaluation Metrics & Benchmarks
|
| 108 |
+
|
| 109 |
+
### Core Metrics
|
| 110 |
+
|
| 111 |
+
| Metric | Symbol | Meaning |
|
| 112 |
+
|--------|--------|---------|
|
| 113 |
+
| **Semantic Compression Ratio** | SCR | `structural_score / token_count` (Higher is better — measures parsed semantic content density) |
|
| 114 |
+
| **Semantic Density** | SD | `math_tokens / total_tokens` (Ratio of high-value math tokens, measures information density) |
|
| 115 |
+
| **Structural Efficiency** | SE | `parent_child_relations / token_count` (Ratio of hierarchy relationships encoded per token) |
|
| 116 |
+
| **Token Stability** | TS | `1 - CoV(token count across rewritings)` (Fidelity and stability across representations) |
|
| 117 |
+
|
| 118 |
+
### Empirical Benchmarks (4-Way Comparison)
|
| 119 |
+
|
| 120 |
+
Below are the empirical averages computed over our comprehensive suite of 70 mathematical test expressions:
|
| 121 |
+
|
| 122 |
+
| Tokenizer | Mean SCR (↑ Better) | Semantic Density (↑ Better) | Structural Efficiency (↑ Better) |
|
| 123 |
+
|:---|:---:|:---:|:---:|
|
| 124 |
+
| **MathTok (Ours)** | **0.8501** | **0.5285** | **0.2339** |
|
| 125 |
+
| **GPT-2 BPE** | 0.4251 | 0.1838 | 0.1491 |
|
| 126 |
+
| **SentencePiece Unigram** | 0.3696 | 0.1499 | 0.1403 |
|
| 127 |
+
| **Character-Level** | 0.3708 | 0.1518 | 0.1518 |
|
| 128 |
+
|
| 129 |
+
> [!NOTE]
|
| 130 |
+
> * MathTok achieves a **2.30x higher mean Semantic Compression Ratio (SCR)** than SentencePiece Unigram (**0.8501** vs **0.3696**).
|
| 131 |
+
> * MathTok packs **3.52x more math-centric information** per token stream compared to SentencePiece unigrams (**0.5285** vs **0.1499**), showing immense semantic density.
|
| 132 |
+
> * MathTok is **1.67x more efficient** than SentencePiece Unigram at encoding hierarchical AST relationships directly into token structures (**0.2339** vs **0.1403**).
|
| 133 |
+
|
| 134 |
+
### High-Impact Visualizations
|
| 135 |
+
|
| 136 |
+
The visualization system runs via `python -m evaluation.visualize` and exports professional visual assets under [`evaluation/results/`](evaluation/results/):
|
| 137 |
+
- **Unified Evaluation Dashboard** (`metrics_dashboard.png`): 3-panel side-by-side display of SCR, Semantic Density, and Structural Efficiency.
|
| 138 |
+
- **Overall SCR Comparison** (`scr_comparison.png`): Comparative summary bar chart.
|
| 139 |
+
- **Category-Level Breakdowns** (`scr_by_category.png`): SCR analyzed by nested/standard categories.
|
| 140 |
+
- **Semantic Density Summary** (`semantic_density_comparison.png`): Ratio of math structure to total tokens.
|
| 141 |
+
|
| 142 |
+
---
|
| 143 |
+
|
| 144 |
+
## Project Structure
|
| 145 |
+
|
| 146 |
+
```
|
| 147 |
+
math_token/
|
| 148 |
+
├── mathtok/
|
| 149 |
+
│ ├── canonicalizer.py # Layer 1: Canonicalization Engine
|
| 150 |
+
│ ├── lexer.py # Layer 2: Hybrid Mathematical Lexer
|
| 151 |
+
│ ├── ast_generator.py # Layer 3: AST Generator
|
| 152 |
+
│ ├── operator_registry.py # Layer 4: Operator Semantic Registry
|
| 153 |
+
│ ├── serializer.py # Layer 5: Structural Traversal & Serialization
|
| 154 |
+
│ ├── metadata.py # Layer 6: Structural Attention Metadata
|
| 155 |
+
│ ├── vocabulary.py # Layer 7: Two-Tier Vocabulary
|
| 156 |
+
│ └── pipeline.py # Orchestrator Pipeline
|
| 157 |
+
├── evaluation/
|
| 158 |
+
│ ├── metrics.py # Definition of core evaluation metrics
|
| 159 |
+
│ ├── benchmark.py # Quick benchmarking scripts
|
| 160 |
+
│ ├── comparison.py # Full 4-way comparative framework (SentencePiece integrated)
|
| 161 |
+
│ ├── visualize.py # Custom dashboard visualization engine
|
| 162 |
+
│ └── results/ # JSON/JSONL reports & visual plots
|
| 163 |
+
└── tests/ # 110+ passing unit tests
|
| 164 |
+
```
|
| 165 |
+
|
| 166 |
+
---
|
| 167 |
+
|
| 168 |
+
## Citation
|
| 169 |
+
|
| 170 |
+
```bibtex
|
| 171 |
+
@article{mathtok2024,
|
| 172 |
+
title = {MathTok: A Hybrid Canonicalized AST-Based Tokenization Framework
|
| 173 |
+
for Mathematical Language Modeling},
|
| 174 |
+
author = {Anonymous},
|
| 175 |
+
year = {2024},
|
| 176 |
+
note = {Under review}
|
| 177 |
+
}
|
| 178 |
+
```
|
assets/mathtok_architecture_improvements.svg
ADDED
|
|
evaluation/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# evaluation package
|
evaluation/benchmark.py
ADDED
|
@@ -0,0 +1,201 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
MathTok Benchmark Runner
|
| 3 |
+
|
| 4 |
+
Evaluates the MathTok pipeline against baseline tokenizers on a curated
|
| 5 |
+
dataset of mathematical expressions and mixed text+math problems.
|
| 6 |
+
|
| 7 |
+
Usage
|
| 8 |
+
─────
|
| 9 |
+
python -m evaluation.benchmark # run full benchmark
|
| 10 |
+
python -m evaluation.benchmark --quick # 20 examples only
|
| 11 |
+
python -m evaluation.benchmark --json # JSON output
|
| 12 |
+
python -m evaluation.benchmark --baselines # include character-level baseline comparison
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
from __future__ import annotations
|
| 16 |
+
|
| 17 |
+
import argparse
|
| 18 |
+
import json
|
| 19 |
+
import logging
|
| 20 |
+
import time
|
| 21 |
+
from pathlib import Path
|
| 22 |
+
from typing import Callable
|
| 23 |
+
|
| 24 |
+
from mathtok.pipeline import MathTokPipeline
|
| 25 |
+
from .metrics import (
|
| 26 |
+
EvaluationReport, MetricResult,
|
| 27 |
+
structural_compression_ratio,
|
| 28 |
+
canonical_consistency_score,
|
| 29 |
+
operator_preservation_score,
|
| 30 |
+
token_stability,
|
| 31 |
+
tree_depth_fidelity,
|
| 32 |
+
make_gpt2_tokenizer,
|
| 33 |
+
tokenize_character_level,
|
| 34 |
+
)
|
| 35 |
+
|
| 36 |
+
logger = logging.getLogger(__name__)
|
| 37 |
+
|
| 38 |
+
_DATASET_PATH = Path(__file__).parent / "datasets" / "sample_problems.json"
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
# ── Dataset loading ───────────────────────────────────────────────────────
|
| 42 |
+
|
| 43 |
+
def load_dataset(path: Path = _DATASET_PATH) -> dict:
    """Read the benchmark dataset file and return its parsed JSON content.

    Parameters
    ----------
    path : location of the dataset JSON file; defaults to the bundled
           ``datasets/sample_problems.json`` next to this module.

    Returns
    -------
    The object produced by ``json.load`` for the file, decoded as UTF-8.
    """
    with open(path, mode="r", encoding="utf-8") as handle:
        parsed = json.load(handle)
    return parsed
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
# ── Benchmark runner ──────────────────────────────────────────────────────
|
| 50 |
+
|
| 51 |
+
class MathTokBenchmark:
    """
    Run all five evaluation metrics on the benchmark dataset.

    The metrics are computed by the helpers imported from ``.metrics``:
    structural compression ratio (SCR), canonical consistency score (CCS),
    operator preservation score (OPS), token stability (TS) and tree depth
    fidelity (TDF).

    Parameters
    ----------
    pipeline : MathTokPipeline to evaluate
    dataset  : loaded benchmark dict (from load_dataset())
    max_n    : maximum number of examples to evaluate (None = all)
    """

    def __init__(
        self,
        pipeline: MathTokPipeline,
        dataset: dict,
        max_n: int | None = None,
    ) -> None:
        self.pipeline = pipeline
        self.dataset = dataset
        self.max_n = max_n

    @staticmethod
    def _token_lengths(
        exprs: list[str],
        tokenize_fn: Callable[[str], list[str]],
    ) -> list[int]:
        """Return the token count of every expression, one entry per input.

        An expression whose tokenization raises is recorded as ``0`` tokens
        (so the result stays aligned with ``exprs``) and the failure is
        logged as a warning instead of being silently discarded.
        """
        lengths: list[int] = []
        for expr in exprs:
            try:
                lengths.append(len(tokenize_fn(expr)))
            except Exception as exc:  # best-effort: one bad input must not abort the run
                # Same logger name as the module-level ``logger``.
                logging.getLogger(__name__).warning(
                    "Tokenization failed for %r (%s); counting 0 tokens", expr, exc
                )
                lengths.append(0)
        return lengths

    def run(self) -> EvaluationReport:
        """Run all five metrics and return an EvaluationReport.

        Reads the ``expressions``, ``equivalent_pairs`` and
        ``rewriting_groups`` entries of the dataset (each truncated to
        ``max_n`` items; a missing entry is treated as an empty list) and
        prints progress messages to stdout.
        """
        ds = self.dataset

        # Slicing with max_n=None keeps the whole list.
        exprs = ds.get("expressions", [])[:self.max_n]
        eq_pairs = ds.get("equivalent_pairs", [])[:self.max_n]
        expr_groups = ds.get("rewriting_groups", [])[:self.max_n]

        # Tokenizer function handed to the metric helpers.
        def tokenize_math(expr: str) -> list[str]:
            return self.pipeline.encode_math_only(expr).tokens

        print(f"Running MathTok benchmark on {len(exprs)} expressions...")
        t0 = time.time()

        # ── SCR ──────────────────────────────────────────────────────────
        print(" Computing SCR...")
        tok_lengths = self._token_lengths(exprs, tokenize_math)
        scr = structural_compression_ratio(exprs, tok_lengths)

        # ── CCS ──────────────────────────────────────────────────────────
        print(" Computing CCS...")
        ccs = canonical_consistency_score(eq_pairs, tokenize_math)

        # ── OPS ──────────────────────────────────────────────────────────
        print(" Computing OPS...")
        ops = operator_preservation_score(exprs, tokenize_math)

        # ── TS ───────────────────────────────────────────────────────────
        print(" Computing TS...")
        ts = token_stability(expr_groups, tokenize_math)

        # ── TDF ──────────────────────────────────────────────────────────
        # TDF receives the full encode function rather than the token list.
        print(" Computing TDF...")
        tdf = tree_depth_fidelity(exprs, self.pipeline.encode_math_only)

        elapsed = time.time() - t0
        print(f" Done in {elapsed:.1f}s")

        return EvaluationReport(
            scr=scr, ccs=ccs, ops=ops, ts=ts, tdf=tdf,
            num_examples=len(exprs),
        )

    def run_baseline_comparison(self, baseline_name: str = "gpt2") -> dict:
        """
        Compare MathTok against a baseline tokenizer on SCR and CCS.

        Parameters
        ----------
        baseline_name : ``"gpt2"`` or ``"char"``

        Returns a dict with 'mathtok' and 'baseline' results; each holds the
        ``SCR`` and ``CCS`` metric values, and 'baseline' also carries the
        baseline ``name``.

        Raises
        ------
        ValueError : if ``baseline_name`` is not a known baseline
        """
        ds = self.dataset
        exprs = ds.get("expressions", [])[:self.max_n]
        eq_pairs = ds.get("equivalent_pairs", [])[:self.max_n]

        if baseline_name == "gpt2":
            baseline_fn = make_gpt2_tokenizer()
        elif baseline_name == "char":
            baseline_fn = tokenize_character_level
        else:
            raise ValueError(f"Unknown baseline: {baseline_name}")

        def mathtok_fn(expr: str) -> list[str]:
            return self.pipeline.encode_math_only(expr).tokens

        # MathTok metrics — guarded exactly like the baseline below, so both
        # tokenizers are scored on the same footing when an input fails.
        mt_tok_lengths = self._token_lengths(exprs, mathtok_fn)
        mt_scr = structural_compression_ratio(exprs, mt_tok_lengths)
        mt_ccs = canonical_consistency_score(eq_pairs, mathtok_fn)

        # Baseline metrics
        bl_tok_lengths = self._token_lengths(exprs, baseline_fn)
        bl_scr = structural_compression_ratio(exprs, bl_tok_lengths)
        bl_ccs = canonical_consistency_score(eq_pairs, baseline_fn)

        return {
            "mathtok": {"SCR": mt_scr.value, "CCS": mt_ccs.value},
            "baseline": {"name": baseline_name, "SCR": bl_scr.value, "CCS": bl_ccs.value},
        }
|
| 166 |
+
|
| 167 |
+
|
| 168 |
+
# ── CLI ───────────────────────────────────────────────────────────────────
|
| 169 |
+
|
| 170 |
+
def main() -> None:
    """Command-line entry point for the benchmark runner.

    Parses the CLI flags, loads the dataset, runs the MathTok benchmark and
    prints either a JSON document (``--json``) or the report summary. With
    ``--baselines`` the character-level baseline comparison is added to the
    output.
    """
    logging.basicConfig(level=logging.WARNING)
    parser = argparse.ArgumentParser(description="MathTok Benchmark Runner")
    parser.add_argument("--quick", action="store_true", help="Run on first 20 examples only")
    parser.add_argument("--json", action="store_true", help="Output JSON")
    # The comparison below always uses the "char" baseline, so the help text
    # names that baseline rather than GPT-2.
    parser.add_argument(
        "--baselines",
        action="store_true",
        help="Include character-level baseline comparison",
    )
    parser.add_argument("--dataset", default=str(_DATASET_PATH), help="Dataset JSON path")
    args = parser.parse_args()

    dataset = load_dataset(Path(args.dataset))
    pipeline = MathTokPipeline()
    max_n = 20 if args.quick else None  # --quick caps every dataset section at 20 items

    bench = MathTokBenchmark(pipeline, dataset, max_n=max_n)
    report = bench.run()

    if args.json:
        result = report.to_dict()
        if args.baselines:
            result["baseline_comparison"] = bench.run_baseline_comparison("char")
        print(json.dumps(result, indent=2))
    else:
        print(report.summary())
        if args.baselines:
            comp = bench.run_baseline_comparison("char")
            print("\nBaseline comparison (char-level):")
            print(f" MathTok SCR={comp['mathtok']['SCR']:.4f} CCS={comp['mathtok']['CCS']:.4f}")
            print(f" CharLvl SCR={comp['baseline']['SCR']:.4f} CCS={comp['baseline']['CCS']:.4f}")


if __name__ == "__main__":
    main()
|
evaluation/comparison.py
ADDED
|
@@ -0,0 +1,920 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Semantic Tokenizer Comparison Framework
|
| 3 |
+
========================================
|
| 4 |
+
|
| 5 |
+
Compares MathTok against GPT-2 BPE, SentencePiece Unigram, and character-level baselines across
|
| 6 |
+
several evaluation categories, computing the Semantic Compression Ratio (SCR)
|
| 7 |
+
at three levels:
|
| 8 |
+
|
| 9 |
+
Level 1 — Raw Token Count
|
| 10 |
+
raw_scr = structural_score / token_count
|
| 11 |
+
|
| 12 |
+
Level 2 — Semantic Density
|
| 13 |
+
semantic_density = math_tokens / total_tokens
|
| 14 |
+
(how "information-dense" the token stream is)
|
| 15 |
+
|
| 16 |
+
Level 3 — Structural Efficiency
|
| 17 |
+
structural_efficiency = parent_child_relations / token_count
|
| 18 |
+
(how efficiently hierarchy is encoded)
|
| 19 |
+
|
| 20 |
+
Structural Score Formula
|
| 21 |
+
─────────────────────────
|
| 22 |
+
score = operator_nodes (+1 per OP_/FUNC_ token)
|
| 23 |
+
+ tree_depth (+max depth in metadata)
|
| 24 |
+
+ parent_child_relations (+1 per non-leaf node)
|
| 25 |
+
+ function_scope (+1 per FUNC_ token)
|
| 26 |
+
+ canonical_bonus (+2 if expression parsed ok)
|
| 27 |
+
|
| 28 |
+
GPT-2 structural score is estimated heuristically from the token stream.
|
| 29 |
+
|
| 30 |
+
Test Categories
|
| 31 |
+
───────────────
|
| 32 |
+
1. Standard expressions — basic algebra, calculus
|
| 33 |
+
2. Deep nesting — sin(cos((x+1)^2 + y^3))
|
| 34 |
+
3. Canonical equivalence — x+2 vs 2+x (should converge)
|
| 35 |
+
4. Mixed text+math — "The derivative of sin(x^2)"
|
| 36 |
+
5. LaTeX vs ASCII — \\sin(x^2) vs sin(x^2)
|
| 37 |
+
|
| 38 |
+
Output
|
| 39 |
+
──────
|
| 40 |
+
JSONL file: evaluation/results/comparison_results.jsonl
|
| 41 |
+
Summary: evaluation/results/comparison_summary.json
|
| 42 |
+
|
| 43 |
+
Usage
|
| 44 |
+
─────
|
| 45 |
+
python -m evaluation.comparison
|
| 46 |
+
python -m evaluation.comparison --no-gpt2 # skip GPT-2 download
|
| 47 |
+
python -m evaluation.comparison --save # save JSONL
|
| 48 |
+
python -m evaluation.comparison --category deep # run one category
|
| 49 |
+
"""
|
| 50 |
+
|
| 51 |
+
from __future__ import annotations
|
| 52 |
+
|
| 53 |
+
import argparse
|
| 54 |
+
import json
|
| 55 |
+
import logging
|
| 56 |
+
import os
|
| 57 |
+
import time
|
| 58 |
+
from dataclasses import dataclass, asdict, field
|
| 59 |
+
from pathlib import Path
|
| 60 |
+
from typing import Callable, Optional
|
| 61 |
+
|
| 62 |
+
# Module-level logger; handler and level configuration is left to the caller.
logger = logging.getLogger(__name__)

# ── Output directory ───────────────────────────────────────────────────────
# Result files are written to a "results" folder located next to this module.
_RESULTS_DIR = Path(__file__).parent / "results"
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
# ── Test suites ───────────────────────────────────────────────────────────
|
| 69 |
+
|
| 70 |
+
# Everyday algebra and calculus expressions.
STANDARD_EXPRESSIONS = [
    "(x+1)^2",
    "sin(x^2) + 3*x",
    "x^2 + 2*x + 1",
    "exp(-x^2/2)",
    "1/(1 + exp(-x))",
    "log(x*y)",
    "sqrt(a^2 + b^2)",
    "n*(n+1)/2",
    "factorial(n)",
    "diff(sin(x), x)",
    "integrate(x^2, x)",
    "limit(sin(x)/x, x, 0)",
    "a^2 - b^2",
    "(-b + sqrt(b^2 - 4*a*c)) / (2*a)",
    "sum(k^2, k, 1, n)",
]

# Expressions with several levels of nested calls / parentheses.
DEEP_NESTING_EXPRESSIONS = [
    "sin(cos(x^2 + 1))",
    "sin(cos((x+1)^2 + y^3))",
    "exp(log(sin(x^2 + cos(y))))",
    "sqrt(1 + sqrt(1 + sqrt(x)))",
    "log(1 + log(1 + x))",
    "((x+1)^2 + (y-1)^2)^3",
    "((a + b)*(a - b)) / ((a + b)^2)",
]

# Ordinary / partial differential equation terms.
ODE_PDE_EXPRESSIONS = [
    "Derivative(f(x), x, 2) + 2*Derivative(f(x), x) + f(x)",
    "Derivative(u(x, t), t) - alpha * Derivative(u(x, t), x, 2)",
]

# Matrix / linear-algebra style expressions.
MATRIX_LINEAR_ALGEBRA = [
    "A*x + b",
    "det(A - lambda*I)",
]

# Probability expressions.
PROBABILITY_EXPRESSIONS = [
    "P(A|B) * P(B) / P(A)",
    "exp(-x^2 / 2) / sqrt(2*pi)",
]

# Set-theory expressions.
SET_THEORY = [
    "Union(A, B)",
    "Intersection(A, B)",
]

# Pairs of mathematically equivalent expressions written differently.
CANONICAL_PAIRS = [
    ("x + 2", "2 + x"),
    ("a*b + a*c", "a*(b+c)"),
    ("(x+1)^2", "x^2 + 2*x + 1"),
    ("x^2 - y^2", "(x+y)*(x-y)"),
    ("sin(x)^2 + cos(x)^2", "1"),
    ("2*x + 2*y", "2*(x+y)"),
    ("x*y + x*z", "x*(y+z)"),
    ("a^2 + 2*a*b + b^2", "(a+b)^2"),
]

# Natural-language sentences with embedded ASCII or $...$ LaTeX math.
MIXED_TEXT_MATH = [
    "The derivative of sin(x^2) with respect to x.",
    "Solve for x when x^2 + 2*x + 1 = 0.",
    "The quadratic formula gives $x = \\frac{-b \\pm \\sqrt{b^2 - 4ac}}{2a}$.",
    "For $n \\geq 1$, the sum $\\sum_{k=1}^{n} k = \\frac{n(n+1)}{2}$.",
    "Integrate $\\int_0^1 x^2 dx$ to get $\\frac{1}{3}$.",
    "If $a > 0$ and $b > 0$ then $\\log(a) + \\log(b) = \\log(ab)$.",
    "The area of a circle of radius r is pi*r^2.",
    "Euler's identity: $e^{i\\pi} + 1 = 0$.",
]

# (ASCII form, LaTeX form) of the same expression.
LATEX_ASCII_PAIRS = [
    ("sin(x^2)", "\\sin(x^2)"),
    ("sqrt(x^2 + 1)", "\\sqrt{x^2 + 1}"),
    ("log(x)", "\\ln(x)"),
    ("exp(x)", "e^x"),
    ("x/y", "\\frac{x}{y}"),
    ("int(x^2, x)", "\\int x^2 dx"),
    ("diff(sin(x), x)", "\\frac{d}{dx}\\sin(x)"),
    ("factorial(n)", "n!"),
]
|
| 150 |
+
|
| 151 |
+
|
| 152 |
+
# ── Result dataclasses ────────────────────────────────────────────────────
|
| 153 |
+
|
| 154 |
+
@dataclass
class TokenizerStats:
    """Stats for one tokenizer on one expression.

    The five structural-score components are supplied by the caller.
    :meth:`compute_scr` derives ``structural_score``, ``raw_scr`` and
    ``structural_efficiency`` from them; ``semantic_density`` is not derived
    here and has to be passed in.
    """

    name: str
    tokens: list[str]
    token_count: int

    # Structural score components
    operator_nodes: int = 0
    tree_depth: int = 0
    parent_child_relations: int = 0
    function_scope: int = 0
    canonical_bonus: int = 0

    # Derived scores
    structural_score: float = 0.0
    raw_scr: float = 0.0                # structural_score / token_count
    semantic_density: float = 0.0       # math tokens / total tokens
    structural_efficiency: float = 0.0  # parent_child_relations / token_count

    def compute_scr(self) -> None:
        """Fill in the derived scores from the structural components.

        ``structural_score`` is the plain sum of the five components. The two
        per-token ratios fall back to 0.0 when ``token_count`` is not positive
        so that an empty tokenization never divides by zero.
        """
        self.structural_score = (
            self.operator_nodes
            + self.tree_depth
            + self.parent_child_relations
            + self.function_scope
            + self.canonical_bonus
        )
        if self.token_count > 0:
            self.raw_scr = self.structural_score / self.token_count
            self.structural_efficiency = (
                self.parent_child_relations / self.token_count
            )
        else:
            self.raw_scr = 0.0
            self.structural_efficiency = 0.0

    def to_dict(self) -> dict:
        """Return all fields as a dict, omitting the ``tokens`` list."""
        d = asdict(self)
        d.pop("tokens")  # too verbose for JSONL
        return d
|
| 195 |
+
|
| 196 |
+
|
| 197 |
+
@dataclass
class ComparisonRecord:
    """Full comparison record for one expression.

    Holds the per-tokenizer stats for a single input. ``gpt2`` and
    ``sentencepiece`` are ``None`` when that baseline was not run.
    """

    expression: str
    category: str
    mathtok: TokenizerStats
    char_level: TokenizerStats
    gpt2: Optional[TokenizerStats] = None
    sentencepiece: Optional[TokenizerStats] = None
    sexp: str = ""  # MathTok S-expression
    notes: list[str] = field(default_factory=list)

    def _scr_ratio(self, baseline: Optional[TokenizerStats]) -> Optional[float]:
        """Return MathTok raw SCR divided by *baseline* raw SCR.

        Returns ``None`` when the baseline is missing or its raw SCR is zero,
        since the ratio is undefined in both cases.
        """
        if baseline is None or baseline.raw_scr == 0:
            return None
        return self.mathtok.raw_scr / baseline.raw_scr

    @property
    def scr_improvement_vs_gpt2(self) -> Optional[float]:
        """MathTok/GPT-2 raw-SCR ratio, or ``None`` if unavailable."""
        return self._scr_ratio(self.gpt2)

    @property
    def scr_improvement_vs_sp(self) -> Optional[float]:
        """MathTok/SentencePiece raw-SCR ratio, or ``None`` if unavailable."""
        return self._scr_ratio(self.sentencepiece)

    @property
    def scr_improvement_vs_char(self) -> float:
        """MathTok/char-level raw-SCR ratio; 0.0 when the baseline SCR is zero."""
        ratio = self._scr_ratio(self.char_level)
        return 0.0 if ratio is None else ratio

    def to_dict(self) -> dict:
        """Return a JSON-serialisable dict of the record and its ratios."""
        gpt2 = self.gpt2
        sp = self.sentencepiece
        return {
            "expression": self.expression,
            "category": self.category,
            "sexp": self.sexp,
            "mathtok": self.mathtok.to_dict(),
            "gpt2": gpt2.to_dict() if gpt2 is not None else None,
            "sentencepiece": sp.to_dict() if sp is not None else None,
            "char_level": self.char_level.to_dict(),
            "scr_improvement_vs_gpt2": self.scr_improvement_vs_gpt2,
            "scr_improvement_vs_sp": self.scr_improvement_vs_sp,
            "scr_improvement_vs_char": self.scr_improvement_vs_char,
            "notes": self.notes,
        }

    def print_row(self) -> None:
        """Print one table row: token count and raw SCR for each tokenizer."""
        gpt2 = self.gpt2
        sp = self.sentencepiece

        # Missing baselines are rendered as "N/A" in both of their cells.
        gpt_count = gpt2.token_count if gpt2 is not None else "N/A"
        gpt_scr = f"{gpt2.raw_scr:.2f}" if gpt2 is not None else "N/A"
        sp_count = sp.token_count if sp is not None else "N/A"
        sp_scr = f"{sp.raw_scr:.2f}" if sp is not None else "N/A"

        if self.char_level.raw_scr > 0:
            impr = f"{self.scr_improvement_vs_char:.2f}x"
        else:
            impr = "N/A"

        # Truncate long expressions so the columns stay aligned.
        expr_short = self.expression[:30].ljust(31)
        print(
            f" {expr_short}"
            f" | MT:{self.mathtok.token_count:3d} (SCR {self.mathtok.raw_scr:.2f})"
            f" | GP:{str(gpt_count):3s} (SCR {gpt_scr})"
            f" | SP:{str(sp_count):3s} (SCR {sp_scr})"
            f" | CH:{self.char_level.token_count:3d} (SCR {self.char_level.raw_scr:.2f})"
            f" | Impr: {impr}"
        )
|
| 258 |
+
|
| 259 |
+
|
| 260 |
+
# ── Structural score helpers ──────────────────────────────────────────────
|
| 261 |
+
|
| 262 |
+
# Token-name prefixes that mark operator / function tokens in MathTok output.
_OP_PREFIXES = ("OP_", "FRAC")
_FUNC_PREFIXES = ("FUNC_",)

# Special marker tokens that are ignored when classifying MathTok tokens.
_BOUNDARY = {
    "[MATH_START]", "[MATH_END]", "[TEXT_START]", "[TEXT_END]",
    "[BOS]", "[EOS]", "[PAD]", "[UNK]", "[SEP]", "[MASK]",
}

# Surface forms used by the heuristic scorers for the text-level baselines.
_MATH_OPS_GPT2 = {"+", "-", "*", "/", "^", "=", "<", ">", "**", "//"}
_MATH_FUNCS_GPT2 = {
    "sin", "cos", "tan", "log", "ln", "exp", "sqrt",
    "lim", "sum", "prod", "diff", "integrate", "factorial",
}
_PARENS = {"(", ")", "[", "]", "{", "}"}
|
| 271 |
+
|
| 272 |
+
|
| 273 |
+
def _score_mathtok(out) -> TokenizerStats:
    """Compute structural score for a MathTok TokenizedOutput.

    Reads ``out.tokens``, ``out.metadata`` (items exposing ``depth`` and
    ``num_children``) and ``out.canon_results`` (items exposing ``success``).

    Returns a ``TokenizerStats`` named "MathTok" with derived scores filled in.
    """
    # Boundary/special markers are ignored when classifying tokens.
    tokens = [t for t in out.tokens if t not in _BOUNDARY]
    # NOTE(review): the denominator counts every emitted token, boundary
    # markers included — confirm this is the intended basis for the ratios.
    token_count = len(out.tokens)

    # str.startswith accepts a tuple, so one call covers every prefix.
    operator_nodes = sum(1 for t in tokens if t.startswith(_OP_PREFIXES))
    function_scope = sum(1 for t in tokens if t.startswith(_FUNC_PREFIXES))
    operand_tokens = sum(
        1 for t in tokens if t.startswith(("VAR_", "CONST_", "NUM_"))
    )
    math_tokens = operator_nodes + function_scope + operand_tokens
    semantic_density = math_tokens / max(token_count, 1)

    # Tree depth and parent-child relations come from the per-token metadata.
    tree_depth = 0
    parent_child = 0
    if out.metadata:
        # Negative depths are skipped; default=0 covers "no valid depth".
        tree_depth = max((m.depth for m in out.metadata if m.depth >= 0), default=0)
        # One relation per node that has at least one child.
        parent_child = sum(1 for m in out.metadata if m.num_children > 0)

    # Bonus only when the first canonicalization result reports success.
    canonical_bonus = 2 if out.canon_results and out.canon_results[0].success else 0

    stats = TokenizerStats(
        name="MathTok",
        tokens=out.tokens,
        token_count=token_count,
        operator_nodes=operator_nodes,
        tree_depth=tree_depth,
        parent_child_relations=parent_child,
        function_scope=function_scope,
        canonical_bonus=canonical_bonus,
        semantic_density=semantic_density,
    )
    stats.compute_scr()
    return stats
|
| 311 |
+
|
| 312 |
+
|
| 313 |
+
def _score_gpt2(tokens: list[str]) -> TokenizerStats:
    """Estimate structural score for a GPT-2 token list (heuristic).

    Operators and function names are recognised by exact match against
    ``_MATH_OPS_GPT2`` / ``_MATH_FUNCS_GPT2`` after normalisation; nesting
    depth is estimated from bracket tokens. No canonical bonus is awarded.
    """
    token_count = len(tokens)
    # Byte-level BPE vocabularies mark a leading space with U+0120 ("Ġ").
    # Remove it *before* lowercasing (it lowercases to U+0121), otherwise a
    # token such as "Ġ+" can never match "+".
    # NOTE(review): a no-op if gpt2_fn already returns decoded text — confirm
    # which form the caller supplies.
    lower_toks = [t.replace("\u0120", "").lower().strip() for t in tokens]

    operator_nodes = sum(1 for t in lower_toks if t in _MATH_OPS_GPT2)
    function_scope = sum(1 for t in lower_toks if t in _MATH_FUNCS_GPT2)
    math_tokens = operator_nodes + function_scope

    # Estimate nesting depth from brackets; unmatched closers clamp at 0.
    max_depth, depth = 0, 0
    for t in lower_toks:
        if t in ("(", "[", "{"):
            depth += 1
            max_depth = max(max_depth, depth)
        elif t in (")", "]", "}"):
            depth = max(0, depth - 1)

    # Rough estimate: one parent-child relation per recognised operator.
    parent_child = operator_nodes

    # No canonical parsing bonus for a plain text tokenizer.
    canonical_bonus = 0

    semantic_density = math_tokens / max(token_count, 1)

    stats = TokenizerStats(
        name="GPT-2",
        tokens=tokens,
        token_count=token_count,
        operator_nodes=operator_nodes,
        tree_depth=max_depth,
        parent_child_relations=parent_child,
        function_scope=function_scope,
        canonical_bonus=canonical_bonus,
        semantic_density=semantic_density,
    )
    stats.compute_scr()
    return stats
|
| 352 |
+
|
| 353 |
+
|
| 354 |
+
def _score_char(expr: str) -> TokenizerStats:
    """Score for character-level tokenization.

    Every character of *expr* is one token. Operators are the characters
    ``+-*/^=``; nesting depth is estimated from bracket characters.
    """
    tokens = list(expr)
    token_count = len(tokens)

    operator_nodes = sum(1 for c in tokens if c in "+-*/^=")
    function_scope = 0  # character level can't identify functions

    # Track bracket nesting; unmatched closers clamp at 0.
    max_depth, depth = 0, 0
    for c in tokens:
        if c in "([{":
            depth += 1
            max_depth = max(max_depth, depth)
        elif c in ")]}":
            depth = max(0, depth - 1)

    parent_child = operator_nodes  # rough estimate
    semantic_density = operator_nodes / max(token_count, 1)

    stats = TokenizerStats(
        name="CharLevel",
        tokens=tokens,
        token_count=token_count,
        operator_nodes=operator_nodes,
        tree_depth=max_depth,
        parent_child_relations=parent_child,
        function_scope=function_scope,
        canonical_bonus=0,
        semantic_density=semantic_density,
    )
    stats.compute_scr()
    return stats
|
| 385 |
+
|
| 386 |
+
|
| 387 |
+
def _score_sp(tokens: list[str]) -> TokenizerStats:
    """Estimate structural score for a SentencePiece token list (heuristic).

    Same heuristic as the GPT-2 scorer: exact-match operators and function
    names, bracket-based depth, no canonical bonus. ``token_count`` is the
    length of the raw *tokens* list, before empty pieces are dropped.
    """
    token_count = len(tokens)
    # SentencePiece marks word starts with U+2581 ("▁"), not an ASCII space,
    # so the marker must be removed explicitly or "▁sin" never matches "sin".
    # Plain spaces are still removed as before.
    lower_toks = [
        t.replace("\u2581", "").replace(" ", "").lower().strip()
        for t in tokens
    ]
    # A piece that was only the marker/whitespace is now empty; drop it.
    lower_toks = [t for t in lower_toks if t]

    operator_nodes = sum(1 for t in lower_toks if t in _MATH_OPS_GPT2)
    function_scope = sum(1 for t in lower_toks if t in _MATH_FUNCS_GPT2)
    math_tokens = operator_nodes + function_scope

    # Estimate nesting depth from brackets; unmatched closers clamp at 0.
    max_depth, depth = 0, 0
    for t in lower_toks:
        if t in ("(", "[", "{"):
            depth += 1
            max_depth = max(max_depth, depth)
        elif t in (")", "]", "}"):
            depth = max(0, depth - 1)

    parent_child = operator_nodes  # rough estimate: one relation per operator
    canonical_bonus = 0
    semantic_density = math_tokens / max(token_count, 1)

    stats = TokenizerStats(
        name="SentencePiece",
        tokens=tokens,
        token_count=token_count,
        operator_nodes=operator_nodes,
        tree_depth=max_depth,
        parent_child_relations=parent_child,
        function_scope=function_scope,
        canonical_bonus=canonical_bonus,
        semantic_density=semantic_density,
    )
    stats.compute_scr()
    return stats
|
| 424 |
+
|
| 425 |
+
|
| 426 |
+
def _get_trained_sp_tokenizer() -> Optional[Callable[[str], list[str]]]:
    """Train a small custom SentencePiece unigram model dynamically on all expressions.

    The corpus is every expression from the module's test suites,
    de-duplicated. Returns a callable mapping text to a list of string
    pieces, or ``None`` (after logging a warning) if ``sentencepiece`` is not
    importable or training/loading fails for any reason.
    """
    try:
        import sentencepiece as spm
        import tempfile

        # Collect all expressions from our suites to form a corpus.
        flat_suites = (
            STANDARD_EXPRESSIONS,
            DEEP_NESTING_EXPRESSIONS,
            ODE_PDE_EXPRESSIONS,
            MATRIX_LINEAR_ALGEBRA,
            PROBABILITY_EXPRESSIONS,
            SET_THEORY,
            MIXED_TEXT_MATH,
        )
        corpus = [expr for suite in flat_suites for expr in suite]
        for pairs in (CANONICAL_PAIRS, LATEX_ASCII_PAIRS):
            for a, b in pairs:
                corpus.extend((a, b))

        # Deduplicate and strip; sorting keeps the training input deterministic.
        corpus = sorted({e.strip() for e in corpus if e.strip()})

        # Keep the corpus and the model files inside a private temporary
        # directory so they are removed even when training raises, and so
        # concurrent runs do not overwrite a shared model path.
        with tempfile.TemporaryDirectory(prefix="spm_math_") as workdir:
            corpus_path = os.path.join(workdir, "corpus.txt")
            with open(corpus_path, "w", encoding="utf-8") as f:
                f.write("\n".join(corpus))

            model_prefix = os.path.join(workdir, "spm_math_temp")

            # Train a unigram model using a small vocab size.
            spm.SentencePieceTrainer.train(
                input=corpus_path,
                model_prefix=model_prefix,
                vocab_size=100,
                model_type="unigram",
                user_defined_symbols=["[PAD]", "[UNK]", "[BOS]", "[EOS]"],
            )

            # Load while the file still exists.
            # NOTE(review): assumes the processor holds the model in memory
            # after construction, so deleting the file afterwards is safe.
            sp = spm.SentencePieceProcessor(model_file=f"{model_prefix}.model")

        return lambda text: sp.encode(text, out_type=str)
    except Exception as exc:
        # Best effort: the SentencePiece baseline is optional.
        logger.warning("Could not train custom SentencePiece tokenizer: %s", exc)
        return None
|
| 477 |
+
|
| 478 |
+
|
| 479 |
+
# ── Main comparison engine ────────────────────────────────────────────────
|
| 480 |
+
|
| 481 |
+
class TokenizerComparison:
|
| 482 |
+
"""
|
| 483 |
+
Run the full 3-level SCR comparison across all test categories.
|
| 484 |
+
|
| 485 |
+
Parameters
|
| 486 |
+
----------
|
| 487 |
+
pipeline : MathTokPipeline
|
| 488 |
+
gpt2_fn : callable(str) -> list[str], or None to skip GPT-2
sp_fn : callable(str) -> list[str], or None to skip SentencePiece
|
| 489 |
+
save_jsonl : write results to evaluation/results/comparison_results.jsonl
|
| 490 |
+
"""
|
| 491 |
+
|
| 492 |
+
def __init__(
|
| 493 |
+
self,
|
| 494 |
+
pipeline,
|
| 495 |
+
gpt2_fn: Optional[Callable] = None,
|
| 496 |
+
sp_fn: Optional[Callable] = None,
|
| 497 |
+
save_jsonl: bool = True,
|
| 498 |
+
) -> None:
|
| 499 |
+
self.pipeline = pipeline
|
| 500 |
+
self.gpt2_fn = gpt2_fn
|
| 501 |
+
self.sp_fn = sp_fn
|
| 502 |
+
self.save_jsonl = save_jsonl
|
| 503 |
+
self._records: list[ComparisonRecord] = []
|
| 504 |
+
|
| 505 |
+
# ── Public API ────────────────────────────────────────────────────────
|
| 506 |
+
|
| 507 |
+
def run_all(self) -> list[ComparisonRecord]:
    """Run every test category and return all ComparisonRecords.

    Runs the six plain expression suites, then the canonical-equivalence,
    mixed text+math and LaTeX-vs-ASCII runners. Results are saved first when
    ``save_jsonl`` is set, then the aggregate summary is printed.
    """
    print("\n" + "=" * 80)
    print(" MathTok Semantic Tokenizer Comparison")
    print("=" * 80)

    # Plain expression suites share one runner; order defines record order.
    plain_suites = (
        ("standard", STANDARD_EXPRESSIONS),
        ("deep_nesting", DEEP_NESTING_EXPRESSIONS),
        ("ode_pde", ODE_PDE_EXPRESSIONS),
        ("linear_algebra", MATRIX_LINEAR_ALGEBRA),
        ("probability", PROBABILITY_EXPRESSIONS),
        ("set_theory", SET_THEORY),
    )
    for category, expressions in plain_suites:
        self._run_category(category, expressions)

    self._run_canonical_equivalence()
    self._run_mixed_text_math()
    self._run_latex_vs_ascii()

    if self.save_jsonl:
        self._save_results()

    self._print_summary()
    return self._records
|
| 528 |
+
|
| 529 |
+
def run_category(self, category: str) -> list[ComparisonRecord]:
    """Run a single named category.

    *category* is one of the short names in the table below (for example
    "deep" selects the deep-nesting suite). Results are saved when
    ``save_jsonl`` is set, the summary is printed, and the accumulated
    record list is returned.

    Raises
    ------
    ValueError
        If *category* is not a known short name.
    """
    # short name -> (runner, positional arguments)
    categories = {
        "standard": (self._run_category, ("standard", STANDARD_EXPRESSIONS)),
        "deep": (self._run_category, ("deep_nesting", DEEP_NESTING_EXPRESSIONS)),
        "ode_pde": (self._run_category, ("ode_pde", ODE_PDE_EXPRESSIONS)),
        "linear": (self._run_category, ("linear_algebra", MATRIX_LINEAR_ALGEBRA)),
        "probability": (self._run_category, ("probability", PROBABILITY_EXPRESSIONS)),
        "set_theory": (self._run_category, ("set_theory", SET_THEORY)),
        "canonical": (self._run_canonical_equivalence, ()),
        "mixed": (self._run_mixed_text_math, ()),
        "latex_ascii": (self._run_latex_vs_ascii, ()),
    }
    try:
        fn, args = categories[category]
    except KeyError:
        raise ValueError(
            f"Unknown category: {category}. Choose from: {list(categories)}"
        ) from None

    fn(*args)
    if self.save_jsonl:
        self._save_results()
    self._print_summary()
    return self._records
|
| 550 |
+
|
| 551 |
+
# ── Category runners ──────────────────────────────────────────────────
|
| 552 |
+
|
| 553 |
+
def _run_category(self, category: str, expressions: list[str]) -> None:
|
| 554 |
+
print(f"\n--- {category.upper().replace('_', ' ')} ---")
|
| 555 |
+
print(f" {'Expression':<30} | {'MathTok':^21} | {'GPT-2':^16} | {'S-Piece':^16} | {'Char':^16} | Impr")
|
| 556 |
+
print(f" {'-'*30}-+-{'-'*21}-+-{'-'*16}-+-{'-'*16}-+-{'-'*16}-+------")
|
| 557 |
+
|
| 558 |
+
for expr in expressions:
|
| 559 |
+
rec = self._compare_one(expr, category)
|
| 560 |
+
self._records.append(rec)
|
| 561 |
+
rec.print_row()
|
| 562 |
+
|
| 563 |
+
def _run_canonical_equivalence(self) -> None:
    """Compare token-set overlap for pairs of equivalent expressions.

    For each pair in ``CANONICAL_PAIRS`` both sides are scored and recorded
    under the "canonical" category, then the Jaccard similarity of their
    token sets is printed per tokenizer. A pair is reported as converged
    when the MathTok similarity exceeds 0.5.
    """
    print("\n--- CANONICAL EQUIVALENCE ---")
    print(" Testing that equivalent expressions -> similar MathTok token sets")
    print(f" {'Pair':<45} | MT Jac | GP Jac | SP Jac | Converged")
    print(f" {'-'*45}-+---------+---------+---------+----------")

    for expr_a, expr_b in CANONICAL_PAIRS:
        rec_a = self._compare_one(expr_a, "canonical")
        rec_b = self._compare_one(expr_b, "canonical")
        self._records.extend([rec_a, rec_b])

        # Boundary markers are excluded from the MathTok comparison.
        mt_a = {t for t in rec_a.mathtok.tokens if t not in _BOUNDARY}
        mt_b = {t for t in rec_b.mathtok.tokens if t not in _BOUNDARY}
        mt_jaccard = _jaccard(mt_a, mt_b)

        # Baseline similarities exist only when both sides were tokenized.
        gp_jaccard = None
        if rec_a.gpt2 and rec_b.gpt2:
            gp_jaccard = _jaccard(set(rec_a.gpt2.tokens), set(rec_b.gpt2.tokens))

        sp_jaccard = None
        if rec_a.sentencepiece and rec_b.sentencepiece:
            sp_jaccard = _jaccard(
                set(rec_a.sentencepiece.tokens),
                set(rec_b.sentencepiece.tokens),
            )

        pair_str = f"{expr_a!r} vs {expr_b!r}"[:45].ljust(46)
        gp_str = f"{gp_jaccard:.3f}" if gp_jaccard is not None else " N/A "
        sp_str = f"{sp_jaccard:.3f}" if sp_jaccard is not None else " N/A "
        converged = "YES" if mt_jaccard > 0.5 else "no "
        print(f" {pair_str}| MT:{mt_jaccard:.3f} | GP:{gp_str} | SP:{sp_str} | {converged}")
|
| 595 |
+
|
| 596 |
+
def _run_mixed_text_math(self) -> None:
    """Tokenize the mixed text+math sentences and print token counts.

    MathTok is run through ``pipeline.encode``; the GPT-2 and SentencePiece
    baselines contribute token counts to the printed table only. The stored
    records carry MathTok and character-level stats, with both baseline
    fields left as ``None``.
    """
    print("\n--- MIXED TEXT + MATH ---")
    print(f" {'Input (truncated)':<40} | MT tokens | GP tokens | SP tokens | Math spans")
    print(f" {'-'*40}-+-----------+-----------+-----------+-----------")

    for text in MIXED_TEXT_MATH:
        out = self.pipeline.encode(text)
        math_spans = len(out.math_sexps)
        mt_count = len(out.tokens)

        # Baselines are best-effort: a failure leaves "N/A" in the table but
        # is logged rather than silently dropped.
        gp_count = "N/A"
        if self.gpt2_fn:
            try:
                gp_count = str(len(self.gpt2_fn(text)))
            except Exception as exc:
                logger.debug("GPT-2 failed on %r: %s", text, exc)

        sp_count = "N/A"
        if self.sp_fn:
            try:
                sp_count = str(len(self.sp_fn(text)))
            except Exception as exc:
                logger.debug("SentencePiece failed on %r: %s", text, exc)

        preview = text[:40].ljust(41)
        print(f" {preview}| {mt_count:9d} | {gp_count:9s} | {sp_count:9s} | {math_spans:9d}")

        self._records.append(
            ComparisonRecord(
                expression=text,
                category="mixed_text_math",
                mathtok=_score_mathtok(out),
                gpt2=None,
                sentencepiece=None,
                char_level=_score_char(text),
                sexp=out.sexp,
            )
        )
|
| 633 |
+
|
| 634 |
+
def _run_latex_vs_ascii(self) -> None:
    """Compare MathTok output for ASCII and LaTeX spellings of one expression.

    For each pair the Jaccard similarity of the two MathTok token sets
    (boundary markers excluded) is printed next to the token counts of every
    tokenizer. Similarities of 0.8 or below are flagged with "(~)". Two
    records are stored per pair, each noting its partner expression.
    """
    print("\n--- LaTeX vs ASCII NORMALIZATION ---")
    print(" Same expression in two formats — MathTok should produce identical AST")
    print(f" {'ASCII':<25} {'LaTeX':<25} | MT same? | MT tokens A/L | GP tokens A/L | SP tokens A/L")
    print(f" {'-'*25} {'-'*25}-+----------+---------------+---------------+---------------")

    for ascii_expr, latex_expr in LATEX_ASCII_PAIRS:
        out_ascii = self.pipeline.encode_math_only(ascii_expr)
        out_latex = self.pipeline.encode_math_only(latex_expr)

        mt_a = {t for t in out_ascii.tokens if t not in _BOUNDARY}
        mt_l = {t for t in out_latex.tokens if t not in _BOUNDARY}
        mt_same = _jaccard(mt_a, mt_l)
        same_str = f"{mt_same:.2f}" if mt_same > 0.8 else f"{mt_same:.2f}(~)"

        # Baselines are best-effort: a failure leaves "N/A" in the table but
        # is logged rather than silently dropped.
        gp_str = "N/A / N/A"
        if self.gpt2_fn:
            try:
                ga = len(self.gpt2_fn(ascii_expr))
                gl = len(self.gpt2_fn(latex_expr))
                gp_str = f"{ga:3d} / {gl:3d}"
            except Exception as exc:
                logger.debug(
                    "GPT-2 failed on %r / %r: %s", ascii_expr, latex_expr, exc
                )

        sp_str = "N/A / N/A"
        if self.sp_fn:
            try:
                sa = len(self.sp_fn(ascii_expr))
                sl = len(self.sp_fn(latex_expr))
                sp_str = f"{sa:3d} / {sl:3d}"
            except Exception as exc:
                logger.debug(
                    "SentencePiece failed on %r / %r: %s", ascii_expr, latex_expr, exc
                )

        print(
            f" {ascii_expr:<25} {latex_expr:<25}"
            f"| {same_str:>8s} "
            f"| {len(out_ascii.tokens):3d} / {len(out_latex.tokens):3d} "
            f"| {gp_str} "
            f"| {sp_str}"
        )

        # One record per spelling; the note points at the other spelling.
        variants = (
            (ascii_expr, out_ascii, "ascii", latex_expr),
            (latex_expr, out_latex, "latex", ascii_expr),
        )
        for expr, out, fmt, partner in variants:
            self._records.append(
                ComparisonRecord(
                    expression=expr,
                    category=f"latex_vs_ascii_{fmt}",
                    mathtok=_score_mathtok(out),
                    gpt2=None,
                    sentencepiece=None,
                    char_level=_score_char(expr),
                    sexp=out.sexp,
                    notes=[f"pair_partner={partner}"],
                )
            )
|
| 690 |
+
|
| 691 |
+
# ── Single expression comparison ──────────────────────────────────────
|
| 692 |
+
|
| 693 |
+
def _compare_one(self, expr: str, category: str) -> ComparisonRecord:
    """Score one expression with every configured tokenizer.

    A MathTok failure is logged at debug level and yields an empty
    ``TokenizerStats`` (zero tokens) with an empty S-expression. A failing
    GPT-2 or SentencePiece baseline is logged and left as ``None``. The
    character-level baseline is always computed.
    """
    # MathTok
    try:
        out = self.pipeline.encode_math_only(expr)
        mt_stats = _score_mathtok(out)
        sexp = out.sexp
    except Exception as exc:
        logger.debug("MathTok failed on %r: %s", expr, exc)
        mt_stats = TokenizerStats(name="MathTok", tokens=[], token_count=0)
        sexp = ""

    # GPT-2 (optional)
    gp_stats: Optional[TokenizerStats] = None
    if self.gpt2_fn:
        try:
            gp_stats = _score_gpt2(self.gpt2_fn(expr))
        except Exception as exc:
            logger.debug("GPT-2 failed on %r: %s", expr, exc)

    # SentencePiece (optional)
    sp_stats: Optional[TokenizerStats] = None
    if self.sp_fn:
        try:
            sp_stats = _score_sp(self.sp_fn(expr))
        except Exception as exc:
            logger.debug("SentencePiece failed on %r: %s", expr, exc)

    return ComparisonRecord(
        expression=expr,
        category=category,
        mathtok=mt_stats,
        gpt2=gp_stats,
        sentencepiece=sp_stats,
        char_level=_score_char(expr),  # character-level baseline
        sexp=sexp,
    )
|
| 734 |
+
|
| 735 |
+
# ── Aggregated summary ────────────────────────────────────────────────
|
| 736 |
+
|
| 737 |
+
def _print_summary(self) -> Optional[dict]:
    """Print the aggregated results table and return the headline numbers.

    Only records outside the "mixed_text_math" category whose MathTok
    tokenization produced at least one token are aggregated.

    Returns
    -------
    dict or None
        Mean SCR per tokenizer, the SCR improvement ratios and the MathTok
        semantic-density / structural-efficiency means; ``None`` when there
        is nothing to aggregate (nothing is printed in that case).
    """
    math_records = [
        r for r in self._records
        if r.category != "mixed_text_math" and r.mathtok.token_count > 0
    ]
    if not math_records:
        return None

    mt_scr_mean = _mean([r.mathtok.raw_scr for r in math_records])
    mt_sd_mean = _mean([r.mathtok.semantic_density for r in math_records])
    mt_se_mean = _mean([r.mathtok.structural_efficiency for r in math_records])
    ch_scr_mean = _mean([r.char_level.raw_scr for r in math_records])

    # Baseline means are None when that baseline was never run.
    gp_records = [r for r in math_records if r.gpt2 is not None]
    gp_scr_mean = _mean([r.gpt2.raw_scr for r in gp_records]) if gp_records else None
    gp_sd_mean = _mean([r.gpt2.semantic_density for r in gp_records]) if gp_records else None

    sp_records = [r for r in math_records if r.sentencepiece is not None]
    sp_scr_mean = _mean([r.sentencepiece.raw_scr for r in sp_records]) if sp_records else None
    sp_sd_mean = _mean([r.sentencepiece.semantic_density for r in sp_records]) if sp_records else None

    # A missing or zero baseline mean gives no ratio.
    impr_vs_gpt2 = (mt_scr_mean / gp_scr_mean) if gp_scr_mean else None
    impr_vs_sp = (mt_scr_mean / sp_scr_mean) if sp_scr_mean else None
    impr_vs_char = (mt_scr_mean / ch_scr_mean) if ch_scr_mean else None

    print("\n" + "=" * 80)
    print(" AGGREGATED RESULTS")
    print("=" * 80)
    print(f"\n {'Metric':<40} {'MathTok':>10} {'GPT-2':>10} {'S-Piece':>10} {'CharLvl':>10}")
    print(f" {'-'*40} {'-'*10} {'-'*10} {'-'*10} {'-'*10}")

    def cell(value):
        # Pad "N/A" to the same 10-character width as the numeric cells.
        return f"{value:10.4f}" if value is not None else f"{'N/A':>10}"

    def row(label, mt_val, gp_val=None, sp_val=None, ch_val=None):
        print(f" {label:<40} {mt_val:10.4f} {cell(gp_val)} {cell(sp_val)} {cell(ch_val)}")

    def ratio(value):
        return f"{value:.2f}x" if value else "N/A"

    row("Level 1 — SCR (struct_score / tokens)",
        mt_scr_mean, gp_scr_mean, sp_scr_mean, ch_scr_mean)
    row("Level 2 — Semantic Density (math_toks / total)",
        mt_sd_mean, gp_sd_mean, sp_sd_mean, None)
    row("Level 3 — Structural Efficiency (rels / tokens)",
        mt_se_mean)

    print(f"\n SCR improvement vs GPT-2 : {ratio(impr_vs_gpt2)}")
    print(f" SCR improvement vs S-Piece : {ratio(impr_vs_sp)}")
    print(f" SCR improvement vs CharLevel: {ratio(impr_vs_char)}")
    print(f"\n Total records evaluated : {len(self._records)}")
    print("=" * 80)

    return {
        "mathtok_scr": mt_scr_mean,
        "gpt2_scr": gp_scr_mean,
        "sp_scr": sp_scr_mean,
        "charlevel_scr": ch_scr_mean,
        "scr_improvement_vs_gpt2": impr_vs_gpt2,
        "scr_improvement_vs_sp": impr_vs_sp,
        "scr_improvement_vs_char": impr_vs_char,
        "mathtok_semantic_density": mt_sd_mean,
        "mathtok_structural_efficiency": mt_se_mean,
    }
|
| 802 |
+
|
| 803 |
+
# ── Persistence ───────────────────────────────────────────────────────
|
| 804 |
+
|
| 805 |
+
def _save_results(self) -> None:
    """Persist the collected comparison records under ``_RESULTS_DIR``.

    Two files are written (the directory is created on demand):

    * ``comparison_results.jsonl`` — one JSON object per record, taken
      from ``record.to_dict()``.
    * ``comparison_summary.json`` — aggregate means plus a compact
      per-record table.  Only records whose MathTok token count is
      positive contribute to the summary.
    """
    _RESULTS_DIR.mkdir(parents=True, exist_ok=True)

    jsonl_path = _RESULTS_DIR / "comparison_results.jsonl"
    with open(jsonl_path, "w", encoding="utf-8") as out:
        out.writelines(
            json.dumps(record.to_dict(), ensure_ascii=False) + "\n"
            for record in self._records
        )
    print(f"\n Results saved to: {jsonl_path}")

    # Records with no MathTok tokens are left out of the summary.
    scored = [rec for rec in self._records if rec.mathtok.token_count > 0]

    def compact(rec) -> dict:
        """Reduce one record to the handful of fields kept in the summary."""
        gpt2 = rec.gpt2
        spiece = rec.sentencepiece
        return {
            "expression": rec.expression[:60],
            "category": rec.category,
            "mt_tokens": rec.mathtok.token_count,
            "mt_scr": round(rec.mathtok.raw_scr, 4),
            "gp_tokens": gpt2.token_count if gpt2 else None,
            "gp_scr": round(gpt2.raw_scr, 4) if gpt2 else None,
            "sp_tokens": spiece.token_count if spiece else None,
            "sp_scr": round(spiece.raw_scr, 4) if spiece else None,
            "ch_tokens": rec.char_level.token_count,
            "ch_scr": round(rec.char_level.raw_scr, 4),
            "impr_vs_char": round(rec.scr_improvement_vs_char, 4),
        }

    gpt2_scores = [rec.gpt2.raw_scr for rec in scored if rec.gpt2 is not None]
    spiece_scores = [
        rec.sentencepiece.raw_scr
        for rec in scored
        if rec.sentencepiece is not None
    ]

    summary = {
        "timestamp": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
        "total_records": len(self._records),
        "mathtok_mean_scr": _mean([rec.mathtok.raw_scr for rec in scored]),
        "charlevel_mean_scr": _mean([rec.char_level.raw_scr for rec in scored]),
        "gpt2_scr": _mean(gpt2_scores),
        "sentencepiece_mean_scr": _mean(spiece_scores),
        "mathtok_mean_semantic_density": _mean(
            [rec.mathtok.semantic_density for rec in scored]
        ),
        "mathtok_mean_structural_efficiency": _mean(
            [rec.mathtok.structural_efficiency for rec in scored]
        ),
        "per_record": [compact(rec) for rec in scored],
    }

    summary_path = _RESULTS_DIR / "comparison_summary.json"
    with open(summary_path, "w", encoding="utf-8") as out:
        json.dump(summary, out, indent=2, ensure_ascii=False)
    print(f" Summary saved to: {summary_path}")
|
| 852 |
+
|
| 853 |
+
|
| 854 |
+
# ── Helpers ───────────────────────────────────────────────────────────────
|
| 855 |
+
|
| 856 |
+
def _jaccard(a: set, b: set) -> float:
|
| 857 |
+
union = len(a | b)
|
| 858 |
+
return len(a & b) / union if union > 0 else 0.0
|
| 859 |
+
|
| 860 |
+
|
| 861 |
+
def _mean(values: list) -> float:
|
| 862 |
+
vals = [v for v in values if v is not None]
|
| 863 |
+
return sum(vals) / len(vals) if vals else 0.0
|
| 864 |
+
|
| 865 |
+
|
| 866 |
+
def _load_gpt2():
    """Return the GPT-2 tokenizer's ``tokenize`` method, or ``None``.

    Any failure while importing ``transformers`` or loading the "gpt2"
    tokenizer is logged as a warning and reported to the caller as
    ``None`` so the comparison can run without the GPT-2 baseline.
    """
    tokenize_fn = None
    try:
        from transformers import GPT2Tokenizer

        tokenize_fn = GPT2Tokenizer.from_pretrained("gpt2").tokenize
    except Exception as exc:
        logger.warning("GPT-2 unavailable (%s); running without it.", exc)
    return tokenize_fn
|
| 875 |
+
|
| 876 |
+
|
| 877 |
+
# ── CLI ───────────────────────────────────────────────────────────────────
|
| 878 |
+
|
| 879 |
+
def main() -> None:
    """Command-line entry point for the tokenizer comparison.

    Flags:
      --no-gpt2    skip loading the GPT-2 baseline
      --save       accepted for symmetry; its value is already True by default
      --no-save    turn off writing the JSONL / summary files
      --category   run a single category instead of all of them

    Results are saved exactly when ``--no-save`` is absent.
    """
    logging.basicConfig(level=logging.WARNING)

    cli = argparse.ArgumentParser(
        description="MathTok vs GPT-2 vs Char-level — Semantic SCR Comparison"
    )
    cli.add_argument(
        "--no-gpt2",
        action="store_true",
        help="Skip GPT-2 (no internet required)",
    )
    cli.add_argument(
        "--save",
        action="store_true",
        default=True,
        help="Save JSONL and summary JSON (default: on)",
    )
    cli.add_argument(
        "--no-save",
        action="store_true",
        help="Disable JSONL saving",
    )
    cli.add_argument(
        "--category",
        choices=["standard", "deep", "canonical", "mixed", "latex_ascii", "all"],
        default="all",
        help="Which category to run (default: all)",
    )
    opts = cli.parse_args()

    # Imported here, after argument parsing, as in the original flow.
    from mathtok.pipeline import MathTokPipeline

    mathtok_pipeline = MathTokPipeline(include_metadata=True)
    gpt2_tokenize = _load_gpt2() if not opts.no_gpt2 else None
    sp_tokenize = _get_trained_sp_tokenizer()
    should_save = opts.save and not opts.no_save

    comparison = TokenizerComparison(
        mathtok_pipeline,
        gpt2_fn=gpt2_tokenize,
        sp_fn=sp_tokenize,
        save_jsonl=should_save,
    )

    if opts.category != "all":
        comparison.run_category(opts.category)
        return
    comparison.run_all()
|
| 917 |
+
|
| 918 |
+
|
| 919 |
+
# Run the comparison CLI only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|
evaluation/datasets/sample_problems.json
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"expressions": [
|
| 2 |
+
"x^2 + 2*x + 1",
|
| 3 |
+
"sin(x)^2 + cos(x)^2",
|
| 4 |
+
"x^3 - 3*x^2 + 3*x - 1",
|
| 5 |
+
"e^(i*pi) + 1",
|
| 6 |
+
"log(x*y)",
|
| 7 |
+
"sqrt(x^2 + y^2)",
|
| 8 |
+
"1/(1 + e^(-x))",
|
| 9 |
+
"x^2 - y^2",
|
| 10 |
+
"a^2 + 2*a*b + b^2",
|
| 11 |
+
"(x+1)*(x-1)",
|
| 12 |
+
"diff(sin(x), x)",
|
| 13 |
+
"integrate(x^2, x)",
|
| 14 |
+
"limit(sin(x)/x, x, 0)",
|
| 15 |
+
"sum(k^2, k, 1, n)",
|
| 16 |
+
"factorial(n) / (factorial(k)*factorial(n-k))",
|
| 17 |
+
"exp(-x^2/2) / sqrt(2*pi)",
|
| 18 |
+
"a*x^2 + b*x + c",
|
| 19 |
+
"(-b + sqrt(b^2 - 4*a*c)) / (2*a)",
|
| 20 |
+
"log(1 + x)",
|
| 21 |
+
"x - x^3/6 + x^5/120",
|
| 22 |
+
"1 + 1/2 + 1/4 + 1/8",
|
| 23 |
+
"n*(n+1)/2",
|
| 24 |
+
"2^10",
|
| 25 |
+
"abs(x - y)",
|
| 26 |
+
"floor(x) + ceil(-x)",
|
| 27 |
+
"gamma(n+1)",
|
| 28 |
+
"sinh(x) + cosh(x)",
|
| 29 |
+
"atan(y/x)",
|
| 30 |
+
"x^2 + y^2 + z^2",
|
| 31 |
+
"det([[a,b],[c,d]])"
|
| 32 |
+
],
|
| 33 |
+
|
| 34 |
+
"equivalent_pairs": [
|
| 35 |
+
["x^2 + 2*x + 1", "(x+1)^2"],
|
| 36 |
+
["a^2 - b^2", "(a+b)*(a-b)"],
|
| 37 |
+
["a^2 + 2*a*b + b^2", "(a+b)^2"],
|
| 38 |
+
["x^3 - y^3", "(x-y)*(x^2 + x*y + y^2)"],
|
| 39 |
+
["sin(x)^2 + cos(x)^2","1"],
|
| 40 |
+
["log(x) + log(y)", "log(x*y)"],
|
| 41 |
+
["e^x * e^y", "e^(x+y)"],
|
| 42 |
+
["1/x + 1/y", "(x+y)/(x*y)"],
|
| 43 |
+
["b + a", "a + b"],
|
| 44 |
+
["2*x + 2*y", "2*(x+y)"],
|
| 45 |
+
["x/2", "x * (1/2)"],
|
| 46 |
+
["x^2 * x^3", "x^5"],
|
| 47 |
+
["(x^2)^3", "x^6"],
|
| 48 |
+
["log(e^x)", "x"],
|
| 49 |
+
["e^(log(x))", "x"],
|
| 50 |
+
["n*(n+1)/2", "n/2 + n^2/2"],
|
| 51 |
+
["1 + x + x^2", "(x^3 - 1)/(x-1)"],
|
| 52 |
+
["cos(2*x)", "1 - 2*sin(x)^2"],
|
| 53 |
+
["tan(x)", "sin(x)/cos(x)"],
|
| 54 |
+
["cosh(x)^2 - sinh(x)^2","1"]
|
| 55 |
+
],
|
| 56 |
+
|
| 57 |
+
"rewriting_groups": [
|
| 58 |
+
["x^2 + 2*x + 1", "(x+1)^2", "x*(x+2) + 1"],
|
| 59 |
+
["a*b + a*c", "a*(b+c)", "a*c + a*b"],
|
| 60 |
+
["sin(x)/cos(x)", "tan(x)", "sin(x)*sec(x)"],
|
| 61 |
+
["e^(x+y)", "e^x * e^y"],
|
| 62 |
+
["log(x^2)", "2*log(x)","log(x) + log(x)"],
|
| 63 |
+
["n*(n+1)/2", "n/2*(n+1)", "sum(k, k, 1, n)"]
|
| 64 |
+
],
|
| 65 |
+
|
| 66 |
+
"mixed_text_math": [
|
| 67 |
+
"The derivative of $\\sin(x^2)$ with respect to $x$ is $2x\\cos(x^2)$.",
|
| 68 |
+
"Let $f(x) = x^2 + 2x + 1$. Then $f(x) = (x+1)^2$.",
|
| 69 |
+
"The quadratic formula gives $x = \\frac{-b \\pm \\sqrt{b^2 - 4ac}}{2a}$.",
|
| 70 |
+
"Euler's identity states that $e^{i\\pi} + 1 = 0$.",
|
| 71 |
+
"The integral $\\int_0^1 x^2 dx = \\frac{1}{3}$.",
|
| 72 |
+
"For any $n \\geq 1$, the sum $\\sum_{k=1}^{n} k = \\frac{n(n+1)}{2}$.",
|
| 73 |
+
"The Pythagorean theorem: $a^2 + b^2 = c^2$ for right triangles.",
|
| 74 |
+
"The normal distribution is $f(x) = \\frac{1}{\\sqrt{2\\pi}}e^{-x^2/2}$.",
|
| 75 |
+
"If $\\sin^2(x) + \\cos^2(x) = 1$ then $\\tan^2(x) + 1 = \\sec^2(x)$.",
|
| 76 |
+
"The limit $\\lim_{x \\to 0} \\frac{\\sin(x)}{x} = 1$ is fundamental.",
|
| 77 |
+
"Find the derivative of f(x) = sin(x^2) + 3x.",
|
| 78 |
+
"Solve for x: x^2 - 5*x + 6 = 0.",
|
| 79 |
+
"The area of a circle of radius r is pi*r^2.",
|
| 80 |
+
"Simplify: (a+b)^2 - (a-b)^2.",
|
| 81 |
+
"Compute the Taylor series of exp(x) around x=0."
|
| 82 |
+
],
|
| 83 |
+
|
| 84 |
+
"latex_only": [
|
| 85 |
+
"\\frac{x^2 - 1}{x + 1}",
|
| 86 |
+
"\\sqrt{\\frac{a^2 + b^2}{2}}",
|
| 87 |
+
"\\int_0^\\infty e^{-x^2} dx",
|
| 88 |
+
"\\sum_{n=0}^{\\infty} \\frac{x^n}{n!}",
|
| 89 |
+
"\\lim_{n \\to \\infty} \\left(1 + \\frac{1}{n}\\right)^n",
|
| 90 |
+
"\\binom{n}{k} = \\frac{n!}{k!(n-k)!}",
|
| 91 |
+
"\\frac{d}{dx}\\left[\\ln(x)\\right] = \\frac{1}{x}",
|
| 92 |
+
"\\nabla^2 f = \\frac{\\partial^2 f}{\\partial x^2} + \\frac{\\partial^2 f}{\\partial y^2}"
|
| 93 |
+
],
|
| 94 |
+
|
| 95 |
+
"ascii_only": [
|
| 96 |
+
"x**2 + 2*x + 1",
|
| 97 |
+
"sin(x)**2 + cos(x)**2",
|
| 98 |
+
"exp(-x**2 / 2) / sqrt(2*pi)",
|
| 99 |
+
"factorial(n) / (factorial(k) * factorial(n - k))",
|
| 100 |
+
"log(x**2) - 2*log(x)",
|
| 101 |
+
"abs(a - b) + abs(b - c)",
|
| 102 |
+
"floor(x/2) * 2",
|
| 103 |
+
"gamma(n + 1) / gamma(n)"
|
| 104 |
+
],
|
| 105 |
+
|
| 106 |
+
"metadata": {
|
| 107 |
+
"version": "1.0",
|
| 108 |
+
"description": "MathTok benchmark dataset — curated expressions for evaluating structural tokenization quality",
|
| 109 |
+
"sources": ["handcrafted", "DeepMind-Mathematics-inspired"],
|
| 110 |
+
"num_expressions": 30,
|
| 111 |
+
"num_equivalent_pairs": 20,
|
| 112 |
+
"num_rewriting_groups": 6,
|
| 113 |
+
"num_mixed": 15
|
| 114 |
+
}
|
| 115 |
+
}
|
evaluation/metrics.py
ADDED
|
@@ -0,0 +1,367 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
MathTok Evaluation Metrics
|
| 3 |
+
|
| 4 |
+
Implements the five core metrics for evaluating structural tokenization
|
| 5 |
+
quality, as described in the MathTok paper:
|
| 6 |
+
|
| 7 |
+
SCR — Structural Compression Ratio
|
| 8 |
+
CCS — Canonical Consistency Score
|
| 9 |
+
OPS — Operator Preservation Score
|
| 10 |
+
TS — Token Stability
|
| 11 |
+
TDF — Tree Depth Fidelity
|
| 12 |
+
|
| 13 |
+
Each metric is self-contained and operates on TokenizedOutput objects
|
| 14 |
+
or lists of token strings, enabling easy integration into benchmark runs.
|
| 15 |
+
|
| 16 |
+
Baseline comparisons are supported for:
|
| 17 |
+
  - GPT-2 tokenizer (byte-level BPE)
|
| 18 |
+
- SentencePiece unigram
|
| 19 |
+
- Character-level tokenization
|
| 20 |
+
"""
|
| 21 |
+
|
| 22 |
+
from __future__ import annotations
|
| 23 |
+
|
| 24 |
+
import logging
|
| 25 |
+
import math
|
| 26 |
+
from dataclasses import dataclass, field
|
| 27 |
+
from typing import Callable, Optional
|
| 28 |
+
|
| 29 |
+
logger = logging.getLogger(__name__)
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
# ── Metric result container ───────────────────────────────────────────────
|
| 33 |
+
|
| 34 |
+
@dataclass
class MetricResult:
    """Value of a single metric together with its supporting statistics."""

    # Short metric identifier, e.g. "SCR".
    name: str
    # Headline number for the metric.
    value: float
    # Human-readable explanation of what the value means.
    description: str
    # Free-form extra statistics; a fresh dict per instance.
    details: dict = field(default_factory=dict)

    def __repr__(self) -> str:
        """Render as ``NAME: value (description)`` with four decimals."""
        return "{}: {:.4f} ({})".format(self.name, self.value, self.description)
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
@dataclass
class EvaluationReport:
    """Bundle of the five MathTok metric results for one evaluation run."""

    scr: MetricResult
    ccs: MetricResult
    ops: MetricResult
    ts: MetricResult
    tdf: MetricResult
    # Number of examples the report was computed over.
    num_examples: int = 0

    def summary(self) -> str:
        """Return a multi-line text report: header, one line per metric, footer."""
        rule = "=" * 60
        metric_lines = [
            f" {metric}"
            for metric in (self.scr, self.ccs, self.ops, self.ts, self.tdf)
        ]
        return "\n".join(
            [
                rule,
                f" MathTok Evaluation Report (n={self.num_examples})",
                rule,
                *metric_lines,
                rule,
            ]
        )

    def to_dict(self) -> dict:
        """Return ``num_examples`` plus each metric's ``value`` keyed by its upper-case name."""
        report = {"num_examples": self.num_examples}
        for attr in ("scr", "ccs", "ops", "ts", "tdf"):
            report[attr.upper()] = getattr(self, attr).value
        return report
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
# ── Metric 1: Structural Compression Ratio (SCR) ─────────────────────────
|
| 80 |
+
|
| 81 |
+
def structural_compression_ratio(
    expressions: list[str],
    tokenized_lengths: list[int],
) -> MetricResult:
    """
    SCR = mean( |AST_tokens| / |raw_chars| )

    Measures how efficiently the structural token stream represents the
    information content relative to raw character count.
    Lower SCR = more compressed.  A ratio < 1.0 indicates compression.

    Parameters
    ----------
    expressions       : list of raw input expression strings
    tokenized_lengths : list of token counts, one per expression

    Returns
    -------
    MetricResult whose ``value`` is the mean ratio and whose ``details``
    hold the min, max, sample standard deviation and count of the
    per-expression ratios.  For empty input the value, min and max are
    all 0.0 and ``n`` is 0.

    Raises
    ------
    AssertionError if the two lists differ in length.
    """
    if len(expressions) != len(tokenized_lengths):
        # Raised explicitly (rather than via `assert`) so the check is not
        # stripped under `python -O`; the exception type is unchanged.
        raise AssertionError("Length mismatch")

    # An empty expression is treated as one character to avoid dividing by 0.
    ratios = [
        tlen / max(len(expr), 1)
        for expr, tlen in zip(expressions, tokenized_lengths)
    ]

    # Empty input yields a neutral result instead of ZeroDivisionError.
    mean_scr = sum(ratios) / len(ratios) if ratios else 0.0
    return MetricResult(
        name="SCR",
        value=mean_scr,
        description="Structural Compression Ratio (tokens / chars); lower = more compressed",
        details={
            "min": min(ratios, default=0.0),
            "max": max(ratios, default=0.0),
            "std": _std(ratios),
            "n": len(ratios),
        },
    )
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
# ── Metric 2: Canonical Consistency Score (CCS) ──────────────────────────
|
| 118 |
+
|
| 119 |
+
def canonical_consistency_score(
    equivalent_pairs: list[tuple[str, str]],
    tokenize_fn: Callable[[str], list[str]],
) -> MetricResult:
    """
    CCS = mean( Jaccard(tokens_A, tokens_B) ) over equivalent pairs.

    Scores how much the token sets of two mathematically equivalent
    expressions overlap; a CCS of 1.0 means every pair produced the same
    token set.

    Tokens that start with "[" are dropped before comparing.  A pair whose
    filtered token sets are both empty scores 1.0; a pair on which
    anything raises scores 0.0 (logged at debug level).

    Parameters
    ----------
    equivalent_pairs : list of (expr_A, expr_B) that are mathematically equal
    tokenize_fn      : function str → list[str]  (the tokenizer under test)
    """

    def without_bracket_tokens(tokens: set) -> set:
        # Tokens starting with "[" are treated as boundary markers.
        return {tok for tok in tokens if not tok.startswith("[")}

    scores = []
    for expr_a, expr_b in equivalent_pairs:
        try:
            left = set(tokenize_fn(expr_a))
            right = set(tokenize_fn(expr_b))
            left = without_bracket_tokens(left)
            right = without_bracket_tokens(right)

            merged = left | right
            # An empty union means both sides are empty: perfectly consistent.
            pair_score = len(left & right) / len(merged) if merged else 1.0
            scores.append(pair_score)
        except Exception as exc:
            logger.debug("CCS: failed on pair (%s, %s): %s", expr_a[:30], expr_b[:30], exc)
            scores.append(0.0)

    if scores:
        mean_ccs = sum(scores) / len(scores)
    else:
        mean_ccs = 0.0

    return MetricResult(
        name="CCS",
        value=mean_ccs,
        description="Canonical Consistency Score — Jaccard overlap for equivalent forms (higher is better)",
        details={"scores": scores[:20], "n": len(scores), "std": _std(scores)},
    )
|
| 159 |
+
|
| 160 |
+
|
| 161 |
+
# ── Metric 3: Operator Preservation Score (OPS) ──────────────────────────
|
| 162 |
+
|
| 163 |
+
def operator_preservation_score(
    expressions: list[str],
    tokenize_fn: Callable[[str], list[str]],
    expected_operators: Optional[list[set[str]]] = None,
) -> MetricResult:
    """
    OPS = fraction of expressions where all expected operator tokens appear.

    If expected_operators is not provided, the expected operators are
    guessed from the lower-cased raw expression string:

    * ``+``, ``^``, ``**`` and ``/`` are detected as substrings;
    * ``*`` counts as multiplication only when it is not part of ``**``
      (otherwise every ``x**2`` would wrongly demand an OP_MUL token);
    * function / operator names (sin, cos, log, sum, ...) must not be
      embedded in a longer run of letters, so ``sinh``, ``cosh`` and
      ``atan`` no longer imply FUNC_SIN, FUNC_COS and FUNC_TAN.  ``lim``
      also matches ``limit``.

    Expressions with no expected operators are skipped.  A tokenizer
    failure counts the expression as not preserved.  When nothing could be
    evaluated the score is 1.0.

    Parameters
    ----------
    expressions        : list of raw expression strings
    tokenize_fn        : str → list[str]
    expected_operators : optional list of sets of expected operator tokens,
                         indexed in parallel with ``expressions``
    """
    import re  # local import: this module's header does not import `re`

    # Symbols that can safely be detected by plain substring search.
    symbol_ops: dict[str, str] = {
        "+": "OP_ADD", "^": "OP_POW", "**": "OP_POW", "/": "FRAC",
    }
    # Name fragments (regex bodies) and the token each one implies.
    name_ops: dict[str, str] = {
        "sin": "FUNC_SIN", "cos": "FUNC_COS", "tan": "FUNC_TAN",
        "log": "FUNC_LOG", "exp": "FUNC_EXP", "sqrt": "FUNC_SQRT",
        "diff": "OP_DERIV", "integrate": "OP_INT",
        "lim(?:it)?": "OP_LIMIT", "sum": "OP_SUM",
        "factorial": "FUNC_FACTORIAL",
    }
    # A name only counts when no letter touches it on either side.
    name_patterns = [
        (re.compile(rf"(?<![a-z]){body}(?![a-z])"), op_tok)
        for body, op_tok in name_ops.items()
    ]

    preserved = 0
    total = 0

    for i, expr in enumerate(expressions):
        if expected_operators is not None:
            expected = expected_operators[i]
        else:
            # Heuristic: derive expected operators from the raw expression.
            expr_lower = expr.lower()
            expected = {
                op_tok for symbol, op_tok in symbol_ops.items()
                if symbol in expr_lower
            }
            # Ignore the asterisks that belong to the power operator "**".
            if "*" in expr_lower.replace("**", ""):
                expected.add("OP_MUL")
            expected.update(
                op_tok for pattern, op_tok in name_patterns
                if pattern.search(expr_lower)
            )

        if not expected:
            continue  # skip if we can't determine expected operators

        try:
            tokens = set(tokenize_fn(expr))
        except Exception as exc:
            # Best effort: a failing tokenizer preserves nothing, but the
            # failure is logged like the sibling metrics do.
            logger.debug("OPS: tokenizer failed on %s: %s", expr[:30], exc)
            tokens = set()

        if expected.issubset(tokens):
            preserved += 1
        total += 1

    ops_value = preserved / total if total > 0 else 1.0
    return MetricResult(
        name="OPS",
        value=ops_value,
        description="Operator Preservation Score — % of expressions with all expected ops (higher is better)",
        details={"preserved": preserved, "total": total},
    )
|
| 221 |
+
|
| 222 |
+
|
| 223 |
+
# ── Metric 4: Token Stability (TS) ───────────────────────────────────────
|
| 224 |
+
|
| 225 |
+
def token_stability(
    expression_groups: list[list[str]],
    tokenize_fn: Callable[[str], list[str]],
) -> MetricResult:
    """
    TS = 1 - mean( CoV(token_count) )  where CoV = std/mean.

    Scores how little the token count varies across syntactic rewritings
    of the same expression; 1.0 means the count never changes.  The result
    is clamped so it is never negative.

    An expression on which the tokenizer raises contributes a length of 0.
    Groups with fewer than two members, or whose lengths sum to 0, are
    left out of the average.

    Parameters
    ----------
    expression_groups : list of groups; each group = rewritings of one expr
    tokenize_fn       : str → list[str]
    """

    def token_count(expr: str) -> int:
        # A tokenizer failure is recorded as zero tokens.
        try:
            return len(tokenize_fn(expr))
        except Exception:
            return 0

    covs = []
    for group in expression_groups:
        counts = [token_count(expr) for expr in group]
        if len(counts) >= 2 and sum(counts) != 0:
            mean_count = sum(counts) / len(counts)
            spread = _std(counts)
            covs.append(spread / mean_count if mean_count > 0 else 0.0)

    if covs:
        mean_cov = sum(covs) / len(covs)
    else:
        mean_cov = 0.0
    ts_value = max(0.0, 1.0 - mean_cov)

    return MetricResult(
        name="TS",
        value=ts_value,
        description="Token Stability — 1 - CoV(token count across rewritings) (higher is better)",
        details={"mean_cov": mean_cov, "n_groups": len(covs)},
    )
|
| 263 |
+
|
| 264 |
+
|
| 265 |
+
# ── Metric 5: Tree Depth Fidelity (TDF) ──────────────────────────────────
|
| 266 |
+
|
| 267 |
+
def tree_depth_fidelity(
    expressions: list[str],
    tokenize_fn_with_meta: Callable,  # returns TokenizedOutput
    expected_depth_fn: Optional[Callable] = None,
) -> MetricResult:
    """
    TDF = 1 - mean( |actual_max_depth - expected_max_depth| / expected_max_depth )

    Compares the largest non-negative ``depth`` found in the tokenizer's
    metadata with a ground-truth depth.  Each per-expression relative
    error is capped at 1.0, and the final score is clamped at 0.0.

    Per-expression handling:

    * no metadata → skipped;
    * no ``expected_depth_fn`` and no successful first canon result →
      skipped;
    * expected depth of 0 → counted as zero error;
    * any exception → counted as the maximum error of 1.0 (logged at
      debug level).

    Parameters
    ----------
    expressions           : list of expression strings
    tokenize_fn_with_meta : pipeline.encode() or equivalent; its result
                            must expose ``metadata`` and ``canon_results``
    expected_depth_fn     : optional callable(expr) → int for ground-truth
                            depth.  If None, the depth of the expression
                            tree in ``canon_results[0].expr`` is used.
    """
    errors = []

    for expr in expressions:
        try:
            out = tokenize_fn_with_meta(expr)
            if not out.metadata:
                continue
            actual_depth = max(
                (m.depth for m in out.metadata if m.depth >= 0), default=0
            )

            if expected_depth_fn is not None:
                expected_depth = expected_depth_fn(expr)
            elif out.canon_results and out.canon_results[0].success:
                # Ground truth: height of the first canonicalised tree.
                # Only `.args` is walked, so nothing is imported here.
                expected_depth = _sympy_depth(out.canon_results[0].expr)
            else:
                continue

            if expected_depth == 0:
                # Relative error is undefined for depth 0; scored as exact.
                errors.append(0.0)
            else:
                rel_err = abs(actual_depth - expected_depth) / expected_depth
                errors.append(min(rel_err, 1.0))
        except Exception as exc:
            logger.debug("TDF: error on %s: %s", expr[:30], exc)
            errors.append(1.0)

    mean_err = sum(errors) / len(errors) if errors else 0.0
    tdf_value = max(0.0, 1.0 - mean_err)
    return MetricResult(
        name="TDF",
        value=tdf_value,
        description="Tree Depth Fidelity — accuracy of depth metadata vs ground truth (higher is better)",
        details={"mean_relative_error": mean_err, "n": len(errors)},
    )
|
| 322 |
+
|
| 323 |
+
|
| 324 |
+
# ── Baseline comparators ──────────────────────────────────────────────────
|
| 325 |
+
|
| 326 |
+
def tokenize_character_level(expr: str) -> list[str]:
    """Baseline tokenizer: split *expr* into its individual characters."""
    return [char for char in expr]
|
| 329 |
+
|
| 330 |
+
|
| 331 |
+
def make_gpt2_tokenizer():
    """Build a ``str -> tokens`` callable backed by the "gpt2" tokenizer.

    Requires ``transformers``.  If importing it or loading the tokenizer
    fails for any reason, a warning is logged and the character-level
    baseline tokenizer is returned instead.
    """
    try:
        from transformers import AutoTokenizer

        gpt2 = AutoTokenizer.from_pretrained("gpt2")
    except Exception:
        logger.warning("GPT-2 tokenizer not available; using character baseline.")
        return tokenize_character_level

    return lambda text: gpt2.tokenize(text)
|
| 340 |
+
|
| 341 |
+
|
| 342 |
+
def make_sentencepiece_tokenizer(model_path: str):
    """Build a ``str -> pieces`` callable from a SentencePiece model file.

    If importing ``sentencepiece`` or loading *model_path* fails for any
    reason, a warning is logged and the character-level baseline
    tokenizer is returned instead.
    """
    try:
        import sentencepiece as spm

        processor = spm.SentencePieceProcessor(model_file=model_path)
    except Exception:
        logger.warning("SentencePiece not available.")
        return tokenize_character_level

    return lambda text: processor.encode(text, out_type=str)
|
| 351 |
+
|
| 352 |
+
|
| 353 |
+
# ── Utility helpers ───────────────────────────────────────────────────────
|
| 354 |
+
|
| 355 |
+
def _std(values: list[float]) -> float:
|
| 356 |
+
if len(values) < 2:
|
| 357 |
+
return 0.0
|
| 358 |
+
mu = sum(values) / len(values)
|
| 359 |
+
var = sum((v - mu) ** 2 for v in values) / (len(values) - 1)
|
| 360 |
+
return math.sqrt(var)
|
| 361 |
+
|
| 362 |
+
|
| 363 |
+
def _sympy_depth(expr) -> int:
|
| 364 |
+
"""Compute tree depth of a SymPy expression."""
|
| 365 |
+
if not expr.args:
|
| 366 |
+
return 0
|
| 367 |
+
return 1 + max(_sympy_depth(a) for a in expr.args)
|
evaluation/results/comparison_results.jsonl
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"expression": "(x+1)^2", "category": "standard", "sexp": "(OP_ADD CONST_1 (OP_POW VAR_X CONST_2) (OP_MUL CONST_2 VAR_X))", "mathtok": {"name": "MathTok", "token_count": 10, "operator_nodes": 3, "tree_depth": 2, "parent_child_relations": 3, "function_scope": 0, "canonical_bonus": 2, "structural_score": 10, "raw_scr": 1.0, "semantic_density": 0.8, "structural_efficiency": 0.3}, "gpt2": {"name": "GPT-2", "token_count": 7, "operator_nodes": 2, "tree_depth": 1, "parent_child_relations": 2, "function_scope": 0, "canonical_bonus": 0, "structural_score": 5, "raw_scr": 0.7142857142857143, "semantic_density": 0.2857142857142857, "structural_efficiency": 0.2857142857142857}, "sentencepiece": {"name": "SentencePiece", "token_count": 4, "operator_nodes": 0, "tree_depth": 1, "parent_child_relations": 0, "function_scope": 0, "canonical_bonus": 0, "structural_score": 1, "raw_scr": 0.25, "semantic_density": 0.0, "structural_efficiency": 0.0}, "char_level": {"name": "CharLevel", "token_count": 7, "operator_nodes": 2, "tree_depth": 1, "parent_child_relations": 2, "function_scope": 0, "canonical_bonus": 0, "structural_score": 5, "raw_scr": 0.7142857142857143, "semantic_density": 0.2857142857142857, "structural_efficiency": 0.2857142857142857}, "scr_improvement_vs_gpt2": 1.4, "scr_improvement_vs_sp": 4.0, "scr_improvement_vs_char": 1.4, "notes": []}
|
| 2 |
+
{"expression": "sin(x^2) + 3*x", "category": "standard", "sexp": "(OP_ADD (OP_MUL CONST_3 VAR_X) (FUNC_SIN (OP_POW VAR_X CONST_2)))", "mathtok": {"name": "MathTok", "token_count": 12, "operator_nodes": 3, "tree_depth": 3, "parent_child_relations": 4, "function_scope": 1, "canonical_bonus": 2, "structural_score": 13, "raw_scr": 1.0833333333333333, "semantic_density": 0.6666666666666666, "structural_efficiency": 0.3333333333333333}, "gpt2": {"name": "GPT-2", "token_count": 10, "operator_nodes": 2, "tree_depth": 1, "parent_child_relations": 2, "function_scope": 1, "canonical_bonus": 0, "structural_score": 6, "raw_scr": 0.6, "semantic_density": 0.3, "structural_efficiency": 0.2}, "sentencepiece": {"name": "SentencePiece", "token_count": 12, "operator_nodes": 2, "tree_depth": 1, "parent_child_relations": 2, "function_scope": 0, "canonical_bonus": 0, "structural_score": 5, "raw_scr": 0.4166666666666667, "semantic_density": 0.16666666666666666, "structural_efficiency": 0.16666666666666666}, "char_level": {"name": "CharLevel", "token_count": 14, "operator_nodes": 3, "tree_depth": 1, "parent_child_relations": 3, "function_scope": 0, "canonical_bonus": 0, "structural_score": 7, "raw_scr": 0.5, "semantic_density": 0.21428571428571427, "structural_efficiency": 0.21428571428571427}, "scr_improvement_vs_gpt2": 1.8055555555555556, "scr_improvement_vs_sp": 2.5999999999999996, "scr_improvement_vs_char": 2.1666666666666665, "notes": []}
|
| 3 |
+
{"expression": "x^2 + 2*x + 1", "category": "standard", "sexp": "(OP_ADD CONST_1 (OP_POW VAR_X CONST_2) (OP_MUL CONST_2 VAR_X))", "mathtok": {"name": "MathTok", "token_count": 10, "operator_nodes": 3, "tree_depth": 2, "parent_child_relations": 3, "function_scope": 0, "canonical_bonus": 2, "structural_score": 10, "raw_scr": 1.0, "semantic_density": 0.8, "structural_efficiency": 0.3}, "gpt2": {"name": "GPT-2", "token_count": 9, "operator_nodes": 2, "tree_depth": 0, "parent_child_relations": 2, "function_scope": 0, "canonical_bonus": 0, "structural_score": 4, "raw_scr": 0.4444444444444444, "semantic_density": 0.2222222222222222, "structural_efficiency": 0.2222222222222222}, "sentencepiece": {"name": "SentencePiece", "token_count": 9, "operator_nodes": 2, "tree_depth": 0, "parent_child_relations": 2, "function_scope": 0, "canonical_bonus": 0, "structural_score": 4, "raw_scr": 0.4444444444444444, "semantic_density": 0.2222222222222222, "structural_efficiency": 0.2222222222222222}, "char_level": {"name": "CharLevel", "token_count": 13, "operator_nodes": 4, "tree_depth": 0, "parent_child_relations": 4, "function_scope": 0, "canonical_bonus": 0, "structural_score": 8, "raw_scr": 0.6153846153846154, "semantic_density": 0.3076923076923077, "structural_efficiency": 0.3076923076923077}, "scr_improvement_vs_gpt2": 2.25, "scr_improvement_vs_sp": 2.25, "scr_improvement_vs_char": 1.625, "notes": []}
|
| 4 |
+
{"expression": "exp(-x^2/2)", "category": "standard", "sexp": "(FUNC_EXP (OP_MUL (FRAC (OP_NEG CONST_1) CONST_2) (OP_POW VAR_X CONST_2)))", "mathtok": {"name": "MathTok", "token_count": 13, "operator_nodes": 4, "tree_depth": 4, "parent_child_relations": 5, "function_scope": 1, "canonical_bonus": 2, "structural_score": 16, "raw_scr": 1.2307692307692308, "semantic_density": 0.6923076923076923, "structural_efficiency": 0.38461538461538464}, "gpt2": {"name": "GPT-2", "token_count": 8, "operator_nodes": 2, "tree_depth": 0, "parent_child_relations": 2, "function_scope": 1, "canonical_bonus": 0, "structural_score": 5, "raw_scr": 0.625, "semantic_density": 0.375, "structural_efficiency": 0.25}, "sentencepiece": {"name": "SentencePiece", "token_count": 9, "operator_nodes": 1, "tree_depth": 0, "parent_child_relations": 1, "function_scope": 0, "canonical_bonus": 0, "structural_score": 2, "raw_scr": 0.2222222222222222, "semantic_density": 0.1111111111111111, "structural_efficiency": 0.1111111111111111}, "char_level": {"name": "CharLevel", "token_count": 11, "operator_nodes": 3, "tree_depth": 1, "parent_child_relations": 3, "function_scope": 0, "canonical_bonus": 0, "structural_score": 7, "raw_scr": 0.6363636363636364, "semantic_density": 0.2727272727272727, "structural_efficiency": 0.2727272727272727}, "scr_improvement_vs_gpt2": 1.9692307692307693, "scr_improvement_vs_sp": 5.538461538461539, "scr_improvement_vs_char": 1.9340659340659343, "notes": []}
|
| 5 |
+
{"expression": "1/(1 + exp(-x))", "category": "standard", "sexp": "(OP_MUL (OP_RECIP (OP_ADD CONST_1 (FUNC_EXP VAR_X))) (FUNC_EXP VAR_X))", "mathtok": {"name": "MathTok", "token_count": 14, "operator_nodes": 3, "tree_depth": 4, "parent_child_relations": 5, "function_scope": 2, "canonical_bonus": 2, "structural_score": 16, "raw_scr": 1.1428571428571428, "semantic_density": 0.5714285714285714, "structural_efficiency": 0.35714285714285715}, "gpt2": {"name": "GPT-2", "token_count": 8, "operator_nodes": 0, "tree_depth": 0, "parent_child_relations": 0, "function_scope": 0, "canonical_bonus": 0, "structural_score": 0, "raw_scr": 0.0, "semantic_density": 0.0, "structural_efficiency": 0.0}, "sentencepiece": {"name": "SentencePiece", "token_count": 12, "operator_nodes": 2, "tree_depth": 1, "parent_child_relations": 2, "function_scope": 0, "canonical_bonus": 0, "structural_score": 5, "raw_scr": 0.4166666666666667, "semantic_density": 0.16666666666666666, "structural_efficiency": 0.16666666666666666}, "char_level": {"name": "CharLevel", "token_count": 15, "operator_nodes": 3, "tree_depth": 2, "parent_child_relations": 3, "function_scope": 0, "canonical_bonus": 0, "structural_score": 8, "raw_scr": 0.5333333333333333, "semantic_density": 0.2, "structural_efficiency": 0.2}, "scr_improvement_vs_gpt2": null, "scr_improvement_vs_sp": 2.7428571428571424, "scr_improvement_vs_char": 2.142857142857143, "notes": []}
|
| 6 |
+
{"expression": "log(x*y)", "category": "standard", "sexp": "(FUNC_LOG (OP_MUL VAR_X VAR_Y))", "mathtok": {"name": "MathTok", "token_count": 8, "operator_nodes": 1, "tree_depth": 2, "parent_child_relations": 2, "function_scope": 1, "canonical_bonus": 2, "structural_score": 8, "raw_scr": 1.0, "semantic_density": 0.5, "structural_efficiency": 0.25}, "gpt2": {"name": "GPT-2", "token_count": 6, "operator_nodes": 1, "tree_depth": 1, "parent_child_relations": 1, "function_scope": 1, "canonical_bonus": 0, "structural_score": 4, "raw_scr": 0.6666666666666666, "semantic_density": 0.3333333333333333, "structural_efficiency": 0.16666666666666666}, "sentencepiece": {"name": "SentencePiece", "token_count": 8, "operator_nodes": 1, "tree_depth": 1, "parent_child_relations": 1, "function_scope": 0, "canonical_bonus": 0, "structural_score": 3, "raw_scr": 0.375, "semantic_density": 0.125, "structural_efficiency": 0.125}, "char_level": {"name": "CharLevel", "token_count": 8, "operator_nodes": 1, "tree_depth": 1, "parent_child_relations": 1, "function_scope": 0, "canonical_bonus": 0, "structural_score": 3, "raw_scr": 0.375, "semantic_density": 0.125, "structural_efficiency": 0.125}, "scr_improvement_vs_gpt2": 1.5, "scr_improvement_vs_sp": 2.6666666666666665, "scr_improvement_vs_char": 2.6666666666666665, "notes": []}
|
| 7 |
+
{"expression": "sqrt(a^2 + b^2)", "category": "standard", "sexp": "(OP_POW (OP_ADD (OP_POW VAR_A CONST_2) (OP_POW VAR_B CONST_2)) (FRAC CONST_1 CONST_2))", "mathtok": {"name": "MathTok", "token_count": 13, "operator_nodes": 5, "tree_depth": 3, "parent_child_relations": 5, "function_scope": 0, "canonical_bonus": 2, "structural_score": 15, "raw_scr": 1.1538461538461537, "semantic_density": 0.8461538461538461, "structural_efficiency": 0.38461538461538464}, "gpt2": {"name": "GPT-2", "token_count": 11, "operator_nodes": 2, "tree_depth": 1, "parent_child_relations": 2, "function_scope": 0, "canonical_bonus": 0, "structural_score": 5, "raw_scr": 0.45454545454545453, "semantic_density": 0.18181818181818182, "structural_efficiency": 0.18181818181818182}, "sentencepiece": {"name": "SentencePiece", "token_count": 9, "operator_nodes": 1, "tree_depth": 1, "parent_child_relations": 1, "function_scope": 1, "canonical_bonus": 0, "structural_score": 4, "raw_scr": 0.4444444444444444, "semantic_density": 0.2222222222222222, "structural_efficiency": 0.1111111111111111}, "char_level": {"name": "CharLevel", "token_count": 15, "operator_nodes": 3, "tree_depth": 1, "parent_child_relations": 3, "function_scope": 0, "canonical_bonus": 0, "structural_score": 7, "raw_scr": 0.4666666666666667, "semantic_density": 0.2, "structural_efficiency": 0.2}, "scr_improvement_vs_gpt2": 2.5384615384615383, "scr_improvement_vs_sp": 2.5961538461538463, "scr_improvement_vs_char": 2.472527472527472, "notes": []}
|
| 8 |
+
{"expression": "n*(n+1)/2", "category": "standard", "sexp": "(OP_MUL (FRAC CONST_1 CONST_2) VAR_N (OP_ADD CONST_1 VAR_N))", "mathtok": {"name": "MathTok", "token_count": 10, "operator_nodes": 3, "tree_depth": 2, "parent_child_relations": 3, "function_scope": 0, "canonical_bonus": 2, "structural_score": 10, "raw_scr": 1.0, "semantic_density": 0.8, "structural_efficiency": 0.3}, "gpt2": {"name": "GPT-2", "token_count": 8, "operator_nodes": 2, "tree_depth": 1, "parent_child_relations": 2, "function_scope": 0, "canonical_bonus": 0, "structural_score": 5, "raw_scr": 0.625, "semantic_density": 0.25, "structural_efficiency": 0.25}, "sentencepiece": {"name": "SentencePiece", "token_count": 10, "operator_nodes": 3, "tree_depth": 1, "parent_child_relations": 3, "function_scope": 0, "canonical_bonus": 0, "structural_score": 7, "raw_scr": 0.7, "semantic_density": 0.3, "structural_efficiency": 0.3}, "char_level": {"name": "CharLevel", "token_count": 9, "operator_nodes": 3, "tree_depth": 1, "parent_child_relations": 3, "function_scope": 0, "canonical_bonus": 0, "structural_score": 7, "raw_scr": 0.7777777777777778, "semantic_density": 0.3333333333333333, "structural_efficiency": 0.3333333333333333}, "scr_improvement_vs_gpt2": 1.6, "scr_improvement_vs_sp": 1.4285714285714286, "scr_improvement_vs_char": 1.2857142857142856, "notes": []}
|
| 9 |
+
{"expression": "factorial(n)", "category": "standard", "sexp": "(FUNC_FACTORIAL VAR_N)", "mathtok": {"name": "MathTok", "token_count": 6, "operator_nodes": 0, "tree_depth": 1, "parent_child_relations": 1, "function_scope": 1, "canonical_bonus": 2, "structural_score": 5, "raw_scr": 0.8333333333333334, "semantic_density": 0.3333333333333333, "structural_efficiency": 0.16666666666666666}, "gpt2": {"name": "GPT-2", "token_count": 5, "operator_nodes": 0, "tree_depth": 1, "parent_child_relations": 0, "function_scope": 0, "canonical_bonus": 0, "structural_score": 1, "raw_scr": 0.2, "semantic_density": 0.0, "structural_efficiency": 0.0}, "sentencepiece": {"name": "SentencePiece", "token_count": 11, "operator_nodes": 0, "tree_depth": 1, "parent_child_relations": 0, "function_scope": 0, "canonical_bonus": 0, "structural_score": 1, "raw_scr": 0.09090909090909091, "semantic_density": 0.0, "structural_efficiency": 0.0}, "char_level": {"name": "CharLevel", "token_count": 12, "operator_nodes": 0, "tree_depth": 1, "parent_child_relations": 0, "function_scope": 0, "canonical_bonus": 0, "structural_score": 1, "raw_scr": 0.08333333333333333, "semantic_density": 0.0, "structural_efficiency": 0.0}, "scr_improvement_vs_gpt2": 4.166666666666667, "scr_improvement_vs_sp": 9.166666666666666, "scr_improvement_vs_char": 10.000000000000002, "notes": []}
|
| 10 |
+
{"expression": "diff(sin(x), x)", "category": "standard", "sexp": "(FUNC_COS VAR_X)", "mathtok": {"name": "MathTok", "token_count": 6, "operator_nodes": 0, "tree_depth": 1, "parent_child_relations": 1, "function_scope": 1, "canonical_bonus": 2, "structural_score": 5, "raw_scr": 0.8333333333333334, "semantic_density": 0.3333333333333333, "structural_efficiency": 0.16666666666666666}, "gpt2": {"name": "GPT-2", "token_count": 8, "operator_nodes": 0, "tree_depth": 2, "parent_child_relations": 0, "function_scope": 2, "canonical_bonus": 0, "structural_score": 4, "raw_scr": 0.5, "semantic_density": 0.25, "structural_efficiency": 0.0}, "sentencepiece": {"name": "SentencePiece", "token_count": 14, "operator_nodes": 0, "tree_depth": 2, "parent_child_relations": 0, "function_scope": 0, "canonical_bonus": 0, "structural_score": 2, "raw_scr": 0.14285714285714285, "semantic_density": 0.0, "structural_efficiency": 0.0}, "char_level": {"name": "CharLevel", "token_count": 15, "operator_nodes": 0, "tree_depth": 2, "parent_child_relations": 0, "function_scope": 0, "canonical_bonus": 0, "structural_score": 2, "raw_scr": 0.13333333333333333, "semantic_density": 0.0, "structural_efficiency": 0.0}, "scr_improvement_vs_gpt2": 1.6666666666666667, "scr_improvement_vs_sp": 5.833333333333334, "scr_improvement_vs_char": 6.25, "notes": []}
|
| 11 |
+
{"expression": "integrate(x^2, x)", "category": "standard", "sexp": "(OP_MUL (FRAC CONST_1 CONST_3) (OP_POW VAR_X CONST_3))", "mathtok": {"name": "MathTok", "token_count": 9, "operator_nodes": 3, "tree_depth": 2, "parent_child_relations": 3, "function_scope": 0, "canonical_bonus": 2, "structural_score": 10, "raw_scr": 1.1111111111111112, "semantic_density": 0.7777777777777778, "structural_efficiency": 0.3333333333333333}, "gpt2": {"name": "GPT-2", "token_count": 9, "operator_nodes": 1, "tree_depth": 1, "parent_child_relations": 1, "function_scope": 0, "canonical_bonus": 0, "structural_score": 3, "raw_scr": 0.3333333333333333, "semantic_density": 0.1111111111111111, "structural_efficiency": 0.1111111111111111}, "sentencepiece": {"name": "SentencePiece", "token_count": 13, "operator_nodes": 0, "tree_depth": 1, "parent_child_relations": 0, "function_scope": 0, "canonical_bonus": 0, "structural_score": 1, "raw_scr": 0.07692307692307693, "semantic_density": 0.0, "structural_efficiency": 0.0}, "char_level": {"name": "CharLevel", "token_count": 17, "operator_nodes": 1, "tree_depth": 1, "parent_child_relations": 1, "function_scope": 0, "canonical_bonus": 0, "structural_score": 3, "raw_scr": 0.17647058823529413, "semantic_density": 0.058823529411764705, "structural_efficiency": 0.058823529411764705}, "scr_improvement_vs_gpt2": 3.3333333333333335, "scr_improvement_vs_sp": 14.444444444444445, "scr_improvement_vs_char": 6.296296296296296, "notes": []}
|
| 12 |
+
{"expression": "limit(sin(x)/x, x, 0)", "category": "standard", "sexp": "CONST_1", "mathtok": {"name": "MathTok", "token_count": 3, "operator_nodes": 0, "tree_depth": 0, "parent_child_relations": 0, "function_scope": 0, "canonical_bonus": 2, "structural_score": 2, "raw_scr": 0.6666666666666666, "semantic_density": 0.3333333333333333, "structural_efficiency": 0.0}, "gpt2": {"name": "GPT-2", "token_count": 12, "operator_nodes": 0, "tree_depth": 2, "parent_child_relations": 0, "function_scope": 1, "canonical_bonus": 0, "structural_score": 3, "raw_scr": 0.25, "semantic_density": 0.08333333333333333, "structural_efficiency": 0.0}, "sentencepiece": {"name": "SentencePiece", "token_count": 18, "operator_nodes": 1, "tree_depth": 2, "parent_child_relations": 1, "function_scope": 0, "canonical_bonus": 0, "structural_score": 4, "raw_scr": 0.2222222222222222, "semantic_density": 0.05555555555555555, "structural_efficiency": 0.05555555555555555}, "char_level": {"name": "CharLevel", "token_count": 21, "operator_nodes": 1, "tree_depth": 2, "parent_child_relations": 1, "function_scope": 0, "canonical_bonus": 0, "structural_score": 4, "raw_scr": 0.19047619047619047, "semantic_density": 0.047619047619047616, "structural_efficiency": 0.047619047619047616}, "scr_improvement_vs_gpt2": 2.6666666666666665, "scr_improvement_vs_sp": 3.0, "scr_improvement_vs_char": 3.5, "notes": []}
|
| 13 |
+
{"expression": "a^2 - b^2", "category": "standard", "sexp": "(OP_ADD (OP_POW VAR_A CONST_2) (OP_NEG (OP_POW VAR_B CONST_2)))", "mathtok": {"name": "MathTok", "token_count": 10, "operator_nodes": 4, "tree_depth": 3, "parent_child_relations": 4, "function_scope": 0, "canonical_bonus": 2, "structural_score": 13, "raw_scr": 1.3, "semantic_density": 0.8, "structural_efficiency": 0.4}, "gpt2": {"name": "GPT-2", "token_count": 7, "operator_nodes": 2, "tree_depth": 0, "parent_child_relations": 2, "function_scope": 0, "canonical_bonus": 0, "structural_score": 4, "raw_scr": 0.5714285714285714, "semantic_density": 0.2857142857142857, "structural_efficiency": 0.2857142857142857}, "sentencepiece": {"name": "SentencePiece", "token_count": 6, "operator_nodes": 1, "tree_depth": 0, "parent_child_relations": 1, "function_scope": 0, "canonical_bonus": 0, "structural_score": 2, "raw_scr": 0.3333333333333333, "semantic_density": 0.16666666666666666, "structural_efficiency": 0.16666666666666666}, "char_level": {"name": "CharLevel", "token_count": 9, "operator_nodes": 3, "tree_depth": 0, "parent_child_relations": 3, "function_scope": 0, "canonical_bonus": 0, "structural_score": 6, "raw_scr": 0.6666666666666666, "semantic_density": 0.3333333333333333, "structural_efficiency": 0.3333333333333333}, "scr_improvement_vs_gpt2": 2.2750000000000004, "scr_improvement_vs_sp": 3.9000000000000004, "scr_improvement_vs_char": 1.9500000000000002, "notes": []}
|
| 14 |
+
{"expression": "(-b + sqrt(b^2 - 4*a*c)) / (2*a)", "category": "standard", "sexp": "(OP_MUL (FRAC CONST_1 CONST_2) (OP_RECIP VAR_A) (OP_ADD (OP_POW (OP_ADD (OP_POW VAR_B CONST_2) (OP_MUL (OP_NEG CONST_4) VAR_A VAR_C)) (FRAC CONST_1 CONST_2)) (OP_NEG VAR_B)))", "mathtok": {"name": "MathTok", "token_count": 24, "operator_nodes": 11, "tree_depth": 6, "parent_child_relations": 11, "function_scope": 0, "canonical_bonus": 2, "structural_score": 30, "raw_scr": 1.25, "semantic_density": 0.9166666666666666, "structural_efficiency": 0.4583333333333333}, "gpt2": {"name": "GPT-2", "token_count": 22, "operator_nodes": 4, "tree_depth": 1, "parent_child_relations": 4, "function_scope": 0, "canonical_bonus": 0, "structural_score": 9, "raw_scr": 0.4090909090909091, "semantic_density": 0.18181818181818182, "structural_efficiency": 0.18181818181818182}, "sentencepiece": {"name": "SentencePiece", "token_count": 27, "operator_nodes": 6, "tree_depth": 2, "parent_child_relations": 6, "function_scope": 1, "canonical_bonus": 0, "structural_score": 15, "raw_scr": 0.5555555555555556, "semantic_density": 0.25925925925925924, "structural_efficiency": 0.2222222222222222}, "char_level": {"name": "CharLevel", "token_count": 32, "operator_nodes": 8, "tree_depth": 2, "parent_child_relations": 8, "function_scope": 0, "canonical_bonus": 0, "structural_score": 18, "raw_scr": 0.5625, "semantic_density": 0.25, "structural_efficiency": 0.25}, "scr_improvement_vs_gpt2": 3.0555555555555554, "scr_improvement_vs_sp": 2.25, "scr_improvement_vs_char": 2.2222222222222223, "notes": []}
|
| 15 |
+
{"expression": "sum(k^2, k, 1, n)", "category": "standard", "sexp": "[UNK]", "mathtok": {"name": "MathTok", "token_count": 1, "operator_nodes": 0, "tree_depth": 0, "parent_child_relations": 0, "function_scope": 0, "canonical_bonus": 0, "structural_score": 0, "raw_scr": 0.0, "semantic_density": 0.0, "structural_efficiency": 0.0}, "gpt2": {"name": "GPT-2", "token_count": 12, "operator_nodes": 1, "tree_depth": 1, "parent_child_relations": 1, "function_scope": 1, "canonical_bonus": 0, "structural_score": 4, "raw_scr": 0.3333333333333333, "semantic_density": 0.16666666666666666, "structural_efficiency": 0.08333333333333333}, "sentencepiece": {"name": "SentencePiece", "token_count": 14, "operator_nodes": 0, "tree_depth": 1, "parent_child_relations": 0, "function_scope": 1, "canonical_bonus": 0, "structural_score": 2, "raw_scr": 0.14285714285714285, "semantic_density": 0.07142857142857142, "structural_efficiency": 0.0}, "char_level": {"name": "CharLevel", "token_count": 17, "operator_nodes": 1, "tree_depth": 1, "parent_child_relations": 1, "function_scope": 0, "canonical_bonus": 0, "structural_score": 3, "raw_scr": 0.17647058823529413, "semantic_density": 0.058823529411764705, "structural_efficiency": 0.058823529411764705}, "scr_improvement_vs_gpt2": 0.0, "scr_improvement_vs_sp": 0.0, "scr_improvement_vs_char": 0.0, "notes": []}
|
| 16 |
+
{"expression": "sin(cos(x^2 + 1))", "category": "deep_nesting", "sexp": "(FUNC_SIN (FUNC_COS (OP_ADD CONST_1 (OP_POW VAR_X CONST_2))))", "mathtok": {"name": "MathTok", "token_count": 13, "operator_nodes": 2, "tree_depth": 4, "parent_child_relations": 4, "function_scope": 2, "canonical_bonus": 2, "structural_score": 14, "raw_scr": 1.0769230769230769, "semantic_density": 0.5384615384615384, "structural_efficiency": 0.3076923076923077}, "gpt2": {"name": "GPT-2", "token_count": 10, "operator_nodes": 1, "tree_depth": 2, "parent_child_relations": 1, "function_scope": 2, "canonical_bonus": 0, "structural_score": 6, "raw_scr": 0.6, "semantic_density": 0.3, "structural_efficiency": 0.1}, "sentencepiece": {"name": "SentencePiece", "token_count": 14, "operator_nodes": 1, "tree_depth": 2, "parent_child_relations": 1, "function_scope": 0, "canonical_bonus": 0, "structural_score": 4, "raw_scr": 0.2857142857142857, "semantic_density": 0.07142857142857142, "structural_efficiency": 0.07142857142857142}, "char_level": {"name": "CharLevel", "token_count": 17, "operator_nodes": 2, "tree_depth": 2, "parent_child_relations": 2, "function_scope": 0, "canonical_bonus": 0, "structural_score": 6, "raw_scr": 0.35294117647058826, "semantic_density": 0.11764705882352941, "structural_efficiency": 0.11764705882352941}, "scr_improvement_vs_gpt2": 1.794871794871795, "scr_improvement_vs_sp": 3.769230769230769, "scr_improvement_vs_char": 3.051282051282051, "notes": []}
|
| 17 |
+
{"expression": "sin(cos((x+1)^2 + y^3))", "category": "deep_nesting", "sexp": "(FUNC_SIN (FUNC_COS (OP_ADD CONST_1 (OP_POW VAR_X CONST_2) (OP_POW VAR_Y CONST_3) (OP_MUL CONST_2 VAR_X))))", "mathtok": {"name": "MathTok", "token_count": 19, "operator_nodes": 4, "tree_depth": 4, "parent_child_relations": 6, "function_scope": 2, "canonical_bonus": 2, "structural_score": 18, "raw_scr": 0.9473684210526315, "semantic_density": 0.6842105263157895, "structural_efficiency": 0.3157894736842105}, "gpt2": {"name": "GPT-2", "token_count": 15, "operator_nodes": 3, "tree_depth": 1, "parent_child_relations": 3, "function_scope": 2, "canonical_bonus": 0, "structural_score": 9, "raw_scr": 0.6, "semantic_density": 0.3333333333333333, "structural_efficiency": 0.2}, "sentencepiece": {"name": "SentencePiece", "token_count": 18, "operator_nodes": 2, "tree_depth": 3, "parent_child_relations": 2, "function_scope": 0, "canonical_bonus": 0, "structural_score": 7, "raw_scr": 0.3888888888888889, "semantic_density": 0.1111111111111111, "structural_efficiency": 0.1111111111111111}, "char_level": {"name": "CharLevel", "token_count": 23, "operator_nodes": 4, "tree_depth": 3, "parent_child_relations": 4, "function_scope": 0, "canonical_bonus": 0, "structural_score": 11, "raw_scr": 0.4782608695652174, "semantic_density": 0.17391304347826086, "structural_efficiency": 0.17391304347826086}, "scr_improvement_vs_gpt2": 1.5789473684210527, "scr_improvement_vs_sp": 2.4360902255639094, "scr_improvement_vs_char": 1.9808612440191387, "notes": []}
|
| 18 |
+
{"expression": "exp(log(sin(x^2 + cos(y))))", "category": "deep_nesting", "sexp": "(FUNC_SIN (OP_ADD (OP_POW VAR_X CONST_2) (FUNC_COS VAR_Y)))", "mathtok": {"name": "MathTok", "token_count": 13, "operator_nodes": 2, "tree_depth": 3, "parent_child_relations": 4, "function_scope": 2, "canonical_bonus": 2, "structural_score": 13, "raw_scr": 1.0, "semantic_density": 0.5384615384615384, "structural_efficiency": 0.3076923076923077}, "gpt2": {"name": "GPT-2", "token_count": 14, "operator_nodes": 1, "tree_depth": 4, "parent_child_relations": 1, "function_scope": 3, "canonical_bonus": 0, "structural_score": 9, "raw_scr": 0.6428571428571429, "semantic_density": 0.2857142857142857, "structural_efficiency": 0.07142857142857142}, "sentencepiece": {"name": "SentencePiece", "token_count": 23, "operator_nodes": 1, "tree_depth": 4, "parent_child_relations": 1, "function_scope": 0, "canonical_bonus": 0, "structural_score": 6, "raw_scr": 0.2608695652173913, "semantic_density": 0.043478260869565216, "structural_efficiency": 0.043478260869565216}, "char_level": {"name": "CharLevel", "token_count": 27, "operator_nodes": 2, "tree_depth": 4, "parent_child_relations": 2, "function_scope": 0, "canonical_bonus": 0, "structural_score": 8, "raw_scr": 0.2962962962962963, "semantic_density": 0.07407407407407407, "structural_efficiency": 0.07407407407407407}, "scr_improvement_vs_gpt2": 1.5555555555555554, "scr_improvement_vs_sp": 3.8333333333333335, "scr_improvement_vs_char": 3.375, "notes": []}
|
| 19 |
+
{"expression": "sqrt(1 + sqrt(1 + sqrt(x)))", "category": "deep_nesting", "sexp": "(OP_POW (OP_ADD CONST_1 (OP_POW (OP_ADD CONST_1 (OP_POW VAR_X (FRAC CONST_1 CONST_2))) (FRAC CONST_1 CONST_2))) (FRAC CONST_1 CONST_2))", "mathtok": {"name": "MathTok", "token_count": 19, "operator_nodes": 8, "tree_depth": 6, "parent_child_relations": 8, "function_scope": 0, "canonical_bonus": 2, "structural_score": 24, "raw_scr": 1.263157894736842, "semantic_density": 0.8947368421052632, "structural_efficiency": 0.42105263157894735}, "gpt2": {"name": "GPT-2", "token_count": 15, "operator_nodes": 0, "tree_depth": 3, "parent_child_relations": 0, "function_scope": 0, "canonical_bonus": 0, "structural_score": 3, "raw_scr": 0.2, "semantic_density": 0.0, "structural_efficiency": 0.0}, "sentencepiece": {"name": "SentencePiece", "token_count": 18, "operator_nodes": 2, "tree_depth": 3, "parent_child_relations": 2, "function_scope": 3, "canonical_bonus": 0, "structural_score": 10, "raw_scr": 0.5555555555555556, "semantic_density": 0.2777777777777778, "structural_efficiency": 0.1111111111111111}, "char_level": {"name": "CharLevel", "token_count": 27, "operator_nodes": 2, "tree_depth": 3, "parent_child_relations": 2, "function_scope": 0, "canonical_bonus": 0, "structural_score": 7, "raw_scr": 0.25925925925925924, "semantic_density": 0.07407407407407407, "structural_efficiency": 0.07407407407407407}, "scr_improvement_vs_gpt2": 6.31578947368421, "scr_improvement_vs_sp": 2.2736842105263158, "scr_improvement_vs_char": 4.87218045112782, "notes": []}
|
| 20 |
+
{"expression": "log(1 + log(1 + x))", "category": "deep_nesting", "sexp": "(FUNC_LOG (OP_ADD CONST_1 (FUNC_LOG (OP_ADD CONST_1 VAR_X))))", "mathtok": {"name": "MathTok", "token_count": 13, "operator_nodes": 2, "tree_depth": 4, "parent_child_relations": 4, "function_scope": 2, "canonical_bonus": 2, "structural_score": 14, "raw_scr": 1.0769230769230769, "semantic_density": 0.5384615384615384, "structural_efficiency": 0.3076923076923077}, "gpt2": {"name": "GPT-2", "token_count": 10, "operator_nodes": 0, "tree_depth": 2, "parent_child_relations": 0, "function_scope": 1, "canonical_bonus": 0, "structural_score": 3, "raw_scr": 0.3, "semantic_density": 0.1, "structural_efficiency": 0.0}, "sentencepiece": {"name": "SentencePiece", "token_count": 16, "operator_nodes": 2, "tree_depth": 2, "parent_child_relations": 2, "function_scope": 0, "canonical_bonus": 0, "structural_score": 6, "raw_scr": 0.375, "semantic_density": 0.125, "structural_efficiency": 0.125}, "char_level": {"name": "CharLevel", "token_count": 19, "operator_nodes": 2, "tree_depth": 2, "parent_child_relations": 2, "function_scope": 0, "canonical_bonus": 0, "structural_score": 6, "raw_scr": 0.3157894736842105, "semantic_density": 0.10526315789473684, "structural_efficiency": 0.10526315789473684}, "scr_improvement_vs_gpt2": 3.58974358974359, "scr_improvement_vs_sp": 2.871794871794872, "scr_improvement_vs_char": 3.41025641025641, "notes": []}
|
| 21 |
+
{"expression": "((x+1)^2 + (y-1)^2)^3", "category": "deep_nesting", "sexp": "(OP_ADD CONST_8 (OP_POW VAR_X CONST_6) (OP_POW VAR_Y CONST_6) (OP_MUL (OP_NEG CONST_32) (OP_POW VAR_Y CONST_3)) (OP_MUL (OP_NEG CONST_24) VAR_Y) (OP_MUL (OP_NEG CONST_6) (OP_POW VAR_Y CONST_5)) (OP_MUL CONST_6 (OP_POW VAR_X CONST_5)) (OP_MUL CONST_18 (OP_POW VAR_X CONST_4)) (OP_MUL CONST_18 (OP_POW VAR_Y CONST_4)) (OP_MUL CONST_24 VAR_X) (OP_MUL CONST_32 (OP_POW VAR_X CONST_3)) (OP_MUL CONST_36 (OP_POW VAR_X CONST_2)) (OP_MUL CONST_36 (OP_POW VAR_Y CONST_2)) (OP_MUL (OP_NEG CONST_48) VAR_X VAR_Y) (OP_MUL (OP_NEG CONST_48) VAR_Y (OP_POW VAR_X CONST_2)) (OP_MUL (OP_NEG CONST_24) VAR_X (OP_POW VAR_Y CONST_3)) (OP_MUL (OP_NEG CONST_24) VAR_Y (OP_POW VAR_X CONST_3)) (OP_MUL (OP_NEG CONST_12) (OP_POW VAR_X CONST_2) (OP_POW VAR_Y CONST_3)) (OP_MUL (OP_NEG CONST_6) VAR_Y (OP_POW VAR_X CONST_4)) (OP_MUL CONST_3 (OP_POW VAR_X CONST_2) (OP_POW VAR_Y CONST_4)) (OP_MUL CONST_3 (OP_POW VAR_X CONST_4) (OP_POW VAR_Y CONST_2)) (OP_MUL CONST_6 VAR_X (OP_POW VAR_Y CONST_4)) (OP_MUL CONST_12 (OP_POW VAR_X CONST_3) (OP_POW VAR_Y CONST_2)) (OP_MUL CONST_36 (OP_POW VAR_X CONST_2) (OP_POW VAR_Y CONST_2)) (OP_MUL CONST_48 VAR_X (OP_POW VAR_Y CONST_2)))", "mathtok": {"name": "MathTok", "token_count": 145, "operator_nodes": 58, "tree_depth": 3, "parent_child_relations": 58, "function_scope": 0, "canonical_bonus": 2, "structural_score": 121, "raw_scr": 0.8344827586206897, "semantic_density": 0.9862068965517241, "structural_efficiency": 0.4}, "gpt2": {"name": "GPT-2", "token_count": 18, "operator_nodes": 5, "tree_depth": 0, "parent_child_relations": 5, "function_scope": 0, "canonical_bonus": 0, "structural_score": 10, "raw_scr": 0.5555555555555556, "semantic_density": 0.2777777777777778, "structural_efficiency": 0.2777777777777778}, "sentencepiece": {"name": "SentencePiece", "token_count": 14, "operator_nodes": 3, "tree_depth": 1, "parent_child_relations": 3, "function_scope": 0, "canonical_bonus": 0, 
"structural_score": 7, "raw_scr": 0.5, "semantic_density": 0.21428571428571427, "structural_efficiency": 0.21428571428571427}, "char_level": {"name": "CharLevel", "token_count": 21, "operator_nodes": 6, "tree_depth": 2, "parent_child_relations": 6, "function_scope": 0, "canonical_bonus": 0, "structural_score": 14, "raw_scr": 0.6666666666666666, "semantic_density": 0.2857142857142857, "structural_efficiency": 0.2857142857142857}, "scr_improvement_vs_gpt2": 1.5020689655172412, "scr_improvement_vs_sp": 1.6689655172413793, "scr_improvement_vs_char": 1.2517241379310347, "notes": []}
|
| 22 |
+
{"expression": "((a + b)*(a - b)) / ((a + b)^2)", "category": "deep_nesting", "sexp": "(OP_MUL (OP_RECIP (OP_ADD VAR_A VAR_B)) (OP_ADD VAR_A (OP_NEG VAR_B)))", "mathtok": {"name": "MathTok", "token_count": 11, "operator_nodes": 5, "tree_depth": 3, "parent_child_relations": 5, "function_scope": 0, "canonical_bonus": 2, "structural_score": 15, "raw_scr": 1.3636363636363635, "semantic_density": 0.8181818181818182, "structural_efficiency": 0.45454545454545453}, "gpt2": {"name": "GPT-2", "token_count": 19, "operator_nodes": 1, "tree_depth": 1, "parent_child_relations": 1, "function_scope": 0, "canonical_bonus": 0, "structural_score": 3, "raw_scr": 0.15789473684210525, "semantic_density": 0.05263157894736842, "structural_efficiency": 0.05263157894736842}, "sentencepiece": {"name": "SentencePiece", "token_count": 22, "operator_nodes": 5, "tree_depth": 1, "parent_child_relations": 5, "function_scope": 0, "canonical_bonus": 0, "structural_score": 11, "raw_scr": 0.5, "semantic_density": 0.22727272727272727, "structural_efficiency": 0.22727272727272727}, "char_level": {"name": "CharLevel", "token_count": 31, "operator_nodes": 6, "tree_depth": 2, "parent_child_relations": 6, "function_scope": 0, "canonical_bonus": 0, "structural_score": 14, "raw_scr": 0.45161290322580644, "semantic_density": 0.1935483870967742, "structural_efficiency": 0.1935483870967742}, "scr_improvement_vs_gpt2": 8.636363636363637, "scr_improvement_vs_sp": 2.727272727272727, "scr_improvement_vs_char": 3.019480519480519, "notes": []}
|
| 23 |
+
{"expression": "Derivative(f(x), x, 2) + 2*Derivative(f(x), x) + f(x)", "category": "ode_pde", "sexp": "(OP_ADD (OP_MUL CONST_2 VAR_F) (OP_MUL VAR_F VAR_X))", "mathtok": {"name": "MathTok", "token_count": 9, "operator_nodes": 3, "tree_depth": 2, "parent_child_relations": 3, "function_scope": 0, "canonical_bonus": 2, "structural_score": 10, "raw_scr": 1.1111111111111112, "semantic_density": 0.7777777777777778, "structural_efficiency": 0.3333333333333333}, "gpt2": {"name": "GPT-2", "token_count": 30, "operator_nodes": 1, "tree_depth": 3, "parent_child_relations": 1, "function_scope": 0, "canonical_bonus": 0, "structural_score": 5, "raw_scr": 0.16666666666666666, "semantic_density": 0.03333333333333333, "structural_efficiency": 0.03333333333333333}, "sentencepiece": {"name": "SentencePiece", "token_count": 32, "operator_nodes": 2, "tree_depth": 3, "parent_child_relations": 2, "function_scope": 0, "canonical_bonus": 0, "structural_score": 7, "raw_scr": 0.21875, "semantic_density": 0.0625, "structural_efficiency": 0.0625}, "char_level": {"name": "CharLevel", "token_count": 53, "operator_nodes": 3, "tree_depth": 2, "parent_child_relations": 3, "function_scope": 0, "canonical_bonus": 0, "structural_score": 8, "raw_scr": 0.1509433962264151, "semantic_density": 0.05660377358490566, "structural_efficiency": 0.05660377358490566}, "scr_improvement_vs_gpt2": 6.666666666666667, "scr_improvement_vs_sp": 5.07936507936508, "scr_improvement_vs_char": 7.361111111111112, "notes": []}
|
| 24 |
+
{"expression": "Derivative(u(x, t), t) - alpha * Derivative(u(x, t), x, 2)", "category": "ode_pde", "sexp": "[UNK]", "mathtok": {"name": "MathTok", "token_count": 1, "operator_nodes": 0, "tree_depth": 0, "parent_child_relations": 0, "function_scope": 0, "canonical_bonus": 0, "structural_score": 0, "raw_scr": 0.0, "semantic_density": 0.0, "structural_efficiency": 0.0}, "gpt2": {"name": "GPT-2", "token_count": 29, "operator_nodes": 0, "tree_depth": 3, "parent_child_relations": 0, "function_scope": 0, "canonical_bonus": 0, "structural_score": 3, "raw_scr": 0.10344827586206896, "semantic_density": 0.0, "structural_efficiency": 0.0}, "sentencepiece": {"name": "SentencePiece", "token_count": 36, "operator_nodes": 2, "tree_depth": 2, "parent_child_relations": 2, "function_scope": 0, "canonical_bonus": 0, "structural_score": 6, "raw_scr": 0.16666666666666666, "semantic_density": 0.05555555555555555, "structural_efficiency": 0.05555555555555555}, "char_level": {"name": "CharLevel", "token_count": 58, "operator_nodes": 2, "tree_depth": 2, "parent_child_relations": 2, "function_scope": 0, "canonical_bonus": 0, "structural_score": 6, "raw_scr": 0.10344827586206896, "semantic_density": 0.034482758620689655, "structural_efficiency": 0.034482758620689655}, "scr_improvement_vs_gpt2": 0.0, "scr_improvement_vs_sp": 0.0, "scr_improvement_vs_char": 0.0, "notes": []}
|
| 25 |
+
{"expression": "A*x + b", "category": "linear_algebra", "sexp": "(OP_ADD VAR_B (OP_MUL VAR_A VAR_X))", "mathtok": {"name": "MathTok", "token_count": 7, "operator_nodes": 2, "tree_depth": 2, "parent_child_relations": 2, "function_scope": 0, "canonical_bonus": 2, "structural_score": 8, "raw_scr": 1.1428571428571428, "semantic_density": 0.7142857142857143, "structural_efficiency": 0.2857142857142857}, "gpt2": {"name": "GPT-2", "token_count": 5, "operator_nodes": 1, "tree_depth": 0, "parent_child_relations": 1, "function_scope": 0, "canonical_bonus": 0, "structural_score": 2, "raw_scr": 0.4, "semantic_density": 0.2, "structural_efficiency": 0.2}, "sentencepiece": {"name": "SentencePiece", "token_count": 7, "operator_nodes": 2, "tree_depth": 0, "parent_child_relations": 2, "function_scope": 0, "canonical_bonus": 0, "structural_score": 4, "raw_scr": 0.5714285714285714, "semantic_density": 0.2857142857142857, "structural_efficiency": 0.2857142857142857}, "char_level": {"name": "CharLevel", "token_count": 7, "operator_nodes": 2, "tree_depth": 0, "parent_child_relations": 2, "function_scope": 0, "canonical_bonus": 0, "structural_score": 4, "raw_scr": 0.5714285714285714, "semantic_density": 0.2857142857142857, "structural_efficiency": 0.2857142857142857}, "scr_improvement_vs_gpt2": 2.8571428571428568, "scr_improvement_vs_sp": 2.0, "scr_improvement_vs_char": 2.0, "notes": []}
|
| 26 |
+
{"expression": "det(A - lambda*I)", "category": "linear_algebra", "sexp": "[UNK]", "mathtok": {"name": "MathTok", "token_count": 1, "operator_nodes": 0, "tree_depth": 0, "parent_child_relations": 0, "function_scope": 0, "canonical_bonus": 0, "structural_score": 0, "raw_scr": 0.0, "semantic_density": 0.0, "structural_efficiency": 0.0}, "gpt2": {"name": "GPT-2", "token_count": 8, "operator_nodes": 1, "tree_depth": 1, "parent_child_relations": 1, "function_scope": 0, "canonical_bonus": 0, "structural_score": 3, "raw_scr": 0.375, "semantic_density": 0.125, "structural_efficiency": 0.125}, "sentencepiece": {"name": "SentencePiece", "token_count": 17, "operator_nodes": 2, "tree_depth": 1, "parent_child_relations": 2, "function_scope": 0, "canonical_bonus": 0, "structural_score": 5, "raw_scr": 0.29411764705882354, "semantic_density": 0.11764705882352941, "structural_efficiency": 0.11764705882352941}, "char_level": {"name": "CharLevel", "token_count": 17, "operator_nodes": 2, "tree_depth": 1, "parent_child_relations": 2, "function_scope": 0, "canonical_bonus": 0, "structural_score": 5, "raw_scr": 0.29411764705882354, "semantic_density": 0.11764705882352941, "structural_efficiency": 0.11764705882352941}, "scr_improvement_vs_gpt2": 0.0, "scr_improvement_vs_sp": 0.0, "scr_improvement_vs_char": 0.0, "notes": []}
|
| 27 |
+
{"expression": "P(A|B) * P(B) / P(A)", "category": "probability", "sexp": "[UNK]", "mathtok": {"name": "MathTok", "token_count": 1, "operator_nodes": 0, "tree_depth": 0, "parent_child_relations": 0, "function_scope": 0, "canonical_bonus": 0, "structural_score": 0, "raw_scr": 0.0, "semantic_density": 0.0, "structural_efficiency": 0.0}, "gpt2": {"name": "GPT-2", "token_count": 16, "operator_nodes": 0, "tree_depth": 1, "parent_child_relations": 0, "function_scope": 0, "canonical_bonus": 0, "structural_score": 1, "raw_scr": 0.0625, "semantic_density": 0.0, "structural_efficiency": 0.0}, "sentencepiece": {"name": "SentencePiece", "token_count": 21, "operator_nodes": 2, "tree_depth": 1, "parent_child_relations": 2, "function_scope": 0, "canonical_bonus": 0, "structural_score": 5, "raw_scr": 0.23809523809523808, "semantic_density": 0.09523809523809523, "structural_efficiency": 0.09523809523809523}, "char_level": {"name": "CharLevel", "token_count": 20, "operator_nodes": 2, "tree_depth": 1, "parent_child_relations": 2, "function_scope": 0, "canonical_bonus": 0, "structural_score": 5, "raw_scr": 0.25, "semantic_density": 0.1, "structural_efficiency": 0.1}, "scr_improvement_vs_gpt2": 0.0, "scr_improvement_vs_sp": 0.0, "scr_improvement_vs_char": 0.0, "notes": []}
|
| 28 |
+
{"expression": "exp(-x^2 / 2) / sqrt(2*pi)", "category": "probability", "sexp": "(OP_MUL (FRAC CONST_1 CONST_2) (OP_POW CONST_2 (FRAC CONST_1 CONST_2)) (OP_POW CONST_PI (FRAC (OP_NEG CONST_1) CONST_2)) (FUNC_EXP (OP_MUL (FRAC (OP_NEG CONST_1) CONST_2) (OP_POW VAR_X CONST_2))))", "mathtok": {"name": "MathTok", "token_count": 28, "operator_nodes": 11, "tree_depth": 5, "parent_child_relations": 12, "function_scope": 1, "canonical_bonus": 2, "structural_score": 31, "raw_scr": 1.1071428571428572, "semantic_density": 0.8571428571428571, "structural_efficiency": 0.42857142857142855}, "gpt2": {"name": "GPT-2", "token_count": 16, "operator_nodes": 2, "tree_depth": 1, "parent_child_relations": 2, "function_scope": 1, "canonical_bonus": 0, "structural_score": 6, "raw_scr": 0.375, "semantic_density": 0.1875, "structural_efficiency": 0.125}, "sentencepiece": {"name": "SentencePiece", "token_count": 18, "operator_nodes": 3, "tree_depth": 1, "parent_child_relations": 3, "function_scope": 1, "canonical_bonus": 0, "structural_score": 8, "raw_scr": 0.4444444444444444, "semantic_density": 0.2222222222222222, "structural_efficiency": 0.16666666666666666}, "char_level": {"name": "CharLevel", "token_count": 26, "operator_nodes": 5, "tree_depth": 1, "parent_child_relations": 5, "function_scope": 0, "canonical_bonus": 0, "structural_score": 11, "raw_scr": 0.4230769230769231, "semantic_density": 0.19230769230769232, "structural_efficiency": 0.19230769230769232}, "scr_improvement_vs_gpt2": 2.9523809523809526, "scr_improvement_vs_sp": 2.491071428571429, "scr_improvement_vs_char": 2.616883116883117, "notes": []}
|
| 29 |
+
{"expression": "Union(A, B)", "category": "set_theory", "sexp": "[UNK]", "mathtok": {"name": "MathTok", "token_count": 1, "operator_nodes": 0, "tree_depth": 0, "parent_child_relations": 0, "function_scope": 0, "canonical_bonus": 0, "structural_score": 0, "raw_scr": 0.0, "semantic_density": 0.0, "structural_efficiency": 0.0}, "gpt2": {"name": "GPT-2", "token_count": 6, "operator_nodes": 0, "tree_depth": 1, "parent_child_relations": 0, "function_scope": 0, "canonical_bonus": 0, "structural_score": 1, "raw_scr": 0.16666666666666666, "semantic_density": 0.0, "structural_efficiency": 0.0}, "sentencepiece": {"name": "SentencePiece", "token_count": 12, "operator_nodes": 0, "tree_depth": 1, "parent_child_relations": 0, "function_scope": 0, "canonical_bonus": 0, "structural_score": 1, "raw_scr": 0.08333333333333333, "semantic_density": 0.0, "structural_efficiency": 0.0}, "char_level": {"name": "CharLevel", "token_count": 11, "operator_nodes": 0, "tree_depth": 1, "parent_child_relations": 0, "function_scope": 0, "canonical_bonus": 0, "structural_score": 1, "raw_scr": 0.09090909090909091, "semantic_density": 0.0, "structural_efficiency": 0.0}, "scr_improvement_vs_gpt2": 0.0, "scr_improvement_vs_sp": 0.0, "scr_improvement_vs_char": 0.0, "notes": []}
|
| 30 |
+
{"expression": "Intersection(A, B)", "category": "set_theory", "sexp": "[UNK]", "mathtok": {"name": "MathTok", "token_count": 1, "operator_nodes": 0, "tree_depth": 0, "parent_child_relations": 0, "function_scope": 0, "canonical_bonus": 0, "structural_score": 0, "raw_scr": 0.0, "semantic_density": 0.0, "structural_efficiency": 0.0}, "gpt2": {"name": "GPT-2", "token_count": 7, "operator_nodes": 0, "tree_depth": 1, "parent_child_relations": 0, "function_scope": 0, "canonical_bonus": 0, "structural_score": 1, "raw_scr": 0.14285714285714285, "semantic_density": 0.0, "structural_efficiency": 0.0}, "sentencepiece": {"name": "SentencePiece", "token_count": 19, "operator_nodes": 0, "tree_depth": 1, "parent_child_relations": 0, "function_scope": 0, "canonical_bonus": 0, "structural_score": 1, "raw_scr": 0.05263157894736842, "semantic_density": 0.0, "structural_efficiency": 0.0}, "char_level": {"name": "CharLevel", "token_count": 18, "operator_nodes": 0, "tree_depth": 1, "parent_child_relations": 0, "function_scope": 0, "canonical_bonus": 0, "structural_score": 1, "raw_scr": 0.05555555555555555, "semantic_density": 0.0, "structural_efficiency": 0.0}, "scr_improvement_vs_gpt2": 0.0, "scr_improvement_vs_sp": 0.0, "scr_improvement_vs_char": 0.0, "notes": []}
|
| 31 |
+
{"expression": "x + 2", "category": "canonical", "sexp": "(OP_ADD CONST_2 VAR_X)", "mathtok": {"name": "MathTok", "token_count": 5, "operator_nodes": 1, "tree_depth": 1, "parent_child_relations": 1, "function_scope": 0, "canonical_bonus": 2, "structural_score": 5, "raw_scr": 1.0, "semantic_density": 0.6, "structural_efficiency": 0.2}, "gpt2": {"name": "GPT-2", "token_count": 3, "operator_nodes": 0, "tree_depth": 0, "parent_child_relations": 0, "function_scope": 0, "canonical_bonus": 0, "structural_score": 0, "raw_scr": 0.0, "semantic_density": 0.0, "structural_efficiency": 0.0}, "sentencepiece": {"name": "SentencePiece", "token_count": 5, "operator_nodes": 1, "tree_depth": 0, "parent_child_relations": 1, "function_scope": 0, "canonical_bonus": 0, "structural_score": 2, "raw_scr": 0.4, "semantic_density": 0.2, "structural_efficiency": 0.2}, "char_level": {"name": "CharLevel", "token_count": 5, "operator_nodes": 1, "tree_depth": 0, "parent_child_relations": 1, "function_scope": 0, "canonical_bonus": 0, "structural_score": 2, "raw_scr": 0.4, "semantic_density": 0.2, "structural_efficiency": 0.2}, "scr_improvement_vs_gpt2": null, "scr_improvement_vs_sp": 2.5, "scr_improvement_vs_char": 2.5, "notes": []}
|
| 32 |
+
{"expression": "2 + x", "category": "canonical", "sexp": "(OP_ADD CONST_2 VAR_X)", "mathtok": {"name": "MathTok", "token_count": 5, "operator_nodes": 1, "tree_depth": 1, "parent_child_relations": 1, "function_scope": 0, "canonical_bonus": 2, "structural_score": 5, "raw_scr": 1.0, "semantic_density": 0.6, "structural_efficiency": 0.2}, "gpt2": {"name": "GPT-2", "token_count": 3, "operator_nodes": 0, "tree_depth": 0, "parent_child_relations": 0, "function_scope": 0, "canonical_bonus": 0, "structural_score": 0, "raw_scr": 0.0, "semantic_density": 0.0, "structural_efficiency": 0.0}, "sentencepiece": {"name": "SentencePiece", "token_count": 5, "operator_nodes": 1, "tree_depth": 0, "parent_child_relations": 1, "function_scope": 0, "canonical_bonus": 0, "structural_score": 2, "raw_scr": 0.4, "semantic_density": 0.2, "structural_efficiency": 0.2}, "char_level": {"name": "CharLevel", "token_count": 5, "operator_nodes": 1, "tree_depth": 0, "parent_child_relations": 1, "function_scope": 0, "canonical_bonus": 0, "structural_score": 2, "raw_scr": 0.4, "semantic_density": 0.2, "structural_efficiency": 0.2}, "scr_improvement_vs_gpt2": null, "scr_improvement_vs_sp": 2.5, "scr_improvement_vs_char": 2.5, "notes": []}
|
| 33 |
+
{"expression": "a*b + a*c", "category": "canonical", "sexp": "(OP_MUL VAR_A (OP_ADD VAR_B VAR_C))", "mathtok": {"name": "MathTok", "token_count": 7, "operator_nodes": 2, "tree_depth": 2, "parent_child_relations": 2, "function_scope": 0, "canonical_bonus": 2, "structural_score": 8, "raw_scr": 1.1428571428571428, "semantic_density": 0.7142857142857143, "structural_efficiency": 0.2857142857142857}, "gpt2": {"name": "GPT-2", "token_count": 7, "operator_nodes": 2, "tree_depth": 0, "parent_child_relations": 2, "function_scope": 0, "canonical_bonus": 0, "structural_score": 4, "raw_scr": 0.5714285714285714, "semantic_density": 0.2857142857142857, "structural_efficiency": 0.2857142857142857}, "sentencepiece": {"name": "SentencePiece", "token_count": 8, "operator_nodes": 3, "tree_depth": 0, "parent_child_relations": 3, "function_scope": 0, "canonical_bonus": 0, "structural_score": 6, "raw_scr": 0.75, "semantic_density": 0.375, "structural_efficiency": 0.375}, "char_level": {"name": "CharLevel", "token_count": 9, "operator_nodes": 3, "tree_depth": 0, "parent_child_relations": 3, "function_scope": 0, "canonical_bonus": 0, "structural_score": 6, "raw_scr": 0.6666666666666666, "semantic_density": 0.3333333333333333, "structural_efficiency": 0.3333333333333333}, "scr_improvement_vs_gpt2": 2.0, "scr_improvement_vs_sp": 1.5238095238095237, "scr_improvement_vs_char": 1.7142857142857142, "notes": []}
|
| 34 |
+
{"expression": "a*(b+c)", "category": "canonical", "sexp": "(OP_MUL VAR_A (OP_ADD VAR_B VAR_C))", "mathtok": {"name": "MathTok", "token_count": 7, "operator_nodes": 2, "tree_depth": 2, "parent_child_relations": 2, "function_scope": 0, "canonical_bonus": 2, "structural_score": 8, "raw_scr": 1.1428571428571428, "semantic_density": 0.7142857142857143, "structural_efficiency": 0.2857142857142857}, "gpt2": {"name": "GPT-2", "token_count": 7, "operator_nodes": 2, "tree_depth": 1, "parent_child_relations": 2, "function_scope": 0, "canonical_bonus": 0, "structural_score": 5, "raw_scr": 0.7142857142857143, "semantic_density": 0.2857142857142857, "structural_efficiency": 0.2857142857142857}, "sentencepiece": {"name": "SentencePiece", "token_count": 7, "operator_nodes": 2, "tree_depth": 1, "parent_child_relations": 2, "function_scope": 0, "canonical_bonus": 0, "structural_score": 5, "raw_scr": 0.7142857142857143, "semantic_density": 0.2857142857142857, "structural_efficiency": 0.2857142857142857}, "char_level": {"name": "CharLevel", "token_count": 7, "operator_nodes": 2, "tree_depth": 1, "parent_child_relations": 2, "function_scope": 0, "canonical_bonus": 0, "structural_score": 5, "raw_scr": 0.7142857142857143, "semantic_density": 0.2857142857142857, "structural_efficiency": 0.2857142857142857}, "scr_improvement_vs_gpt2": 1.5999999999999999, "scr_improvement_vs_sp": 1.5999999999999999, "scr_improvement_vs_char": 1.5999999999999999, "notes": []}
|
| 35 |
+
{"expression": "(x+1)^2", "category": "canonical", "sexp": "(OP_ADD CONST_1 (OP_POW VAR_X CONST_2) (OP_MUL CONST_2 VAR_X))", "mathtok": {"name": "MathTok", "token_count": 10, "operator_nodes": 3, "tree_depth": 2, "parent_child_relations": 3, "function_scope": 0, "canonical_bonus": 2, "structural_score": 10, "raw_scr": 1.0, "semantic_density": 0.8, "structural_efficiency": 0.3}, "gpt2": {"name": "GPT-2", "token_count": 7, "operator_nodes": 2, "tree_depth": 1, "parent_child_relations": 2, "function_scope": 0, "canonical_bonus": 0, "structural_score": 5, "raw_scr": 0.7142857142857143, "semantic_density": 0.2857142857142857, "structural_efficiency": 0.2857142857142857}, "sentencepiece": {"name": "SentencePiece", "token_count": 4, "operator_nodes": 0, "tree_depth": 1, "parent_child_relations": 0, "function_scope": 0, "canonical_bonus": 0, "structural_score": 1, "raw_scr": 0.25, "semantic_density": 0.0, "structural_efficiency": 0.0}, "char_level": {"name": "CharLevel", "token_count": 7, "operator_nodes": 2, "tree_depth": 1, "parent_child_relations": 2, "function_scope": 0, "canonical_bonus": 0, "structural_score": 5, "raw_scr": 0.7142857142857143, "semantic_density": 0.2857142857142857, "structural_efficiency": 0.2857142857142857}, "scr_improvement_vs_gpt2": 1.4, "scr_improvement_vs_sp": 4.0, "scr_improvement_vs_char": 1.4, "notes": []}
|
| 36 |
+
{"expression": "x^2 + 2*x + 1", "category": "canonical", "sexp": "(OP_ADD CONST_1 (OP_POW VAR_X CONST_2) (OP_MUL CONST_2 VAR_X))", "mathtok": {"name": "MathTok", "token_count": 10, "operator_nodes": 3, "tree_depth": 2, "parent_child_relations": 3, "function_scope": 0, "canonical_bonus": 2, "structural_score": 10, "raw_scr": 1.0, "semantic_density": 0.8, "structural_efficiency": 0.3}, "gpt2": {"name": "GPT-2", "token_count": 9, "operator_nodes": 2, "tree_depth": 0, "parent_child_relations": 2, "function_scope": 0, "canonical_bonus": 0, "structural_score": 4, "raw_scr": 0.4444444444444444, "semantic_density": 0.2222222222222222, "structural_efficiency": 0.2222222222222222}, "sentencepiece": {"name": "SentencePiece", "token_count": 9, "operator_nodes": 2, "tree_depth": 0, "parent_child_relations": 2, "function_scope": 0, "canonical_bonus": 0, "structural_score": 4, "raw_scr": 0.4444444444444444, "semantic_density": 0.2222222222222222, "structural_efficiency": 0.2222222222222222}, "char_level": {"name": "CharLevel", "token_count": 13, "operator_nodes": 4, "tree_depth": 0, "parent_child_relations": 4, "function_scope": 0, "canonical_bonus": 0, "structural_score": 8, "raw_scr": 0.6153846153846154, "semantic_density": 0.3076923076923077, "structural_efficiency": 0.3076923076923077}, "scr_improvement_vs_gpt2": 2.25, "scr_improvement_vs_sp": 2.25, "scr_improvement_vs_char": 1.625, "notes": []}
|
| 37 |
+
{"expression": "x^2 - y^2", "category": "canonical", "sexp": "(OP_ADD (OP_POW VAR_X CONST_2) (OP_NEG (OP_POW VAR_Y CONST_2)))", "mathtok": {"name": "MathTok", "token_count": 10, "operator_nodes": 4, "tree_depth": 3, "parent_child_relations": 4, "function_scope": 0, "canonical_bonus": 2, "structural_score": 13, "raw_scr": 1.3, "semantic_density": 0.8, "structural_efficiency": 0.4}, "gpt2": {"name": "GPT-2", "token_count": 7, "operator_nodes": 2, "tree_depth": 0, "parent_child_relations": 2, "function_scope": 0, "canonical_bonus": 0, "structural_score": 4, "raw_scr": 0.5714285714285714, "semantic_density": 0.2857142857142857, "structural_efficiency": 0.2857142857142857}, "sentencepiece": {"name": "SentencePiece", "token_count": 7, "operator_nodes": 1, "tree_depth": 0, "parent_child_relations": 1, "function_scope": 0, "canonical_bonus": 0, "structural_score": 2, "raw_scr": 0.2857142857142857, "semantic_density": 0.14285714285714285, "structural_efficiency": 0.14285714285714285}, "char_level": {"name": "CharLevel", "token_count": 9, "operator_nodes": 3, "tree_depth": 0, "parent_child_relations": 3, "function_scope": 0, "canonical_bonus": 0, "structural_score": 6, "raw_scr": 0.6666666666666666, "semantic_density": 0.3333333333333333, "structural_efficiency": 0.3333333333333333}, "scr_improvement_vs_gpt2": 2.2750000000000004, "scr_improvement_vs_sp": 4.550000000000001, "scr_improvement_vs_char": 1.9500000000000002, "notes": []}
|
| 38 |
+
{"expression": "(x+y)*(x-y)", "category": "canonical", "sexp": "(OP_ADD (OP_POW VAR_X CONST_2) (OP_NEG (OP_POW VAR_Y CONST_2)))", "mathtok": {"name": "MathTok", "token_count": 10, "operator_nodes": 4, "tree_depth": 3, "parent_child_relations": 4, "function_scope": 0, "canonical_bonus": 2, "structural_score": 13, "raw_scr": 1.3, "semantic_density": 0.8, "structural_efficiency": 0.4}, "gpt2": {"name": "GPT-2", "token_count": 10, "operator_nodes": 2, "tree_depth": 2, "parent_child_relations": 2, "function_scope": 0, "canonical_bonus": 0, "structural_score": 6, "raw_scr": 0.6, "semantic_density": 0.2, "structural_efficiency": 0.2}, "sentencepiece": {"name": "SentencePiece", "token_count": 12, "operator_nodes": 3, "tree_depth": 1, "parent_child_relations": 3, "function_scope": 0, "canonical_bonus": 0, "structural_score": 7, "raw_scr": 0.5833333333333334, "semantic_density": 0.25, "structural_efficiency": 0.25}, "char_level": {"name": "CharLevel", "token_count": 11, "operator_nodes": 3, "tree_depth": 1, "parent_child_relations": 3, "function_scope": 0, "canonical_bonus": 0, "structural_score": 7, "raw_scr": 0.6363636363636364, "semantic_density": 0.2727272727272727, "structural_efficiency": 0.2727272727272727}, "scr_improvement_vs_gpt2": 2.166666666666667, "scr_improvement_vs_sp": 2.2285714285714286, "scr_improvement_vs_char": 2.042857142857143, "notes": []}
|
| 39 |
+
{"expression": "sin(x)^2 + cos(x)^2", "category": "canonical", "sexp": "CONST_1", "mathtok": {"name": "MathTok", "token_count": 3, "operator_nodes": 0, "tree_depth": 0, "parent_child_relations": 0, "function_scope": 0, "canonical_bonus": 2, "structural_score": 2, "raw_scr": 0.6666666666666666, "semantic_density": 0.3333333333333333, "structural_efficiency": 0.0}, "gpt2": {"name": "GPT-2", "token_count": 13, "operator_nodes": 2, "tree_depth": 1, "parent_child_relations": 2, "function_scope": 1, "canonical_bonus": 0, "structural_score": 6, "raw_scr": 0.46153846153846156, "semantic_density": 0.23076923076923078, "structural_efficiency": 0.15384615384615385}, "sentencepiece": {"name": "SentencePiece", "token_count": 17, "operator_nodes": 1, "tree_depth": 1, "parent_child_relations": 1, "function_scope": 0, "canonical_bonus": 0, "structural_score": 3, "raw_scr": 0.17647058823529413, "semantic_density": 0.058823529411764705, "structural_efficiency": 0.058823529411764705}, "char_level": {"name": "CharLevel", "token_count": 19, "operator_nodes": 3, "tree_depth": 1, "parent_child_relations": 3, "function_scope": 0, "canonical_bonus": 0, "structural_score": 7, "raw_scr": 0.3684210526315789, "semantic_density": 0.15789473684210525, "structural_efficiency": 0.15789473684210525}, "scr_improvement_vs_gpt2": 1.4444444444444442, "scr_improvement_vs_sp": 3.7777777777777772, "scr_improvement_vs_char": 1.8095238095238095, "notes": []}
|
| 40 |
+
{"expression": "1", "category": "canonical", "sexp": "CONST_1", "mathtok": {"name": "MathTok", "token_count": 3, "operator_nodes": 0, "tree_depth": 0, "parent_child_relations": 0, "function_scope": 0, "canonical_bonus": 2, "structural_score": 2, "raw_scr": 0.6666666666666666, "semantic_density": 0.3333333333333333, "structural_efficiency": 0.0}, "gpt2": {"name": "GPT-2", "token_count": 1, "operator_nodes": 0, "tree_depth": 0, "parent_child_relations": 0, "function_scope": 0, "canonical_bonus": 0, "structural_score": 0, "raw_scr": 0.0, "semantic_density": 0.0, "structural_efficiency": 0.0}, "sentencepiece": {"name": "SentencePiece", "token_count": 1, "operator_nodes": 0, "tree_depth": 0, "parent_child_relations": 0, "function_scope": 0, "canonical_bonus": 0, "structural_score": 0, "raw_scr": 0.0, "semantic_density": 0.0, "structural_efficiency": 0.0}, "char_level": {"name": "CharLevel", "token_count": 1, "operator_nodes": 0, "tree_depth": 0, "parent_child_relations": 0, "function_scope": 0, "canonical_bonus": 0, "structural_score": 0, "raw_scr": 0.0, "semantic_density": 0.0, "structural_efficiency": 0.0}, "scr_improvement_vs_gpt2": null, "scr_improvement_vs_sp": null, "scr_improvement_vs_char": 0.0, "notes": []}
|
| 41 |
+
{"expression": "2*x + 2*y", "category": "canonical", "sexp": "(OP_ADD (OP_MUL CONST_2 VAR_X) (OP_MUL CONST_2 VAR_Y))", "mathtok": {"name": "MathTok", "token_count": 9, "operator_nodes": 3, "tree_depth": 2, "parent_child_relations": 3, "function_scope": 0, "canonical_bonus": 2, "structural_score": 10, "raw_scr": 1.1111111111111112, "semantic_density": 0.7777777777777778, "structural_efficiency": 0.3333333333333333}, "gpt2": {"name": "GPT-2", "token_count": 7, "operator_nodes": 2, "tree_depth": 0, "parent_child_relations": 2, "function_scope": 0, "canonical_bonus": 0, "structural_score": 4, "raw_scr": 0.5714285714285714, "semantic_density": 0.2857142857142857, "structural_efficiency": 0.2857142857142857}, "sentencepiece": {"name": "SentencePiece", "token_count": 6, "operator_nodes": 1, "tree_depth": 0, "parent_child_relations": 1, "function_scope": 0, "canonical_bonus": 0, "structural_score": 2, "raw_scr": 0.3333333333333333, "semantic_density": 0.16666666666666666, "structural_efficiency": 0.16666666666666666}, "char_level": {"name": "CharLevel", "token_count": 9, "operator_nodes": 3, "tree_depth": 0, "parent_child_relations": 3, "function_scope": 0, "canonical_bonus": 0, "structural_score": 6, "raw_scr": 0.6666666666666666, "semantic_density": 0.3333333333333333, "structural_efficiency": 0.3333333333333333}, "scr_improvement_vs_gpt2": 1.9444444444444446, "scr_improvement_vs_sp": 3.3333333333333335, "scr_improvement_vs_char": 1.6666666666666667, "notes": []}
|
| 42 |
+
{"expression": "2*(x+y)", "category": "canonical", "sexp": "(OP_ADD (OP_MUL CONST_2 VAR_X) (OP_MUL CONST_2 VAR_Y))", "mathtok": {"name": "MathTok", "token_count": 9, "operator_nodes": 3, "tree_depth": 2, "parent_child_relations": 3, "function_scope": 0, "canonical_bonus": 2, "structural_score": 10, "raw_scr": 1.1111111111111112, "semantic_density": 0.7777777777777778, "structural_efficiency": 0.3333333333333333}, "gpt2": {"name": "GPT-2", "token_count": 7, "operator_nodes": 2, "tree_depth": 1, "parent_child_relations": 2, "function_scope": 0, "canonical_bonus": 0, "structural_score": 5, "raw_scr": 0.7142857142857143, "semantic_density": 0.2857142857142857, "structural_efficiency": 0.2857142857142857}, "sentencepiece": {"name": "SentencePiece", "token_count": 6, "operator_nodes": 1, "tree_depth": 1, "parent_child_relations": 1, "function_scope": 0, "canonical_bonus": 0, "structural_score": 3, "raw_scr": 0.5, "semantic_density": 0.16666666666666666, "structural_efficiency": 0.16666666666666666}, "char_level": {"name": "CharLevel", "token_count": 7, "operator_nodes": 2, "tree_depth": 1, "parent_child_relations": 2, "function_scope": 0, "canonical_bonus": 0, "structural_score": 5, "raw_scr": 0.7142857142857143, "semantic_density": 0.2857142857142857, "structural_efficiency": 0.2857142857142857}, "scr_improvement_vs_gpt2": 1.5555555555555556, "scr_improvement_vs_sp": 2.2222222222222223, "scr_improvement_vs_char": 1.5555555555555556, "notes": []}
|
| 43 |
+
{"expression": "x*y + x*z", "category": "canonical", "sexp": "(OP_MUL VAR_X (OP_ADD VAR_Y VAR_Z))", "mathtok": {"name": "MathTok", "token_count": 7, "operator_nodes": 2, "tree_depth": 2, "parent_child_relations": 2, "function_scope": 0, "canonical_bonus": 2, "structural_score": 8, "raw_scr": 1.1428571428571428, "semantic_density": 0.7142857142857143, "structural_efficiency": 0.2857142857142857}, "gpt2": {"name": "GPT-2", "token_count": 7, "operator_nodes": 2, "tree_depth": 0, "parent_child_relations": 2, "function_scope": 0, "canonical_bonus": 0, "structural_score": 4, "raw_scr": 0.5714285714285714, "semantic_density": 0.2857142857142857, "structural_efficiency": 0.2857142857142857}, "sentencepiece": {"name": "SentencePiece", "token_count": 8, "operator_nodes": 3, "tree_depth": 0, "parent_child_relations": 3, "function_scope": 0, "canonical_bonus": 0, "structural_score": 6, "raw_scr": 0.75, "semantic_density": 0.375, "structural_efficiency": 0.375}, "char_level": {"name": "CharLevel", "token_count": 9, "operator_nodes": 3, "tree_depth": 0, "parent_child_relations": 3, "function_scope": 0, "canonical_bonus": 0, "structural_score": 6, "raw_scr": 0.6666666666666666, "semantic_density": 0.3333333333333333, "structural_efficiency": 0.3333333333333333}, "scr_improvement_vs_gpt2": 2.0, "scr_improvement_vs_sp": 1.5238095238095237, "scr_improvement_vs_char": 1.7142857142857142, "notes": []}
|
| 44 |
+
{"expression": "x*(y+z)", "category": "canonical", "sexp": "(OP_MUL VAR_X (OP_ADD VAR_Y VAR_Z))", "mathtok": {"name": "MathTok", "token_count": 7, "operator_nodes": 2, "tree_depth": 2, "parent_child_relations": 2, "function_scope": 0, "canonical_bonus": 2, "structural_score": 8, "raw_scr": 1.1428571428571428, "semantic_density": 0.7142857142857143, "structural_efficiency": 0.2857142857142857}, "gpt2": {"name": "GPT-2", "token_count": 7, "operator_nodes": 2, "tree_depth": 1, "parent_child_relations": 2, "function_scope": 0, "canonical_bonus": 0, "structural_score": 5, "raw_scr": 0.7142857142857143, "semantic_density": 0.2857142857142857, "structural_efficiency": 0.2857142857142857}, "sentencepiece": {"name": "SentencePiece", "token_count": 7, "operator_nodes": 2, "tree_depth": 1, "parent_child_relations": 2, "function_scope": 0, "canonical_bonus": 0, "structural_score": 5, "raw_scr": 0.7142857142857143, "semantic_density": 0.2857142857142857, "structural_efficiency": 0.2857142857142857}, "char_level": {"name": "CharLevel", "token_count": 7, "operator_nodes": 2, "tree_depth": 1, "parent_child_relations": 2, "function_scope": 0, "canonical_bonus": 0, "structural_score": 5, "raw_scr": 0.7142857142857143, "semantic_density": 0.2857142857142857, "structural_efficiency": 0.2857142857142857}, "scr_improvement_vs_gpt2": 1.5999999999999999, "scr_improvement_vs_sp": 1.5999999999999999, "scr_improvement_vs_char": 1.5999999999999999, "notes": []}
|
| 45 |
+
{"expression": "a^2 + 2*a*b + b^2", "category": "canonical", "sexp": "(OP_ADD (OP_POW VAR_A CONST_2) (OP_POW VAR_B CONST_2) (OP_MUL CONST_2 VAR_A VAR_B))", "mathtok": {"name": "MathTok", "token_count": 13, "operator_nodes": 4, "tree_depth": 2, "parent_child_relations": 4, "function_scope": 0, "canonical_bonus": 2, "structural_score": 12, "raw_scr": 0.9230769230769231, "semantic_density": 0.8461538461538461, "structural_efficiency": 0.3076923076923077}, "gpt2": {"name": "GPT-2", "token_count": 13, "operator_nodes": 4, "tree_depth": 0, "parent_child_relations": 4, "function_scope": 0, "canonical_bonus": 0, "structural_score": 8, "raw_scr": 0.6153846153846154, "semantic_density": 0.3076923076923077, "structural_efficiency": 0.3076923076923077}, "sentencepiece": {"name": "SentencePiece", "token_count": 12, "operator_nodes": 3, "tree_depth": 0, "parent_child_relations": 3, "function_scope": 0, "canonical_bonus": 0, "structural_score": 6, "raw_scr": 0.5, "semantic_density": 0.25, "structural_efficiency": 0.25}, "char_level": {"name": "CharLevel", "token_count": 17, "operator_nodes": 6, "tree_depth": 0, "parent_child_relations": 6, "function_scope": 0, "canonical_bonus": 0, "structural_score": 12, "raw_scr": 0.7058823529411765, "semantic_density": 0.35294117647058826, "structural_efficiency": 0.35294117647058826}, "scr_improvement_vs_gpt2": 1.5, "scr_improvement_vs_sp": 1.8461538461538463, "scr_improvement_vs_char": 1.3076923076923077, "notes": []}
|
| 46 |
+
{"expression": "(a+b)^2", "category": "canonical", "sexp": "(OP_ADD (OP_POW VAR_A CONST_2) (OP_POW VAR_B CONST_2) (OP_MUL CONST_2 VAR_A VAR_B))", "mathtok": {"name": "MathTok", "token_count": 13, "operator_nodes": 4, "tree_depth": 2, "parent_child_relations": 4, "function_scope": 0, "canonical_bonus": 2, "structural_score": 12, "raw_scr": 0.9230769230769231, "semantic_density": 0.8461538461538461, "structural_efficiency": 0.3076923076923077}, "gpt2": {"name": "GPT-2", "token_count": 7, "operator_nodes": 2, "tree_depth": 1, "parent_child_relations": 2, "function_scope": 0, "canonical_bonus": 0, "structural_score": 5, "raw_scr": 0.7142857142857143, "semantic_density": 0.2857142857142857, "structural_efficiency": 0.2857142857142857}, "sentencepiece": {"name": "SentencePiece", "token_count": 7, "operator_nodes": 1, "tree_depth": 1, "parent_child_relations": 1, "function_scope": 0, "canonical_bonus": 0, "structural_score": 3, "raw_scr": 0.42857142857142855, "semantic_density": 0.14285714285714285, "structural_efficiency": 0.14285714285714285}, "char_level": {"name": "CharLevel", "token_count": 7, "operator_nodes": 2, "tree_depth": 1, "parent_child_relations": 2, "function_scope": 0, "canonical_bonus": 0, "structural_score": 5, "raw_scr": 0.7142857142857143, "semantic_density": 0.2857142857142857, "structural_efficiency": 0.2857142857142857}, "scr_improvement_vs_gpt2": 1.2923076923076924, "scr_improvement_vs_sp": 2.153846153846154, "scr_improvement_vs_char": 1.2923076923076924, "notes": []}
|
| 47 |
+
{"expression": "The derivative of sin(x^2) with respect to x.", "category": "mixed_text_math", "sexp": "[UNK]", "mathtok": {"name": "MathTok", "token_count": 38, "operator_nodes": 0, "tree_depth": 0, "parent_child_relations": 0, "function_scope": 0, "canonical_bonus": 0, "structural_score": 0, "raw_scr": 0.0, "semantic_density": 0.0, "structural_efficiency": 0.0}, "gpt2": null, "sentencepiece": null, "char_level": {"name": "CharLevel", "token_count": 45, "operator_nodes": 1, "tree_depth": 1, "parent_child_relations": 1, "function_scope": 0, "canonical_bonus": 0, "structural_score": 3, "raw_scr": 0.06666666666666667, "semantic_density": 0.022222222222222223, "structural_efficiency": 0.022222222222222223}, "scr_improvement_vs_gpt2": null, "scr_improvement_vs_sp": null, "scr_improvement_vs_char": 0.0, "notes": []}
|
| 48 |
+
{"expression": "Solve for x when x^2 + 2*x + 1 = 0.", "category": "mixed_text_math", "sexp": "[UNK]", "mathtok": {"name": "MathTok", "token_count": 19, "operator_nodes": 0, "tree_depth": 0, "parent_child_relations": 0, "function_scope": 0, "canonical_bonus": 0, "structural_score": 0, "raw_scr": 0.0, "semantic_density": 0.0, "structural_efficiency": 0.0}, "gpt2": null, "sentencepiece": null, "char_level": {"name": "CharLevel", "token_count": 35, "operator_nodes": 5, "tree_depth": 0, "parent_child_relations": 5, "function_scope": 0, "canonical_bonus": 0, "structural_score": 10, "raw_scr": 0.2857142857142857, "semantic_density": 0.14285714285714285, "structural_efficiency": 0.14285714285714285}, "scr_improvement_vs_gpt2": null, "scr_improvement_vs_sp": null, "scr_improvement_vs_char": 0.0, "notes": []}
|
| 49 |
+
{"expression": "The quadratic formula gives $x = \\frac{-b \\pm \\sqrt{b^2 - 4ac}}{2a}$.", "category": "mixed_text_math", "sexp": "(OP_EQ VAR_X (OP_MUL (FRAC (OP_NEG CONST_1) CONST_2) VAR_B VAR_PM (OP_RECIP VAR_A) (OP_POW (OP_ADD (OP_POW VAR_B CONST_2) (OP_MUL (OP_NEG CONST_4) VAR_A VAR_C)) (FRAC CONST_1 CONST_2))))", "mathtok": {"name": "MathTok", "token_count": 54, "operator_nodes": 11, "tree_depth": 6, "parent_child_relations": 11, "function_scope": 0, "canonical_bonus": 2, "structural_score": 30, "raw_scr": 0.5555555555555556, "semantic_density": 0.4444444444444444, "structural_efficiency": 0.2037037037037037}, "gpt2": null, "sentencepiece": null, "char_level": {"name": "CharLevel", "token_count": 69, "operator_nodes": 4, "tree_depth": 2, "parent_child_relations": 4, "function_scope": 0, "canonical_bonus": 0, "structural_score": 10, "raw_scr": 0.14492753623188406, "semantic_density": 0.057971014492753624, "structural_efficiency": 0.057971014492753624}, "scr_improvement_vs_gpt2": null, "scr_improvement_vs_sp": null, "scr_improvement_vs_char": 3.8333333333333335, "notes": []}
|
| 50 |
+
{"expression": "For $n \\geq 1$, the sum $\\sum_{k=1}^{n} k = \\frac{n(n+1)}{2}$.", "category": "mixed_text_math", "sexp": "(OP_GE VAR_N CONST_1) (OP_EQ (OP_MUL (FRAC CONST_1 CONST_2) (FUNC_N (OP_ADD CONST_1 VAR_N))) (OP_SUM VAR_K (FUNC_TUPLE VAR_K CONST_1 VAR_N)))", "mathtok": {"name": "MathTok", "token_count": 39, "operator_nodes": 6, "tree_depth": 4, "parent_child_relations": 8, "function_scope": 2, "canonical_bonus": 2, "structural_score": 22, "raw_scr": 0.5641025641025641, "semantic_density": 0.46153846153846156, "structural_efficiency": 0.20512820512820512}, "gpt2": null, "sentencepiece": null, "char_level": {"name": "CharLevel", "token_count": 62, "operator_nodes": 4, "tree_depth": 2, "parent_child_relations": 4, "function_scope": 0, "canonical_bonus": 0, "structural_score": 10, "raw_scr": 0.16129032258064516, "semantic_density": 0.06451612903225806, "structural_efficiency": 0.06451612903225806}, "scr_improvement_vs_gpt2": null, "scr_improvement_vs_sp": null, "scr_improvement_vs_char": 3.4974358974358974, "notes": []}
|
| 51 |
+
{"expression": "Integrate $\\int_0^1 x^2 dx$ to get $\\frac{1}{3}$.", "category": "mixed_text_math", "sexp": "(OP_INT (OP_POW VAR_X CONST_2) (FUNC_TUPLE VAR_X CONST_0 CONST_1)) (FRAC CONST_1 CONST_3)", "mathtok": {"name": "MathTok", "token_count": 33, "operator_nodes": 3, "tree_depth": 2, "parent_child_relations": 4, "function_scope": 1, "canonical_bonus": 2, "structural_score": 12, "raw_scr": 0.36363636363636365, "semantic_density": 0.3333333333333333, "structural_efficiency": 0.12121212121212122}, "gpt2": null, "sentencepiece": null, "char_level": {"name": "CharLevel", "token_count": 49, "operator_nodes": 2, "tree_depth": 1, "parent_child_relations": 2, "function_scope": 0, "canonical_bonus": 0, "structural_score": 5, "raw_scr": 0.10204081632653061, "semantic_density": 0.04081632653061224, "structural_efficiency": 0.04081632653061224}, "scr_improvement_vs_gpt2": null, "scr_improvement_vs_sp": null, "scr_improvement_vs_char": 3.5636363636363635, "notes": []}
|
| 52 |
+
{"expression": "If $a > 0$ and $b > 0$ then $\\log(a) + \\log(b) = \\log(ab)$.", "category": "mixed_text_math", "sexp": "(OP_GT VAR_A CONST_0) (OP_GT VAR_B CONST_0) (OP_EQ (FUNC_LOG (OP_MUL VAR_A VAR_B)) (OP_ADD (FUNC_LOG VAR_A) (FUNC_LOG VAR_B)))", "mathtok": {"name": "MathTok", "token_count": 38, "operator_nodes": 5, "tree_depth": 3, "parent_child_relations": 8, "function_scope": 3, "canonical_bonus": 2, "structural_score": 21, "raw_scr": 0.5526315789473685, "semantic_density": 0.42105263157894735, "structural_efficiency": 0.21052631578947367}, "gpt2": null, "sentencepiece": null, "char_level": {"name": "CharLevel", "token_count": 59, "operator_nodes": 2, "tree_depth": 1, "parent_child_relations": 2, "function_scope": 0, "canonical_bonus": 0, "structural_score": 5, "raw_scr": 0.0847457627118644, "semantic_density": 0.03389830508474576, "structural_efficiency": 0.03389830508474576}, "scr_improvement_vs_gpt2": null, "scr_improvement_vs_sp": null, "scr_improvement_vs_char": 6.521052631578948, "notes": []}
|
| 53 |
+
{"expression": "The area of a circle of radius r is pi*r^2.", "category": "mixed_text_math", "sexp": "(OP_MUL CONST_PI (OP_POW VAR_R FLOAT_2p0))", "mathtok": {"name": "MathTok", "token_count": 42, "operator_nodes": 2, "tree_depth": 2, "parent_child_relations": 2, "function_scope": 0, "canonical_bonus": 2, "structural_score": 8, "raw_scr": 0.19047619047619047, "semantic_density": 0.09523809523809523, "structural_efficiency": 0.047619047619047616}, "gpt2": null, "sentencepiece": null, "char_level": {"name": "CharLevel", "token_count": 43, "operator_nodes": 2, "tree_depth": 0, "parent_child_relations": 2, "function_scope": 0, "canonical_bonus": 0, "structural_score": 4, "raw_scr": 0.09302325581395349, "semantic_density": 0.046511627906976744, "structural_efficiency": 0.046511627906976744}, "scr_improvement_vs_gpt2": null, "scr_improvement_vs_sp": null, "scr_improvement_vs_char": 2.0476190476190474, "notes": []}
|
| 54 |
+
{"expression": "Euler's identity: $e^{i\\pi} + 1 = 0$.", "category": "mixed_text_math", "sexp": "(OP_EQ (OP_ADD CONST_1 (OP_POW VAR_E (OP_MUL VAR_I VAR_PI))) CONST_0)", "mathtok": {"name": "MathTok", "token_count": 29, "operator_nodes": 4, "tree_depth": 4, "parent_child_relations": 4, "function_scope": 0, "canonical_bonus": 2, "structural_score": 14, "raw_scr": 0.4827586206896552, "semantic_density": 0.3103448275862069, "structural_efficiency": 0.13793103448275862}, "gpt2": null, "sentencepiece": null, "char_level": {"name": "CharLevel", "token_count": 37, "operator_nodes": 3, "tree_depth": 1, "parent_child_relations": 3, "function_scope": 0, "canonical_bonus": 0, "structural_score": 7, "raw_scr": 0.1891891891891892, "semantic_density": 0.08108108108108109, "structural_efficiency": 0.08108108108108109}, "scr_improvement_vs_gpt2": null, "scr_improvement_vs_sp": null, "scr_improvement_vs_char": 2.5517241379310343, "notes": []}
|
| 55 |
+
{"expression": "sin(x^2)", "category": "latex_vs_ascii_ascii", "sexp": "(FUNC_SIN (OP_POW VAR_X CONST_2))", "mathtok": {"name": "MathTok", "token_count": 8, "operator_nodes": 1, "tree_depth": 2, "parent_child_relations": 2, "function_scope": 1, "canonical_bonus": 2, "structural_score": 8, "raw_scr": 1.0, "semantic_density": 0.5, "structural_efficiency": 0.25}, "gpt2": null, "sentencepiece": null, "char_level": {"name": "CharLevel", "token_count": 8, "operator_nodes": 1, "tree_depth": 1, "parent_child_relations": 1, "function_scope": 0, "canonical_bonus": 0, "structural_score": 3, "raw_scr": 0.375, "semantic_density": 0.125, "structural_efficiency": 0.125}, "scr_improvement_vs_gpt2": null, "scr_improvement_vs_sp": null, "scr_improvement_vs_char": 2.6666666666666665, "notes": ["pair_partner=\\sin(x^2)"]}
|
| 56 |
+
{"expression": "\\sin(x^2)", "category": "latex_vs_ascii_latex", "sexp": "(FUNC_SIN (OP_POW VAR_X CONST_2))", "mathtok": {"name": "MathTok", "token_count": 8, "operator_nodes": 1, "tree_depth": 2, "parent_child_relations": 2, "function_scope": 1, "canonical_bonus": 2, "structural_score": 8, "raw_scr": 1.0, "semantic_density": 0.5, "structural_efficiency": 0.25}, "gpt2": null, "sentencepiece": null, "char_level": {"name": "CharLevel", "token_count": 9, "operator_nodes": 1, "tree_depth": 1, "parent_child_relations": 1, "function_scope": 0, "canonical_bonus": 0, "structural_score": 3, "raw_scr": 0.3333333333333333, "semantic_density": 0.1111111111111111, "structural_efficiency": 0.1111111111111111}, "scr_improvement_vs_gpt2": null, "scr_improvement_vs_sp": null, "scr_improvement_vs_char": 3.0, "notes": ["pair_partner=sin(x^2)"]}
|
| 57 |
+
{"expression": "sqrt(x^2 + 1)", "category": "latex_vs_ascii_ascii", "sexp": "(OP_POW (OP_ADD CONST_1 (OP_POW VAR_X CONST_2)) (FRAC CONST_1 CONST_2))", "mathtok": {"name": "MathTok", "token_count": 11, "operator_nodes": 4, "tree_depth": 3, "parent_child_relations": 4, "function_scope": 0, "canonical_bonus": 2, "structural_score": 13, "raw_scr": 1.1818181818181819, "semantic_density": 0.8181818181818182, "structural_efficiency": 0.36363636363636365}, "gpt2": null, "sentencepiece": null, "char_level": {"name": "CharLevel", "token_count": 13, "operator_nodes": 2, "tree_depth": 1, "parent_child_relations": 2, "function_scope": 0, "canonical_bonus": 0, "structural_score": 5, "raw_scr": 0.38461538461538464, "semantic_density": 0.15384615384615385, "structural_efficiency": 0.15384615384615385}, "scr_improvement_vs_gpt2": null, "scr_improvement_vs_sp": null, "scr_improvement_vs_char": 3.0727272727272728, "notes": ["pair_partner=\\sqrt{x^2 + 1}"]}
|
| 58 |
+
{"expression": "\\sqrt{x^2 + 1}", "category": "latex_vs_ascii_latex", "sexp": "(OP_POW (OP_ADD CONST_1 (OP_POW VAR_X CONST_2)) (FRAC CONST_1 CONST_2))", "mathtok": {"name": "MathTok", "token_count": 11, "operator_nodes": 4, "tree_depth": 3, "parent_child_relations": 4, "function_scope": 0, "canonical_bonus": 2, "structural_score": 13, "raw_scr": 1.1818181818181819, "semantic_density": 0.8181818181818182, "structural_efficiency": 0.36363636363636365}, "gpt2": null, "sentencepiece": null, "char_level": {"name": "CharLevel", "token_count": 14, "operator_nodes": 2, "tree_depth": 1, "parent_child_relations": 2, "function_scope": 0, "canonical_bonus": 0, "structural_score": 5, "raw_scr": 0.35714285714285715, "semantic_density": 0.14285714285714285, "structural_efficiency": 0.14285714285714285}, "scr_improvement_vs_gpt2": null, "scr_improvement_vs_sp": null, "scr_improvement_vs_char": 3.309090909090909, "notes": ["pair_partner=sqrt(x^2 + 1)"]}
|
| 59 |
+
{"expression": "log(x)", "category": "latex_vs_ascii_ascii", "sexp": "(FUNC_LOG VAR_X)", "mathtok": {"name": "MathTok", "token_count": 6, "operator_nodes": 0, "tree_depth": 1, "parent_child_relations": 1, "function_scope": 1, "canonical_bonus": 2, "structural_score": 5, "raw_scr": 0.8333333333333334, "semantic_density": 0.3333333333333333, "structural_efficiency": 0.16666666666666666}, "gpt2": null, "sentencepiece": null, "char_level": {"name": "CharLevel", "token_count": 6, "operator_nodes": 0, "tree_depth": 1, "parent_child_relations": 0, "function_scope": 0, "canonical_bonus": 0, "structural_score": 1, "raw_scr": 0.16666666666666666, "semantic_density": 0.0, "structural_efficiency": 0.0}, "scr_improvement_vs_gpt2": null, "scr_improvement_vs_sp": null, "scr_improvement_vs_char": 5.000000000000001, "notes": ["pair_partner=\\ln(x)"]}
|
| 60 |
+
{"expression": "\\ln(x)", "category": "latex_vs_ascii_latex", "sexp": "(FUNC_LOG VAR_X)", "mathtok": {"name": "MathTok", "token_count": 6, "operator_nodes": 0, "tree_depth": 1, "parent_child_relations": 1, "function_scope": 1, "canonical_bonus": 2, "structural_score": 5, "raw_scr": 0.8333333333333334, "semantic_density": 0.3333333333333333, "structural_efficiency": 0.16666666666666666}, "gpt2": null, "sentencepiece": null, "char_level": {"name": "CharLevel", "token_count": 6, "operator_nodes": 0, "tree_depth": 1, "parent_child_relations": 0, "function_scope": 0, "canonical_bonus": 0, "structural_score": 1, "raw_scr": 0.16666666666666666, "semantic_density": 0.0, "structural_efficiency": 0.0}, "scr_improvement_vs_gpt2": null, "scr_improvement_vs_sp": null, "scr_improvement_vs_char": 5.000000000000001, "notes": ["pair_partner=log(x)"]}
|
| 61 |
+
{"expression": "exp(x)", "category": "latex_vs_ascii_ascii", "sexp": "(FUNC_EXP VAR_X)", "mathtok": {"name": "MathTok", "token_count": 6, "operator_nodes": 0, "tree_depth": 1, "parent_child_relations": 1, "function_scope": 1, "canonical_bonus": 2, "structural_score": 5, "raw_scr": 0.8333333333333334, "semantic_density": 0.3333333333333333, "structural_efficiency": 0.16666666666666666}, "gpt2": null, "sentencepiece": null, "char_level": {"name": "CharLevel", "token_count": 6, "operator_nodes": 0, "tree_depth": 1, "parent_child_relations": 0, "function_scope": 0, "canonical_bonus": 0, "structural_score": 1, "raw_scr": 0.16666666666666666, "semantic_density": 0.0, "structural_efficiency": 0.0}, "scr_improvement_vs_gpt2": null, "scr_improvement_vs_sp": null, "scr_improvement_vs_char": 5.000000000000001, "notes": ["pair_partner=e^x"]}
|
| 62 |
+
{"expression": "e^x", "category": "latex_vs_ascii_latex", "sexp": "(FUNC_EXP VAR_X)", "mathtok": {"name": "MathTok", "token_count": 6, "operator_nodes": 0, "tree_depth": 1, "parent_child_relations": 1, "function_scope": 1, "canonical_bonus": 2, "structural_score": 5, "raw_scr": 0.8333333333333334, "semantic_density": 0.3333333333333333, "structural_efficiency": 0.16666666666666666}, "gpt2": null, "sentencepiece": null, "char_level": {"name": "CharLevel", "token_count": 3, "operator_nodes": 1, "tree_depth": 0, "parent_child_relations": 1, "function_scope": 0, "canonical_bonus": 0, "structural_score": 2, "raw_scr": 0.6666666666666666, "semantic_density": 0.3333333333333333, "structural_efficiency": 0.3333333333333333}, "scr_improvement_vs_gpt2": null, "scr_improvement_vs_sp": null, "scr_improvement_vs_char": 1.2500000000000002, "notes": ["pair_partner=exp(x)"]}
|
| 63 |
+
{"expression": "x/y", "category": "latex_vs_ascii_ascii", "sexp": "(OP_MUL VAR_X (OP_RECIP VAR_Y))", "mathtok": {"name": "MathTok", "token_count": 6, "operator_nodes": 2, "tree_depth": 2, "parent_child_relations": 2, "function_scope": 0, "canonical_bonus": 2, "structural_score": 8, "raw_scr": 1.3333333333333333, "semantic_density": 0.6666666666666666, "structural_efficiency": 0.3333333333333333}, "gpt2": null, "sentencepiece": null, "char_level": {"name": "CharLevel", "token_count": 3, "operator_nodes": 1, "tree_depth": 0, "parent_child_relations": 1, "function_scope": 0, "canonical_bonus": 0, "structural_score": 2, "raw_scr": 0.6666666666666666, "semantic_density": 0.3333333333333333, "structural_efficiency": 0.3333333333333333}, "scr_improvement_vs_gpt2": null, "scr_improvement_vs_sp": null, "scr_improvement_vs_char": 2.0, "notes": ["pair_partner=\\frac{x}{y}"]}
|
| 64 |
+
{"expression": "\\frac{x}{y}", "category": "latex_vs_ascii_latex", "sexp": "(OP_MUL VAR_X (OP_RECIP VAR_Y))", "mathtok": {"name": "MathTok", "token_count": 6, "operator_nodes": 2, "tree_depth": 2, "parent_child_relations": 2, "function_scope": 0, "canonical_bonus": 2, "structural_score": 8, "raw_scr": 1.3333333333333333, "semantic_density": 0.6666666666666666, "structural_efficiency": 0.3333333333333333}, "gpt2": null, "sentencepiece": null, "char_level": {"name": "CharLevel", "token_count": 11, "operator_nodes": 0, "tree_depth": 1, "parent_child_relations": 0, "function_scope": 0, "canonical_bonus": 0, "structural_score": 1, "raw_scr": 0.09090909090909091, "semantic_density": 0.0, "structural_efficiency": 0.0}, "scr_improvement_vs_gpt2": null, "scr_improvement_vs_sp": null, "scr_improvement_vs_char": 14.666666666666666, "notes": ["pair_partner=x/y"]}
|
| 65 |
+
{"expression": "int(x^2, x)", "category": "latex_vs_ascii_ascii", "sexp": "[UNK]", "mathtok": {"name": "MathTok", "token_count": 1, "operator_nodes": 0, "tree_depth": 0, "parent_child_relations": 0, "function_scope": 0, "canonical_bonus": 0, "structural_score": 0, "raw_scr": 0.0, "semantic_density": 0.0, "structural_efficiency": 0.0}, "gpt2": null, "sentencepiece": null, "char_level": {"name": "CharLevel", "token_count": 11, "operator_nodes": 1, "tree_depth": 1, "parent_child_relations": 1, "function_scope": 0, "canonical_bonus": 0, "structural_score": 3, "raw_scr": 0.2727272727272727, "semantic_density": 0.09090909090909091, "structural_efficiency": 0.09090909090909091}, "scr_improvement_vs_gpt2": null, "scr_improvement_vs_sp": null, "scr_improvement_vs_char": 0.0, "notes": ["pair_partner=\\int x^2 dx"]}
|
| 66 |
+
{"expression": "\\int x^2 dx", "category": "latex_vs_ascii_latex", "sexp": "(OP_INT (OP_POW VAR_X CONST_2) (FUNC_TUPLE VAR_X))", "mathtok": {"name": "MathTok", "token_count": 10, "operator_nodes": 2, "tree_depth": 2, "parent_child_relations": 3, "function_scope": 1, "canonical_bonus": 2, "structural_score": 10, "raw_scr": 1.0, "semantic_density": 0.6, "structural_efficiency": 0.3}, "gpt2": null, "sentencepiece": null, "char_level": {"name": "CharLevel", "token_count": 11, "operator_nodes": 1, "tree_depth": 0, "parent_child_relations": 1, "function_scope": 0, "canonical_bonus": 0, "structural_score": 2, "raw_scr": 0.18181818181818182, "semantic_density": 0.09090909090909091, "structural_efficiency": 0.09090909090909091}, "scr_improvement_vs_gpt2": null, "scr_improvement_vs_sp": null, "scr_improvement_vs_char": 5.5, "notes": ["pair_partner=int(x^2, x)"]}
|
| 67 |
+
{"expression": "diff(sin(x), x)", "category": "latex_vs_ascii_ascii", "sexp": "(FUNC_COS VAR_X)", "mathtok": {"name": "MathTok", "token_count": 6, "operator_nodes": 0, "tree_depth": 1, "parent_child_relations": 1, "function_scope": 1, "canonical_bonus": 2, "structural_score": 5, "raw_scr": 0.8333333333333334, "semantic_density": 0.3333333333333333, "structural_efficiency": 0.16666666666666666}, "gpt2": null, "sentencepiece": null, "char_level": {"name": "CharLevel", "token_count": 15, "operator_nodes": 0, "tree_depth": 2, "parent_child_relations": 0, "function_scope": 0, "canonical_bonus": 0, "structural_score": 2, "raw_scr": 0.13333333333333333, "semantic_density": 0.0, "structural_efficiency": 0.0}, "scr_improvement_vs_gpt2": null, "scr_improvement_vs_sp": null, "scr_improvement_vs_char": 6.25, "notes": ["pair_partner=\\frac{d}{dx}\\sin(x)"]}
|
| 68 |
+
{"expression": "\\frac{d}{dx}\\sin(x)", "category": "latex_vs_ascii_latex", "sexp": "(FUNC_COS VAR_X)", "mathtok": {"name": "MathTok", "token_count": 6, "operator_nodes": 0, "tree_depth": 1, "parent_child_relations": 1, "function_scope": 1, "canonical_bonus": 2, "structural_score": 5, "raw_scr": 0.8333333333333334, "semantic_density": 0.3333333333333333, "structural_efficiency": 0.16666666666666666}, "gpt2": null, "sentencepiece": null, "char_level": {"name": "CharLevel", "token_count": 19, "operator_nodes": 0, "tree_depth": 1, "parent_child_relations": 0, "function_scope": 0, "canonical_bonus": 0, "structural_score": 1, "raw_scr": 0.05263157894736842, "semantic_density": 0.0, "structural_efficiency": 0.0}, "scr_improvement_vs_gpt2": null, "scr_improvement_vs_sp": null, "scr_improvement_vs_char": 15.833333333333336, "notes": ["pair_partner=diff(sin(x), x)"]}
|
| 69 |
+
{"expression": "factorial(n)", "category": "latex_vs_ascii_ascii", "sexp": "(FUNC_FACTORIAL VAR_N)", "mathtok": {"name": "MathTok", "token_count": 6, "operator_nodes": 0, "tree_depth": 1, "parent_child_relations": 1, "function_scope": 1, "canonical_bonus": 2, "structural_score": 5, "raw_scr": 0.8333333333333334, "semantic_density": 0.3333333333333333, "structural_efficiency": 0.16666666666666666}, "gpt2": null, "sentencepiece": null, "char_level": {"name": "CharLevel", "token_count": 12, "operator_nodes": 0, "tree_depth": 1, "parent_child_relations": 0, "function_scope": 0, "canonical_bonus": 0, "structural_score": 1, "raw_scr": 0.08333333333333333, "semantic_density": 0.0, "structural_efficiency": 0.0}, "scr_improvement_vs_gpt2": null, "scr_improvement_vs_sp": null, "scr_improvement_vs_char": 10.000000000000002, "notes": ["pair_partner=n!"]}
|
| 70 |
+
{"expression": "n!", "category": "latex_vs_ascii_latex", "sexp": "(FUNC_FACTORIAL VAR_N)", "mathtok": {"name": "MathTok", "token_count": 6, "operator_nodes": 0, "tree_depth": 1, "parent_child_relations": 1, "function_scope": 1, "canonical_bonus": 2, "structural_score": 5, "raw_scr": 0.8333333333333334, "semantic_density": 0.3333333333333333, "structural_efficiency": 0.16666666666666666}, "gpt2": null, "sentencepiece": null, "char_level": {"name": "CharLevel", "token_count": 2, "operator_nodes": 0, "tree_depth": 0, "parent_child_relations": 0, "function_scope": 0, "canonical_bonus": 0, "structural_score": 0, "raw_scr": 0.0, "semantic_density": 0.0, "structural_efficiency": 0.0}, "scr_improvement_vs_gpt2": null, "scr_improvement_vs_sp": null, "scr_improvement_vs_char": 0.0, "notes": ["pair_partner=factorial(n)"]}
|
evaluation/visualize.py
ADDED
|
@@ -0,0 +1,371 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Visualization Script for MathTok Evaluation Results
|
| 3 |
+
===================================================
|
| 4 |
+
|
| 5 |
+
Generates visual charts from the benchmark comparison results, making
|
| 6 |
+
it easy to understand the performance differences in Semantic Compression Ratio (SCR),
|
| 7 |
+
Canonical Consistency Score (CCS), and more.
|
| 8 |
+
|
| 9 |
+
Usage:
|
| 10 |
+
python -m evaluation.visualize
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
import json
|
| 14 |
+
from pathlib import Path
|
| 15 |
+
import matplotlib.pyplot as plt
|
| 16 |
+
import seaborn as sns
|
| 17 |
+
import pandas as pd
|
| 18 |
+
|
| 19 |
+
# Directory next to this module that holds the benchmark outputs the loaders
# below read and into which the generated charts are saved.
_RESULTS_DIR = Path(__file__).parent / "results"
|
| 20 |
+
|
| 21 |
+
def load_summary(summary_path=None):
    """Load the aggregated comparison summary.

    Args:
        summary_path: Optional path to the summary JSON file. When omitted,
            ``comparison_summary.json`` inside the module's results
            directory is used.

    Returns:
        The decoded JSON document.

    Raises:
        FileNotFoundError: If the summary file does not exist.
    """
    if summary_path is None:
        summary_path = _RESULTS_DIR / "comparison_summary.json"
    else:
        summary_path = Path(summary_path)

    # Open first and translate the failure instead of checking ``exists()``
    # beforehand, so the file cannot vanish between the check and the open.
    try:
        handle = open(summary_path, "r", encoding="utf-8")
    except FileNotFoundError as err:
        raise FileNotFoundError(
            f"Results summary not found at {summary_path}. Run comparison.py first."
        ) from err
    with handle:
        return json.load(handle)
|
| 27 |
+
|
| 28 |
+
def load_jsonl_results(results_path=None):
    """Load the per-expression comparison records from a JSON Lines file.

    Args:
        results_path: Optional path to the JSONL file. When omitted,
            ``comparison_results.jsonl`` inside the module's results
            directory is used.

    Returns:
        A list with one decoded JSON value per non-blank line, in file
        order. An empty list is returned when the file does not exist.
    """
    if results_path is None:
        results_path = _RESULTS_DIR / "comparison_results.jsonl"
    else:
        results_path = Path(results_path)

    if not results_path.exists():
        return []

    with open(results_path, "r", encoding="utf-8") as f:
        # Blank lines (e.g. a stray empty line between or after records) are
        # not valid JSON and would make json.loads raise, so skip them.
        return [json.loads(line) for line in f if line.strip()]
|
| 37 |
+
|
| 38 |
+
def plot_aggregated_scr(summary):
    """Plot the overall mean Semantic Compression Ratio per tokenizer.

    One bar is drawn for every tokenizer whose mean SCR in ``summary`` is
    present and greater than zero, and the chart is saved as
    ``scr_comparison.png`` in the results directory. When no tokenizer
    qualifies, nothing is drawn or written.

    Args:
        summary: Mapping holding the mean SCR values under the keys
            ``charlevel_mean_scr``, ``gpt2_mean_scr`` (or ``gpt2_scr``),
            ``sentencepiece_mean_scr`` and ``mathtok_mean_scr``.
    """
    # NOTE(review): the sibling keys follow the ``<name>_mean_scr`` pattern,
    # so that spelling is tried first for GPT-2 and ``gpt2_scr`` is kept as a
    # fallback -- confirm which key comparison.py actually writes.
    gpt2_scr = summary.get("gpt2_mean_scr", summary.get("gpt2_scr", 0))

    candidates = [
        ("Char-level", summary.get("charlevel_mean_scr", 0), "#EF4444"),
        ("GPT-2", gpt2_scr, "#6B7280"),
        ("SentencePiece", summary.get("sentencepiece_mean_scr", 0), "#3B82F6"),
        ("MathTok", summary.get("mathtok_mean_scr", 0), "#10B981"),
    ]

    # Filter out missing models (like GPT-2 if not run)
    shown = [(m, s, c) for m, s, c in candidates if s is not None and s > 0]
    if not shown:
        # Same convention as the other plot functions: no data, no chart.
        return

    valid_models, valid_scrs, colors = (list(column) for column in zip(*shown))

    fig, ax = plt.subplots(figsize=(8, 6))
    # Passing ``palette`` without ``hue`` is deprecated in seaborn 0.13, so
    # the x variable doubles as hue; ``dodge=False`` keeps full-width bars.
    sns.barplot(x=valid_models, y=valid_scrs, hue=valid_models,
                palette=colors, dodge=False, ax=ax)
    # The hue legend would only repeat the x tick labels.
    legend = ax.get_legend()
    if legend is not None:
        legend.remove()

    ax.set_title("Mean Semantic Compression Ratio (SCR)\n(Higher is Better)", fontsize=14, fontweight='bold', pad=15)
    ax.set_ylabel("SCR (Structural Score / Tokens)", fontsize=12)
    sns.despine(ax=ax)

    # Add value labels
    for i, v in enumerate(valid_scrs):
        ax.text(i, v + 0.02, f"{v:.3f}", ha='center', fontweight='bold', fontsize=11)

    fig.tight_layout()
    out_path = _RESULTS_DIR / "scr_comparison.png"
    fig.savefig(out_path, dpi=300)
    print(f"Saved {out_path}")
    plt.close(fig)
|
| 81 |
+
|
| 82 |
+
def plot_category_scr(records):
    """Draw a grouped bar chart of mean SCR per expression category.

    Records whose category contains ``mixed`` or ``latex_vs_ascii`` are left
    out. MathTok and char-level scores are taken from every remaining
    record; GPT-2 and SentencePiece scores are added only when the record
    carries them. The chart is saved as ``scr_by_category.png`` in the
    results directory; nothing happens when no record remains.
    """
    tokenizer_colors = {
        "MathTok": "#10B981",
        "GPT-2": "#6B7280",
        "SentencePiece": "#3B82F6",
        "Char-level": "#EF4444",
    }
    always_present = (("MathTok", "mathtok"), ("Char-level", "char_level"))
    maybe_present = (("GPT-2", "gpt2"), ("SentencePiece", "sentencepiece"))

    rows = []
    for record in records:
        category = record["category"]
        # Focus on standard mathematical metrics for SCR
        if "mixed" in category or "latex_vs_ascii" in category:
            continue

        for label, key in always_present:
            rows.append({"Category": category, "Model": label, "SCR": record[key]["raw_scr"]})
        for label, key in maybe_present:
            metrics = record.get(key)
            if metrics and metrics.get("raw_scr") is not None:
                rows.append({"Category": category, "Model": label, "SCR": metrics["raw_scr"]})

    if not rows:
        return

    frame = pd.DataFrame(rows)

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(
        data=frame,
        x="Category",
        y="SCR",
        hue="Model",
        palette=tokenizer_colors,
        errorbar=None,
        ax=ax,
    )

    ax.set_title("Semantic Compression Ratio by Category", fontsize=14, fontweight='bold', pad=15)
    ax.set_ylabel("Mean SCR", fontsize=12)
    ax.set_xlabel("Expression Category", fontsize=12)
    sns.despine(ax=ax)
    plt.xticks(rotation=15)
    ax.legend(title="Tokenizer")

    fig.tight_layout()
    out_path = _RESULTS_DIR / "scr_by_category.png"
    fig.savefig(out_path, dpi=300)
    print(f"Saved {out_path}")
    plt.close(fig)
|
| 119 |
+
|
| 120 |
+
def plot_token_counts(summary, max_expressions=15):
    """Plot per-expression token counts for a leading sample of records.

    Reads ``summary["per_record"]`` and draws, for the first
    ``max_expressions`` entries, one bar per tokenizer. The chart is saved
    as ``token_counts_sample.png`` in the results directory. Nothing is
    drawn when there are no records to show.

    Args:
        summary: Mapping whose ``per_record`` list holds entries with an
            ``expression`` plus ``mt_tokens`` and ``ch_tokens`` counts, and
            optionally ``gp_tokens`` / ``sp_tokens``.
        max_expressions: How many leading records to plot; kept small for
            readability.
    """
    per_record = summary.get("per_record", [])
    if not per_record:
        return

    rows = []
    used_labels = set()
    for i, r in enumerate(per_record[:max_expressions]):
        expression = r["expression"]
        label = expression[:15] + ".." if len(expression) > 15 else expression
        if label in used_labels:
            # Two records with the same (truncated) label would be merged by
            # barplot into one averaged bar; the index keeps them apart.
            label = f"{label} [{i}]"
        used_labels.add(label)

        rows.append({"Expression": label, "Model": "MathTok", "Tokens": r["mt_tokens"]})
        rows.append({"Expression": label, "Model": "Char-level", "Tokens": r["ch_tokens"]})
        if r.get("gp_tokens"):
            rows.append({"Expression": label, "Model": "GPT-2", "Tokens": r["gp_tokens"]})
        if r.get("sp_tokens"):
            rows.append({"Expression": label, "Model": "SentencePiece", "Tokens": r["sp_tokens"]})

    if not rows:
        return

    # Rows are appended in record order, so the frame is already in plotting
    # order; re-sorting with pandas' default (non-stable) sort could shuffle
    # the tokenizers within an expression and change the hue order.
    df = pd.DataFrame(rows)

    fig, ax = plt.subplots(figsize=(12, 6))
    sns.barplot(data=df, x="Expression", y="Tokens", hue="Model",
                palette={"MathTok": "#10B981", "GPT-2": "#6B7280", "SentencePiece": "#3B82F6", "Char-level": "#EF4444"}, ax=ax)

    ax.set_title("Token Counts per Expression (Fewer is usually better, but SCR is the true metric)", fontsize=14, fontweight='bold', pad=15)
    ax.set_ylabel("Number of Tokens", fontsize=12)
    sns.despine(ax=ax)
    plt.xticks(rotation=45, ha='right')
    ax.legend(title="Tokenizer")

    fig.tight_layout()
    out_path = _RESULTS_DIR / "token_counts_sample.png"
    fig.savefig(out_path, dpi=300)
    print(f"Saved {out_path}")
    plt.close(fig)
|
| 159 |
+
|
| 160 |
+
def plot_semantic_density(records):
    """Plot the mean Semantic Density of each tokenizer as a bar chart.

    Parameters
    ----------
    records : list[dict]
        Per-expression result records. Each may hold a ``char_level``,
        ``gpt2``, ``sentencepiece`` and/or ``mathtok`` dict carrying a
        ``semantic_density`` value.

    Returns
    -------
    None
        The figure is written to
        ``_RESULTS_DIR / "semantic_density_comparison.png"``. Tokenizers whose
        mean is not positive are left out; when none has data, nothing is
        drawn and no file is written.
    """
    def mean_density(record_key):
        # Every tokenizer gets the same guard: records where the tokenizer
        # entry is absent/empty or the metric is missing/None are skipped
        # instead of raising.
        values = [
            entry["semantic_density"]
            for entry in (r.get(record_key) for r in records)
            if entry and entry.get("semantic_density") is not None
        ]
        return sum(values) / len(values) if values else 0.0

    candidates = [
        ("Char-level", mean_density("char_level"), "#EF4444"),
        ("GPT-2", mean_density("gpt2"), "#6B7280"),
        ("SentencePiece", mean_density("sentencepiece"), "#3B82F6"),
        ("MathTok", mean_density("mathtok"), "#10B981"),
    ]
    shown = [candidate for candidate in candidates if candidate[1] > 0]
    if not shown:
        print("No semantic density data available; skipping semantic_density_comparison.png")
        return

    valid_models = [model for model, _value, _color in shown]
    valid_dens = [value for _model, value, _color in shown]
    colors = [color for _model, _value, color in shown]

    fig, ax = plt.subplots(figsize=(8, 6))
    sns.barplot(x=valid_models, y=valid_dens, palette=colors, ax=ax)
    ax.set_title("Mean Semantic Density\n(Ratio of Math-Centric Tokens to Total Tokens)", fontsize=14, fontweight='bold', pad=15)
    ax.set_ylabel("Semantic Density Score (Higher is Better)", fontsize=12)
    sns.despine(ax=ax)

    # Print each bar's value just above it.
    for i, v in enumerate(valid_dens):
        ax.text(i, v + 0.01, f"{v:.3f}", ha='center', fontweight='bold', fontsize=11)

    fig.tight_layout()
    out_path = _RESULTS_DIR / "semantic_density_comparison.png"
    fig.savefig(out_path, dpi=300)
    print(f"Saved {out_path}")
    plt.close(fig)
|
| 201 |
+
|
| 202 |
+
def plot_structural_efficiency(records):
    """Plot the mean Structural Efficiency of each tokenizer as a bar chart.

    Parameters
    ----------
    records : list[dict]
        Per-expression result records. Each may hold a ``char_level``,
        ``gpt2``, ``sentencepiece`` and/or ``mathtok`` dict carrying a
        ``structural_efficiency`` value.

    Returns
    -------
    None
        The figure is written to
        ``_RESULTS_DIR / "structural_efficiency_comparison.png"``. Tokenizers
        whose mean is not positive are left out; when none has data, nothing
        is drawn and no file is written.
    """
    def mean_efficiency(record_key):
        # Every tokenizer gets the same guard: records where the tokenizer
        # entry is absent/empty or the metric is missing/None are skipped
        # instead of raising.
        values = [
            entry["structural_efficiency"]
            for entry in (r.get(record_key) for r in records)
            if entry and entry.get("structural_efficiency") is not None
        ]
        return sum(values) / len(values) if values else 0.0

    candidates = [
        ("Char-level", mean_efficiency("char_level"), "#EF4444"),
        ("GPT-2", mean_efficiency("gpt2"), "#6B7280"),
        ("SentencePiece", mean_efficiency("sentencepiece"), "#3B82F6"),
        ("MathTok", mean_efficiency("mathtok"), "#10B981"),
    ]
    shown = [candidate for candidate in candidates if candidate[1] > 0]
    if not shown:
        print("No structural efficiency data available; skipping structural_efficiency_comparison.png")
        return

    valid_models = [model for model, _value, _color in shown]
    valid_eff = [value for _model, value, _color in shown]
    colors = [color for _model, _value, color in shown]

    fig, ax = plt.subplots(figsize=(8, 6))
    sns.barplot(x=valid_models, y=valid_eff, palette=colors, ax=ax)
    ax.set_title("Mean Structural Efficiency\n(Parent-Child Relations per Token)", fontsize=14, fontweight='bold', pad=15)
    ax.set_ylabel("Structural Efficiency Score (Higher is Better)", fontsize=12)
    sns.despine(ax=ax)

    # Print each bar's value just above it.
    for i, v in enumerate(valid_eff):
        ax.text(i, v + 0.01, f"{v:.3f}", ha='center', fontweight='bold', fontsize=11)

    fig.tight_layout()
    out_path = _RESULTS_DIR / "structural_efficiency_comparison.png"
    fig.savefig(out_path, dpi=300)
    print(f"Saved {out_path}")
    plt.close(fig)
|
| 243 |
+
|
| 244 |
+
def plot_unified_dashboard(summary, records):
    """Draw a three-panel dashboard: SCR, Semantic Density, Structural Efficiency.

    Parameters
    ----------
    summary : dict
        Benchmark summary; the per-tokenizer SCR values are read from the
        ``charlevel_mean_scr``, ``gpt2_scr``, ``sentencepiece_mean_scr`` and
        ``mathtok_mean_scr`` keys (missing keys count as 0).
    records : list[dict]
        Per-expression result records; the ``semantic_density`` and
        ``structural_efficiency`` values of each tokenizer entry are averaged.

    Returns
    -------
    None
        The figure is written to ``_RESULTS_DIR / "metrics_dashboard.png"``.
        In every panel, tokenizers without a positive value are left out.
    """
    # (display name, record key, bar colour) — shared by all three panels.
    tokenizers = [
        ("Char-level", "char_level", "#EF4444"),
        ("GPT-2", "gpt2", "#6B7280"),
        ("SentencePiece", "sentencepiece", "#3B82F6"),
        ("MathTok", "mathtok", "#10B981"),
    ]

    def mean_metric(record_key, metric):
        """Mean of ``metric`` over records that carry a non-None value (0.0 if none)."""
        values = [
            entry[metric]
            for entry in (r.get(record_key) for r in records)
            if entry and entry.get(metric) is not None
        ]
        return sum(values) / len(values) if values else 0.0

    def draw_panel(ax, values, title, ylabel, label_offset):
        """Draw one bar panel; ``values`` is aligned with ``tokenizers``."""
        shown = [
            (name, value, color)
            for (name, _key, color), value in zip(tokenizers, values)
            if value is not None and value > 0
        ]
        # With nothing to show the panel keeps its title and axis label but
        # no bar plot is attempted on empty data.
        if shown:
            names = [name for name, _value, _color in shown]
            heights = [value for _name, value, _color in shown]
            colors = [color for _name, _value, color in shown]
            sns.barplot(x=names, y=heights, palette=colors, ax=ax)
            for i, v in enumerate(heights):
                ax.text(i, v + label_offset, f"{v:.3f}", ha='center', fontweight='bold', fontsize=10)
        ax.set_title(title, fontsize=12, fontweight='bold', pad=10)
        ax.set_ylabel(ylabel, fontsize=10)
        sns.despine(ax=ax)

    fig, axes = plt.subplots(1, 3, figsize=(18, 5.5))

    # 1. SCR
    # NOTE(review): "gpt2_scr" does not follow the "*_mean_scr" pattern of the
    # other three keys — confirm against the code that writes the summary.
    scr_values = [
        summary.get("charlevel_mean_scr", 0),
        summary.get("gpt2_scr", 0),
        summary.get("sentencepiece_mean_scr", 0),
        summary.get("mathtok_mean_scr", 0),
    ]
    draw_panel(axes[0], scr_values,
               "Semantic Compression Ratio (SCR)", "SCR Score (Higher is Better)", 0.02)

    # 2. Semantic Density
    density_values = [mean_metric(key, "semantic_density") for _name, key, _color in tokenizers]
    draw_panel(axes[1], density_values,
               "Semantic Density", "Density Score (Higher is Better)", 0.01)

    # 3. Structural Efficiency
    efficiency_values = [mean_metric(key, "structural_efficiency") for _name, key, _color in tokenizers]
    draw_panel(axes[2], efficiency_values,
               "Structural Efficiency", "Efficiency Score (Higher is Better)", 0.01)

    fig.suptitle("MathTok Comparative Evaluation Framework — Unified Dashboard", fontsize=16, fontweight='bold', y=1.02)
    fig.tight_layout()
    out_path = _RESULTS_DIR / "metrics_dashboard.png"
    # bbox_inches='tight' keeps the suptitle (placed above the axes at y=1.02) in the image.
    fig.savefig(out_path, dpi=300, bbox_inches='tight')
    print(f"Saved {out_path}")
    plt.close(fig)
|
| 345 |
+
|
| 346 |
+
def main():
    """Generate every benchmark visualization from the saved results.

    Loads the summary and the per-record results, then calls each plotting
    function. The per-record plots are skipped when no records were loaded.

    Returns
    -------
    int
        0 when every plot was generated, 1 when any step raised. The failure
        is reported on stderr together with its traceback so that it is not
        mistaken for a successful run.
    """
    import sys
    import traceback

    print("Generating visualizations from benchmark results...")

    # Set nice styling
    sns.set_theme(style="whitegrid", rc={"grid.alpha": 0.3})

    try:
        summary = load_summary()
        records = load_jsonl_results()

        plot_aggregated_scr(summary)

        if records:
            plot_category_scr(records)
            plot_semantic_density(records)
            plot_structural_efficiency(records)
            plot_unified_dashboard(summary, records)

        plot_token_counts(summary)
    except Exception as exc:
        # Top-level boundary: keep the short message, but also show where the
        # failure came from and signal it through the return value.
        print(f"Error generating visualizations: {exc}", file=sys.stderr)
        traceback.print_exc()
        return 1

    print("\nAll visualizations generated successfully in evaluation/results/.")
    return 0


if __name__ == "__main__":
    # Propagate main()'s status as the process exit code.
    raise SystemExit(main())
|
mathtok/__init__.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
MathTok: A Hybrid Canonicalized AST-Based Tokenization Framework
|
| 3 |
+
for Mathematical Language Modeling.
|
| 4 |
+
|
| 5 |
+
Paper: "MathTok: A Hybrid Canonicalized AST-Based Tokenization Framework
|
| 6 |
+
for Mathematical Language Modeling"
|
| 7 |
+
|
| 8 |
+
Pipeline stages
|
| 9 |
+
───────────────
|
| 10 |
+
1. Canonicalization — normalize mathematically equivalent forms
|
| 11 |
+
2. Hybrid Lexer — split text / math spans (LaTeX + ASCII)
|
| 12 |
+
3. AST Generator — SymPy expression → typed ASTNode tree
|
| 13 |
+
4. Operator Registry — semantic metadata per operator/function
|
| 14 |
+
5. Serializer — DFS preorder flattening of tree
|
| 15 |
+
6. Metadata — per-token structural attention hints
|
| 16 |
+
7. Vocabulary — fixed math vocab + BPE text; HF-compatible
|
| 17 |
+
"""
|
| 18 |
+
|
| 19 |
+
from .pipeline import MathTokPipeline
|
| 20 |
+
from .canonicalizer import Canonicalizer, CanonicalizationResult
|
| 21 |
+
from .lexer import HybridLexer, LexSpan, SpanType
|
| 22 |
+
from .ast_generator import ASTGenerator, ASTNode
|
| 23 |
+
from .operator_registry import OPERATOR_REGISTRY, OperatorMeta, get_operator, get_all_operator_tokens, INVERSE_PAIRS
|
| 24 |
+
from .serializer import StructuralSerializer, SerializedToken
|
| 25 |
+
from .metadata import MetadataGenerator, TokenMetadata
|
| 26 |
+
from .vocabulary import MathTokVocabulary, MathTokHFTokenizer
|
| 27 |
+
from .validator import RoundTripValidator, ValidationResult
|
| 28 |
+
from .streaming import MathTokStreamingPipeline
|
| 29 |
+
|
| 30 |
+
# Package version string.
__version__ = "0.1.0"
# Public API of the package: one line per source module, listing the names
# imported from it by this file.
__all__ = [
    "MathTokPipeline",
    "Canonicalizer", "CanonicalizationResult",
    "HybridLexer", "LexSpan", "SpanType",
    "ASTGenerator", "ASTNode",
    "OperatorMeta", "OPERATOR_REGISTRY", "get_operator", "get_all_operator_tokens", "INVERSE_PAIRS",
    "StructuralSerializer", "SerializedToken",
    "MetadataGenerator", "TokenMetadata",
    "MathTokVocabulary", "MathTokHFTokenizer",
    "RoundTripValidator", "ValidationResult",
    "MathTokStreamingPipeline",
]
|
mathtok/ast_generator.py
ADDED
|
@@ -0,0 +1,334 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Layer 3: AST Generator
|
| 3 |
+
|
| 4 |
+
Converts a canonical SymPy expression into a typed ASTNode tree.
|
| 5 |
+
Each node carries:
|
| 6 |
+
- token : MathTok vocabulary string (e.g. "OP_ADD", "VAR_X")
|
| 7 |
+
- sympy_expr : the original SymPy subexpression
|
| 8 |
+
- children : ordered child ASTNodes
|
| 9 |
+
- depth : 0 = root
|
| 10 |
+
- node_id : unique integer assigned by DFS counter
|
| 11 |
+
- parent_id : -1 for root
|
| 12 |
+
|
| 13 |
+
The tree faithfully mirrors the SymPy internal representation while
|
| 14 |
+
mapping SymPy types onto the richer MathTok operator vocabulary.
|
| 15 |
+
|
| 16 |
+
Key design decisions
|
| 17 |
+
────────────────────
|
| 18 |
+
• Mul(-1, x) → OP_NEG(x) (detect unary negation)
|
| 19 |
+
• Pow(x, -1) → OP_RECIP(x) (detect reciprocal)
|
| 20 |
+
• Rational(p, q) → FRAC(p, q) (explicit fraction node)
|
| 21 |
+
• Unknown functions → FUNC_<NAME> (graceful fallback)
|
| 22 |
+
"""
|
| 23 |
+
|
| 24 |
+
from __future__ import annotations
|
| 25 |
+
|
| 26 |
+
import logging
|
| 27 |
+
from dataclasses import dataclass, field
|
| 28 |
+
from typing import Any, Optional
|
| 29 |
+
|
| 30 |
+
import sympy as sp
|
| 31 |
+
from sympy import (
|
| 32 |
+
Add, Mul, Pow, Symbol, Integer, Rational, Float, Number,
|
| 33 |
+
Abs, Derivative, Integral, Limit, Sum, Product,
|
| 34 |
+
sin, cos, tan, asin, acos, atan, sinh, cosh, tanh,
|
| 35 |
+
exp, log, sqrt, gamma, factorial, floor, ceiling, re, im,
|
| 36 |
+
Eq, Ne, Lt, Gt, Le, Ge,
|
| 37 |
+
S,
|
| 38 |
+
)
|
| 39 |
+
|
| 40 |
+
logger = logging.getLogger(__name__)
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
# ── ASTNode dataclass ──────────────────────────────────────────────────────
|
| 44 |
+
|
| 45 |
+
@dataclass
class ASTNode:
    """
    One node of the MathTok abstract syntax tree.

    Attributes
    ----------
    token : str
        MathTok vocabulary token, e.g. "OP_ADD", "VAR_X", "CONST_2".
    sympy_expr : Any
        The SymPy (sub)expression this node was built from.
    children : list[ASTNode]
        Child nodes, in order; a fresh empty list by default.
    depth : int
        Distance from the root (the root is at depth 0).
    node_id : int
        Integer ID given to the node when the tree is built (-1 if unset).
    parent_id : int
        ID of the parent node; -1 for the root.
    confidence : float
        Defaults to 1.0.
    """
    token: str
    sympy_expr: Any
    children: list[ASTNode] = field(default_factory=list)
    depth: int = 0
    node_id: int = -1
    parent_id: int = -1
    confidence: float = 1.0

    @property
    def is_leaf(self) -> bool:
        """Whether the node has no children."""
        return not self.children

    @property
    def subtree_size(self) -> int:
        """Number of nodes in the subtree rooted here, this node included."""
        total = 1
        for child in self.children:
            total += child.subtree_size
        return total

    @property
    def height(self) -> int:
        """Length of the longest downward path to a leaf (0 for a leaf)."""
        if self.children:
            return 1 + max(child.height for child in self.children)
        return 0

    def __repr__(self) -> str:
        # Leaves print as the bare token, inner nodes as TOKEN(child, child, ...).
        if not self.children:
            return self.token
        inner = ", ".join(map(repr, self.children))
        return f"{self.token}({inner})"

    def to_dict(self) -> dict:
        """Return the subtree as nested plain dicts (``sympy_expr`` is left out)."""
        payload = dict(
            token=self.token,
            node_id=self.node_id,
            parent_id=self.parent_id,
            depth=self.depth,
            is_leaf=self.is_leaf,
            subtree_size=self.subtree_size,
            confidence=self.confidence,
        )
        payload["children"] = [child.to_dict() for child in self.children]
        return payload
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
# ── SymPy type → MathTok token mapping ────────────────────────────────────
|
| 106 |
+
|
| 107 |
+
# Maps the type of a SymPy expression to the MathTok token of the node built
# for it; ASTGenerator._visit looks entries up with ``type(expr)``.
_FUNC_MAP: dict[type, str] = {
    sin: "FUNC_SIN",
    cos: "FUNC_COS",
    tan: "FUNC_TAN",
    asin: "FUNC_ASIN",
    acos: "FUNC_ACOS",
    atan: "FUNC_ATAN",
    sinh: "FUNC_SINH",
    cosh: "FUNC_COSH",
    tanh: "FUNC_TANH",
    exp: "FUNC_EXP",
    log: "FUNC_LOG",
    # NOTE(review): sympy.sqrt appears to be a helper function rather than an
    # expression class (sqrt(x) evaluates to a Pow), so a ``type(expr)`` lookup
    # presumably never matches this key — confirm before relying on FUNC_SQRT.
    sqrt: "FUNC_SQRT",
    Abs: "OP_ABS",
    gamma: "FUNC_GAMMA",
    factorial: "FUNC_FACTORIAL",
    floor: "FUNC_FLOOR",
    ceiling: "FUNC_CEIL",
    re: "FUNC_RE",
    im: "FUNC_IM",
    # Calculus and big-operator constructs use the OP_ prefix.
    Derivative: "OP_DERIV",
    Integral: "OP_INT",
    Limit: "OP_LIMIT",
    Sum: "OP_SUM",
    Product: "OP_PROD",
}

# Same lookup scheme for the relational classes; consulted after _FUNC_MAP.
_REL_MAP: dict[type, str] = {
    Eq: "OP_EQ",
    Ne: "OP_NEQ",
    Lt: "OP_LT",
    Gt: "OP_GT",
    Le: "OP_LE",
    Ge: "OP_GE",
}
|
| 142 |
+
|
| 143 |
+
# Pre-defined variable tokens (name → token)
|
| 144 |
+
_VAR_MAP: dict[str, str] = {
|
| 145 |
+
"x": "VAR_X", "y": "VAR_Y", "z": "VAR_Z", "t": "VAR_T",
|
| 146 |
+
"n": "VAR_N", "k": "VAR_K", "a": "VAR_A", "b": "VAR_B",
|
| 147 |
+
"c": "VAR_C", "m": "VAR_M", "i": "VAR_I", "j": "VAR_J",
|
| 148 |
+
"r": "VAR_R", "s": "VAR_S", "u": "VAR_U", "v": "VAR_V",
|
| 149 |
+
"w": "VAR_W", "p": "VAR_P", "q": "VAR_Q", "l": "VAR_L",
|
| 150 |
+
"f": "VAR_F", "g": "VAR_G", "h": "VAR_H",
|
| 151 |
+
# Greek letters
|
| 152 |
+
"theta": "VAR_THETA", "alpha": "VAR_ALPHA",
|
| 153 |
+
"beta": "VAR_BETA", "gamma": "VAR_GAMMA_",
|
| 154 |
+
"delta": "VAR_DELTA", "epsilon": "VAR_EPSILON",
|
| 155 |
+
"zeta": "VAR_ZETA", "eta": "VAR_ETA",
|
| 156 |
+
"lambda": "VAR_LAMBDA", "mu": "VAR_MU",
|
| 157 |
+
"nu": "VAR_NU", "xi": "VAR_XI",
|
| 158 |
+
"rho": "VAR_RHO", "sigma": "VAR_SIGMA",
|
| 159 |
+
"tau": "VAR_TAU", "phi": "VAR_PHI",
|
| 160 |
+
"chi": "VAR_CHI", "psi": "VAR_PSI",
|
| 161 |
+
"omega": "VAR_OMEGA",
|
| 162 |
+
}
|
| 163 |
+
|
| 164 |
+
# Small integer dedicated tokens (covers the vast majority of constants)
|
| 165 |
+
_INT_TOKENS: dict[int, str] = {i: f"CONST_{i}" for i in range(-10, 101)}
|
| 166 |
+
|
| 167 |
+
|
| 168 |
+
# ── ASTGenerator ──────────────────────────────────────────────────────────
|
| 169 |
+
|
| 170 |
+
class ASTGenerator:
    """
    Build a typed ASTNode tree from a canonical SymPy expression.

    Node ids are handed out in preorder (a node before its children, children
    in the order they are visited) and restart from 0 on every ``generate``
    call. Subtrees at or below ``max_depth`` are replaced by a single
    ``SUBTREE_TRUNCATED`` node with confidence 0.0.

    Usage
    -----
    >>> gen = ASTGenerator()
    >>> import sympy as sp
    >>> ast = gen.generate(sp.parse_expr("x**2 + 2*x + 1"))
    >>> print(ast)
    OP_ADD(OP_POW(VAR_X, CONST_2), OP_MUL(CONST_2, VAR_X), CONST_1)

    NOTE(review): children follow SymPy's ``expr.args`` order, so the exact
    order printed above should be confirmed against the installed SymPy.
    """

    def __init__(self, max_depth: int = 20) -> None:
        self._counter: int = 0
        self.max_depth = max_depth

    def generate(self, expr: sp.Expr) -> ASTNode:
        """
        Return the root ASTNode of the tree built for ``expr``.

        Parameters
        ----------
        expr : sp.Expr
            Canonical SymPy expression (output of Canonicalizer).

        Returns
        -------
        ASTNode
            Root of the typed AST (depth 0, parent_id -1).
        """
        self._counter = 0
        root = self._visit(expr, depth=0, parent_id=-1)
        return root

    def get_all_tokens(self, root: ASTNode) -> list[str]:
        """Return every token of the tree in preorder."""
        tokens: list[str] = []
        self._collect_tokens(root, tokens)
        return tokens

    def get_variable_tokens(self, root: ASTNode) -> set[str]:
        """Return the distinct tokens of the tree that start with "VAR_"."""
        variables: set[str] = set()
        for tok in self.get_all_tokens(root):
            if tok.startswith("VAR_"):
                variables.add(tok)
        return variables

    def get_operator_tokens(self, root: ASTNode) -> set[str]:
        """Return the distinct "OP_" / "FUNC_" tokens of the tree, plus "FRAC"."""
        return {
            tok for tok in self.get_all_tokens(root)
            if tok == "FRAC" or tok.startswith(("OP_", "FUNC_"))
        }

    # ── Visitor dispatch ──────────────────────────────────────────────────

    def _next_id(self) -> int:
        """Hand out the next node id and advance the counter."""
        allocated = self._counter
        self._counter += 1
        return allocated

    def _visit_children(self, args, depth: int, parent_id: int) -> list[ASTNode]:
        """Visit ``args`` in order as children of the node ``parent_id`` at ``depth``."""
        return [self._visit(arg, depth + 1, parent_id) for arg in args]

    def _visit(self, expr: sp.Expr, depth: int, parent_id: int) -> ASTNode:
        """Build the ASTNode for ``expr``; the branches below are order-sensitive."""
        # The id is taken before any child is visited, which gives preorder ids.
        nid = self._next_id()

        def node(token, children=None, confidence=1.0):
            # Every node built for ``expr`` itself shares these coordinates.
            return ASTNode(token, expr,
                           children=[] if children is None else children,
                           depth=depth, node_id=nid, parent_id=parent_id,
                           confidence=confidence)

        if depth >= self.max_depth:
            return node("SUBTREE_TRUNCATED", confidence=0.0)

        # ── Special constants (identity checks first, then -oo by equality) ──
        singletons = (
            (sp.pi, "CONST_PI"),
            (sp.E, "CONST_E"),
            (sp.I, "CONST_I"),
            (sp.oo, "CONST_INF"),
            (sp.nan, "CONST_NAN"),
        )
        for singleton, token in singletons:
            if expr is singleton:
                return node(token)
        if expr == S.NegativeInfinity:
            return node("CONST_NEG_INF")

        # ── Integer: negatives become OP_NEG(<magnitude>) ─────────────────
        if isinstance(expr, Integer):
            value = int(expr)
            if value >= 0:
                return node(_INT_TOKENS.get(value, f"NUM_{value}"))
            magnitude = -value
            operand = ASTNode(_INT_TOKENS.get(magnitude, f"NUM_{magnitude}"), -expr,
                              depth=depth + 1, node_id=self._next_id(), parent_id=nid)
            return node("OP_NEG", [operand])

        # ── Rational (Integer was handled above): FRAC(p, q) ──────────────
        if isinstance(expr, Rational):
            parts = [self._visit(Integer(part), depth + 1, nid) for part in (expr.p, expr.q)]
            return node("FRAC", parts)

        # ── Float: the value is spelled into the token itself ─────────────
        if isinstance(expr, Float):
            text = str(float(expr))
            for raw, safe in ((".", "p"), ("-", "NEG")):
                text = text.replace(raw, safe)
            return node(f"FLOAT_{text}")

        # ── Symbol: dedicated token, else VAR_<UPPERCASED NAME> ───────────
        if isinstance(expr, Symbol):
            symbol_name = expr.name
            return node(_VAR_MAP.get(symbol_name, f"VAR_{symbol_name.upper()}"))

        # ── Add ───────────────────────────────────────────────────────────
        if isinstance(expr, Add):
            return node("OP_ADD", self._visit_children(expr.args, depth, nid))

        # ── Mul: exactly Mul(-1, x) is a unary negation ───────────────────
        if isinstance(expr, Mul):
            factors = expr.args
            is_negation = len(factors) == 2 and factors[0] == Integer(-1)
            if is_negation:
                return node("OP_NEG", [self._visit(factors[1], depth + 1, nid)])
            return node("OP_MUL", self._visit_children(factors, depth, nid))

        # ── Pow: x**-1 is a reciprocal and gets no exponent child ─────────
        if isinstance(expr, Pow):
            base_node = self._visit(expr.base, depth + 1, nid)
            if expr.exp == Integer(-1):
                return node("OP_RECIP", [base_node])
            return node("OP_POW", [base_node, self._visit(expr.exp, depth + 1, nid)])

        # ── Known functions, then relationals (exact type match) ──────────
        kind = type(expr)
        for table in (_FUNC_MAP, _REL_MAP):
            if kind in table:
                return node(table[kind], self._visit_children(expr.args, depth, nid))

        # ── Generic fallback: FUNC_<CLASSNAME> with reduced confidence ────
        fallback = f"FUNC_{kind.__name__.upper()}"
        logger.debug("Unknown SymPy type %s → fallback token %s", kind.__name__, fallback)
        children = self._visit_children(expr.args, depth, nid) if expr.args else []
        return node(fallback, children, confidence=0.5)

    # ── Utilities ─────────────────────────────────────────────────────────

    def _collect_tokens(self, node: ASTNode, result: list[str]) -> None:
        """Append the tokens of ``node``'s subtree to ``result`` in preorder."""
        pending = [node]
        while pending:
            current = pending.pop()
            result.append(current.token)
            # Push children right-to-left so the leftmost is popped next.
            pending.extend(reversed(current.children))
|
mathtok/canonicalizer.py
ADDED
|
@@ -0,0 +1,320 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Layer 1: Canonicalization Engine
|
| 3 |
+
|
| 4 |
+
Normalizes mathematically equivalent expressions so that structurally
|
| 5 |
+
similar inputs produce consistent token streams downstream.
|
| 6 |
+
|
| 7 |
+
Transformation pipeline
|
| 8 |
+
───────────────────────
|
| 9 |
+
1. Format detection — infer LaTeX vs ASCII from input heuristics
|
| 10 |
+
2. Parse — sympy.parsing.latex.parse_latex OR
|
| 11 |
+
sympy.parsing.sympy_parser.parse_expr
|
| 12 |
+
3. Expand — distribute products/powers over sums
|
| 13 |
+
4. Simplify — apply algebraic identities (optional)
|
| 14 |
+
5. Factor — factorise if requested (off by default)
|
| 15 |
+
6. Normalize sub/div — subtraction → Add(x, Mul(-1,y));
|
| 16 |
+
division → Mul(x, Pow(y,-1))
|
| 17 |
+
(SymPy does this automatically internally)
|
| 18 |
+
|
| 19 |
+
Example
|
| 20 |
+
-------
|
| 21 |
+
>>> c = Canonicalizer()
|
| 22 |
+
>>> r = c.canonicalize("b + a")
|
| 23 |
+
>>> print(r.canonical_str) # "a + b"
|
| 24 |
+
>>> c.are_equivalent("x^2 + 2*x + 1", "(x+1)^2") # True
|
| 25 |
+
"""
|
| 26 |
+
|
| 27 |
+
from __future__ import annotations
|
| 28 |
+
|
| 29 |
+
import logging
|
| 30 |
+
from dataclasses import dataclass, field
|
| 31 |
+
from typing import Optional
|
| 32 |
+
import concurrent.futures
|
| 33 |
+
|
| 34 |
+
import sympy as sp
|
| 35 |
+
from sympy.parsing.sympy_parser import (
|
| 36 |
+
parse_expr,
|
| 37 |
+
standard_transformations,
|
| 38 |
+
implicit_multiplication_application,
|
| 39 |
+
convert_xor,
|
| 40 |
+
)
|
| 41 |
+
|
| 42 |
+
logger = logging.getLogger(__name__)
|
| 43 |
+
|
| 44 |
+
# Augmented ASCII transformation set
|
| 45 |
+
_ASCII_TRANSFORMS = standard_transformations + (
|
| 46 |
+
implicit_multiplication_application,
|
| 47 |
+
convert_xor,
|
| 48 |
+
)
|
| 49 |
+
|
| 50 |
+
# LaTeX detection markers — presence of any of these implies LaTeX input
|
| 51 |
+
_LATEX_MARKERS = (
|
| 52 |
+
"\\frac", "\\sqrt", "\\int", "\\sum", "\\prod",
|
| 53 |
+
"\\sin", "\\cos", "\\tan", "\\log", "\\ln", "\\exp",
|
| 54 |
+
"\\lim", "\\cdot", "\\times", "\\infty",
|
| 55 |
+
"\\alpha","\\beta", "\\gamma", "\\delta", "\\theta",
|
| 56 |
+
"\\pi", "\\sigma","\\mu", "\\lambda","\\phi", "\\psi",
|
| 57 |
+
"\\leq", "\\geq", "\\neq", "\\in", "\\subset",
|
| 58 |
+
"{", # LaTeX grouping
|
| 59 |
+
)
|
| 60 |
+
|
| 61 |
+
# LaTeX math-mode delimiter pairs (opening, closing); "$$" is listed before "$" so it is tried first
|
| 62 |
+
_LATEX_DELIMITERS = [
|
| 63 |
+
("$$", "$$"),
|
| 64 |
+
("$", "$"),
|
| 65 |
+
("\\[", "\\]"),
|
| 66 |
+
("\\(", "\\)"),
|
| 67 |
+
]
|
| 68 |
+
|
| 69 |
+
# Local symbol dictionary for ASCII parser
|
| 70 |
+
_LOCAL_DICT: dict[str, object] = {
|
| 71 |
+
"x": sp.Symbol("x"), "y": sp.Symbol("y"), "z": sp.Symbol("z"),
|
| 72 |
+
"t": sp.Symbol("t"), "n": sp.Symbol("n"), "k": sp.Symbol("k"),
|
| 73 |
+
"a": sp.Symbol("a"), "b": sp.Symbol("b"), "c": sp.Symbol("c"),
|
| 74 |
+
"m": sp.Symbol("m"), "r": sp.Symbol("r"), "s": sp.Symbol("s"),
|
| 75 |
+
"u": sp.Symbol("u"), "v": sp.Symbol("v"), "w": sp.Symbol("w"),
|
| 76 |
+
"p": sp.Symbol("p"), "q": sp.Symbol("q"),
|
| 77 |
+
"e": sp.E,
|
| 78 |
+
"pi": sp.pi,
|
| 79 |
+
"i": sp.I,
|
| 80 |
+
}
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
# ── Result dataclass ───────────────────────────────────────────────────────
|
| 84 |
+
|
| 85 |
+
@dataclass
class CanonicalizationResult:
    """Everything produced by one canonicalization run of a single expression."""

    # Input string as it was handed to the pipeline.
    original: str
    # Resulting SymPy expression; a ``PARSE_ERROR`` placeholder symbol is
    # stored here by the canonicalizer when parsing failed.
    expr: sp.Expr
    # ``str()`` of ``expr`` (or the literal "PARSE_ERROR" on failure).
    canonical_str: str
    # Which parser produced ``expr``: 'latex' | 'ascii'
    input_format: str
    # Names of the pipeline steps that ran, in order.
    transformations_applied: list[str] = field(default_factory=list)
    # Human-readable notes about fallbacks, timeouts and failures.
    warnings: list[str] = field(default_factory=list)
    # False only when no parser could handle the input.
    success: bool = True

    def __repr__(self) -> str:
        # Compact summary: format, canonical string and success flag only.
        summary = ", ".join(
            (
                f"fmt={self.input_format!r}",
                f"canonical={self.canonical_str!r}",
                f"ok={self.success}",
            )
        )
        return f"CanonicalizationResult({summary})"
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
# ── Main class ────────────────────────────────────────────────────────────
|
| 106 |
+
|
| 107 |
+
class Canonicalizer:
    """
    Canonicalize mathematical expressions (LaTeX or ASCII) via SymPy.

    Parameters
    ----------
    do_simplify : bool
        Apply sympy.simplify(). Recommended ON (may be slow for complex exprs).
    do_expand : bool
        Apply sympy.expand() before simplify.
    do_factor : bool
        Apply sympy.factor() as the last step (after expand/simplify when
        those are enabled). Off by default.
    timeout_seconds : float
        Time budget passed to ``_safe_apply`` for each individual
        transformation (expand, simplify, factor).

    Notes
    -----
    Results are memoised per instance in a small LRU cache keyed by the
    stripped input string. A cache hit returns the stored
    ``CanonicalizationResult`` object itself, so callers should treat
    results as read-only.
    """

    def __init__(
        self,
        do_simplify: bool = True,
        do_expand: bool = True,
        do_factor: bool = False,
        timeout_seconds: float = 5.0,
    ) -> None:
        self.do_simplify = do_simplify
        self.do_expand = do_expand
        self.do_factor = do_factor
        self.timeout_seconds = timeout_seconds

        # LRU cache: dict insertion order doubles as recency order
        # (oldest first, most recently used last).
        self._cache: dict[str, CanonicalizationResult] = {}
        self._max_cache_size = 512

    # ── Public API ────────────────────────────────────────────────────────

    def canonicalize(self, expression: str) -> CanonicalizationResult:
        """
        Canonicalize a raw mathematical expression string with LRU caching.

        Parameters
        ----------
        expression : str
            Raw LaTeX or ASCII expression; surrounding whitespace is ignored.

        Returns
        -------
        CanonicalizationResult
            ``success`` is False when no parser could handle the input.
        """
        expression = expression.strip()

        if expression in self._cache:
            # Re-insert on every hit so the entry moves to the "most recently
            # used" end; without this the cache degrades to FIFO and evicts
            # hot entries.
            result = self._cache.pop(expression)
            self._cache[expression] = result
            return result

        result = self._canonicalize_impl(expression)

        # Cache management (a non-positive size disables caching entirely).
        if self._max_cache_size > 0:
            if len(self._cache) >= self._max_cache_size:
                # First key in iteration order is the least recently used.
                self._cache.pop(next(iter(self._cache)))
            self._cache[expression] = result

        return result

    def _canonicalize_impl(self, expression: str) -> CanonicalizationResult:
        """Run parse + normalization pipeline for one expression (no caching)."""
        fmt, expr, warnings = self._parse(expression)
        applied: list[str] = [f"parse_{fmt}"]

        if expr is None:
            # Neither parser succeeded: return a placeholder result.
            return CanonicalizationResult(
                original=expression,
                expr=sp.Symbol("PARSE_ERROR"),
                canonical_str="PARSE_ERROR",
                input_format=fmt,
                transformations_applied=applied,
                warnings=warnings,
                success=False,
            )

        # ── Normalization pipeline ────────────────────────────────────────
        # Each step falls back to the unchanged expression (plus a warning)
        # when it fails or exceeds the time budget.
        if self.do_expand:
            expr, applied, warnings = _safe_apply(
                sp.expand, expr, "expand", applied, warnings, self.timeout_seconds
            )

        if self.do_simplify:
            expr, applied, warnings = _safe_apply(
                sp.simplify, expr, "simplify", applied, warnings, self.timeout_seconds
            )

        if self.do_factor:
            expr, applied, warnings = _safe_apply(
                sp.factor, expr, "factor", applied, warnings, self.timeout_seconds
            )

        # Subtraction/division normalization is automatic in SymPy's
        # internal representation (Add/Mul/Pow nodes).
        applied.append("normalize_sub_div")

        return CanonicalizationResult(
            original=expression,
            expr=expr,
            canonical_str=str(expr),
            input_format=fmt,
            transformations_applied=applied,
            warnings=warnings,
            success=True,
        )

    def are_equivalent(self, expr_a: str, expr_b: str) -> bool:
        """
        Return True iff two expressions are mathematically equivalent.

        Both inputs are canonicalized, then their difference is simplified
        and compared against zero. Any parse failure or exception yields
        False rather than raising.

        Used for the Canonical Consistency Score (CCS) metric.
        """
        try:
            ra = self.canonicalize(expr_a)
            rb = self.canonicalize(expr_b)
            if not ra.success or not rb.success:
                return False
            diff = sp.simplify(ra.expr - rb.expr)
            return diff == 0
        except Exception as exc:
            logger.debug("are_equivalent failed: %s", exc)
            return False

    def batch_canonicalize(
        self, expressions: list[str]
    ) -> list[CanonicalizationResult]:
        """Canonicalize a list of expressions, preserving input order."""
        return [self.canonicalize(e) for e in expressions]

    # ── Parsing ───────────────────────────────────────────────────────────

    def _parse(
        self, expression: str
    ) -> tuple[str, Optional[sp.Expr], list[str]]:
        """
        Parse ``expression`` with the LaTeX parser and/or the ASCII parser.

        Returns ``(format, expr, warnings)``. ``format`` names the parser
        that succeeded; when both fail, ``expr`` is None and ``format`` is
        the originally detected format.
        """
        warnings: list[str] = []
        fmt = _detect_format(expression)
        cleaned = _strip_delimiters(expression)

        if fmt == "latex":
            expr = _parse_latex(cleaned, warnings)
            if expr is not None:
                return "latex", expr, warnings
            warnings.append("LaTeX parse failed — falling back to ASCII parser.")

        expr = _parse_ascii(cleaned, warnings)
        if expr is not None:
            return "ascii", expr, warnings

        return fmt, None, warnings
|
| 250 |
+
|
| 251 |
+
|
| 252 |
+
# ── Module-level helpers ───────────────────────────────────────────────────
|
| 253 |
+
|
| 254 |
+
def _detect_format(expression: str) -> str:
    """
    Guess whether ``expression`` is LaTeX or plain ASCII math.

    Returns ``"latex"`` when any entry of ``_LATEX_MARKERS`` occurs in the
    string, or when the stripped string opens with ``$``, ``\\(`` or
    ``\\[``; otherwise returns ``"ascii"``.
    """
    if any(marker in expression for marker in _LATEX_MARKERS):
        return "latex"

    # No command/brace marker found: fall back to math-mode openers.
    opens_math_mode = expression.strip().startswith(("$", "\\(", "\\["))
    return "latex" if opens_math_mode else "ascii"
|
| 263 |
+
|
| 264 |
+
|
| 265 |
+
def _strip_delimiters(expression: str) -> str:
    """
    Strip one layer of enclosing LaTeX math-mode delimiters, if present.

    The pairs in ``_LATEX_DELIMITERS`` are tried in order and the first
    pair that wraps a non-empty interior wins. Input without a matching
    pair is returned whitespace-stripped but otherwise unchanged.
    """
    trimmed = expression.strip()
    for opener, closer in _LATEX_DELIMITERS:
        # Require at least one character between the delimiters.
        if len(trimmed) <= len(opener) + len(closer):
            continue
        if trimmed.startswith(opener) and trimmed.endswith(closer):
            inner = trimmed[len(opener):len(trimmed) - len(closer)]
            return inner.strip()
    return trimmed
|
| 272 |
+
|
| 273 |
+
|
| 274 |
+
def _parse_latex(expression: str, warnings: list[str]) -> Optional[sp.Expr]:
|
| 275 |
+
try:
|
| 276 |
+
from sympy.parsing.latex import parse_latex # antlr4 required
|
| 277 |
+
return parse_latex(expression)
|
| 278 |
+
except ImportError:
|
| 279 |
+
warnings.append(
|
| 280 |
+
"sympy.parsing.latex unavailable (install antlr4-python3-runtime==4.11.1)."
|
| 281 |
+
)
|
| 282 |
+
return None
|
| 283 |
+
except Exception as exc:
|
| 284 |
+
warnings.append(f"LaTeX parse error: {exc}")
|
| 285 |
+
return None
|
| 286 |
+
|
| 287 |
+
|
| 288 |
+
def _parse_ascii(expression: str, warnings: list[str]) -> Optional[sp.Expr]:
    """
    Parse an ASCII math string with ``parse_expr``.

    Uses the module's symbol table (``_LOCAL_DICT``) and transformation set
    (``_ASCII_TRANSFORMS``). Returns the parsed expression, or None after
    appending an "ASCII parse error" message to ``warnings``.
    """
    # NOTE(review): SymPy documents that parse_expr evaluates its input via
    # eval(); confirm callers never pass untrusted strings here.
    try:
        parser_options = {
            "local_dict": _LOCAL_DICT,
            "transformations": _ASCII_TRANSFORMS,
        }
        return parse_expr(expression, **parser_options)
    except Exception as err:
        warnings.append(f"ASCII parse error: {err}")
    return None
|
| 298 |
+
|
| 299 |
+
|
| 300 |
+
def _safe_apply(
|
| 301 |
+
fn,
|
| 302 |
+
expr: sp.Expr,
|
| 303 |
+
name: str,
|
| 304 |
+
applied: list[str],
|
| 305 |
+
warnings: list[str],
|
| 306 |
+
timeout_seconds: float = 5.0,
|
| 307 |
+
) -> tuple[sp.Expr, list[str], list[str]]:
|
| 308 |
+
"""Apply a SymPy transformation safely, catching all exceptions and timing out."""
|
| 309 |
+
with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
|
| 310 |
+
future = executor.submit(fn, expr)
|
| 311 |
+
try:
|
| 312 |
+
result = future.result(timeout=timeout_seconds)
|
| 313 |
+
applied.append(name)
|
| 314 |
+
return result, applied, warnings
|
| 315 |
+
except concurrent.futures.TimeoutError:
|
| 316 |
+
warnings.append(f"{name} timed out after {timeout_seconds}s")
|
| 317 |
+
return expr, applied, warnings
|
| 318 |
+
except Exception as exc:
|
| 319 |
+
warnings.append(f"{name} failed: {exc}")
|
| 320 |
+
return expr, applied, warnings
|
mathtok/lexer.py
ADDED
|
@@ -0,0 +1,315 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Layer 2: Hybrid Mathematical Lexer
|
| 3 |
+
|
| 4 |
+
Splits mixed text+math input into alternating typed spans:
|
| 5 |
+
- TEXT spans → forwarded to the BPE text tokenizer
|
| 6 |
+
- MATH spans → forwarded to the canonicalization + AST pipeline
|
| 7 |
+
|
| 8 |
+
Detection strategy (two-stage)
|
| 9 |
+
───────────────────────────────
|
| 10 |
+
Stage 1 — LaTeX delimiter detection
|
| 11 |
+
$...$ $$...$$ \\(...\\) \\[...\\]
|
| 12 |
+
These are unambiguous; inner content is always MATH.
|
| 13 |
+
|
| 14 |
+
Stage 2 — ASCII math heuristic detection
|
| 15 |
+
Applied only to remaining TEXT spans.
|
| 16 |
+
Looks for patterns like: sin(x), x^2, a+b=c, 3*x+1
|
| 17 |
+
|
| 18 |
+
Outputs a flat ordered list of LexSpan objects.
|
| 19 |
+
Adjacent spans of the same type are merged before returning.
|
| 20 |
+
|
| 21 |
+
Example
|
| 22 |
+
───────
|
| 23 |
+
>>> lex = HybridLexer()
|
| 24 |
+
>>> lex.lex("The derivative of $\\\\sin(x^2)$ plus 3x")
|
| 25 |
+
[TEXT('The derivative of ', conf=1.00), MATH('\\\\sin(x^2)', conf=1.00), TEXT(' plus 3x', conf=1.00)]
|
| 26 |
+
"""
|
| 27 |
+
|
| 28 |
+
from __future__ import annotations
|
| 29 |
+
|
| 30 |
+
import re
|
| 31 |
+
from dataclasses import dataclass
|
| 32 |
+
from enum import Enum
|
| 33 |
+
from typing import Iterator
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
# ── Types ──────────────────────────────────────────────────────────────────
|
| 37 |
+
|
| 38 |
+
class SpanType(str, Enum):
    """Content kind of a lexer span: natural-language text or math."""
    # The ``str`` mixin makes members compare equal to their string values.
    TEXT = "TEXT"
    MATH = "MATH"
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
@dataclass
class LexSpan:
    """One contiguous piece of the input, tagged as TEXT or MATH."""

    # Whether this piece is natural-language text or math.
    span_type: SpanType
    # The characters carried by the span.
    content: str
    # Character offset in original string where the span begins.
    start: int
    # Character offset in original string where the span stops.
    end: int
    # Detection confidence, 0.0 to 1.0.
    confidence: float = 1.0

    def __repr__(self) -> str:
        # Show at most 50 characters of content on a single line.
        snippet = self.content[:50]
        snippet = snippet.replace("\n", " ")
        return "{}({!r}, conf={:.2f})".format(
            self.span_type.value, snippet, self.confidence
        )

    def __len__(self) -> int:
        # Length is measured on the stored content, not on end - start.
        return len(self.content)
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
# ── Compiled regex patterns ────────────────────────────────────────────────
|
| 61 |
+
|
| 62 |
+
# Stage 1 — LaTeX delimiters (ordered: longer/greedier patterns first)
|
| 63 |
+
_PAT_DISPLAY_DOLLAR = re.compile(r"\$\$(.+?)\$\$", re.DOTALL)
|
| 64 |
+
_PAT_INLINE_DOLLAR = re.compile(r"\$(.+?)\$", re.DOTALL)
|
| 65 |
+
_PAT_DISPLAY_BRACKET = re.compile(r"\\\[(.+?)\\\]", re.DOTALL)
|
| 66 |
+
_PAT_INLINE_PAREN = re.compile(r"\\\((.+?)\\\)", re.DOTALL)
|
| 67 |
+
|
| 68 |
+
_LATEX_PATTERNS = [
|
| 69 |
+
_PAT_DISPLAY_DOLLAR, # must come before inline dollar
|
| 70 |
+
_PAT_INLINE_DOLLAR,
|
| 71 |
+
_PAT_DISPLAY_BRACKET,
|
| 72 |
+
_PAT_INLINE_PAREN,
|
| 73 |
+
]
|
| 74 |
+
|
| 75 |
+
# Stage 2 — ASCII math heuristic sub-patterns
|
| 76 |
+
# Matches: function calls, exponentiation, arithmetic expressions
|
| 77 |
+
_ASCII_FUNC_CALL = re.compile(
|
| 78 |
+
r"\b(?:sin|cos|tan|asin|acos|atan|sinh|cosh|tanh|"
|
| 79 |
+
r"exp|log|ln|sqrt|cbrt|abs|floor|ceil|"
|
| 80 |
+
r"lim|sum|prod|int|diff|derivative|integral|limit|"
|
| 81 |
+
r"gamma|factorial)\s*\(",
|
| 82 |
+
re.IGNORECASE,
|
| 83 |
+
)
|
| 84 |
+
_ASCII_EXPONENT = re.compile(
|
| 85 |
+
r"[a-zA-Z_]\w*\s*(?:\^|\*\*)\s*[\w(]"
|
| 86 |
+
)
|
| 87 |
+
_ASCII_ARITH = re.compile(
|
| 88 |
+
r"(?<!\w)[-+]?\d+(?:\.\d+)?\s*[+\-*/]\s*[-+]?\d"
|
| 89 |
+
)
|
| 90 |
+
_ASCII_EQUATION = re.compile(
|
| 91 |
+
r"[a-zA-Z_]\w*\s*[+\-*/^=<>]\s*[a-zA-Z0-9_]"
|
| 92 |
+
)
|
| 93 |
+
_ASCII_FUNCTION_DEF = re.compile(
|
| 94 |
+
r"\b[a-zA-Z_]\w*\([a-zA-Z0-9_,\s]*\)\s*="
|
| 95 |
+
)
|
| 96 |
+
_ASCII_GREEK = re.compile(
|
| 97 |
+
r"\b(?:alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega)\b",
|
| 98 |
+
re.IGNORECASE
|
| 99 |
+
)
|
| 100 |
+
|
| 101 |
+
_ASCII_PATTERNS = [
|
| 102 |
+
_ASCII_FUNC_CALL, _ASCII_EXPONENT, _ASCII_ARITH, _ASCII_EQUATION,
|
| 103 |
+
_ASCII_FUNCTION_DEF, _ASCII_GREEK
|
| 104 |
+
]
|
| 105 |
+
|
| 106 |
+
# Characters that can appear in an ASCII math expression context
|
| 107 |
+
_MATH_CHARS = set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
|
| 108 |
+
"0123456789+-*/^=<>()[]{}.,_! \t")
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
# ── Main class ────────────────────────────────────────────────────────────
|
| 112 |
+
|
| 113 |
+
class HybridLexer:
    """
    Split mixed text+math input into LexSpan objects.

    Parameters
    ----------
    ascii_math_detection : bool
        Enable Stage-2 heuristic detection inside TEXT spans.
    min_math_len : int
        Minimum length (after stripping whitespace) a detected ASCII region
        must have to be emitted as MATH; shorter regions are emitted as
        TEXT. With the default of 3, a region such as "a+b" still qualifies.
    """

    def __init__(
        self,
        ascii_math_detection: bool = True,
        min_math_len: int = 3,
    ) -> None:
        self.ascii_math_detection = ascii_math_detection
        self.min_math_len = min_math_len

    # ── Public API ────────────────────────────────────────────────────────

    def lex(self, text: str) -> list[LexSpan]:
        """
        Lex a mixed text+math string into typed spans.

        Parameters
        ----------
        text : str
            Input string containing natural language and/or math.

        Returns
        -------
        list[LexSpan]
            Ordered list of TEXT and MATH spans (empty for empty input).
            Adjacent spans of the same type are merged.
        """
        if not text:
            return []

        spans = self._stage1_latex(text)

        if self.ascii_math_detection:
            # Only TEXT spans are re-examined; delimited MATH is final.
            refined: list[LexSpan] = []
            for span in spans:
                if span.span_type is SpanType.TEXT:
                    refined.extend(self._stage2_ascii(span))
                else:
                    refined.append(span)
            spans = refined

        return _merge_adjacent(spans)

    def iter_spans(self, text: str) -> Iterator[LexSpan]:
        """Iterate over lexed spans (the full lex runs on first iteration)."""
        yield from self.lex(text)

    def is_math_only(self, text: str) -> bool:
        """
        Return True if the entire string is a math expression.

        Whitespace-only spans are ignored. Empty or blank input contains no
        math at all and therefore yields False.
        """
        substantive = [s for s in self.lex(text) if s.content.strip()]
        # Guard against all() being vacuously true on an empty list.
        return bool(substantive) and all(
            s.span_type is SpanType.MATH for s in substantive
        )

    # ── Stage 1: LaTeX delimiter detection ───────────────────────────────

    def _stage1_latex(self, text: str) -> list[LexSpan]:
        """Find all LaTeX-delimited math regions, fill gaps with TEXT."""
        matches: list[tuple[int, int, str]] = []  # (start, end, inner_content)

        for pat in _LATEX_PATTERNS:
            for m in pat.finditer(text):
                s, e = m.start(), m.end()
                # Skip if overlapping with already found match; patterns
                # earlier in _LATEX_PATTERNS therefore take precedence.
                if any(not (e <= ms or s >= me) for ms, me, _ in matches):
                    continue
                matches.append((s, e, m.group(1)))  # group(1) = inner content

        matches.sort(key=lambda t: t[0])

        spans: list[LexSpan] = []
        cursor = 0
        for start, end, content in matches:
            if start > cursor:
                spans.append(LexSpan(SpanType.TEXT, text[cursor:start], cursor, start, confidence=1.0))
            # MATH content excludes the delimiters, but start/end offsets
            # cover the whole delimited region.
            spans.append(LexSpan(SpanType.MATH, content.strip(), start, end, confidence=1.0))
            cursor = end

        if cursor < len(text):
            spans.append(LexSpan(SpanType.TEXT, text[cursor:], cursor, len(text), confidence=1.0))

        return spans or [LexSpan(SpanType.TEXT, text, 0, len(text), confidence=1.0)]

    # ── Stage 2: ASCII math detection ────────────────────────────────────

    def _stage2_ascii(self, text_span: LexSpan) -> list[LexSpan]:
        """
        Within a TEXT span, identify and extract ASCII math regions.

        Every match of an ``_ASCII_PATTERNS`` regex is used as a seed,
        expanded with ``_expand_region`` and merged with overlapping seeds.
        Returns the input span unchanged when nothing matches.
        """
        text = text_span.content
        base = text_span.start  # offset of this span in the original string

        math_ranges: list[tuple[int, int]] = []
        for pat in _ASCII_PATTERNS:
            for m in pat.finditer(text):
                s, e = m.start(), m.end()
                s, e = self._expand_region(text, s, e)
                math_ranges.append((s, e))

        if not math_ranges:
            return [text_span]

        math_ranges = _merge_ranges(math_ranges)

        spans: list[LexSpan] = []
        cursor = 0
        for s, e in math_ranges:
            if s > cursor:
                spans.append(LexSpan(SpanType.TEXT, text[cursor:s], base + cursor, base + s, confidence=1.0))
            content = text[s:e].strip()

            # Simple heuristic confidence based on length: 0.5 plus 0.05 per
            # character, capped at 0.95. Short strings are less likely to be
            # purely math (e.g., variable names vs full equations).
            conf = min(0.95, max(0.5, 0.5 + 0.05 * len(content)))

            # Regions shorter than min_math_len are demoted back to TEXT.
            span_type = SpanType.MATH if len(content) >= self.min_math_len else SpanType.TEXT
            # The span keeps the unstripped slice so offsets stay contiguous.
            spans.append(LexSpan(span_type, text[s:e], base + s, base + e, confidence=conf if span_type == SpanType.MATH else 1.0))
            cursor = e

        if cursor < len(text):
            spans.append(LexSpan(SpanType.TEXT, text[cursor:], base + cursor, base + len(text), confidence=1.0))

        return spans

    def _expand_region(self, text: str, start: int, end: int) -> tuple[int, int]:
        """
        Expand a detected math seed region to capture surrounding balanced
        parentheses and chained operators.

        Returns the widened ``(start, end)`` pair as offsets into ``text``.
        """
        # Expand backwards: include leading sign characters, digits and
        # spaces; anything else (including an opening parenthesis) stops it.
        while start > 0 and text[start - 1] in "-+0123456789 \t":
            start -= 1

        # Expand forwards: follow balanced parens and math characters
        depth = 0
        i = end
        while i < len(text):
            ch = text[i]
            if ch in "([{":
                depth += 1
                i += 1
            elif ch in ")]}":
                if depth == 0:
                    # Closer that was never opened inside the region.
                    break
                depth -= 1
                i += 1
            elif ch in " \t" and depth == 0:
                # Stop at word boundary outside parens
                # — but keep going if next char is still math-ish
                if i + 1 < len(text) and text[i + 1] in "+-*/^=<>)":
                    i += 1
                else:
                    break
            elif ch in _MATH_CHARS:
                i += 1
            else:
                break

        return start, i
|
| 280 |
+
|
| 281 |
+
|
| 282 |
+
# ── Module helpers ────────────────────────────────────────────────────────
|
| 283 |
+
|
| 284 |
+
def _merge_ranges(ranges: list[tuple[int, int]]) -> list[tuple[int, int]]:
|
| 285 |
+
"""Merge overlapping (start, end) integer ranges."""
|
| 286 |
+
if not ranges:
|
| 287 |
+
return []
|
| 288 |
+
ranges = sorted(ranges)
|
| 289 |
+
merged = [list(ranges[0])]
|
| 290 |
+
for s, e in ranges[1:]:
|
| 291 |
+
if s <= merged[-1][1]:
|
| 292 |
+
merged[-1][1] = max(merged[-1][1], e)
|
| 293 |
+
else:
|
| 294 |
+
merged.append([s, e])
|
| 295 |
+
return [tuple(r) for r in merged]
|
| 296 |
+
|
| 297 |
+
|
| 298 |
+
def _merge_adjacent(spans: list[LexSpan]) -> list[LexSpan]:
|
| 299 |
+
"""Merge adjacent spans of the same type."""
|
| 300 |
+
if not spans:
|
| 301 |
+
return []
|
| 302 |
+
merged = [spans[0]]
|
| 303 |
+
for span in spans[1:]:
|
| 304 |
+
prev = merged[-1]
|
| 305 |
+
if span.span_type is prev.span_type:
|
| 306 |
+
merged[-1] = LexSpan(
|
| 307 |
+
prev.span_type,
|
| 308 |
+
prev.content + span.content,
|
| 309 |
+
prev.start,
|
| 310 |
+
span.end,
|
| 311 |
+
confidence=max(prev.confidence, span.confidence)
|
| 312 |
+
)
|
| 313 |
+
else:
|
| 314 |
+
merged.append(span)
|
| 315 |
+
return merged
|
mathtok/metadata.py
ADDED
|
@@ -0,0 +1,307 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Layer 6: Structural Attention Metadata Generator
|
| 3 |
+
|
| 4 |
+
For every token in the serialized stream, generate a rich metadata
|
| 5 |
+
record capturing its full tree context. This metadata is the primary
|
| 6 |
+
research contribution of MathTok — it enables structure-aware attention
|
| 7 |
+
in downstream transformer models without architectural changes.
|
| 8 |
+
|
| 9 |
+
Metadata fields per token
|
| 10 |
+
─────────────────────────
|
| 11 |
+
position : flat index in sequence
|
| 12 |
+
token : token string
|
| 13 |
+
token_id : vocabulary ID (filled if vocab is provided)
|
| 14 |
+
node_id : AST node ID
|
| 15 |
+
parent_id : parent node ID (-1 = root)
|
| 16 |
+
children_ids : list of direct child node IDs
|
| 17 |
+
depth : tree depth (root = 0)
|
| 18 |
+
child_index : index among siblings
|
| 19 |
+
subtree_size : total nodes in subtree
|
| 20 |
+
is_leaf : terminal node flag
|
| 21 |
+
num_children : number of direct children
|
| 22 |
+
token_category : 'operator' | 'function' | 'variable' | 'constant'
|
| 23 |
+
| 'structural' | 'boundary' | 'text'
|
| 24 |
+
tree_position_key: dot-notation path from root, e.g. "0.1.2"
|
| 25 |
+
sibling_count : total number of siblings (including self)
|
| 26 |
+
|
| 27 |
+
Attention mask helpers
|
| 28 |
+
──────────────────────
|
| 29 |
+
to_attention_mask_hints() returns binary NxN matrices for:
|
| 30 |
+
parent_mask — attend to parent
|
| 31 |
+
children_mask — attend to children
|
| 32 |
+
sibling_mask — attend to siblings
|
| 33 |
+
subtree_mask — attend within own subtree
|
| 34 |
+
"""
|
| 35 |
+
|
| 36 |
+
from __future__ import annotations
|
| 37 |
+
|
| 38 |
+
from collections import defaultdict
|
| 39 |
+
from dataclasses import dataclass, asdict
|
| 40 |
+
from typing import Optional
|
| 41 |
+
|
| 42 |
+
from .serializer import SerializedToken
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
# ── Token classification ───────────────────────────────────────────────────
|
| 46 |
+
|
| 47 |
+
_BOUNDARY_TOKENS = {
|
| 48 |
+
"[MATH_START]", "[MATH_END]",
|
| 49 |
+
"[TEXT_START]", "[TEXT_END]",
|
| 50 |
+
"[BOS]", "[EOS]", "[PAD]", "[UNK]",
|
| 51 |
+
"[SCOPE_OPEN]", "[SCOPE_CLOSE]",
|
| 52 |
+
}
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def _classify(token: str) -> str:
    """
    Map a token string to its category name.

    Returns one of 'boundary', 'operator', 'function', 'variable',
    'constant', 'structural' or 'text'. Tokens matching no known set or
    prefix are classified as 'text'.
    """
    if token in _BOUNDARY_TOKENS:
        return "boundary"
    if token == "FRAC" or token.startswith("OP_"):
        return "operator"

    # Remaining categories are recognised purely by prefix.
    prefix_categories = (
        (("FUNC_",), "function"),
        (("VAR_",), "variable"),
        (("CONST_", "NUM_", "FLOAT_"), "constant"),
        (("SUBTREE_REF_",), "structural"),
    )
    for prefixes, category in prefix_categories:
        if token.startswith(prefixes):
            return category

    return "structural" if token == "SUBTREE_TRUNCATED" else "text"
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
# ── Metadata dataclass ────────────────────────────────────────────────────
|
| 73 |
+
|
| 74 |
+
@dataclass
class TokenMetadata:
    """
    Structural description of a single position in the token stream.

    Bundles the token's identity with its place in the expression tree so
    that structure-aware attention, tree positional encodings or
    graph-based models can be built on top of the flat sequence.
    """
    # ── Identity ─────────────────────────────────────────────────────────
    position: int            # flat index in sequence
    token: str               # token string
    token_id: int            # -1 if vocab not provided

    # ── Tree structure ────────────────────────────────────────────────────
    node_id: int             # AST node ID
    parent_id: int           # parent node ID (-1 = root)
    parent_token: str        # token string of the parent node
    children_ids: list[int]  # direct child node IDs
    depth: int               # tree depth (root = 0)
    child_index: int         # index among siblings

    # ── Subtree info ──────────────────────────────────────────────────────
    subtree_size: int        # total nodes in subtree
    is_leaf: bool            # terminal node flag
    num_children: int        # number of direct children

    # ── Semantic category ─────────────────────────────────────────────────
    # operator | function | variable | constant | structural | boundary | text
    token_category: str

    # ── Positional hints ──────────────────────────────────────────────────
    tree_position_key: str   # e.g. "0.1.2" = root→child[1]→child[2]
    sibling_count: int       # total number of siblings (including self)

    def to_dict(self) -> dict:
        """Return all fields as a plain dictionary (via ``dataclasses.asdict``)."""
        record = asdict(self)
        return record

    def __repr__(self) -> str:
        # Short form: position, token, depth and category only.
        parts = (
            f"pos={self.position}",
            f"token={self.token!r}",
            f"depth={self.depth}",
            f"cat={self.token_category!r}",
        )
        return "TokenMetadata(" + ", ".join(parts) + ")"
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
# ── Generator ─────────────────────────────────────────────────────────────
|
| 119 |
+
|
| 120 |
+
class MetadataGenerator:
    """
    Generate structural metadata for a serialized token stream.

    Usage
    -----
    >>> gen = MetadataGenerator()
    >>> meta = gen.generate(serialized_tokens, vocab={"OP_ADD": 8, ...})
    >>> for m in meta:
    ...     print(m.tree_position_key, m.token_category)
    """

    def generate(
        self,
        tokens: list[SerializedToken],
        vocab: Optional[dict[str, int]] = None,
    ) -> list[TokenMetadata]:
        """
        Generate TokenMetadata for every token in the stream.

        Parameters
        ----------
        tokens : list[SerializedToken]
            Output of StructuralSerializer.serialize().
        vocab : dict[str, int] | None
            Optional vocabulary mapping token → ID. Tokens missing from
            it (or every token when it is omitted) get ``token_id == -1``.

        Returns
        -------
        list[TokenMetadata]
            One record per input token, in input order.
        """
        vocab = vocab or {}

        # Structural lookup tables: node id → token text, parent id → child ids.
        node_to_token: dict[int, str] = {}
        parent_to_children: dict[int, list[int]] = defaultdict(list)

        for st in tokens:
            if st.node_id >= 0:
                node_to_token[st.node_id] = st.token
            if st.parent_id >= 0:
                parent_to_children[st.parent_id].append(st.node_id)

        position_keys = self._build_position_keys(tokens)

        result: list[TokenMetadata] = []
        for pos, st in enumerate(tokens):
            # .get() (not indexing) so the defaultdict is not grown by lookups.
            children_ids = parent_to_children.get(st.node_id, [])
            # Everything registered under this token's parent; the token
            # itself is part of that list, so sibling_count includes it.
            siblings = parent_to_children.get(st.parent_id, []) if st.parent_id >= 0 else []

            meta = TokenMetadata(
                position=pos,
                token=st.token,
                token_id=vocab.get(st.token, -1),
                node_id=st.node_id,
                parent_id=st.parent_id,
                parent_token=node_to_token.get(st.parent_id, ""),
                children_ids=list(children_ids),  # copy: records must not share lists
                depth=max(st.depth, 0),  # negative depths are clamped to 0
                child_index=st.child_index,
                subtree_size=st.subtree_size,
                is_leaf=st.is_leaf,
                num_children=st.num_children,
                token_category=_classify(st.token),
                # Tokens whose node never received a path key fall back to "root".
                tree_position_key=position_keys.get(st.node_id, "root"),
                sibling_count=len(siblings),
            )
            result.append(meta)

        return result

    def to_attention_mask_hints(
        self,
        metadata: list[TokenMetadata],
    ) -> dict[str, list[list[int]]]:
        """
        Generate NxN binary attention mask hints from metadata.

        Row ``i`` of every mask is addressed by ``metadata[k].position``, so
        the positions are expected to lie in ``range(len(metadata))``.

        Returns
        -------
        dict with keys:
            'parent_mask'   : token i can attend to its parent
            'children_mask' : token i can attend to all its children
            'sibling_mask'  : token i can attend to its siblings
            'subtree_mask'  : token i can attend to all nodes in its subtree
                              (the token itself included)

        Each mask value is a list-of-lists of 0/1 integers (N x N).
        """
        n = len(metadata)
        node_to_pos: dict[int, int] = {m.node_id: m.position for m in metadata if m.node_id >= 0}

        parent_mask = [[0] * n for _ in range(n)]
        children_mask = [[0] * n for _ in range(n)]
        sibling_mask = [[0] * n for _ in range(n)]
        subtree_mask = [[0] * n for _ in range(n)]

        # node_id → set of positions covered by that node's subtree.
        subtree_members = self._build_subtree_members(metadata, node_to_pos)

        # Group positions by parent once, instead of rescanning the whole
        # metadata list for every token when filling the sibling mask.
        positions_by_parent: dict[int, list[int]] = defaultdict(list)
        for m in metadata:
            if m.parent_id >= 0:
                positions_by_parent[m.parent_id].append(m.position)

        for m in metadata:
            i = m.position

            # Parent
            if m.parent_id >= 0 and m.parent_id in node_to_pos:
                parent_mask[i][node_to_pos[m.parent_id]] = 1

            # Children
            for child_id in m.children_ids:
                if child_id in node_to_pos:
                    children_mask[i][node_to_pos[child_id]] = 1

            # Siblings (same parent, different position)
            if m.parent_id >= 0:
                for j in positions_by_parent.get(m.parent_id, ()):
                    if j != i:
                        sibling_mask[i][j] = 1

            # Subtree
            for desc_pos in subtree_members.get(m.node_id, ()):
                subtree_mask[i][desc_pos] = 1

        return {
            "parent_mask": parent_mask,
            "children_mask": children_mask,
            "sibling_mask": sibling_mask,
            "subtree_mask": subtree_mask,
        }

    # ── Private helpers ───────────────────────────────────────────────────

    def _build_position_keys(self, tokens: list[SerializedToken]) -> dict[int, str]:
        """
        Build a dot-separated path string for every node.

        The root gets key "0"; each child appends ".{child_index}".
        Nodes that cannot be connected to a root receive no key.
        """
        keys: dict[int, str] = {}

        # Root node(s): parent_id == -1 and a real (non-negative) node id.
        for st in tokens:
            if st.parent_id == -1 and st.node_id >= 0:
                keys[st.node_id] = "0"

        # Fixed-point propagation: repeat until a full pass assigns nothing,
        # so the result does not depend on parents preceding their children.
        changed = True
        while changed:
            changed = False
            for st in tokens:
                if st.node_id not in keys and st.parent_id in keys:
                    keys[st.node_id] = f"{keys[st.parent_id]}.{st.child_index}"
                    changed = True

        return keys

    def _build_subtree_members(
        self,
        metadata: list[TokenMetadata],
        node_to_pos: dict[int, int],
    ) -> dict[int, set[int]]:
        """
        For each node, compute the set of *positions* covered by its subtree:
        the node's own position (when known) plus those of all descendants.
        Used for building the subtree attention mask.

        The traversal is an explicit-stack post-order walk rather than a
        recursive one, so very deep trees do not hit the interpreter's
        recursion limit, and malformed (cyclic) parent links terminate.
        """
        # Build parent→children mapping
        children_of: dict[int, list[int]] = defaultdict(list)
        for m in metadata:
            if m.parent_id >= 0:
                children_of[m.parent_id].append(m.node_id)

        subtree: dict[int, set[int]] = {}
        in_progress: set[int] = set()  # nodes expanded but not yet finalized

        for m in metadata:
            if m.node_id < 0 or m.node_id in subtree:
                continue

            # Stack entries are (node_id, children_done).
            stack: list[tuple[int, bool]] = [(m.node_id, False)]
            while stack:
                node_id, children_done = stack.pop()
                if node_id in subtree:
                    continue

                if not children_done:
                    if node_id in in_progress:
                        # Reached again through a cycle; do not re-expand.
                        continue
                    in_progress.add(node_id)
                    # Revisit this node after all of its children are done.
                    stack.append((node_id, True))
                    stack.extend(
                        (child_id, False)
                        for child_id in children_of.get(node_id, ())
                        if child_id not in subtree and child_id not in in_progress
                    )
                    continue

                members: set[int] = set()
                if node_id in node_to_pos:
                    members.add(node_to_pos[node_id])
                for child_id in children_of.get(node_id, ()):
                    # A child is only missing here when a cycle was cut.
                    members.update(subtree.get(child_id, ()))
                subtree[node_id] = members
                in_progress.discard(node_id)

        return subtree
|
mathtok/operator_registry.py
ADDED
|
@@ -0,0 +1,429 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Layer 4: Operator-Aware Semantic Registry
|
| 3 |
+
|
| 4 |
+
Every mathematical operator and function is assigned a rich metadata
|
| 5 |
+
record that captures its semantic role in mathematical computation.
|
| 6 |
+
This registry is the backbone of the structural token vocabulary.
|
| 7 |
+
|
| 8 |
+
Each OperatorMeta record encodes:
|
| 9 |
+
- token : unique string identifier in the MathTok vocabulary
|
| 10 |
+
- sympy_type : corresponding SymPy internal class name
|
| 11 |
+
- arity : number of operands (-1 = variadic)
|
| 12 |
+
- precedence : parsing binding strength (higher = tighter)
|
| 13 |
+
- associativity: 'left' | 'right' | 'none'
|
| 14 |
+
- semantic_role: high-level mathematical interpretation
|
| 15 |
+
- latex_repr : canonical LaTeX representation
|
| 16 |
+
- ascii_repr : ASCII fallback representation
|
| 17 |
+
- category : broad grouping for analysis
|
| 18 |
+
"""
|
| 19 |
+
|
| 20 |
+
from __future__ import annotations
|
| 21 |
+
from dataclasses import dataclass
|
| 22 |
+
from typing import List, Optional
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
# ── Data Model ────────────────────────────────────────────────────────────
|
| 26 |
+
|
| 27 |
+
@dataclass(frozen=True)
class OperatorMeta:
    """Frozen record describing the semantics of one MathTok operator token."""

    token: str
    sympy_type: str
    arity: int  # -1 = variadic
    precedence: int  # 0 = lowest binding
    associativity: str  # 'left' | 'right' | 'none'
    semantic_role: str
    latex_repr: str
    ascii_repr: str
    # 'arithmetic' | 'relational' | 'calculus' | 'function' | 'structural'
    # | 'logic' | 'set' | 'geometry' | 'statistics'
    category: str
    is_commutative: bool = False

    def to_dict(self) -> dict:
        """Return the record as a plain dict, keys in field-declaration order."""
        exported = (
            "token",
            "sympy_type",
            "arity",
            "precedence",
            "associativity",
            "semantic_role",
            "latex_repr",
            "ascii_repr",
            "category",
            "is_commutative",
        )
        return {name: getattr(self, name) for name in exported}
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
# ── Registry ──────────────────────────────────────────────────────────────
|
| 57 |
+
|
| 58 |
+
# Token → OperatorMeta. Each record below is given positionally as
#   (token, sympy_type, arity, precedence, associativity,
#    semantic_role, latex_repr, ascii_repr, category[, is_commutative])
# and is registered under its own ``token`` string, in the order listed.
OPERATOR_REGISTRY: dict[str, OperatorMeta] = {
    meta.token: meta
    for meta in (
        # ── Arithmetic ──────────────────────────────────────────────────
        OperatorMeta("OP_ADD", "Add", -1, 1, "left", "aggregation", "+", "+", "arithmetic", True),
        OperatorMeta("OP_MUL", "Mul", -1, 2, "left", "scaling", "\\cdot", "*", "arithmetic", True),
        OperatorMeta("OP_POW", "Pow", 2, 4, "right", "recursive_growth", "^", "**", "arithmetic"),
        # -x == Mul(-1, x) in SymPy
        OperatorMeta("OP_NEG", "Mul", 1, 3, "none", "negation", "-", "-", "arithmetic"),
        # x^{-1}
        OperatorMeta("OP_RECIP", "Pow", 1, 3, "none", "reciprocal", "^{-1}", "**(-1)", "arithmetic"),
        OperatorMeta("OP_ABS", "Abs", 1, 5, "none", "magnitude", "|\\cdot|", "abs", "arithmetic"),
        OperatorMeta("FRAC", "Rational", 2, 3, "none", "ratio", "\\frac", "/", "structural"),

        # ── Relational ──────────────────────────────────────────────────
        OperatorMeta("OP_EQ", "Eq", 2, 0, "none", "equality", "=", "==", "relational", True),
        OperatorMeta("OP_NEQ", "Ne", 2, 0, "none", "inequality", "\\neq", "!=", "relational", True),
        OperatorMeta("OP_LT", "StrictLessThan", 2, 0, "none", "strict_ordering", "<", "<", "relational"),
        OperatorMeta("OP_GT", "StrictGreaterThan", 2, 0, "none", "strict_ordering", ">", ">", "relational"),
        OperatorMeta("OP_LE", "LessThan", 2, 0, "none", "ordering", "\\leq", "<=", "relational"),
        OperatorMeta("OP_GE", "GreaterThan", 2, 0, "none", "ordering", "\\geq", ">=", "relational"),

        # ── Calculus ────────────────────────────────────────────────────
        OperatorMeta("OP_DERIV", "Derivative", 2, 5, "none", "local_change", "\\frac{d}{dx}", "diff", "calculus"),
        OperatorMeta("OP_INT", "Integral", 2, 0, "none", "accumulation", "\\int", "integrate", "calculus"),
        OperatorMeta("OP_LIMIT", "Limit", 3, 0, "none", "asymptotic_behavior", "\\lim", "limit", "calculus"),
        OperatorMeta("OP_SUM", "Sum", 2, 0, "none", "discrete_accumulation", "\\sum", "Sum", "calculus"),
        OperatorMeta("OP_PROD", "Product", 2, 0, "none", "discrete_scaling", "\\prod", "Product", "calculus"),

        # ── Trigonometric Functions ─────────────────────────────────────
        OperatorMeta("FUNC_SIN", "sin", 1, 5, "none", "periodic_oscillation", "\\sin", "sin", "function"),
        OperatorMeta("FUNC_COS", "cos", 1, 5, "none", "periodic_oscillation", "\\cos", "cos", "function"),
        OperatorMeta("FUNC_TAN", "tan", 1, 5, "none", "periodic_ratio", "\\tan", "tan", "function"),
        OperatorMeta("FUNC_ASIN", "asin", 1, 5, "none", "inverse_periodic", "\\arcsin", "asin", "function"),
        OperatorMeta("FUNC_ACOS", "acos", 1, 5, "none", "inverse_periodic", "\\arccos", "acos", "function"),
        OperatorMeta("FUNC_ATAN", "atan", 1, 5, "none", "inverse_periodic", "\\arctan", "atan", "function"),
        OperatorMeta("FUNC_SINH", "sinh", 1, 5, "none", "hyperbolic_oscillation", "\\sinh", "sinh", "function"),
        OperatorMeta("FUNC_COSH", "cosh", 1, 5, "none", "hyperbolic_oscillation", "\\cosh", "cosh", "function"),
        OperatorMeta("FUNC_TANH", "tanh", 1, 5, "none", "hyperbolic_ratio", "\\tanh", "tanh", "function"),

        # ── Exponential / Logarithmic ───────────────────────────────────
        OperatorMeta("FUNC_EXP", "exp", 1, 5, "none", "exponential_growth", "e^", "exp", "function"),
        OperatorMeta("FUNC_LOG", "log", 1, 5, "none", "logarithmic_compression", "\\ln", "log", "function"),
        OperatorMeta("FUNC_LOG10", "log", 1, 5, "none", "logarithmic_compression", "\\log_{10}", "log10", "function"),
        OperatorMeta("FUNC_SQRT", "sqrt", 1, 5, "none", "root_extraction", "\\sqrt", "sqrt", "function"),
        OperatorMeta("FUNC_CBRT", "cbrt", 1, 5, "none", "root_extraction", "\\sqrt[3]", "cbrt", "function"),

        # ── Special Functions ───────────────────────────────────────────
        OperatorMeta("FUNC_GAMMA", "gamma", 1, 5, "none", "factorial_extension", "\\Gamma", "gamma", "function"),
        OperatorMeta("FUNC_FACTORIAL", "factorial", 1, 6, "none", "combinatorial_growth", "!", "factorial", "function"),
        OperatorMeta("FUNC_FLOOR", "floor", 1, 5, "none", "integer_rounding_down", "\\lfloor\\rfloor", "floor", "function"),
        OperatorMeta("FUNC_CEIL", "ceiling", 1, 5, "none", "integer_rounding_up", "\\lceil\\rceil", "ceil", "function"),
        OperatorMeta("FUNC_RE", "re", 1, 5, "none", "real_part", "\\Re", "re", "function"),
        OperatorMeta("FUNC_IM", "im", 1, 5, "none", "imaginary_part", "\\Im", "im", "function"),

        # ── Logic ───────────────────────────────────────────────────────
        OperatorMeta("OP_AND", "And", -1, 1, "left", "logical_conjunction", "\\land", "and", "logic", True),
        OperatorMeta("OP_OR", "Or", -1, 1, "left", "logical_disjunction", "\\lor", "or", "logic", True),
        OperatorMeta("OP_NOT", "Not", 1, 5, "none", "logical_negation", "\\lnot", "not", "logic"),
        OperatorMeta("OP_IMPLIES", "Implies", 2, 0, "none", "logical_implication", "\\implies", "=>", "logic"),

        # ── Set Theory ──────────────────────────────────────────────────
        OperatorMeta("OP_UNION", "Union", -1, 2, "left", "set_union", "\\cup", "U", "set", True),
        OperatorMeta("OP_INTERSECT", "Intersection", -1, 2, "left", "set_intersection", "\\cap", "intersect", "set", True),
        OperatorMeta("OP_IN", "Contains", 2, 0, "none", "set_membership", "\\in", "in", "set"),
        OperatorMeta("OP_SUBSET", "Subset", 2, 0, "none", "subset", "\\subset", "subset", "set"),

        # ── Geometry ────────────────────────────────────────────────────
        OperatorMeta("OP_ANGLE", "Angle", 1, 5, "none", "geometric_angle", "\\angle", "angle", "geometry"),
        OperatorMeta("OP_PARALLEL", "Parallel", 2, 0, "none", "geometric_parallel", "\\parallel", "||", "geometry", True),
        OperatorMeta("OP_PERP", "Perpendicular", 2, 0, "none", "geometric_perpendicular", "\\perp", "perp", "geometry", True),

        # ── Statistics ──────────────────────────────────────────────────
        OperatorMeta("FUNC_MEAN", "Mean", -1, 5, "none", "statistical_mean", "\\mu", "mean", "statistics"),
        OperatorMeta("FUNC_STDEV", "StdDev", -1, 5, "none", "statistical_deviation", "\\sigma", "stdev", "statistics"),
        OperatorMeta("FUNC_VAR", "Variance", -1, 5, "none", "statistical_variance", "\\sigma^2", "var", "statistics"),
    )
}
|
| 392 |
+
|
| 393 |
+
# Symmetric inverse table: every (forward, inverse) pair below is registered
# in both directions, forward entry first.
INVERSE_PAIRS: dict[str, str] = {
    src: dst
    for forward, inverse in (
        ("FUNC_SIN", "FUNC_ASIN"),
        ("FUNC_COS", "FUNC_ACOS"),
        ("FUNC_TAN", "FUNC_ATAN"),
        ("FUNC_EXP", "FUNC_LOG"),
        ("OP_ADD", "OP_NEG"),
    )
    for src, dst in ((forward, inverse), (inverse, forward))
}
|
| 400 |
+
|
| 401 |
+
# ── Derived Lookups ────────────────────────────────────────────────────────
|
| 402 |
+
|
| 403 |
+
# sympy class name → list of tokens (several tokens may share one SymPy
# type, e.g. both FUNC_LOG and FUNC_LOG10 map to "log")
SYMPY_TYPE_TO_TOKENS: dict[str, list[str]] = {}

# Category → tokens, in registry order. The known categories are seeded in
# a fixed order (a tuple, not a set) so the key order is identical on every
# run regardless of string-hash randomization, and each of them is present
# even if it ends up empty. Any category used by the registry but missing
# from the seed is appended rather than silently dropped.
OPERATOR_CATEGORIES: dict[str, list[str]] = {
    cat: []
    for cat in (
        "arithmetic",
        "relational",
        "calculus",
        "function",
        "structural",
        "logic",
        "set",
        "geometry",
        "statistics",
    )
}

# Single pass over the registry fills both lookups.
for _tok, _meta in OPERATOR_REGISTRY.items():
    SYMPY_TYPE_TO_TOKENS.setdefault(_meta.sympy_type, []).append(_tok)
    OPERATOR_CATEGORIES.setdefault(_meta.category, []).append(_tok)
|
| 413 |
+
|
| 414 |
+
|
| 415 |
+
# ── Public Helpers ─────────────────────────────────────────────────────────
|
| 416 |
+
|
| 417 |
+
def get_operator(token: str) -> Optional[OperatorMeta]:
    """Look up the registry record for *token*; ``None`` if it is not registered."""
    if token in OPERATOR_REGISTRY:
        return OPERATOR_REGISTRY[token]
    return None
|
| 420 |
+
|
| 421 |
+
|
| 422 |
+
def get_all_operator_tokens() -> List[str]:
    """Return a new list of every registered operator/function token, in registry order."""
    return [token for token in OPERATOR_REGISTRY]
|
| 425 |
+
|
| 426 |
+
|
| 427 |
+
def get_by_category(category: str) -> List[str]:
    """
    Return all tokens in a given category.

    The result is always a fresh list (empty for an unknown category), so
    callers may mutate it without corrupting the module-level
    ``OPERATOR_CATEGORIES`` table.
    """
    return list(OPERATOR_CATEGORIES.get(category, ()))
|
mathtok/pipeline.py
ADDED
|
@@ -0,0 +1,301 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
End-to-end MathTok Pipeline
|
| 3 |
+
|
| 4 |
+
Orchestrates all 7 layers into a single encode() call.
|
| 5 |
+
|
| 6 |
+
Pipeline flow
|
| 7 |
+
─────────────
|
| 8 |
+
Input text
|
| 9 |
+
→ HybridLexer (split TEXT / MATH spans)
|
| 10 |
+
→ For each MATH span:
|
| 11 |
+
→ Canonicalizer (normalize expression)
|
| 12 |
+
→ ASTGenerator (SymPy → ASTNode tree)
|
| 13 |
+
→ StructuralSerializer (DFS → SerializedToken list)
|
| 14 |
+
→ MetadataGenerator (structural attention metadata)
|
| 15 |
+
→ MathTokVocabulary (token → ID)
|
| 16 |
+
→ For each TEXT span:
|
| 17 |
+
→ MathTokVocabulary.encode_text() (BPE)
|
| 18 |
+
→ Merge results into TokenizedOutput
|
| 19 |
+
|
| 20 |
+
Usage
|
| 21 |
+
─────
|
| 22 |
+
>>> from mathtok import MathTokPipeline
|
| 23 |
+
>>> p = MathTokPipeline()
|
| 24 |
+
>>> out = p.encode("The derivative of $\\sin(x^2) + 3x$")
|
| 25 |
+
>>> out.tokens # list[str]
|
| 26 |
+
>>> out.input_ids # list[int]
|
| 27 |
+
>>> out.metadata # list[TokenMetadata]
|
| 28 |
+
>>> out.sexp # S-expression string (math spans only)
|
| 29 |
+
|
| 30 |
+
CLI
|
| 31 |
+
───
|
| 32 |
+
python -m mathtok.pipeline "sin(x^2) + 3x"
|
| 33 |
+
"""
|
| 34 |
+
|
| 35 |
+
from __future__ import annotations
|
| 36 |
+
|
| 37 |
+
import argparse
|
| 38 |
+
import json
|
| 39 |
+
import logging
|
| 40 |
+
from dataclasses import dataclass, field
|
| 41 |
+
from typing import Optional
|
| 42 |
+
|
| 43 |
+
from .canonicalizer import Canonicalizer, CanonicalizationResult
|
| 44 |
+
from .lexer import HybridLexer, SpanType, LexSpan
|
| 45 |
+
from .ast_generator import ASTGenerator, ASTNode
|
| 46 |
+
from .serializer import StructuralSerializer, SerializedToken
|
| 47 |
+
from .metadata import MetadataGenerator, TokenMetadata
|
| 48 |
+
from .vocabulary import MathTokVocabulary
|
| 49 |
+
|
| 50 |
+
logger = logging.getLogger(__name__)
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
# ── Output dataclass ──────────────────────────────────────────────────────
|
| 54 |
+
|
| 55 |
+
@dataclass
class TokenizedOutput:
    """
    Result container produced by the MathTok pipeline for a single input.

    Attributes
    ----------
    tokens        : Token strings, in emission order (math and text).
    input_ids     : Integer vocabulary IDs matching ``tokens``.
    metadata      : TokenMetadata entries produced by the metadata generator.
    spans         : LexSpan objects returned by the lexer.
    math_sexps    : One S-expression string per MATH span.
    canon_results : One CanonicalizationResult per MATH span.
    warnings      : Non-fatal warning messages collected along the way.
    """
    tokens: list[str] = field(default_factory=list)
    input_ids: list[int] = field(default_factory=list)
    metadata: list[TokenMetadata] = field(default_factory=list)
    spans: list[LexSpan] = field(default_factory=list)
    math_sexps: list[str] = field(default_factory=list)
    canon_results: list[CanonicalizationResult] = field(default_factory=list)
    warnings: list[str] = field(default_factory=list)

    @property
    def sexp(self) -> str:
        """All math S-expressions concatenated, separated by one space."""
        separator = " "
        return separator.join(self.math_sexps)

    def summary(self) -> str:
        """Build a short multi-line, human-readable report."""
        # Only the first ten IDs are shown; mark the cut when there are more.
        ellipsis = "..." if len(self.input_ids) > 10 else ""
        report = [
            f"Tokens : {len(self.tokens)}",
            f"Math spans : {len(self.math_sexps)}",
            f"Vocab IDs : {self.input_ids[:10]}{ellipsis}",
            f"S-expression: {self.sexp[:120]}",
        ]
        if self.warnings:
            joined = "; ".join(self.warnings)
            report.append(f"Warnings : {joined}")
        return "\n".join(report)

    def to_dict(self) -> dict:
        """Return a plain-dict view (``spans`` and ``canon_results`` are not included)."""
        serialised_meta = [entry.to_dict() for entry in self.metadata]
        return dict(
            tokens=self.tokens,
            input_ids=self.input_ids,
            metadata=serialised_meta,
            math_sexps=self.math_sexps,
            warnings=self.warnings,
        )
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
# ── Main pipeline ─────────────────────────────────────────────────────────
|
| 106 |
+
|
| 107 |
+
class MathTokPipeline:
    """
    End-to-end tokenization pipeline for mixed text+math input.

    Parameters
    ----------
    canonicalizer : Canonicalizer | None
        Override the default canonicalizer.
    lexer : HybridLexer | None
        Override the default lexer.
    ast_generator : ASTGenerator | None
        Override the default AST generator.
    serializer : StructuralSerializer | None
        Override the default serializer.
    metadata_gen : MetadataGenerator | None
        Override the default metadata generator.
    vocab : MathTokVocabulary | None
        Override the default vocabulary.
    include_metadata : bool
        Whether to compute structural metadata (slightly slower).
    timeout_seconds : float
        Forwarded to the default Canonicalizer; unused when
        ``canonicalizer`` is supplied.
    max_depth : int
        Forwarded to the default ASTGenerator; unused when
        ``ast_generator`` is supplied.
    emit_scope_tokens : bool
        Forwarded to the default StructuralSerializer; unused when
        ``serializer`` is supplied.
    """

    def __init__(
        self,
        canonicalizer: Optional[Canonicalizer] = None,
        lexer: Optional[HybridLexer] = None,
        ast_generator: Optional[ASTGenerator] = None,
        serializer: Optional[StructuralSerializer] = None,
        metadata_gen: Optional[MetadataGenerator] = None,
        vocab: Optional[MathTokVocabulary] = None,
        include_metadata: bool = True,
        timeout_seconds: float = 5.0,
        max_depth: int = 20,
        emit_scope_tokens: bool = True,
    ) -> None:
        # Test against None explicitly: a supplied component that happens to
        # be falsy (e.g. an empty container-like vocabulary) must not be
        # silently replaced by a freshly built default.
        self.canon = (
            canonicalizer
            if canonicalizer is not None
            else Canonicalizer(timeout_seconds=timeout_seconds)
        )
        self.lexer = lexer if lexer is not None else HybridLexer()
        self.ast_gen = (
            ast_generator
            if ast_generator is not None
            else ASTGenerator(max_depth=max_depth)
        )
        self.serializer = (
            serializer
            if serializer is not None
            else StructuralSerializer(emit_scope_tokens=emit_scope_tokens)
        )
        self.meta_gen = metadata_gen if metadata_gen is not None else MetadataGenerator()
        self.vocab = vocab if vocab is not None else MathTokVocabulary()
        self.include_metadata = include_metadata

    # ── Public API ────────────────────────────────────────────────────────

    def encode(self, text: str) -> TokenizedOutput:
        """
        Tokenize a mixed text+math string through the full pipeline.

        Parameters
        ----------
        text : str
            Input containing natural language and/or mathematical
            expressions in LaTeX or ASCII format.

        Returns
        -------
        TokenizedOutput
            ``tokens`` / ``input_ids`` cover every span in lexer order.
            ``metadata`` is computed from the serialized tokens of MATH
            spans only; TEXT spans contribute nothing to it.
        """
        out = TokenizedOutput()
        spans = self.lexer.lex(text)
        out.spans = spans

        # Serialized math tokens from all MATH spans, kept for metadata.
        all_serialized: list[SerializedToken] = []

        for span in spans:
            if span.span_type is SpanType.MATH:
                ser_tokens, sexp, canon_result, warnings = self._process_math(span.content)
                out.math_sexps.append(sexp)
                out.canon_results.append(canon_result)
                out.warnings.extend(warnings)
                all_serialized.extend(ser_tokens)
                out.tokens.extend(st.token for st in ser_tokens)
                out.input_ids.extend(self.vocab.token_to_id(st.token) for st in ser_tokens)
            else:
                # Text is encoded to IDs first; token strings are recovered
                # from those IDs so both lists stay the same length.
                text_ids = self.vocab.encode_text(span.content.strip())
                text_tokens = [self.vocab.id_to_token(i) for i in text_ids]
                out.tokens.extend(text_tokens)
                out.input_ids.extend(text_ids)

        # Structural metadata
        if self.include_metadata and all_serialized:
            vocab_map = self.vocab.get_vocab()
            out.metadata = self.meta_gen.generate(all_serialized, vocab=vocab_map)

        return out

    def encode_batch(self, texts: list[str]) -> list[TokenizedOutput]:
        """Tokenize a list of strings, one ``encode`` call per item."""
        return [self.encode(t) for t in texts]

    def encode_math_only(self, expression: str) -> TokenizedOutput:
        """
        Directly tokenize a pure math expression (no lexer splitting).
        Use when the input is guaranteed to be a single math expression.
        """
        ser_tokens, sexp, canon_result, warnings = self._process_math(expression)
        out = TokenizedOutput(
            tokens=[st.token for st in ser_tokens],
            input_ids=[self.vocab.token_to_id(st.token) for st in ser_tokens],
            math_sexps=[sexp],
            canon_results=[canon_result],
            warnings=warnings,
        )
        if self.include_metadata and ser_tokens:
            vocab_map = self.vocab.get_vocab()
            out.metadata = self.meta_gen.generate(ser_tokens, vocab=vocab_map)
        return out

    def get_hf_tokenizer(self):
        """Return a HuggingFace-compatible tokenizer wrapper."""
        return self.vocab.build_hf_tokenizer(pipeline=self)

    # ── Math processing sub-pipeline ──────────────────────────────────────

    @staticmethod
    def _error_tokens() -> list[SerializedToken]:
        """Return a one-element list holding the ``[UNK]`` placeholder token."""
        return [
            SerializedToken(
                token="[UNK]", position=0, depth=0, node_id=-1,
                parent_id=-1, child_index=0, num_children=0,
                is_leaf=True, subtree_size=1,
            )
        ]

    def _process_math(
        self, expression: str
    ) -> tuple[list[SerializedToken], str, CanonicalizationResult, list[str]]:
        """
        Run a single math expression through:
        Canonicalize → AST → Serialize → (metadata later)

        Every failure path (canonicalization, AST generation, serialization)
        yields the same ``[UNK]`` placeholder token and ``"[UNK]"``
        S-expression, so a MATH span always contributes at least one token.

        Returns (serialized_tokens, sexp_string, canon_result, warnings)
        """
        warnings: list[str] = []

        # Step 1: Canonicalize
        canon_result = self.canon.canonicalize(expression)
        warnings.extend(canon_result.warnings)

        if not canon_result.success:
            # Emit a single error token so downstream doesn't break
            return self._error_tokens(), "[UNK]", canon_result, warnings

        # Step 2: Build AST
        try:
            ast_root = self.ast_gen.generate(canon_result.expr)
        except Exception as exc:
            warnings.append(f"AST generation failed: {exc}")
            return self._error_tokens(), "[UNK]", canon_result, warnings

        # Step 3: Serialize to flat token stream
        try:
            ser_tokens = self.serializer.serialize(ast_root)
            sexp = self.serializer.to_sexp(ast_root)
        except Exception as exc:
            warnings.append(f"Serialization failed: {exc}")
            # Same placeholder as the other failure paths, rather than an
            # empty token list and an empty S-expression.
            return self._error_tokens(), "[UNK]", canon_result, warnings

        # Step 4: Dynamically register any new variable / number tokens
        for st in ser_tokens:
            if st.token.startswith(("VAR_", "NUM_")):
                self.vocab.add_math_token(st.token)

        return ser_tokens, sexp, canon_result, warnings
|
| 272 |
+
|
| 273 |
+
|
| 274 |
+
# ── CLI ───────────────────────────────────────────────────────────────────
|
| 275 |
+
|
| 276 |
+
def cli() -> None:
    """Parse command-line arguments, tokenize one expression and print the result.

    With ``--json`` the full dictionary form is printed; otherwise with
    ``--sexp`` only the S-expression; otherwise the summary followed by the
    token list. When no expression argument is given it is read from stdin.
    """
    arg_parser = argparse.ArgumentParser(
        description="MathTok: Tokenize a mathematical expression."
    )
    arg_parser.add_argument("expression", nargs="?", help="Math expression to tokenize")
    arg_parser.add_argument("--json", action="store_true", help="Output full JSON")
    arg_parser.add_argument("--sexp", action="store_true", help="Output S-expression only")
    options = arg_parser.parse_args()

    expression = options.expression
    if not expression:
        # Missing (or empty) positional argument: prompt interactively.
        expression = input("Expression: ")

    result = MathTokPipeline().encode(expression)

    # --json takes precedence over --sexp when both are passed.
    if options.json:
        print(json.dumps(result.to_dict(), indent=2))
        return
    if options.sexp:
        print(result.sexp)
        return
    print(result.summary())
    print("\nTokens:", result.tokens)


if __name__ == "__main__":
    cli()
|
mathtok/serializer.py
ADDED
|
@@ -0,0 +1,239 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Layer 5: Structural Serialization
|
| 3 |
+
|
| 4 |
+
Flattens the ASTNode tree into a 1-D token sequence suitable for
|
| 5 |
+
transformer consumption via DFS preorder traversal.
|
| 6 |
+
|
| 7 |
+
Three output formats
|
| 8 |
+
────────────────────
|
| 9 |
+
flat [OP_ADD, VAR_X, CONST_1] ← primary output
|
| 10 |
+
sexp (OP_ADD VAR_X CONST_1) ← Lisp-style, human readable
|
| 11 |
+
indented OP_ADD ← indented tree
|
| 12 |
+
VAR_X
|
| 13 |
+
CONST_1
|
| 14 |
+
|
| 15 |
+
Each emitted token is wrapped in a SerializedToken dataclass that
|
| 16 |
+
carries position, depth, parent, child-index, and subtree-size metadata.
|
| 17 |
+
This metadata is used by the MetadataGenerator (Layer 6).
|
| 18 |
+
"""
|
| 19 |
+
|
| 20 |
+
from __future__ import annotations
|
| 21 |
+
|
| 22 |
+
import hashlib
|
| 23 |
+
from dataclasses import dataclass, asdict
|
| 24 |
+
|
| 25 |
+
from .ast_generator import ASTNode
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
# ── Boundary tokens ───────────────────────────────────────────────────────
|
| 29 |
+
|
| 30 |
+
MATH_START = "[MATH_START]"
|
| 31 |
+
MATH_END = "[MATH_END]"
|
| 32 |
+
TEXT_START = "[TEXT_START]"
|
| 33 |
+
TEXT_END = "[TEXT_END]"
|
| 34 |
+
SCOPE_OPEN = "[SCOPE_OPEN]"
|
| 35 |
+
SCOPE_CLOSE = "[SCOPE_CLOSE]"
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
# ── Token dataclass ───────────────────────────────────────────────────────
|
| 39 |
+
|
| 40 |
+
@dataclass
class SerializedToken:
    """
    A single entry of the flattened structural token stream.

    Attributes
    ----------
    token        : MathTok vocabulary string.
    position     : 0-based index in the flat sequence.
    depth        : Tree depth at emission time (root = 0).
    node_id      : Unique AST node identifier.
    parent_id    : node_id of the parent (-1 for root / boundary tokens).
    child_index  : 0-based index of this node among its siblings.
    num_children : Count of direct children.
    is_leaf      : True iff the node has no children.
    subtree_size : Total number of nodes in the subtree rooted here.
    is_boundary  : True for [MATH_START], [MATH_END], etc.
    """
    token: str
    position: int
    depth: int
    node_id: int
    parent_id: int
    child_index: int
    num_children: int
    is_leaf: bool
    subtree_size: int
    is_boundary: bool = False

    def to_dict(self) -> dict:
        """Return all fields as a plain dictionary."""
        as_mapping = asdict(self)
        return as_mapping

    def __repr__(self) -> str:
        """Compact representation showing position, token, depth and child count."""
        parts = (
            f"pos={self.position}",
            f"token={self.token!r}",
            f"depth={self.depth}",
            f"children={self.num_children}",
        )
        return "SerializedToken(" + ", ".join(parts) + ")"
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
# ── Serializer ────────────────────────────────────────────────────────────
|
| 80 |
+
|
| 81 |
+
class StructuralSerializer:
    """
    Serialize an ASTNode tree into a flat SerializedToken stream.

    The serialization order is DFS preorder (root first, then children
    left-to-right). This ordering is:
    - recoverable given depth metadata
    - compatible with causal language model training
    - established practice for tree-to-sequence in NLP research

    Parameters
    ----------
    include_boundaries : bool
        Wrap the token stream with [MATH_START] / [MATH_END] sentinels.
    emit_scope_tokens : bool
        Surround the children of every non-leaf ``FUNC_*`` node with
        [SCOPE_OPEN] / [SCOPE_CLOSE] boundary tokens.
    dedup_subtrees : bool
        When a non-leaf subtree with the same structural hash was already
        emitted during the current ``serialize`` call, emit a single
        ``SUBTREE_REF_<first 8 hash chars>`` token instead of repeating it.
    """

    def __init__(
        self,
        include_boundaries: bool = True,
        emit_scope_tokens: bool = True,
        dedup_subtrees: bool = False,
    ) -> None:
        self.include_boundaries = include_boundaries
        self.emit_scope_tokens = emit_scope_tokens
        self.dedup_subtrees = dedup_subtrees
        # structural hash -> node_id of the first subtree seen with that hash
        self._hash_cache: dict[str, int] = {}

    # ── Public API ────────────────────────────────────────────────────────

    def serialize(self, root: ASTNode) -> list[SerializedToken]:
        """
        Serialize the AST to a flat SerializedToken stream.

        Parameters
        ----------
        root : ASTNode
            Root node output by ASTGenerator.

        Returns
        -------
        list[SerializedToken]
            Tokens in DFS preorder; ``position`` of each equals its index.
        """
        tokens: list[SerializedToken] = []
        # Dedup state is per call, not shared across expressions.
        self._hash_cache.clear()

        if self.include_boundaries:
            tokens.append(_boundary_token(MATH_START, 0))

        self._dfs(root, tokens)

        if self.include_boundaries:
            tokens.append(_boundary_token(MATH_END, len(tokens)))

        # Renumber so that position always matches the final list index.
        for i, t in enumerate(tokens):
            t.position = i

        return tokens

    def to_token_list(self, root: ASTNode) -> list[str]:
        """Return just the token strings (for vocabulary mapping)."""
        return [st.token for st in self.serialize(root)]

    def to_sexp(self, root: ASTNode) -> str:
        """Serialize to a Lisp-style S-expression string."""
        return self._sexp(root)

    def to_indented(self, root: ASTNode, indent: int = 2) -> str:
        """Serialize to an indented tree string, ``indent`` spaces per level."""
        lines: list[str] = []
        self._indent(root, lines, 0, indent)
        return "\n".join(lines)

    def reconstruct_depth_sequence(self, tokens: list[SerializedToken]) -> list[int]:
        """Return the depth of each token position (useful for pos-encoding).

        Negative depths (the default for boundary tokens) are clamped to 0.
        """
        return [max(t.depth, 0) for t in tokens]

    def subtree_hash(self, node: ASTNode) -> str:
        """Compute a stable MD5 structural hash of the subtree rooted at node.

        The digest covers the node's token followed by the hex digests of
        its children, in order.
        """
        hasher = hashlib.md5()
        hasher.update(node.token.encode('utf-8'))
        for child in node.children:
            hasher.update(self.subtree_hash(child).encode('utf-8'))
        return hasher.hexdigest()

    # ── DFS preorder traversal ────────────────────────────────────────────

    def _dfs(
        self,
        node: ASTNode,
        tokens: list[SerializedToken],
        child_index: int = 0,
    ) -> None:
        """Emit current node then recurse into children.

        Appends to ``tokens`` in place. ``child_index`` is the node's index
        among its siblings (0 for the root).
        """
        if self.dedup_subtrees and not node.is_leaf:
            node_hash = self.subtree_hash(node)
            if node_hash in self._hash_cache:
                # Already emitted once: replace the whole subtree by a
                # leaf-like reference token and do not descend.
                tokens.append(SerializedToken(
                    token=f"SUBTREE_REF_{node_hash[:8]}",
                    position=len(tokens),
                    depth=node.depth,
                    node_id=node.node_id,
                    parent_id=node.parent_id,
                    child_index=child_index,
                    num_children=0,
                    is_leaf=True,
                    subtree_size=1,
                ))
                return
            self._hash_cache[node_hash] = node.node_id

        tokens.append(SerializedToken(
            token=node.token,
            position=len(tokens),
            depth=node.depth,
            node_id=node.node_id,
            parent_id=node.parent_id,
            child_index=child_index,
            num_children=len(node.children),
            is_leaf=node.is_leaf,
            subtree_size=node.subtree_size,
        ))

        # Scope markers wrap the arguments of non-leaf FUNC_* nodes only.
        wrap_in_scope = (
            node.token.startswith("FUNC_")
            and self.emit_scope_tokens
            and not node.is_leaf
        )
        if wrap_in_scope:
            tokens.append(_boundary_token(SCOPE_OPEN, len(tokens), depth=node.depth + 1, parent_id=node.node_id))

        for i, child in enumerate(node.children):
            self._dfs(child, tokens, child_index=i)

        if wrap_in_scope:
            tokens.append(_boundary_token(SCOPE_CLOSE, len(tokens), depth=node.depth + 1, parent_id=node.node_id))

    # ── S-expression ──────────────────────────────────────────────────────

    def _sexp(self, node: ASTNode) -> str:
        """Render ``node`` as ``TOKEN`` (leaf) or ``(TOKEN child ...)``."""
        if node.is_leaf:
            return node.token
        child_parts = " ".join(self._sexp(c) for c in node.children)
        return f"({node.token} {child_parts})"

    # ── Indented tree ─────────────────────────────────────────────────────

    def _indent(self, node: ASTNode, lines: list[str], level: int, indent: int) -> None:
        """Append one line per node to ``lines``, indented by ``level * indent`` spaces."""
        lines.append(" " * (level * indent) + node.token)
        for child in node.children:
            self._indent(child, lines, level + 1, indent)
|
| 230 |
+
|
| 231 |
+
|
| 232 |
+
# ── Helpers ───────────────────────────────────────────────────────────────
|
| 233 |
+
|
| 234 |
+
def _boundary_token(tok: str, pos: int, depth: int = -1, parent_id: int = -1) -> SerializedToken:
    """Build a childless SerializedToken flagged as a boundary marker.

    ``tok``, ``pos``, ``depth`` and ``parent_id`` are taken from the
    arguments; the remaining fields are fixed (node_id -1, no children,
    subtree_size 0, is_boundary True).
    """
    fixed_fields = dict(
        node_id=-1,
        child_index=0,
        num_children=0,
        is_leaf=True,
        subtree_size=0,
        is_boundary=True,
    )
    return SerializedToken(
        token=tok,
        position=pos,
        depth=depth,
        parent_id=parent_id,
        **fixed_fields,
    )
|
mathtok/streaming.py
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
from typing import Iterator, Optional, Iterable
|
| 3 |
+
|
| 4 |
+
from .pipeline import TokenizedOutput, MathTokPipeline
|
| 5 |
+
from .canonicalizer import Canonicalizer
|
| 6 |
+
from .lexer import HybridLexer
|
| 7 |
+
from .ast_generator import ASTGenerator
|
| 8 |
+
from .serializer import StructuralSerializer
|
| 9 |
+
from .metadata import MetadataGenerator
|
| 10 |
+
from .vocabulary import MathTokVocabulary
|
| 11 |
+
|
| 12 |
+
logger = logging.getLogger(__name__)
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class MathTokStreamingPipeline:
    """
    A memory-efficient streaming wrapper for MathTokPipeline.
    Uses generators to process massive datasets (e.g., millions of equations)
    without loading all inputs or outputs into RAM simultaneously.

    All constructor arguments are forwarded unchanged to ``MathTokPipeline``;
    the wrapped instance is available as ``self.pipeline``.
    """

    def __init__(
        self,
        canonicalizer: Optional[Canonicalizer] = None,
        lexer: Optional[HybridLexer] = None,
        ast_generator: Optional[ASTGenerator] = None,
        serializer: Optional[StructuralSerializer] = None,
        metadata_gen: Optional[MetadataGenerator] = None,
        vocab: Optional[MathTokVocabulary] = None,
        include_metadata: bool = True,
        timeout_seconds: float = 5.0,
        max_depth: int = 20,
        emit_scope_tokens: bool = True,
    ) -> None:
        self.pipeline = MathTokPipeline(
            canonicalizer=canonicalizer,
            lexer=lexer,
            ast_generator=ast_generator,
            serializer=serializer,
            metadata_gen=metadata_gen,
            vocab=vocab,
            include_metadata=include_metadata,
            timeout_seconds=timeout_seconds,
            max_depth=max_depth,
            emit_scope_tokens=emit_scope_tokens,
        )

    def encode_stream(self, text_stream: Iterable[str]) -> Iterator[TokenizedOutput]:
        """
        Lazily tokenize a stream of text strings.

        Yields exactly one TokenizedOutput per input item. When encoding an
        item raises, a warning is logged and an otherwise-empty
        TokenizedOutput carrying the error text in ``warnings`` is yielded
        in its place, so the stream keeps going.
        """
        for text in text_stream:
            try:
                result = self.pipeline.encode(text)
            except Exception as e:
                # Only slice real strings: a non-string item (the likely
                # cause of the failure) must not raise again in here and
                # abort the whole stream.
                preview = text[:50] if isinstance(text, str) else text
                logger.warning("Failed to encode text %r: %s", preview, e)
                result = TokenizedOutput(warnings=[str(e)])
            # Yield outside the try block so an exception thrown into the
            # generator by the consumer is not mistaken for an encode error.
            yield result

    def encode_file(self, file_path: str, encoding: str = 'utf-8') -> Iterator[TokenizedOutput]:
        """
        Stream expressions from a line-delimited text file.

        Each line is stripped of surrounding whitespace; empty lines are
        skipped. The file is opened only once iteration starts.
        """
        def line_generator() -> Iterator[str]:
            with open(file_path, 'r', encoding=encoding) as f:
                for line in f:
                    line = line.strip()
                    if line:
                        yield line

        return self.encode_stream(line_generator())
|
mathtok/validator.py
ADDED
|
@@ -0,0 +1,137 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sympy as sp
|
| 2 |
+
from dataclasses import dataclass
|
| 3 |
+
from typing import Optional, Union
|
| 4 |
+
|
| 5 |
+
from .pipeline import TokenizedOutput
|
| 6 |
+
from .operator_registry import OPERATOR_REGISTRY
|
| 7 |
+
from .canonicalizer import Canonicalizer
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
@dataclass
class ValidationResult:
    """Outcome of one ``RoundTripValidator.validate`` call."""
    # True when the simplified difference between the original and the
    # reconstructed expression equals 0; False on any failure.
    is_valid: bool
    # The original expression (parsed, if it was given as a string);
    # None when it could not be parsed.
    original_expr: Optional[sp.Expr]
    # Expression rebuilt from the token metadata; None when validation
    # stopped before reconstruction finished.
    reconstructed_expr: Optional[sp.Expr]
    # Human-readable failure reason; None when the result is valid.
    error_message: Optional[str]
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class RoundTripValidator:
    """
    Validates that a tokenized math expression can be perfectly
    reconstructed back into the original SymPy expression.
    """

    def __init__(self):
        # Used only to parse ``original_expr`` when it is passed as a string.
        self.canon = Canonicalizer()

    def validate(self, output: TokenizedOutput, original_expr: Union[sp.Expr, str]) -> ValidationResult:
        """
        Rebuild a SymPy expression from ``output.metadata`` and compare it
        with ``original_expr``.

        Only the tokens strictly between the last ``[MATH_START]`` seen
        before the first ``[MATH_END]`` and that ``[MATH_END]`` are used.
        Equivalence is decided by ``sympy.simplify(original - rebuilt) == 0``.

        Parameters
        ----------
        output : TokenizedOutput
            Pipeline output; its ``metadata`` list must be populated.
        original_expr : sympy.Expr | str
            Reference expression; strings are parsed with the canonicalizer.

        Returns
        -------
        ValidationResult
            Never raises: any exception is reported through
            ``is_valid=False`` and ``error_message``.
        """
        try:
            if isinstance(original_expr, str):
                fmt, expr, warnings = self.canon._parse(original_expr)
                if expr is None:
                    return ValidationResult(False, None, None, f"Could not parse original: {warnings}")
                original_expr = expr

            # We need to extract the math tokens. We'll rely on the metadata array.
            # Scan up to the first MATH_END, remembering the latest MATH_START.
            math_start_idx = -1
            math_end_idx = -1
            for i, meta in enumerate(output.metadata):
                if meta.token == "[MATH_START]":
                    math_start_idx = i
                elif meta.token == "[MATH_END]":
                    math_end_idx = i
                    break

            if math_start_idx == -1 or math_end_idx == -1:
                return ValidationResult(False, original_expr, None, "No valid math span found in output")

            math_metadata = output.metadata[math_start_idx+1:math_end_idx]

            # Reconstruct the tree from metadata using node_id and children_ids.
            # Entries with a negative node_id (boundary tokens) are not tree nodes.
            node_map = {m.node_id: m for m in math_metadata if m.node_id >= 0}

            if not node_map:
                return ValidationResult(False, original_expr, None, "No math nodes found")

            # Find root (parent_id == -1)
            root_id = -1
            for m in node_map.values():
                if m.parent_id == -1:
                    root_id = m.node_id
                    break

            if root_id == -1:
                return ValidationResult(False, original_expr, None, "No root node found")

            reconstructed = self._build_expr(root_id, node_map)

            # Use sympy.simplify to check equivalence
            diff = sp.simplify(original_expr - reconstructed)
            is_valid = diff == 0

            return ValidationResult(
                is_valid=is_valid,
                original_expr=original_expr,
                reconstructed_expr=reconstructed,
                error_message=None if is_valid else f"Difference is non-zero: {diff}"
            )

        except Exception as exc:
            return ValidationResult(False, original_expr if isinstance(original_expr, sp.Expr) else None, None, f"Validation failed: {exc}")

    def _build_expr(self, node_id: int, node_map: dict) -> sp.Expr:
        """
        Recursively rebuild the SymPy expression for ``node_id``.

        ``node_map`` maps node_id to its metadata entry; each entry is read
        for ``token``, ``token_category`` and ``children_ids``. Tokens that
        match no rule become ``Symbol("UNKNOWN_<token>")``.
        """
        meta = node_map[node_id]

        # Base cases (leaves)
        if meta.token_category == "constant":
            if meta.token.startswith("CONST_"):
                val = meta.token[6:]
                if val == "PI": return sp.pi
                if val == "E": return sp.E
                if val == "I": return sp.I
                if val == "INF": return sp.oo
                if val == "NEG_INF": return sp.S.NegativeInfinity
                if val == "NAN": return sp.nan
                return sp.Integer(int(val))
            elif meta.token.startswith("NUM_"):
                return sp.Integer(int(meta.token[4:]))
            elif meta.token.startswith("FLOAT_"):
                # Undo the token-safe spelling: "p" stands for the decimal
                # point and "NEG" for the minus sign.
                val_str = meta.token[6:].replace("p", ".").replace("NEG", "-")
                return sp.Float(val_str)

        if meta.token_category == "variable":
            var_name = meta.token[4:].lower()
            if var_name == "gamma_": var_name = "gamma"
            return sp.Symbol(var_name)

        if meta.token == "SUBTREE_TRUNCATED":
            return sp.Symbol("TRUNCATED")

        # Recursive case
        children = [self._build_expr(cid, node_map) for cid in meta.children_ids]

        if meta.token == "FRAC":
            return sp.Rational(children[0], children[1])

        op_meta = OPERATOR_REGISTRY.get(meta.token)
        if op_meta:
            cls = getattr(sp, op_meta.sympy_type, None)
            if cls:
                # Unary negation / reciprocal are stored as single-child
                # Mul / Pow nodes; restore the implicit -1 operand.
                if op_meta.sympy_type == "Mul" and meta.token == "OP_NEG":
                    return sp.Mul(sp.Integer(-1), children[0])
                if op_meta.sympy_type == "Pow" and meta.token == "OP_RECIP":
                    return sp.Pow(children[0], sp.Integer(-1))
                return cls(*children)

        # Fallback functions
        if meta.token.startswith("FUNC_"):
            func_name = meta.token[5:]
            # SymPy names some callables in capitalised form (Abs, Max) and
            # the elementary functions in lowercase (sin, exp, log), so try
            # both spellings before giving up on a known function.
            for candidate in (func_name.capitalize(), func_name.lower()):
                cls = getattr(sp, candidate, None)
                if callable(cls):
                    return cls(*children)
            # Not a SymPy name at all: keep it as an undefined function.
            return sp.Function(func_name.lower())(*children)

        # Unknown
        return sp.Symbol(f"UNKNOWN_{meta.token}")
|
mathtok/vocabulary.py
ADDED
|
@@ -0,0 +1,408 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Layer 7: Vocabulary & BPE Compression
|
| 3 |
+
|
| 4 |
+
Two-tier vocabulary design
|
| 5 |
+
──────────────────────────
|
| 6 |
+
Tier 1 — Fixed Math Vocabulary
|
| 7 |
+
Every mathematical token (operators, functions, variables, constants,
|
| 8 |
+
structural) has a deterministic integer ID. These IDs are NEVER
|
| 9 |
+
computed by BPE; their meaning is exact and invariant.
|
| 10 |
+
|
| 11 |
+
Tier 2 — BPE Text Vocabulary
|
| 12 |
+
Natural-language text spans are compressed using the HuggingFace
|
| 13 |
+
`tokenizers` library (Byte-Pair Encoding). Only text tokens are
|
| 14 |
+
subject to BPE; math tokens bypass BPE entirely.
|
| 15 |
+
|
| 16 |
+
HuggingFace PreTrainedTokenizer compatibility
|
| 17 |
+
─────────────────────────────────────────────
|
| 18 |
+
MathTokHFTokenizer mirrors the core PreTrainedTokenizer interface so the tokenizer
|
| 19 |
+
can be used as a drop-in replacement in any HF training pipeline:
|
| 20 |
+
|
| 21 |
+
from mathtok import MathTokVocabulary
|
| 22 |
+
tok = MathTokVocabulary().build_hf_tokenizer(pipeline)
|
| 23 |
+
tok.save_pretrained("./mathtok-tokenizer")
|
| 24 |
+
tok = MathTokHFTokenizer.from_pretrained("./mathtok-tokenizer")
|
| 25 |
+
"""
|
| 26 |
+
|
| 27 |
+
from __future__ import annotations
|
| 28 |
+
|
| 29 |
+
import json
|
| 30 |
+
import logging
|
| 31 |
+
import os
|
| 32 |
+
from pathlib import Path
|
| 33 |
+
from typing import Optional
|
| 34 |
+
|
| 35 |
+
from .operator_registry import get_all_operator_tokens
|
| 36 |
+
|
| 37 |
+
logger = logging.getLogger(__name__)
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
# ── Fixed vocabulary constants ────────────────────────────────────────────
|
| 41 |
+
|
| 42 |
+
_SPECIAL_TOKENS = [
|
| 43 |
+
"[PAD]", # 0
|
| 44 |
+
"[UNK]", # 1
|
| 45 |
+
"[UNK_MATH]", # 2
|
| 46 |
+
"[BOS]", # 3
|
| 47 |
+
"[EOS]", # 4
|
| 48 |
+
"[MATH_START]", # 5
|
| 49 |
+
"[MATH_END]", # 6
|
| 50 |
+
"[TEXT_START]", # 7
|
| 51 |
+
"[TEXT_END]", # 8
|
| 52 |
+
"[SEP]", # 9
|
| 53 |
+
"[MASK]", # 10
|
| 54 |
+
"[SCOPE_OPEN]", # 11
|
| 55 |
+
"[SCOPE_CLOSE]",# 12
|
| 56 |
+
"SUBTREE_TRUNCATED", # 13
|
| 57 |
+
]
|
| 58 |
+
|
| 59 |
+
# Common variable tokens
|
| 60 |
+
_VAR_TOKENS = [
|
| 61 |
+
"VAR_X", "VAR_Y", "VAR_Z", "VAR_T", "VAR_N", "VAR_K",
|
| 62 |
+
"VAR_A", "VAR_B", "VAR_C", "VAR_M", "VAR_I", "VAR_J",
|
| 63 |
+
"VAR_R", "VAR_S", "VAR_U", "VAR_V", "VAR_W", "VAR_P",
|
| 64 |
+
"VAR_Q", "VAR_L", "VAR_F", "VAR_G", "VAR_H",
|
| 65 |
+
# Greek
|
| 66 |
+
"VAR_THETA", "VAR_ALPHA", "VAR_BETA", "VAR_GAMMA_",
|
| 67 |
+
"VAR_DELTA", "VAR_EPSILON","VAR_ZETA", "VAR_ETA",
|
| 68 |
+
"VAR_LAMBDA","VAR_MU", "VAR_NU", "VAR_XI",
|
| 69 |
+
"VAR_RHO", "VAR_SIGMA", "VAR_TAU", "VAR_PHI",
|
| 70 |
+
"VAR_CHI", "VAR_PSI", "VAR_OMEGA",
|
| 71 |
+
"VAR_IOTA", "VAR_KAPPA", "VAR_OMICRON", "VAR_UPSILON",
|
| 72 |
+
]
|
| 73 |
+
|
| 74 |
+
# Constant tokens: CONST_-10 through CONST_100
|
| 75 |
+
_CONST_TOKENS = (
|
| 76 |
+
[f"CONST_{i}" for i in range(-10, 101)]
|
| 77 |
+
+ ["CONST_PI", "CONST_E", "CONST_I", "CONST_INF", "CONST_NEG_INF", "CONST_NAN"]
|
| 78 |
+
)
|
| 79 |
+
|
| 80 |
+
# Large-number / float fallback tokens (dynamically added as needed)
|
| 81 |
+
_NUMERIC_PLACEHOLDERS = [f"NUM_{i}" for i in range(101, 1001)]
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
def _build_fixed_vocab() -> dict[str, int]:
    """
    Assemble the fixed math vocabulary as a token → integer-ID mapping.

    Token groups are concatenated in a fixed order (special tokens,
    operator tokens, variables, constants, numeric placeholders) and the
    position of each token's first occurrence becomes its permanent ID.
    """
    ordered_tokens = [
        *_SPECIAL_TOKENS,
        *get_all_operator_tokens(),  # from operator_registry
        *_VAR_TOKENS,
        *_CONST_TOKENS,
        *_NUMERIC_PLACEHOLDERS,
    ]
    # dict.fromkeys drops later duplicates while keeping first-seen order.
    unique_tokens = dict.fromkeys(ordered_tokens)
    return {token: position for position, token in enumerate(unique_tokens)}
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
# ── MathTokVocabulary ─────────────────────────────────────────────────────
|
| 106 |
+
|
| 107 |
+
class MathTokVocabulary:
    """
    Two-tier math + BPE vocabulary manager.

    Fixed math tokens are deterministically assigned IDs.
    BPE vocabulary (trained on text corpora) is appended after: a BPE
    token with local id ``i`` is exposed as ``_bpe_offset + i``.

    Parameters
    ----------
    bpe_vocab_size : int
        Target size of the BPE sub-vocabulary for text tokens.
    """

    VOCAB_FILE = "mathtok_vocab.json"
    MERGES_FILE = "mathtok_bpe_merges.txt"
    # Full serialized `tokenizers.Tokenizer`; this is the file `load()` reads.
    BPE_TOKENIZER_FILE = "tokenizer.json"

    def __init__(self, bpe_vocab_size: int = 8000) -> None:
        self.bpe_vocab_size = bpe_vocab_size
        self._math_vocab: dict[str, int] = _build_fixed_vocab()
        self._ids_to_tokens: dict[int, str] = {v: k for k, v in self._math_vocab.items()}
        self._bpe_tokenizer = None  # HF tokenizers.Tokenizer for text
        # Set by load_bpe_from_pretrained(); initialised here so the
        # attribute always exists on every instance.
        self._hf_text_tokenizer = None
        self._bpe_offset = len(self._math_vocab)  # BPE IDs start here

    # ── Properties ───────────────────────────────────────────────────────

    @property
    def math_vocab_size(self) -> int:
        """Number of tokens in the fixed math vocabulary."""
        return len(self._math_vocab)

    @property
    def total_vocab_size(self) -> int:
        """Size of the merged ID space (math tokens plus BPE tokens, if any)."""
        if self._bpe_tokenizer is None:
            return self.math_vocab_size
        return self.math_vocab_size + len(self._bpe_tokenizer.get_vocab())

    def get_vocab(self) -> dict[str, int]:
        """Return the complete merged vocabulary."""
        vocab = dict(self._math_vocab)
        if self._bpe_tokenizer is not None:
            for tok, idx in self._bpe_tokenizer.get_vocab().items():
                # Math entries win on a name clash (e.g. "[PAD]", "[UNK]"),
                # matching the lookup order used by token_to_id().
                if tok not in vocab:
                    vocab[tok] = self._bpe_offset + idx
        return vocab

    # ── Token ↔ ID ────────────────────────────────────────────────────────

    def token_to_id(self, token: str) -> int:
        """Return the integer ID for a token, using [UNK]=1 as fallback."""
        if token in self._math_vocab:
            return self._math_vocab[token]
        if self._bpe_tokenizer is not None:
            bpe_id = self._bpe_tokenizer.token_to_id(token)
            if bpe_id is not None:
                return self._bpe_offset + bpe_id
        return self._math_vocab["[UNK]"]

    def id_to_token(self, idx: int) -> str:
        """Return the token string for an integer ID ("[UNK]" if unknown)."""
        if idx in self._ids_to_tokens:
            return self._ids_to_tokens[idx]
        if self._bpe_tokenizer is not None:
            bpe_idx = idx - self._bpe_offset
            if bpe_idx >= 0:
                tok = self._bpe_tokenizer.id_to_token(bpe_idx)
                if tok is not None:
                    return tok
        return "[UNK]"

    def encode_text(self, text: str) -> list[int]:
        """Encode a plain text span with BPE (fallback to char-level)."""
        if self._bpe_tokenizer is not None:
            enc = self._bpe_tokenizer.encode(text)
            return [self._bpe_offset + i for i in enc.ids]
        # Character-level fallback: characters absent from the vocabulary
        # map to [UNK] via token_to_id().
        return [self.token_to_id(ch) for ch in text]

    def encode_math_tokens(self, tokens: list[str]) -> list[int]:
        """Map a list of math token strings to integer IDs."""
        return [self.token_to_id(t) for t in tokens]

    def add_math_token(self, token: str) -> int:
        """
        Dynamically add a new math token (e.g. VAR_FOO) to vocabulary.

        Returns the token's ID; an already-known token keeps its ID.
        Adding a token moves ``_bpe_offset`` up by one, so every merged BPE
        ID shifts. Add math tokens before encoding text with BPE.
        """
        if token not in self._math_vocab:
            if self._bpe_tokenizer is not None:
                # IDs previously returned by encode_text() are now stale.
                logger.warning(
                    "Adding math token %r after BPE is loaded shifts all BPE token IDs",
                    token,
                )
            new_id = len(self._math_vocab)
            self._math_vocab[token] = new_id
            self._ids_to_tokens[new_id] = token
            self._bpe_offset = len(self._math_vocab)
        return self._math_vocab[token]

    # ── BPE training ──────────────────────────────────────────────────────

    def train_bpe(self, text_corpus: list[str]) -> None:
        """
        Train a BPE tokenizer on a list of text strings.
        Only the TEXT spans of math problem descriptions should be used.

        Requires: pip install tokenizers

        Raises
        ------
        ImportError
            If the ``tokenizers`` package is not installed.
        """
        try:
            from tokenizers import Tokenizer
            from tokenizers.models import BPE
            from tokenizers.trainers import BpeTrainer
            from tokenizers.pre_tokenizers import Whitespace
        except ImportError as exc:
            # Chain the cause so the original import failure stays visible.
            raise ImportError("Install 'tokenizers' package: pip install tokenizers") from exc

        tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
        tokenizer.pre_tokenizer = Whitespace()
        trainer = BpeTrainer(
            vocab_size=self.bpe_vocab_size,
            special_tokens=["[PAD]", "[UNK]", "[BOS]", "[EOS]"],
            show_progress=False,
        )
        tokenizer.train_from_iterator(text_corpus, trainer=trainer)
        self._bpe_tokenizer = tokenizer
        logger.info(
            "BPE trained: vocab_size=%d, total_vocab=%d",
            len(tokenizer.get_vocab()),
            self.total_vocab_size,
        )

    def load_bpe_from_pretrained(self, model_name_or_path: str = "gpt2") -> None:
        """
        Load a pre-trained HuggingFace tokenizer as the BPE backend.
        Useful as a zero-shot baseline for the text sub-vocabulary.

        Best-effort: any failure is logged as a warning and the current
        state is left untouched.

        NOTE(review): the loaded tokenizer is only stored on
        ``_hf_text_tokenizer``; no method in this class reads it yet, and
        ``_bpe_tokenizer`` is reset to None, so ``encode_text`` falls back
        to character level after a successful call. Confirm intended use.
        """
        try:
            from transformers import AutoTokenizer
            hf_tok = AutoTokenizer.from_pretrained(model_name_or_path)
            # Wrap in our interface by using its encoding
            self._hf_text_tokenizer = hf_tok
            self._bpe_tokenizer = None  # use _hf_text_tokenizer path instead
            logger.info("Loaded HF text tokenizer: %s", model_name_or_path)
        except Exception as exc:
            logger.warning("Could not load HF tokenizer %s: %s", model_name_or_path, exc)

    # ── Persistence ───────────────────────────────────────────────────────

    def save(self, directory: str) -> None:
        """
        Save vocabulary to directory.

        Writes the math vocabulary as JSON and, when a BPE tokenizer is
        present, both the raw BPE model files and the full serialized
        tokenizer that ``load()`` reads back.
        """
        dirpath = Path(directory)
        dirpath.mkdir(parents=True, exist_ok=True)

        vocab_path = dirpath / self.VOCAB_FILE
        with open(vocab_path, "w", encoding="utf-8") as f:
            json.dump(self._math_vocab, f, indent=2)

        if self._bpe_tokenizer is not None:
            # Raw model artefacts (vocab.json / merges.txt), kept so existing
            # consumers of those files keep working.
            self._bpe_tokenizer.model.save(str(dirpath))
            # Full tokenizer (model + pre-tokenizer + special tokens). Without
            # this file load() can never restore the BPE backend.
            self._bpe_tokenizer.save(str(dirpath / self.BPE_TOKENIZER_FILE))
        logger.info("Vocabulary saved to %s", dirpath)

    @classmethod
    def load(cls, directory: str) -> "MathTokVocabulary":
        """
        Load vocabulary from a saved directory.

        The BPE tokenizer is restored only when ``tokenizer.json`` exists;
        a failure to read it is logged and the instance is returned with
        the math vocabulary alone.
        """
        dirpath = Path(directory)
        vocab_path = dirpath / cls.VOCAB_FILE

        instance = cls()
        with open(vocab_path, "r", encoding="utf-8") as f:
            instance._math_vocab = json.load(f)
        instance._ids_to_tokens = {v: k for k, v in instance._math_vocab.items()}
        instance._bpe_offset = len(instance._math_vocab)

        # Try loading BPE if present
        bpe_path = dirpath / cls.BPE_TOKENIZER_FILE
        if bpe_path.exists():
            try:
                from tokenizers import Tokenizer
                instance._bpe_tokenizer = Tokenizer.from_file(str(bpe_path))
            except Exception as exc:
                logger.warning("Could not load BPE tokenizer: %s", exc)
        elif (dirpath / "vocab.json").exists():
            # Directory written by an older save() that stored only the raw
            # BPE model files; those alone cannot rebuild the tokenizer here.
            logger.warning(
                "Could not load BPE tokenizer: %s not found in %s",
                cls.BPE_TOKENIZER_FILE,
                dirpath,
            )

        logger.info("Vocabulary loaded from %s (size=%d)", dirpath, len(instance._math_vocab))
        return instance

    # ── HuggingFace PreTrainedTokenizer factory ───────────────────────────

    def build_hf_tokenizer(self, pipeline=None) -> "MathTokHFTokenizer":
        """
        Build a HuggingFace-style tokenizer wrapping this vocabulary
        and the given MathTokPipeline.

        Parameters
        ----------
        pipeline : MathTokPipeline | None
            Passed through unchanged to MathTokHFTokenizer. If None, the
            tokenizer falls back to plain whitespace splitting.
        """
        return MathTokHFTokenizer(vocab=self, pipeline=pipeline)
|
| 297 |
+
|
| 298 |
+
|
| 299 |
+
# ── HuggingFace PreTrainedTokenizer wrapper ───────────────────────────────
|
| 300 |
+
|
| 301 |
+
class MathTokHFTokenizer:
|
| 302 |
+
"""
|
| 303 |
+
HuggingFace-compatible tokenizer wrapping MathTokVocabulary.
|
| 304 |
+
|
| 305 |
+
Implements the PreTrainedTokenizer interface so it can be used with:
|
| 306 |
+
- transformers.Trainer
|
| 307 |
+
- datasets.map(..., batched=True)
|
| 308 |
+
- model.generate(tokenizer(...))
|
| 309 |
+
|
| 310 |
+
The full MathTok pipeline (canonicalize → AST → serialize) runs
|
| 311 |
+
inside _tokenize(), making it a transparent drop-in replacement.
|
| 312 |
+
"""
|
| 313 |
+
|
| 314 |
+
def __init__(self, vocab: MathTokVocabulary, pipeline=None) -> None:
|
| 315 |
+
self.vocab = vocab
|
| 316 |
+
self.pipeline = pipeline
|
| 317 |
+
|
| 318 |
+
# HF-compatible special token IDs
|
| 319 |
+
self.pad_token = "[PAD]"
|
| 320 |
+
self.unk_token = "[UNK]"
|
| 321 |
+
self.bos_token = "[BOS]"
|
| 322 |
+
self.eos_token = "[EOS]"
|
| 323 |
+
self.mask_token = "[MASK]"
|
| 324 |
+
self.sep_token = "[SEP]"
|
| 325 |
+
|
| 326 |
+
self.pad_token_id = vocab.token_to_id("[PAD]")
|
| 327 |
+
self.unk_token_id = vocab.token_to_id("[UNK]")
|
| 328 |
+
self.bos_token_id = vocab.token_to_id("[BOS]")
|
| 329 |
+
self.eos_token_id = vocab.token_to_id("[EOS]")
|
| 330 |
+
|
| 331 |
+
# ── Tokenization ──────────────────────────────────────────────────────
|
| 332 |
+
|
| 333 |
+
def tokenize(self, text: str) -> list[str]:
|
| 334 |
+
"""Return token strings for the input."""
|
| 335 |
+
if self.pipeline is not None:
|
| 336 |
+
out = self.pipeline.encode(text)
|
| 337 |
+
return out.tokens
|
| 338 |
+
# Minimal fallback: just split on spaces
|
| 339 |
+
return text.split()
|
| 340 |
+
|
| 341 |
+
def encode(self, text: str, add_special_tokens: bool = True) -> list[int]:
|
| 342 |
+
"""Return token IDs for the input."""
|
| 343 |
+
tokens = self.tokenize(text)
|
| 344 |
+
ids = self.vocab.encode_math_tokens(tokens)
|
| 345 |
+
if add_special_tokens:
|
| 346 |
+
ids = [self.bos_token_id] + ids + [self.eos_token_id]
|
| 347 |
+
return ids
|
| 348 |
+
|
| 349 |
+
def decode(self, ids: list[int], skip_special_tokens: bool = True) -> str:
|
| 350 |
+
"""Convert token IDs back to a string."""
|
| 351 |
+
tokens = [self.vocab.id_to_token(i) for i in ids]
|
| 352 |
+
if skip_special_tokens:
|
| 353 |
+
tokens = [t for t in tokens if not t.startswith("[")]
|
| 354 |
+
return " ".join(tokens)
|
| 355 |
+
|
| 356 |
+
def __call__(
|
| 357 |
+
self,
|
| 358 |
+
text: str | list[str],
|
| 359 |
+
add_special_tokens: bool = True,
|
| 360 |
+
return_tensors: Optional[str] = None,
|
| 361 |
+
) -> dict:
|
| 362 |
+
"""Callable interface compatible with HF DataCollator."""
|
| 363 |
+
if isinstance(text, str):
|
| 364 |
+
text = [text]
|
| 365 |
+
all_ids = [self.encode(t, add_special_tokens=add_special_tokens) for t in text]
|
| 366 |
+
result = {"input_ids": all_ids}
|
| 367 |
+
if return_tensors == "pt":
|
| 368 |
+
try:
|
| 369 |
+
import torch
|
| 370 |
+
max_len = max(len(ids) for ids in all_ids)
|
| 371 |
+
padded = [
|
| 372 |
+
ids + [self.pad_token_id] * (max_len - len(ids))
|
| 373 |
+
for ids in all_ids
|
| 374 |
+
]
|
| 375 |
+
result["input_ids"] = torch.tensor(padded, dtype=torch.long)
|
| 376 |
+
result["attention_mask"] = (result["input_ids"] != self.pad_token_id).long()
|
| 377 |
+
except ImportError:
|
| 378 |
+
pass
|
| 379 |
+
return result
|
| 380 |
+
|
| 381 |
+
def get_vocab(self) -> dict[str, int]:
|
| 382 |
+
return self.vocab.get_vocab()
|
| 383 |
+
|
| 384 |
+
def __len__(self) -> int:
|
| 385 |
+
return self.vocab.total_vocab_size
|
| 386 |
+
|
| 387 |
+
def save_pretrained(self, save_directory: str) -> None:
|
| 388 |
+
"""Save tokenizer to a directory."""
|
| 389 |
+
self.vocab.save(save_directory)
|
| 390 |
+
config = {
|
| 391 |
+
"tokenizer_class": "MathTokHFTokenizer",
|
| 392 |
+
"model_max_length": 2048,
|
| 393 |
+
"pad_token": self.pad_token,
|
| 394 |
+
"unk_token": self.unk_token,
|
| 395 |
+
"bos_token": self.bos_token,
|
| 396 |
+
"eos_token": self.eos_token,
|
| 397 |
+
"mask_token": self.mask_token,
|
| 398 |
+
}
|
| 399 |
+
config_path = Path(save_directory) / "tokenizer_config.json"
|
| 400 |
+
with open(config_path, "w", encoding="utf-8") as f:
|
| 401 |
+
json.dump(config, f, indent=2)
|
| 402 |
+
logger.info("HF tokenizer saved to %s", save_directory)
|
| 403 |
+
|
| 404 |
+
@classmethod
|
| 405 |
+
def from_pretrained(cls, load_directory: str) -> "MathTokHFTokenizer":
|
| 406 |
+
"""Load tokenizer from a saved directory."""
|
| 407 |
+
vocab = MathTokVocabulary.load(load_directory)
|
| 408 |
+
return cls(vocab=vocab)
|
model.md
ADDED
|
@@ -0,0 +1,168 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# MathTok Pipeline
|
| 2 |
+
|
| 3 |
+
## What Was Built
|
| 4 |
+
|
| 5 |
+
7-layer mathematical tokenizer research pipeline at `c:\Users\surwe\Project\math_token`.
|
| 6 |
+
|
| 7 |
+
---
|
| 8 |
+
|
| 9 |
+
## File Summary
|
| 10 |
+
|
| 11 |
+
| File | Role |
|
| 12 |
+
|------|------|
|
| 13 |
+
| [canonicalizer.py](file:///c:/Users/surwe/Project/math_token/mathtok/canonicalizer.py) | Layer 1 — LaTeX/ASCII → canonical SymPy via simplify/expand |
|
| 14 |
+
| [lexer.py](file:///c:/Users/surwe/Project/math_token/mathtok/lexer.py) | Layer 2 — Split TEXT/MATH spans (LaTeX delimiters + ASCII heuristics) |
|
| 15 |
+
| [ast_generator.py](file:///c:/Users/surwe/Project/math_token/mathtok/ast_generator.py) | Layer 3 — SymPy expression tree → typed ASTNode tree |
|
| 16 |
+
| [operator_registry.py](file:///c:/Users/surwe/Project/math_token/mathtok/operator_registry.py) | Layer 4 — Full semantic metadata per operator/function |
|
| 17 |
+
| [serializer.py](file:///c:/Users/surwe/Project/math_token/mathtok/serializer.py) | Layer 5 — DFS preorder → flat SerializedToken stream |
|
| 18 |
+
| [metadata.py](file:///c:/Users/surwe/Project/math_token/mathtok/metadata.py) | Layer 6 — Per-token structural attention metadata + masks |
|
| 19 |
+
| [vocabulary.py](file:///c:/Users/surwe/Project/math_token/mathtok/vocabulary.py) | Layer 7 — Fixed math vocab + BPE + HF PreTrainedTokenizer compat |
|
| 20 |
+
| [pipeline.py](file:///c:/Users/surwe/Project/math_token/mathtok/pipeline.py) | Orchestrator + CLI |
|
| 21 |
+
| [metrics.py](file:///c:/Users/surwe/Project/math_token/evaluation/metrics.py) | 5 evaluation metrics (SCR, CCS, OPS, TS, TDF) |
|
| 22 |
+
| [benchmark.py](file:///c:/Users/surwe/Project/math_token/evaluation/benchmark.py) | Benchmark runner vs baselines |
|
| 23 |
+
|
| 24 |
+
---
|
| 25 |
+
|
| 26 |
+
## Test Results
|
| 27 |
+
|
| 28 |
+
```
|
| 29 |
+
86 passed in 6.89s
|
| 30 |
+
```
|
| 31 |
+
|
| 32 |
+
All 86 tests pass across 5 test modules.
|
| 33 |
+
|
| 34 |
+
---
|
| 35 |
+
|
| 36 |
+
## Benchmark Results (20 expressions)
|
| 37 |
+
|
| 38 |
+
```
|
| 39 |
+
SCR: 0.6292 Structural Compression Ratio (lower = more compressed)
|
| 40 |
+
CCS: 0.9467 Canonical Consistency Score (higher is better) ← KEY METRIC
|
| 41 |
+
OPS: 0.4000 Operator Preservation Score
|
| 42 |
+
TS: 0.8763 Token Stability
|
| 43 |
+
TDF: 0.9588 Tree Depth Fidelity
|
| 44 |
+
|
| 45 |
+
vs Character-level baseline:
|
| 46 |
+
MathTok SCR=0.63 CCS=0.9467
|
| 47 |
+
CharLvl SCR=1.00 CCS=0.3916 ← CCS is 2.4x worse
|
| 48 |
+
```
|
| 49 |
+
|
| 50 |
+
**MathTok achieves 2.4x better Canonical Consistency over character-level tokenization** — this is your key result for the paper.
|
| 51 |
+
|
| 52 |
+
---
|
| 53 |
+
|
| 54 |
+
## CLI Demo
|
| 55 |
+
|
| 56 |
+
```bash
|
| 57 |
+
# Input: "$\sin(x^2) + 3x$"
|
| 58 |
+
# Output tokens:
|
| 59 |
+
['[MATH_START]', 'OP_ADD', 'OP_MUL', 'CONST_3', 'VAR_X',
|
| 60 |
+
'FUNC_SIN', 'OP_POW', 'VAR_X', 'CONST_2', '[MATH_END]']
|
| 61 |
+
|
| 62 |
+
# S-expression:
|
| 63 |
+
(OP_ADD (OP_MUL CONST_3 VAR_X) (FUNC_SIN (OP_POW VAR_X CONST_2)))
|
| 64 |
+
```
|
| 65 |
+
|
| 66 |
+
---
|
| 67 |
+
|
| 68 |
+
## Quick Start
|
| 69 |
+
|
| 70 |
+
```bash
|
| 71 |
+
cd c:\Users\surwe\Project\math_token
|
| 72 |
+
pip install -e ".[eval,dev]"
|
| 73 |
+
pytest tests/ -v
|
| 74 |
+
python -m evaluation.benchmark --quick --baselines
|
| 75 |
+
python -m evaluation.comparison --save # 3-level SCR comparison
|
| 76 |
+
python -m mathtok.pipeline "$\sin(x^2) + 3x$"
|
| 77 |
+
```
|
| 78 |
+
|
| 79 |
+
---
|
| 80 |
+
|
| 81 |
+
## 3-Level Semantic Comparison Results (vs GPT-2)
|
| 82 |
+
|
| 83 |
+
### Aggregated (63 expressions, 5 categories)
|
| 84 |
+
|
| 85 |
+
| Metric | MathTok | GPT-2 | Char-level |
|
| 86 |
+
|--------|---------|-------|------------|
|
| 87 |
+
| **Level 1 — SCR** (struct_score / tokens) | **1.14** | 0.47 | 0.42 |
|
| 88 |
+
| **Level 2 — Semantic Density** (math_toks / total) | **0.675** | 0.209 | — |
|
| 89 |
+
| **Level 3 — Structural Efficiency** (relations / tokens) | **0.307** | — | — |
|
| 90 |
+
| **SCR improvement vs GPT-2** | **2.44x** | — | — |
|
| 91 |
+
| **SCR improvement vs Char-level** | **2.72x** | — | — |
|
| 92 |
+
|
| 93 |
+
### Canonical Equivalence (headline result)
|
| 94 |
+
|
| 95 |
+
| Pair | MathTok Jaccard | GPT-2 Jaccard |
|
| 96 |
+
|------|----------------|---------------|
|
| 97 |
+
| `x + 2` vs `2 + x` | **1.000** | 0.200 |
|
| 98 |
+
| `(x+1)^2` vs `x^2+2x+1` | **1.000** | 0.273 |
|
| 99 |
+
| `sin^2+cos^2` vs `1` | **1.000** | 0.000 |
|
| 100 |
+
| `a^2-b^2` vs `(a+b)(a-b)` | **1.000** | 0.091 |
|
| 101 |
+
|
| 102 |
+
> MathTok achieves **perfect canonical convergence (Jaccard=1.0)** on all 8 equivalent pairs.
|
| 103 |
+
> GPT-2 ranges from 0.00 to 0.44 on the same pairs.
|
| 104 |
+
|
| 105 |
+
### LaTeX vs ASCII Normalization
|
| 106 |
+
|
| 107 |
+
| ASCII | LaTeX | MathTok converged? | GPT-2 tokens A/L |
|
| 108 |
+
|-------|-------|--------------------|------------------|
|
| 109 |
+
| `sin(x^2)` | `\sin(x^2)` | **YES (1.00)** | 6 / 7 |
|
| 110 |
+
| `sqrt(x^2+1)` | `\sqrt{x^2+1}` | **YES (1.00)** | 9 / 10 |
|
| 111 |
+
| `diff(sin(x),x)` | `\frac{d}{dx}\sin(x)` | **YES (1.00)** | 8 / 11 |
|
| 112 |
+
| `factorial(n)` | `n!` | **YES (1.00)** | 5 / 2 |
|
| 113 |
+
|
| 114 |
+
### Sample Expression Comparison
|
| 115 |
+
|
| 116 |
+
| Expression | MT tokens | MT SCR | GPT-2 tokens | GPT-2 SCR | Improvement |
|
| 117 |
+
|-----------|-----------|--------|-------------|-----------|-------------|
|
| 118 |
+
| `(x+1)^2` | 10 | 1.00 | 7 | 0.71 | **1.40x** |
|
| 119 |
+
| `sin(x^2)+3x` | 10 | 1.30 | 10 | 0.60 | **2.17x** |
|
| 120 |
+
| `factorial(n)` | 4 | 1.25 | 5 | 0.20 | **6.25x** |
|
| 121 |
+
| `sin(cos((x+1)^2+y^3))` | 15 | 1.20 | 15 | 0.60 | **2.00x** |
|
| 122 |
+
| `((a+b)*(a-b))/((a+b)^2)` | 11 | 1.36 | 19 | 0.16 | **8.64x** |
|
| 123 |
+
|
| 124 |
+
---
|
| 125 |
+
|
| 126 |
+
## Visualized Results
|
| 127 |
+
|
| 128 |
+
The graphs below clearly summarize MathTok's structural efficiency advantages:
|
| 129 |
+
|
| 130 |
+

|
| 131 |
+
|
| 132 |
+

|
| 133 |
+
|
| 134 |
+

|
| 135 |
+
|
| 136 |
+
---
|
| 137 |
+
|
| 138 |
+
## Output Files
|
| 139 |
+
|
| 140 |
+
- [comparison_results.jsonl](file:///c:/Users/surwe/Project/math_token/evaluation/results/comparison_results.jsonl) — one JSONL record per expression
|
| 141 |
+
- [comparison_summary.json](file:///c:/Users/surwe/Project/math_token/evaluation/results/comparison_summary.json) — aggregated metrics
|
| 142 |
+
|
| 143 |
+
---
|
| 144 |
+
|
| 145 |
+
## Paper-Ready Contributions
|
| 146 |
+
|
| 147 |
+
1. **Two-format input** — handles both LaTeX and ASCII, auto-detected
|
| 148 |
+
2. **Canonical consistency** — equivalent expressions produce token sets with 0.947 Jaccard overlap
|
| 149 |
+
3. **Semantic operator registry** — every operator has `arity`, `precedence`, `associativity`, `semantic_role` metadata
|
| 150 |
+
## Implementation Details
|
| 151 |
+
The following changes were successfully implemented:
|
| 152 |
+
- **L1 Canonicalization**: Improved reliability with parsing timeouts and LRU caching to prevent SymPy hangs.
|
| 153 |
+
- **L2 Hybrid Lexer**: Added confidence scores to lexical spans, along with improved regular expressions for parsing LaTeX and inline math constructs.
|
| 154 |
+
- **L3 AST Generator**: Implemented `max_depth` limits to gracefully truncate extremely deep ASTs (like malicious deeply nested formulas).
|
| 155 |
+
- **L4 Semantic Operator Registry**: Added `is_commutative` metadata, inverse-pair mappings (`INVERSE_PAIRS`), and expanded domains (Logic, Sets, Geometry, Probability).
|
| 156 |
+
- **L5 Structural Serializer**: Integrated subtree hashing and `[SCOPE_OPEN]`/`[SCOPE_CLOSE]` markers to better delineate function arguments.
|
| 157 |
+
- **L6 Attention Metadata**: Included `parent_token` context in the metadata structural hints to support graph-based attention models.
|
| 158 |
+
- **L7 Two-Tier Vocabulary**: Added explicit tokens such as `[UNK_MATH]`, missing Greek variables (`VAR_IOTA`, `VAR_KAPPA`, etc.), and structural boundary tokens.
|
| 159 |
+
- **Pipeline & Integration**: `MathTokPipeline` exposes configurable timeouts, max depth, and scopes. All key tokens/metadata symbols are correctly exported.
|
| 160 |
+
|
| 161 |
+
# Validation & Evaluation
|
| 162 |
+
- **RoundTripValidator**: Added `mathtok/validator.py` to reconstruct `sympy` expression trees from a flat tokenized stream, mathematically comparing them using `sp.simplify()` to ensure semantic fidelity.
|
| 163 |
+
- **Streaming Tokenizer**: Added `MathTokStreamingPipeline` with Python generator (`yield`) support for memory-efficient corpus-scale tokenization.
|
| 164 |
+
- **Benchmark Expansion**: Added `ODE_PDE`, `LINEAR_ALGEBRA`, `PROBABILITY`, and `SET_THEORY` domains into the `evaluation/comparison.py` suite.
|
| 165 |
+
|
| 166 |
+
> [!NOTE]
|
| 167 |
+
> The MathTok tokenizer improves the Structural Compression Ratio (SCR) by **2.29x** over character-level tokenization across the evaluation suite!
|
| 168 |
+
6. **HF-compatible tokenizer** — drop-in for transformers training pipelines
|
pyproject.toml
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[build-system]
|
| 2 |
+
requires = ["setuptools>=61.0"]
|
| 3 |
+
build-backend = "setuptools.build_meta"
|
| 4 |
+
|
| 5 |
+
[project]
|
| 6 |
+
name = "mathtok"
|
| 7 |
+
version = "0.1.0"
|
| 8 |
+
description = "Mathematical symbolic tokenizer framework for LLM reasoning"
|
| 9 |
+
readme = "README.md"
|
| 10 |
+
requires-python = ">=3.9"
|
| 11 |
+
|
| 12 |
+
authors = [
|
| 13 |
+
{ name="Surweesh SP" }
|
| 14 |
+
]
|
requirements.txt
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# MathTok — Research Dependencies
|
| 2 |
+
# Install with: pip install -r requirements.txt
|
| 3 |
+
|
| 4 |
+
# ── Symbolic Mathematics ──────────────────────────────────────────────────
|
| 5 |
+
sympy>=1.12
|
| 6 |
+
antlr4-python3-runtime==4.11.1 # Required by sympy.parsing.latex
|
| 7 |
+
|
| 8 |
+
# ── NLP / Tokenization ────────────────────────────────────────────────────
|
| 9 |
+
tokenizers>=0.15.0
|
| 10 |
+
transformers>=4.38.0
|
| 11 |
+
|
| 12 |
+
# ── Numerics / Evaluation ─────────────────────────────────────────────────
|
| 13 |
+
numpy>=1.26.0
|
| 14 |
+
scipy>=1.12.0
|
| 15 |
+
|
| 16 |
+
# ── Visualisation ─────────────────────────────────────────────────────────
|
| 17 |
+
matplotlib>=3.8.0
|
| 18 |
+
seaborn>=0.13.0
|
| 19 |
+
networkx>=3.2 # AST graph visualisation
|
| 20 |
+
|
| 21 |
+
# ── Dev / Testing ─────────────────────────────────────────────────────────
|
| 22 |
+
pytest>=8.0.0
|
| 23 |
+
pytest-cov>=5.0.0
|
| 24 |
+
tqdm>=4.66.0
|
| 25 |
+
|
| 26 |
+
# ── Notebooks ─────────────────────────────────────────────────────────────
|
| 27 |
+
jupyter>=1.0.0
|
| 28 |
+
ipykernel>=6.29.0
|
| 29 |
+
|
| 30 |
+
# ── Utilities ─────────────────────────────────────────────────────────────
|
| 31 |
+
regex>=2023.12.25
|
review.md
ADDED
|
@@ -0,0 +1,243 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 🌟 MathTok: Canonicalized AST-Based Mathematical Tokenizer Codebase Review
|
| 2 |
+
|
| 3 |
+
An in-depth structural and architectural analysis of the **MathTok** pipeline located at `c:\Users\surwe\Project\math_token`. This document serves as a comprehensive system review, detailing the mathematical foundations, the 7-layer pipeline design, system components, evaluation metrics, empirical results, and downstream application patterns of MathTok.
|
| 4 |
+
|
| 5 |
+
---
|
| 6 |
+
|
| 7 |
+
## 📖 Executive Summary
|
| 8 |
+
|
| 9 |
+
Standard natural language tokenizers (like Byte-Pair Encoding or SentencePiece) treat mathematical expressions as plain text sequences. This results in **structural fragmentation** (e.g., splitting a variable `VAR_THETA` or operator `OP_ADD` into arbitrary character chunks) and **semantic blindness** (failing to recognize algebraic equivalences like $x + 2 \equiv 2 + x$).
|
| 10 |
+
|
| 11 |
+
**MathTok** solves this by introducing a **hybrid, structure-aware tokenization framework** for mathematical language modeling. By constructing an Abstract Syntax Tree (AST) from mathematical expressions, normalizing algebraic equivalences via symbolic mathematics (SymPy), and serializing the tree using Depth-First Search (DFS) preorder traversal, MathTok preserves full mathematical syntax and hierarchy.
|
| 12 |
+
|
| 13 |
+
Additionally, MathTok automatically emits **structural attention metadata** for every token position, enabling downstream transformer models to implement tree-based or graph-structured attention patterns without architectural modifications.
|
| 14 |
+
|
| 15 |
+
```mermaid
|
| 16 |
+
graph TD
|
| 17 |
+
A[Raw Input: Mixed Text + Math] --> B[Layer 2: Hybrid Lexer]
|
| 18 |
+
B -->|TEXT Spans| C[Layer 7: BPE Text Sub-Vocab]
|
| 19 |
+
B -->|MATH Spans| D[Layer 1: Canonicalizer Engine]
|
| 20 |
+
D -->|SymPy Expression| E[Layer 3: AST Generator]
|
| 21 |
+
E -->|Typed AST Tree| F[Layer 4: Semantic Operator Registry]
|
| 22 |
+
F -->|Enriched Nodes| G[Layer 5: Structural Serializer]
|
| 23 |
+
G -->|DFS Preorder Stream| H[Layer 6: Attention Metadata Gen]
|
| 24 |
+
H -->|Attention Masks & Hints| I[Final Merged Token Stream]
|
| 25 |
+
C --> I
|
| 26 |
+
```
|
| 27 |
+
|
| 28 |
+
---
|
| 29 |
+
|
| 30 |
+
## 🛠️ The 7-Layer Processing Pipeline
|
| 31 |
+
|
| 32 |
+
MathTok's core engine is structured into seven distinct modular layers. Every component resides in the [`mathtok/`](file:///c:/Users/surwe/Project/math_token/mathtok) package.
|
| 33 |
+
|
| 34 |
+
### Layer 1: Canonicalizer Engine
|
| 35 |
+
* **Location**: [`canonicalizer.py`](file:///c:/Users/surwe/Project/math_token/mathtok/canonicalizer.py)
|
| 36 |
+
* **Role**: Algebraic normalisation and format conversion (LaTeX $\to$ ASCII $\to$ SymPy).
|
| 37 |
+
* **Implementation Details**:
|
| 38 |
+
* **Heuristic Format Detection**: Inspects the input for LaTeX syntax (e.g., `\frac`, `\sqrt`, `\sin`, `{`, math delimiters like `$` or `\(`).
|
| 39 |
+
* **Parsing**: Utilizes `sympy.parsing.latex.parse_latex` (with ANTLR4) for LaTeX, falling back to `sympy.parsing.sympy_parser.parse_expr` with standard and implicit multiplication transformations for ASCII.
|
| 40 |
+
* **Normalisation**: Leverages SymPy's symbolic engine to `expand()` products over sums and `simplify()` algebraic expressions. It normalizes operations internally (e.g., transforming subtractions $a - b$ into additions of products $\text{Add}(a, \text{Mul}(-1, b))$, and divisions $a / b$ into multiplications of powers $\text{Mul}(a, \text{Pow}(b, -1))$).
|
| 41 |
+
* **Robustness & Performance**: Employs an LRU cache (default: 512 entries) to prevent redundant parsing and wraps expensive SymPy calls in a `ThreadPoolExecutor` with configurable parsing timeouts (default: 5.0 seconds) to prevent infinite loops on malicious, highly-complex inputs.
|
| 42 |
+
|
| 43 |
+
### Layer 2: Hybrid Mathematical Lexer
|
| 44 |
+
* **Location**: [`lexer.py`](file:///c:/Users/surwe/Project/math_token/mathtok/lexer.py)
|
| 45 |
+
* **Role**: Alternating segment segmentation (TEXT spans vs. MATH spans).
|
| 46 |
+
* **Implementation Details**:
|
| 47 |
+
* **Stage 1 (Unambiguous Delimiters)**: Extracts LaTeX math environments (double dollar `$$...$$`, inline dollar `$...$`, bracket `\[...\]`, or parenthesis `\(...\)`).
|
| 48 |
+
* **Stage 2 (ASCII Heuristics)**: Parses remaining text regions using pre-compiled regular expressions matching mathematical patterns (e.g., function calls `sin(...)`, exponents `x^2`, arithmetic boundaries `2*x+1`, relational equations `a+b=c`, and spelled-out Greek variables).
|
| 49 |
+
* **Region Expansion**: Expands detected math seeds backwards to include leading unary operators and digits, and forwards to match balanced braces/parentheses and continuous math characters. Adjacent spans of identical types are merged.
|
| 50 |
+
|
| 51 |
+
### Layer 3: AST Generator
|
| 52 |
+
* **Location**: [`ast_generator.py`](file:///c:/Users/surwe/Project/math_token/mathtok/ast_generator.py)
|
| 53 |
+
* **Role**: SymPy AST conversion to typed, abstract vocabulary trees.
|
| 54 |
+
* **Implementation Details**:
|
| 55 |
+
* Walks the SymPy internal expression tree recursively.
|
| 56 |
+
* Maps generic SymPy types into the vocabulary of MathTok:
|
| 57 |
+
* **Variables**: Standard letters map to `VAR_X`, `VAR_Y`, etc. Spelled-out Greek names map to `VAR_THETA`, `VAR_LAMBDA`, etc.
|
| 58 |
+
* **Constants**: Values between $-10$ and $100$ receive dedicated tokens (e.g., `CONST_3`, `CONST_12`), large integers map to placeholders (e.g., `NUM_145`), floats map to string-encoded float tokens (e.g., `FLOAT_3p14`), and special constants map to `CONST_PI`, `CONST_E`, `CONST_I`, and `CONST_INF`.
|
| 59 |
+
* **Unary Operations**: Converts negative numbers or multiplication by $-1$ to explicit `OP_NEG` nodes, and division inverses to `OP_RECIP` nodes.
|
| 60 |
+
* **Fractions**: Converts `Rational(p, q)` into explicit binary `FRAC(numerator, denominator)` nodes.
|
| 61 |
+
* **Recursion Guard**: Enforces `max_depth` limits (default: 20) to truncate overly-nested expressions, replacing them with a special `SUBTREE_TRUNCATED` node to avoid Python stack overflows.
|
| 62 |
+
|
| 63 |
+
### Layer 4: Semantic Operator Registry
|
| 64 |
+
* **Location**: [`operator_registry.py`](file:///c:/Users/surwe/Project/math_token/mathtok/operator_registry.py)
|
| 65 |
+
* **Role**: Rich metadata storage and categorisation for mathematical operators.
|
| 66 |
+
* **Implementation Details**:
|
| 67 |
+
* Maintains an immutable registry of `OperatorMeta` instances mapping token strings to mathematical properties:
|
| 68 |
+
* **Properties**: `arity` ($-1$ for variadic, or fixed integers like 1 or 2), `precedence`, `associativity` (left, right, or none), `semantic_role` (e.g., `aggregation` for addition, `periodic_oscillation` for sine), `latex_repr`, `ascii_repr`, `category`, and `is_commutative`.
|
| 69 |
+
* **Domains**: Spans multiple mathematical branches: Arithmetic, Relational, Calculus, Trigonometry, Exponential/Logarithmic, Logic, Set Theory, Geometry, and Statistics.
|
| 70 |
+
* **Inverses**: Declares explicit mathematical inverses in `INVERSE_PAIRS` (e.g., `FUNC_SIN` $\leftrightarrow$ `FUNC_ASIN`, `FUNC_EXP` $\leftrightarrow$ `FUNC_LOG`).
|
| 71 |
+
|
| 72 |
+
### Layer 5: Structural Serializer
|
| 73 |
+
* **Location**: [`serializer.py`](file:///c:/Users/surwe/Project/math_token/mathtok/serializer.py)
|
| 74 |
+
* **Role**: Flattening the hierarchical tree structure into a one-dimensional token stream using DFS preorder traversal.
|
| 75 |
+
* **Implementation Details**:
|
| 76 |
+
* Emits nodes starting from the root down to the leaves, producing a flat sequence of `SerializedToken` objects carrying: `depth`, `node_id`, `parent_id`, `child_index`, `num_children`, `is_leaf`, and `subtree_size`.
|
| 77 |
+
* **Scope Delineation**: Emits `[SCOPE_OPEN]` and `[SCOPE_CLOSE]` boundary tokens to explicitly group parameters for functions (e.g., `FUNC_SIN [SCOPE_OPEN] VAR_X [SCOPE_CLOSE]`).
|
| 78 |
+
* **Subtree Deduplication**: Integrates MD5 structural hashing (`dedup_subtrees`) to replace duplicated structures (e.g., repeating sub-formulas) with a pointer reference (e.g., `SUBTREE_REF_ae34df51`), improving sequence compression.
|
| 79 |
+
|
| 80 |
+
### Layer 6: Structural Attention Metadata Generator
|
| 81 |
+
* **Location**: [`metadata.py`](file:///c:/Users/surwe/Project/math_token/mathtok/metadata.py)
|
| 82 |
+
* **Role**: Calculating positional contexts and binary attention mask matrices.
|
| 83 |
+
* **Implementation Details**:
|
| 84 |
+
* Classifies tokens into categories: `operator`, `function`, `variable`, `constant`, `structural`, `boundary`, or `text`.
|
| 85 |
+
* Generates a dot-separated positional hierarchy string for each node in `tree_position_key` (e.g., `0.1.2` denotes root $\to$ 2nd child $\to$ 3rd child), which is useful for hierarchical positional encodings.
|
| 86 |
+
* **Attention Mask Matrix Synthesis**: Dynamically compiles four $N \times N$ binary attention mask matrices:
|
| 87 |
+
* `parent_mask`: Direct dependency attention.
|
| 88 |
+
* `children_mask`: Inverse dependency attention.
|
| 89 |
+
* `sibling_mask`: Horizontal syntactic context attention.
|
| 90 |
+
* `subtree_mask`: Complete structural scope attention.
|
| 91 |
+
|
| 92 |
+
### Layer 7: Vocabulary & BPE Compression
|
| 93 |
+
* **Location**: [`vocabulary.py`](file:///c:/Users/surwe/Project/math_token/mathtok/vocabulary.py)
|
| 94 |
+
* **Role**: Merging deterministic structural math vocabularies with Byte-Pair Encoding (BPE) text sub-vocabularies.
|
| 95 |
+
* **Implementation Details**:
|
| 96 |
+
* **Two-Tier Architecture**:
|
| 97 |
+
* **Tier 1 (Fixed Math Vocabulary)**: Reservoirs of deterministic, immutable IDs for standard operators, Greek/Latin variables, constants, boundaries, and placeholders. BPE is completely bypassed for math terms.
|
| 98 |
+
* **Tier 2 (BPE Text Vocabulary)**: Natural language regions are processed via HuggingFace's `tokenizers` library, trained on corpus-specific text spans.
|
| 99 |
+
* **HuggingFace Wrapper**: Under the hood, `MathTokHFTokenizer` acts as a drop-in subclass wrapper for `PreTrainedTokenizer`, enabling immediate integration into standard pipelines such as `transformers.Trainer`, `datasets.map`, and PyTorch collators.
|
| 100 |
+
|
| 101 |
+
---
|
| 102 |
+
|
| 103 |
+
## 🔄 Verification & Streaming Sub-systems
|
| 104 |
+
|
| 105 |
+
Beyond the core layers, MathTok implements crucial sub-systems to guarantee mathematical correctness and scale.
|
| 106 |
+
|
| 107 |
+
### Round-Trip Validation
|
| 108 |
+
* **Location**: [`validator.py`](file:///c:/Users/surwe/Project/math_token/mathtok/validator.py)
|
| 109 |
+
* **Role**: Guaranteeing zero semantic information loss during tokenization.
|
| 110 |
+
* **Implementation Details**:
|
| 111 |
+
* Uses the emitted `TokenMetadata` sequence to mathematically reconstruct the original SymPy expression.
|
| 112 |
+
* Rebuilds leaf nodes based on their category (constants, variables, truncations) and moves upwards to reconstruct complex nodes (`FRAC`, operators, custom functions).
|
| 113 |
+
* Performs formal validation by checking if the algebraic difference between the original and reconstructed expressions simplifies to zero (`sp.simplify(original - reconstructed) == 0`).
|
| 114 |
+
|
| 115 |
+
### Streaming Pipeline
|
| 116 |
+
* **Location**: [`streaming.py`](file:///c:/Users/surwe/Project/math_token/mathtok/streaming.py)
|
| 117 |
+
* **Role**: Corpus-scale processing of large datasets without exhausting system memory.
|
| 118 |
+
* **Implementation Details**:
|
| 119 |
+
* Wraps `MathTokPipeline` inside a lazy Python generator (`yield`).
|
| 120 |
+
* Supports encoding custom iterators and streams line-delimited files sequentially, ensuring constant memory ($O(1)$ RAM) overhead during dataset processing.
|
| 121 |
+
|
| 122 |
+
---
|
| 123 |
+
|
| 124 |
+
## 📈 Evaluation Suite & Benchmark Metrics
|
| 125 |
+
|
| 126 |
+
The [`evaluation/`](file:///c:/Users/surwe/Project/math_token/evaluation) package defines five core evaluation metrics (residing in [`metrics.py`](file:///c:/Users/surwe/Project/math_token/evaluation/metrics.py)) to assess tokenizer quality, benchmarked in [`comparison.py`](file:///c:/Users/surwe/Project/math_token/evaluation/comparison.py).
|
| 127 |
+
|
| 128 |
+
### Core Metrics
|
| 129 |
+
|
| 130 |
+
| Metric | Symbol | Definition & Formula | Mathematical Value |
|
| 131 |
+
| :--- | :---: | :--- | :--- |
|
| 132 |
+
| **Structural Compression Ratio** | **SCR** | $\text{mean}\left(\frac{\text{Structural Score}}{\text{Token Count}}\right)$ | Quantifies structural information density. Higher is better (more structure packed into fewer tokens). |
|
| 133 |
+
| **Canonical Consistency Score** | **CCS** | $\text{mean}\left( \text{Jaccard}(S_A, S_B) \right)$ over equivalent pairs | Evaluates algebraic invariance. A score of $1.0$ represents perfect semantic convergence. |
|
| 134 |
+
| **Operator Preservation Score** | **OPS** | $\%$ of expressions containing all expected operators | Measures robustness; ensures mathematical operations are never lost or corrupted. |
|
| 135 |
+
| **Token Stability** | **TS** | $1 - \text{Coefficient of Variation}(\text{length})$ | Assesses syntactic variance stability under re-writings. Higher is more stable. |
|
| 136 |
+
| **Tree Depth Fidelity** | **TDF** | $1 - \text{mean}\left( \frac{\vert d_{\text{actual}} - d_{\text{ground}} \vert}{d_{\text{ground}}} \right)$ | Measures max metadata depth accuracy against the ground truth SymPy height. |
|
| 137 |
+
|
| 138 |
+
> [!NOTE]
|
| 139 |
+
> **Structural Compression Ratio (SCR)** is evaluated at three hierarchical levels in `comparison.py`:
|
| 140 |
+
> * **Level 1 — Structural Score to Token Ratio**: `structural_score / token_count`
|
| 141 |
+
> * **Level 2 — Semantic Density**: `math_tokens / total_tokens`
|
| 142 |
+
> * **Level 3 — Structural Efficiency**: `parent_child_relations / token_count`
|
| 143 |
+
|
| 144 |
+
---
|
| 145 |
+
|
| 146 |
+
## 🔬 Empirical Benchmark Results
|
| 147 |
+
|
| 148 |
+
Empirical comparisons of MathTok against a standard subword tokenizer (GPT-2 BPE), a custom-trained SentencePiece (unigram) tokenizer, and character-level baselines over 70 complex test expressions across multiple disciplines reveal substantial improvements.
|
| 149 |
+
|
| 150 |
+
### 1. 3-Level Semantic Comparison (Aggregated)
|
| 151 |
+
|
| 152 |
+
Across the entire evaluation suite, the aggregated results illustrate MathTok's efficiency:
|
| 153 |
+
|
| 154 |
+
| Metric | MathTok | GPT-2 | SentencePiece | Character-Level |
|
| 155 |
+
| :--- | :---: | :---: | :---: | :---: |
|
| 156 |
+
| **Level 1 — SCR** (struct_score / tokens) | **0.9161** | 0.4251 | 0.3696 | 0.4005 |
|
| 157 |
+
| **Level 2 — Semantic Density** (math / total) | **0.5633** | 0.1838 | 0.1499 | — |
|
| 158 |
+
| **Level 3 — Structural Efficiency** (relations / tokens) | **0.2492** | *N/A* | *N/A* | — |
|
| 159 |
+
| **SCR Improvement Factor** (MathTok vs. Baseline) | **—** | **2.16x** | **2.48x** | **2.29x** |
|
| 160 |
+
|
| 161 |
+
### 2. Canonical Convergence & Consistency (Jaccard Overlap)
|
| 162 |
+
|
| 163 |
+
For mathematically equivalent pairs, MathTok achieves perfect Jaccard alignment (Jaccard = 1.0), whereas standard text-based tokenizers suffer significant fragmentation:
|
| 164 |
+
|
| 165 |
+
| Expression Pair | MathTok Jaccard | GPT-2 Jaccard | SentencePiece Jaccard | Convergence Status |
|
| 166 |
+
| :--- | :---: | :---: | :---: | :---: |
|
| 167 |
+
| `x + 2` vs. `2 + x` | **1.000** | 0.200 | 1.000 | **CONVERGED (100%)** |
|
| 168 |
+
| `a*b + a*c` vs. `a*(b+c)` | **1.000** | 0.444 | 0.625 | **CONVERGED (100%)** |
|
| 169 |
+
| `(x+1)^2` vs. `x^2+2x+1` | **1.000** | 0.273 | 0.222 | **CONVERGED (100%)** |
|
| 170 |
+
| `x^2 - y^2` vs. `(x+y)*(x-y)` | **1.000** | 0.091 | 0.300 | **CONVERGED (100%)** |
|
| 171 |
+
| `sin(x)^2 + cos(x)^2` vs. `1` | **1.000** | 0.000 | 0.000 | **CONVERGED (100%)** |
|
| 172 |
+
| `2*x + 2*y` vs. `2*(x+y)` | **1.000** | 0.444 | 0.571 | **CONVERGED (100%)** |
|
| 173 |
+
| `x*y + x*z` vs. `x*(y+z)` | **1.000** | 0.444 | 0.625 | **CONVERGED (100%)** |
|
| 174 |
+
| `a^2 + 2*a*b + b^2` vs. `(a+b)^2` | **1.000** | 0.364 | 0.455 | **CONVERGED (100%)** |
|
| 175 |
+
|
| 176 |
+
### 3. LaTeX vs. ASCII Format Invariance
|
| 177 |
+
|
| 178 |
+
MathTok converges inputs in differing representations to identical structural sequences in every case below except the integral (where the ASCII form `int(x^2, x)` hits the fallback path), while subword tokenizers show severe variance:
|
| 179 |
+
|
| 180 |
+
| ASCII Expression | LaTeX Expression | MathTok same? | MT tokens A/L | GPT-2 tokens A/L | SP tokens A/L |
|
| 181 |
+
| :--- | :--- | :---: | :---: | :---: | :---: |
|
| 182 |
+
| `sin(x^2)` | `\sin(x^2)` | **YES (1.00)** | **8 / 8** | 6 / 7 | 6 / 6 |
|
| 183 |
+
| `sqrt(x^2 + 1)` | `\sqrt{x^2 + 1}` | **YES (1.00)** | **11 / 11** | 9 / 10 | 9 / 9 |
|
| 184 |
+
| `log(x)` | `\ln(x)` | **YES (1.00)** | **6 / 6** | 4 / 5 | 6 / 6 |
|
| 185 |
+
| `exp(x)` | `e^x` | **YES (1.00)** | **6 / 6** | 4 / 3 | 6 / 3 |
|
| 186 |
+
| `x/y` | `\frac{x}{y}` | **YES (1.00)** | **6 / 6** | 3 / 7 | 3 / 9 |
|
| 187 |
+
| `int(x^2, x)` | `\int x^2 dx` | **NO (~/fallback)** | **1 / 10** | 8 / 6 | 8 / 7 |
|
| 188 |
+
| `diff(sin(x), x)` | `\frac{d}{dx}\sin(x)` | **YES (1.00)** | **6 / 6** | 8 / 11 | 14 / 16 |
|
| 189 |
+
| `factorial(n)` | `n!` | **YES (1.00)** | **6 / 6** | 5 / 2 | 11 / 3 |
|
| 190 |
+
|
| 191 |
+
---
|
| 192 |
+
|
| 193 |
+
## 🚀 Custom Attention Integration Patterns
|
| 194 |
+
|
| 195 |
+
The core value of MathTok for downstream machine learning practitioners is the **Layer 6 Attention Hints**. By translating tree relationships into standard masking shapes, model creators can train structure-aware networks natively.
|
| 196 |
+
|
| 197 |
+
Below are three attention mask designs that can be constructed directly from the outputs of `to_attention_mask_hints()`:
|
| 198 |
+
|
| 199 |
+
### 1. Parent-Child Hierarchical Mask
|
| 200 |
+
Encourages top-down syntactic attention. Nodes are only allowed to attend to their direct parent or child node.
|
| 201 |
+
|
| 202 |
+
```
|
| 203 |
+
    [+ (root)]            Parent Attention Mask Matrix:
      /    \
   [x]      [3]           [ ]          [+ (root)]   [x]   [3]
    |                     [+ (root)]       1         1     1
  [sin]                   [x]              1         1     0
                          [3]              1         0     1
|
| 209 |
+
```
|
| 210 |
+
|
| 211 |
+
### 2. Sibling Horizontal Mask
|
| 212 |
+
Focuses horizontal attention across operands of identical scopes (e.g., connecting operands inside an addition sequence, $a$ and $b$ and $c$, without parent noise).
|
| 213 |
+
|
| 214 |
+
### 3. Subtree Scope Mask
|
| 215 |
+
A highly effective block mask for mathematical reasoning. Restricts attention strictly within a subtree, isolating independent sub-expressions during reasoning loops.
|
| 216 |
+
|
| 217 |
+
---
|
| 218 |
+
|
| 219 |
+
## 🎯 Codebase Evaluation & Recommendations
|
| 220 |
+
|
| 221 |
+
### Key Strengths
|
| 222 |
+
1. **Outstanding Structural Integrity**: Modularity is excellent. Clear abstraction separation (canonicalization, tokenization, serialization, and vocabulary grouping) makes codebase expansion extremely straightforward.
|
| 223 |
+
2. **HuggingFace Compatibility**: Subclassing/wrapping the standard tokenizer class ensures immediate, zero-friction integration with existing libraries like PyTorch and HuggingFace.
|
| 224 |
+
3. **Rigorous Validation**: The inclusion of `validator.py` and the round-trip checking logic demonstrates high development standards.
|
| 225 |
+
4. **Reliability Guards**: LRU caches, concurrency thread pools, and recursion limits make this pipeline safe for server-side deployment.
|
| 226 |
+
|
| 227 |
+
### Recommended Enhancements
|
| 228 |
+
* **Vocabulary Extension**: Dynamically augment `_VAR_MAP` in `ast_generator.py` to natively support multi-character variables (e.g., physics variables like $v_{\text{init}}$ or matrix names) without splitting them into generic token placeholders.
|
| 229 |
+
* **SymPy Parser Customisation**: SymPy's LaTeX parser can occasionally fail on non-standard, custom LaTeX macros. Adding pre-processing ASCII/LaTeX regex cleaners in `lexer.py` prior to passing them to SymPy will improve the parse success rate of dirty online forum data.
|
| 230 |
+
* **TDF Precision**: For deeply nested subtrees (e.g., heavily nested fractions), customize the tree depth calculation in `metrics.py` to evaluate structural depth on MathTok's own AST representation rather than on internal SymPy structures.
|
| 231 |
+
|
| 232 |
+
---
|
| 233 |
+
|
| 234 |
+
### Citation Reference
|
| 235 |
+
```bibtex
|
| 236 |
+
@article{mathtok2026,
|
| 237 |
+
title = {MathTok: A Hybrid Canonicalized AST-Based Tokenization Framework
|
| 238 |
+
for Mathematical Language Modeling},
|
| 239 |
+
author = {Anonymous},
|
| 240 |
+
year = {2026},
|
| 241 |
+
note = {Under review}
|
| 242 |
+
}
|
| 243 |
+
```
|
setup.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
MathTok setup — installable as: pip install -e .
|
| 3 |
+
"""
|
| 4 |
+
from setuptools import setup, find_packages
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
|
| 7 |
+
# The long description is taken verbatim from the README next to this file.
_README_PATH = Path(__file__).parent / "README.md"
long_description = _README_PATH.read_text(encoding="utf-8")

# One-line summary shown on the package index.
_SUMMARY = (
    "A Hybrid Canonicalized AST-Based Tokenization Framework "
    "for Mathematical Language Modeling"
)

# Dependencies installed unconditionally with the package.
_INSTALL_REQUIRES = [
    "sympy>=1.12",
    "antlr4-python3-runtime==4.11.1",
    "tokenizers>=0.15.0",
    "transformers>=4.38.0",
    "numpy>=1.26.0",
    "regex>=2023.12.25",
    "tqdm>=4.66.0",
]

# Optional dependency groups, selected as ``mathtok[eval]`` / ``mathtok[dev]``.
_EXTRAS_REQUIRE = {
    "eval": [
        "scipy>=1.12.0",
        "matplotlib>=3.8.0",
        "seaborn>=0.13.0",
        "networkx>=3.2",
    ],
    "dev": [
        "pytest>=8.0.0",
        "pytest-cov>=5.0.0",
        "jupyter>=1.0.0",
    ],
}

# Trove classifiers published with the distribution metadata.
_CLASSIFIERS = [
    "Development Status :: 3 - Alpha",
    "Intended Audience :: Science/Research",
    "Topic :: Scientific/Engineering :: Artificial Intelligence",
    "License :: OSI Approved :: MIT License",
    "Programming Language :: Python :: 3.10",
]

setup(
    name="mathtok",
    version="0.1.0",
    description=_SUMMARY,
    long_description=long_description,
    long_description_content_type="text/markdown",
    author="Surweesh SP",
    # NOTE(review): pyproject.toml declares requires-python ">=3.9" while this
    # file requires ">=3.10" -- confirm which floor is intended.
    python_requires=">=3.10",
    packages=find_packages(exclude=["tests*", "notebooks*", "paper*"]),
    install_requires=_INSTALL_REQUIRES,
    extras_require=_EXTRAS_REQUIRE,
    classifiers=_CLASSIFIERS,
    # Registers a ``mathtok`` console command bound to ``mathtok.pipeline:cli``.
    entry_points={"console_scripts": ["mathtok=mathtok.pipeline:cli"]},
)
|
tests/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# tests package
|
tests/test_ast_generator.py
ADDED
|
@@ -0,0 +1,166 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Tests for the AST Generator (Layer 3).
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import pytest
|
| 6 |
+
import sympy as sp
|
| 7 |
+
|
| 8 |
+
from mathtok.ast_generator import ASTGenerator, ASTNode
|
| 9 |
+
from mathtok.canonicalizer import Canonicalizer
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
@pytest.fixture
def gen():
    """Provide a default-constructed ``ASTGenerator`` to each test that requests it."""
    generator = ASTGenerator()
    return generator
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
@pytest.fixture
def canon():
    """Provide a ``Canonicalizer`` with both simplification and expansion switched off."""
    options = {"do_simplify": False, "do_expand": False}
    return Canonicalizer(**options)
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def parse(expr_str: str):
    """Parse an ASCII math string into a SymPy expression.

    On top of SymPy's standard transformations this applies
    ``implicit_multiplication_application`` and ``convert_xor``, and binds the
    names ``x``, ``y``, ``a``, ``b`` and ``n`` to plain ``Symbol`` objects.
    """
    from sympy.parsing import sympy_parser

    rules = sympy_parser.standard_transformations + (
        sympy_parser.implicit_multiplication_application,
        sympy_parser.convert_xor,
    )
    known_symbols = {name: sp.Symbol(name) for name in ("x", "y", "a", "b", "n")}
    return sympy_parser.parse_expr(
        expr_str,
        transformations=rules,
        local_dict=known_symbols,
    )
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
class TestBasicNodes:
    """Checks on single-node inputs: symbols, integers, named constants, rationals."""

    def test_symbol(self, gen):
        """A bare symbol ``x`` yields a ``VAR_X`` leaf."""
        node = gen.generate(sp.Symbol("x"))
        assert node.token == "VAR_X"
        assert node.is_leaf

    def test_integer_zero(self, gen):
        """Zero yields ``CONST_0``."""
        node = gen.generate(sp.Integer(0))
        assert node.token == "CONST_0"

    def test_integer_positive(self, gen):
        """A small positive integer yields its ``CONST_<n>`` token."""
        node = gen.generate(sp.Integer(5))
        assert node.token == "CONST_5"

    def test_integer_negative(self, gen):
        """A negative integer yields ``OP_NEG`` wrapping the positive constant."""
        node = gen.generate(sp.Integer(-3))
        assert node.token == "OP_NEG"
        operand = node.children[0]
        assert operand.token == "CONST_3"

    def test_pi(self, gen):
        """SymPy's ``pi`` yields ``CONST_PI``."""
        node = gen.generate(sp.pi)
        assert node.token == "CONST_PI"

    def test_e(self, gen):
        """SymPy's ``E`` yields ``CONST_E``."""
        node = gen.generate(sp.E)
        assert node.token == "CONST_E"

    def test_rational(self, gen):
        """A rational yields a ``FRAC`` node with exactly two children."""
        node = gen.generate(sp.Rational(1, 2))
        assert node.token == "FRAC"
        assert len(node.children) == 2
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
class TestArithmetic:
    """Checks on the root token (and operands) produced for arithmetic expressions."""

    def test_add(self, gen):
        """``x + 1`` is rooted at ``OP_ADD`` and contains both operand tokens."""
        root = gen.generate(parse("x + 1"))
        assert root.token == "OP_ADD"
        emitted = gen.get_all_tokens(root)
        assert "VAR_X" in emitted
        assert "CONST_1" in emitted

    def test_mul(self, gen):
        """``2*x`` is rooted at one of the accepted tokens."""
        root = gen.generate(parse("2*x"))
        # Deliberately loose: the product operator or either operand token is
        # accepted as the root.
        accepted_roots = ("OP_MUL", "VAR_X", "CONST_2")
        assert root.token in accepted_roots

    def test_pow(self, gen):
        """``x^2`` is rooted at ``OP_POW`` with base ``VAR_X`` and exponent ``CONST_2``."""
        root = gen.generate(parse("x^2"))
        assert root.token == "OP_POW"
        base = root.children[0]
        assert base.token == "VAR_X"
        exponent = root.children[1]
        assert exponent.token == "CONST_2"

    def test_negation(self, gen):
        """Multiplication by ``-1`` is rooted at ``OP_NEG``."""
        minus_x = sp.Mul(sp.Integer(-1), sp.Symbol("x"))
        root = gen.generate(minus_x)
        assert root.token == "OP_NEG"

    def test_reciprocal(self, gen):
        """A power of ``-1`` is rooted at ``OP_RECIP``."""
        inverse_x = sp.Pow(sp.Symbol("x"), sp.Integer(-1))
        root = gen.generate(inverse_x)
        assert root.token == "OP_RECIP"
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
class TestFunctions:
    """Checks on the root token produced for elementary function applications."""

    def test_sin(self, gen):
        """``sin(x)`` is rooted at ``FUNC_SIN`` with ``VAR_X`` as first child."""
        root = gen.generate(sp.sin(sp.Symbol("x")))
        assert root.token == "FUNC_SIN"
        argument = root.children[0]
        assert argument.token == "VAR_X"

    def test_cos(self, gen):
        """``cos(x)`` is rooted at ``FUNC_COS``."""
        root = gen.generate(sp.cos(sp.Symbol("x")))
        assert root.token == "FUNC_COS"

    def test_exp(self, gen):
        """``exp(x)`` is rooted at ``FUNC_EXP``."""
        root = gen.generate(sp.exp(sp.Symbol("x")))
        assert root.token == "FUNC_EXP"

    def test_log(self, gen):
        """``log(x)`` is rooted at ``FUNC_LOG``."""
        root = gen.generate(sp.log(sp.Symbol("x")))
        assert root.token == "FUNC_LOG"

    def test_sqrt(self, gen):
        """``sqrt(x)`` is rooted at either ``FUNC_SQRT`` or ``OP_POW``."""
        # SymPy builds sqrt(x) as Pow(x, 1/2), so the generator may report a
        # generic power node rather than a dedicated square-root token; both
        # outcomes are accepted here.
        root = gen.generate(sp.sqrt(sp.Symbol("x")))
        accepted_roots = ("FUNC_SQRT", "OP_POW")
        assert root.token in accepted_roots
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
class TestTreeProperties:
    """Checks on per-node bookkeeping: depth, node IDs, subtree size, variable listing."""

    def test_depth_assignment(self, gen):
        """The root sits at depth 0 and each direct child at depth 1."""
        root = gen.generate(parse("x^2 + 1"))
        assert root.depth == 0
        assert all(child.depth == 1 for child in root.children)

    def test_unique_node_ids(self, gen):
        """No two nodes in a generated tree share a ``node_id``."""
        root = gen.generate(parse("x^2 + 2*x + 1"))

        # Walk the whole tree with an explicit stack; visiting order is
        # irrelevant because only the multiset of IDs is inspected.
        all_ids: list[int] = []
        pending = [root]
        while pending:
            current = pending.pop()
            all_ids.append(current.node_id)
            pending.extend(current.children)

        assert len(all_ids) == len(set(all_ids)), "Node IDs must be unique"

    def test_subtree_size(self, gen):
        """A lone leaf has size 1; ``x + 1`` has size 3."""
        leaf = gen.generate(sp.Integer(5))
        assert leaf.subtree_size == 1

        total = gen.generate(parse("x + 1"))
        assert total.subtree_size == 3  # ADD + VAR_X + CONST_1

    def test_variable_extraction(self, gen):
        """Both variables of ``x^2 + y + 1`` are reported by ``get_variable_tokens``."""
        root = gen.generate(parse("x^2 + y + 1"))
        found = gen.get_variable_tokens(root)
        for expected in ("VAR_X", "VAR_Y"):
            assert expected in found
|
tests/test_canonicalizer.py
ADDED
|
@@ -0,0 +1,125 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Tests for the Canonicalization Layer (Layer 1).
|
| 3 |
+
|
| 4 |
+
Covers:
|
| 5 |
+
- ASCII expression parsing
|
| 6 |
+
- LaTeX expression parsing
|
| 7 |
+
- Equivalence detection (are_equivalent)
|
| 8 |
+
- Normalization transformations
|
| 9 |
+
- Fallback behaviour on parse errors
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
import pytest
|
| 13 |
+
import sympy as sp
|
| 14 |
+
|
| 15 |
+
from mathtok.canonicalizer import Canonicalizer, CanonicalizationResult
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
@pytest.fixture
def canon():
    """Provide a Canonicalizer with both simplification and expansion enabled."""
    canonicalizer = Canonicalizer(do_simplify=True, do_expand=True)
    return canonicalizer
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
# ── Parsing ───────────────────────────────────────────────────────────────
|
| 24 |
+
|
| 25 |
+
class TestParsing:
    """Parsing of ASCII and LaTeX input, delimiter stripping, and parse failures."""

    def test_ascii_simple(self, canon):
        """A plain ASCII polynomial parses and is reported as ASCII input."""
        result = canon.canonicalize("x^2 + 1")
        assert result.success
        assert result.input_format == "ascii"
        assert "x" in str(result.expr)

    def test_ascii_implicit_mul(self, canon):
        """Implicit multiplication such as ``2x`` is accepted."""
        result = canon.canonicalize("2x + 1")
        assert result.success

    def test_ascii_constants(self, canon):
        """The named constants ``pi`` and ``e`` are accepted."""
        result = canon.canonicalize("pi + e")
        assert result.success
        parsed = result.expr
        # NOTE(review): SymPy's pi is a numeric constant rather than a Symbol,
        # so the free_symbols membership check looks like it can never be
        # true and the equality check decides the outcome -- confirm intent.
        assert sp.pi in parsed.free_symbols or parsed == sp.pi + sp.E

    def test_latex_frac(self, canon):
        """A LaTeX fraction is either detected as LaTeX or parsed via fallback."""
        result = canon.canonicalize("\\frac{x^2}{2}")
        # LaTeX detected
        assert result.input_format == "latex" or result.success  # may fallback

    def test_latex_sin(self, canon):
        """A LaTeX sine call parses successfully."""
        result = canon.canonicalize("\\sin(x^2)")
        assert result.success

    def test_latex_sqrt(self, canon):
        """A LaTeX square root parses successfully."""
        result = canon.canonicalize("\\sqrt{x^2 + 1}")
        assert result.success

    def test_parse_error_graceful(self, canon):
        """Unparseable input reports failure with at least one warning."""
        result = canon.canonicalize("@@@invalid@@@")
        assert not result.success
        assert len(result.warnings) > 0

    def test_delimiters_stripped(self, canon):
        """Input wrapped in ``$...$`` still parses successfully."""
        result = canon.canonicalize("$x^2 + 1$")
        assert result.success
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
# ── Normalization ─────────────────────────────────────────────────────────
|
| 65 |
+
|
| 66 |
+
class TestNormalization:
    """Normal-form properties of canonicalized expressions."""

    def test_expand(self, canon):
        """Expanding ``(x+1)^2`` exposes a squared term."""
        result = canon.canonicalize("(x+1)^2")
        # expanded form should include x^2 and 2x
        rendered = str(result.expr)
        assert "x**2" in rendered or "x^2" in rendered

    def test_commutativity_canonical(self, canon):
        """``a + b`` and ``b + a`` render to the same string."""
        forward = canon.canonicalize("a + b")
        backward = canon.canonicalize("b + a")
        # SymPy canonicalises Add ordering
        assert str(forward.expr) == str(backward.expr)

    def test_subtraction_to_add(self, canon):
        """Subtraction is represented as an Add node."""
        result = canon.canonicalize("x - y")
        # SymPy represents x-y as Add(x, Mul(-1, y))
        assert isinstance(result.expr, sp.Add)

    def test_division_to_mul(self, canon):
        """Division is represented as a Mul node."""
        result = canon.canonicalize("x / y")
        # SymPy represents x/y as Mul(x, Pow(y, -1))
        assert isinstance(result.expr, sp.Mul)

    def test_transformations_recorded(self, canon):
        """Both configured transformations are listed on the result."""
        result = canon.canonicalize("x^2 + 2*x + 1")
        applied = result.transformations_applied
        assert "expand" in applied
        assert "simplify" in applied
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
# ── Equivalence ───────────────────────────────────────────────────────────
|
| 96 |
+
|
| 97 |
+
class TestEquivalence:
    """Equivalence checks made through ``Canonicalizer.are_equivalent``."""

    def test_basic_equivalent(self, canon):
        """A squared binomial equals its expanded form."""
        assert canon.are_equivalent("(x+1)^2", "x^2 + 2*x + 1")

    def test_commutative_equivalent(self, canon):
        """Operand order of an addition does not matter."""
        assert canon.are_equivalent("a + b", "b + a")

    def test_not_equivalent(self, canon):
        """Different powers of the same variable are not equivalent."""
        assert not canon.are_equivalent("x^2", "x^3")

    def test_trig_identity(self, canon):
        """The Pythagorean identity collapses to 1."""
        # sin^2 + cos^2 = 1
        assert canon.are_equivalent("sin(x)^2 + cos(x)^2", "1")

    def test_log_product(self, canon):
        """Both sides of the log product rule canonicalize successfully.

        log(x)+log(y) = log(x*y) requires positive assumptions; SymPy's
        simplify may not collapse it without them, so this only verifies
        that both sides are valid canonical expressions and then checks the
        identity directly in SymPy with positive symbols.
        """
        r1 = canon.canonicalize("log(x) + log(y)")
        r2 = canon.canonicalize("log(x*y)")
        assert r1.success and r2.success
        # With positive assumptions the difference simplifies to 0.
        # ``sp`` is the module-level sympy import; the function-local
        # re-import that used to shadow it was redundant and is gone.
        x, y = sp.Symbol("x", positive=True), sp.Symbol("y", positive=True)
        diff = sp.simplify(sp.log(x) + sp.log(y) - sp.log(x * y))
        assert diff == 0

    def test_difference_of_squares(self, canon):
        """A difference of squares equals its factored form."""
        assert canon.are_equivalent("a^2 - b^2", "(a+b)*(a-b)")
|
tests/test_comparison.py
ADDED
|
@@ -0,0 +1,180 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Tests for the Semantic Tokenizer Comparison Framework.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import pytest
|
| 6 |
+
from evaluation.comparison import (
|
| 7 |
+
TokenizerStats, ComparisonRecord, TokenizerComparison,
|
| 8 |
+
_score_char, _score_gpt2, _score_mathtok,
|
| 9 |
+
_jaccard, _mean,
|
| 10 |
+
STANDARD_EXPRESSIONS, DEEP_NESTING_EXPRESSIONS, CANONICAL_PAIRS,
|
| 11 |
+
)
|
| 12 |
+
from mathtok.pipeline import MathTokPipeline
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
@pytest.fixture(scope="module")
def pipeline():
    """Build one metadata-enabled MathTokPipeline shared across this module."""
    shared_pipeline = MathTokPipeline(include_metadata=True)
    return shared_pipeline
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
@pytest.fixture(scope="module")
def comp(pipeline):
    """Build a TokenizerComparison with no GPT-2 function and JSONL saving off."""
    comparison = TokenizerComparison(pipeline, gpt2_fn=None, save_jsonl=False)
    return comparison
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
# ── TokenizerStats ────────────────────────────────────────────────────────
|
| 26 |
+
|
| 27 |
+
class TestTokenizerStats:
    """Checks on the values derived by ``TokenizerStats.compute_scr``."""

    def test_scr_computed(self):
        """Structural score, raw SCR and structural efficiency match hand values."""
        tolerance = 1e-9
        stats = TokenizerStats(
            name="test",
            tokens=["OP_ADD", "VAR_X", "CONST_1"],
            token_count=3,
            operator_nodes=1,
            tree_depth=1,
            parent_child_relations=1,
            function_scope=0,
            canonical_bonus=2,
        )
        stats.compute_scr()
        assert stats.structural_score == 5  # 1+1+1+0+2
        assert abs(stats.raw_scr - 5/3) < tolerance
        assert abs(stats.structural_efficiency - 1/3) < tolerance

    def test_zero_token_count_safe(self):
        """With no tokens, compute_scr leaves raw_scr at 0.0."""
        empty_stats = TokenizerStats(name="empty", tokens=[], token_count=0)
        empty_stats.compute_scr()
        assert empty_stats.raw_scr == 0.0
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
# ── Character-level scorer ─────────────────────────────────────────────────
|
| 48 |
+
|
| 49 |
+
class TestCharScore:
    """Checks on the character-level scorer ``_score_char``."""

    def test_simple(self):
        """Five characters give five tokens and at least one operator."""
        char_stats = _score_char("x + 1")
        assert char_stats.token_count == 5
        assert char_stats.operator_nodes >= 1  # at least +
        assert char_stats.raw_scr >= 0

    def test_nested_parens_depth(self):
        """Doubly nested parentheses produce a depth of at least 2."""
        char_stats = _score_char("sin((x+1)^2)")
        assert char_stats.tree_depth >= 2  # at least 2 levels of parens

    def test_no_function_scope(self):
        """The character-level scorer reports no function scope."""
        # Character-level can't identify functions
        char_stats = _score_char("sin(x)")
        assert char_stats.function_scope == 0
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
# ── GPT-2 heuristic scorer ─────────────────────────────────────────────────
|
| 67 |
+
|
| 68 |
+
class TestGPT2Score:
    """Checks on the heuristic scorer ``_score_gpt2`` using hand-built tokens."""

    def test_operators_detected(self):
        """Operator tokens in the stream are counted."""
        token_stream = ["(", "x", "+", "1", ")", "^", "2"]
        gpt2_stats = _score_gpt2(token_stream)
        assert gpt2_stats.operator_nodes >= 1

    def test_function_detected(self):
        """A function-name token contributes to function scope."""
        token_stream = ["sin", "(", "x", ")"]
        gpt2_stats = _score_gpt2(token_stream)
        assert gpt2_stats.function_scope >= 1

    def test_paren_depth(self):
        """Two nested parenthesis pairs give a depth of exactly 2."""
        token_stream = ["(", "(", "x", ")", ")"]
        gpt2_stats = _score_gpt2(token_stream)
        assert gpt2_stats.tree_depth == 2

    def test_scr_positive(self):
        """After compute_scr the raw SCR is non-negative."""
        token_stream = ["sin", "(", "x", "^", "2", ")"]
        gpt2_stats = _score_gpt2(token_stream)
        gpt2_stats.compute_scr()
        assert gpt2_stats.raw_scr >= 0
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
# ── MathTok scorer ────────────────────────────────────────────────────────
|
| 92 |
+
|
| 93 |
+
class TestMathTokScore:
    """Checks on ``_score_mathtok`` applied to real pipeline output."""

    def test_add_expression(self, pipeline):
        """An addition yields tokens, an operator node and the canonical bonus."""
        encoded = pipeline.encode_math_only("x + 1")
        mathtok_stats = _score_mathtok(encoded)
        assert mathtok_stats.token_count > 0
        assert mathtok_stats.operator_nodes >= 1  # OP_ADD
        assert mathtok_stats.canonical_bonus == 2  # successful parse

    def test_function_expression(self, pipeline):
        """A function call contributes to function scope."""
        encoded = pipeline.encode_math_only("sin(x^2)")
        mathtok_stats = _score_mathtok(encoded)
        assert mathtok_stats.function_scope >= 1  # FUNC_SIN

    def test_depth_nonzero(self, pipeline):
        """A function wrapping a sum has a tree depth of at least 2."""
        encoded = pipeline.encode_math_only("sin(x^2 + 1)")
        mathtok_stats = _score_mathtok(encoded)
        assert mathtok_stats.tree_depth >= 2

    def test_scr_computed(self, pipeline):
        """The raw SCR of a scored expression is strictly positive."""
        encoded = pipeline.encode_math_only("(x+1)^2")
        mathtok_stats = _score_mathtok(encoded)
        assert mathtok_stats.raw_scr > 0

    def test_mathtok_scr_higher_than_char(self, pipeline):
        """MathTok's raw SCR exceeds the character-level raw SCR."""
        expression = "sin(x^2 + 1)"
        encoded = pipeline.encode_math_only(expression)
        mathtok_stats = _score_mathtok(encoded)
        char_stats = _score_char(expression)
        # MathTok should have higher SCR due to semantic richness
        assert mathtok_stats.raw_scr > char_stats.raw_scr
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
# ── Comparison mechanics ──────────────────────────────────────────────────
|
| 126 |
+
|
| 127 |
+
class TestComparison:
    """Checks on the mechanics of ``TokenizerComparison``."""

    def test_compare_one(self, comp):
        """A single comparison yields a record with MathTok and char-level stats."""
        record = comp._compare_one("x + 1", "test")
        assert isinstance(record, ComparisonRecord)
        assert record.mathtok.token_count > 0
        assert record.char_level.token_count > 0
        assert record.gpt2 is None  # no GPT-2 in fixture

    def test_scr_improvement_vs_char(self, comp):
        """The recorded SCR improvement over char-level is positive."""
        record = comp._compare_one("sin(x^2)", "test")
        # MathTok should outperform char-level on SCR
        assert record.scr_improvement_vs_char > 0

    def test_canonical_jaccard(self, comp, pipeline):
        """Reordered operands give token sets with Jaccard similarity above 0.5."""

        def content_tokens(expression):
            # Drop bracketed tokens (those starting with "[") before comparing.
            encoded = pipeline.encode_math_only(expression)
            return {tok for tok in encoded.tokens if not tok.startswith("[")}

        # Equivalent expressions should have high Jaccard
        tokens_a = content_tokens("x + 2")
        tokens_b = content_tokens("2 + x")
        similarity = _jaccard(tokens_a, tokens_b)
        assert similarity > 0.5  # should be near 1.0 due to canonicalization

    def test_run_standard_small(self, comp):
        """The first three standard expressions each produce MathTok tokens."""
        # Run just 3 expressions to keep test fast
        for expression in STANDARD_EXPRESSIONS[:3]:
            record = comp._compare_one(expression, "standard")
            assert record.mathtok.token_count > 0

    def test_deep_nesting_depth_increases(self, comp, pipeline):
        """A nested expression reaches a greater metadata depth than a flat one."""

        def deepest_level(encoded):
            # Only non-negative depths count; an empty selection falls back to 0.
            return max((m.depth for m in encoded.metadata if m.depth >= 0), default=0)

        flat_output = pipeline.encode_math_only("x + 1")
        nested_output = pipeline.encode_math_only("sin(cos((x+1)^2))")
        flat_depth = deepest_level(flat_output)
        nested_depth = deepest_level(nested_output)
        assert nested_depth > flat_depth
|
| 161 |
+
|
| 162 |
+
|
| 163 |
+
# ── Utility helpers ───────────────────────────────────────────────────────
|
| 164 |
+
|
| 165 |
+
class TestHelpers:
    """Checks on the small numeric helpers ``_jaccard`` and ``_mean``."""

    def test_jaccard_identical(self):
        """Identical sets have similarity 1.0."""
        similarity = _jaccard({"a", "b"}, {"a", "b"})
        assert similarity == 1.0

    def test_jaccard_disjoint(self):
        """Disjoint sets have similarity 0.0."""
        similarity = _jaccard({"a"}, {"b"})
        assert similarity == 0.0

    def test_jaccard_partial(self):
        """One shared element out of three distinct gives 1/3."""
        similarity = _jaccard({"a", "b"}, {"b", "c"})
        assert abs(similarity - 1/3) < 1e-9

    def test_mean_empty(self):
        """The mean of an empty list is 0.0."""
        average = _mean([])
        assert average == 0.0

    def test_mean_values(self):
        """The mean of 1, 2 and 3 is 2."""
        average = _mean([1.0, 2.0, 3.0])
        assert abs(average - 2.0) < 1e-9
|
tests/test_lexer.py
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Tests for the Hybrid Lexer (Layer 2).
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import pytest
|
| 6 |
+
from mathtok.lexer import HybridLexer, LexSpan, SpanType
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
@pytest.fixture
def lex():
    """Provide a HybridLexer with ASCII math detection on and min_math_len=3."""
    lexer = HybridLexer(ascii_math_detection=True, min_math_len=3)
    return lexer
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class TestLatexDetection:
    """Detection of LaTeX-delimited math spans inside text."""

    @staticmethod
    def _math_only(spans):
        # Keep just the spans the lexer tagged as math.
        return [span for span in spans if span.span_type is SpanType.MATH]

    def test_inline_dollar(self, lex):
        """``$...$`` inside a sentence yields both math and text spans."""
        spans = lex.lex("Let $x^2 + 1$ be given.")
        # Spans whose content is only whitespace are ignored.
        seen_types = [span.span_type for span in spans if span.content.strip()]
        assert SpanType.MATH in seen_types
        assert SpanType.TEXT in seen_types

    def test_display_dollar(self, lex):
        """``$$...$$`` yields a math span containing the expression."""
        found = self._math_only(lex.lex("$$x^2 + y^2 = 1$$"))
        assert len(found) >= 1
        first_content = found[0].content
        assert "x^2" in first_content or "x" in first_content

    def test_inline_paren(self, lex):
        """``\\(...\\)`` yields exactly one math span."""
        found = self._math_only(lex.lex("We have \\(a + b\\) here."))
        assert len(found) == 1

    def test_display_bracket(self, lex):
        """``\\[...\\]`` yields exactly one math span."""
        found = self._math_only(lex.lex("Result: \\[x = \\frac{-b}{2a}\\]"))
        assert len(found) == 1

    def test_multiple_math_spans(self, lex):
        """Three separate ``$...$`` groups yield three math spans."""
        found = self._math_only(
            lex.lex("If $a > 0$ and $b < 0$, then $a + b$ may be zero.")
        )
        assert len(found) == 3

    def test_pure_text(self, lex):
        """Plain English text yields no math spans."""
        found = self._math_only(
            lex.lex("This is plain English text with no math at all.")
        )
        assert len(found) == 0
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
class TestAsciiDetection:
    """Detection of undelimited ASCII math inside text."""

    @staticmethod
    def _math_only(spans):
        # Keep just the spans the lexer tagged as math.
        return [span for span in spans if span.span_type is SpanType.MATH]

    def test_function_call(self, lex):
        """A function call in a sentence appears in some math span."""
        found = self._math_only(lex.lex("Compute sin(x) for x = pi."))
        assert any("sin" in span.content for span in found)

    def test_exponentiation(self, lex):
        """A caret power in a sentence yields at least one math span."""
        found = self._math_only(lex.lex("The value of x^2 is always positive."))
        assert len(found) >= 1

    def test_equation(self, lex):
        """An equation in a sentence yields at least one math span."""
        found = self._math_only(lex.lex("Solve x^2 + 2*x + 1 = 0."))
        assert len(found) >= 1
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
class TestEdgeCases:
    """Lexer behaviour on empty, whitespace-only and text-only input."""

    def test_empty_string(self, lex):
        """Empty input produces an empty span list."""
        result = lex.lex("")
        assert result == []

    def test_only_whitespace(self, lex):
        """Whitespace-only input produces nothing but text spans."""
        result = lex.lex(" ")
        assert all(span.span_type is SpanType.TEXT for span in result)

    def test_is_math_only_true(self, lex):
        """A lone ``$...$`` expression is reported as math-only."""
        verdict = lex.is_math_only("$x^2 + 1$")
        assert verdict

    def test_adjacent_spans_merged(self, lex):
        """All-text input collapses to at most two text spans."""
        result = lex.lex("hello world, no math here at all.")
        # All-text should be merged into a minimal number of spans
        text_only = [span for span in result if span.span_type is SpanType.TEXT]
        assert len(text_only) <= 2
|
tests/test_pipeline.py
ADDED
|
@@ -0,0 +1,124 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Integration tests for the end-to-end MathTok Pipeline.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import pytest
|
| 6 |
+
from mathtok.pipeline import MathTokPipeline, TokenizedOutput
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
@pytest.fixture(scope="module")
def pipeline():
    """Build one metadata-enabled MathTokPipeline shared across this module."""
    shared_pipeline = MathTokPipeline(include_metadata=True)
    return shared_pipeline
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class TestBasicEncode:
    """Basic shape checks on the result of ``pipeline.encode``."""

    def test_returns_output(self, pipeline):
        """encode returns a TokenizedOutput instance."""
        encoded = pipeline.encode("x^2 + 1")
        assert isinstance(encoded, TokenizedOutput)

    def test_tokens_nonempty(self, pipeline):
        """encode produces at least one token."""
        encoded = pipeline.encode("sin(x)")
        assert len(encoded.tokens) > 0

    def test_input_ids_match_tokens(self, pipeline):
        """There is exactly one input id per token."""
        encoded = pipeline.encode("x^2 + 2*x + 1")
        token_total = len(encoded.tokens)
        id_total = len(encoded.input_ids)
        assert token_total == id_total

    def test_ids_are_integers(self, pipeline):
        """Every input id is an int."""
        encoded = pipeline.encode("x + 1")
        assert all(isinstance(token_id, int) for token_id in encoded.input_ids)

    def test_no_negative_ids(self, pipeline):
        """No input id is negative."""
        encoded = pipeline.encode("x + 1")
        # All IDs should be non-negative (UNK=1 is minimum valid)
        assert all(token_id >= 0 for token_id in encoded.input_ids)
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
class TestMathSpans:
    """Boundary tokens, S-expression output and canonicalization results."""

    def test_math_start_end_tokens(self, pipeline):
        """The token stream contains both math boundary markers."""
        encoded = pipeline.encode("x^2")
        token_stream = encoded.tokens
        assert "[MATH_START]" in token_stream
        assert "[MATH_END]" in token_stream

    def test_sexp_nonempty(self, pipeline):
        """The S-expression output is not empty."""
        encoded = pipeline.encode("x^2 + 1")
        assert len(encoded.sexp) > 0

    def test_sexp_contains_op(self, pipeline):
        """A power expression shows OP_POW in its S-expression."""
        encoded = pipeline.encode("x^2")
        assert "OP_POW" in encoded.sexp

    def test_canon_results(self, pipeline):
        """At least one canonicalization result exists and the first succeeded."""
        # Use a simple ASCII expression guaranteed to parse successfully
        encoded = pipeline.encode("x^2 + 1")
        canon_results = encoded.canon_results
        assert len(canon_results) >= 1
        assert canon_results[0].success
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
class TestMixedInput:
    """Encoding of sentences that mix prose with math."""

    def test_mixed_latex(self, pipeline):
        """A sentence with a ``$...$`` span produces tokens."""
        encoded = pipeline.encode("The result is $x^2 + 1$.")
        assert len(encoded.tokens) > 0

    def test_mixed_ascii(self, pipeline):
        """A sentence with undelimited ASCII math produces tokens."""
        encoded = pipeline.encode("Compute sin(x) for x = pi.")
        assert len(encoded.tokens) > 0

    def test_multiple_math_spans(self, pipeline):
        """Several ``$...$`` spans yield at least one operator or variable token."""
        encoded = pipeline.encode("If $a > 0$ and $b < 0$ then $a + b$ can be zero.")
        # Should have at least some math tokens
        math_tokens = [
            tok
            for tok in encoded.tokens
            if tok.startswith("OP_") or tok.startswith("VAR_")
        ]
        assert len(math_tokens) > 0
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
class TestMetadata:
    """Checks on the per-token metadata attached to encoded output."""

    def test_metadata_present(self, pipeline):
        """Metadata is produced for an encoded expression."""
        encoded = pipeline.encode("x + 1")
        assert len(encoded.metadata) > 0

    def test_metadata_positions_sequential(self, pipeline):
        """Metadata positions are already in ascending order."""
        encoded = pipeline.encode("x^2 + 1")
        observed = [entry.position for entry in encoded.metadata]
        assert observed == sorted(observed)

    def test_metadata_categories(self, pipeline):
        """At least one of the operator/variable/constant categories appears."""
        encoded = pipeline.encode("x + 1")
        seen = {entry.token_category for entry in encoded.metadata}
        assert "operator" in seen or "variable" in seen or "constant" in seen

    def test_tree_position_keys(self, pipeline):
        """Entries with a non-negative node_id carry string tree position keys."""
        encoded = pipeline.encode("x + 1")
        position_keys = [
            entry.tree_position_key
            for entry in encoded.metadata
            if entry.node_id >= 0
        ]
        assert len(position_keys) > 0
        assert all(isinstance(key, str) for key in position_keys)
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
class TestEncodeMathOnly:
    """Checks on ``encode_math_only`` and ``encode_batch``."""

    def test_encode_math_only(self, pipeline):
        """A polynomial yields tokens including OP_ADD or OP_POW."""
        encoded = pipeline.encode_math_only("x^2 + 2*x + 1")
        token_stream = encoded.tokens
        assert len(token_stream) > 0
        assert "OP_ADD" in token_stream or "OP_POW" in token_stream

    def test_encode_batch(self, pipeline):
        """A batch of three inputs yields three non-empty outputs."""
        expressions = ["x + 1", "sin(x)", "x^2"]
        batch = pipeline.encode_batch(expressions)
        assert len(batch) == 3
        assert all(len(item.tokens) > 0 for item in batch)
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
class TestHFTokenizer:
    """Checks on the tokenizer object returned by ``get_hf_tokenizer``."""

    def test_hf_tokenizer_callable(self, pipeline):
        """Calling the tokenizer on one string returns one ``input_ids`` entry."""
        tokenizer = pipeline.get_hf_tokenizer()
        batch = tokenizer("x^2 + 1")
        assert "input_ids" in batch
        assert len(batch["input_ids"]) == 1

    def test_hf_tokenizer_encode(self, pipeline):
        """``encode`` returns a non-empty list."""
        tokenizer = pipeline.get_hf_tokenizer()
        encoded_ids = tokenizer.encode("sin(x)")
        assert isinstance(encoded_ids, list)
        assert len(encoded_ids) > 0

    def test_hf_vocab_size(self, pipeline):
        """The tokenizer's length exceeds 100."""
        tokenizer = pipeline.get_hf_tokenizer()
        assert len(tokenizer) > 100
|
tests/test_serializer.py
ADDED
|
@@ -0,0 +1,125 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Tests for the Structural Serializer (Layer 5).
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import pytest
|
| 6 |
+
import sympy as sp
|
| 7 |
+
|
| 8 |
+
from mathtok.ast_generator import ASTGenerator
|
| 9 |
+
from mathtok.serializer import StructuralSerializer, MATH_START, MATH_END
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
@pytest.fixture
def gen():
    """Provide a fresh ASTGenerator built with default arguments."""
    generator = ASTGenerator()
    return generator
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
@pytest.fixture
def ser():
    """Provide a StructuralSerializer with include_boundaries=True."""
    serializer = StructuralSerializer(include_boundaries=True)
    return serializer
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
@pytest.fixture
def ser_no_boundary():
    """Provide a StructuralSerializer with include_boundaries=False."""
    serializer = StructuralSerializer(include_boundaries=False)
    return serializer
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def make_ast(expr_str: str) -> object:
    """Parse *expr_str* with SymPy and return the AST generated from it.

    Parsing uses SymPy's standard transformations plus implicit
    multiplication and ``^``-as-power conversion, with ``x``, ``y``, ``a``
    and ``b`` pre-bound to symbols of the same name.
    """
    from sympy.parsing.sympy_parser import (
        parse_expr, standard_transformations,
        implicit_multiplication_application, convert_xor,
    )

    extra_transformations = (implicit_multiplication_application, convert_xor)
    known_symbols = {name: sp.Symbol(name) for name in ("x", "y", "a", "b")}
    parsed = parse_expr(
        expr_str,
        transformations=standard_transformations + extra_transformations,
        local_dict=known_symbols,
    )
    # A new generator is created for every call.
    return ASTGenerator().generate(parsed)
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
class TestBoundaries:
    """Presence or absence of the math boundary tokens in serialized output."""

    def test_start_end_tokens(self, ser):
        """With boundaries on, the stream is bracketed by MATH_START/MATH_END."""
        stream = ser.serialize(make_ast("x + 1"))
        first, last = stream[0], stream[-1]
        assert first.token == MATH_START
        assert last.token == MATH_END

    def test_no_boundaries(self, ser_no_boundary):
        """With boundaries off, the first token is not MATH_START."""
        stream = ser_no_boundary.serialize(make_ast("x"))
        assert stream[0].token != MATH_START
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
class TestTokenStream:
    """Ordering and per-token attributes of the serialized token stream."""

    def test_leaf_node(self, ser):
        """A lone symbol serializes to a stream containing VAR_X."""
        tree = ASTGenerator().generate(sp.Symbol("x"))
        stream = ser.serialize(tree)
        # [MATH_START, VAR_X, MATH_END]
        names = [entry.token for entry in stream]
        assert "VAR_X" in names

    def test_preorder_order(self, ser_no_boundary):
        """The operator token appears before its operand token."""
        # x + 1 → ADD(VAR_X, CONST_1) → [OP_ADD, VAR_X, CONST_1]
        stream = ser_no_boundary.serialize(make_ast("x + 1"))
        names = [entry.token for entry in stream]
        operator_at = names.index("OP_ADD")
        operand_at = names.index("VAR_X")
        assert operator_at < operand_at  # parent before children

    def test_depth_assigned(self, ser_no_boundary):
        """The root operator is at depth 0 and its operands at depth 1."""
        stream = ser_no_boundary.serialize(make_ast("x + 1"))
        root_entry = next(entry for entry in stream if entry.token == "OP_ADD")
        assert root_entry.depth == 0
        operand_names = ("VAR_X", "CONST_1")
        for entry in stream:
            if entry.token in operand_names:
                assert entry.depth == 1

    def test_positions_sequential(self, ser):
        """Positions count up from 0 with no gaps."""
        stream = ser.serialize(make_ast("x^2 + 1"))
        observed = [entry.position for entry in stream]
        expected = list(range(len(stream)))
        assert observed == expected

    def test_is_leaf_flag(self, ser_no_boundary):
        """Every token of a lone symbol is flagged as a leaf."""
        tree = ASTGenerator().generate(sp.Symbol("x"))
        stream = ser_no_boundary.serialize(tree)
        assert all(entry.is_leaf for entry in stream)

    def test_subtree_size_root(self, ser_no_boundary):
        """The first token of ``x + 1`` reports a subtree size of 3."""
        stream = ser_no_boundary.serialize(make_ast("x + 1"))
        root_entry = stream[0]  # OP_ADD
        assert root_entry.subtree_size == 3  # ADD + VAR_X + CONST_1
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
class TestSexp:
    """Checks on the S-expression rendering from ``to_sexp``."""

    def test_sexp_leaf(self, ser):
        """A lone symbol renders as its bare token."""
        tree = ASTGenerator().generate(sp.Symbol("x"))
        rendered = ser.to_sexp(tree)
        assert rendered == "VAR_X"

    def test_sexp_simple(self, ser):
        """An addition renders with ``(OP_ADD`` at the start."""
        rendered = ser.to_sexp(make_ast("x + 1"))
        assert rendered.startswith("(OP_ADD")

    def test_sexp_nested(self, ser):
        """A power inside a sum shows both operators in the rendering."""
        rendered = ser.to_sexp(make_ast("x^2 + 1"))
        assert "OP_POW" in rendered
        assert "OP_ADD" in rendered
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
class TestTokenList:
    """Checks on the plain string list from ``to_token_list``."""

    def test_to_token_list(self, ser):
        """The result is a list of strings that includes both boundary tokens."""
        names = ser.to_token_list(make_ast("x + 1"))
        assert isinstance(names, list)
        assert all(isinstance(name, str) for name in names)
        assert MATH_START in names
        assert MATH_END in names
|