Ytgetahun Claude Sonnet 4.6 committed on
Commit
46eead7
·
1 Parent(s): 3347e91

Add vn score — GPT-4o-as-judge AD quality scorer (VN-013)

Browse files

LLM-as-judge pipeline: loads manifest.json, extracts frames at each
narration timestamp, scores accuracy/relevance/WCAG/conciseness 0-10,
aggregates to letter grade. Supports json/text/flagged output formats.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (3) hide show
  1. cli/vn/main.py +68 -0
  2. cli/vn/output.py +62 -0
  3. cli/vn/score.py +534 -0
cli/vn/main.py CHANGED
@@ -17,7 +17,9 @@ from .gaps import GapDetectionError, detect_gaps
17
  from .kit import assemble_kit
18
  from .output import render_compliance_report, render_gap_results, render_results, result_from_api_response
19
  from .output import render_ad, render_edu, render_kit, render_podcast, render_sports, render_theater
 
20
  from .podcast import PodcastDescriptionError, PodcastMixError, PodcastTTSError, assemble_podcast
 
21
  from .sports import SportsDetectionError, assemble_sports_kit
22
  from .theater import TheaterDescriptionError, TheaterTTSError, assemble_theater_kit
23
  from .youtube import YouTubeDownloadError, download_video, is_url
@@ -350,6 +352,65 @@ def ad(
350
  typer.echo(render_ad(result, output_format))
351
 
352
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
353
  @keys_app.command("create")
354
  def keys_create(
355
  email: str = typer.Argument(..., help="Email address for the free-tier API key."),
@@ -391,6 +452,13 @@ def _normalize_podcast_format(output_format: str) -> str:
391
  return normalized
392
 
393
 
 
 
 
 
 
 
 
394
  def _fail(message: str) -> None:
395
  typer.echo(f"Error: {message}", err=True)
396
  raise typer.Exit(code=1)
 
17
  from .kit import assemble_kit
18
  from .output import render_compliance_report, render_gap_results, render_results, result_from_api_response
19
  from .output import render_ad, render_edu, render_kit, render_podcast, render_sports, render_theater
20
+ from .output import render_score
21
  from .podcast import PodcastDescriptionError, PodcastMixError, PodcastTTSError, assemble_podcast
22
+ from .score import ScoreError, score_manifest
23
  from .sports import SportsDetectionError, assemble_sports_kit
24
  from .theater import TheaterDescriptionError, TheaterTTSError, assemble_theater_kit
25
  from .youtube import YouTubeDownloadError, download_video, is_url
 
352
  typer.echo(render_ad(result, output_format))
353
 
354
 
355
# `vn score` subcommand: resolves the video source, runs the GPT-4o judging
# pipeline over the manifest's narrations, and prints the rendered report.
# The function docstring below is the command's help text, so it is left as-is.
@app.command()
def score(
    source: str = typer.Argument(
        ...,
        help="Local video file or YouTube URL (same source used to generate the manifest).",
    ),
    manifest: Path = typer.Option(
        ...,
        "--manifest",
        "-m",
        help="Path to manifest.json from vn ad or vn theater.",
    ),
    output_format: str = typer.Option(
        "text",
        "--format",
        "-f",
        help="Output format: json, text, or flagged.",
    ),
    output_dir: Path = typer.Option(
        Path("./vn-score-output"),
        "--output-dir",
        help="Directory for score-report.json.",
    ),
    word_limit: Optional[int] = typer.Option(
        None,
        "--word-limit",
        min=1,
        help="Word limit for within_limit check. Auto-detected from manifest if not set.",
    ),
    min_score: float = typer.Option(
        6.0,
        "--min-score",
        min=0.0,
        max=10.0,
        help="Flag threshold: descriptions with any dimension below this are flagged.",
    ),
) -> None:
    """Score AD description quality using GPT-4o Vision as a judge."""
    # Validate --format first so a typo fails before any source resolution or API call.
    output_format = _normalize_score_format(output_format)

    # Scratch directory handed to _resolve_source; it is deleted when the block exits.
    # NOTE(review): presumably YouTube URLs are downloaded into tmp/download — confirm in _resolve_source.
    with tempfile.TemporaryDirectory(prefix="vn-cli-") as tmp:
        tmp_path = Path(tmp)
        try:
            media_path = _resolve_source(source, tmp_path / "download")
            report = score_manifest(
                manifest_path=manifest.expanduser().resolve(),
                video_source=media_path,
                word_limit=word_limit,
                min_score=min_score,
                output_dir=output_dir,
                # Labels keep the user's original arguments in the report rather
                # than the resolved/temporary paths.
                source_label=source,
                manifest_label=str(manifest),
            )
        except (FileNotFoundError, ValueError, FrameExtractionError, YouTubeDownloadError, ScoreError) as exc:
            # _fail prints the message to stderr and raises typer.Exit(code=1),
            # so `report` is always bound when execution continues below.
            _fail(str(exc))

    typer.echo(render_score(report, output_format))
412
+
413
+
414
  @keys_app.command("create")
415
  def keys_create(
416
  email: str = typer.Argument(..., help="Email address for the free-tier API key."),
 
452
  return normalized
453
 
454
 
455
+ def _normalize_score_format(output_format: str) -> str:
456
+ normalized = output_format.lower()
457
+ if normalized not in {"json", "text", "flagged"}:
458
+ _fail("--format must be one of: json, text, flagged")
459
+ return normalized
460
+
461
+
462
  def _fail(message: str) -> None:
463
  typer.echo(f"Error: {message}", err=True)
464
  raise typer.Exit(code=1)
cli/vn/output.py CHANGED
@@ -143,6 +143,16 @@ def render_ad(kit: Any, output_format: str) -> str:
143
  raise ValueError(f"unsupported output format: {output_format}")
144
 
145
 
 
 
 
 
 
 
 
 
 
 
146
  def result_from_api_response(response: dict[str, Any], timestamp: float, duration: float) -> DescriptionResult:
147
  return DescriptionResult(
148
  timecode=format_json_time(timestamp),
@@ -479,6 +489,54 @@ def render_ad_text(kit: Any) -> str:
479
  return "\n".join(lines).rstrip()
480
 
481
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
482
  def format_json_time(seconds: float) -> str:
483
  hours, minutes, secs, millis = _split_time(seconds)
484
  return f"{hours:02d}:{minutes:02d}:{secs:02d}.{millis:03d}"
@@ -518,6 +576,10 @@ def _split_time(seconds: float) -> tuple[int, int, int, int]:
518
  return hours, minutes, secs, millis
519
 
520
 
 
 
 
 
521
  def _objects_from_response(response: dict[str, Any]) -> list[Any]:
522
  objects = response.get("objects_detected")
523
  if isinstance(objects, list):
 
143
  raise ValueError(f"unsupported output format: {output_format}")
144
 
145
 
146
def render_score(report: Any, output_format: str) -> str:
    """Render a score report in the requested output format.

    ``json`` serialises ``report.json_dict()`` with two-space indentation;
    ``text`` and ``flagged`` both use the plain-text renderer, the latter
    restricted to flagged descriptions.

    Raises:
        ValueError: if ``output_format`` is not one of the three supported values.
    """
    if output_format == "json":
        return json.dumps(report.json_dict(), indent=2)
    if output_format in ("text", "flagged"):
        return render_score_text(report, flagged_only=output_format == "flagged")
    raise ValueError(f"unsupported output format: {output_format}")
154
+
155
+
156
  def result_from_api_response(response: dict[str, Any], timestamp: float, duration: float) -> DescriptionResult:
157
  return DescriptionResult(
158
  timecode=format_json_time(timestamp),
 
489
  return "\n".join(lines).rstrip()
490
 
491
 
492
def render_score_text(report: Any, flagged_only: bool = False) -> str:
    """Render a score report as human-readable text.

    The output starts with a summary header and the aggregate scores, followed
    by one entry per description (time range, overall score, word count and
    status, then the description itself). Flagged entries that carry a reason
    get an extra detail line with the per-dimension scores.

    Args:
        report: Object exposing the ``ScoreReport`` attributes read below.
        flagged_only: When true, list only descriptions whose ``flag`` is set.

    Returns:
        The report text with trailing whitespace removed.
    """
    if flagged_only:
        shown = [entry for entry in report.scores if entry.flag]
    else:
        shown = list(report.scores)
    totals = report.aggregate

    out = [
        "AD Quality Score Report",
        f"Source: {report.source} | Manifest: {report.manifest}",
        (
            f"Scored: {report.scored} descriptions | Flagged: {report.flagged} | "
            f"Grade: {report.grade} | GPT cost: ${report.gpt_cost_estimate:.3f}"
        ),
        "",
        "Aggregate",
        f" Accuracy: {totals.accuracy:.1f}/10",
        f" Relevance: {totals.relevance:.1f}/10",
        f" WCAG Compliance: {totals.wcag_compliance:.1f}/10",
        f" Conciseness: {totals.conciseness:.1f}/10",
        f" Overall: {totals.overall:.1f}/10",
        (
            f" Within limit: {totals.within_limit_pct:.1f}% | "
            f"Present tense: {totals.tense_ok_pct:.1f}%"
        ),
        "",
    ]

    # Nothing to list: emit the placeholder that matches the active filter.
    if not shown:
        out.append("No flagged descriptions." if flagged_only else "No descriptions scored.")
        out.append("")
        return "\n".join(out).rstrip()

    detail_fields = ("accuracy", "relevance", "wcag_compliance", "conciseness")
    for entry in shown:
        marker = "✗ FLAGGED" if entry.flag else "✓"
        span = f"[{format_gap_time(entry.start_sec)}] → [{format_gap_time(entry.end_sec)}]"
        out.append(
            f"{span} overall={_format_brief_score(entry.overall)} words={entry.word_count} {marker}"
        )
        out.append(entry.description)
        if entry.flag and entry.flag_reason:
            breakdown = ", ".join(
                f"{name}={_format_brief_score(getattr(entry, name))}" for name in detail_fields
            )
            out.append(f" ↳ {breakdown} — {entry.flag_reason}")
        out.append("")

    return "\n".join(out).rstrip()
538
+
539
+
540
  def format_json_time(seconds: float) -> str:
541
  hours, minutes, secs, millis = _split_time(seconds)
542
  return f"{hours:02d}:{minutes:02d}:{secs:02d}.{millis:03d}"
 
576
  return hours, minutes, secs, millis
577
 
578
 
579
+ def _format_brief_score(value: float) -> str:
580
+ return f"{value:.1f}".rstrip("0").rstrip(".")
581
+
582
+
583
  def _objects_from_response(response: dict[str, Any]) -> list[Any]:
584
  objects = response.get("objects_detected")
585
  if isinstance(objects, list):
cli/vn/score.py ADDED
@@ -0,0 +1,534 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import mimetypes
5
+ import os
6
+ import sys
7
+ import tempfile
8
+ from dataclasses import dataclass
9
+ from pathlib import Path
10
+ from typing import Any
11
+
12
+ import httpx
13
+
14
+ from .api import encode_file_base64
15
+ from .frame import FrameExtractionError, extract_frames_at
16
+
17
+
18
+ SCORE_PROMPT_TEMPLATE = """You are an expert audio description quality reviewer. You will be shown a video frame and an audio description that was written to describe it.
19
+
20
+ Score the description on each dimension from 0 to 10:
21
+
22
+ - accuracy: Does the description correctly describe what is visually present in the frame?
23
+ - relevance: Does it focus on information that is important for understanding the content (not background clutter)?
24
+ - wcag_compliance: Is it factual, objective, present tense, and free of emotional interpretation?
25
+ - conciseness: Is it appropriately brief without omitting critical information?
26
+
27
+ Also check:
28
+ - word_count: Count the words in the description
29
+ - tense_ok: true if the description uses present tense throughout
30
+ - within_limit: true if word_count <= {word_limit}
31
+
32
+ Respond with JSON only, no explanation:
33
+ {{
34
+ "accuracy": <0-10>,
35
+ "relevance": <0-10>,
36
+ "wcag_compliance": <0-10>,
37
+ "conciseness": <0-10>,
38
+ "overall": <0-10>,
39
+ "word_count": <int>,
40
+ "tense_ok": <bool>,
41
+ "within_limit": <bool>,
42
+ "flag": <bool - true if any dimension < 6 or within_limit is false>,
43
+ "flag_reason": "<short explanation if flag is true, else null>"
44
+ }}
45
+
46
+ Audio description:
47
+ {description}
48
+ """
49
+
50
# Chat Completions endpoint that every scoring request is POSTed to.
OPENAI_URL = "https://api.openai.com/v1/chat/completions"
# Model name sent in the "model" field of each scoring request.
OPENAI_MODEL = "gpt-4o"
# Flat cost recorded on every scored description and summed into the report's
# gpt_cost_estimate.
# NOTE(review): presumably USD per low-detail image request — confirm against current pricing.
SCORE_COST_PER_FRAME = 0.0013

# (minimum aggregate overall score, letter grade) pairs, ordered from highest
# to lowest: _grade_for_score returns the grade of the first threshold met.
GRADE_THRESHOLDS = [
    (9.0, "A"),
    (8.0, "B+"),
    (7.0, "B"),
    (6.0, "C"),
    (0.0, "F"),
]
61
+
62
+
63
class ScoreError(RuntimeError):
    """Raised when GPT-4o scoring fails or returns invalid JSON.

    Within this module it is also raised when ``OPENAI_API_KEY`` is not set,
    when the HTTP request fails, and when a field of the model's reply is
    missing, has the wrong type, or lies outside the 0-10 range.
    """
65
+
66
+
67
@dataclass(frozen=True)
class DescriptionScore:
    """Judge result for a single narration, plus the narration's own metadata."""

    srt_index: int
    start_sec: float
    end_sec: float
    frame_timestamp_sec: float
    description: str
    accuracy: float
    relevance: float
    wcag_compliance: float
    conciseness: float
    overall: float
    word_count: int
    tense_ok: bool
    within_limit: bool
    flag: bool
    flag_reason: str | None
    gpt_cost: float

    def json_dict(self) -> dict[str, Any]:
        """Return a JSON-serialisable dict in field order.

        Timestamps and scores are rounded to three decimals and the cost to six.
        """
        payload: dict[str, Any] = {"srt_index": self.srt_index}
        for key in ("start_sec", "end_sec", "frame_timestamp_sec"):
            payload[key] = round(getattr(self, key), 3)
        payload["description"] = self.description
        for key in ("accuracy", "relevance", "wcag_compliance", "conciseness", "overall"):
            payload[key] = round(getattr(self, key), 3)
        payload["word_count"] = self.word_count
        payload["tense_ok"] = self.tense_ok
        payload["within_limit"] = self.within_limit
        payload["flag"] = self.flag
        payload["flag_reason"] = self.flag_reason
        payload["gpt_cost"] = round(self.gpt_cost, 6)
        return payload
105
+
106
+
107
@dataclass(frozen=True)
class ScoreAggregate:
    """Mean dimension scores and pass percentages across all scored descriptions."""

    accuracy: float
    relevance: float
    wcag_compliance: float
    conciseness: float
    overall: float
    within_limit_pct: float
    tense_ok_pct: float

    def json_dict(self) -> dict[str, Any]:
        """Return every field rounded to three decimals, in field order."""
        ordered_fields = (
            "accuracy",
            "relevance",
            "wcag_compliance",
            "conciseness",
            "overall",
            "within_limit_pct",
            "tense_ok_pct",
        )
        return {key: round(getattr(self, key), 3) for key in ordered_fields}
127
+
128
+
129
@dataclass(frozen=True)
class ScoreReport:
    """Complete result of a scoring run: summary counts, aggregate, grade and per-description scores."""

    source: str
    manifest: str
    scored: int
    flagged: int
    word_limit: int
    aggregate: ScoreAggregate
    grade: str
    gpt_cost_estimate: float
    scores: list[DescriptionScore]

    def json_dict(self) -> dict[str, Any]:
        """Return a JSON-serialisable dict, delegating to the nested objects' ``json_dict``.

        The cost estimate is rounded to six decimals.
        """
        payload: dict[str, Any] = {
            "source": self.source,
            "manifest": self.manifest,
            "scored": self.scored,
            "flagged": self.flagged,
            "word_limit": self.word_limit,
        }
        payload["aggregate"] = self.aggregate.json_dict()
        payload["grade"] = self.grade
        payload["gpt_cost_estimate"] = round(self.gpt_cost_estimate, 6)
        payload["scores"] = [entry.json_dict() for entry in self.scores]
        return payload
153
+
154
+
155
# One validated entry from the manifest's "narrations" list, as built by
# _load_narrations.
@dataclass(frozen=True)
class _ManifestNarration:
    # Value of the entry's "srt_index", or its 1-based position when absent.
    srt_index: int
    # Start and end of the narration, taken from "start_sec" / "end_sec".
    start_sec: float
    end_sec: float
    # Timestamp of the frame to judge; the midpoint of start/end when the
    # manifest entry does not provide "frame_timestamp_sec".
    frame_timestamp_sec: float
    # Narration text with surrounding whitespace stripped; never empty.
    description: str
162
+
163
+
164
def score_manifest(
    manifest_path: Path,
    video_source: Path,
    word_limit: int | None = None,
    min_score: float = 6.0,
    output_dir: Path | None = None,
    source_label: str | None = None,
    manifest_label: str | None = None,
) -> ScoreReport:
    """Score every narration in a manifest against the video and write a report.

    For each narration a frame is extracted at its frame timestamp and judged
    by ``_score_description``. The resulting report is written to
    ``score-report.json`` inside the output directory and also returned.

    Args:
        manifest_path: Path to the manifest JSON file.
        video_source: Local video file the frames are extracted from.
        word_limit: Word limit passed to the judge; detected from the manifest
            when ``None``.
        min_score: Threshold (0-10) below which a dimension flags a description.
        output_dir: Directory for ``score-report.json``; defaults to
            ``./vn-score-output`` and is created if missing.
        source_label: Text stored as the report's ``source``; defaults to
            ``str(video_source)``.
        manifest_label: Text stored as the report's ``manifest``; defaults to
            ``str(manifest_path)``.

    Returns:
        The assembled ``ScoreReport``. With no scored narrations the aggregate
        is all zeros, which maps to grade ``F``.

    Raises:
        ValueError: for an out-of-range ``min_score``, a non-positive
            ``word_limit``, or an invalid manifest.
        FileNotFoundError: if the manifest does not exist.
        ScoreError: propagated from ``_score_description``.
    """
    # The messages name the CLI flags because these errors are shown to CLI users.
    if min_score < 0 or min_score > 10:
        raise ValueError("--min-score must be between 0 and 10")
    if word_limit is not None and word_limit <= 0:
        raise ValueError("--word-limit must be greater than 0")

    resolved_manifest_path = manifest_path.expanduser()
    if not resolved_manifest_path.exists():
        raise FileNotFoundError(f"manifest not found: {manifest_path}")

    manifest_data = _load_manifest(resolved_manifest_path)
    narrations = _load_narrations(manifest_data)
    # `or` is safe here: a word_limit of 0 was already rejected above.
    resolved_word_limit = word_limit or _detect_word_limit(manifest_data)
    requested_output_dir = output_dir or Path("./vn-score-output")
    resolved_output_dir = requested_output_dir.expanduser()
    resolved_output_dir.mkdir(parents=True, exist_ok=True)

    scores: list[DescriptionScore] = []
    # Extracted frames live in a temporary directory that is removed afterwards.
    with tempfile.TemporaryDirectory(prefix="vn-score-frames-") as tmp:
        frame_root = Path(tmp)
        for narration in narrations:
            try:
                # One sub-directory per narration, named after its zero-padded srt_index.
                frames = extract_frames_at(
                    video_source,
                    [narration.frame_timestamp_sec],
                    frame_root / f"{narration.srt_index:05d}",
                )
            except FrameExtractionError as exc:
                # Best effort: a frame that cannot be extracted skips only this
                # narration; the warning goes to stderr so stdout stays clean.
                print(
                    (
                        f"Warning: skipping narration {narration.srt_index} at "
                        f"{narration.frame_timestamp_sec:.3f}s: {exc}"
                    ),
                    file=sys.stderr,
                )
                continue
            # NOTE(review): assumes extract_frames_at returns at least one frame
            # per requested timestamp — confirm in .frame.
            score = _score_description(
                frames[0].path,
                narration,
                word_limit=resolved_word_limit,
                min_score=min_score,
            )
            scores.append(score)

    aggregate = _aggregate_scores(scores)
    flagged = sum(1 for score in scores if score.flag)
    report = ScoreReport(
        source=source_label or str(video_source),
        manifest=manifest_label or str(manifest_path),
        scored=len(scores),
        flagged=flagged,
        word_limit=resolved_word_limit,
        aggregate=aggregate,
        grade=_grade_for_score(aggregate.overall),
        gpt_cost_estimate=sum(score.gpt_cost for score in scores),
        scores=scores,
    )

    report_path = resolved_output_dir / "score-report.json"
    report_path.write_text(json.dumps(report.json_dict(), indent=2), encoding="utf-8")
    return report
233
+
234
+
235
+ def _load_manifest(manifest_path: Path) -> dict[str, Any]:
236
+ try:
237
+ data = json.loads(manifest_path.read_text(encoding="utf-8"))
238
+ except ValueError as exc:
239
+ raise ValueError(f"manifest.json is not valid JSON: {manifest_path}") from exc
240
+ if not isinstance(data, dict):
241
+ raise ValueError("manifest.json must contain a JSON object")
242
+ return data
243
+
244
+
245
def _load_narrations(manifest_data: dict[str, Any]) -> list[_ManifestNarration]:
    """Validate the manifest's ``narrations`` list and convert it to narration records.

    Each entry must be an object with numeric ``start_sec`` / ``end_sec`` and a
    non-empty string ``description``. ``frame_timestamp_sec`` defaults to the
    midpoint of the narration and ``srt_index`` to the entry's 1-based position.

    Raises:
        ValueError: if the field is missing, is not a list, or any entry is invalid.
    """
    raw_entries = manifest_data.get("narrations")
    if raw_entries is None:
        raise ValueError("manifest.json has no narrations field")
    if not isinstance(raw_entries, list):
        raise ValueError("manifest.json narrations field must be a list")

    parsed: list[_ManifestNarration] = []
    for position, entry in enumerate(raw_entries, start=1):
        if not isinstance(entry, dict):
            raise ValueError(f"manifest narration {position} must be an object")

        begin = _coerce_float(entry.get("start_sec"), f"narration {position} start_sec")
        finish = _coerce_float(entry.get("end_sec"), f"narration {position} end_sec")

        raw_frame_ts = entry.get("frame_timestamp_sec")
        if raw_frame_ts is None:
            # No explicit frame timestamp: judge the frame at the narration midpoint.
            frame_ts = (begin + finish) / 2
        else:
            frame_ts = _coerce_float(raw_frame_ts, f"narration {position} frame_timestamp_sec")

        text = entry.get("description")
        if not (isinstance(text, str) and text.strip()):
            raise ValueError(f"manifest narration {position} description must be a non-empty string")

        raw_srt_index = entry.get("srt_index", position)
        # bool is an int subclass, so it is rejected explicitly before int().
        if isinstance(raw_srt_index, bool):
            raise ValueError(f"manifest narration {position} srt_index must be an integer")
        try:
            srt_number = int(raw_srt_index)
        except (TypeError, ValueError) as exc:
            raise ValueError(f"manifest narration {position} srt_index must be an integer") from exc

        record = _ManifestNarration(
            srt_index=srt_number,
            start_sec=begin,
            end_sec=finish,
            frame_timestamp_sec=frame_ts,
            description=text.strip(),
        )
        parsed.append(record)
    return parsed
284
+
285
+
286
+ def _detect_word_limit(manifest_data: dict[str, Any]) -> int:
287
+ explicit_limit = manifest_data.get("word_limit")
288
+ if explicit_limit is not None:
289
+ try:
290
+ parsed_limit = int(explicit_limit)
291
+ except (TypeError, ValueError) as exc:
292
+ raise ValueError("manifest word_limit must be an integer") from exc
293
+ if parsed_limit <= 0:
294
+ raise ValueError("manifest word_limit must be greater than 0")
295
+ return parsed_limit
296
+ if "compliance_level" in manifest_data:
297
+ return 30
298
+ return 60
299
+
300
+
301
def _score_description(
    frame_path: Path,
    narration: _ManifestNarration,
    word_limit: int,
    min_score: float,
) -> DescriptionScore:
    """Ask GPT-4o to judge one narration against its extracted frame.

    Sends the scoring prompt together with the frame (as a base64 data URL) to
    the Chat Completions endpoint, validates the JSON reply and converts it
    into a ``DescriptionScore``.

    Args:
        frame_path: Image file of the frame to judge.
        narration: The narration whose description is being scored.
        word_limit: Limit substituted into the prompt for the ``within_limit`` check.
        min_score: Threshold used locally to decide whether the result is flagged.

    Raises:
        ScoreError: if ``OPENAI_API_KEY`` is unset, the request fails, or the
            reply is malformed.
    """
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        raise ScoreError("OPENAI_API_KEY is not set.")

    # Fall back to JPEG when the frame's file extension is not recognised.
    mime_type = mimetypes.guess_type(frame_path.name)[0] or "image/jpeg"
    prompt = SCORE_PROMPT_TEMPLATE.format(
        word_limit=word_limit,
        description=narration.description,
    )
    payload = {
        "model": OPENAI_MODEL,
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {
                        "type": "image_url",
                        "image_url": {
                            # NOTE(review): encode_file_base64 presumably returns the
                            # file's base64 text — confirm in .api.
                            "url": f"data:{mime_type};base64,{encode_file_base64(frame_path)}",
                            "detail": "low",
                        },
                    },
                ],
            }
        ],
        "response_format": {"type": "json_object"},
        "temperature": 0,
        "max_tokens": 250,
    }

    try:
        with httpx.Client(timeout=120.0, follow_redirects=True) as client:
            response = client.post(
                OPENAI_URL,
                json=payload,
                headers={
                    "Authorization": f"Bearer {api_key}",
                    "Content-Type": "application/json",
                },
            )
            response.raise_for_status()
    except httpx.HTTPStatusError as exc:
        raise ScoreError(f"OpenAI API error {exc.response.status_code}: {exc.response.text}") from exc
    except httpx.RequestError as exc:
        raise ScoreError(f"OpenAI request failed: {exc}") from exc

    try:
        data = response.json()
    except ValueError as exc:
        # Only the first 300 characters of the body are quoted in the error.
        raise ScoreError(f"OpenAI returned invalid JSON: {response.text[:300]}") from exc
    if not isinstance(data, dict):
        raise ScoreError("OpenAI returned a non-object response.")

    raw_content = _assistant_text_from_response(data).strip()
    score_payload = _parse_score_payload(raw_content)
    accuracy = _bounded_score(score_payload.get("accuracy"), "accuracy")
    relevance = _bounded_score(score_payload.get("relevance"), "relevance")
    wcag_compliance = _bounded_score(score_payload.get("wcag_compliance"), "wcag_compliance")
    conciseness = _bounded_score(score_payload.get("conciseness"), "conciseness")
    overall = _bounded_score(score_payload.get("overall"), "overall")
    # NOTE(review): word_count and within_limit are taken from the model's reply,
    # not computed from narration.description — confirm this is intended.
    word_count = _coerce_int(score_payload.get("word_count"), "word_count")
    tense_ok = _coerce_bool(score_payload.get("tense_ok"), "tense_ok")
    within_limit = _coerce_bool(score_payload.get("within_limit"), "within_limit")

    dimension_scores = {
        "accuracy": accuracy,
        "relevance": relevance,
        "wcag_compliance": wcag_compliance,
        "conciseness": conciseness,
        "overall": overall,
    }
    # The model's own "flag" field is never read: the flag is recomputed here so
    # that it honours the caller's min_score rather than the prompt's fixed 6.
    flag = any(value < min_score for value in dimension_scores.values()) or not within_limit
    raw_flag_reason = score_payload.get("flag_reason")
    flag_reason = _build_flag_reason(
        dimension_scores,
        within_limit,
        min_score,
        raw_flag_reason if isinstance(raw_flag_reason, str) else None,
    )

    return DescriptionScore(
        srt_index=narration.srt_index,
        start_sec=narration.start_sec,
        end_sec=narration.end_sec,
        frame_timestamp_sec=narration.frame_timestamp_sec,
        description=narration.description,
        accuracy=accuracy,
        relevance=relevance,
        wcag_compliance=wcag_compliance,
        conciseness=conciseness,
        overall=overall,
        word_count=word_count,
        tense_ok=tense_ok,
        within_limit=within_limit,
        flag=flag,
        # A reason is only kept when the description is actually flagged.
        flag_reason=flag_reason if flag else None,
        gpt_cost=SCORE_COST_PER_FRAME,
    )
406
+
407
+
408
+ def _assistant_text_from_response(data: dict[str, Any]) -> str:
409
+ choices = data.get("choices")
410
+ if not isinstance(choices, list) or not choices:
411
+ raise ScoreError("OpenAI response did not include choices.")
412
+
413
+ message = choices[0].get("message")
414
+ if not isinstance(message, dict):
415
+ raise ScoreError("OpenAI response did not include a valid message.")
416
+
417
+ content = message.get("content")
418
+ if isinstance(content, str):
419
+ return content
420
+ if isinstance(content, list):
421
+ text_parts: list[str] = []
422
+ for item in content:
423
+ if not isinstance(item, dict):
424
+ continue
425
+ text = item.get("text")
426
+ if isinstance(text, str) and text.strip():
427
+ text_parts.append(text.strip())
428
+ return " ".join(text_parts).strip()
429
+ raise ScoreError("OpenAI response content was not text.")
430
+
431
+
432
+ def _parse_score_payload(raw_content: str) -> dict[str, Any]:
433
+ normalized = raw_content.strip()
434
+ if normalized.startswith("```"):
435
+ normalized = normalized.strip("`")
436
+ if normalized.startswith("json"):
437
+ normalized = normalized[4:].strip()
438
+ try:
439
+ data = json.loads(normalized)
440
+ except ValueError as exc:
441
+ raise ScoreError(f"GPT-4o returned non-JSON scoring output: {raw_content}") from exc
442
+ if not isinstance(data, dict):
443
+ raise ScoreError(f"GPT-4o returned non-object scoring output: {raw_content}")
444
+ return data
445
+
446
+
447
def _aggregate_scores(scores: list[DescriptionScore]) -> ScoreAggregate:
    """Average the per-description scores into a single aggregate.

    Dimension fields are arithmetic means; ``within_limit_pct`` and
    ``tense_ok_pct`` are the percentage of descriptions passing each check.
    An empty list yields an aggregate of all zeros.
    """
    if not scores:
        return ScoreAggregate(
            accuracy=0.0,
            relevance=0.0,
            wcag_compliance=0.0,
            conciseness=0.0,
            overall=0.0,
            within_limit_pct=0.0,
            tense_ok_pct=0.0,
        )

    count = float(len(scores))

    def mean_of(attr: str) -> float:
        return sum(getattr(item, attr) for item in scores) / count

    def pct_true(attr: str) -> float:
        return 100.0 * sum(1 for item in scores if getattr(item, attr)) / count

    return ScoreAggregate(
        accuracy=mean_of("accuracy"),
        relevance=mean_of("relevance"),
        wcag_compliance=mean_of("wcag_compliance"),
        conciseness=mean_of("conciseness"),
        overall=mean_of("overall"),
        within_limit_pct=pct_true("within_limit"),
        tense_ok_pct=pct_true("tense_ok"),
    )
469
+
470
+
471
def _grade_for_score(overall: float) -> str:
    """Map an aggregate overall score to its letter grade.

    Returns the grade of the first ``GRADE_THRESHOLDS`` entry whose threshold
    the score meets, or ``"F"`` when none matches.
    """
    matching = (grade for threshold, grade in GRADE_THRESHOLDS if overall >= threshold)
    return next(matching, "F")
476
+
477
+
478
+ def _coerce_float(value: Any, field_name: str) -> float:
479
+ if isinstance(value, bool):
480
+ raise ValueError(f"{field_name} must be a number")
481
+ try:
482
+ return float(value)
483
+ except (TypeError, ValueError) as exc:
484
+ raise ValueError(f"{field_name} must be a number") from exc
485
+
486
+
487
+ def _coerce_int(value: Any, field_name: str) -> int:
488
+ if isinstance(value, bool):
489
+ raise ScoreError(f"GPT-4o returned invalid {field_name}: {value!r}")
490
+ try:
491
+ return int(value)
492
+ except (TypeError, ValueError) as exc:
493
+ raise ScoreError(f"GPT-4o returned invalid {field_name}: {value!r}") from exc
494
+
495
+
496
+ def _coerce_bool(value: Any, field_name: str) -> bool:
497
+ if isinstance(value, bool):
498
+ return value
499
+ raise ScoreError(f"GPT-4o returned invalid {field_name}: {value!r}")
500
+
501
+
502
def _bounded_score(value: Any, field_name: str) -> float:
    """Validate a dimension score from the model's reply.

    Args:
        value: Raw value from the parsed reply.
        field_name: Name of the dimension, used in error messages.

    Returns:
        The score as a float in the inclusive range 0-10.

    Raises:
        ScoreError: if *value* is not a number or is not within 0-10 (NaN included).
    """
    try:
        score = _coerce_float(value, field_name)
    except ValueError as exc:
        raise ScoreError(f"GPT-4o returned invalid {field_name}: {value!r}") from exc
    # A positive range check is used so NaN is rejected too: NaN compares false
    # to everything and would pass `score < 0 or score > 10`.
    if not 0 <= score <= 10:
        raise ScoreError(f"GPT-4o returned {field_name} outside 0-10: {score!r}")
    return score
510
+
511
+
512
+ def _build_flag_reason(
513
+ dimension_scores: dict[str, float],
514
+ within_limit: bool,
515
+ min_score: float,
516
+ raw_flag_reason: str | None,
517
+ ) -> str | None:
518
+ reasons: list[str] = []
519
+ low_dimensions = [name for name, value in dimension_scores.items() if value < min_score]
520
+ if low_dimensions:
521
+ reasons.append(f"below threshold on {', '.join(low_dimensions)} (< {_format_score(min_score)})")
522
+ if not within_limit:
523
+ reasons.append("exceeds word limit")
524
+ if raw_flag_reason:
525
+ normalized = raw_flag_reason.strip()
526
+ if normalized and normalized not in reasons:
527
+ reasons.append(normalized)
528
+ if not reasons:
529
+ return None
530
+ return " - ".join(reasons)
531
+
532
+
533
+ def _format_score(value: float) -> str:
534
+ return f"{value:.1f}".rstrip("0").rstrip(".")