nickname-xingxing committed on
Commit
8c9473c
·
verified ·
1 Parent(s): fd0fa93

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +58 -12
app.py CHANGED
@@ -18,6 +18,7 @@ ANNOTATION_DIR = ROOT / "annotations"
18
  DATASET_REPO_ID = os.environ.get("ANNOTATION_DATASET_REPO", "nickname-xingxing/filter_data")
19
  HF_TOKEN = os.environ.get("HF_TOKEN")
20
  HF_API = HfApi(token=HF_TOKEN) if HF_TOKEN else None
 
21
 
22
 
23
  def resolve_image_fs_path(record: dict) -> str:
@@ -214,13 +215,14 @@ def build_app():
214
  with gr.Blocks(title="过滤数据人工审核", theme=gr.themes.Soft()) as demo:
215
  gr.Markdown("# 过滤数据人工审核")
216
  gr.Markdown(
217
- "请先填写标注人名称,然后判断图片是否符合给定 prompt耗时大约10分钟"
218
  )
219
 
220
  annotator = gr.Textbox(label="标注人名称", value="", placeholder="请先输入姓名/昵称")
221
  # 当前样本在 records 中的下标(全局),不是「在当前列表里的第几个」
222
  cursor_record_idx = gr.State(0)
223
  annotations_state = gr.State({})
 
224
 
225
  with gr.Row():
226
  dataset_filter = gr.Dropdown(dataset_choices, value="all", label="数据源")
@@ -229,6 +231,7 @@ def build_app():
229
  refresh_btn = gr.Button("应用筛选", variant="primary")
230
 
231
  progress_md = gr.Markdown()
 
232
 
233
  with gr.Row():
234
  with gr.Column(scale=1):
@@ -249,6 +252,9 @@ def build_app():
249
 
250
  save_status = gr.Textbox(label="保存状态", interactive=False)
251
 
 
 
 
252
  def validate_annotator_name(name: str) -> str:
253
  who = (name or "").strip()
254
  return who
@@ -256,18 +262,18 @@ def build_app():
256
  def bootstrap(annotator_name):
257
  annotator_name = validate_annotator_name(annotator_name)
258
  if not annotator_name:
259
- return {}, 0, None, "请先输入标注人名称", "unsure", "", "0 / 0", "### 标注进度\n\n- 请先输入标注人名称", "请先输入标注人名称后再开始标注"
260
  annotation_file = ANNOTATION_DIR / f"{annotator_name}.jsonl"
261
  annotations = load_existing_annotations(annotation_file)
262
  pool = resolve_filters(records, "all", "all", "all", annotations)
263
  if not pool:
264
- return annotations, 0, None, "", "unsure", "", "0 / 0", build_stats(records, annotations), ""
265
  cursor = pool[0]
266
  record = records[cursor]
267
  rendered = render_record(
268
  record, annotations.get(record["sample_id"]), f"1 / {len(pool)}"
269
  )
270
- return annotations, int(cursor), *rendered, build_stats(records, annotations), ""
271
 
272
  def refresh_pool(dataset_value, split_value, status_value, annotations, annotator_name):
273
  annotator_name = validate_annotator_name(annotator_name)
@@ -275,12 +281,14 @@ def build_app():
275
  return (
276
  {},
277
  0,
 
278
  None,
279
  "请先输入标注人名称",
280
  "unsure",
281
  "",
282
  "0 / 0",
283
  "### 标注进度\n\n- 请先输入标注人名称",
 
284
  "请先输入标注人名称后再开始标注",
285
  )
286
  ann_file = ANNOTATION_DIR / f"{annotator_name}.jsonl"
@@ -290,12 +298,14 @@ def build_app():
290
  return (
291
  annotations,
292
  0,
 
293
  None,
294
  "",
295
  "unsure",
296
  "",
297
  "0 / 0",
298
  build_stats(records, annotations),
 
299
  "没有可显示的样本",
300
  )
301
  cursor = pool[0]
@@ -306,8 +316,10 @@ def build_app():
306
  return (
307
  annotations,
308
  int(cursor),
 
309
  *rendered,
310
  build_stats(records, annotations),
 
311
  "筛选条件已更新",
312
  )
313
 
@@ -342,6 +354,7 @@ def build_app():
342
  decision,
343
  note_value,
344
  annotations,
 
345
  annotator_name,
346
  dataset_value,
347
  split_value,
@@ -352,12 +365,14 @@ def build_app():
352
  return (
353
  annotations or {},
354
  0,
 
355
  None,
356
  "请先输入标注人名称",
357
  "unsure",
358
  "",
359
  "0 / 0",
360
  build_stats(records, annotations or {}),
 
361
  "请先输入标注人名称后再保存",
362
  )
363
  annotation_file = ANNOTATION_DIR / f"{annotator_name}.jsonl"
@@ -367,12 +382,14 @@ def build_app():
367
  return (
368
  merged,
369
  0,
 
370
  None,
371
  "",
372
  "unsure",
373
  "",
374
  "0 / 0",
375
  build_stats(records, merged),
 
376
  "没有可保存的样本",
377
  )
378
  c = normalize_cursor(cursor, old_pool)
@@ -399,24 +416,39 @@ def build_app():
399
  return (
400
  merged,
401
  int(cur),
 
402
  *rendered,
403
  build_stats(records, merged),
 
404
  f"写入失败(请检查磁盘是否可写): {e}",
405
  )
406
  annotations = to_write
 
 
 
 
 
 
407
  new_pool = resolve_filters(records, dataset_value, split_value, status_value, annotations)
408
  stats = build_stats(records, annotations)
409
  if not new_pool:
 
 
 
 
 
410
  return (
411
  annotations,
412
  0,
 
413
  None,
414
  "",
415
  "unsure",
416
  "",
417
  "0 / 0",
418
  stats,
419
- f"已保存 {record['sample_id']}(仅本地缓存,点击“同步到HF”再上传)",
 
420
  )
421
  if record_idx in new_pool:
422
  new_cursor = record_idx
@@ -434,21 +466,26 @@ def build_app():
434
  return (
435
  annotations,
436
  int(new_cursor),
 
437
  *rendered,
438
  stats,
439
- f"已保存 {record['sample_id']}(仅本地缓存,点击“同步到HF”再上传)",
 
440
  )
441
 
442
- def sync_current_annotations(annotations, annotator_name):
443
  annotator_name = validate_annotator_name(annotator_name)
444
  if not annotator_name:
445
- return annotations or {}, build_stats(records, annotations or {}), "请先输入标注人名称后再同步"
446
  annotation_file = ANNOTATION_DIR / f"{annotator_name}.jsonl"
447
  merged = merge_annotations_from_disk(annotation_file, annotations)
448
  if not merged:
449
- return merged, build_stats(records, merged), "当前没有可同步的标注"
 
 
450
  sync_msg = sync_annotation_to_hf(annotation_file, annotator_name)
451
- return merged, build_stats(records, merged), sync_msg
 
452
 
453
  annotator.change(
454
  bootstrap,
@@ -456,12 +493,14 @@ def build_app():
456
  outputs=[
457
  annotations_state,
458
  cursor_record_idx,
 
459
  image,
460
  prompt_box,
461
  judgment,
462
  note,
463
  position_box,
464
  progress_md,
 
465
  save_status,
466
  ],
467
  )
@@ -472,12 +511,14 @@ def build_app():
472
  outputs=[
473
  annotations_state,
474
  cursor_record_idx,
 
475
  image,
476
  prompt_box,
477
  judgment,
478
  note,
479
  position_box,
480
  progress_md,
 
481
  save_status,
482
  ],
483
  )
@@ -515,6 +556,7 @@ def build_app():
515
  judgment,
516
  note,
517
  annotations_state,
 
518
  annotator,
519
  dataset_filter,
520
  split_filter,
@@ -523,20 +565,22 @@ def build_app():
523
  outputs=[
524
  annotations_state,
525
  cursor_record_idx,
 
526
  image,
527
  prompt_box,
528
  judgment,
529
  note,
530
  position_box,
531
  progress_md,
 
532
  save_status,
533
  ],
534
  )
535
 
536
  sync_btn.click(
537
  sync_current_annotations,
538
- inputs=[annotations_state, annotator],
539
- outputs=[annotations_state, progress_md, save_status],
540
  )
541
 
542
  demo.load(
@@ -544,12 +588,14 @@ def build_app():
544
  outputs=[
545
  annotations_state,
546
  cursor_record_idx,
 
547
  image,
548
  prompt_box,
549
  judgment,
550
  note,
551
  position_box,
552
  progress_md,
 
553
  save_status,
554
  ],
555
  )
 
18
  DATASET_REPO_ID = os.environ.get("ANNOTATION_DATASET_REPO", "nickname-xingxing/filter_data")
19
  HF_TOKEN = os.environ.get("HF_TOKEN")
20
  HF_API = HfApi(token=HF_TOKEN) if HF_TOKEN else None
21
+ AUTO_SYNC_EVERY = 10
22
 
23
 
24
  def resolve_image_fs_path(record: dict) -> str:
 
215
  with gr.Blocks(title="过滤数据人工审核", theme=gr.themes.Soft()) as demo:
216
  gr.Markdown("# 过滤数据人工审核")
217
  gr.Markdown(
218
+ "请先填写标注人名称,然后判断图片是否符合给定 promp全部标注完成后请同步到HF上面"
219
  )
220
 
221
  annotator = gr.Textbox(label="标注人名称", value="", placeholder="请先输入姓名/昵称")
222
  # 当前样本在 records 中的下标(全局),不是「在当前列表里的第几个」
223
  cursor_record_idx = gr.State(0)
224
  annotations_state = gr.State({})
225
+ unsynced_count_state = gr.State(0)
226
 
227
  with gr.Row():
228
  dataset_filter = gr.Dropdown(dataset_choices, value="all", label="数据源")
 
231
  refresh_btn = gr.Button("应用筛选", variant="primary")
232
 
233
  progress_md = gr.Markdown()
234
+ sync_md = gr.Markdown("### 同步状态\n\n- 未同步修改: `0`\n- 建议标注一批后再点击一次“同步到HF”。")
235
 
236
  with gr.Row():
237
  with gr.Column(scale=1):
 
252
 
253
  save_status = gr.Textbox(label="保存状态", interactive=False)
254
 
255
+ def sync_status_text(n: int) -> str:
256
+ return f"### 同步状态\n\n- 未同步修改: `{n}`\n- 系统会在累计达到 `{AUTO_SYNC_EVERY}` 条后自动同步一次。"
257
+
258
  def validate_annotator_name(name: str) -> str:
259
  who = (name or "").strip()
260
  return who
 
262
  def bootstrap(annotator_name):
263
  annotator_name = validate_annotator_name(annotator_name)
264
  if not annotator_name:
265
+ return {}, 0, 0, None, "请先输入标注人名称", "unsure", "", "0 / 0", "### 标注进度\n\n- 请先输入标注人名称", sync_status_text(0), "请先输入标注人名称后再开始标注"
266
  annotation_file = ANNOTATION_DIR / f"{annotator_name}.jsonl"
267
  annotations = load_existing_annotations(annotation_file)
268
  pool = resolve_filters(records, "all", "all", "all", annotations)
269
  if not pool:
270
+ return annotations, 0, 0, None, "", "unsure", "", "0 / 0", build_stats(records, annotations), sync_status_text(0), ""
271
  cursor = pool[0]
272
  record = records[cursor]
273
  rendered = render_record(
274
  record, annotations.get(record["sample_id"]), f"1 / {len(pool)}"
275
  )
276
+ return annotations, int(cursor), 0, *rendered, build_stats(records, annotations), sync_status_text(0), ""
277
 
278
  def refresh_pool(dataset_value, split_value, status_value, annotations, annotator_name):
279
  annotator_name = validate_annotator_name(annotator_name)
 
281
  return (
282
  {},
283
  0,
284
+ 0,
285
  None,
286
  "请先输入标注人名称",
287
  "unsure",
288
  "",
289
  "0 / 0",
290
  "### 标注进度\n\n- 请先输入标注人名称",
291
+ sync_status_text(0),
292
  "请先输入标注人名称后再开始标注",
293
  )
294
  ann_file = ANNOTATION_DIR / f"{annotator_name}.jsonl"
 
298
  return (
299
  annotations,
300
  0,
301
+ 0,
302
  None,
303
  "",
304
  "unsure",
305
  "",
306
  "0 / 0",
307
  build_stats(records, annotations),
308
+ sync_status_text(0),
309
  "没有可显示的样本",
310
  )
311
  cursor = pool[0]
 
316
  return (
317
  annotations,
318
  int(cursor),
319
+ 0,
320
  *rendered,
321
  build_stats(records, annotations),
322
+ sync_status_text(0),
323
  "筛选条件已更新",
324
  )
325
 
 
354
  decision,
355
  note_value,
356
  annotations,
357
+ unsynced_count,
358
  annotator_name,
359
  dataset_value,
360
  split_value,
 
365
  return (
366
  annotations or {},
367
  0,
368
+ unsynced_count or 0,
369
  None,
370
  "请先输入标注人名称",
371
  "unsure",
372
  "",
373
  "0 / 0",
374
  build_stats(records, annotations or {}),
375
+ sync_status_text(unsynced_count or 0),
376
  "请先输入标注人名称后再保存",
377
  )
378
  annotation_file = ANNOTATION_DIR / f"{annotator_name}.jsonl"
 
382
  return (
383
  merged,
384
  0,
385
+ unsynced_count or 0,
386
  None,
387
  "",
388
  "unsure",
389
  "",
390
  "0 / 0",
391
  build_stats(records, merged),
392
+ sync_status_text(unsynced_count or 0),
393
  "没有可保存的样本",
394
  )
395
  c = normalize_cursor(cursor, old_pool)
 
416
  return (
417
  merged,
418
  int(cur),
419
+ unsynced_count or 0,
420
  *rendered,
421
  build_stats(records, merged),
422
+ sync_status_text(unsynced_count or 0),
423
  f"写入失败(请检查磁盘是否可写): {e}",
424
  )
425
  annotations = to_write
426
+ next_unsynced = int(unsynced_count or 0) + 1
427
+ sync_msg = ""
428
+ if next_unsynced >= AUTO_SYNC_EVERY:
429
+ sync_msg = sync_annotation_to_hf(annotation_file, annotator_name)
430
+ if sync_msg.startswith("已同步到 HF dataset"):
431
+ next_unsynced = 0
432
  new_pool = resolve_filters(records, dataset_value, split_value, status_value, annotations)
433
  stats = build_stats(records, annotations)
434
  if not new_pool:
435
+ if next_unsynced > 0:
436
+ final_sync_msg = sync_annotation_to_hf(annotation_file, annotator_name)
437
+ if final_sync_msg.startswith("已同步到 HF dataset"):
438
+ next_unsynced = 0
439
+ sync_msg = (sync_msg + ";" + final_sync_msg).strip(";")
440
  return (
441
  annotations,
442
  0,
443
+ next_unsynced,
444
  None,
445
  "",
446
  "unsure",
447
  "",
448
  "0 / 0",
449
  stats,
450
+ sync_status_text(next_unsynced),
451
+ f"已保存 {record['sample_id']};{sync_msg or '仅本地缓存,尚未同步到HF'}",
452
  )
453
  if record_idx in new_pool:
454
  new_cursor = record_idx
 
466
  return (
467
  annotations,
468
  int(new_cursor),
469
+ next_unsynced,
470
  *rendered,
471
  stats,
472
+ sync_status_text(next_unsynced),
473
+ f"已保存 {record['sample_id']};{sync_msg or '仅本地缓存,尚未同步到HF'}",
474
  )
475
 
476
+ def sync_current_annotations(annotations, unsynced_count, annotator_name):
477
  annotator_name = validate_annotator_name(annotator_name)
478
  if not annotator_name:
479
+ return annotations or {}, unsynced_count or 0, build_stats(records, annotations or {}), sync_status_text(unsynced_count or 0), "请先输入标注人名称后再同步"
480
  annotation_file = ANNOTATION_DIR / f"{annotator_name}.jsonl"
481
  merged = merge_annotations_from_disk(annotation_file, annotations)
482
  if not merged:
483
+ return merged, 0, build_stats(records, merged), sync_status_text(0), "当前没有可同步的标注"
484
+ if not int(unsynced_count or 0):
485
+ return merged, 0, build_stats(records, merged), sync_status_text(0), "当前没有未同步修改"
486
  sync_msg = sync_annotation_to_hf(annotation_file, annotator_name)
487
+ next_unsynced = 0 if sync_msg.startswith("已同步到 HF dataset") else int(unsynced_count or 0)
488
+ return merged, next_unsynced, build_stats(records, merged), sync_status_text(next_unsynced), sync_msg
489
 
490
  annotator.change(
491
  bootstrap,
 
493
  outputs=[
494
  annotations_state,
495
  cursor_record_idx,
496
+ unsynced_count_state,
497
  image,
498
  prompt_box,
499
  judgment,
500
  note,
501
  position_box,
502
  progress_md,
503
+ sync_md,
504
  save_status,
505
  ],
506
  )
 
511
  outputs=[
512
  annotations_state,
513
  cursor_record_idx,
514
+ unsynced_count_state,
515
  image,
516
  prompt_box,
517
  judgment,
518
  note,
519
  position_box,
520
  progress_md,
521
+ sync_md,
522
  save_status,
523
  ],
524
  )
 
556
  judgment,
557
  note,
558
  annotations_state,
559
+ unsynced_count_state,
560
  annotator,
561
  dataset_filter,
562
  split_filter,
 
565
  outputs=[
566
  annotations_state,
567
  cursor_record_idx,
568
+ unsynced_count_state,
569
  image,
570
  prompt_box,
571
  judgment,
572
  note,
573
  position_box,
574
  progress_md,
575
+ sync_md,
576
  save_status,
577
  ],
578
  )
579
 
580
  sync_btn.click(
581
  sync_current_annotations,
582
+ inputs=[annotations_state, unsynced_count_state, annotator],
583
+ outputs=[annotations_state, unsynced_count_state, progress_md, sync_md, save_status],
584
  )
585
 
586
  demo.load(
 
588
  outputs=[
589
  annotations_state,
590
  cursor_record_idx,
591
+ unsynced_count_state,
592
  image,
593
  prompt_box,
594
  judgment,
595
  note,
596
  position_box,
597
  progress_md,
598
+ sync_md,
599
  save_status,
600
  ],
601
  )