impresso-project
/

ner-stacked-bert-multilingual-v1.1.0

Token Classification

Model card Files Files and versions

ner-stacked-bert-multilingual-v1.1.0 / test_ner.py

emanuelaboros's picture

new model

c36a6f3 9 months ago

History Blame Contribute Delete

3.98 kB

	from transformers import pipeline, AutoTokenizer
	import bz2, json
	from pprint import pprint

	# Hub identifier used for both the tokenizer and the pipeline below.
	MODEL_NAME = "impresso-project/ner-stacked-bert-multilingual-light"

	# The tokenizer is loaded on its own and then handed to the pipeline explicitly.
	ner_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

	# trust_remote_code=True lets transformers execute code shipped with the model
	# repository; presumably that is where the "generic-ner" task is defined
	# (TODO confirm). Inference is pinned to the CPU.
	ner_pipeline = pipeline(
	    task="generic-ner",
	    model=MODEL_NAME,
	    tokenizer=ner_tokenizer,
	    trust_remote_code=True,
	    device="cpu",
	)

	def process_archive(lingproc_path):
	    """
	    Reads a .jsonl.bz2 archive and rebuilds, per document, the full text and its sentence spans.

	    Every non-blank line is parsed as one JSON object. Tokens are read from
	    data["sents"][i]["tok"]; each token supplies its text under "t" and its
	    offset under "o". The text is rebuilt by placing every token at its offset
	    and filling the gaps between tokens with spaces, so that (as long as tokens
	    do not overlap) full_text[o:o + len(t)] gives the token text back.

	    Args:
	        lingproc_path (str): Path to the .jsonl.bz2 archive to read.

	    Returns:
	        List of tuples: (doc_id, full_text, sentences). Each entry of sentences
	        is a dict with "start" (offset of the first token), "end" (offset just
	        past the last token) and "tokens" (list of {"t": text, "o": offset,
	        "l": length}). Sentences without tokens are left out.

	    Raises:
	        json.JSONDecodeError: If a non-blank line is not valid JSON.
	        KeyError: If a token lacks the "t" or "o" key.
	    """
	    results = []

	    with bz2.open(lingproc_path, mode='rt', encoding='utf-8') as f:
	        for line in f:
	            # A blank line (e.g. a trailing newline) is not a document and
	            # json.loads would raise on it.
	            if not line.strip():
	                continue
	            data = json.loads(line)
	            doc_id = data.get("id")
	            # "sents" may be missing or null; both mean "no sentences".
	            sents = data.get("sents") or []

	            # One pass collects both the offset -> text map used to rebuild
	            # the text and the per-sentence spans.
	            offset_token_map = {}
	            sentences = []
	            for sent in sents:
	                tokens = sent.get("tok") or []
	                if not tokens:
	                    continue
	                for token in tokens:
	                    # Tokens sharing an offset overwrite each other; the last one wins.
	                    offset_token_map[token["o"]] = token["t"]
	                first, last = tokens[0], tokens[-1]
	                sentences.append({
	                    "start": first["o"],
	                    "end": last["o"] + len(last["t"]),
	                    "tokens": [
	                        {"t": token["t"], "o": token["o"], "l": len(token["t"])}
	                        for token in tokens
	                    ],
	                })

	            # Rebuild the full text in offset order, padding gaps with spaces.
	            full_text_parts = []
	            last_end = 0
	            for offset, text in sorted(offset_token_map.items()):
	                if offset > last_end:
	                    full_text_parts.append(" " * (offset - last_end))
	                full_text_parts.append(text)
	                last_end = offset + len(text)
	            # Only trailing whitespace is trimmed: stripping the left side would
	            # shift the text relative to the token offsets returned below.
	            full_text = "".join(full_text_parts).rstrip()

	            results.append((doc_id, full_text, sentences))

	    return results

	# Load every document of the sample archive up front.
	processed_cis = process_archive("../../data/lematin-1885.jsonl.bz2")

	# Run the NER pipeline sentence by sentence and check that the offsets it
	# reports point at the same surface text in the rebuilt document.
	for doc_id, full_text, sentence_spans in processed_cis:
	    print(f"Document ID: {doc_id}")
	    for span in sentence_spans:
	        sent_start = span["start"]
	        sent_end = span["end"]
	        sent_tokens = span["tokens"]
	        sentence_text = full_text[sent_start:sent_end]

	        # Token texts are sliced out of the rebuilt text rather than copied
	        # from the token records.
	        tokens_texts = []
	        for tok in sent_tokens:
	            tok_start = tok["o"]
	            tokens_texts.append(full_text[tok_start:tok_start + len(tok["t"])])

	        entities = ner_pipeline(sentence_text, tokens=tokens_texts)

	        for entity in entities:
	            # lOffset / rOffset are treated as relative to the sentence start,
	            # so the sentence start is added to get document-level positions.
	            abs_start = sent_start + entity["lOffset"]
	            abs_end = sent_start + entity["rOffset"]
	            entity_text = full_text[abs_start:abs_end]
	            entity_surface = entity["surface"]
	            assert entity_text == entity_surface, f"Entity text mismatch: {entity_text} != {entity_surface}"
	            print(f"{doc_id}: {entity_text} -- surface: {entity_surface} -- {entity['type']} -- {abs_start} - {abs_end}")
	# pprint(entities)

	# print(f" Sentence: {sentence_text} (Start: {start}, End: {end})")
	# for token in tokens:
	# token_text = token["t"]
	# token_offset = token["o"]
	# token_label = token["l"]
	# print(f" Token: {token_text} (Offset: {token_offset}, Label: {token_label})")


	# entities = ner_pipeline(sentence)