Token Classification
Transformers
PyTorch
Safetensors
English
French
German
stacked_bert
v1.0.0
custom_code
Instructions to use impresso-project/ner-stacked-bert-multilingual-v1.1.0 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use impresso-project/ner-stacked-bert-multilingual-v1.1.0 with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("token-classification", model="impresso-project/ner-stacked-bert-multilingual-v1.1.0", trust_remote_code=True)
```

```python
# Load model directly
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained("impresso-project/ner-stacked-bert-multilingual-v1.1.0", trust_remote_code=True, dtype="auto")
```
- Notebooks
- Google Colab
- Kaggle
| from transformers import pipeline, AutoTokenizer | |
| import bz2, json | |
| from pprint import pprint | |
MODEL_NAME = "impresso-project/ner-stacked-bert-multilingual-light"

# The tokenizer is loaded explicitly and handed to the pipeline below so both
# are built from the same checkpoint name.
ner_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# NOTE(review): "generic-ner" is not a built-in Transformers task name;
# presumably it is registered by the model repository's custom code, which is
# why trust_remote_code=True is passed -- confirm against the model card.
_ner_pipeline_kwargs = {
    "model": MODEL_NAME,
    "tokenizer": ner_tokenizer,
    "trust_remote_code": True,
    "device": "cpu",
}
ner_pipeline = pipeline("generic-ner", **_ner_pipeline_kwargs)
def process_archive(lingproc_path):
    """Rebuild document text and sentence spans from a token-level archive.

    Every non-blank line of the bz2-compressed JSONL file is parsed as one
    document: a JSON object with an ``"id"`` and a list of ``"sents"``, where
    each sentence holds a ``"tok"`` list whose items carry the token text
    (``"t"``) and its character offset (``"o"``).

    The text is reconstructed by placing each token at its offset and filling
    the gaps between tokens with spaces, so that ``full_text[o:o + len(t)]``
    yields the token again. For that reason the rebuilt text is NOT stripped:
    removing leading padding would shift every offset.

    Args:
        lingproc_path (str): Path to the ``.jsonl.bz2`` archive.

    Returns:
        list[tuple]: One ``(doc_id, full_text, sentences)`` tuple per
        document. ``sentences`` is a list of dicts with keys ``"start"`` and
        ``"end"`` (character span of the sentence in ``full_text``) and
        ``"tokens"`` (list of ``{"t": text, "o": offset, "l": length}``).
        Sentences without tokens are omitted.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a token lacks the ``"t"`` or ``"o"`` key.
    """
    results = []
    with bz2.open(lingproc_path, mode="rt", encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            data = json.loads(line)
            doc_id = data.get("id")
            # ``or []`` also covers an explicit ``"sents": null``.
            sents = data.get("sents") or []

            # Map offset -> token text; a later token at the same offset
            # overwrites an earlier one.
            offset_token_map = {}
            for sent in sents:
                for token in sent.get("tok", []):
                    offset_token_map[token["o"]] = token["t"]

            # Lay the tokens out in offset order, padding gaps with spaces.
            # NOTE: a token starting before the previous one ends is appended
            # as-is (no padding), which would misalign later offsets.
            full_text_parts = []
            last_end = 0
            for offset in sorted(offset_token_map):
                token_text = offset_token_map[offset]
                if offset > last_end:
                    full_text_parts.append(" " * (offset - last_end))
                full_text_parts.append(token_text)
                last_end = offset + len(token_text)
            # Deliberately not stripped, to keep offsets valid.
            full_text = "".join(full_text_parts)

            sentences = []
            for sent in sents:
                tokens = sent.get("tok", [])
                if not tokens:
                    continue
                last_token = tokens[-1]
                sentences.append(
                    {
                        "start": tokens[0]["o"],
                        "end": last_token["o"] + len(last_token["t"]),
                        "tokens": [
                            {"t": tok["t"], "o": tok["o"], "l": len(tok["t"])}
                            for tok in tokens
                        ],
                    }
                )

            results.append((doc_id, full_text, sentences))
    return results
# Run the NER pipeline sentence by sentence over every document in the archive
# and print each entity with its position in the document text.
processed_cis = process_archive("../../data/lematin-1885.jsonl.bz2")

for doc_id, full_text, offsets in processed_cis:
    print(f"Document ID: {doc_id}")
    for sentence in offsets:
        sent_start, sent_end = sentence["start"], sentence["end"]
        sentence_text = full_text[sent_start:sent_end]
        # Token strings are sliced back out of the document text by offset.
        tokens_texts = [
            full_text[tok["o"]:tok["o"] + len(tok["t"])]
            for tok in sentence["tokens"]
        ]
        entities = ner_pipeline(sentence_text, tokens=tokens_texts)
        for entity in entities:
            # lOffset/rOffset are treated as sentence-relative here, so the
            # sentence start is added to get document-level positions.
            abs_start = sent_start + entity["lOffset"]
            abs_end = sent_start + entity["rOffset"]
            entity_text = full_text[abs_start:abs_end]
            entity_surface = entity["surface"]
            # Sanity check: the span must reproduce the reported surface form.
            assert entity_text == entity_surface, f"Entity text mismatch: {entity_text} != {entity_surface}"
            print(f"{doc_id}: {entity_text} -- surface: {entity_surface} -- {entity['type']} -- {abs_start} - {abs_end}")