---
license: mit
---
f1: 88.43

| | precision | recall | f1-score | support |
|-------|-----------|--------|----------|---------|
| DAT | 0.96 | 0.97 | 0.96 | 182 |
| DUR | 0.79 | 0.82 | 0.80 | 50 |
| LOC | 0.70 | 0.79 | 0.74 | 206 |
| MNY | 0.87 | 1.00 | 0.93 | 20 |
| NOH | 0.91 | 0.93 | 0.92 | 1007 |
| ORG | 0.86 | 0.89 | 0.88 | 795 |
| PER | 0.92 | 0.95 | 0.94 | 853 |
| PNT | 0.78 | 0.78 | 0.78 | 60 |
| POH | 0.64 | 0.71 | 0.68 | 214 |
| TIM | 0.76 | 1.00 | 0.86 | 19 |
|-------|-----------|--------|----------|---------|
|micro avg | 0.87 | 0.90 | 0.88 | 3406 |
|macro avg | 0.82 | 0.89 | 0.85 | 3406 |
|weighted avg | 0.87 | 0.90 | 0.89 | 3406 |

```python
from transformers import TFBertModel, BertTokenizer
import os
import tensorflow as tf
import numpy as np
from tqdm import tqdm
from konlpy.tag import Mecab

mecab = Mecab()

# The checkpoint files are looked up in the directory part of this pattern.
checkpoint_path = "./cp-{epoch:04d}.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)
latest = tf.train.latest_checkpoint(checkpoint_dir)

# Maps the classifier's output index to its BIO tag.
index_to_tag = {0: 'B-PER', 1: 'B-LOC', 2: 'I-ORG', 3: 'B-DAT', 4: 'O', 5: 'I-DUR', 6: 'I-TIM', 7: 'I-NOH', 8: 'B-MNY', 9: 'B-PNT', 10: 'I-PER', 11: 'I-PNT', 12: 'I-LOC', 13: 'I-DAT', 14: 'B-TIM', 15: 'B-POH', 16: 'B-NOH', 17: 'I-POH', 18: 'I-MNY', 19: 'B-ORG', 20: 'B-DUR'}

tokenizer = BertTokenizer.from_pretrained("klue/bert-base")


class TFBertForTokenClassification(tf.keras.Model):
    """BERT encoder followed by a dense layer applied to every token position."""

    def __init__(self, model_name, num_labels):
        """Load the pretrained encoder `model_name` and create a `num_labels`-way classifier."""
        super().__init__()
        self.bert = TFBertModel.from_pretrained(model_name, from_pt=True)
        self.classifier = tf.keras.layers.Dense(
            num_labels,
            kernel_initializer=tf.keras.initializers.TruncatedNormal(0.02),
            name='classifier')

    def call(self, inputs):
        """Return the classifier output for `inputs`.

        `inputs` is an `(input_ids, attention_mask, token_type_ids)` triple.
        """
        input_ids, attention_mask, token_type_ids = inputs
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        # The first encoder output is fed to the classifier.
        all_output = outputs[0]
        prediction = self.classifier(all_output)
        return prediction


# The class must be defined before it is instantiated here.
model = TFBertForTokenClassification("klue/bert-base", num_labels=21)
if latest is None:
    # Fail with a clear message instead of passing None to load_weights.
    raise FileNotFoundError(f"No checkpoint found in {checkpoint_dir!r}")
model.load_weights(latest)


def convert_examples_to_features_for_prediction(examples, max_seq_len, tokenizer,
                                                pad_token_id_for_segment=0, pad_token_id_for_label=-100):
    """Turn pre-split sentences into fixed-length model inputs plus a label mask.

    Args:
        examples: Iterable of sentences, each a list of word strings.
        max_seq_len: Length every returned sequence is truncated/padded to,
            including the [CLS] and [SEP] tokens.
        tokenizer: Tokenizer providing `tokenize`, `convert_tokens_to_ids`,
            `cls_token`, `sep_token`, `pad_token_id` and `unk_token`.
        pad_token_id_for_segment: Value used for every token type id.
        pad_token_id_for_label: Mask value for positions that carry no
            prediction (special tokens, non-initial sub-words, padding).

    Returns:
        `((input_ids, attention_masks, token_type_ids), label_masks)` as
        NumPy integer arrays. `label_masks` holds 0 at the first sub-word of
        each word and `pad_token_id_for_label` everywhere else.
    """
    cls_token = tokenizer.cls_token
    sep_token = tokenizer.sep_token
    pad_token_id = tokenizer.pad_token_id
    special_tokens_count = 2  # [CLS] and [SEP]
    max_body_len = max_seq_len - special_tokens_count

    input_ids, attention_masks, token_type_ids, label_masks = [], [], [], []

    for example in tqdm(examples):
        tokens = []
        label_mask = []
        for one_word in example:
            # A word that tokenizes to nothing would add a mask entry without
            # a token and shift every later position; fall back to the
            # unknown token so tokens and mask stay aligned.
            subword_tokens = tokenizer.tokenize(one_word) or [tokenizer.unk_token]
            tokens.extend(subword_tokens)
            # Only the first sub-word of a word is marked for prediction.
            label_mask.extend([0] + [pad_token_id_for_label] * (len(subword_tokens) - 1))

        # Leave room for the two special tokens. Words cut off here receive
        # no prediction downstream.
        tokens = tokens[:max_body_len]
        label_mask = label_mask[:max_body_len]

        tokens = [cls_token] + tokens + [sep_token]
        label_mask = [pad_token_id_for_label] + label_mask + [pad_token_id_for_label]

        input_id = tokenizer.convert_tokens_to_ids(tokens)
        attention_mask = [1] * len(input_id)

        padding_count = max_seq_len - len(input_id)
        input_id = input_id + ([pad_token_id] * padding_count)
        attention_mask = attention_mask + ([0] * padding_count)
        token_type_id = [pad_token_id_for_segment] * max_seq_len
        label_mask = label_mask + ([pad_token_id_for_label] * padding_count)

        assert len(input_id) == max_seq_len, "Error with input length {} vs {}".format(len(input_id), max_seq_len)
        assert len(attention_mask) == max_seq_len, "Error with attention mask length {} vs {}".format(len(attention_mask), max_seq_len)
        assert len(token_type_id) == max_seq_len, "Error with token type length {} vs {}".format(len(token_type_id), max_seq_len)
        assert len(label_mask) == max_seq_len, "Error with labels length {} vs {}".format(len(label_mask), max_seq_len)

        input_ids.append(input_id)
        attention_masks.append(attention_mask)
        token_type_ids.append(token_type_id)
        label_masks.append(label_mask)

    input_ids = np.array(input_ids, dtype=int)
    attention_masks = np.array(attention_masks, dtype=int)
    token_type_ids = np.array(token_type_ids, dtype=int)
    label_masks = np.asarray(label_masks, dtype=np.int32)

    return (input_ids, attention_masks, token_type_ids), label_masks


def ner_prediction(examples, max_seq_len, tokenizer, lang='ko'):
    """Tag each sentence in `examples` and return `(word, tag)` pairs per sentence.

    Args:
        examples: List of raw sentence strings.
        max_seq_len: Sequence length passed to the feature conversion.
        tokenizer: Tokenizer passed to the feature conversion.
        lang: `'ko'` splits sentences with Mecab morphemes; any other value
            splits on whitespace.

    Returns:
        One list per input sentence, each a list of `(word, tag)` tuples.
        Uses the module-level `model`, `mecab` and `index_to_tag`.
    """
    if not examples:
        # Nothing to predict; avoid sending empty arrays to the model.
        return []

    if lang == 'ko':
        examples = [mecab.morphs(sent) for sent in examples]
    else:
        examples = [sent.split() for sent in examples]

    X_pred, label_masks = convert_examples_to_features_for_prediction(
        examples, max_seq_len=max_seq_len, tokenizer=tokenizer)
    y_predicted = model.predict(X_pred)
    y_predicted = np.argmax(y_predicted, axis=2)

    result_list = []
    for example, label_mask, pred_indices in zip(examples, label_masks, y_predicted):
        # Keep predictions only at positions not masked out with -100.
        pred_tags = [index_to_tag[pred_index]
                     for label_index, pred_index in zip(label_mask, pred_indices)
                     if label_index != -100]
        result_list.append(list(zip(example, pred_tags)))

    return result_list


sent1 = '울산에서 활동하고 있는 시각예술 분야 김유경 작가는 최근 지역 AI 기업 코어닷투데이와의 협업을 통한 특별한 전시를 열었다.'
sent2 = '가치관이나 인식에 따라 세상을 불완전하게 보는 인간이 학습을 통해 인지한 부분만을 인식하는 AI와 비슷하다고 보고 전시를 기획했다.'
sent3 = '부산 광안리 해변과 달맞이 고개 등 유동 인구와 차량 이동이 많은 지역 몇 곳을 골라 CCTV 데이터 속 정보를 어떻게 인식하는지, 공간에 대한 찰나를 표현한 작가의 작품을 어떻게 인식하는지 차이를 비교했다.'

test_samples = [sent1, sent2, sent3]
ner_prediction(test_samples, max_seq_len=128, tokenizer=tokenizer, lang='ko')
```

```
[[('울산', 'B-LOC'), ('에서', 'O'), ('활동', 'O'), ('하', 'O'), ('고', 'O'), ('있', 'O'), ('는', 'O'), ('시각', 'O'), ('예술', 'O'), ('분야', 'O'), ('김유경', 'B-PER'), ('작가', 'O'), ('는', 'O'), ('최근', 'O'), ('지역', 'O'), ('AI', 'O'), ('기업', 'O'), ('코어', 'B-ORG'), ('닷', 'I-ORG'), ('투데이', 'I-ORG'), ('와', 'O'), ('의', 'O'), ('협업', 'O'), ('을', 'O'), ('통한', 'O'), ('특별', 'O'), ('한', 'O'), ('전시', 'O'), ('를', 'O'), ('열', 'O'), ('었', 'O'), ('다', 'O'), ('.', 'O')], [('가치관', 'O'), ('이나', 'O'), ('인식', 'O'), ('에', 'O'), ('따라', 'O'), ('세상', 'O'), ('을', 'O'), ('불', 'O'), ('완전', 'O'), ('하', 'O'), ('게', 'O'), ('보', 'O'), ('는', 'O'), ('인간', 'O'), ('이', 'O'), ('학습', 'O'), ('을', 'O'), ('통해', 'O'), ('인지', 'O'), ('한', 'O'), ('부분', 'O'), ('만', 'O'), ('을', 'O'), ('인식', 'O'), ('하', 'O'), ('는', 'O'), ('AI', 'O'), ('와', 'O'), ('비슷', 'O'), ('하', 'O'), ('다고', 'O'), ('보', 'O'), ('고', 'O'), ('전시', 'O'), ('를', 'O'), ('기획', 'O'), ('했', 'O'), ('다', 'O'), ('.', 'O')], [('부산', 'B-LOC'), ('광안리', 'I-LOC'), ('해변', 'I-LOC'), ('과', 'O'), ('달맞이', 'B-LOC'), ('고개', 'I-LOC'), ('등', 'O'), ('유동', 'O'), ('인구', 'O'), ('와', 'O'), ('차량', 'O'), ('이동', 'O'), ('이', 'O'), ('많', 'O'), ('은', 'O'), ('지역', 'O'), ('몇', 'O'), ('곳', 'O'), ('을', 'O'), ('골라', 'O'), ('CCTV', 'O'), ('데이터', 'O'), ('속', 'O'), ('정보', 'O'), ('를', 'O'), ('어떻게', 'O'), ('인식', 'O'), ('하', 'O'), ('는지', 'O'), (',', 'O'), ('공간', 'O'), ('에', 'O'), ('대한', 'O'), ('찰나', 'O'), ('를', 'O'), ('표현', 'O'), ('한', 'O'), ('작가', 'O'), ('의', 'O'), ('작품', 'O'), ('을', 'O'), ('어떻게', 'O'), ('인식', 'O'), ('하', 'O'), ('는지', 'O'), ('차이', 'O'), ('를', 'O'), ('비교', 'O'), ('했', 'O'), ('다', 'O'), ('.', 'O')]]
```

```
tensorflow-estimator==2.5.0
tensorflow-gpu==2.5.3
transformers @ git+https://github.com/davidegazze/transformers@cf28c1db00410f0df3e654d9866e0ff1d3a45f29
numpy==1.24.3
konlpy==0.6.0
```