Farzad-R
/

llava-v1.6-mistral-7b-cordv2

Model card Files Files and versions

llava-v1.6-mistral-7b-cordv2 / token-to-json.py

Farzad-R

add README and token_to_json function

fdabcf2 almost 2 years ago

History Blame Contribute Delete

2.25 kB

	import re

	# Function to convert tokens to JSON
	def token2json(tokens, is_inner_value=False, added_vocab=None):
	    """Convert a tagged token sequence into a nested JSON-like structure.

	    The input is a string in which fields are wrapped as
	    ``<s_key>...</s_key>`` and sibling values are separated by ``<sep/>``.

	    Args:
	        tokens: The tagged string to parse.
	        is_inner_value: True when parsing the content of an enclosing
	            field (used by the recursive calls). Inner calls always
	            return a list.
	        added_vocab: Container of special tokens used to recognise
	            categorical leaves such as ``<yes/>``. When ``None`` it is
	            read from ``processor.tokenizer.get_added_vocab()``.
	            NOTE(review): ``processor`` is not defined in the visible
	            part of this file; it must exist as a global at call time,
	            otherwise pass ``added_vocab`` explicitly.

	    Returns:
	        For a top-level call: a dict mapping keys to parsed values, or
	        ``{"text_sequence": <remaining text>}`` when no field could be
	        parsed. If a ``<sep/>`` directly follows a closed field, a list
	        of dicts is returned instead (also at top level).
	        For an inner call: a list of dicts (possibly empty).
	    """
	    if added_vocab is None:
	        added_vocab = processor.tokenizer.get_added_vocab()

	    output = {}

	    while tokens:
	        start_match = re.search(r"<s_(.*?)>", tokens, re.IGNORECASE)
	        if start_match is None:
	            break
	        key = start_match.group(1)
	        start_token = start_match.group()

	        end_match = re.search(rf"</s_{re.escape(key)}>", tokens, re.IGNORECASE)
	        if end_match is None:
	            # Unclosed field: drop every occurrence of the opening tag so
	            # the loop makes progress, then keep scanning.
	            tokens = tokens.replace(start_token, "")
	            continue

	        end_token = end_match.group()
	        content_match = re.search(
	            f"{re.escape(start_token)}(.*?){re.escape(end_token)}",
	            tokens,
	            re.IGNORECASE | re.DOTALL,
	        )
	        if content_match is not None:
	            content = content_match.group(1).strip()
	            if "<s_" in content and "</s_" in content:  # non-leaf node
	                value = token2json(content, is_inner_value=True, added_vocab=added_vocab)
	                if value:
	                    # A single child is unwrapped from its list.
	                    output[key] = value[0] if len(value) == 1 else value
	            else:  # leaf nodes
	                leaves = []
	                for leaf in content.split("<sep/>"):
	                    leaf = leaf.strip()
	                    # Categorical special tokens like "<yes/>" are reduced
	                    # to their bare name ("yes").
	                    if leaf in added_vocab and leaf.startswith("<") and leaf.endswith("/>"):
	                        leaf = leaf[1:-2]
	                    leaves.append(leaf)
	                output[key] = leaves[0] if len(leaves) == 1 else leaves

	        # Continue after the first occurrence of the closing tag.
	        tokens = tokens[tokens.find(end_token) + len(end_token):].strip()
	        if tokens.startswith("<sep/>"):  # non-leaf nodes
	            # A separator after a closed field starts a sibling record:
	            # return what we have plus the parsed remainder as a list.
	            return [output] + token2json(tokens[6:], is_inner_value=True, added_vocab=added_vocab)

	    if output:
	        return [output] if is_inner_value else output
	    return [] if is_inner_value else {"text_sequence": tokens}