Upload folder using huggingface_hub

7164129 verified over 1 year ago

4.73 kB

	import faiss
	import numpy as np
	import json
	from tqdm import tqdm
	import os
	from torch.nn import DataParallel
	from transformers import AutoTokenizer, AutoModel, T5EncoderModel
	import torch
	from sentence_transformers import SentenceTransformer
	from multiprocessing import Pool
	import time

	# Wall-clock start; used for the total-runtime report printed at the end of the script.
	start_time = time.time()

	# Load the corpus. The file is expected to hold a sequence of records, each
	# exposing its text under the 'contents' key.
	# The encoding is pinned to UTF-8 so parsing does not depend on the
	# platform's locale default.
	with open("merged_triple_processed_new_withID.json", "r", encoding="utf-8") as fi:
	    data = json.load(fi)

	sentences = [record['contents'] for record in data]
	print("Chunks nums: ", len(sentences))

	# Encoder checkpoint used below. Alternatives kept for reference:
	# model_path = '/mnt/ceph_rbd/hf_models/gtr-t5-xl'
	# model_path = '/mnt/ceph_rbd/hf_models/bge-large-en-v1.5'
	model_path = 'facebook/contriever'

	### Using SentenceTransformer
	# def encode_sentences_on_gpu(params):
	# sentences_chunk, device_id = params
	# device = torch.device(f'cuda:{device_id}')
	# model = SentenceTransformer(model_path, device=device)
	# embeddings = model.encode(
	# sentences_chunk,
	# batch_size=1024,
	# show_progress_bar=False,
	# convert_to_numpy=True,
	# normalize_embeddings=True
	# )
	# return embeddings


	# num_gpus = torch.cuda.device_count()
	# print(f"Number of GPUs: {num_gpus}")

	# sentences_chunks = np.array_split(sentences, num_gpus)
	# params = [(sentences_chunks[i], i) for i in range(num_gpus)]

	# with Pool(processes=num_gpus) as pool:
	# embeddings_list = pool.map(encode_sentences_on_gpu, params)


	# sentence_embeddings = np.concatenate(embeddings_list, axis=0)


	### Using Transformers

	# Pick the GPU when one is visible, otherwise fall back to the CPU.
	device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

	tokenizer = AutoTokenizer.from_pretrained(model_path)

	# Load the encoder and wrap it in DataParallel for multi-GPU support.
	# (For the GTR checkpoint, T5EncoderModel.from_pretrained(model_path) was used instead.)
	model = DataParallel(AutoModel.from_pretrained(model_path))
	model.eval()  # inference mode
	model.to(device)

	# Number of sentences tokenized and encoded per forward pass.
	batch_size = 1024

	def mean_pooling(token_embeddings, mask):
	    """Average token embeddings along axis 1, counting only unmasked positions.

	    Args:
	        token_embeddings: Tensor whose axis 1 is pooled away; presumably
	            (batch, seq_len, hidden) -- TODO confirm against the encoder output.
	        mask: Tensor covering the first two axes of ``token_embeddings``;
	            non-zero entries mark the positions that take part in the average.

	    Returns:
	        Tensor with axis 1 removed, holding the masked mean for each row.
	        A row whose mask is entirely zero yields a zero vector instead of NaN.
	    """
	    keep = mask[..., None].bool()
	    # Zero out masked positions so they do not contribute to the sum.
	    summed = token_embeddings.masked_fill(~keep, 0.).sum(dim=1)
	    # Clamp the per-row token count to at least 1: a fully masked row then
	    # divides its all-zero sum by 1 rather than producing 0/0 = NaN.
	    counts = mask.sum(dim=1)[..., None].clamp(min=1)
	    return summed / counts

	def process_in_batches(sentences, batch_size):
	    """Embed ``sentences`` slice by slice and return a single CPU tensor.

	    Relies on the module-level ``tokenizer``, ``model`` and ``device``.
	    Each slice of at most ``batch_size`` sentences is tokenized with padding
	    and truncation, run through the model with gradient tracking disabled,
	    mean-pooled via ``mean_pooling`` and moved to the CPU. The per-slice
	    results are concatenated along axis 0.
	    """
	    pooled_batches = []
	    for start in tqdm(range(0, len(sentences), batch_size)):
	        chunk = sentences[start:start + batch_size]
	        inputs = tokenizer(chunk, padding=True, truncation=True, return_tensors='pt')
	        inputs = inputs.to(device)

	        with torch.no_grad():
	            outputs = model(**inputs)
	            pooled = mean_pooling(outputs[0], inputs['attention_mask'])

	        # Alternative pooling strategies used with other checkpoints:
	        #   BGE (CLS pooling):  pooled = outputs[0][:, 0]
	        #   GTR:                pooled = outputs.last_hidden_state.mean(dim=1)
	        # Optional L2 normalization (currently disabled):
	        #   pooled = torch.nn.functional.normalize(pooled, p=2, dim=1)

	        # Keep finished batches on the CPU to save GPU memory.
	        pooled_batches.append(pooled.cpu())

	    return torch.cat(pooled_batches, dim=0)


	# Output locations for the index and the raw embedding matrix.
	faiss_index_file = 'faiss_index.bin'
	embeddings_file = 'document_embeddings.npy'

	# Embed the whole corpus and convert the result to a NumPy array,
	# which is what both FAISS and np.save consume below.
	sentence_embeddings = process_in_batches(sentences, batch_size).cpu().numpy()

	# Build a flat inner-product index sized to the embedding width.
	# NOTE: the embeddings are not L2-normalized anywhere in this script, so the
	# scores are raw inner products rather than cosine similarities.
	dim = sentence_embeddings.shape[1]
	faiss_index = faiss.IndexFlatIP(dim)
	faiss_index.add(sentence_embeddings)

	faiss.write_index(faiss_index, faiss_index_file)
	print(f"FAISS index saved to {faiss_index_file}")

	np.save(embeddings_file, sentence_embeddings)
	print(f"Document embeddings saved to {embeddings_file}")

	# Report the total wall-clock time since start_time, in hours.
	end_time = time.time()
	execution_time_hours = (end_time - start_time) / 3600
	print(f"Total execution time: {execution_time_hours:.2f} hours")


	# instruction = "Represent this sentence for searching relevant passages: "
	# queries = ["Who is the president of U.S.A.?"]

	# encoded_input = tokenizer([instruction + q for q in queries], padding=True, truncation=True, return_tensors='pt')

	# # Compute token embeddings
	# with torch.no_grad():
	# model_output = model(**encoded_input)
	# # Perform pooling. In this case, cls pooling.
	# sentence_embeddings = model_output[0][:, 0]
	# # normalize embeddings
	# query_vector = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1)


	# k = 5 # Number of nearest neighbors to retrieve
	# distances, indices = faiss_index.search(np.array([query_vector.cpu()], dtype=np.float32), k)

	# # Print the most similar documents
	# for i, index in enumerate(indices[0]):
	# distance = distances[0][i]
	# print(f"Nearest neighbor {i+1}: {documents[index]}, Distance {distance}")