---
license: apache-2.0
datasets:
- Yehor/cv10-uk-testset-clean-punctuated
language:
- uk
base_model:
- openai/whisper-large-v3-turbo
---

# Quantized Whisper Large V3 Turbo with calibration on Ukrainian

This model was quantized using [llmcompressor](https://pypi.org/project/llmcompressor/).

Data used for calibration: https://huggingface.co/datasets/Yehor/cv10-uk-testset-clean-punctuated

How to quantize: see this [Colab notebook](https://colab.research.google.com/drive/1TsCMxwq9kqsWV8jabihFN7J78RKgyvnD?usp=sharing).

## Usage

Install the required packages:

```
pip install vllm polars
```

Run inference:

```python
import io
import wave

import numpy as np
import polars as pl

from vllm import LLM, SamplingParams


def bytes_to_numpy(_bytes):
    with wave.open(io.BytesIO(_bytes), "rb") as wr:
        if (nc := wr.getnchannels()) != 1:
            raise ValueError(f"num_channels must be 1, got {nc}")
        if (sw := wr.getsampwidth()) != 2:
            raise ValueError(f"sample_width must be 2, got {sw}")
        audio_data = wr.readframes(wr.getnframes())

    return np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) / 32768.0


llm = LLM(
    model="Yehor/whisper-large-v3-turbo-quantized-uk",
    max_model_len=448,
    max_num_seqs=400,
    gpu_memory_utilization=0.8,
    limit_mm_per_prompt={"audio": 1},
)

df = pl.read_parquet("hf://datasets/Yehor/cv10-uk-testset-clean/data/train-*.parquet")

for row in df.iter_rows(named=True):
    inputs = {
        "encoder_prompt": {
            "prompt": "",
            "multi_modal_data": {
                "audio": (
                    bytes_to_numpy(row["audio"]["bytes"]),
                    16_000,
                ),
            },
        },
        "decoder_prompt": "<|startoftranscript|><|uk|><|transcribe|><|notimestamps|>",
    }

    sampling_params = SamplingParams(
        temperature=1.0,
        top_p=1.0,
        max_tokens=200,
    )

    outputs = llm.generate(inputs, sampling_params)

    print(f"PROMPT : {outputs[0].prompt}")
    print(f"TRANSCRIPTION: {row['transcription']}")
    print(f"PREDICTION: {outputs[0].outputs[0].text}")
    print("==========================================")
```