#!/usr/bin/env python3
"""Generate all charts for the HuggingFace README."""

import matplotlib

# Select the non-interactive Agg backend before pyplot is imported so the
# script can render PNGs without a display.
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import numpy as np
from pathlib import Path

# All PNGs are written next to this script.
OUT = Path(__file__).parent

COLORS = {
    'dflash': '#00d4aa',
    'autoreg': '#ff6b6b',
    'gold': '#ffd700',
    'blue': '#4dabf7',
    'bg': '#0d1117',
    'grid': '#ffffff',
    'text': '#e6edf3',
}

# Colour used for the axis spines that remain visible.
SPINE_COLOR = '#333'

plt.rcParams.update({
    'figure.facecolor': COLORS['bg'],
    'axes.facecolor': COLORS['bg'],
    'text.color': COLORS['text'],
    'axes.labelcolor': COLORS['text'],
    'xtick.color': COLORS['text'],
    'ytick.color': COLORS['text'],
    'font.family': 'sans-serif',
})


def _style_spines(ax, hidden=('top', 'right'), dimmed=('left', 'bottom')):
    """Hide the ``hidden`` spines of *ax* and recolour the ``dimmed`` ones."""
    for side in hidden:
        ax.spines[side].set_visible(False)
    for side in dimmed:
        ax.spines[side].set_color(SPINE_COLOR)


def _save_figure(fig, filename):
    """Lay out *fig*, write it to ``OUT / filename`` and release it.

    The figure is closed after saving so that it does not stay registered
    with pyplot for the rest of the process.
    """
    fig.tight_layout()
    fig.savefig(OUT / filename, dpi=150, bbox_inches='tight')
    plt.close(fig)
    print(f'saved {filename}')


def chart_throughput_scaling():
    """Render ``throughput-scaling.png``: tok/s per concurrency level.

    DFlash throughput is drawn as one bar per concurrency level; the
    autoregressive baseline (only available for the first levels) is
    overlaid on the matching DFlash bars.
    """
    fig, ax = plt.subplots(figsize=(13, 6.5))

    concurrency = [8, 12, 16, 20, 24, 32]
    dflash = [127, 193, 251, 323, 379, 508]
    autoreg = [90, 125]
    x = np.arange(len(concurrency))
    bar_width = 0.38 * 2

    bars_d = ax.bar(
        x, dflash, width=bar_width, color=COLORS['dflash'], zorder=3,
        edgecolor='white', linewidth=0.5, label='DFlash st=2 (this config)',
    )
    # The baseline bars share position and width with the (taller, opaque)
    # DFlash bars, so they must be drawn *above* them to be visible at all;
    # with a lower zorder they were completely covered.
    bars_a = ax.bar(
        x[:len(autoreg)] - 0.01, autoreg, width=bar_width,
        color=COLORS['autoreg'], alpha=0.6, zorder=4,
        edgecolor='white', linewidth=0.5, label='Autoregressive baseline',
    )

    # Value labels: above each DFlash bar, inside each baseline bar.
    for bar, v in zip(bars_d, dflash):
        ax.text(bar.get_x() + bar.get_width() / 2, v + 10, f'{v}',
                ha='center', va='bottom', fontweight='bold', fontsize=14,
                color=COLORS['dflash'])
    for bar, v in zip(bars_a, autoreg):
        ax.text(bar.get_x() + bar.get_width() / 2, v - 15, f'{v}',
                ha='center', va='top', fontsize=12, color='white',
                fontweight='bold', zorder=5)

    # Reference line at 500 tok/s.
    ax.axhline(y=500, color=COLORS['gold'], linestyle='--', alpha=0.4,
               linewidth=1.5)
    ax.text(5.6, 508, '500 tok/s', ha='right', color=COLORS['gold'],
            fontsize=10, alpha=0.6)

    # Faint trend line behind the bars.
    ax.plot(x, dflash, color=COLORS['dflash'], alpha=0.4, linewidth=2,
            zorder=1, linestyle='--')

    ax.set_xticks(x)
    ax.set_xticklabels([f'{c} users' for c in concurrency], fontsize=12)
    ax.set_ylabel('Output tokens / second', fontsize=14, labelpad=10)
    ax.set_title(
        'Kimi K2.6 Throughput Scaling\n'
        '8x AMD Instinct MI300X (gfx942, 192 GB HBM3 each)',
        fontsize=17, fontweight='bold', pad=15,
    )
    ax.legend(fontsize=13, loc='upper left', framealpha=0.3)
    ax.set_ylim(0, 590)
    ax.grid(axis='y', alpha=0.1, color=COLORS['grid'])
    _style_spines(ax)

    _save_figure(fig, 'throughput-scaling.png')


def chart_speedup():
    """Render ``optimization-journey.png``: horizontal bars per config."""
    fig, ax = plt.subplots(figsize=(10, 5.5))

    configs = [
        'Autoreg\nseqs=8\n(old baseline)',
        'DFlash st=8\nseqs=8\n(old DFlash)',
        'DFlash st=2\nseqs=8',
        'DFlash st=2\nseqs=16',
        'DFlash st=2\nseqs=24',
        'DFlash st=2\nseqs=32',
    ]
    tps = [90, 108, 127, 251, 379, 508]
    colors = [
        COLORS['autoreg'], COLORS['autoreg'],
        COLORS['blue'], COLORS['blue'],
        COLORS['dflash'], COLORS['dflash'],
    ]
    rows = range(len(configs))

    bars = ax.barh(rows, tps, color=colors, edgecolor='white',
                   linewidth=0.5, height=0.65, zorder=3)

    for bar, v in zip(bars, tps):
        label = f' {v} tok/s'
        if v == 508:
            # Final configuration: annotate the overall speedup vs. 90 tok/s.
            label += ' (5.6x)'
        ax.text(v + 5, bar.get_y() + bar.get_height() / 2, label,
                va='center', fontsize=13, fontweight='bold',
                color=COLORS['text'])

    ax.set_yticks(rows)
    ax.set_yticklabels(configs, fontsize=11)
    ax.set_xlabel('Output tokens / second', fontsize=13, labelpad=10)
    ax.set_title('Optimization Journey: 90 → 508 tok/s', fontsize=16,
                 fontweight='bold', pad=15)
    ax.set_xlim(0, 620)
    ax.invert_yaxis()  # first config at the top
    ax.grid(axis='x', alpha=0.1, color=COLORS['grid'])
    _style_spines(ax)

    _save_figure(fig, 'optimization-journey.png')


def chart_acceptance():
    """Render ``acceptance-comparison.png``: per-position acceptance rates.

    Left panel shows the 8-token setting, right panel the 2-token setting;
    both carry a dashed line at 20 % labelled "break-even".
    """
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(13, 5))

    positions_8 = [f'Pos {i}' for i in range(8)]
    accept_8 = [64, 34, 18, 9, 4, 2, 1, 0.5]
    positions_2 = ['Pos 0', 'Pos 1']
    accept_2 = [64, 34]

    # --- left panel: st=8 -------------------------------------------------
    # Bars above the 20 % line use the DFlash colour, the rest the
    # baseline colour.
    bar_colors_8 = [
        COLORS['dflash'] if v > 20 else COLORS['autoreg'] for v in accept_8
    ]
    bars8 = ax1.bar(positions_8, accept_8, color=bar_colors_8,
                    edgecolor='white', linewidth=0.5, zorder=3)
    for bar, v in zip(bars8, accept_8):
        ax1.text(bar.get_x() + bar.get_width() / 2, v + 1.5, f'{v}%',
                 ha='center', fontsize=10, color=COLORS['text'])
    ax1.axhline(y=20, color=COLORS['gold'], linestyle='--', alpha=0.4)
    ax1.text(7.5, 22, 'break-even', ha='right', fontsize=9,
             color=COLORS['gold'], alpha=0.6)
    ax1.set_title(
        'st=8: 16% avg acceptance\nWastes compute on positions 3-7',
        fontsize=13, fontweight='bold', color=COLORS['autoreg'],
    )
    ax1.set_ylabel('Acceptance rate (%)', fontsize=12)
    ax1.set_ylim(0, 80)
    ax1.grid(axis='y', alpha=0.1)
    _style_spines(ax1)

    # --- right panel: st=2 ------------------------------------------------
    bars2 = ax2.bar(positions_2, accept_2, color=COLORS['dflash'],
                    edgecolor='white', linewidth=0.5, width=0.5, zorder=3)
    for bar, v in zip(bars2, accept_2):
        ax2.text(bar.get_x() + bar.get_width() / 2, v + 1.5, f'{v}%',
                 ha='center', fontsize=14, fontweight='bold',
                 color=COLORS['dflash'])
    ax2.axhline(y=20, color=COLORS['gold'], linestyle='--', alpha=0.4)
    ax2.text(1.7, 22, 'break-even', ha='right', fontsize=9,
             color=COLORS['gold'], alpha=0.6)
    ax2.set_title(
        'st=2: 49% avg acceptance\nEvery position contributes',
        fontsize=13, fontweight='bold', color=COLORS['dflash'],
    )
    ax2.set_ylim(0, 80)
    ax2.grid(axis='y', alpha=0.1)
    _style_spines(ax2)

    fig.suptitle(
        'Why 2 Speculative Tokens Beats 8 (K2.5 drafter on K2.6 target)',
        fontsize=15, fontweight='bold', y=1.02,
    )
    _save_figure(fig, 'acceptance-comparison.png')


def chart_latency():
    """Render ``latency-flat.png``: mean latency and per-user tok/s.

    Latency is plotted on the left y-axis with a +/-0.5 s shaded band;
    per-user throughput is plotted on a twin right y-axis.
    """
    fig, ax = plt.subplots(figsize=(10, 5))

    concurrency = [8, 12, 16, 20, 24, 32]
    latency = [31.0, 30.7, 30.8, 30.2, 30.0, 30.7]
    per_user = [15.9, 16.1, 15.7, 16.2, 15.8, 15.9]

    ax2 = ax.twinx()

    line1 = ax.plot(concurrency, latency, 'o-', color=COLORS['blue'],
                    linewidth=2.5, markersize=10, label='Mean latency (s)',
                    zorder=3)
    band_low = [t - 0.5 for t in latency]
    band_high = [t + 0.5 for t in latency]
    ax.fill_between(concurrency, band_low, band_high, color=COLORS['blue'],
                    alpha=0.1)

    line2 = ax2.plot(concurrency, per_user, 's--', color=COLORS['gold'],
                     linewidth=2, markersize=8, label='Per-user tok/s',
                     zorder=3)

    ax.set_xlabel('Concurrent Users', fontsize=13)
    ax.set_ylabel('Mean Latency (seconds)', fontsize=13,
                  color=COLORS['blue'])
    ax2.set_ylabel('Per-User tok/s', fontsize=13, color=COLORS['gold'])
    ax.set_ylim(25, 36)
    ax2.set_ylim(12, 20)

    # Merge the handles of both axes into a single legend.
    lines = line1 + line2
    labels = [handle.get_label() for handle in lines]
    ax.legend(lines, labels, fontsize=12, loc='upper left', framealpha=0.3)

    ax.set_title(
        'Latency Stays Flat as Concurrency Scales\n'
        '512-token completions, Kimi K2.6 on 8x MI300X',
        fontsize=15, fontweight='bold', pad=15,
    )
    ax.grid(alpha=0.1)
    # The right spine stays visible here because it carries the twin axis.
    _style_spines(ax, hidden=('top',), dimmed=('left', 'right', 'bottom'))
    _style_spines(ax2, hidden=('top',), dimmed=())

    _save_figure(fig, 'latency-flat.png')


def chart_hardware():
    """Render ``hardware-stack.png``: a table of the hardware/software stack."""
    fig, ax = plt.subplots(figsize=(11, 3))
    ax.axis('off')

    table_data = [
        ['8x AMD Instinct MI300X', 'gfx942 (CDNA 3)', '192 GB HBM3 each',
         '1,536 GB total'],
        ['moonshotai/Kimi-K2.6', '1T MoE / 32B active', '256K context',
         '555 GB (64 shards)'],
        ['z-lab/Kimi-K2.5-DFlash', '5 decoder layers',
         'Shared embed/lm_head', '6.5 GB'],
        ['vLLM v0.19.2 ROCm', 'AITER MoE kernels', 'TRITON_MLA attention',
         'DFlash patched'],
    ]
    row_labels = ['GPU', 'Target', 'Drafter', 'Runtime']

    table = ax.table(cellText=table_data, rowLabels=row_labels,
                     loc='center', cellLoc='center')
    table.auto_set_font_size(False)
    table.set_fontsize(11)
    table.scale(1, 1.8)

    for (row, col), cell in table.get_celld().items():
        cell.set_edgecolor(SPINE_COLOR)
        if row == 0:
            # NOTE(review): no colLabels are passed, so row 0 is the first
            # data row ("GPU"), not a header row. It is kept highlighted as
            # in the original; confirm this is the intended look.
            cell.set_facecolor('#1a3a2a')
            cell.set_text_props(color=COLORS['dflash'], fontweight='bold')
        elif col == -1:
            # Column -1 holds the row labels.
            cell.set_facecolor('#1a2a3a')
            cell.set_text_props(color=COLORS['blue'], fontweight='bold')
        else:
            cell.set_facecolor(COLORS['bg'])
            cell.set_text_props(color=COLORS['text'])

    ax.set_title('Hardware & Software Stack', fontsize=14,
                 fontweight='bold', pad=10, color=COLORS['text'])
    fig.patch.set_facecolor(COLORS['bg'])

    _save_figure(fig, 'hardware-stack.png')


if __name__ == '__main__':
    chart_throughput_scaling()
    chart_speedup()
    chart_acceptance()
    chart_latency()
    chart_hardware()
    print('all charts generated')