// ===== 9. LEADERBOARD ===== const BENCHMARKS=['AppWorld','BrowseComp+','SWE-bench','TauBench-Airline','TauBench-Retail','TauBench-Telecom']; const BENCH_SHORT={'AppWorld':'App','BrowseComp+':'Browse','SWE-bench':'SWE','TauBench-Airline':'Tau-Air','TauBench-Retail':'Tau-Ret','TauBench-Telecom':'Tau-Tel'}; const MODEL_DISPLAY={'claude-opus-4.5':'Claude Opus 4.5','gpt-5.2':'GPT 5.2','gemini-3-pro':'Gemini Pro 3','deepseek-v3.2':'DeepSeek V3.2','kimi-k2.5':'Kimi K2.5'}; const MODEL_URLS={'claude-opus-4.5':'https://www.anthropic.com/claude','gpt-5.2':'https://openai.com/','gemini-3-pro':'https://deepmind.google/technologies/gemini/','deepseek-v3.2':'https://www.deepseek.com/','kimi-k2.5':'https://www.moonshot.ai/'}; const AGENT_DISPLAY={'Claude_Code':'Claude Code','OpenAI_Solo':'OpenAI Solo','Smolagent':'Smolagent','React':'React','React_+_Shortlisting':'React + Shortlist'}; const AGENT_URLS={'Claude_Code':'https://github.com/anthropics/claude-code','OpenAI_Solo':'https://github.com/openai/openai-agents-python','Smolagent':'https://github.com/huggingface/smolagents','React':'https://github.com/BerriAI/litellm','React_+_Shortlisting':'https://github.com/BerriAI/litellm'}; let sortCol='avg',sortDir=-1; function parseCSV(text){ const lines=text.trim().split('\n');const headers=lines[0].split(','); return lines.slice(1).map(line=>{const vals=line.split(',');const obj={};headers.forEach((h,i)=>obj[h.trim()]=vals[i]?.trim()||'');return obj;}); } function processData(rows,modelFilter){ const groups={}; rows.forEach(r=>{ if(modelFilter!=='all'&&r.model_normalized!==modelFilter)return; const key=r.visible_agent_name+'|'+r.model_normalized; if(!groups[key])groups[key]={agent:r.visible_agent_name,model:r.model_normalized,version:r.agent_version||'',benchmarks:{},costs:{}}; groups[key].benchmarks[r.benchmark]=parseFloat(r.score)||0; groups[key].costs[r.benchmark]=parseFloat(r.avg_cost)||0; }); // Benchmark weights: TauBench sub-benchmarks each get 1/12 (so TauBench total = 1/4) // Other benchmarks get 1/4 each const BENCH_WEIGHT={}; BENCHMARKS.forEach(b=>{BENCH_WEIGHT[b]=b.startsWith('TauBench')?1/12:1/4}); return Object.values(groups).map(g=>{ // Every (model, agent, benchmark) cell is populated; zero is a real score // (TauBench protocol failures, AppWorld tool-limit failures). Include all six. const bs=BENCHMARKS.map(b=>g.benchmarks[b]||0); let wSum=0; BENCHMARKS.forEach((b,i)=>{wSum+=bs[i]*BENCH_WEIGHT[b]}); const avg=wSum; // weights sum to 1 const cs=Object.values(g.costs).filter(c=>c>0); const avgCost=cs.length?cs.reduce((a,b)=>a+b,0)/cs.length:0; return{...g,avg,avgCost,benchScores:bs}; }).filter(g=>g.avg>0).sort((a,b)=>b.avg-a.avg); } function scoreClass(s){if(s===0)return'score-zero';if(s>=.7)return'score-high';if(s>=.5)return'score-mid';return'score-low'} function renderTable(data){ const cols=[{key:'rank',label:'#',sortable:false},{key:'agent',label:'Agent',sortable:true},{key:'model',label:'Model',sortable:true},{key:'avg',label:'Avg Success',sortable:true},{key:'avgCost',label:'Avg Cost',sortable:true}]; BENCHMARKS.forEach(b=>{cols.push({key:'bench_'+b,label:BENCH_SHORT[b],sortable:true})}); const head=document.getElementById('lbHead'); head.innerHTML=cols.map(c=>{ const sorted=sortCol===c.key; const cls=c.sortable?`class="sortable${sorted?' sorted':''}" data-col="${c.key}"`:''; const arrow=c.sortable?`${sorted?(sortDir>0?'▲':'▼'):'▲'}`:''; return`${c.label}${arrow}`; }).join(''); const sorted=[...data]; if(sortCol==='agent')sorted.sort((a,b)=>sortDir*a.agent.localeCompare(b.agent)); else if(sortCol==='model')sorted.sort((a,b)=>sortDir*a.model.localeCompare(b.model)); else if(sortCol==='avg')sorted.sort((a,b)=>sortDir*(a.avg-b.avg)); else if(sortCol==='avgCost')sorted.sort((a,b)=>sortDir*(a.avgCost-b.avgCost)); else if(sortCol.startsWith('bench_')){const idx=BENCHMARKS.indexOf(sortCol.replace('bench_',''));sorted.sort((a,b)=>sortDir*((a.benchScores[idx]||0)-(b.benchScores[idx]||0)));} const body=document.getElementById('lbBody'); body.innerHTML=sorted.map((row,i)=>{ const name=AGENT_DISPLAY[row.agent]||row.agent.replace(/_/g,' '); const model=MODEL_DISPLAY[row.model]||row.model; let rankHtml; if(i===0)rankHtml='1'; else if(i===1)rankHtml='2'; else if(i===2)rankHtml='3'; else rankHtml=`${i+1}`; function fmtPct(v){const s=(v*100).toFixed(1);return s.endsWith('.0')?s.slice(0,-2)+'%':s+'%'} let html=``; html+=`${rankHtml}`; const ver=row.version?row.version.replace(/_/g,' ').replace(/\s*�\s*/g,' · '):''; const agentUrl=AGENT_URLS[row.agent]; const agentLink=agentUrl?`${name}`:name; html+=`${agentLink}${ver?`${ver}`:''}`; const modelUrl=MODEL_URLS[row.model]; const modelLink=modelUrl?`${model}`:model; html+=`${modelLink}`; html+=`
${fmtPct(row.avg)}`; html+=`$${row.avgCost.toFixed(2)}`; row.benchScores.forEach(s=>{ html+=`
${fmtPct(s)}`; }); html+=``;return html; }).join(''); // Stagger row reveal const rows=body.querySelectorAll('tr'); rows.forEach((r,i)=>{setTimeout(()=>r.classList.add('visible'),i*60)}); head.querySelectorAll('.sortable').forEach(th=>{ th.addEventListener('click',()=>{ const col=th.dataset.col; if(sortCol===col)sortDir*=-1;else{sortCol=col;sortDir=-1;} renderTable(data); }); }); // Remove any legacy "See all" button if it exists from a previous render const oldBtn=document.getElementById('lbSeeAll'); if(oldBtn)oldBtn.remove(); // Toggle bottom-fade hint based on whether the table can still scroll down const tableWrap=document.querySelector('.table-wrap'); const updateFade=()=>{ const atBottom=tableWrap.scrollTop+tableWrap.clientHeight>=tableWrap.scrollHeight-2; const overflows=tableWrap.scrollHeight>tableWrap.clientHeight+2; tableWrap.classList.toggle('lb-can-scroll',overflows&&!atBottom); }; if(!tableWrap.dataset.scrollBound){ tableWrap.addEventListener('scroll',updateFade,{passive:true}); window.addEventListener('resize',updateFade); tableWrap.dataset.scrollBound='1'; } // Defer to next frame so layout is settled requestAnimationFrame(updateFade); } // ===== CHART ===== function renderChart(data){ if(window.renderParetoChart) window.renderParetoChart(data); } // ===== INIT ===== const _lbScript=document.currentScript; const _basePath=(_lbScript&&_lbScript.getAttribute('data-base'))||'/'; fetch(_basePath+'results.csv').then(r=>r.text()).then(text=>{ const rows=parseCSV(text); const models=[...new Set(rows.map(r=>r.model_normalized))].filter(Boolean); const sel=document.getElementById('modelFilter'); models.forEach(m=>{const opt=document.createElement('option');opt.value=m;opt.textContent=MODEL_DISPLAY[m]||m;sel.appendChild(opt)}); const data=processData(rows,'all'); window._chartData=data; renderTable(data);renderChart(data); sel.addEventListener('change',()=>{const d=processData(rows,sel.value);renderTable(d)}); }).catch(err=>console.error('CSV load failed:',err)); const _lastUpdatedEl=document.getElementById('lastUpdated'); if(_lastUpdatedEl){fetch(_basePath+'results.csv.timestamp').then(r=>r.text()).then(t=>{ const d=new Date(t.trim()); const fmt=isNaN(d)?t.trim():d.toLocaleDateString('en-US',{month:'long',day:'numeric',year:'numeric'}); _lastUpdatedEl.textContent='Last updated: '+fmt; }).catch(()=>{});}