Files
lynchmark/index.html

193 lines
10 KiB
HTML
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="UTF-8">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <title>Lynchmark LLM Benchmark</title>

  <!-- Search / social metadata -->
  <meta property="og:title" content="Lynchmark LLM Benchmark">
  <meta property="og:site_name" content="Lynchmark">
  <meta name="description" content="Lynchmark tests LLMs by requiring correct CDN imports and library-specific implementations to solve challenging browser-based JavaScript tasks.">
  <meta property="og:description" content="Lynchmark tests LLMs by requiring correct CDN imports and library-specific implementations to solve challenging browser-based JavaScript tasks.">
  <meta property="og:type" content="website">
  <meta property="og:url" content="https://lynchmark.pages.dev/">
  <link rel="canonical" href="https://lynchmark.pages.dev/">
  <script type="application/ld+json">
  {
    "@context":"https://schema.org",
    "@type":"WebSite",
    "name":"Lynchmark",
    "url":"https://lynchmark.pages.dev/",
    "description":"Lynchmark is an automated benchmark for LLM coding abilities in a real browser+CDN environment."
  }
  </script>

  <!-- Open connections to both Google Fonts origins early: the stylesheet comes from
       fonts.googleapis.com, the font files from fonts.gstatic.com (CORS, hence crossorigin). -->
  <link rel="preconnect" href="https://fonts.googleapis.com">
  <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
  <!-- NOTE(review): DM Serif Display is requested here but no rule in this file uses it; confirm before dropping it from the URL. -->
  <link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&family=DM+Serif+Display:ital@0;1&family=IBM+Plex+Mono:wght@400;500&display=swap" rel="stylesheet">
  <!-- NOTE(review): this is the Tailwind Play CDN build, which Tailwind's docs describe as intended for development; consider a compiled stylesheet. -->
  <script src="https://cdn.tailwindcss.com"></script>
  <style>
    /* Base UI font; utility classes from Tailwind handle everything else. */
    body{font-family:Inter,system-ui,-apple-system,Segoe UI,Roboto,Ubuntu,Cantarell,Noto Sans,sans-serif}
    /* Monospace helper used for model names, timings and badges. */
    .mono{font-family:"IBM Plex Mono",ui-monospace,SFMono-Regular,Menlo,monospace}
  </style>
</head>
<body class="bg-gray-50 text-gray-800">
<main class="max-w-2xl mx-auto flex flex-col min-h-screen p-6 lg:p-8">
<header class="text-center mb-10">
<!-- Page title with a floating "Last updated" badge. The <time> element is empty in the markup;
     the module script at the end of <body> sets its text and datetime attribute on load. -->
<div class="relative inline-block">
<h1 class="text-4xl font-bold text-gray-900 mb-2">Lynchmark</h1>
<span class="mono pointer-events-none absolute -top-2 -right-3 inline-flex items-center rounded-full border border-green-200 bg-green-50 text-green-700 text-[10px] leading-none font-medium px-1.5 py-0.5 shadow-sm">
Last updated <time id="last-updated" class="ml-1"></time>
</span>
</div>
<p class="text-base text-gray-600 max-w-lg mx-auto">
This benchmark tests a model's knowledge by tasking it with importing the right library from the right CDN URL and then relying on its existing library-specific knowledge to correctly implement a solution to each challenging problem in JavaScript, in the browser environment.
</p>
</header>
<!-- Filled at runtime: the module script below appends one result card (<section>) per model here. -->
<div id="results-container" class="flex flex-col gap-6 flex-grow">
</div>
<footer class="mt-10 flex justify-center">
  <!-- Repository link. It opens in a new tab, so a visually hidden hint announces that to screen-reader users. -->
  <a
    href="https://github.com/multipleof4/lynchmark"
    class="inline-flex items-center gap-2 text-gray-600 hover:text-gray-900"
    target="_blank"
    rel="noopener noreferrer"
  >
    <!-- Decorative GitHub mark; the adjacent text carries the link's accessible name. -->
    <svg
      xmlns="http://www.w3.org/2000/svg"
      viewBox="0 0 16 16"
      aria-hidden="true"
      class="w-5 h-5 fill-current"
    >
      <path d="M8 0C3.58 0 0 3.58 0 8c0 3.54 2.29 6.53 5.47 7.59.4.07.55-.17.55-.38
        0-.19-.01-.82-.01-1.49-2.01.37-2.53-.49-2.69-.94-.09-.23-.48-.94-.82-1.13-.28-.15-.68-.52
        0-.53.63-.01 1.08.58 1.23.82.72 1.21 1.87.87 2.33.66.07-.52.28-.87.51-1.07-1.78-.2-3.64-.89-3.64-3.95
        0-.87.31-1.59.82-2.15-.08-.2-.36-1.02.08-2.12 0 0 .67-.21 2.2.82.64-.18 1.32-.27 2-.27.68 0
        1.36.09 2 .27 1.53-1.04 2.2-.82 2.2-.82.44 1.1.16 1.92.08 2.12.51.56.82 1.27.82 2.15
        0 3.07-1.87 3.75-3.65 3.95.29.25.54.73.54 1.48 0 1.07-.01 1.93-.01 2.2
        0 .21.15.46.55.38A8.012 8.012 0 0 0 16 8c0-4.42-3.58-8-8-8z"/>
    </svg>
    <span class="mono text-xs font-medium">@multipleof4/lynchmark</span>
    <span class="sr-only">(opens in a new tab)</span>
  </a>
</footer>
</main>
<script type="module">
// Shorthand for document.getElementById: returns the element with the given id,
// or whatever getElementById returns when nothing matches.
function get(id){
  return document.getElementById(id);
}
// HTML-escapes a value for interpolation into markup: the value is stringified
// (null/undefined become ''), then &, <, >, " and ' are replaced by entities.
const esc=s=>{
  const entities={'&':'&amp;','<':'&lt;','>':'&gt;','"':'&quot;',"'":'&#39;'};
  const text=String(s??'');
  return text.replace(/[&<>"']/g,ch=>entities[ch]);
};
// Element that run() appends each model's result card to.
const container=get('results-container');
// <time> element inside the header's "Last updated" badge.
const updatedEl=get('last-updated');
// NOTE(review): the badge is stamped with the visitor's current date at page load,
// not with a date taken from the benchmark data — confirm this is intended.
const now=new Date();
// Date part (YYYY-MM-DD, UTC) of the ISO timestamp, for the machine-readable attribute.
const [isoDay]=now.toISOString().split('T');
Object.assign(updatedEl,{
  // Human-readable label: abbreviated month plus year in en-US formatting.
  textContent:now.toLocaleDateString('en-US',{year:'numeric',month:'short'}),
  dateTime:isoDay,
});
const run=async()=>{
const readme=await fetch('./README').then(r=>r.text());
const genTimes=await fetch('./results.json').then(r=>r.json());
const models=readme.match(/<!-- MODELS_START -->\n([\s\S]+?)\n<!-- MODELS_END -->/)[1].trim().split('\n');
const tests=[...new Set(Object.values(genTimes).flatMap(Object.keys))].sort();
const stats=[];
for(const model of models){
const sModel=model.replace(/[\/:]/g,'_');
const card=document.createElement('section');
card.className='rounded-2xl border border-gray-200 bg-white shadow-sm overflow-hidden';
card.innerHTML=`
<div class="bg-gray-900 text-white px-5 py-4 flex flex-col gap-2">
<div class="flex items-center justify-between gap-4">
<p class="mono text-sm font-medium truncate">${model}</p>
<span class="mono text-xs font-semibold px-3 py-1 rounded-full bg-white/10 border border-white/20" id="summary-${sModel}">Scoring…</span>
</div>
<p class="text-xs text-white/70" id="badge-${sModel}">Benchmarking ${tests.length} tasks</p>
</div>
<div class="p-4 overflow-x-auto">
<table class="w-full text-sm text-left border-separate border-spacing-y-1">
<thead>
<tr class="text-gray-500 uppercase text-xs">
<th class="px-3 py-2 font-semibold">Test</th>
<th class="px-3 py-2 font-semibold">Result</th>
<th class="px-3 py-2 font-semibold text-right">Time</th>
</tr>
</thead>
<tbody id="body-${sModel}"></tbody>
</table>
</div>`;
container.appendChild(card);
const body=get(`body-${sModel}`);
const summaryEl=get(`summary-${sModel}`);
let passes=0,attempted=0;
const durations=[];
for(const test of tests){
const row=document.createElement('tr');
row.className='bg-gray-50 text-gray-800 rounded-xl shadow-sm';
row.innerHTML=`
<td class="px-3 py-2 font-medium flex items-center gap-2">
<svg class="animate-spin h-4 w-4 text-gray-400" xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 24 24">
<circle class="opacity-25" cx="12" cy="12" r="10" stroke="currentColor" stroke-width="4"></circle>
<path class="opacity-75" fill="currentColor" d="M4 12a8 8 0 018-8V0C5.373 0 0 5.373 0 12h4zm2 5.291A7.962 7.962 0 014 12H0c0 3.042 1.135 5.824 3 7.938l3-2.647z"></path>
</svg>
<span>${test}</span>
</td>
<td class="px-3 py-2 mono text-gray-500">Running…</td>
<td class="px-3 py-2 mono text-gray-500 text-right">…</td>`;
body.appendChild(row);
const time=genTimes[model]?.[test];
if(time==null){
row.innerHTML=`
<td class="px-3 py-2 font-medium flex items-center gap-2 text-gray-500">— <span>${test}</span></td>
<td class="px-3 py-2 mono text-gray-500">Not run</td>
<td class="px-3 py-2 mono text-gray-500 text-right">N/A</td>`;
continue;
}
attempted++;
let status='✅',error=null;
try{
const testP=(async()=>{
const tMod=await import(`./tests/${test}/test.js`);
const lMod=await import(`./tests/${test}/outputs/${sModel}.js`);
await tMod.default.runTest(lMod.default);
})();
await Promise.race([testP,new Promise((_,r)=>setTimeout(()=>r(new Error('Timeout')),12000))]);
passes++;
}catch(e){
console.error(`${model} - ${test}:`,e);
status='❌';
error=e.message||'Failed';
}
if(typeof time==='number')durations.push(time);
const timeStr=typeof time==='number'?`${time.toFixed(3)}s`:'N/A';
const detail=status==='✅'?'Passed':`Failed ${esc(error)}`;
row.innerHTML=`
<td class="px-3 py-2 font-semibold flex items-center gap-2 ${status==='✅'?'text-green-600':'text-red-600'}">
<span>${status}</span><span class="text-gray-900">${test}</span>
</td>
<td class="px-3 py-2 text-gray-700">${detail}</td>
<td class="px-3 py-2 mono text-right text-gray-900">${timeStr}</td>`;
}
const avg=durations.length?(durations.reduce((a,b)=>a+b,0)/durations.length):null;
const totalBase=attempted||tests.length;
const passRate=totalBase?Math.round((passes/totalBase)*100):0;
const score=Math.max(0,Math.round(passes*100-(avg??120)*10));
const summary=`${passes}/${totalBase} passed · Avg ${avg?avg.toFixed(2)+'s':'N/A'} · ${score} pts`;
summaryEl.textContent=summary;
get(`badge-${sModel}`).textContent=`${attempted} tests attempted · ${passRate}% pass rate`;
const overall=document.createElement('tr');
overall.className='bg-amber-50 text-amber-900 font-semibold';
overall.innerHTML=`
<td class="px-3 py-2 flex items-center gap-2">
<span>Σ</span><span>Overall</span>
</td>
<td class="px-3 py-2">Pass rate ${passRate}% (${passes}/${totalBase})</td>
<td class="px-3 py-2 mono text-right">${avg?avg.toFixed(2)+'s':'N/A'} · ${score} pts</td>`;
body.appendChild(overall);
stats.push({model,score,avg:avg??Number.POSITIVE_INFINITY,passes,summaryEl});
}
if(stats.length){
const best=[...stats].sort((a,b)=>b.score-a.score||a.avg-b.avg||b.passes-a.passes)[0];
best.summaryEl.innerHTML=`🏆 ${best.summaryEl.textContent}`;
best.summaryEl.classList.add('bg-amber-100','text-amber-700','border','border-amber-300');
}
};
run();
</script>
</body>
</html>