mirror of
https://github.com/multipleof4/lynchmark.git
synced 2026-01-13 16:17:54 +00:00
156 lines
8.1 KiB
HTML
156 lines
8.1 KiB
HTML
<!DOCTYPE html>
|
||
<html lang="en">
|
||
<head>
  <meta charset="UTF-8">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <title>Lynchmark – LLM Benchmark</title>
  <meta name="description" content="Lynchmark tests LLMs by requiring correct CDN imports and library-specific implementations to solve challenging browser-based JavaScript tasks.">
  <link rel="canonical" href="https://lynchmark.com/">

  <!-- Open Graph metadata for link previews. -->
  <meta property="og:title" content="Lynchmark – LLM Benchmark">
  <meta property="og:site_name" content="Lynchmark">
  <meta property="og:description" content="Lynchmark tests LLMs by requiring correct CDN imports and library-specific implementations to solve challenging browser-based JavaScript tasks.">
  <meta property="og:type" content="website">
  <meta property="og:url" content="https://lynchmark.com/">

  <!-- schema.org structured data describing the site. -->
  <script type="application/ld+json">
  {
    "@context":"https://schema.org",
    "@type":"WebSite",
    "name":"Lynchmark",
    "url":"https://lynchmark.com/",
    "description":"Lynchmark – an automated benchmark for LLM coding abilities in a real browser+CDN environment."
  }
  </script>

  <!-- Open connections to the font origins early; font files are fetched in CORS mode, hence crossorigin. -->
  <link rel="preconnect" href="https://fonts.googleapis.com">
  <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
  <link rel="preconnect" href="https://cdn.jsdelivr.net" crossorigin>
  <link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&family=DM+Serif+Display:ital@0;1&family=IBM+Plex+Mono:wght@400;500&display=swap" rel="stylesheet">
  <script src="https://cdn.tailwindcss.com"></script>
  <style>
    /* font-display:swap keeps the intro paragraph readable in the fallback font while Stain.otf downloads. */
    @font-face{font-family:"Stain";src:url("https://cdn.jsdelivr.net/gh/multipleof4/stain.otf@master/dist/Stain.otf") format("opentype");font-display:swap}
    body{font-family:Inter,system-ui,-apple-system,Segoe UI,Roboto,Ubuntu,Cantarell,Noto Sans,sans-serif}
    .mono{font-family:"IBM Plex Mono",ui-monospace,SFMono-Regular,Menlo,monospace}
  </style>
</head>
|
||
<body class="bg-gray-50 text-gray-800">
|
||
<main class="max-w-2xl mx-auto flex flex-col min-h-screen p-6 lg:p-8">
  <header class="text-center mb-10">
    <div class="relative inline-block">
      <h1 class="text-4xl font-bold text-gray-900 mb-2">Lynchmark</h1>
      <span class="mono pointer-events-none absolute -top-2 -right-3 inline-flex items-center rounded-full border border-green-200 bg-green-50 text-green-700 text-xs leading-none font-medium px-2 py-1 shadow-sm">
        Last updated <time id="last-updated" class="ml-1"></time>
      </span>
    </div>
    <p class="text-base text-gray-600 max-w-lg mx-auto" style="font-family:Stain,sans-serif">
      This benchmark tests the model's knowledge by tasking it to import the right library from the right CDN URL path and having the pre-existing library specific knowledge to correctly implement a solution for each challenging problem for/in the browser environment using JavaScript.
    </p>
  </header>

  <!-- Populated by the module script at the end of the body: one card per model. -->
  <div id="results-container" class="flex flex-col gap-6 flex-grow"></div>

  <div class="mt-12 text-center space-y-2">
    <a href="/blog/gemini-optimal-temperature.html" class="block text-sm text-blue-500 hover:text-blue-700 font-medium mono">blog/gemini-optimal-temperature</a>
    <a href="/blog/lynchmark-newsletter-experiment.html" class="block text-sm text-blue-500 hover:text-blue-700 font-medium mono">blog/lynchmark-newsletter-experiment</a>
  </div>

  <footer class="mt-10 flex justify-center">
    <a
      href="https://github.com/multipleof4/lynchmark"
      class="inline-flex items-center gap-2 text-gray-600 hover:text-gray-900"
      target="_blank"
      rel="noopener noreferrer"
    >
      <!-- Decorative GitHub mark; the adjacent text carries the link's accessible name. -->
      <svg
        xmlns="http://www.w3.org/2000/svg"
        viewBox="0 0 16 16"
        aria-hidden="true"
        class="w-5 h-5 fill-current"
      >
        <path d="M8 0C3.58 0 0 3.58 0 8c0 3.54 2.29 6.53 5.47 7.59.4.07.55-.17.55-.38 0-.19-.01-.82-.01-1.49-2.01.37-2.53-.49-2.69-.94-.09-.23-.48-.94-.82-1.13-.28-.15-.68-.52 0-.53.63-.01 1.08.58 1.23.82.72 1.21 1.87.87 2.33.66.07-.52.28-.87.51-1.07-1.78-.2-3.64-.89-3.64-3.95 0-.87.31-1.59.82-2.15-.08-.2-.36-1.02.08-2.12 0 0 .67-.21 2.2.82.64-.18 1.32-.27 2-.27.68 0 1.36.09 2 .27 1.53-1.04 2.2-.82 2.2-.82.44 1.1.16 1.92.08 2.12.51.56.82 1.27.82 2.15 0 3.07-1.87 3.75-3.65 3.95.29.25.54.73.54 1.48 0 1.07-.01 1.93-.01 2.2 0 .21.15.46.55.38A8.012 8.012 0 0 0 16 8c0-4.42-3.58-8-8-8z"/>
      </svg>
      <!-- The sr-only hint tells assistive-technology users that the link leaves the page in a new tab. -->
      <span class="mono text-xs font-medium">@multipleof4/lynchmark<span class="sr-only"> (opens in new tab)</span></span>
    </a>
  </footer>
</main>
|
||
<script type="module">
|
||
// Shorthand for looking an element up by its id.
function get(id){
  return document.getElementById(id);
}

const now=new Date();
const updatedEl=get('last-updated');
const container=get('results-container');

// Fill the "Last updated" badge: short month + year as visible text, YYYY-MM-DD as the machine-readable value.
// NOTE(review): this is the viewer's current date, not the date the results last changed — confirm that is intended.
const monthAndYear=now.toLocaleDateString('en-US',{month:'short',year:'numeric'});
const isoDay=now.toISOString().slice(0,10);
updatedEl.textContent=monthAndYear;
updatedEl.dateTime=isoDay;
|
||
// Letter-grade table as [minimum pass ratio, label] pairs, ordered from the highest floor to the lowest.
const grades=[[.97,'A+'],[.93,'A'],[.9,'A-'],[.87,'B+'],[.83,'B'],[.8,'B-'],[.77,'C+'],[.73,'C'],[.7,'C-'],[.6,'D'],[0,'F']];

/**
 * Maps a pass ratio to a letter grade.
 * @param {number} ratio Fraction of tests passed, expected in [0, 1].
 * @returns {string} Label of the first table entry whose floor is <= ratio;
 *   'F' when no entry matches (NaN or a negative ratio) instead of throwing.
 */
const gradeOf=ratio=>grades.find(([floor])=>ratio>=floor)?.[1]??'F';
|
||
|
||
// Spinner shown in a test row while that row's output file is being fetched.
const SPINNER=`<svg class="animate-spin h-4 w-4 text-gray-400" xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 24 24"><circle class="opacity-25" cx="12" cy="12" r="10" stroke="currentColor" stroke-width="4"></circle><path class="opacity-75" fill="currentColor" d="M4 12a8 8 0 018-8V0C5.373 0 0 5.373 0 12h4zm2 5.291A7.962 7.962 0 014 12H0c0 3.042 1.135 5.824 3 7.938l3-2.647z"></path></svg>`;

/** Escapes the HTML-significant characters of `value` so it can be interpolated into an innerHTML template. */
const escapeHtml=value=>String(value).replace(/[&<>"']/g,ch=>({'&':'&amp;','<':'&lt;','>':'&gt;','"':'&quot;',"'":'&#39;'})[ch]);

/**
 * Fetches `url` and resolves with its body read through the Response method named by `as`.
 * @param {string} url Resource to fetch.
 * @param {'text'|'json'} as Name of the Response body reader to call.
 * @throws {Error} When the response status is not 2xx (e.g. a GitHub API rate-limit response).
 */
const fetchOk=async(url,as)=>{
  const res=await fetch(url);
  if(!res.ok)throw new Error(`${url} responded with HTTP ${res.status}`);
  return res[as]();
};

/**
 * Appends one row for `test` to `list`, loads the model's output file and renders its result.
 * The row is appended synchronously (before the first await), so rows keep the order of the calls.
 * @param {HTMLElement} list The model's <ul>.
 * @param {string} sModel Model name with "/" and ":" replaced by "_" (the output file's base name).
 * @param {string} test Test directory name.
 * @returns {Promise<boolean|null>} true on PASS, false when an output exists but is not PASS,
 *   null when no output file could be loaded (row shows N/A and is not counted as run).
 */
const renderTestRow=async(list,sModel,test)=>{
  const label=`<span class="font-medium text-gray-800">${escapeHtml(test)}</span>`;
  const li=document.createElement('li');
  li.className='flex items-center gap-3 text-sm';
  li.innerHTML=`${SPINNER}${label}<span class="mono text-gray-500 ml-auto">...</span>`;
  list.appendChild(li);

  // Encode path segments so names containing characters such as "#" or "?" still address the file.
  const outUrl=`./tests/${encodeURIComponent(test)}/outputs/${encodeURIComponent(sModel)}.js`;
  // A missing file or a network failure both mean "no result for this model/test pair".
  const src=await fetch(outUrl).then(r=>r.ok?r.text():null).catch(()=>null);
  if(src===null){
    li.innerHTML=`— ${label}<span class="mono text-gray-500 ml-auto">N/A</span>`;
    return null;
  }

  // Output files carry their verdict and timing as "// Result: ..." / "// Generation time: ...s" comments.
  const verdict=src.match(/\/\/ Result: (PASS|FAIL)/)?.[1];
  const status=verdict==='PASS'?'✅':verdict==='FAIL'?'❌':'❓';
  const seconds=parseFloat(src.match(/\/\/ Generation time: ([\d\.]+)s/)?.[1]);
  const timeStr=Number.isFinite(seconds)?`${seconds.toFixed(3)}s`:'N/A';
  li.innerHTML=`${status} ${label}<span class="mono text-gray-500 ml-auto">${timeStr}</span>`;
  return verdict==='PASS';
};

/**
 * Builds the results page: reads the model list from ./README (between the MODELS_START and
 * MODELS_END markers), lists the test directories through the GitHub contents API, then renders
 * one card per model with a row per test and a final score/grade row.
 * @throws {Error} When the README or the test listing cannot be loaded or has an unexpected shape.
 */
const run=async()=>{
  const readme=await fetchOk('./README','text');
  // \r?\n tolerates a README saved with CRLF line endings.
  const block=readme.match(/<!-- MODELS_START -->\r?\n([\s\S]+?)\r?\n<!-- MODELS_END -->/)?.[1];
  if(!block)throw new Error('README has no MODELS_START/MODELS_END model list');
  // Drop blank lines and stray whitespace so they do not become empty cards or broken file names.
  const models=block.split(/\r?\n/).map(line=>line.trim()).filter(Boolean);

  const testsData=await fetchOk('https://api.github.com/repos/multipleof4/lynchmark/contents/tests','json');
  if(!Array.isArray(testsData))throw new Error('GitHub contents API returned an unexpected payload for tests/');
  const tests=testsData.filter(d=>d.type==='dir').map(d=>d.name).sort();

  for(const model of models){
    const sModel=model.replace(/[\/:]/g,'_');
    const card=document.createElement('section');
    card.className='rounded-2xl border border-gray-200 bg-white shadow-sm overflow-hidden';
    // The model name is the section's heading; model and id are escaped because they come from fetched text.
    card.innerHTML=`
      <div class="bg-gray-50 px-5 py-3 border-b border-gray-200">
        <h2 class="mono text-sm text-gray-700 font-medium">${escapeHtml(model)}</h2>
      </div>
      <ul class="p-4 space-y-2" id="list-${escapeHtml(sModel)}"></ul>`;
    container.appendChild(card);
    // Query inside the card rather than by id, so duplicate model names cannot resolve to another card's list.
    const list=card.querySelector('ul');

    // Load this model's outputs concurrently; rows are still appended in test order.
    const outcomes=await Promise.all(tests.map(test=>renderTestRow(list,sModel,test)));
    const ran=outcomes.filter(outcome=>outcome!==null).length;
    const passed=outcomes.filter(outcome=>outcome===true).length;

    const ratio=ran?passed/ran:0;
    const grade=gradeOf(ratio);
    const li=document.createElement('li');
    li.className='mt-3 pt-3 border-t border-gray-200 flex items-center text-sm justify-between';
    li.innerHTML=`
      <span class="text-gray-600">Score</span>
      <span class="flex items-center gap-3">
        <span class="mono text-gray-900 font-semibold">${passed}/${ran}</span>
        <span class="inline-flex items-center rounded-full bg-gray-100 px-2 py-0.5 text-xs font-semibold text-gray-800">${grade}</span>
      </span>`;
    list.appendChild(li);
  }
};

// Surface load failures on the page instead of leaving it blank with an unhandled rejection.
run().catch(err=>{
  console.error(err);
  const notice=document.createElement('p');
  notice.className='mono text-sm text-red-600 text-center';
  notice.setAttribute('role','alert');
  notice.textContent=`Could not load benchmark results: ${err.message}`;
  container.appendChild(notice);
});
|
||
</script>
|
||
</body>
|
||
</html>
|