From 7f8931af00b4c94ab565c4db6688b7504af6408f Mon Sep 17 00:00:00 2001 From: multipleof4 Date: Thu, 13 Nov 2025 12:48:54 -0800 Subject: [PATCH] Feat: Record execution output for each test run --- scripts/run-benchmark.js | 48 +++++++++++++++++++++++++++------------- 1 file changed, 33 insertions(+), 15 deletions(-) diff --git a/scripts/run-benchmark.js b/scripts/run-benchmark.js index d5f4ed2..db01e1e 100644 --- a/scripts/run-benchmark.js +++ b/scripts/run-benchmark.js @@ -37,7 +37,10 @@ const main = async () => { .filter(d => d.isDirectory()).map(d => d.name).sort(); await Promise.all( - allTestDirs.map(dir => fs.rm(path.join(TESTS_DIR, dir, 'outputs'), { recursive: true, force: true })) + allTestDirs.flatMap(dir => [ + fs.rm(path.join(TESTS_DIR, dir, 'outputs'), { recursive: true, force: true }), + fs.rm(path.join(TESTS_DIR, dir, 'results'), { recursive: true, force: true }) + ]) ); const testsToRun = allTestDirs.slice(0, Math.ceil(allTestDirs.length * (percentage / 100))); @@ -49,26 +52,41 @@ const main = async () => { genData[modelSpec] = {}; for (const dir of testsToRun) { - const { prompt, functionName } = (await import(pathToFileURL(path.join(TESTS_DIR, dir, 'test.js')))).default; - console.log(`Generating ${dir} for ${modelSpec}...`); - const result = await getLlmCode( - `${sharedPrompt}\n\n${prompt.trim()}`, - model, - functionName, - temperature - ); + const { prompt, functionName, runTest } = (await import(pathToFileURL(path.join(TESTS_DIR, dir, 'test.js')))).default; - genData[modelSpec][dir] = result?.duration ?? null; - if (!result) continue; - + console.log(`Generating ${dir} for ${modelSpec}...`); + const genResult = await getLlmCode(`${sharedPrompt}\n\n${prompt.trim()}`, model, functionName, temperature); + + const sModel = modelSpec.replace(/[\/:]/g, '_'); const outDir = path.join(TESTS_DIR, dir, 'outputs'); await fs.mkdir(outDir, { recursive: true }); - const fname = `${modelSpec.replace(/[\/:]/g, '_')}.js`; - await fs.writeFile(path.join(outDir, fname), result.code); + const fpath = path.join(outDir, `${sModel}.js`); + + let passed = false, error = 'Code generation failed', output = null; + + if (genResult) { + await fs.writeFile(fpath, genResult.code); + try { + console.log(`Testing ${dir} for ${modelSpec}...`); + output = await runTest((await import(pathToFileURL(fpath))).default); + passed = true; + error = null; + } catch (e) { + console.error(`Test failed for ${modelSpec} on ${dir}: ${e.message}`); + error = e.message || String(e); + } + } + + const resultsDir = path.join(TESTS_DIR, dir, 'results'); + await fs.mkdir(resultsDir, { recursive: true }); + const resultsFpath = path.join(resultsDir, `${sModel}.json`); + await fs.writeFile(resultsFpath, JSON.stringify({ output, error }, null, 2)); + + genData[modelSpec][dir] = { duration: genResult?.duration ?? null, passed, error }; } } await fs.writeFile(RESULTS_PATH, JSON.stringify(genData, null, 2)); - console.log('Code generation and results.json update complete.'); + console.log('Benchmark complete. Results saved to results.json.'); }; main().catch(console.error);