From e165d70b0592fdf27df1f9971392ea42c37bebd9 Mon Sep 17 00:00:00 2001 From: Kristiyan Kostadinov Date: Wed, 17 Dec 2025 14:25:54 +0200 Subject: [PATCH 1/2] fix: set correct model for custom analysis Currently we're using the same model for the custom categories as for the eval itself. This is incorrect because the model may not be available. These changes add the option to set the model and default to Gemini 2.5 Flash Lite. --- runner/configuration/constants.ts | 3 +++ runner/configuration/environment-config.ts | 1 + runner/configuration/environment.ts | 5 ++++- runner/orchestration/generate-summary.ts | 6 +++--- runner/reporting/report-ai-summary.ts | 3 ++- 5 files changed, 13 insertions(+), 5 deletions(-) diff --git a/runner/configuration/constants.ts b/runner/configuration/constants.ts index 19c555c..f74c5bf 100644 --- a/runner/configuration/constants.ts +++ b/runner/configuration/constants.ts @@ -17,6 +17,9 @@ export const DEFAULT_MODEL_NAME = 'gemini-2.5-pro'; // slower than `flash`, but */ export const DEFAULT_AUTORATER_MODEL_NAME = 'gemini-2.5-flash'; // use less expensive model +/** Model used for AI summarization by default. */ +export const DEFAULT_SUMMARY_MODEL = 'gemini-2.5-flash-lite'; + /** Name of the root folder where we store LLM-generated code for debugging */ export const LLM_OUTPUT_DIR = join(rootDir, 'llm-output'); diff --git a/runner/configuration/environment-config.ts b/runner/configuration/environment-config.ts index ab7ef39..fefd21c 100644 --- a/runner/configuration/environment-config.ts +++ b/runner/configuration/environment-config.ts @@ -108,6 +108,7 @@ export const environmentConfigSchema = z.object({ z.object({ name: z.string(), path: z.string(), + model: z.string().optional(), reportsFilter: z .enum([ReportContextFilter.AllReports, ReportContextFilter.NonPerfectReports]) .optional(), diff --git a/runner/configuration/environment.ts b/runner/configuration/environment.ts index c6abbc3..fb46334 100644 --- a/runner/configuration/environment.ts +++ b/runner/configuration/environment.ts @@ -18,6 +18,7 @@ import {EnvironmentConfig} from './environment-config.js'; import {EvalPromptWithMetadata, MultiStepPrompt} from './prompts.js'; import {renderPromptTemplate} from './prompt-templating.js'; import {getSha256Hash} from '../utils/hashing.js'; +import {DEFAULT_SUMMARY_MODEL} from './constants.js'; interface CategoryConfig { name: string; @@ -27,6 +28,7 @@ interface CategoryConfig { interface AnalysisPrompt { name: string; prompt: string; + model: string; reportsFilter: ReportContextFilter; ratingsFilter: RatingContextFilter; } @@ -463,12 +465,13 @@ export class Environment { private resolveAnalysisPrompts(config: EnvironmentConfig): AnalysisPrompt[] { const result: AnalysisPrompt[] = []; - config.analysisPrompts?.forEach(({name, path, reportsFilter, ratingsFilter}) => { + config.analysisPrompts?.forEach(({name, path, model, reportsFilter, ratingsFilter}) => { const prompt = this.renderEnvironmentPrompt(path).result; result.push({ name, prompt, + model: model || DEFAULT_SUMMARY_MODEL, reportsFilter: reportsFilter ?? ReportContextFilter.NonPerfectReports, ratingsFilter: ratingsFilter ?? RatingContextFilter.NonPerfectRatings, }); diff --git a/runner/orchestration/generate-summary.ts b/runner/orchestration/generate-summary.ts index 27ba1d0..38da6c5 100644 --- a/runner/orchestration/generate-summary.ts +++ b/runner/orchestration/generate-summary.ts @@ -12,7 +12,7 @@ import {AssessmentResult, CompletionStats, RunSummary} from '../shared-interface export async function prepareSummary( generateAiSummaryLlm: GenkitRunner | null, abortSignal: AbortSignal, - model: string, + evalRunModel: string, env: Environment, assessments: AssessmentResult[], completionStats: CompletionStats, @@ -75,7 +75,7 @@ export async function prepareSummary( abortSignal, assessments, [], - model, + config.model, { reportContextFilter: config.reportsFilter, ratingContextFilter: config.ratingsFilter, @@ -101,7 +101,7 @@ export async function prepareSummary( const executorInfo = await env.executor.getExecutorInfo?.(); return { - model, + model: evalRunModel, environmentId: env.id, displayName: env.displayName, framework: { diff --git a/runner/reporting/report-ai-summary.ts b/runner/reporting/report-ai-summary.ts index 872f79e..b6918fc 100644 --- a/runner/reporting/report-ai-summary.ts +++ b/runner/reporting/report-ai-summary.ts @@ -1,4 +1,5 @@ import {GenkitRunner} from '../codegen/genkit/genkit-runner.js'; +import {DEFAULT_SUMMARY_MODEL} from '../configuration/constants.js'; import {AssessmentResult, ReportContextFilter, RatingContextFilter} from '../shared-interfaces.js'; import {chatWithReportAI} from './report-ai-chat.js'; @@ -7,7 +8,7 @@ export async function summarizeReportWithAI( abortSignal: AbortSignal, assessments: AssessmentResult[], ) { - const model = 'gemini-2.5-flash-lite'; + const model = DEFAULT_SUMMARY_MODEL; if (!llm.getSupportedModels().includes(model)) { throw new Error(`Unable to generate AI summary due to unsupported model: ${model}`); From 401694d79994782db8f9e36f4f42d03bf144c2bf Mon Sep 17 00:00:00 2001 From: Kristiyan Kostadinov Date: Wed, 17 Dec 2025 14:26:51 +0200 Subject: [PATCH 2/2] build: bump version --- package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package.json b/package.json index d5845f6..470d5cf 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "web-codegen-scorer", - "version": "0.0.54", + "version": "0.0.55", "scripts": { "build-runner": "tsc", "release-build": "tsx ./scripts/release-build.ts",