diff --git a/frontend/scene-generator/llm-client.js b/frontend/scene-generator/llm-client.js index 0750d93..8bfbc96 100644 --- a/frontend/scene-generator/llm-client.js +++ b/frontend/scene-generator/llm-client.js @@ -1,348 +1,803 @@ const http = require("http"); const https = require("https"); +const { validateSceneIdCandidate } = require("./generator-runner"); -const SYSTEM_PROMPT = `你是一个场景信息提取助手。根据场景目录的内容,提取 scene-id 和 scene-name。 +const SYSTEM_PROMPT = `You analyze a scene source directory and return compact JSON with sceneId and sceneName only. +Rules: +- sceneId: lowercase kebab-case, business meaningful +- never return a numeric-only, version-only, or placeholder sceneId such as 2-0, scene, report +- sceneName: concise display name +- return JSON only`; -scene-id 规则: -- 使用英文短横线连接,如 tq-lineloss-report -- 全小写,有业务含义 +const MAX_DEEP_PROMPT_CHARS = 60000; +const MAX_JSON_SECTION_CHARS = 8000; +const JSON_REPAIR_MAX_CHARS = 24000; +const DEFAULT_REQUEST_TIMEOUT_MS = 90000; +const DEEP_REQUEST_TIMEOUT_MS = 150000; +const JSON_REPAIR_TIMEOUT_MS = 120000; -scene-name 规则: -- 使用中文,简短描述性名称 -- 如 "台区线损报表"、"知乎热榜导出" - -请以 JSON 格式返回:{"sceneId": "...", "sceneName": "..."}`; - -const DEEP_SYSTEM_PROMPT = `你是一个场景代码分析专家。分析场景源码,提取关键业务信息。 - -## 分析目标 - -1. **多模式识别** (关键): - - 查找条件分支逻辑 (if/switch) 中基于 period_mode、reportType 等字段的分支 - - 识别不同分支对应的 API 端点、列定义、请求格式 - - 如果发现多模式,使用 modes 数组格式输出 - -2. **API 端点**: 识别所有 HTTP 请求地址 (URL, method, contentType, 用途) - - 从 \$.ajax/fetch 调用中提取 contentType - - 检测请求格式: application/json 或 application/x-www-form-urlencoded - -3. **请求模板**: 识别请求参数结构 - - 提取硬编码的分页参数 (rows, page, sidx, sord) - - 识别模板变量如 \${args.org_code} - -4. **数据归一化**: 识别数据处理规则 - - 查找数据渲染/表格填充逻辑 - - 检测数据验证条件 (哪些字段不能为空) - -5. **响应路径**: 识别数据在响应中的位置 - - 如 response.content 或 response.data - -## 输出格式 - -### 单模式场景 (无 modes 数组): +const DEEP_SYSTEM_PROMPT = `You are a workflow-semantics extractor for scene skill compilation. +Return JSON only. +Use the provided deterministic signals as hard facts. +Do not invent endpoints, domains, parameters, or workflow steps that are not supported by the input. +Prefer filling missing semantics, labels, descriptions, responsePath, requestTemplate shape, normalizeRules, workflowArchetype, confidence, uncertainties, evidence, and readiness notes. +Schema: { "sceneId": "string", "sceneName": "string", - "sceneKind": "report_collection | monitoring", - "expectedDomain": "string", - "targetUrl": "string", - "apiEndpoints": [{"name": "", "url": "", "method": "POST"}], - "staticParams": {"key": "value"}, - "columnDefs": [["fieldName", "中文列名"]] -} - -### 多模式场景 (有 modes 数组): -{ - "sceneId": "tq-lineloss-report", - "sceneName": "台区线损报表", - "sceneKind": "report_collection", - "modes": [ + "sceneKind": "report_collection|monitoring", + "workflowArchetype": "single_request_table|multi_mode_request|paginated_enrichment|page_state_eval", + "bootstrap": { + "expectedDomain": "string", + "targetUrl": "string", + "requiresTargetPage": true, + "pageTitleKeywords": ["string"], + "source": "llm" + }, + "params": [ { - "name": "month", - "label": "月度报表", - "condition": {"field": "period_mode", "operator": "equals", "value": "month"}, - "apiEndpoint": { - "name": "月度线损查询", - "url": "http://...", - "method": "POST", - "contentType": "application/x-www-form-urlencoded" - }, - "columnDefs": [["ORG_NAME", "供电单位"], ...], - "requestTemplate": {"orgno": "\${args.org_code}", "rows": 1000, "page": 1}, - "normalizeRules": {"type": "validate_all_columns", "filterNull": true}, - "responsePath": "content" - }, - { - "name": "week", - "label": "周报表", - "condition": {"field": "period_mode", "operator": "equals", "value": "week"}, - "apiEndpoint": {...}, - "columnDefs": [...], - ... + "name": "string", + "resolver": "dictionary_entity|month_week_period|fixed_enum|literal_passthrough", + "required": true, + "promptMissing": "string", + "promptAmbiguous": "string", + "resolverConfig": {} } ], - "defaultMode": "month", - "modeSwitchField": "period_mode" -} + "modes": [ + { + "name": "string", + "label": "string", + "condition": { "field": "string", "operator": "equals", "value": "string" }, + "apiEndpoint": { "name": "string", "url": "string", "method": "POST", "contentType": "string", "description": "string" }, + "columnDefs": [["field", "label"]], + "requestTemplate": {}, + "normalizeRules": { "type": "validate_required", "requiredFields": ["string"], "filterNull": true }, + "responsePath": "string" + } + ], + "defaultMode": "string|null", + "modeSwitchField": "string|null", + "workflowSteps": [ + { "type": "request|paginate|secondary_request|filter|transform|export|page_state", "entry": "string", "endpoint": "string", "expr": "string", "description": "string" } + ], + "workflowEvidence": { + "requestEntries": ["string"], + "paginationFields": ["string"], + "secondaryRequestEntries": ["string"], + "postProcessSteps": ["string"] + }, + "requestTemplate": {}, + "responsePath": "string", + "normalizeRules": { "type": "validate_required", "requiredFields": ["string"], "filterNull": true }, + "artifactContract": { "type": "report-artifact", "successStatus": ["ok"], "failureStatus": ["blocked", "error"] }, + "validationHints": { "requiresTargetPage": true, "runtimeCompatible": true, "manualCompletionRequired": false, "missingPieces": ["string"] }, + "evidence": [{ "kind": "llm_semantic", "summary": "string", "source": "llm", "confidence": 0.0 }], + "readiness": { "level": "A|B|C", "confidence": 0.0, "risks": ["string"], "missingPieces": ["string"], "notes": ["string"] }, + "apiEndpoints": [{ "name": "string", "url": "string", "method": "POST", "contentType": "string", "description": "string" }], + "staticParams": {}, + "columnDefs": [["field", "label"]], + "confidence": 0.0, + "uncertainties": ["string"] +}`; -**重要**: 如果发现代码中有基于 period_mode 的 if/switch 分支,必须使用多模式格式输出!`; +const JSON_REPAIR_SYSTEM_PROMPT = `You repair malformed JSON. +Rules: +- Return JSON only. +- Keep the original data as much as possible. +- Do not add explanations or markdown fences. +- Fix syntax only: commas, quotes, brackets, braces, trailing commas, duplicated fence text. +- If a field is unrecoverable, keep it empty instead of inventing content.`; function buildAnalyzePrompt(sourceDir, dirContents) { const parts = []; - - parts.push(`=== 目录结构 ===`); + parts.push(`Source: ${sourceDir}`); + parts.push("Directory tree:"); parts.push(dirContents.tree || "(empty)"); if (dirContents["scene.toml"]) { - parts.push(`\n=== scene.toml ===`); - parts.push(dirContents["scene.toml"]); + parts.push("\nscene.toml:"); + parts.push(truncate(dirContents["scene.toml"], 3000)); } if (dirContents["SKILL.toml"]) { - parts.push(`\n=== SKILL.toml ===`); - parts.push(dirContents["SKILL.toml"]); + parts.push("\nSKILL.toml:"); + parts.push(truncate(dirContents["SKILL.toml"], 2000)); } if (dirContents["SKILL.md"]) { - parts.push(`\n=== SKILL.md ===`); - parts.push(dirContents["SKILL.md"]); + parts.push("\nSKILL.md:"); + parts.push(truncate(dirContents["SKILL.md"], 2000)); } - if (dirContents.scripts && Object.keys(dirContents.scripts).length > 0) { - parts.push(`\n=== 脚本文件 ===`); - for (const [name, content] of Object.entries(dirContents.scripts)) { - parts.push(`\n--- ${name} ---`); - const contentStr = typeof content === 'string' ? content : String(content || ''); - parts.push(contentStr.substring(0, 2000)); + const keyScripts = Object.entries(dirContents.scripts || {}).slice(0, 6); + if (keyScripts.length > 0) { + parts.push("\nKey scripts:"); + for (const [name, content] of keyScripts) { + parts.push(`--- ${name} ---`); + parts.push(truncate(content, 1200)); } } - return `以下是场景目录 "${sourceDir}" 的内容:\n\n${parts.join("\n")}\n\n请以 JSON 格式返回:{"sceneId": "...", "sceneName": "..."}`; + return `${parts.join("\n")}\n\nReturn JSON only: {"sceneId":"...","sceneName":"..."}`; } -function buildDeepAnalyzePrompt(sourceDir, dirContents, indexHtmlContent) { +function buildDeepAnalyzePrompt(sourceDir, dirContents) { + const context = dirContents.analysisContext || {}; + const deterministic = dirContents.deterministic || {}; const parts = []; - parts.push(`=== 目录结构 ===`); - parts.push(dirContents.tree || "(empty)"); + parts.push(`Source: ${sourceDir}`); + parts.push("Directory summary:"); + parts.push(stringifyForPrompt(compactDirectorySummary(context.directorySummary || {}), 4000)); - if (dirContents["scene.toml"]) { - parts.push(`\n=== scene.toml ===`); - parts.push(dirContents["scene.toml"]); + parts.push("\nDeterministic candidate Scene IR:"); + parts.push(stringifyForPrompt(compactDeterministicSceneIr(stripForPrompt(deterministic)), MAX_JSON_SECTION_CHARS)); + + parts.push("\nBootstrap hints:"); + parts.push(stringifyForPrompt((context.bootstrapHints || []).slice(0, 6), 2000)); + + pushFragments(parts, "index.html chunks", context.indexHtmlChunks, 2); + pushFragments(parts, "URL-bearing fragments", context.urlFragments, 8); + pushFragments(parts, "Request-construction fragments", context.requestFragments, 8); + pushFragments(parts, "Branching fragments", context.branchingFragments, 6); + pushFragments(parts, "Response/normalization fragments", context.responseFragments, 6); + pushFragments(parts, "Export fragments", context.exportFragments, 4); + + parts.push(` +Instructions: +- Keep deterministic facts unless there is very strong contrary evidence in the snippets. +- Fill in semantic descriptions, labels, workflow steps, mode intent, readiness notes, and missing request/response structure. +- Never downgrade a Chinese business scene into a low-entropy sceneId such as 2-0, 1-0, scene, or report. +- If unsure, leave fields empty or add an uncertainty instead of guessing. +- Preserve the schema shape exactly. +- Return JSON only.`); + + return truncate(parts.join("\n"), MAX_DEEP_PROMPT_CHARS); +} + +function pushFragments(parts, title, fragments, limit) { + parts.push(`\n${title}:`); + const selected = Array.isArray(fragments) ? fragments.slice(0, limit) : []; + if (!selected.length) { + parts.push("[]"); + return; } - if (dirContents["SKILL.toml"]) { - parts.push(`\n=== SKILL.toml ===`); - parts.push(dirContents["SKILL.toml"]); + for (const fragment of selected) { + parts.push( + stringifyForPrompt( + { + path: fragment.path, + lineStart: fragment.lineStart, + lineEnd: fragment.lineEnd, + snippet: truncate(fragment.snippet || fragment.content, 1200), + }, + 1400 + ) + ); } +} - if (dirContents["SKILL.md"]) { - parts.push(`\n=== SKILL.md ===`); - parts.push(dirContents["SKILL.md"]); - } +function stringifyForPrompt(value, maxChars) { + return truncate(JSON.stringify(value, null, 2), maxChars); +} - // Include index.html content (key addition) - if (indexHtmlContent && typeof indexHtmlContent === 'string') { - parts.push(`\n=== index.html ===`); - // Limit to first 15000 chars to avoid token limits - parts.push(indexHtmlContent.substring(0, 15000)); - } +function compactDirectorySummary(summary) { + const files = Array.isArray(summary.files) ? summary.files.slice(0, 24) : []; + return { + sourceDir: summary.sourceDir || "", + tree: truncate(summary.tree || "", 2500), + files, + fileCount: Array.isArray(summary.files) ? summary.files.length : files.length, + }; +} - if (dirContents.scripts && Object.keys(dirContents.scripts).length > 0) { - parts.push(`\n=== 脚本文件 ===`); - for (const [name, content] of Object.entries(dirContents.scripts)) { - parts.push(`\n--- ${name} ---`); - const contentStr = typeof content === 'string' ? content : String(content || ''); - parts.push(contentStr.substring(0, 3000)); +function compactDeterministicSceneIr(sceneIr) { + const value = sceneIr && typeof sceneIr === "object" ? JSON.parse(JSON.stringify(sceneIr)) : {}; + value.evidence = Array.isArray(value.evidence) ? value.evidence.slice(0, 8) : []; + value.apiEndpoints = Array.isArray(value.apiEndpoints) ? value.apiEndpoints.slice(0, 8) : []; + value.params = Array.isArray(value.params) ? value.params.slice(0, 8) : []; + value.modes = Array.isArray(value.modes) ? value.modes.slice(0, 4) : []; + value.workflowSteps = Array.isArray(value.workflowSteps) ? value.workflowSteps.slice(0, 8) : []; + value.columnDefs = Array.isArray(value.columnDefs) ? value.columnDefs.slice(0, 12) : []; + value.uncertainties = Array.isArray(value.uncertainties) ? value.uncertainties.slice(0, 8) : []; + value.readiness = value.readiness && typeof value.readiness === "object" + ? { + level: value.readiness.level, + confidence: value.readiness.confidence, + risks: Array.isArray(value.readiness.risks) ? value.readiness.risks.slice(0, 8) : [], + missingPieces: Array.isArray(value.readiness.missingPieces) ? value.readiness.missingPieces.slice(0, 8) : [], + notes: Array.isArray(value.readiness.notes) ? value.readiness.notes.slice(0, 6) : [], + } + : value.readiness; + return value; +} + +function truncate(text, maxLength) { + const value = typeof text === "string" ? text : String(text || ""); + return value.length > maxLength ? value.slice(0, maxLength) : value; +} + +function stripForPrompt(sceneIr) { + if (!sceneIr || typeof sceneIr !== "object") return {}; + const clone = JSON.parse(JSON.stringify(sceneIr)); + delete clone.deterministicSignals; + return clone; +} + +function requestChatCompletion( + messages, + { apiKey, baseUrl, model, maxTokens = 1024, timeoutMs = DEFAULT_REQUEST_TIMEOUT_MS } +) { + const requestBody = JSON.stringify({ + model, + messages, + temperature: 0.1, + max_tokens: maxTokens, + }); + + return new Promise((resolve, reject) => { + const url = new URL(baseUrl.replace(/\/v1\/?$/, "") + "/v1/chat/completions"); + const options = { + hostname: url.hostname, + port: url.port || (url.protocol === "https:" ? 443 : 80), + path: url.pathname, + method: "POST", + headers: { + "Content-Type": "application/json", + Authorization: `Bearer ${apiKey}`, + "Content-Length": Buffer.byteLength(requestBody), + }, + }; + + const transport = url.protocol === "https:" ? https : http; + const req = transport.request(options, (res) => { + let data = ""; + res.on("data", (chunk) => { + data += chunk; + }); + res.on("end", () => { + if (res.statusCode !== 200) { + reject(new Error(`LLM API error ${res.statusCode}: ${data}`)); + return; + } + + try { + const parsed = JSON.parse(data); + const content = parsed.choices?.[0]?.message?.content; + if (!content) { + reject(new Error("LLM returned empty response")); + return; + } + resolve(content); + } catch (error) { + reject(new Error(`Failed to parse LLM transport response: ${error.message}`)); + } + }); + }); + + req.on("error", reject); + req.setTimeout(timeoutMs, () => { + req.destroy(new Error("LLM API request timed out")); + }); + + req.write(requestBody); + req.end(); + }); +} + +async function requestChatCompletionWithRetry(messages, options) { + const maxAttempts = Number.isFinite(options?.retryAttempts) ? options.retryAttempts : 2; + let lastError = null; + + for (let attempt = 1; attempt <= maxAttempts; attempt += 1) { + try { + return await requestChatCompletion(messages, options); + } catch (error) { + lastError = error; + if (!isRetryableLlmError(error) || attempt === maxAttempts) { + throw error; + } + await sleep(600 * attempt); } } - return `以下是场景目录 "${sourceDir}" 的内容:\n\n${parts.join("\n")}\n\n请分析以上代码,提取完整的场景信息。`; + throw lastError || new Error("LLM request failed"); +} + +function isRetryableLlmError(error) { + const message = String(error?.message || "").toLowerCase(); + return ( + message.includes("timed out") || + message.includes("timeout") || + message.includes("429") || + message.includes("502") || + message.includes("503") || + message.includes("504") || + message.includes("socket hang up") || + message.includes("econnreset") || + message.includes("etimedout") + ); +} + +function sleep(ms) { + return new Promise((resolve) => setTimeout(resolve, ms)); } function extractJsonFromResponse(text) { - const codeBlockMatch = text.match(/```(?:json)?\s*\n([\s\S]*?)\n```/); - if (codeBlockMatch) return JSON.parse(codeBlockMatch[1]); + const candidates = extractJsonCandidates(text); + let lastError = null; - const jsonMatch = text.match( - /\{[\s\S]*"sceneId"[\s\S]*"sceneName"[\s\S]*\}/ - ); - if (jsonMatch) return JSON.parse(jsonMatch[0]); + for (const candidate of candidates) { + try { + return JSON.parse(candidate); + } catch (error) { + lastError = error; + } - return JSON.parse(text); + const repaired = repairCommonJsonIssues(candidate); + if (repaired && repaired !== candidate) { + try { + return JSON.parse(repaired); + } catch (error) { + lastError = error; + } + } + } + + throw lastError || new Error("Unable to parse JSON response"); } -function extractSceneInfo(text) { - // Try code block first - const codeBlockMatch = text.match(/```(?:json)?\s*\n([\s\S]*?)\n```/); +function extractJsonCandidates(text) { + const raw = typeof text === "string" ? text : String(text || ""); + const candidates = []; + const codeBlockMatch = raw.match(/```(?:json)?\s*\n([\s\S]*?)\n```/); if (codeBlockMatch) { - try { - return JSON.parse(codeBlockMatch[1]); - } catch (e) { - // fall through - } + candidates.push(codeBlockMatch[1]); } - // Try to find JSON object with sceneId - const jsonMatch = text.match(/\{[\s\S]*"sceneId"[\s\S]*\}/); - if (jsonMatch) { - try { - return JSON.parse(jsonMatch[0]); - } catch (e) { - // fall through - } + const objectStart = raw.indexOf("{"); + const objectEnd = raw.lastIndexOf("}"); + if (objectStart !== -1 && objectEnd !== -1 && objectEnd > objectStart) { + candidates.push(raw.slice(objectStart, objectEnd + 1)); } - // Last resort: parse entire text + const arrayStart = raw.indexOf("["); + const arrayEnd = raw.lastIndexOf("]"); + if (arrayStart !== -1 && arrayEnd !== -1 && arrayEnd > arrayStart) { + candidates.push(raw.slice(arrayStart, arrayEnd + 1)); + } + + candidates.push(raw); + return Array.from(new Set(candidates.map((value) => value.trim()).filter(Boolean))); +} + +function repairCommonJsonIssues(text) { + let value = typeof text === "string" ? text : String(text || ""); + if (!value) return value; + + value = value + .replace(/^\uFEFF/, "") + .replace(/[“”]/g, '"') + .replace(/[‘’]/g, "'") + .replace(/```(?:json)?/gi, "") + .replace(/```/g, "") + .replace(/\r\n/g, "\n"); + + value = stripJsonComments(value); + value = removeTrailingCommas(value); + value = insertMissingArrayCommas(value); + value = insertMissingObjectCommas(value); + return value.trim(); +} + +function stripJsonComments(text) { + return text + .replace(/^\s*\/\/.*$/gm, "") + .replace(/\/\*[\s\S]*?\*\//g, ""); +} + +function removeTrailingCommas(text) { + let output = text; + let previous = null; + while (output !== previous) { + previous = output; + output = output.replace(/,\s*([}\]])/g, "$1"); + } + return output; +} + +function insertMissingArrayCommas(text) { + return text + .replace(/}\s*{/g, "},{") + .replace(/]\s*{/g, "],{") + .replace(/}\s*\[/g, "},[") + .replace(/"\s*{/g, '",{') + .replace(/}\s*"/g, '},"') + .replace(/]\s*"/g, '],"') + .replace(/"\s*\[/g, '",['); +} + +function insertMissingObjectCommas(text) { + return text + .replace(/([0-9}\]"'])\s+("([A-Za-z0-9_]+)"\s*:)/g, "$1,$2") + .replace(/(true|false|null)\s+("([A-Za-z0-9_]+)"\s*:)/g, "$1,$2"); +} + +async function extractJsonFromResponseWithRepair(text, config) { try { - return JSON.parse(text); - } catch (e) { - throw new Error("Failed to extract valid SceneInfo JSON from LLM response"); + return extractJsonFromResponse(text); + } catch (error) { + const malformed = extractJsonCandidates(text)[0] || String(text || ""); + if (!config || !config.apiKey || !config.baseUrl || !config.model) { + throw error; + } + + const repairPrompt = [ + "Repair this malformed JSON and return valid JSON only.", + "", + `Original parse error: ${error.message}`, + "", + truncate(malformed, JSON_REPAIR_MAX_CHARS), + ].join("\n"); + + const repairedContent = await requestChatCompletionWithRetry( + [ + { role: "system", content: JSON_REPAIR_SYSTEM_PROMPT }, + { role: "user", content: repairPrompt }, + ], + { + ...config, + maxTokens: 2600, + timeoutMs: JSON_REPAIR_TIMEOUT_MS, + retryAttempts: 2, + } + ); + + try { + return extractJsonFromResponse(repairedContent); + } catch (repairError) { + throw new Error(`${error.message}; repair failed: ${repairError.message}`); + } } } -function analyzeScene(sourceDir, dirContents, { apiKey, baseUrl, model }) { - const userPrompt = buildAnalyzePrompt(sourceDir, dirContents); - - const requestBody = JSON.stringify({ - model, - messages: [ - { role: "system", content: SYSTEM_PROMPT }, - { role: "user", content: userPrompt }, - ], - temperature: 0.1, - max_tokens: 256, - }); - - return new Promise((resolve, reject) => { - const url = new URL(baseUrl.replace(/\/v1\/?$/, "") + "/v1/chat/completions"); - const options = { - hostname: url.hostname, - port: url.port || (url.protocol === "https:" ? 443 : 80), - path: url.pathname, - method: "POST", - headers: { - "Content-Type": "application/json", - Authorization: `Bearer ${apiKey}`, - "Content-Length": Buffer.byteLength(requestBody), - }, - }; - - const req = https.request(options, (res) => { - let data = ""; - res.on("data", (chunk) => (data += chunk)); - res.on("end", () => { - if (res.statusCode !== 200) { - return reject(new Error(`LLM API error ${res.statusCode}: ${data}`)); - } - - try { - const parsed = JSON.parse(data); - const content = parsed.choices?.[0]?.message?.content; - if (!content) return reject(new Error("LLM returned empty response")); - const result = extractJsonFromResponse(content); - if (!result.sceneId || !result.sceneName) { - return reject( - new Error(`LLM response missing sceneId/sceneName: ${content}`) - ); - } - resolve(result); - } catch (err) { - reject(new Error(`Failed to parse LLM response: ${err.message}`)); - } - }); - }); - - req.on("error", reject); - req.setTimeout(30000, () => { - req.destroy(new Error("LLM API request timed out")); - }); - - req.write(requestBody); - req.end(); - }); +function normalizeSceneIr(input) { + const sceneIr = input && typeof input === "object" ? input : {}; + return { + sceneId: safeString(sceneIr.sceneId), + sceneIdDiagnostics: normalizeSceneIdDiagnostics(sceneIr.sceneIdDiagnostics), + sceneName: safeString(sceneIr.sceneName), + sceneKind: safeString(sceneIr.sceneKind) || "report_collection", + workflowArchetype: safeString(sceneIr.workflowArchetype), + bootstrap: normalizeBootstrap(sceneIr.bootstrap), + params: Array.isArray(sceneIr.params) ? sceneIr.params.map(normalizeParam) : [], + modes: Array.isArray(sceneIr.modes) ? sceneIr.modes.map(normalizeMode) : [], + defaultMode: sceneIr.defaultMode || null, + modeSwitchField: sceneIr.modeSwitchField || null, + workflowSteps: Array.isArray(sceneIr.workflowSteps) ? sceneIr.workflowSteps.map(normalizeWorkflowStep) : [], + workflowEvidence: normalizeWorkflowEvidence(sceneIr.workflowEvidence), + requestTemplate: normalizeObject(sceneIr.requestTemplate), + responsePath: safeString(sceneIr.responsePath), + normalizeRules: normalizeNormalizeRules(sceneIr.normalizeRules), + artifactContract: normalizeArtifactContract(sceneIr.artifactContract), + validationHints: normalizeValidationHints(sceneIr.validationHints), + evidence: Array.isArray(sceneIr.evidence) ? sceneIr.evidence.map(normalizeEvidence) : [], + readiness: normalizeReadiness(sceneIr.readiness), + apiEndpoints: Array.isArray(sceneIr.apiEndpoints) ? sceneIr.apiEndpoints.map(normalizeApiEndpoint) : [], + staticParams: normalizeObject(sceneIr.staticParams), + columnDefs: Array.isArray(sceneIr.columnDefs) ? sceneIr.columnDefs : [], + confidence: normalizeConfidence(sceneIr.confidence), + uncertainties: Array.isArray(sceneIr.uncertainties) + ? sceneIr.uncertainties.map((item) => safeString(item)).filter(Boolean) + : [], + }; } -function analyzeSceneDeep(sourceDir, dirContents, indexHtmlContent, { apiKey, baseUrl, model }) { - const userPrompt = buildDeepAnalyzePrompt(sourceDir, dirContents, indexHtmlContent); +function normalizeWorkflowEvidence(value) { + const item = value && typeof value === "object" ? value : {}; + return { + requestEntries: Array.isArray(item.requestEntries) + ? item.requestEntries.map((entry) => safeString(entry)).filter(Boolean) + : [], + paginationFields: Array.isArray(item.paginationFields) + ? item.paginationFields.map((entry) => safeString(entry)).filter(Boolean) + : [], + secondaryRequestEntries: Array.isArray(item.secondaryRequestEntries) + ? item.secondaryRequestEntries.map((entry) => safeString(entry)).filter(Boolean) + : [], + postProcessSteps: Array.isArray(item.postProcessSteps) + ? item.postProcessSteps.map((entry) => safeString(entry)).filter(Boolean) + : [], + }; +} - const requestBody = JSON.stringify({ - model, - messages: [ - { role: "system", content: DEEP_SYSTEM_PROMPT }, - { role: "user", content: userPrompt }, +function normalizeSceneIdDiagnostics(value) { + const item = value && typeof value === "object" ? value : {}; + return { + candidateSource: safeString(item.candidateSource), + valid: item.valid !== false, + invalidReason: safeString(item.invalidReason) || null, + candidates: Array.isArray(item.candidates) + ? item.candidates + .map((candidate) => ({ + value: safeString(candidate?.value), + source: safeString(candidate?.source), + valid: candidate?.valid !== false, + reason: safeString(candidate?.reason) || null, + })) + .filter((candidate) => candidate.value) + : [], + }; +} + +function normalizeBootstrap(bootstrap) { + const value = bootstrap && typeof bootstrap === "object" ? bootstrap : {}; + return { + expectedDomain: safeString(value.expectedDomain), + targetUrl: safeString(value.targetUrl), + requiresTargetPage: value.requiresTargetPage !== false, + pageTitleKeywords: Array.isArray(value.pageTitleKeywords) + ? value.pageTitleKeywords.map((item) => safeString(item)).filter(Boolean) + : [], + source: safeString(value.source), + }; +} + +function normalizeParam(param) { + const value = param && typeof param === "object" ? param : {}; + return { + name: safeString(value.name), + resolver: safeString(value.resolver), + required: Boolean(value.required), + promptMissing: safeString(value.promptMissing), + promptAmbiguous: safeString(value.promptAmbiguous), + resolverConfig: normalizeObject(value.resolverConfig), + }; +} + +function normalizeMode(mode) { + const value = mode && typeof mode === "object" ? mode : {}; + return { + name: safeString(value.name), + label: safeString(value.label) || null, + condition: value.condition && typeof value.condition === "object" + ? { + field: safeString(value.condition.field), + operator: safeString(value.condition.operator) || "equals", + value: value.condition.value, + } + : null, + apiEndpoint: normalizeApiEndpoint(value.apiEndpoint), + columnDefs: Array.isArray(value.columnDefs) ? value.columnDefs : [], + requestTemplate: normalizeObject(value.requestTemplate), + normalizeRules: normalizeNormalizeRules(value.normalizeRules), + responsePath: safeString(value.responsePath), + }; +} + +function normalizeWorkflowStep(step) { + const value = step && typeof step === "object" ? step : {}; + return { + type: safeString(value.type), + entry: safeString(value.entry) || null, + source: safeString(value.source) || null, + expr: safeString(value.expr) || null, + description: safeString(value.description) || null, + endpoint: safeString(value.endpoint) || null, + }; +} + +function normalizeNormalizeRules(rules) { + const value = rules && typeof rules === "object" ? rules : {}; + const normalized = { + type: safeString(value.type), + requiredFields: Array.isArray(value.requiredFields) + ? value.requiredFields.map((item) => safeString(item)).filter(Boolean) + : [], + filterNull: value.filterNull !== false, + }; + for (const [key, item] of Object.entries(value)) { + if (!(key in normalized)) { + normalized[key] = item; + } + } + return normalized; +} + +function normalizeArtifactContract(contract) { + const value = contract && typeof contract === "object" ? contract : {}; + return { + type: safeString(value.type) || "report-artifact", + successStatus: Array.isArray(value.successStatus) + ? value.successStatus.map((item) => safeString(item)).filter(Boolean) + : ["ok", "partial", "empty"], + failureStatus: Array.isArray(value.failureStatus) + ? value.failureStatus.map((item) => safeString(item)).filter(Boolean) + : ["blocked", "error"], + }; +} + +function normalizeValidationHints(hints) { + const value = hints && typeof hints === "object" ? hints : {}; + return { + requiresTargetPage: value.requiresTargetPage !== false, + runtimeCompatible: value.runtimeCompatible !== false, + manualCompletionRequired: Boolean(value.manualCompletionRequired), + missingPieces: Array.isArray(value.missingPieces) + ? value.missingPieces.map((item) => safeString(item)).filter(Boolean) + : [], + }; +} + +function normalizeEvidence(item) { + const value = item && typeof item === "object" ? item : {}; + return { + kind: safeString(value.kind), + summary: safeString(value.summary), + source: safeString(value.source) || null, + confidence: normalizeConfidence(value.confidence), + }; +} + +function normalizeReadiness(readiness) { + const value = readiness && typeof readiness === "object" ? readiness : {}; + return { + level: safeString(value.level), + confidence: normalizeConfidence(value.confidence), + gates: Array.isArray(value.gates) + ? value.gates + .map((gate) => ({ + name: safeString(gate?.name), + passed: Boolean(gate?.passed), + reason: safeString(gate?.reason) || null, + })) + .filter((gate) => gate.name) + : [], + risks: Array.isArray(value.risks) ? value.risks.map((item) => safeString(item)).filter(Boolean) : [], + missingPieces: Array.isArray(value.missingPieces) + ? value.missingPieces.map((item) => safeString(item)).filter(Boolean) + : [], + notes: Array.isArray(value.notes) ? value.notes.map((item) => safeString(item)).filter(Boolean) : [], + }; +} + +function normalizeApiEndpoint(endpoint) { + const value = endpoint && typeof endpoint === "object" ? endpoint : {}; + const url = safeString(value.url); + if (!url) return null; + return { + name: safeString(value.name) || inferNameFromUrl(url), + url, + method: safeString(value.method).toUpperCase() || "GET", + contentType: safeString(value.contentType) || null, + description: safeString(value.description) || null, + }; +} + +function inferNameFromUrl(url) { + const parts = url.split(/[/?#]/).filter(Boolean); + return parts[parts.length - 1] || "endpoint"; +} + +function normalizeObject(value) { + return value && typeof value === "object" && !Array.isArray(value) ? value : {}; +} + +function normalizeConfidence(value) { + const number = typeof value === "number" ? value : Number(value); + if (!Number.isFinite(number)) return 0; + return Math.max(0, Math.min(1, Number(number.toFixed(2)))); +} + +function safeString(value) { + return typeof value === "string" ? value.trim() : ""; +} + +async function analyzeScene(sourceDir, dirContents, config) { + const content = await requestChatCompletionWithRetry( + [ + { role: "system", content: SYSTEM_PROMPT }, + { role: "user", content: buildAnalyzePrompt(sourceDir, dirContents) }, ], - temperature: 0.1, - max_tokens: 2048, // Increased for detailed response + { + ...config, + maxTokens: 256, + timeoutMs: DEFAULT_REQUEST_TIMEOUT_MS, + retryAttempts: 2, + } + ); + + const result = await extractJsonFromResponseWithRepair(content, config); + if (!result.sceneId || !result.sceneName) { + throw new Error(`LLM response missing sceneId/sceneName: ${content}`); + } + const validation = validateSceneIdCandidate(result.sceneId, { + sceneName: result.sceneName, + sourceDir, }); + if (!validation.valid) { + throw new Error(`LLM returned invalid sceneId (${validation.reason}): ${result.sceneId}`); + } + return { + sceneId: result.sceneId, + sceneName: result.sceneName, + }; +} - return new Promise((resolve, reject) => { - const url = new URL(baseUrl.replace(/\/v1\/?$/, "") + "/v1/chat/completions"); - const options = { - hostname: url.hostname, - port: url.port || (url.protocol === "https:" ? 443 : 80), - path: url.pathname, - method: "POST", - headers: { - "Content-Type": "application/json", - Authorization: `Bearer ${apiKey}`, - "Content-Length": Buffer.byteLength(requestBody), - }, - }; +async function analyzeSceneDeep(sourceDir, dirContents, config) { + const content = await requestChatCompletionWithRetry( + [ + { role: "system", content: DEEP_SYSTEM_PROMPT }, + { role: "user", content: buildDeepAnalyzePrompt(sourceDir, dirContents) }, + ], + { + ...config, + maxTokens: 2400, + timeoutMs: DEEP_REQUEST_TIMEOUT_MS, + retryAttempts: 2, + } + ); - const httpModule = url.protocol === "https:" ? https : http; - const req = httpModule.request(options, (res) => { - let data = ""; - res.on("data", (chunk) => (data += chunk)); - res.on("end", () => { - if (res.statusCode !== 200) { - return reject(new Error(`LLM API error ${res.statusCode}: ${data}`)); - } - - try { - const parsed = JSON.parse(data); - const content = parsed.choices?.[0]?.message?.content; - if (!content) return reject(new Error("LLM returned empty response")); - const result = extractSceneInfo(content); - - // Validate required fields - if (!result.sceneId || !result.sceneName) { - return reject(new Error(`LLM response missing sceneId/sceneName: ${content}`)); - } - - // Set defaults for optional fields - result.sceneKind = result.sceneKind || "report_collection"; - result.apiEndpoints = result.apiEndpoints || []; - result.staticParams = result.staticParams || {}; - result.columnDefs = result.columnDefs || []; - result.businessLogic = result.businessLogic || {}; - result.modes = result.modes || []; - result.defaultMode = result.defaultMode || (result.modes.length > 0 ? result.modes[0].name : null); - result.modeSwitchField = result.modeSwitchField || "period_mode"; - - resolve(result); - } catch (err) { - reject(new Error(`Failed to parse LLM response: ${err.message}`)); - } - }); - }); - - req.on("error", reject); - req.setTimeout(60000, () => { - req.destroy(new Error("LLM API request timed out")); - }); - - req.write(requestBody); - req.end(); + const normalized = normalizeSceneIr(await extractJsonFromResponseWithRepair(content, config)); + const validation = validateSceneIdCandidate(normalized.sceneId, { + sceneName: normalized.sceneName, + sourceDir, }); + normalized.sceneIdDiagnostics = { + candidateSource: "llm_semantic", + valid: validation.valid, + invalidReason: validation.valid ? null : validation.reason, + candidates: normalized.sceneId + ? [ + { + value: normalized.sceneId, + source: "llm_semantic", + valid: validation.valid, + reason: validation.valid ? null : validation.reason, + }, + ] + : [], + }; + if (!validation.valid && normalized.sceneId) { + normalized.uncertainties = Array.from( + new Set([...(normalized.uncertainties || []), `invalid_scene_id:${validation.reason}`]) + ); + } + + // AUTO-WRAP: single-mode scenes → modes array + if (normalized.modes.length === 0 && normalized.apiEndpoints.length > 0) { + normalized.modes.push({ + name: "default", + label: "default", + condition: { field: "period_mode", operator: "equals", value: "default" }, + apiEndpoint: normalized.apiEndpoints[0], + columnDefs: normalized.columnDefs || [], + requestTemplate: normalized.requestTemplate || {}, + normalizeRules: normalized.normalizeRules || { type: "validate_required", requiredFields: [], filterNull: true }, + responsePath: normalized.responsePath || "", + }); + normalized.defaultMode = "default"; + normalized.modeSwitchField = "period_mode"; + // Upgrade archetype if it was single_request_table + if (normalized.workflowArchetype === "single_request_table") { + normalized.workflowArchetype = "multi_mode_request"; + } + } + + return normalized; } module.exports = { - buildAnalyzePrompt, - extractJsonFromResponse, analyzeScene, - // New exports - buildDeepAnalyzePrompt, - extractSceneInfo, analyzeSceneDeep, + buildAnalyzePrompt, + buildDeepAnalyzePrompt, + extractJsonFromResponse, + extractJsonFromResponseWithRepair, + isRetryableLlmError, + normalizeSceneIr, + requestChatCompletionWithRetry, + repairCommonJsonIssues, };