From 5ff6e05911be6c0bb2ae2513bae4b76bb329f6e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9C=A8=E7=82=8E?= <635735027@qq.com> Date: Fri, 17 Apr 2026 12:51:09 +0800 Subject: [PATCH] docs: add enhanced LLM extraction schema design for multi-mode business logic --- ...7-enhanced-llm-extraction-schema-design.md | 573 ++++++++++++++++++ 1 file changed, 573 insertions(+) create mode 100644 docs/superpowers/specs/2026-04-17-enhanced-llm-extraction-schema-design.md diff --git a/docs/superpowers/specs/2026-04-17-enhanced-llm-extraction-schema-design.md b/docs/superpowers/specs/2026-04-17-enhanced-llm-extraction-schema-design.md new file mode 100644 index 0000000..0682bb0 --- /dev/null +++ b/docs/superpowers/specs/2026-04-17-enhanced-llm-extraction-schema-design.md @@ -0,0 +1,573 @@ +# 增强 LLM 提取 Schema - 多模式业务逻辑自动化 + +> **Status:** Draft +> **Date:** 2026-04-17 +> **Author:** Qoder + +## Problem Statement + +当前 `sg_scene_generate` 自动生成的 skill 脚本与 Claude 手写的 skill 存在显著差距: + +### 差距清单 + +| 差距类型 | 描述 | 严重程度 | +|----------|------|----------| +| 多模式支持 | 无法识别 month/week 等多模式场景,只生成单一逻辑 | 🔴 高 | +| 多 API 端点 | 定义多个 API 但只使用第一个 | 🔴 高 | +| 请求格式检测 | 默认 JSON body,未检测 form-urlencoded | 🟡 中 | +| 数据归一化 | 简单映射,无关键字段验证和空行过滤 | 🟡 中 | +| 参数标准化 | 无参数标准化处理,直接透传 | 🟡 中 | +| 分页参数 | 未提取和处理分页参数 (rows/page/sidx/sord) | 🟡 中 | + +### 对比分析 + +| 功能维度 | tq-lineloss-report (手写) | marketing-zero-consumer-report (自动) | +|----------|---------------------------|----------------------------------------| +| 模式切换 | month/week 两套完整逻辑 | 无 | +| API 端点 | queryMonthData + queryWeekData | 只用 API_ENDPOINTS[0] | +| 列定义 | MONTH/WEEK_COLUMN_DEFS 双套 | 单一 COLUMN_DEFS | +| 请求构建 | buildMonthRequest + buildWeekRequest | 单一 buildRequest | +| 请求格式 | form-urlencoded | application/json | +| 数据验证 | 关键字段验证 + filter(Boolean) | 无验证 | +| 参数标准化 | normalized 对象 + 类型转换 | 无 | + +## Goal + +增强 LLM 提取 Schema,使其能够从 index.html 中自动识别: + +1. **多模式业务逻辑** (month/week 等) +2. **模式切换条件** (如何判断使用哪个模式) +3. **每个模式的专属配置** (API、列定义、请求格式、验证规则) +4. **数据归一化规则** (关键字段、过滤条件) + +## Non-Goals + +- 不改变现有的两阶段架构 (LLM 提取 → Rust 渲染) +- 不增加新的 CLI 参数 +- 不支持超过 2 种模式的复杂场景 (Phase 1) +- 不处理认证/鉴权逻辑 (假设页面已登录) + +## Architecture + +### 增强后的 Schema 结构 + +```json +{ + "sceneId": "tq-lineloss-report", + "sceneName": "台区线损报表", + "sceneKind": "report_collection", + + "modes": [ + { + "name": "month", + "label": "月度报表", + "condition": { + "field": "period_mode", + "operator": "equals", + "value": "month" + }, + "apiEndpoint": { + "name": "月度线损查询", + "url": "http://20.76.57.61:18080/gsllys/fourVerEightHor/fourVerEightHorLinelossRateList", + "method": "POST", + "contentType": "application/x-www-form-urlencoded" + }, + "columnDefs": [ + ["ORG_NAME", "供电单位"], + ["YGDL", "累计供电量"], + ["YYDL", "累计售电量"], + ["YXSL", "线损完成率(%)"], + ["RAT_SCOPE", "线损率累计目标值"], + ["BLANK3", "目标完成率"], + ["BLANK2", "排行"] + ], + "requestTemplate": { + "orgno": "${args.org_code}", + "yn_flag": 0, + "_search": false, + "nd": "${Date.now()}", + "rows": 1000, + "page": 1, + "sidx": "TO_NUMBER(ORG_NO)", + "sord": "asc" + }, + "normalizeRules": { + "type": "validate_all_columns", + "filterNull": true + }, + "responsePath": "content" + }, + { + "name": "week", + "label": "周报表", + "condition": { + "field": "period_mode", + "operator": "equals", + "value": "week" + }, + "apiEndpoint": { + "name": "周线损查询", + "url": "http://20.76.57.61:18080/gsllys/tqLinelossStatis/getYearMonWeekLinelossAnalysisList", + "method": "POST", + "contentType": "application/x-www-form-urlencoded" + }, + "columnDefs": [ + ["ORG_NAME", "供电单位"], + ["LINE_LOSS_RATE", "综合线损率(%)"], + ["PPQ", "供电量(Kwh)"], + ["UPQ", "售电量(Kwh)"], + ["LOSS_PQ", "损失电量(Kwh)"] + ], + "requestTemplate": { + "orgno": "${args.org_code}", + "tjzq": "week", + "level": "00", + "_search": false, + "nd": "${Date.now()}", + "rows": 1000, + "page": 1, + "sidx": "TO_NUMBER(ORG_NO)", + "sord": "asc" + }, + "normalizeRules": { + "type": "validate_required", + "requiredFields": ["ORG_NAME", "LINE_LOSS_RATE"], + "filterNull": true + }, + "responsePath": "content" + } + ], + + "defaultMode": "month", + "modeSwitchField": "period_mode", + + "commonParams": {}, + "staticParams": {}, + + "expectedDomain": "20.76.57.61", + "targetUrl": "http://20.76.57.61:18080/gsllys" +} +``` + +### Schema 字段说明 + +#### 顶层字段 + +| 字段 | 类型 | 说明 | +|------|------|------| +| `modes` | Array | 模式配置数组 | +| `defaultMode` | string | 默认模式名称 | +| `modeSwitchField` | string | 用于切换模式的参数字段名 | + +#### Mode 配置 + +| 字段 | 类型 | 说明 | +|------|------|------| +| `name` | string | 模式标识符 | +| `label` | string | 模式显示名称 | +| `condition` | object | 模式激活条件 | +| `apiEndpoint` | object | 该模式的 API 配置 | +| `columnDefs` | Array | 该模式的列定义 | +| `requestTemplate` | object | 请求参数模板 | +| `normalizeRules` | object | 数据归一化规则 | +| `responsePath` | string | 响应数据路径 (如 `content`) | + +#### Condition 结构 + +```json +{ + "field": "period_mode", // 检查的字段 + "operator": "equals", // 操作符: equals, not_equals, in, contains + "value": "month" // 比较值 +} +``` + +#### NormalizeRules 结构 + +```json +{ + "type": "validate_required", // validate_all_columns | validate_required + "requiredFields": ["ORG_NAME", "LINE_LOSS_RATE"], // 仅 validate_required 需要 + "filterNull": true // 是否过滤空值行 +} +``` + +### 数据流 + +``` +用户请求 (args) + ↓ +validateArgs() → 参数验证 + ↓ +detectMode(args[modeSwitchField]) → 检测当前模式 + ↓ +selectModeConfig(mode) → 选择模式配置 + ↓ +buildRequest(args, modeConfig) → 构建该模式的请求 + ↓ +queryData(request, modeConfig.apiEndpoint) → HTTP 请求 + ↓ +extractResponse(response, modeConfig.responsePath) → 提取数据 + ↓ +normalizeRows(data, modeConfig.normalizeRules) → 数据归一化 + ↓ +buildArtifact({ mode, columnDefs, rows, ... }) → 构建输出 +``` + +## Implementation Details + +### 1. 增强 LLM 提取 Prompt + +修改 `frontend/scene-generator/llm-client.js` 中的 `DEEP_SYSTEM_PROMPT` 和 `buildDeepAnalyzePrompt`: + +**关键 Prompt 增强:** + +``` +分析 index.html 中的业务逻辑模式: + +1. **模式识别** + - 查找条件分支逻辑 (if/switch) 中基于 period_mode、reportType 等字段的分支 + - 识别不同分支对应的 API 端点、列定义、请求格式 + +2. **API 提取** + - 提取 $.ajax/fetch 调用中的 URL、method、contentType + - 识别请求参数构造方式 (JSON.stringify vs 对象展开) + - 检测分页参数 (rows/page/sidx/sord) + +3. **请求格式检测** + - contentType: application/json → JSON body + - contentType: application/x-www-form-urlencoded → 对象展开 + - 无显式 contentType → 检查 data 参数格式 + +4. **数据归一化** + - 查找数据渲染/表格填充逻辑 + - 识别字段映射关系 + - 检测数据验证条件 (哪些字段不能为空) + +5. **响应路径** + - 识别数据在响应中的位置 (response.content / response.data / response) +``` + +### 2. 增强 Rust Schema 结构 + +修改 `src/generated_scene/generator.rs`: + +```rust +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +pub struct ModeConditionJson { + pub field: String, + #[serde(default)] + pub operator: String, // equals, not_equals, in, contains + pub value: serde_json::Value, +} + +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +pub struct NormalizeRulesJson { + #[serde(rename = "type")] + pub rules_type: String, // validate_all_columns, validate_required + #[serde(default)] + pub required_fields: Vec, + #[serde(default)] + pub filter_null: bool, +} + +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +pub struct ApiEndpointEnhancedJson { + pub name: String, + pub url: String, + #[serde(default)] + pub method: String, + #[serde(default)] + pub content_type: Option, +} + +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +pub struct ModeConfigJson { + pub name: String, + pub label: Option, + pub condition: ModeConditionJson, + pub api_endpoint: ApiEndpointEnhancedJson, + pub column_defs: Vec<(String, String)>, + pub request_template: Option, + pub normalize_rules: Option, + pub response_path: Option, +} + +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +pub struct SceneInfoEnhancedJson { + #[serde(rename = "sceneId")] + pub scene_id: String, + #[serde(rename = "sceneName")] + pub scene_name: String, + + // 新增:多模式支持 + pub modes: Vec, + pub default_mode: Option, + pub mode_switch_field: Option, + + // 向后兼容:单模式场景 + #[serde(rename = "apiEndpoints", default)] + pub api_endpoints: Vec, + #[serde(rename = "columnDefs", default)] + pub column_defs: Vec<(String, String)>, + + // 其他字段保持不变 + // ... +} +``` + +### 3. 增强模板生成逻辑 + +```rust +fn browser_script_with_modes(scene_id: &str, scene_info: &SceneInfoEnhancedJson) -> String { + if scene_info.modes.is_empty() { + // 向后兼容:使用现有单模式模板 + return browser_script_with_business_logic(scene_id, scene_info); + } + + // 多模式模板 + generate_multi_mode_script(scene_id, scene_info) +} + +fn generate_multi_mode_script(scene_id: &str, scene_info: &SceneInfoEnhancedJson) -> String { + let modes_json = serde_json::to_string_pretty(&scene_info.modes).unwrap(); + let default_mode = scene_info.default_mode.as_deref().unwrap_or("month"); + + format!(r#" +const REPORT_NAME = '{scene_id}'; +const MODES = {modes_json}; +const DEFAULT_MODE = '{default_mode}'; +const MODE_SWITCH_FIELD = '{mode_switch_field}'; + +function detectMode(args) {{ + const modeValue = args[MODE_SWITCH_FIELD] || DEFAULT_MODE; + return MODES.find(m => m.condition.value === modeValue) || MODES[0]; +}} + +function buildModeRequest(args, mode) {{ + const endpoint = mode.api_endpoint; + const template = mode.request_template || {{}}; + + // 根据 contentType 构建请求 + const contentType = endpoint.content_type || 'application/json'; + const url = endpoint.url; + const method = endpoint.method || 'POST'; + + // 解析模板,替换变量 + let body; + if (contentType === 'application/x-www-form-urlencoded') {{ + body = {{ ...template }}; + // 替换模板变量 + for (const [key, value] of Object.entries(body)) {{ + if (typeof value === 'string' && value.startsWith('${{') && value.endsWith('}}')) {{ + const expr = value.slice(2, -1); + body[key] = eval(expr); + }} + }} + body.orgno = args.org_code; + }} else {{ + body = JSON.stringify({{ ...template, ...args }}); + }} + + return {{ url, method, headers: {{ 'Content-Type': contentType }}, body }}; +}} + +function normalizeModeRows(data, mode) {{ + const rules = mode.normalize_rules || {{ type: 'validate_all_columns' }}; + const columns = mode.column_defs.map(([key]) => key); + + if (!Array.isArray(data)) return []; + + return data.map(row => {{ + const result = {{}}; + for (const key of columns) {{ + result[key] = row[key] ?? ''; + }} + return result; + }}).filter(row => {{ + if (!rules.filter_null) return true; + + if (rules.type === 'validate_required') {{ + return rules.required_fields.every(f => row[f] !== ''); + }} + return columns.every(k => row[k] !== ''); + }}); +}} + +async function queryModeData(args, mode) {{ + const endpoint = mode.api_endpoint; + const request = buildModeRequest(args, mode); + + // jQuery 优先 + if (typeof $ !== 'undefined' && typeof $.ajax === 'function') {{ + const contentType = endpoint.content_type || 'application/json'; + return new Promise((resolve, reject) => {{ + $.ajax({{ + url: request.url, + type: request.method, + data: request.body, + contentType: contentType, + dataType: 'json', + success: resolve, + error: (xhr, status, err) => reject(new Error( + `API failed (${{xhr.status}}): ${{err}}` + )) + }}); + }}); + }} + + // fetch fallback + if (typeof fetch === 'function') {{ + const response = await fetch(request.url, {{ + method: request.method, + headers: request.headers, + body: request.method !== 'GET' ? request.body : undefined + }}); + if (!response.ok) {{ + const text = await response.text().catch(() => ''); + throw new Error(`HTTP ${{response.status}}: ${{text.substring(0, 200)}}`); + }} + return response.json(); + }} + + throw new Error('No HTTP client available'); +}} + +async function buildBrowserEntrypointResult(args, deps = defaultDeps) {{ + // 1. 参数验证 + const validation = validateArgs(args); + if (!validation.valid) {{ + return buildArtifact({{ + status: 'blocked', + blockedReason: 'validation_failed', + reasons: validation.errors, + rows: [], + args + }}); + }} + + // 2. 页面上下文验证 + const pageValidation = typeof deps.validatePageContext === 'function' + ? deps.validatePageContext(args) + : {{ ok: true }}; + if (!pageValidation?.ok) {{ + return buildArtifact({{ + status: 'blocked', + blockedReason: pageValidation?.reason || 'page_context_mismatch', + reasons: [pageValidation?.reason || 'page_context_mismatch'], + rows: [], + args + }}); + }} + + // 3. 检测模式 + const mode = detectMode(args); + + // 4. 数据获取 + const reasons = []; + let rawData = null; + try {{ + rawData = await queryModeData(args, mode); + }} catch (error) {{ + return buildArtifact({{ + status: 'error', + fatalError: error.message, + reasons: ['api_query_failed:' + error.message], + rows: [], + args + }}); + }} + + // 5. 提取响应数据 + const responsePath = mode.response_path || ''; + let data = rawData; + if (responsePath && rawData) {{ + data = rawData[responsePath] || rawData; + }} + + // 6. 数据归一化 + const rows = normalizeModeRows(data, mode); + if (rows.length === 0 && Array.isArray(data) && data.length > 0) {{ + reasons.push('row_normalization_partial'); + }} + + // 7. 构建 Artifact + return buildArtifact({{ + reasons, + rows, + args, + columnDefs: mode.column_defs, + columns: mode.column_defs.map(([key]) => key) + }}); +}} +"#, scene_id = scene_id, modes_json = modes_json, default_mode = default_mode, mode_switch_field = scene_info.mode_switch_field.as_deref().unwrap_or("period_mode")) +} +``` + +## Testing Strategy + +### 单元测试 + +1. **Schema 解析测试** + - 测试多模式 JSON 正确解析 + - 测试向后兼容(无 modes 字段时退化) + +2. **模式检测测试** + - 测试 `detectMode()` 根据参数正确选择模式 + +3. **请求构建测试** + - 测试 form-urlencoded 格式正确 + - 测试 JSON 格式正确 + - 测试模板变量替换 + +4. **数据归一化测试** + - 测试 validate_required 类型 + - 测试 validate_all_columns 类型 + - 测试空行过滤 + +### 集成测试 + +1. **端到端测试** + - 使用 tq-lineloss-report 源场景 + - 验证生成的脚本与手写版本功能一致 + +2. **回归测试** + - 验证单模式场景仍正常工作 + - 验证现有测试用例通过 + +## Migration Path + +### Phase 1: Schema 增强 + +1. 增强 Rust schema 结构 +2. 更新 LLM 提取 prompt +3. 验证 schema 解析正确 + +### Phase 2: 模板实现 + +1. 实现多模式模板 +2. 实现请求格式检测 +3. 实现数据归一化规则 + +### Phase 3: 测试验证 + +1. 使用 tq-lineloss-report 源场景测试 +2. 对比生成代码与手写代码 +3. 修复差异 + +## Risks and Mitigations + +| 风险 | 影响 | 缓解措施 | +|------|------|----------| +| LLM 提取不准确 | 生成代码不可用 | 提供 few-shot 示例,增加验证步骤 | +| 模式条件复杂 | 无法正确切换 | Phase 1 只支持 equals 操作符 | +| 请求模板变量 | 表达能力有限 | 支持常用表达式,复杂场景用 lessons 补充 | +| 向后兼容 | 现有场景受影响 | 无 modes 时使用旧模板 | + +## Success Criteria + +1. **多模式支持**:能够生成 month/week 双模式脚本 +2. **请求格式正确**:自动检测 form-urlencoded vs JSON +3. **数据验证**:支持关键字段验证和空行过滤 +4. **向后兼容**:单模式场景不受影响 +5. **代码质量**:生成的代码与手写 tq-lineloss-report 功能对等