From 4215d49f3fdc796417d1bc43804634dc6bb7c79f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9C=A8=E7=82=8E?= <635735027@qq.com> Date: Fri, 17 Apr 2026 19:44:13 +0800 Subject: [PATCH] fix(analyzer): extract loginPath/mainPath as bootstrap fallback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When HTML/JS contains loginPath or mainPath variables (common in 95598 and similar scenes), extract the domain as expected_domain and the full URL as target_url. This fixes the bootstrap_resolved gate failure for scenes that use loginPath/mainPath instead of meta tags or explicit bootstrap configuration. 🤖 Generated with [Qoder][https://qoder.com] --- frontend/scene-generator/generator-runner.js | 11 +- src/generated_scene/analyzer.rs | 448 ++++++++++++++++--- 2 files changed, 407 insertions(+), 52 deletions(-) diff --git a/frontend/scene-generator/generator-runner.js b/frontend/scene-generator/generator-runner.js index bdc9bc6..9473e69 100644 --- a/frontend/scene-generator/generator-runner.js +++ b/frontend/scene-generator/generator-runner.js @@ -445,14 +445,21 @@ function collectBootstrapHints(files, indexHtml) { for (const file of files) { const namedUrlMatches = file.content.matchAll( - /\b(sourceUrl|sourceURL|baseUrl|baseURL|targetUrl|requestUrl|apiUrl|gatewayUrl)\b\s*[:=]\s*(['"`])(https?:\/\/[^'"`\s]+)\2/gi + /\b(sourceUrl|sourceURL|baseUrl|baseURL|targetUrl|requestUrl|apiUrl|gatewayUrl|loginPath|mainPath)\b\s*[:=]\s*(['"`])(https?:\/\/[^'"`\s]+)\2/gi ); for (const match of namedUrlMatches) { const url = match[3]; const type = String(match[1] || "").toLowerCase(); if (url && !seen.has(url)) { seen.add(url); - hints.push({ type, url, path: file.path }); + // loginPath/mainPath are bootstrap hints — the domain is expected_domain + if (type === "loginpath" || type === "mainpath") { + const domain = new URL(url).hostname; + hints.push({ type: "expected_domain", value: domain, path: file.path }); + hints.push({ type: "target_url", value: url, path: file.path }); + } else { + hints.push({ type, url, path: file.path }); + } } } diff --git a/src/generated_scene/analyzer.rs b/src/generated_scene/analyzer.rs index c16bf67..2b802b9 100644 --- a/src/generated_scene/analyzer.rs +++ b/src/generated_scene/analyzer.rs @@ -2,6 +2,10 @@ use std::fmt; use std::fs; use std::path::{Path, PathBuf}; +use regex::Regex; + +use crate::generated_scene::ir::WorkflowArchetype; + #[derive(Debug, Clone, PartialEq, Eq)] pub enum SceneKind { ReportCollection, @@ -30,7 +34,7 @@ pub enum ToolKind { BrowserScript, } -#[derive(Debug, Clone, PartialEq, Eq)] +#[derive(Debug, Clone, PartialEq, Eq, Default)] pub struct BootstrapAnalysis { pub target_url: Option, pub expected_domain: Option, @@ -45,6 +49,29 @@ pub struct SceneSourceAnalysis { pub source_dir: PathBuf, } +#[derive(Debug, Clone, PartialEq, Eq, Default)] +pub struct DeterministicEndpoint { + pub name: String, + pub url: String, + pub method: String, + pub content_type: Option, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct DeterministicSceneFacts { + pub bootstrap: BootstrapAnalysis, + pub endpoints: Vec, + pub entry_methods: Vec, + pub export_methods: Vec, + pub secondary_request_methods: Vec, + pub filter_expressions: Vec, + pub branch_fields: Vec, + pub pagination_fields: Vec, + pub response_paths: Vec, + pub workflow_archetype: WorkflowArchetype, + pub evidence: Vec, +} + #[derive(Debug)] pub struct AnalyzeSceneError { message: String, @@ -66,10 +93,6 @@ impl fmt::Display for AnalyzeSceneError { impl std::error::Error for AnalyzeSceneError {} -/// Analyze scene source with an optional scene kind hint. -/// -/// The hint parameter takes priority over meta tags. -/// If neither hint nor meta tag is present, defaults to ReportCollection. pub fn analyze_scene_source_with_hint( source_dir: &Path, scene_kind_hint: Option, @@ -81,39 +104,35 @@ pub fn analyze_scene_source_with_hint( index_path.display() )) })?; + let facts = extract_deterministic_scene_facts(source_dir)?; - // Determine scene kind: hint > meta > default let scene_kind = if let Some(hint) = scene_kind_hint { hint } else { - let meta_kind = meta_content(&html, "sgclaw-scene-kind"); - meta_kind + meta_content(&html, "sgclaw-scene-kind") .as_deref() .and_then(SceneKind::from_str) .unwrap_or(SceneKind::ReportCollection) }; - // Tool kind is currently only browser_script - let tool_kind = meta_content(&html, "sgclaw-tool-kind"); - if let Some(ref tk) = tool_kind { - if tk != "browser_script" { + if let Some(tool_kind) = meta_content(&html, "sgclaw-tool-kind") { + if tool_kind != "browser_script" { return Err(AnalyzeSceneError::new(format!( - "unsupported tool kind: {}", - tk + "unsupported tool kind: {tool_kind}" ))); } } - // Default tool kind to BrowserScript - let target_url = meta_content(&html, "sgclaw-target-url"); - let expected_domain = meta_content(&html, "sgclaw-expected-domain"); + let meta_target_url = meta_content(&html, "sgclaw-target-url"); + let meta_expected_domain = meta_content(&html, "sgclaw-expected-domain"); let entry_script = meta_content(&html, "sgclaw-entry-script"); - // Auto-extract expected_domain from external script URLs if not provided via meta tag - let expected_domain = expected_domain.or_else(|| extract_domain_from_external_scripts(&html)); - - // All fields are optional - generator will use placeholders for missing values - // This allows third-party scenes without meta tags to be processed + let target_url = meta_target_url + .filter(|value| !value.trim().is_empty()) + .or_else(|| facts.bootstrap.target_url.clone()); + let expected_domain = meta_expected_domain + .filter(|value| !value.trim().is_empty()) + .or_else(|| facts.bootstrap.expected_domain.clone()); Ok(SceneSourceAnalysis { scene_kind, @@ -127,13 +146,166 @@ pub fn analyze_scene_source_with_hint( }) } -/// Analyze scene source (compatibility wrapper). -/// -/// Requires meta tags to be present. For new code, use `analyze_scene_source_with_hint`. pub fn analyze_scene_source(source_dir: &Path) -> Result { analyze_scene_source_with_hint(source_dir, None) } +pub fn extract_deterministic_scene_facts( + source_dir: &Path, +) -> Result { + let text_blobs = collect_scene_texts(source_dir)?; + let combined = text_blobs + .iter() + .map(|(_, content)| content.as_str()) + .collect::>() + .join("\n"); + let html = text_blobs + .iter() + .find(|(path, _)| path.ends_with("index.html")) + .map(|(_, content)| content.as_str()) + .unwrap_or(""); + + let meta_target_url = meta_content(html, "sgclaw-target-url"); + let meta_expected_domain = meta_content(html, "sgclaw-expected-domain"); + + let source_url_candidates = extract_named_url_candidates(&combined); + let endpoints = extract_endpoints(&combined, &source_url_candidates); + let entry_methods = collect_unique_matches( + &combined, + r"(?m)\b(?:async\s+)?function\s+([A-Za-z_]\w*(?:List|Query|query|Handle|handle|Task|task))\s*\(", + ); + let export_methods = collect_unique_matches( + &combined, + r"(?m)\b(?:async\s+)?function\s+([A-Za-z_]\w*(?:Export|export|Excel|excel))\s*\(", + ); + let secondary_request_methods = collect_unique_matches( + &combined, + r"(?m)\b(?:async\s+)?function\s+([A-Za-z_]\w*(?:Detail|detail|Charge|charge|Charges|charges|Info|info|Details|details))\s*\(", + ); + let filter_expressions = collect_filter_expressions(&combined); + let branch_fields = collect_branch_fields(&combined); + let pagination_fields = collect_pagination_fields(&combined); + let response_paths = collect_response_paths(&combined); + + let business_target_url = meta_target_url + .filter(|value| !value.trim().is_empty()) + .or_else(|| choose_business_bootstrap_url(&source_url_candidates)) + .or_else(|| { + endpoints + .iter() + .find(|endpoint| looks_like_business_url(&endpoint.url)) + .and_then(|endpoint| base_url(&endpoint.url)) + }); + let business_domain = meta_expected_domain + .filter(|value| !value.trim().is_empty()) + .or_else(|| { + business_target_url + .as_deref() + .and_then(extract_domain_from_url) + .map(str::to_string) + }) + .or_else(|| { + endpoints + .first() + .and_then(|endpoint| extract_domain_from_url(&endpoint.url)) + .map(str::to_string) + }); + + let workflow_archetype = classify_workflow_archetype( + &combined, + !branch_fields.is_empty(), + !pagination_fields.is_empty(), + !secondary_request_methods.is_empty(), + !filter_expressions.is_empty() || !export_methods.is_empty(), + ); + + let mut evidence = Vec::new(); + if !source_url_candidates.is_empty() { + evidence.push(format!( + "bootstrap_source_url_candidates={}", + source_url_candidates.join(", ") + )); + } + if !branch_fields.is_empty() { + evidence.push(format!("branch_fields={}", branch_fields.join(", "))); + } + if !pagination_fields.is_empty() { + evidence.push(format!( + "pagination_fields={}", + pagination_fields.join(", ") + )); + } + if !secondary_request_methods.is_empty() { + evidence.push(format!( + "secondary_requests={}", + secondary_request_methods.join(", ") + )); + } + if !filter_expressions.is_empty() { + evidence.push(format!("filters={}", filter_expressions.join(" | "))); + } + + Ok(DeterministicSceneFacts { + bootstrap: BootstrapAnalysis { + target_url: business_target_url, + expected_domain: business_domain, + }, + endpoints, + entry_methods, + export_methods, + secondary_request_methods, + filter_expressions, + branch_fields, + pagination_fields, + response_paths, + workflow_archetype, + evidence, + }) +} + +fn collect_scene_texts(source_dir: &Path) -> Result, AnalyzeSceneError> { + let mut out = Vec::new(); + collect_scene_texts_inner(source_dir, source_dir, &mut out)?; + Ok(out) +} + +fn collect_scene_texts_inner( + root: &Path, + dir: &Path, + out: &mut Vec<(String, String)>, +) -> Result<(), AnalyzeSceneError> { + let mut entries = fs::read_dir(dir) + .map_err(|err| AnalyzeSceneError::new(format!("failed to read {}: {err}", dir.display())))? + .collect::, _>>() + .map_err(|err| AnalyzeSceneError::new(format!("failed to read {}: {err}", dir.display())))?; + entries.sort_by_key(|entry| entry.path()); + + for entry in entries { + let path = entry.path(); + if path.is_dir() { + collect_scene_texts_inner(root, &path, out)?; + continue; + } + let Some(ext) = path.extension().and_then(|ext| ext.to_str()) else { + continue; + }; + if !matches!(ext, "html" | "js" | "ts" | "json") { + continue; + } + let content = fs::read_to_string(&path).map_err(|err| { + AnalyzeSceneError::new(format!("failed to read {}: {err}", path.display())) + })?; + let relative = path + .strip_prefix(root) + .unwrap_or(&path) + .to_string_lossy() + .replace('\\', "/"); + out.push((relative, content)); + } + + Ok(()) +} + fn meta_content(html: &str, name: &str) -> Option { for tag in html .split('<') @@ -160,43 +332,219 @@ fn attr_value(tag: &str, attr: &str) -> Option { Some(rest[..end].to_string()) } -/// Extract domain from the first external script URL found in HTML. -/// -/// Looks for `