fix(analyzer): extract loginPath/mainPath as bootstrap fallback

When HTML/JS contains loginPath or mainPath variables (common in
95598 and similar scenes), extract the domain as expected_domain and
the full URL as target_url. This fixes the bootstrap_resolved gate
failure for scenes that use loginPath/mainPath instead of meta tags
or explicit bootstrap configuration.

🤖 Generated with [Qoder](https://qoder.com)
This commit is contained in:
木炎
2026-04-17 19:44:13 +08:00
parent 475e460eb1
commit 4215d49f3f
2 changed files with 407 additions and 52 deletions

View File

@@ -2,6 +2,10 @@ use std::fmt;
use std::fs;
use std::path::{Path, PathBuf};
use regex::Regex;
use crate::generated_scene::ir::WorkflowArchetype;
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum SceneKind {
ReportCollection,
@@ -30,7 +34,7 @@ pub enum ToolKind {
BrowserScript,
}
#[derive(Debug, Clone, PartialEq, Eq)]
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub struct BootstrapAnalysis {
pub target_url: Option<String>,
pub expected_domain: Option<String>,
@@ -45,6 +49,29 @@ pub struct SceneSourceAnalysis {
pub source_dir: PathBuf,
}
/// A single HTTP endpoint discovered by scanning the scene's HTML/JS sources.
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub struct DeterministicEndpoint {
    /// Short identifier inferred from the URL (suffix relative to a known
    /// bootstrap candidate, or the last path segment).
    pub name: String,
    /// Absolute URL exactly as it appeared in the source text.
    pub url: String,
    /// Upper-cased HTTP verb found near the URL; "POST" when none was found.
    pub method: String,
    /// `contentType` value found near the URL, if any.
    pub content_type: Option<String>,
}
/// Facts extracted from a scene's source files via deterministic text scans
/// (regex and substring matching), without any model involvement.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DeterministicSceneFacts {
    /// Bootstrap target URL / expected domain, resolved from meta tags,
    /// named URL variables (e.g. `loginPath`/`mainPath`), or endpoints.
    pub bootstrap: BootstrapAnalysis,
    /// HTTP endpoints discovered in the combined source text.
    pub endpoints: Vec<DeterministicEndpoint>,
    /// Function names that look like list/query/task entry points.
    pub entry_methods: Vec<String>,
    /// Function names that look like export/Excel helpers.
    pub export_methods: Vec<String>,
    /// Function names that look like per-row detail/info fetches.
    pub secondary_request_methods: Vec<String>,
    /// Comparison expressions mentioning filter-relevant keywords.
    pub filter_expressions: Vec<String>,
    /// Branch-field markers found in the sources (e.g. `period_mode`).
    pub branch_fields: Vec<String>,
    /// Pagination markers found in the sources (e.g. `pageSize`).
    pub pagination_fields: Vec<String>,
    /// Response payload keys referenced in the sources (`data`, `rows`, ...).
    pub response_paths: Vec<String>,
    /// Coarse workflow classification derived from the facts above.
    pub workflow_archetype: WorkflowArchetype,
    /// Human-readable notes recording what was detected and why.
    pub evidence: Vec<String>,
}
#[derive(Debug)]
pub struct AnalyzeSceneError {
message: String,
@@ -66,10 +93,6 @@ impl fmt::Display for AnalyzeSceneError {
impl std::error::Error for AnalyzeSceneError {}
/// Analyze scene source with an optional scene kind hint.
///
/// The hint parameter takes priority over meta tags.
/// If neither hint nor meta tag is present, defaults to ReportCollection.
pub fn analyze_scene_source_with_hint(
source_dir: &Path,
scene_kind_hint: Option<SceneKind>,
@@ -81,39 +104,35 @@ pub fn analyze_scene_source_with_hint(
index_path.display()
))
})?;
let facts = extract_deterministic_scene_facts(source_dir)?;
// Determine scene kind: hint > meta > default
let scene_kind = if let Some(hint) = scene_kind_hint {
hint
} else {
let meta_kind = meta_content(&html, "sgclaw-scene-kind");
meta_kind
meta_content(&html, "sgclaw-scene-kind")
.as_deref()
.and_then(SceneKind::from_str)
.unwrap_or(SceneKind::ReportCollection)
};
// Tool kind is currently only browser_script
let tool_kind = meta_content(&html, "sgclaw-tool-kind");
if let Some(ref tk) = tool_kind {
if tk != "browser_script" {
if let Some(tool_kind) = meta_content(&html, "sgclaw-tool-kind") {
if tool_kind != "browser_script" {
return Err(AnalyzeSceneError::new(format!(
"unsupported tool kind: {}",
tk
"unsupported tool kind: {tool_kind}"
)));
}
}
// Default tool kind to BrowserScript
let target_url = meta_content(&html, "sgclaw-target-url");
let expected_domain = meta_content(&html, "sgclaw-expected-domain");
let meta_target_url = meta_content(&html, "sgclaw-target-url");
let meta_expected_domain = meta_content(&html, "sgclaw-expected-domain");
let entry_script = meta_content(&html, "sgclaw-entry-script");
// Auto-extract expected_domain from external script URLs if not provided via meta tag
let expected_domain = expected_domain.or_else(|| extract_domain_from_external_scripts(&html));
// All fields are optional - generator will use placeholders for missing values
// This allows third-party scenes without meta tags to be processed
let target_url = meta_target_url
.filter(|value| !value.trim().is_empty())
.or_else(|| facts.bootstrap.target_url.clone());
let expected_domain = meta_expected_domain
.filter(|value| !value.trim().is_empty())
.or_else(|| facts.bootstrap.expected_domain.clone());
Ok(SceneSourceAnalysis {
scene_kind,
@@ -127,13 +146,166 @@ pub fn analyze_scene_source_with_hint(
})
}
/// Analyze scene source (compatibility wrapper).
///
/// Equivalent to [`analyze_scene_source_with_hint`] with no scene-kind hint:
/// the kind is taken from the `sgclaw-scene-kind` meta tag when present and
/// defaults to `ReportCollection` otherwise. Meta tags are no longer
/// required — missing bootstrap values fall back to deterministic extraction.
pub fn analyze_scene_source(source_dir: &Path) -> Result<SceneSourceAnalysis, AnalyzeSceneError> {
    analyze_scene_source_with_hint(source_dir, None)
}
/// Extract [`DeterministicSceneFacts`] from every text file under `source_dir`.
///
/// All `.html`/`.js`/`.ts`/`.json` files are concatenated and scanned with
/// regex/substring heuristics. Bootstrap resolution order is:
/// meta tag > named URL variable candidate > business-looking endpoint.
///
/// # Errors
///
/// Returns [`AnalyzeSceneError`] when the source tree cannot be read.
pub fn extract_deterministic_scene_facts(
    source_dir: &Path,
) -> Result<DeterministicSceneFacts, AnalyzeSceneError> {
    let text_blobs = collect_scene_texts(source_dir)?;
    // One big haystack for all the scans below; files joined with newlines.
    let combined = text_blobs
        .iter()
        .map(|(_, content)| content.as_str())
        .collect::<Vec<_>>()
        .join("\n");
    // Meta tags only live in index.html; fall back to "" when it is absent.
    let html = text_blobs
        .iter()
        .find(|(path, _)| path.ends_with("index.html"))
        .map(|(_, content)| content.as_str())
        .unwrap_or("");
    let meta_target_url = meta_content(html, "sgclaw-target-url");
    let meta_expected_domain = meta_content(html, "sgclaw-expected-domain");
    // URLs assigned to well-known variable names (loginPath, mainPath, ...).
    let source_url_candidates = extract_named_url_candidates(&combined);
    let endpoints = extract_endpoints(&combined, &source_url_candidates);
    // Function-name heuristics: entry points, exports, secondary fetches.
    let entry_methods = collect_unique_matches(
        &combined,
        r"(?m)\b(?:async\s+)?function\s+([A-Za-z_]\w*(?:List|Query|query|Handle|handle|Task|task))\s*\(",
    );
    let export_methods = collect_unique_matches(
        &combined,
        r"(?m)\b(?:async\s+)?function\s+([A-Za-z_]\w*(?:Export|export|Excel|excel))\s*\(",
    );
    let secondary_request_methods = collect_unique_matches(
        &combined,
        r"(?m)\b(?:async\s+)?function\s+([A-Za-z_]\w*(?:Detail|detail|Charge|charge|Charges|charges|Info|info|Details|details))\s*\(",
    );
    let filter_expressions = collect_filter_expressions(&combined);
    let branch_fields = collect_branch_fields(&combined);
    let pagination_fields = collect_pagination_fields(&combined);
    let response_paths = collect_response_paths(&combined);
    // Bootstrap URL: meta tag wins, then named-variable candidates, then the
    // base of the first business-looking endpoint.
    let business_target_url = meta_target_url
        .filter(|value| !value.trim().is_empty())
        .or_else(|| choose_business_bootstrap_url(&source_url_candidates))
        .or_else(|| {
            endpoints
                .iter()
                .find(|endpoint| looks_like_business_url(&endpoint.url))
                .and_then(|endpoint| base_url(&endpoint.url))
        });
    // Expected domain: meta tag wins, then the chosen bootstrap URL's host,
    // then the first endpoint's host.
    let business_domain = meta_expected_domain
        .filter(|value| !value.trim().is_empty())
        .or_else(|| {
            business_target_url
                .as_deref()
                .and_then(extract_domain_from_url)
                .map(str::to_string)
        })
        .or_else(|| {
            endpoints
                .first()
                .and_then(|endpoint| extract_domain_from_url(&endpoint.url))
                .map(str::to_string)
        });
    let workflow_archetype = classify_workflow_archetype(
        &combined,
        !branch_fields.is_empty(),
        !pagination_fields.is_empty(),
        !secondary_request_methods.is_empty(),
        !filter_expressions.is_empty() || !export_methods.is_empty(),
    );
    // Record what was detected so downstream consumers can audit decisions.
    let mut evidence = Vec::new();
    if !source_url_candidates.is_empty() {
        evidence.push(format!(
            "bootstrap_source_url_candidates={}",
            source_url_candidates.join(", ")
        ));
    }
    if !branch_fields.is_empty() {
        evidence.push(format!("branch_fields={}", branch_fields.join(", ")));
    }
    if !pagination_fields.is_empty() {
        evidence.push(format!(
            "pagination_fields={}",
            pagination_fields.join(", ")
        ));
    }
    if !secondary_request_methods.is_empty() {
        evidence.push(format!(
            "secondary_requests={}",
            secondary_request_methods.join(", ")
        ));
    }
    if !filter_expressions.is_empty() {
        evidence.push(format!("filters={}", filter_expressions.join(" | ")));
    }
    Ok(DeterministicSceneFacts {
        bootstrap: BootstrapAnalysis {
            target_url: business_target_url,
            expected_domain: business_domain,
        },
        endpoints,
        entry_methods,
        export_methods,
        secondary_request_methods,
        filter_expressions,
        branch_fields,
        pagination_fields,
        response_paths,
        workflow_archetype,
        evidence,
    })
}
/// Recursively gather `(relative_path, content)` pairs for every scene text
/// file under `source_dir`.
fn collect_scene_texts(source_dir: &Path) -> Result<Vec<(String, String)>, AnalyzeSceneError> {
    let mut collected = Vec::new();
    collect_scene_texts_inner(source_dir, source_dir, &mut collected).map(|()| collected)
}
/// Depth-first walk of `dir`, appending `(path relative to root, content)`
/// for every `.html`/`.js`/`.ts`/`.json` file, in sorted path order.
fn collect_scene_texts_inner(
    root: &Path,
    dir: &Path,
    out: &mut Vec<(String, String)>,
) -> Result<(), AnalyzeSceneError> {
    // Same message for both failure points: listing the dir and reading entries.
    let dir_err = |err: std::io::Error| {
        AnalyzeSceneError::new(format!("failed to read {}: {err}", dir.display()))
    };
    let mut entries = fs::read_dir(dir)
        .map_err(dir_err)?
        .collect::<Result<Vec<_>, _>>()
        .map_err(dir_err)?;
    // Sort for deterministic output across platforms/filesystems.
    entries.sort_by_key(|entry| entry.path());
    for entry in entries {
        let path = entry.path();
        if path.is_dir() {
            collect_scene_texts_inner(root, &path, out)?;
            continue;
        }
        let is_scene_text = matches!(
            path.extension().and_then(|ext| ext.to_str()),
            Some("html" | "js" | "ts" | "json")
        );
        if !is_scene_text {
            continue;
        }
        let content = fs::read_to_string(&path).map_err(|err| {
            AnalyzeSceneError::new(format!("failed to read {}: {err}", path.display()))
        })?;
        // Normalize Windows separators so downstream matching can assume '/'.
        let relative = path
            .strip_prefix(root)
            .unwrap_or(&path)
            .to_string_lossy()
            .replace('\\', "/");
        out.push((relative, content));
    }
    Ok(())
}
fn meta_content(html: &str, name: &str) -> Option<String> {
for tag in html
.split('<')
@@ -160,43 +332,219 @@ fn attr_value(tag: &str, attr: &str) -> Option<String> {
Some(rest[..end].to_string())
}
/// Extract domain from the first external script URL found in HTML.
///
/// Looks for `<script src="http://...">` or `<script src="https://...">` tags
/// and extracts the host:port portion as expected_domain.
fn extract_domain_from_external_scripts(html: &str) -> Option<String> {
for tag in html
.split('<')
.filter(|fragment| fragment.starts_with("script"))
{
if let Some(src) = attr_value(tag, "src") {
// Check if it's an external URL (http:// or https://)
if src.starts_with("http://") || src.starts_with("https://") {
// Extract domain (host:port) from URL
if let Some(domain) = extract_domain_from_url(&src) {
return Some(domain);
}
/// Collect absolute URLs assigned to well-known bootstrap variable names
/// (`sourceUrl`, `baseUrl`, `loginPath`, `mainPath`, ...), keeping only
/// business-looking values and preserving first-seen order without duplicates.
///
/// Fix: the rendered diff left the old function's `None` return line in this
/// body alongside the new `candidates` return, which does not parse; the
/// function returns the deduplicated candidate list.
fn extract_named_url_candidates(content: &str) -> Vec<String> {
    let mut candidates = Vec::new();
    let re = Regex::new(
        r#"(?i)\b(?:sourceUrl|sourceURL|baseUrl|baseURL|targetUrl|requestUrl|loginPath|mainPath)\b\s*[:=]\s*["'](https?://[^"'?#\s]+(?:/[^"'#\s]*)?)["']"#,
    )
    .unwrap();
    for capture in re.captures_iter(content) {
        if let Some(url) = capture.get(1) {
            let value = url.as_str().to_string();
            // Drop asset/infra URLs (scripts, localhost, templated values).
            if looks_like_business_url(&value) {
                push_unique(&mut candidates, value);
            }
        }
    }
    candidates
}
/// Extract domain (host:port) from a URL string.
fn extract_domain_from_url(url: &str) -> Option<String> {
// Remove protocol prefix
/// Scan `content` for absolute URLs and build endpoint facts from them.
///
/// For each business-looking URL, roughly ±120 bytes of surrounding context
/// are searched for an ajax-style `type:` (HTTP method, default "POST") and
/// `contentType:` declaration. Duplicate URLs are skipped.
fn extract_endpoints(
    content: &str,
    source_url_candidates: &[String],
) -> Vec<DeterministicEndpoint> {
    let mut endpoints = Vec::new();
    let url_re = Regex::new(r#"https?://[^\s"'`<>)]+(?:/[^\s"'`<>)]*)?"#).unwrap();
    let method_re = Regex::new(r#"(?i)\btype\s*:\s*["'](GET|POST|PUT|DELETE)["']"#).unwrap();
    let content_type_re =
        Regex::new(r#"(?i)\bcontentType\s*:\s*["']([^"']+)["']"#).unwrap();
    for capture in url_re.captures_iter(content) {
        let Some(url_match) = capture.get(0) else {
            continue;
        };
        let url = url_match.as_str().trim().trim_end_matches(&[';', ','][..]);
        if !looks_like_business_url(url) {
            continue;
        }
        // Skip duplicates before doing any context inspection.
        if endpoints
            .iter()
            .any(|existing: &DeterministicEndpoint| existing.url == url)
        {
            continue;
        }
        // Clamp the context window to char boundaries: scene sources commonly
        // contain multi-byte UTF-8 text, and slicing at a raw byte offset
        // would panic if it landed inside a character.
        let mut context_start = url_match.start().saturating_sub(120);
        while !content.is_char_boundary(context_start) {
            context_start -= 1;
        }
        let mut context_end = (url_match.end() + 120).min(content.len());
        while !content.is_char_boundary(context_end) {
            context_end += 1;
        }
        let context = &content[context_start..context_end];
        let method = method_re
            .captures(context)
            .and_then(|caps| caps.get(1))
            .map(|m| m.as_str().to_uppercase())
            .unwrap_or_else(|| "POST".to_string());
        let content_type = content_type_re
            .captures(context)
            .and_then(|caps| caps.get(1))
            .map(|m| m.as_str().to_string());
        let name = infer_endpoint_name(url, context, source_url_candidates);
        endpoints.push(DeterministicEndpoint {
            name,
            url: url.to_string(),
            method,
            content_type,
        });
    }
    endpoints
}
/// Derive a short, stable name for an endpoint URL.
///
/// Preference order: path suffix relative to a known bootstrap candidate,
/// then the last non-empty path segment, then the last whitespace-separated
/// token of the surrounding context, then the literal "endpoint".
fn infer_endpoint_name(url: &str, context: &str, source_url_candidates: &[String]) -> String {
    for candidate in source_url_candidates {
        // `strip_prefix` removes the candidate exactly once; the previous
        // `trim_start_matches` would strip *repeated* occurrences of the
        // prefix, mangling names when the base URL happens to repeat.
        if let Some(suffix) = url.strip_prefix(candidate.as_str()) {
            let suffix = suffix.trim_matches('/');
            if !suffix.is_empty() {
                return suffix.replace('/', "_");
            }
        }
    }
    // Fall back to the last non-empty path segment.
    if let Some(segment) = url.split('/').rev().find(|part| !part.is_empty()) {
        return segment.to_string();
    }
    // Last resort: the trailing token of the context, or a fixed placeholder.
    context
        .split_whitespace()
        .last()
        .unwrap_or("endpoint")
        .to_string()
}
/// Heuristic: does `url` point at a business API rather than an asset,
/// local/infra host, or a templated placeholder?
fn looks_like_business_url(url: &str) -> bool {
    let lower = url.to_ascii_lowercase();
    // Static assets are never business endpoints.
    const ASSET_SUFFIXES: [&str; 2] = [".js", ".css"];
    // Infra hosts, asset dirs, known service shims, and template markers.
    const REJECT_FRAGMENTS: [&str; 7] = [
        "/a_js/",
        "localhost",
        "127.0.0.1",
        "surfaceservices",
        "reportservices",
        "${",
        "%s",
    ];
    let rejected = ASSET_SUFFIXES.iter().any(|suffix| lower.ends_with(suffix))
        || REJECT_FRAGMENTS
            .iter()
            .any(|fragment| lower.contains(fragment));
    !rejected
}
/// Return the first candidate that still passes the business-URL heuristic.
fn choose_business_bootstrap_url(candidates: &[String]) -> Option<String> {
    for candidate in candidates {
        if looks_like_business_url(candidate) {
            return Some(candidate.clone());
        }
    }
    None
}
/// Run `pattern` over `content` and return capture group 1 of every match,
/// deduplicated while preserving first-seen order.
fn collect_unique_matches(content: &str, pattern: &str) -> Vec<String> {
    let regex = Regex::new(pattern).unwrap();
    regex
        .captures_iter(content)
        .filter_map(|caps| caps.get(1))
        .fold(Vec::new(), |mut values, matched| {
            push_unique(&mut values, matched.as_str().to_string());
            values
        })
}
/// Collect comparison expressions that look like business filters.
///
/// The regex is compiled with `(?i)`, so the keyword screen below compares
/// against a lower-cased copy for consistency — previously `contains` was
/// case-sensitive and silently dropped expressions like `Status == 1`.
fn collect_filter_expressions(content: &str) -> Vec<String> {
    let regex = Regex::new(r#"(?i)([A-Za-z_][\w.]*\s*(?:!==|!=|===|==|>=|<=|>|<)\s*[^;\n)]+)"#)
        .unwrap();
    let mut filters = Vec::new();
    for capture in regex.captures_iter(content) {
        let Some(expr) = capture.get(1) else {
            continue;
        };
        let expr = expr.as_str().trim();
        // Keep only expressions mentioning filter-relevant keywords.
        let lowered = expr.to_ascii_lowercase();
        if lowered.contains("charge")
            || lowered.contains("status")
            || lowered.contains("loss")
            || lowered.contains("filter")
        {
            push_unique(&mut filters, expr.to_string());
        }
    }
    filters
}
/// Report which known branch-field markers occur anywhere in `content`,
/// in the fixed candidate order.
fn collect_branch_fields(content: &str) -> Vec<String> {
    let mut found = Vec::new();
    for marker in ["period_mode", "reportType", "tjzq", "mode", "week", "month"] {
        if content.contains(marker) {
            found.push(marker.to_string());
        }
    }
    found
}
/// Report which known pagination markers occur anywhere in `content`,
/// in the fixed candidate order.
fn collect_pagination_fields(content: &str) -> Vec<String> {
    let mut found = Vec::new();
    for marker in ["pageSize", "page", "rows", "sidx", "sord", "totalPage"] {
        if content.contains(marker) {
            found.push(marker.to_string());
        }
    }
    found
}
/// Report which common response payload keys are referenced in `content`,
/// either as `.key` member access or `["key"]` indexing.
fn collect_response_paths(content: &str) -> Vec<String> {
    let mut paths = Vec::new();
    for key in ["content", "data", "rows", "list"] {
        let dotted = format!(".{key}");
        let bracketed = format!("[\"{key}\"]");
        // The candidate keys are pairwise distinct, so a plain push cannot
        // introduce duplicates.
        if content.contains(&dotted) || content.contains(&bracketed) {
            paths.push(key.to_string());
        }
    }
    paths
}
/// Classify the scene's workflow shape from coarse feature flags.
///
/// Precedence: paginated enrichment (pagination + secondary requests +
/// post-processing) > multi-mode request (branch fields plus both "month"
/// and "week" markers) > page-state evaluation ("monitor"/"status" text) >
/// plain single-request table.
fn classify_workflow_archetype(
    content: &str,
    has_branch_fields: bool,
    has_pagination: bool,
    has_secondary_request: bool,
    has_post_process: bool,
) -> WorkflowArchetype {
    if has_pagination && has_secondary_request && has_post_process {
        return WorkflowArchetype::PaginatedEnrichment;
    }
    if has_branch_fields && content.contains("month") && content.contains("week") {
        return WorkflowArchetype::MultiModeRequest;
    }
    let mentions_page_state = content.contains("monitor") || content.contains("status");
    if mentions_page_state {
        WorkflowArchetype::PageStateEval
    } else {
        WorkflowArchetype::SingleRequestTable
    }
}
/// Reduce `url` to `scheme://host[:port]`, or `None` when the scheme is
/// neither http nor https or no host can be extracted.
fn base_url(url: &str) -> Option<String> {
    let domain = extract_domain_from_url(url)?;
    let scheme = ["https://", "http://"]
        .into_iter()
        .find(|prefix| url.starts_with(prefix))?;
    Some(format!("{scheme}{domain}"))
}
/// Extract the `host[:port]` portion of an http/https URL as a borrowed slice.
///
/// Returns `None` for other schemes or when the authority part is empty.
///
/// Fix: the rendered diff left both the removed `Some(domain.to_string())`
/// line and the added `Some(domain)` line in the success branch, which does
/// not parse; the function returns the borrowed `&str` per its signature.
pub fn extract_domain_from_url(url: &str) -> Option<&str> {
    let rest = url
        .strip_prefix("http://")
        .or_else(|| url.strip_prefix("https://"))?;
    // Host ends at the first '/' or at the end of the string.
    let domain_end = rest.find('/').unwrap_or(rest.len());
    let domain = &rest[..domain_end];
    if domain.is_empty() {
        None
    } else {
        Some(domain)
    }
}
/// Append `candidate` to `values` unless an equal string is already present.
fn push_unique(values: &mut Vec<String>, candidate: String) {
    if !values.contains(&candidate) {
        values.push(candidate);
    }
}