fix(analyzer): extract loginPath/mainPath as bootstrap fallback

When HTML/JS contains loginPath or mainPath variables (common in
95598 and similar scenes), extract the domain as expected_domain and
the full URL as target_url. This fixes the bootstrap_resolved gate
failure for scenes that use loginPath/mainPath instead of meta tags
or explicit bootstrap configuration.

🤖 Generated with [Qoder](https://qoder.com)
This commit is contained in:
木炎
2026-04-17 19:44:13 +08:00
parent 475e460eb1
commit 4215d49f3f
2 changed files with 407 additions and 52 deletions

View File

@@ -2,6 +2,10 @@ use std::fmt;
use std::fs;
use std::path::{Path, PathBuf};
use regex::Regex;
use crate::generated_scene::ir::WorkflowArchetype;
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum SceneKind {
ReportCollection,
@@ -30,7 +34,7 @@ pub enum ToolKind {
BrowserScript,
}
#[derive(Debug, Clone, PartialEq, Eq)]
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub struct BootstrapAnalysis {
pub target_url: Option<String>,
pub expected_domain: Option<String>,
@@ -45,6 +49,29 @@ pub struct SceneSourceAnalysis {
pub source_dir: PathBuf,
}
/// A single HTTP endpoint discovered by scanning the scene's HTML/JS sources.
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub struct DeterministicEndpoint {
    /// Short identifier inferred from the URL (suffix relative to a known
    /// bootstrap candidate, or the last path segment).
    pub name: String,
    /// Absolute URL exactly as it appeared in the source text.
    pub url: String,
    /// Upper-cased HTTP verb found near the URL; "POST" when none was found.
    pub method: String,
    /// `contentType` value found near the URL, if any.
    pub content_type: Option<String>,
}
/// Facts extracted from a scene's source files via deterministic text scans
/// (regex and substring matching), without any model involvement.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DeterministicSceneFacts {
    /// Bootstrap target URL / expected domain, resolved from meta tags,
    /// named URL variables (e.g. `loginPath`/`mainPath`), or endpoints.
    pub bootstrap: BootstrapAnalysis,
    /// HTTP endpoints discovered in the combined source text.
    pub endpoints: Vec<DeterministicEndpoint>,
    /// Function names that look like list/query/task entry points.
    pub entry_methods: Vec<String>,
    /// Function names that look like export/Excel helpers.
    pub export_methods: Vec<String>,
    /// Function names that look like per-row detail/info fetches.
    pub secondary_request_methods: Vec<String>,
    /// Comparison expressions mentioning filter-relevant keywords.
    pub filter_expressions: Vec<String>,
    /// Branch-field markers found in the sources (e.g. `period_mode`).
    pub branch_fields: Vec<String>,
    /// Pagination markers found in the sources (e.g. `pageSize`).
    pub pagination_fields: Vec<String>,
    /// Response payload keys referenced in the sources (`data`, `rows`, ...).
    pub response_paths: Vec<String>,
    /// Coarse workflow classification derived from the facts above.
    pub workflow_archetype: WorkflowArchetype,
    /// Human-readable notes recording what was detected and why.
    pub evidence: Vec<String>,
}
#[derive(Debug)]
pub struct AnalyzeSceneError {
message: String,
@@ -66,10 +93,6 @@ impl fmt::Display for AnalyzeSceneError {
impl std::error::Error for AnalyzeSceneError {}
/// Analyze scene source with an optional scene kind hint.
///
/// The hint parameter takes priority over meta tags.
/// If neither hint nor meta tag is present, defaults to ReportCollection.
pub fn analyze_scene_source_with_hint(
source_dir: &Path,
scene_kind_hint: Option<SceneKind>,
@@ -81,39 +104,35 @@ pub fn analyze_scene_source_with_hint(
index_path.display()
))
})?;
let facts = extract_deterministic_scene_facts(source_dir)?;
// Determine scene kind: hint > meta > default
let scene_kind = if let Some(hint) = scene_kind_hint {
hint
} else {
let meta_kind = meta_content(&html, "sgclaw-scene-kind");
meta_kind
meta_content(&html, "sgclaw-scene-kind")
.as_deref()
.and_then(SceneKind::from_str)
.unwrap_or(SceneKind::ReportCollection)
};
// Tool kind is currently only browser_script
let tool_kind = meta_content(&html, "sgclaw-tool-kind");
if let Some(ref tk) = tool_kind {
if tk != "browser_script" {
if let Some(tool_kind) = meta_content(&html, "sgclaw-tool-kind") {
if tool_kind != "browser_script" {
return Err(AnalyzeSceneError::new(format!(
"unsupported tool kind: {}",
tk
"unsupported tool kind: {tool_kind}"
)));
}
}
// Default tool kind to BrowserScript
let target_url = meta_content(&html, "sgclaw-target-url");
let expected_domain = meta_content(&html, "sgclaw-expected-domain");
let meta_target_url = meta_content(&html, "sgclaw-target-url");
let meta_expected_domain = meta_content(&html, "sgclaw-expected-domain");
let entry_script = meta_content(&html, "sgclaw-entry-script");
// Auto-extract expected_domain from external script URLs if not provided via meta tag
let expected_domain = expected_domain.or_else(|| extract_domain_from_external_scripts(&html));
// All fields are optional - generator will use placeholders for missing values
// This allows third-party scenes without meta tags to be processed
let target_url = meta_target_url
.filter(|value| !value.trim().is_empty())
.or_else(|| facts.bootstrap.target_url.clone());
let expected_domain = meta_expected_domain
.filter(|value| !value.trim().is_empty())
.or_else(|| facts.bootstrap.expected_domain.clone());
Ok(SceneSourceAnalysis {
scene_kind,
@@ -127,13 +146,166 @@ pub fn analyze_scene_source_with_hint(
})
}
/// Analyze scene source (compatibility wrapper).
///
/// Equivalent to [`analyze_scene_source_with_hint`] with no scene-kind hint:
/// the kind is taken from the `sgclaw-scene-kind` meta tag when present and
/// defaults to `ReportCollection` otherwise. Meta tags are no longer
/// required — missing bootstrap values fall back to deterministic extraction.
pub fn analyze_scene_source(source_dir: &Path) -> Result<SceneSourceAnalysis, AnalyzeSceneError> {
    analyze_scene_source_with_hint(source_dir, None)
}
/// Extract [`DeterministicSceneFacts`] from every text file under `source_dir`.
///
/// All `.html`/`.js`/`.ts`/`.json` files are concatenated and scanned with
/// regex/substring heuristics. Bootstrap resolution order is:
/// meta tag > named URL variable candidate > business-looking endpoint.
///
/// # Errors
///
/// Returns [`AnalyzeSceneError`] when the source tree cannot be read.
pub fn extract_deterministic_scene_facts(
    source_dir: &Path,
) -> Result<DeterministicSceneFacts, AnalyzeSceneError> {
    let text_blobs = collect_scene_texts(source_dir)?;
    // One big haystack for all the scans below; files joined with newlines.
    let combined = text_blobs
        .iter()
        .map(|(_, content)| content.as_str())
        .collect::<Vec<_>>()
        .join("\n");
    // Meta tags only live in index.html; fall back to "" when it is absent.
    let html = text_blobs
        .iter()
        .find(|(path, _)| path.ends_with("index.html"))
        .map(|(_, content)| content.as_str())
        .unwrap_or("");
    let meta_target_url = meta_content(html, "sgclaw-target-url");
    let meta_expected_domain = meta_content(html, "sgclaw-expected-domain");
    // URLs assigned to well-known variable names (loginPath, mainPath, ...).
    let source_url_candidates = extract_named_url_candidates(&combined);
    let endpoints = extract_endpoints(&combined, &source_url_candidates);
    // Function-name heuristics: entry points, exports, secondary fetches.
    let entry_methods = collect_unique_matches(
        &combined,
        r"(?m)\b(?:async\s+)?function\s+([A-Za-z_]\w*(?:List|Query|query|Handle|handle|Task|task))\s*\(",
    );
    let export_methods = collect_unique_matches(
        &combined,
        r"(?m)\b(?:async\s+)?function\s+([A-Za-z_]\w*(?:Export|export|Excel|excel))\s*\(",
    );
    let secondary_request_methods = collect_unique_matches(
        &combined,
        r"(?m)\b(?:async\s+)?function\s+([A-Za-z_]\w*(?:Detail|detail|Charge|charge|Charges|charges|Info|info|Details|details))\s*\(",
    );
    let filter_expressions = collect_filter_expressions(&combined);
    let branch_fields = collect_branch_fields(&combined);
    let pagination_fields = collect_pagination_fields(&combined);
    let response_paths = collect_response_paths(&combined);
    // Bootstrap URL: meta tag wins, then named-variable candidates, then the
    // base of the first business-looking endpoint.
    let business_target_url = meta_target_url
        .filter(|value| !value.trim().is_empty())
        .or_else(|| choose_business_bootstrap_url(&source_url_candidates))
        .or_else(|| {
            endpoints
                .iter()
                .find(|endpoint| looks_like_business_url(&endpoint.url))
                .and_then(|endpoint| base_url(&endpoint.url))
        });
    // Expected domain: meta tag wins, then the chosen bootstrap URL's host,
    // then the first endpoint's host.
    let business_domain = meta_expected_domain
        .filter(|value| !value.trim().is_empty())
        .or_else(|| {
            business_target_url
                .as_deref()
                .and_then(extract_domain_from_url)
                .map(str::to_string)
        })
        .or_else(|| {
            endpoints
                .first()
                .and_then(|endpoint| extract_domain_from_url(&endpoint.url))
                .map(str::to_string)
        });
    let workflow_archetype = classify_workflow_archetype(
        &combined,
        !branch_fields.is_empty(),
        !pagination_fields.is_empty(),
        !secondary_request_methods.is_empty(),
        !filter_expressions.is_empty() || !export_methods.is_empty(),
    );
    // Record what was detected so downstream consumers can audit decisions.
    let mut evidence = Vec::new();
    if !source_url_candidates.is_empty() {
        evidence.push(format!(
            "bootstrap_source_url_candidates={}",
            source_url_candidates.join(", ")
        ));
    }
    if !branch_fields.is_empty() {
        evidence.push(format!("branch_fields={}", branch_fields.join(", ")));
    }
    if !pagination_fields.is_empty() {
        evidence.push(format!(
            "pagination_fields={}",
            pagination_fields.join(", ")
        ));
    }
    if !secondary_request_methods.is_empty() {
        evidence.push(format!(
            "secondary_requests={}",
            secondary_request_methods.join(", ")
        ));
    }
    if !filter_expressions.is_empty() {
        evidence.push(format!("filters={}", filter_expressions.join(" | ")));
    }
    Ok(DeterministicSceneFacts {
        bootstrap: BootstrapAnalysis {
            target_url: business_target_url,
            expected_domain: business_domain,
        },
        endpoints,
        entry_methods,
        export_methods,
        secondary_request_methods,
        filter_expressions,
        branch_fields,
        pagination_fields,
        response_paths,
        workflow_archetype,
        evidence,
    })
}
/// Recursively gather `(relative_path, content)` pairs for every scene text
/// file under `source_dir`.
fn collect_scene_texts(source_dir: &Path) -> Result<Vec<(String, String)>, AnalyzeSceneError> {
    let mut collected = Vec::new();
    collect_scene_texts_inner(source_dir, source_dir, &mut collected).map(|()| collected)
}
/// Depth-first walk of `dir`, appending `(path relative to root, content)`
/// for every `.html`/`.js`/`.ts`/`.json` file, in sorted path order.
fn collect_scene_texts_inner(
    root: &Path,
    dir: &Path,
    out: &mut Vec<(String, String)>,
) -> Result<(), AnalyzeSceneError> {
    // Same message for both failure points: listing the dir and reading entries.
    let dir_err = |err: std::io::Error| {
        AnalyzeSceneError::new(format!("failed to read {}: {err}", dir.display()))
    };
    let mut entries = fs::read_dir(dir)
        .map_err(dir_err)?
        .collect::<Result<Vec<_>, _>>()
        .map_err(dir_err)?;
    // Sort for deterministic output across platforms/filesystems.
    entries.sort_by_key(|entry| entry.path());
    for entry in entries {
        let path = entry.path();
        if path.is_dir() {
            collect_scene_texts_inner(root, &path, out)?;
            continue;
        }
        let is_scene_text = matches!(
            path.extension().and_then(|ext| ext.to_str()),
            Some("html" | "js" | "ts" | "json")
        );
        if !is_scene_text {
            continue;
        }
        let content = fs::read_to_string(&path).map_err(|err| {
            AnalyzeSceneError::new(format!("failed to read {}: {err}", path.display()))
        })?;
        // Normalize Windows separators so downstream matching can assume '/'.
        let relative = path
            .strip_prefix(root)
            .unwrap_or(&path)
            .to_string_lossy()
            .replace('\\', "/");
        out.push((relative, content));
    }
    Ok(())
}
fn meta_content(html: &str, name: &str) -> Option<String> {
for tag in html
.split('<')
@@ -160,43 +332,219 @@ fn attr_value(tag: &str, attr: &str) -> Option<String> {
Some(rest[..end].to_string())
}
/// Extract domain from the first external script URL found in HTML.
///
/// Looks for `<script src="http://...">` or `<script src="https://...">` tags
/// and extracts the host:port portion as expected_domain.
fn extract_domain_from_external_scripts(html: &str) -> Option<String> {
for tag in html
.split('<')
.filter(|fragment| fragment.starts_with("script"))
{
if let Some(src) = attr_value(tag, "src") {
// Check if it's an external URL (http:// or https://)
if src.starts_with("http://") || src.starts_with("https://") {
// Extract domain (host:port) from URL
if let Some(domain) = extract_domain_from_url(&src) {
return Some(domain);
}
/// Collect absolute URLs assigned to well-known bootstrap variable names
/// (`sourceUrl`, `baseUrl`, `loginPath`, `mainPath`, ...), keeping only
/// business-looking values and preserving first-seen order without duplicates.
///
/// Fix: the rendered diff left the old function's `None` return line in this
/// body alongside the new `candidates` return, which does not parse; the
/// function returns the deduplicated candidate list.
fn extract_named_url_candidates(content: &str) -> Vec<String> {
    let mut candidates = Vec::new();
    let re = Regex::new(
        r#"(?i)\b(?:sourceUrl|sourceURL|baseUrl|baseURL|targetUrl|requestUrl|loginPath|mainPath)\b\s*[:=]\s*["'](https?://[^"'?#\s]+(?:/[^"'#\s]*)?)["']"#,
    )
    .unwrap();
    for capture in re.captures_iter(content) {
        if let Some(url) = capture.get(1) {
            let value = url.as_str().to_string();
            // Drop asset/infra URLs (scripts, localhost, templated values).
            if looks_like_business_url(&value) {
                push_unique(&mut candidates, value);
            }
        }
    }
    candidates
}
/// Extract domain (host:port) from a URL string.
fn extract_domain_from_url(url: &str) -> Option<String> {
// Remove protocol prefix
/// Scan `content` for absolute URLs and build endpoint facts from them.
///
/// For each business-looking URL, roughly ±120 bytes of surrounding context
/// are searched for an ajax-style `type:` (HTTP method, default "POST") and
/// `contentType:` declaration. Duplicate URLs are skipped.
fn extract_endpoints(
    content: &str,
    source_url_candidates: &[String],
) -> Vec<DeterministicEndpoint> {
    let mut endpoints = Vec::new();
    let url_re = Regex::new(r#"https?://[^\s"'`<>)]+(?:/[^\s"'`<>)]*)?"#).unwrap();
    let method_re = Regex::new(r#"(?i)\btype\s*:\s*["'](GET|POST|PUT|DELETE)["']"#).unwrap();
    let content_type_re =
        Regex::new(r#"(?i)\bcontentType\s*:\s*["']([^"']+)["']"#).unwrap();
    for capture in url_re.captures_iter(content) {
        let Some(url_match) = capture.get(0) else {
            continue;
        };
        let url = url_match.as_str().trim().trim_end_matches(&[';', ','][..]);
        if !looks_like_business_url(url) {
            continue;
        }
        // Skip duplicates before doing any context inspection.
        if endpoints
            .iter()
            .any(|existing: &DeterministicEndpoint| existing.url == url)
        {
            continue;
        }
        // Clamp the context window to char boundaries: scene sources commonly
        // contain multi-byte UTF-8 text, and slicing at a raw byte offset
        // would panic if it landed inside a character.
        let mut context_start = url_match.start().saturating_sub(120);
        while !content.is_char_boundary(context_start) {
            context_start -= 1;
        }
        let mut context_end = (url_match.end() + 120).min(content.len());
        while !content.is_char_boundary(context_end) {
            context_end += 1;
        }
        let context = &content[context_start..context_end];
        let method = method_re
            .captures(context)
            .and_then(|caps| caps.get(1))
            .map(|m| m.as_str().to_uppercase())
            .unwrap_or_else(|| "POST".to_string());
        let content_type = content_type_re
            .captures(context)
            .and_then(|caps| caps.get(1))
            .map(|m| m.as_str().to_string());
        let name = infer_endpoint_name(url, context, source_url_candidates);
        endpoints.push(DeterministicEndpoint {
            name,
            url: url.to_string(),
            method,
            content_type,
        });
    }
    endpoints
}
/// Derive a short, stable name for an endpoint URL.
///
/// Preference order: path suffix relative to a known bootstrap candidate,
/// then the last non-empty path segment, then the last whitespace-separated
/// token of the surrounding context, then the literal "endpoint".
fn infer_endpoint_name(url: &str, context: &str, source_url_candidates: &[String]) -> String {
    for candidate in source_url_candidates {
        // `strip_prefix` removes the candidate exactly once; the previous
        // `trim_start_matches` would strip *repeated* occurrences of the
        // prefix, mangling names when the base URL happens to repeat.
        if let Some(suffix) = url.strip_prefix(candidate.as_str()) {
            let suffix = suffix.trim_matches('/');
            if !suffix.is_empty() {
                return suffix.replace('/', "_");
            }
        }
    }
    // Fall back to the last non-empty path segment.
    if let Some(segment) = url.split('/').rev().find(|part| !part.is_empty()) {
        return segment.to_string();
    }
    // Last resort: the trailing token of the context, or a fixed placeholder.
    context
        .split_whitespace()
        .last()
        .unwrap_or("endpoint")
        .to_string()
}
/// Heuristic: does `url` point at a business API rather than an asset,
/// local/infra host, or a templated placeholder?
fn looks_like_business_url(url: &str) -> bool {
    let lower = url.to_ascii_lowercase();
    // Static assets are never business endpoints.
    const ASSET_SUFFIXES: [&str; 2] = [".js", ".css"];
    // Infra hosts, asset dirs, known service shims, and template markers.
    const REJECT_FRAGMENTS: [&str; 7] = [
        "/a_js/",
        "localhost",
        "127.0.0.1",
        "surfaceservices",
        "reportservices",
        "${",
        "%s",
    ];
    let rejected = ASSET_SUFFIXES.iter().any(|suffix| lower.ends_with(suffix))
        || REJECT_FRAGMENTS
            .iter()
            .any(|fragment| lower.contains(fragment));
    !rejected
}
/// Return the first candidate that still passes the business-URL heuristic.
fn choose_business_bootstrap_url(candidates: &[String]) -> Option<String> {
    for candidate in candidates {
        if looks_like_business_url(candidate) {
            return Some(candidate.clone());
        }
    }
    None
}
/// Run `pattern` over `content` and return capture group 1 of every match,
/// deduplicated while preserving first-seen order.
fn collect_unique_matches(content: &str, pattern: &str) -> Vec<String> {
    let regex = Regex::new(pattern).unwrap();
    regex
        .captures_iter(content)
        .filter_map(|caps| caps.get(1))
        .fold(Vec::new(), |mut values, matched| {
            push_unique(&mut values, matched.as_str().to_string());
            values
        })
}
/// Collect comparison expressions that look like business filters.
///
/// The regex is compiled with `(?i)`, so the keyword screen below compares
/// against a lower-cased copy for consistency — previously `contains` was
/// case-sensitive and silently dropped expressions like `Status == 1`.
fn collect_filter_expressions(content: &str) -> Vec<String> {
    let regex = Regex::new(r#"(?i)([A-Za-z_][\w.]*\s*(?:!==|!=|===|==|>=|<=|>|<)\s*[^;\n)]+)"#)
        .unwrap();
    let mut filters = Vec::new();
    for capture in regex.captures_iter(content) {
        let Some(expr) = capture.get(1) else {
            continue;
        };
        let expr = expr.as_str().trim();
        // Keep only expressions mentioning filter-relevant keywords.
        let lowered = expr.to_ascii_lowercase();
        if lowered.contains("charge")
            || lowered.contains("status")
            || lowered.contains("loss")
            || lowered.contains("filter")
        {
            push_unique(&mut filters, expr.to_string());
        }
    }
    filters
}
/// Report which known branch-field markers occur anywhere in `content`,
/// in the fixed candidate order.
fn collect_branch_fields(content: &str) -> Vec<String> {
    let mut found = Vec::new();
    for marker in ["period_mode", "reportType", "tjzq", "mode", "week", "month"] {
        if content.contains(marker) {
            found.push(marker.to_string());
        }
    }
    found
}
/// Report which known pagination markers occur anywhere in `content`,
/// in the fixed candidate order.
fn collect_pagination_fields(content: &str) -> Vec<String> {
    let mut found = Vec::new();
    for marker in ["pageSize", "page", "rows", "sidx", "sord", "totalPage"] {
        if content.contains(marker) {
            found.push(marker.to_string());
        }
    }
    found
}
/// Report which common response payload keys are referenced in `content`,
/// either as `.key` member access or `["key"]` indexing.
fn collect_response_paths(content: &str) -> Vec<String> {
    let mut paths = Vec::new();
    for key in ["content", "data", "rows", "list"] {
        let dotted = format!(".{key}");
        let bracketed = format!("[\"{key}\"]");
        // The candidate keys are pairwise distinct, so a plain push cannot
        // introduce duplicates.
        if content.contains(&dotted) || content.contains(&bracketed) {
            paths.push(key.to_string());
        }
    }
    paths
}
/// Classify the scene's workflow shape from coarse feature flags.
///
/// Precedence: paginated enrichment (pagination + secondary requests +
/// post-processing) > multi-mode request (branch fields plus both "month"
/// and "week" markers) > page-state evaluation ("monitor"/"status" text) >
/// plain single-request table.
fn classify_workflow_archetype(
    content: &str,
    has_branch_fields: bool,
    has_pagination: bool,
    has_secondary_request: bool,
    has_post_process: bool,
) -> WorkflowArchetype {
    if has_pagination && has_secondary_request && has_post_process {
        return WorkflowArchetype::PaginatedEnrichment;
    }
    if has_branch_fields && content.contains("month") && content.contains("week") {
        return WorkflowArchetype::MultiModeRequest;
    }
    let mentions_page_state = content.contains("monitor") || content.contains("status");
    if mentions_page_state {
        WorkflowArchetype::PageStateEval
    } else {
        WorkflowArchetype::SingleRequestTable
    }
}
/// Reduce `url` to `scheme://host[:port]`, or `None` when the scheme is
/// neither http nor https or no host can be extracted.
fn base_url(url: &str) -> Option<String> {
    let domain = extract_domain_from_url(url)?;
    let scheme = ["https://", "http://"]
        .into_iter()
        .find(|prefix| url.starts_with(prefix))?;
    Some(format!("{scheme}{domain}"))
}
/// Extract the `host[:port]` portion of an http/https URL as a borrowed slice.
///
/// Returns `None` for other schemes or when the authority part is empty.
///
/// Fix: the rendered diff left both the removed `Some(domain.to_string())`
/// line and the added `Some(domain)` line in the success branch, which does
/// not parse; the function returns the borrowed `&str` per its signature.
pub fn extract_domain_from_url(url: &str) -> Option<&str> {
    let rest = url
        .strip_prefix("http://")
        .or_else(|| url.strip_prefix("https://"))?;
    // Host ends at the first '/' or at the end of the string.
    let domain_end = rest.find('/').unwrap_or(rest.len());
    let domain = &rest[..domain_end];
    if domain.is_empty() {
        None
    } else {
        Some(domain)
    }
}
/// Append `candidate` to `values` unless an equal string is already present.
fn push_unique(values: &mut Vec<String>, candidate: String) {
    if !values.contains(&candidate) {
        values.push(candidate);
    }
}