fix(analyzer): extract loginPath/mainPath as bootstrap fallback
When the scene HTML/JS contains loginPath or mainPath variables (common in 95598 and similar scenes), the analyzer now extracts the URL's domain as expected_domain and the full URL as target_url. This fixes the bootstrap_resolved gate failure for scenes that use loginPath/mainPath instead of meta tags or an explicit bootstrap configuration.

🤖 Generated with [Qoder](https://qoder.com)
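As an illustration of the intended fallback, here is a minimal test-style sketch (not part of this commit) exercising the Rust analyzer path. The temp-dir scene layout, the 10.10.1.1:8080 host, and placing the test alongside the analyzer module so that extract_deterministic_scene_facts is in scope are all assumptions.

// Hypothetical sketch: a scene with no sgclaw-* meta tags, only a loginPath
// variable in inline JS. The host 10.10.1.1:8080 is an invented example value.
#[test]
fn login_path_fallback_resolves_bootstrap() -> Result<(), Box<dyn std::error::Error>> {
    let dir = std::env::temp_dir().join("sgclaw_loginpath_scene");
    std::fs::create_dir_all(&dir)?;
    std::fs::write(
        dir.join("index.html"),
        r#"<html><script>var loginPath = "http://10.10.1.1:8080/portal/login";</script></html>"#,
    )?;

    let facts = extract_deterministic_scene_facts(&dir)?;
    // The full URL becomes target_url; its host:port becomes expected_domain.
    assert_eq!(
        facts.bootstrap.target_url.as_deref(),
        Some("http://10.10.1.1:8080/portal/login")
    );
    assert_eq!(facts.bootstrap.expected_domain.as_deref(), Some("10.10.1.1:8080"));
    Ok(())
}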
@@ -445,14 +445,21 @@ function collectBootstrapHints(files, indexHtml) {
   for (const file of files) {
     const namedUrlMatches = file.content.matchAll(
-      /\b(sourceUrl|sourceURL|baseUrl|baseURL|targetUrl|requestUrl|apiUrl|gatewayUrl)\b\s*[:=]\s*(['"`])(https?:\/\/[^'"`\s]+)\2/gi
+      /\b(sourceUrl|sourceURL|baseUrl|baseURL|targetUrl|requestUrl|apiUrl|gatewayUrl|loginPath|mainPath)\b\s*[:=]\s*(['"`])(https?:\/\/[^'"`\s]+)\2/gi
     );
     for (const match of namedUrlMatches) {
      const url = match[3];
      const type = String(match[1] || "").toLowerCase();
      if (url && !seen.has(url)) {
        seen.add(url);
-        hints.push({ type, url, path: file.path });
+        // loginPath/mainPath are bootstrap hints — the domain is expected_domain
+        if (type === "loginpath" || type === "mainpath") {
+          const domain = new URL(url).hostname;
+          hints.push({ type: "expected_domain", value: domain, path: file.path });
+          hints.push({ type: "target_url", value: url, path: file.path });
+        } else {
+          hints.push({ type, url, path: file.path });
+        }
      }
    }
  }
@@ -2,6 +2,10 @@ use std::fmt;
 use std::fs;
 use std::path::{Path, PathBuf};
 
+use regex::Regex;
+
+use crate::generated_scene::ir::WorkflowArchetype;
+
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub enum SceneKind {
     ReportCollection,
@@ -30,7 +34,7 @@ pub enum ToolKind {
     BrowserScript,
 }
 
-#[derive(Debug, Clone, PartialEq, Eq)]
+#[derive(Debug, Clone, PartialEq, Eq, Default)]
 pub struct BootstrapAnalysis {
     pub target_url: Option<String>,
     pub expected_domain: Option<String>,
@@ -45,6 +49,29 @@ pub struct SceneSourceAnalysis {
     pub source_dir: PathBuf,
 }
 
+#[derive(Debug, Clone, PartialEq, Eq, Default)]
+pub struct DeterministicEndpoint {
+    pub name: String,
+    pub url: String,
+    pub method: String,
+    pub content_type: Option<String>,
+}
+
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct DeterministicSceneFacts {
+    pub bootstrap: BootstrapAnalysis,
+    pub endpoints: Vec<DeterministicEndpoint>,
+    pub entry_methods: Vec<String>,
+    pub export_methods: Vec<String>,
+    pub secondary_request_methods: Vec<String>,
+    pub filter_expressions: Vec<String>,
+    pub branch_fields: Vec<String>,
+    pub pagination_fields: Vec<String>,
+    pub response_paths: Vec<String>,
+    pub workflow_archetype: WorkflowArchetype,
+    pub evidence: Vec<String>,
+}
+
 #[derive(Debug)]
 pub struct AnalyzeSceneError {
     message: String,
@@ -66,10 +93,6 @@ impl fmt::Display for AnalyzeSceneError {
 
 impl std::error::Error for AnalyzeSceneError {}
 
-/// Analyze scene source with an optional scene kind hint.
-///
-/// The hint parameter takes priority over meta tags.
-/// If neither hint nor meta tag is present, defaults to ReportCollection.
 pub fn analyze_scene_source_with_hint(
     source_dir: &Path,
     scene_kind_hint: Option<SceneKind>,
@@ -81,39 +104,35 @@ pub fn analyze_scene_source_with_hint(
             index_path.display()
         ))
     })?;
+    let facts = extract_deterministic_scene_facts(source_dir)?;
 
-    // Determine scene kind: hint > meta > default
     let scene_kind = if let Some(hint) = scene_kind_hint {
         hint
     } else {
-        let meta_kind = meta_content(&html, "sgclaw-scene-kind");
-        meta_kind
+        meta_content(&html, "sgclaw-scene-kind")
            .as_deref()
            .and_then(SceneKind::from_str)
            .unwrap_or(SceneKind::ReportCollection)
     };
 
-    // Tool kind is currently only browser_script
-    let tool_kind = meta_content(&html, "sgclaw-tool-kind");
-    if let Some(ref tk) = tool_kind {
-        if tk != "browser_script" {
+    if let Some(tool_kind) = meta_content(&html, "sgclaw-tool-kind") {
+        if tool_kind != "browser_script" {
             return Err(AnalyzeSceneError::new(format!(
-                "unsupported tool kind: {}",
-                tk
+                "unsupported tool kind: {tool_kind}"
             )));
         }
     }
-    // Default tool kind to BrowserScript
 
-    let target_url = meta_content(&html, "sgclaw-target-url");
-    let expected_domain = meta_content(&html, "sgclaw-expected-domain");
+    let meta_target_url = meta_content(&html, "sgclaw-target-url");
+    let meta_expected_domain = meta_content(&html, "sgclaw-expected-domain");
     let entry_script = meta_content(&html, "sgclaw-entry-script");
 
-    // Auto-extract expected_domain from external script URLs if not provided via meta tag
-    let expected_domain = expected_domain.or_else(|| extract_domain_from_external_scripts(&html));
-    // All fields are optional - generator will use placeholders for missing values
-    // This allows third-party scenes without meta tags to be processed
+    let target_url = meta_target_url
+        .filter(|value| !value.trim().is_empty())
+        .or_else(|| facts.bootstrap.target_url.clone());
+    let expected_domain = meta_expected_domain
+        .filter(|value| !value.trim().is_empty())
+        .or_else(|| facts.bootstrap.expected_domain.clone());
 
     Ok(SceneSourceAnalysis {
         scene_kind,
@@ -127,13 +146,166 @@ pub fn analyze_scene_source_with_hint(
     })
 }
 
-/// Analyze scene source (compatibility wrapper).
-///
-/// Requires meta tags to be present. For new code, use `analyze_scene_source_with_hint`.
 pub fn analyze_scene_source(source_dir: &Path) -> Result<SceneSourceAnalysis, AnalyzeSceneError> {
     analyze_scene_source_with_hint(source_dir, None)
 }
 
+pub fn extract_deterministic_scene_facts(
+    source_dir: &Path,
+) -> Result<DeterministicSceneFacts, AnalyzeSceneError> {
+    let text_blobs = collect_scene_texts(source_dir)?;
+    let combined = text_blobs
+        .iter()
+        .map(|(_, content)| content.as_str())
+        .collect::<Vec<_>>()
+        .join("\n");
+    let html = text_blobs
+        .iter()
+        .find(|(path, _)| path.ends_with("index.html"))
+        .map(|(_, content)| content.as_str())
+        .unwrap_or("");
+
+    let meta_target_url = meta_content(html, "sgclaw-target-url");
+    let meta_expected_domain = meta_content(html, "sgclaw-expected-domain");
+
+    let source_url_candidates = extract_named_url_candidates(&combined);
+    let endpoints = extract_endpoints(&combined, &source_url_candidates);
+    let entry_methods = collect_unique_matches(
+        &combined,
+        r"(?m)\b(?:async\s+)?function\s+([A-Za-z_]\w*(?:List|Query|query|Handle|handle|Task|task))\s*\(",
+    );
+    let export_methods = collect_unique_matches(
+        &combined,
+        r"(?m)\b(?:async\s+)?function\s+([A-Za-z_]\w*(?:Export|export|Excel|excel))\s*\(",
+    );
+    let secondary_request_methods = collect_unique_matches(
+        &combined,
+        r"(?m)\b(?:async\s+)?function\s+([A-Za-z_]\w*(?:Detail|detail|Charge|charge|Charges|charges|Info|info|Details|details))\s*\(",
+    );
+    let filter_expressions = collect_filter_expressions(&combined);
+    let branch_fields = collect_branch_fields(&combined);
+    let pagination_fields = collect_pagination_fields(&combined);
+    let response_paths = collect_response_paths(&combined);
+
+    let business_target_url = meta_target_url
+        .filter(|value| !value.trim().is_empty())
+        .or_else(|| choose_business_bootstrap_url(&source_url_candidates))
+        .or_else(|| {
+            endpoints
+                .iter()
+                .find(|endpoint| looks_like_business_url(&endpoint.url))
+                .and_then(|endpoint| base_url(&endpoint.url))
+        });
+    let business_domain = meta_expected_domain
+        .filter(|value| !value.trim().is_empty())
+        .or_else(|| {
+            business_target_url
+                .as_deref()
+                .and_then(extract_domain_from_url)
+                .map(str::to_string)
+        })
+        .or_else(|| {
+            endpoints
+                .first()
+                .and_then(|endpoint| extract_domain_from_url(&endpoint.url))
+                .map(str::to_string)
+        });
+
+    let workflow_archetype = classify_workflow_archetype(
+        &combined,
+        !branch_fields.is_empty(),
+        !pagination_fields.is_empty(),
+        !secondary_request_methods.is_empty(),
+        !filter_expressions.is_empty() || !export_methods.is_empty(),
+    );
+
+    let mut evidence = Vec::new();
+    if !source_url_candidates.is_empty() {
+        evidence.push(format!(
+            "bootstrap_source_url_candidates={}",
+            source_url_candidates.join(", ")
+        ));
+    }
+    if !branch_fields.is_empty() {
+        evidence.push(format!("branch_fields={}", branch_fields.join(", ")));
+    }
+    if !pagination_fields.is_empty() {
+        evidence.push(format!(
+            "pagination_fields={}",
+            pagination_fields.join(", ")
+        ));
+    }
+    if !secondary_request_methods.is_empty() {
+        evidence.push(format!(
+            "secondary_requests={}",
+            secondary_request_methods.join(", ")
+        ));
+    }
+    if !filter_expressions.is_empty() {
+        evidence.push(format!("filters={}", filter_expressions.join(" | ")));
+    }
+
+    Ok(DeterministicSceneFacts {
+        bootstrap: BootstrapAnalysis {
+            target_url: business_target_url,
+            expected_domain: business_domain,
+        },
+        endpoints,
+        entry_methods,
+        export_methods,
+        secondary_request_methods,
+        filter_expressions,
+        branch_fields,
+        pagination_fields,
+        response_paths,
+        workflow_archetype,
+        evidence,
+    })
+}
+
+fn collect_scene_texts(source_dir: &Path) -> Result<Vec<(String, String)>, AnalyzeSceneError> {
+    let mut out = Vec::new();
+    collect_scene_texts_inner(source_dir, source_dir, &mut out)?;
+    Ok(out)
+}
+
+fn collect_scene_texts_inner(
+    root: &Path,
+    dir: &Path,
+    out: &mut Vec<(String, String)>,
+) -> Result<(), AnalyzeSceneError> {
+    let mut entries = fs::read_dir(dir)
+        .map_err(|err| AnalyzeSceneError::new(format!("failed to read {}: {err}", dir.display())))?
+        .collect::<Result<Vec<_>, _>>()
+        .map_err(|err| AnalyzeSceneError::new(format!("failed to read {}: {err}", dir.display())))?;
+    entries.sort_by_key(|entry| entry.path());
+
+    for entry in entries {
+        let path = entry.path();
+        if path.is_dir() {
+            collect_scene_texts_inner(root, &path, out)?;
+            continue;
+        }
+        let Some(ext) = path.extension().and_then(|ext| ext.to_str()) else {
+            continue;
+        };
+        if !matches!(ext, "html" | "js" | "ts" | "json") {
+            continue;
+        }
+        let content = fs::read_to_string(&path).map_err(|err| {
+            AnalyzeSceneError::new(format!("failed to read {}: {err}", path.display()))
+        })?;
+        let relative = path
+            .strip_prefix(root)
+            .unwrap_or(&path)
+            .to_string_lossy()
+            .replace('\\', "/");
+        out.push((relative, content));
+    }
+
+    Ok(())
+}
+
 fn meta_content(html: &str, name: &str) -> Option<String> {
     for tag in html
         .split('<')
@@ -160,43 +332,219 @@ fn attr_value(tag: &str, attr: &str) -> Option<String> {
     Some(rest[..end].to_string())
 }
 
-/// Extract domain from the first external script URL found in HTML.
-///
-/// Looks for `<script src="http://...">` or `<script src="https://...">` tags
-/// and extracts the host:port portion as expected_domain.
-fn extract_domain_from_external_scripts(html: &str) -> Option<String> {
-    for tag in html
-        .split('<')
-        .filter(|fragment| fragment.starts_with("script"))
-    {
-        if let Some(src) = attr_value(tag, "src") {
-            // Check if it's an external URL (http:// or https://)
-            if src.starts_with("http://") || src.starts_with("https://") {
-                // Extract domain (host:port) from URL
-                if let Some(domain) = extract_domain_from_url(&src) {
-                    return Some(domain);
-                }
+fn extract_named_url_candidates(content: &str) -> Vec<String> {
+    let mut candidates = Vec::new();
+    let re = Regex::new(
+        r#"(?i)\b(?:sourceUrl|sourceURL|baseUrl|baseURL|targetUrl|requestUrl|loginPath|mainPath)\b\s*[:=]\s*["'](https?://[^"'?#\s]+(?:/[^"'#\s]*)?)["']"#,
+    )
+    .unwrap();
+    for capture in re.captures_iter(content) {
+        if let Some(url) = capture.get(1) {
+            let value = url.as_str().to_string();
+            if looks_like_business_url(&value) {
+                push_unique(&mut candidates, value);
             }
         }
     }
-    None
+    candidates
 }
 
-/// Extract domain (host:port) from a URL string.
-fn extract_domain_from_url(url: &str) -> Option<String> {
-    // Remove protocol prefix
+fn extract_endpoints(
+    content: &str,
+    source_url_candidates: &[String],
+) -> Vec<DeterministicEndpoint> {
+    let mut endpoints = Vec::new();
+    let url_re = Regex::new(r#"https?://[^\s"'`<>)]+(?:/[^\s"'`<>)]*)?"#).unwrap();
+    let method_re = Regex::new(r#"(?i)\btype\s*:\s*["'](GET|POST|PUT|DELETE)["']"#).unwrap();
+    let content_type_re =
+        Regex::new(r#"(?i)\bcontentType\s*:\s*["']([^"']+)["']"#).unwrap();
+    for capture in url_re.captures_iter(content) {
+        let Some(url_match) = capture.get(0) else {
+            continue;
+        };
+        let url = url_match.as_str().trim().trim_end_matches(&[';', ','][..]);
+        if !looks_like_business_url(url) {
+            continue;
+        }
+        let context_start = url_match.start().saturating_sub(120);
+        let context_end = (url_match.end() + 120).min(content.len());
+        let context = &content[context_start..context_end];
+        let method = method_re
+            .captures(context)
+            .and_then(|caps| caps.get(1))
+            .map(|m| m.as_str().to_uppercase())
+            .unwrap_or_else(|| "POST".to_string());
+        let content_type = content_type_re
+            .captures(context)
+            .and_then(|caps| caps.get(1))
+            .map(|m| m.as_str().to_string());
+        let name = infer_endpoint_name(url, context, source_url_candidates);
+        if endpoints
+            .iter()
+            .any(|existing: &DeterministicEndpoint| existing.url == url)
+        {
+            continue;
+        }
+        endpoints.push(DeterministicEndpoint {
+            name,
+            url: url.to_string(),
+            method,
+            content_type,
+        });
+    }
+    endpoints
+}
+
+fn infer_endpoint_name(url: &str, context: &str, source_url_candidates: &[String]) -> String {
+    for candidate in source_url_candidates {
+        if url.starts_with(candidate) {
+            let suffix = url.trim_start_matches(candidate).trim_matches('/');
+            if !suffix.is_empty() {
+                return suffix.replace('/', "_");
+            }
+        }
+    }
+    if let Some(segment) = url.split('/').filter(|part| !part.is_empty()).last() {
+        return segment.to_string();
+    }
+    let trimmed = context.trim();
+    trimmed
+        .split_whitespace()
+        .last()
+        .unwrap_or("endpoint")
+        .to_string()
+}
+
+fn looks_like_business_url(url: &str) -> bool {
+    let normalized = url.to_ascii_lowercase();
+    if normalized.ends_with(".js")
+        || normalized.ends_with(".css")
+        || normalized.contains("/a_js/")
+        || normalized.contains("localhost")
+        || normalized.contains("127.0.0.1")
+        || normalized.contains("surfaceservices")
+        || normalized.contains("reportservices")
+        || normalized.contains("${")
+        || normalized.contains("%s")
+    {
+        return false;
+    }
+    true
+}
+
+fn choose_business_bootstrap_url(candidates: &[String]) -> Option<String> {
+    candidates
+        .iter()
+        .find(|candidate| looks_like_business_url(candidate))
+        .cloned()
+}
+
+fn collect_unique_matches(content: &str, pattern: &str) -> Vec<String> {
+    let regex = Regex::new(pattern).unwrap();
+    let mut values = Vec::new();
+    for capture in regex.captures_iter(content) {
+        if let Some(value) = capture.get(1) {
+            push_unique(&mut values, value.as_str().to_string());
+        }
+    }
+    values
+}
+
+fn collect_filter_expressions(content: &str) -> Vec<String> {
+    let regex = Regex::new(r#"(?i)([A-Za-z_][\w.]*\s*(?:!==|!=|===|==|>=|<=|>|<)\s*[^;\n)]+)"#)
+        .unwrap();
+    let mut filters = Vec::new();
+    for capture in regex.captures_iter(content) {
+        let Some(expr) = capture.get(1) else {
+            continue;
+        };
+        let expr = expr.as_str().trim();
+        if expr.contains("charge")
+            || expr.contains("status")
+            || expr.contains("loss")
+            || expr.contains("filter")
+        {
+            push_unique(&mut filters, expr.to_string());
+        }
+    }
+    filters
+}
+
+fn collect_branch_fields(content: &str) -> Vec<String> {
+    let candidates = ["period_mode", "reportType", "tjzq", "mode", "week", "month"];
+    candidates
+        .into_iter()
+        .filter(|candidate| content.contains(candidate))
+        .map(str::to_string)
+        .collect()
+}
+
+fn collect_pagination_fields(content: &str) -> Vec<String> {
+    let candidates = ["pageSize", "page", "rows", "sidx", "sord", "totalPage"];
+    candidates
+        .into_iter()
+        .filter(|candidate| content.contains(candidate))
+        .map(str::to_string)
+        .collect()
+}
+
+fn collect_response_paths(content: &str) -> Vec<String> {
+    let mut paths = Vec::new();
+    for candidate in ["content", "data", "rows", "list"] {
+        if content.contains(&format!(".{candidate}")) || content.contains(&format!("[\"{candidate}\"]")) {
+            push_unique(&mut paths, candidate.to_string());
+        }
+    }
+    paths
+}
+
+fn classify_workflow_archetype(
+    content: &str,
+    has_branch_fields: bool,
+    has_pagination: bool,
+    has_secondary_request: bool,
+    has_post_process: bool,
+) -> WorkflowArchetype {
+    let has_month_week = content.contains("month") && content.contains("week");
+    if has_pagination && has_secondary_request && has_post_process {
+        return WorkflowArchetype::PaginatedEnrichment;
+    }
+    if has_branch_fields && has_month_week {
+        return WorkflowArchetype::MultiModeRequest;
+    }
+    if content.contains("monitor") || content.contains("status") {
+        return WorkflowArchetype::PageStateEval;
+    }
+    WorkflowArchetype::SingleRequestTable
+}
+
+fn base_url(url: &str) -> Option<String> {
+    let domain = extract_domain_from_url(url)?;
+    let protocol = if url.starts_with("https://") {
+        "https://"
+    } else if url.starts_with("http://") {
+        "http://"
+    } else {
+        return None;
+    };
+    Some(format!("{protocol}{domain}"))
+}
+
+pub fn extract_domain_from_url(url: &str) -> Option<&str> {
     let rest = url
         .strip_prefix("http://")
         .or_else(|| url.strip_prefix("https://"))?;
-
-    // Find the end of domain (first '/' or end of string)
     let domain_end = rest.find('/').unwrap_or(rest.len());
     let domain = &rest[..domain_end];
-
-    // Return non-empty domain
     if domain.is_empty() {
         None
     } else {
-        Some(domain.to_string())
+        Some(domain)
+    }
+}
+
+fn push_unique(values: &mut Vec<String>, candidate: String) {
+    if !values.iter().any(|existing| existing == &candidate) {
+        values.push(candidate);
     }
 }