feat: auto-extract expected_domain from external script URLs
When the HTML has no sgclaw-expected-domain meta tag, the analyzer now scans for external script URLs (http:// or https://) and extracts the domain (host:port) as expected_domain. Example: <script src="http://25.215.213.128:18080/a_js/YPTAPI.js"></script> → expected_domain = "25.215.213.128:18080". This reduces the manual editing required for third-party scenes. 🤖 Generated with [Qoder](https://qoder.com)
This commit is contained in:
@@ -109,6 +109,9 @@ pub fn analyze_scene_source_with_hint(
|
||||
let expected_domain = meta_content(&html, "sgclaw-expected-domain");
|
||||
let entry_script = meta_content(&html, "sgclaw-entry-script");
|
||||
|
||||
// Auto-extract expected_domain from external script URLs if not provided via meta tag
|
||||
let expected_domain = expected_domain.or_else(|| extract_domain_from_external_scripts(&html));
|
||||
|
||||
// All fields are optional - generator will use placeholders for missing values
|
||||
// This allows third-party scenes without meta tags to be processed
|
||||
|
||||
@@ -156,3 +159,44 @@ fn attr_value(tag: &str, attr: &str) -> Option<String> {
|
||||
let end = rest.find(quote)?;
|
||||
Some(rest[..end].to_string())
|
||||
}
|
||||
|
||||
/// Extract domain from the first external script URL found in HTML.
|
||||
///
|
||||
/// Looks for `<script src="http://...">` or `<script src="https://...">` tags
|
||||
/// and extracts the host:port portion as expected_domain.
|
||||
fn extract_domain_from_external_scripts(html: &str) -> Option<String> {
|
||||
for tag in html
|
||||
.split('<')
|
||||
.filter(|fragment| fragment.starts_with("script"))
|
||||
{
|
||||
if let Some(src) = attr_value(tag, "src") {
|
||||
// Check if it's an external URL (http:// or https://)
|
||||
if src.starts_with("http://") || src.starts_with("https://") {
|
||||
// Extract domain (host:port) from URL
|
||||
if let Some(domain) = extract_domain_from_url(&src) {
|
||||
return Some(domain);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
/// Extract the domain (`host` or `host:port`) from an `http://` / `https://` URL.
///
/// The authority component ends at the first `/`, `?` or `#` (or at the end
/// of the string), so URLs without a path such as `https://host?v=1` or
/// `http://host:8080#top` yield just the host part. Any `userinfo@` prefix
/// is dropped so credentials never leak into the returned domain.
///
/// Returns `None` when `url` does not start with `http://` or `https://`,
/// or when the resulting host part is empty (e.g. `http:///path`).
fn extract_domain_from_url(url: &str) -> Option<String> {
    // Remove the protocol prefix; anything else is not an external http(s) URL.
    let rest = url
        .strip_prefix("http://")
        .or_else(|| url.strip_prefix("https://"))?;

    // The authority is terminated by the start of the path, query or fragment.
    let authority_end = rest
        .find(|c: char| matches!(c, '/' | '?' | '#'))
        .unwrap_or(rest.len());
    let authority = &rest[..authority_end];

    // Strip optional userinfo ("user:password@"); the host follows the last '@'.
    let domain = authority
        .rsplit_once('@')
        .map_or(authority, |(_, host_port)| host_port);

    if domain.is_empty() {
        None
    } else {
        Some(domain.to_string())
    }
}
|
||||
|
||||
11
tests/fixtures/generated_scene/external_script/index.html
vendored
Normal file
11
tests/fixtures/generated_scene/external_script/index.html
vendored
Normal file
@@ -0,0 +1,11 @@
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<meta charset="UTF-8" />
|
||||
<title>测试外部脚本提取</title>
|
||||
<script src="http://25.215.213.128:18080/a_js/YPTAPI.js"></script>
|
||||
</head>
|
||||
<body>
|
||||
<div id="app">测试页面</div>
|
||||
</body>
|
||||
</html>
|
||||
@@ -166,3 +166,20 @@ fn generator_emits_monitoring_template() {
|
||||
// 监测类不应该有 org/period resolver
|
||||
assert!(!generated_manifest.contains("resolver = \"dictionary_entity\""));
|
||||
}
|
||||
|
||||
#[test]
fn analyzer_extracts_domain_from_external_script() {
    // The external_script fixture carries no expected_domain meta tag; the only
    // hint is an external <script src="http://25.215.213.128:18080/..."> tag.
    let fixture_dir = Path::new("tests/fixtures/generated_scene/external_script");
    let analysis = analyze_scene_source(fixture_dir).unwrap();

    assert_eq!(analysis.scene_kind, SceneKind::ReportCollection);

    // The analyzer should fall back to the host:port of the script URL.
    let extracted_domain = analysis.bootstrap.expected_domain.as_deref();
    assert_eq!(extracted_domain, Some("25.215.213.128:18080"));
}
|
||||
|
||||
Reference in New Issue
Block a user