feat: auto-extract expected_domain from external script URLs

When HTML has no sgclaw-expected-domain meta tag, analyzer now scans
for external script URLs (http:// or https://) and extracts the
domain (host:port) as expected_domain.

Example:
  <script src="http://25.215.213.128:18080/a_js/YPTAPI.js"></script>
  → expected_domain = "25.215.213.128:18080"

This reduces manual editing required for third-party scenes.

🤖 Generated with [Qoder](https://qoder.com)
This commit is contained in:
木炎
2026-04-17 00:14:05 +08:00
parent 464f18c672
commit ce072c2ebe
3 changed files with 72 additions and 0 deletions

View File

@@ -109,6 +109,9 @@ pub fn analyze_scene_source_with_hint(
let expected_domain = meta_content(&html, "sgclaw-expected-domain"); let expected_domain = meta_content(&html, "sgclaw-expected-domain");
let entry_script = meta_content(&html, "sgclaw-entry-script"); let entry_script = meta_content(&html, "sgclaw-entry-script");
// Auto-extract expected_domain from external script URLs if not provided via meta tag
let expected_domain = expected_domain.or_else(|| extract_domain_from_external_scripts(&html));
// All fields are optional - generator will use placeholders for missing values // All fields are optional - generator will use placeholders for missing values
// This allows third-party scenes without meta tags to be processed // This allows third-party scenes without meta tags to be processed
@@ -156,3 +159,44 @@ fn attr_value(tag: &str, attr: &str) -> Option<String> {
let end = rest.find(quote)?; let end = rest.find(quote)?;
Some(rest[..end].to_string()) Some(rest[..end].to_string())
} }
/// Extract domain from the first external script URL found in HTML.
///
/// Looks for `<script src="http://...">` or `<script src="https://...">` tags
/// and extracts the host:port portion as expected_domain.
fn extract_domain_from_external_scripts(html: &str) -> Option<String> {
for tag in html
.split('<')
.filter(|fragment| fragment.starts_with("script"))
{
if let Some(src) = attr_value(tag, "src") {
// Check if it's an external URL (http:// or https://)
if src.starts_with("http://") || src.starts_with("https://") {
// Extract domain (host:port) from URL
if let Some(domain) = extract_domain_from_url(&src) {
return Some(domain);
}
}
}
}
None
}
/// Extract the domain (`host` or `host:port`) from an `http://` / `https://` URL.
///
/// The authority component ends at the first `/`, `?` or `#`, or at the end
/// of the string. A URL without a path, such as `http://host:8080?v=1`,
/// therefore yields `host:8080` rather than carrying the query string into
/// the domain. A leading `userinfo@` part is dropped because it is not part
/// of the host.
///
/// Returns `None` when `url` does not start with `http://` or `https://`, or
/// when the remaining host part is empty.
fn extract_domain_from_url(url: &str) -> Option<String> {
    // Remove protocol prefix
    let rest = url
        .strip_prefix("http://")
        .or_else(|| url.strip_prefix("https://"))?;

    // The authority stops at the start of the path, query or fragment.
    let authority_end = rest
        .find(|c: char| matches!(c, '/' | '?' | '#'))
        .unwrap_or(rest.len());
    let authority = &rest[..authority_end];

    // Strip optional userinfo (`user:password@`); '@' is a single byte.
    let domain = match authority.rfind('@') {
        Some(at) => &authority[at + 1..],
        None => authority,
    };

    // Return non-empty domain
    Some(domain)
        .filter(|d| !d.is_empty())
        .map(str::to_string)
}

View File

@@ -0,0 +1,11 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8" />
<title>测试外部脚本提取</title>
<!-- No meta tags other than charset are declared. The only external resource is the file below, loaded over plain http from an IP address with an explicit port. -->
<script src="http://25.215.213.128:18080/a_js/YPTAPI.js"></script>
</head>
<body>
<div id="app">测试页面</div>
</body>
</html>

View File

@@ -166,3 +166,20 @@ fn generator_emits_monitoring_template() {
// 监测类不应该有 org/period resolver // 监测类不应该有 org/period resolver
assert!(!generated_manifest.contains("resolver = \"dictionary_entity\"")); assert!(!generated_manifest.contains("resolver = \"dictionary_entity\""));
} }
#[test]
fn analyzer_extracts_domain_from_external_script() {
    // The `external_script` fixture declares no expected_domain meta tag;
    // its only domain hint is the URL of an external script.
    let fixture_dir = Path::new("tests/fixtures/generated_scene/external_script");
    let analysis = analyze_scene_source(fixture_dir).unwrap();

    assert_eq!(analysis.scene_kind, SceneKind::ReportCollection);

    // The host:port of the script src must be picked up automatically.
    let extracted_domain = analysis.bootstrap.expected_domain.as_deref();
    assert_eq!(extracted_domain, Some("25.215.213.128:18080"));
}