fix(analyzer): two fixes for scene analysis

1. Add loginPath/mainPath to extract_named_url_candidates regex so
   their domain is extracted as expected_domain for bootstrap.

2. Add safe_char_boundary helper to prevent UTF-8 char boundary panic
   in extract_endpoints when scanning HTML with Chinese characters.

🤖 Generated with [Qoder](https://qoder.com)
This commit is contained in:
木炎
2026-04-17 19:56:20 +08:00
parent 4215d49f3f
commit 118fc77935

View File

@@ -349,6 +349,18 @@ fn extract_named_url_candidates(content: &str) -> Vec<String> {
candidates
}
/// Round a byte index down to the nearest UTF-8 char boundary of `s`.
///
/// Returns `s.len()` when `byte_idx` is at or past the end of the string;
/// otherwise returns the largest char boundary that is `<= byte_idx`. The
/// result is therefore always a valid slice index into `s`, which avoids the
/// "byte index is not a char boundary" panic when slicing multi-byte text.
fn safe_char_boundary(s: &str, byte_idx: usize) -> usize {
    if byte_idx >= s.len() {
        return s.len();
    }
    // A UTF-8 scalar value occupies at most 4 bytes, so this steps back at
    // most 3 times instead of decoding chars from the end of the string.
    // Index 0 is always a boundary, so the subtraction cannot underflow.
    let mut idx = byte_idx;
    while !s.is_char_boundary(idx) {
        idx -= 1;
    }
    idx
}
fn extract_endpoints(
content: &str,
source_url_candidates: &[String],
@@ -366,8 +378,8 @@ fn extract_endpoints(
if !looks_like_business_url(url) {
continue;
}
let context_start = url_match.start().saturating_sub(120);
let context_end = (url_match.end() + 120).min(content.len());
let context_start = safe_char_boundary(content, url_match.start().saturating_sub(120));
let context_end = safe_char_boundary(content, (url_match.end() + 120).min(content.len()));
let context = &content[context_start..context_end];
let method = method_re
.captures(context)