fix(analyzer): two fixes for scene analysis

1. Add loginPath/mainPath to extract_named_url_candidates regex so their domain is extracted as expected_domain for bootstrap. 2. Add safe_char_boundary helper to prevent UTF-8 char boundary panic in extract_endpoints when scanning HTML with Chinese characters. 🤖 Generated with [Qoder][https://qoder.com]
2026-04-17 19:56:20 +08:00
parent 4215d49f3f
commit 118fc77935
1 changed files with 14 additions and 2 deletions
--- a/src/generated_scene/analyzer.rs
+++ b/src/generated_scene/analyzer.rs
@@ -349,6 +349,18 @@ fn extract_named_url_candidates(content: &str) -> Vec<String> {
    candidates
 }

+/// Round a byte index down to the nearest char boundary.
+fn safe_char_boundary(s: &str, byte_idx: usize) -> usize {
+    if byte_idx >= s.len() {
+        return s.len();
+    }
+    s.char_indices()
+        .rev()
+        .find(|(idx, _)| *idx <= byte_idx)
+        .map(|(idx, _)| idx)
+        .unwrap_or(0)
+}
+
 fn extract_endpoints(
    content: &str,
    source_url_candidates: &[String],
@@ -366,8 +378,8 @@ fn extract_endpoints(
        if !looks_like_business_url(url) {
            continue;
        }
-        let context_start = url_match.start().saturating_sub(120);
-        let context_end = (url_match.end() + 120).min(content.len());
+        let context_start = safe_char_boundary(content, url_match.start().saturating_sub(120));
+        let context_end = safe_char_boundary(content, (url_match.end() + 120).min(content.len()));
        let context = &content[context_start..context_end];
        let method = method_re
            .captures(context)