fix(analyzer): two fixes for scene analysis
1. Add loginPath/mainPath to extract_named_url_candidates regex so their domain is extracted as expected_domain for bootstrap. 2. Add safe_char_boundary helper to prevent UTF-8 char boundary panic in extract_endpoints when scanning HTML with Chinese characters. 🤖 Generated with [Qoder](https://qoder.com)
This commit is contained in:
@@ -349,6 +349,18 @@ fn extract_named_url_candidates(content: &str) -> Vec<String> {
|
||||
candidates
|
||||
}
|
||||
|
||||
/// Round a byte index down to the nearest UTF-8 char boundary of `s`.
///
/// Returns `s.len()` when `byte_idx` is at or past the end of the string.
/// Otherwise returns the largest char boundary that is `<= byte_idx`, so the
/// result can always be used as a slice endpoint on `s` without panicking.
fn safe_char_boundary(s: &str, byte_idx: usize) -> usize {
    if byte_idx >= s.len() {
        return s.len();
    }
    // A UTF-8 encoded scalar is at most 4 bytes long, so a boundary is never
    // more than 3 bytes behind `byte_idx`. Stepping back byte by byte is
    // therefore constant-time, unlike walking `char_indices` from the end of
    // the string. Index 0 is always a boundary, so the loop cannot underflow.
    let mut idx = byte_idx;
    while !s.is_char_boundary(idx) {
        idx -= 1;
    }
    idx
}
|
||||
|
||||
fn extract_endpoints(
|
||||
content: &str,
|
||||
source_url_candidates: &[String],
|
||||
@@ -366,8 +378,8 @@ fn extract_endpoints(
|
||||
if !looks_like_business_url(url) {
|
||||
continue;
|
||||
}
|
||||
let context_start = url_match.start().saturating_sub(120);
|
||||
let context_end = (url_match.end() + 120).min(content.len());
|
||||
let context_start = safe_char_boundary(content, url_match.start().saturating_sub(120));
|
||||
let context_end = safe_char_boundary(content, (url_match.end() + 120).min(content.len()));
|
||||
let context = &content[context_start..context_end];
|
||||
let method = method_re
|
||||
.captures(context)
|
||||
|
||||
Reference in New Issue
Block a user