fix(analyzer): two fixes for scene analysis
1. Add loginPath/mainPath to the extract_named_url_candidates regex so that their domain is extracted as expected_domain for bootstrap. 2. Add a safe_char_boundary helper to prevent a UTF-8 char-boundary panic in extract_endpoints when scanning HTML that contains Chinese characters. 🤖 Generated with [Qoder](https://qoder.com)
This commit is contained in:
@@ -349,6 +349,18 @@ fn extract_named_url_candidates(content: &str) -> Vec<String> {
|
|||||||
candidates
|
candidates
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Round a byte index down to the nearest char boundary.
///
/// Returns the largest index `i <= byte_idx` at which `s` can be sliced
/// without splitting a multi-byte UTF-8 sequence. Indices at or beyond
/// `s.len()` are clamped to `s.len()`, which is always a valid boundary.
fn safe_char_boundary(s: &str, byte_idx: usize) -> usize {
    if byte_idx >= s.len() {
        return s.len();
    }
    // Probe downward from `byte_idx` rather than walking every char of `s`:
    // a UTF-8 scalar is at most 4 bytes long, so a boundary is found within
    // 3 steps, and index 0 is always a boundary. This keeps the helper cheap
    // when it is called repeatedly against a large document.
    (0..=byte_idx)
        .rev()
        .find(|&idx| s.is_char_boundary(idx))
        .unwrap_or(0)
}
|
||||||
|
|
||||||
fn extract_endpoints(
|
fn extract_endpoints(
|
||||||
content: &str,
|
content: &str,
|
||||||
source_url_candidates: &[String],
|
source_url_candidates: &[String],
|
||||||
@@ -366,8 +378,8 @@ fn extract_endpoints(
|
|||||||
if !looks_like_business_url(url) {
|
if !looks_like_business_url(url) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
let context_start = url_match.start().saturating_sub(120);
|
let context_start = safe_char_boundary(content, url_match.start().saturating_sub(120));
|
||||||
let context_end = (url_match.end() + 120).min(content.len());
|
let context_end = safe_char_boundary(content, (url_match.end() + 120).min(content.len()));
|
||||||
let context = &content[context_start..context_end];
|
let context = &content[context_start..context_end];
|
||||||
let method = method_re
|
let method = method_re
|
||||||
.captures(context)
|
.captures(context)
|
||||||
|
|||||||
Reference in New Issue
Block a user