feat: add browser script skill execution

This commit is contained in:
zyl
2026-03-30 02:15:07 +08:00
parent f7e2ff256e
commit d2c9902966
22 changed files with 1775 additions and 249 deletions

View File

@@ -1,4 +1,4 @@
use std::collections::BTreeSet;
use std::fs;
use std::path::Path;
use regex::Regex;
@@ -11,8 +11,6 @@ use crate::pipe::{Action, AgentMessage, BrowserPipeTool, PipeError, Transport};
const ZHIHU_DOMAIN: &str = "www.zhihu.com";
const ZHIHU_HOT_URL: &str = "https://www.zhihu.com/hot";
const HOTLIST_ROOT_SELECTORS: [&str; 3] = ["main", "body", "html"];
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum WorkflowRoute {
ZhihuHotlistExportXlsx,
@@ -87,28 +85,28 @@ fn collect_hotlist_items<T: Transport + 'static>(
top_n: usize,
) -> Result<Vec<HotlistItem>, PipeError> {
navigate_hotlist_with_retry(transport, browser_tool)?;
for selector in HOTLIST_ROOT_SELECTORS {
transport.send(&AgentMessage::LogEntry {
level: "info".to_string(),
message: format!("getText {selector}"),
})?;
let response = browser_tool.invoke(
Action::GetText,
json!({ "selector": selector }),
ZHIHU_DOMAIN,
)?;
if !response.success {
continue;
}
let text = response.data["text"].as_str().unwrap_or_default();
let items = parse_hotlist_items(text, top_n);
if !items.is_empty() {
return Ok(items);
}
transport.send(&AgentMessage::LogEntry {
level: "info".to_string(),
message: "call zhihu-hotlist.extract_hotlist".to_string(),
})?;
let response = browser_tool.invoke(
Action::Eval,
json!({ "script": load_hotlist_extractor_script(top_n)? }),
ZHIHU_DOMAIN,
)?;
if !response.success {
return Err(PipeError::Protocol(format!(
"知乎热榜采集失败:{}",
response
.data
.get("error")
.and_then(|value| value.get("message"))
.and_then(Value::as_str)
.unwrap_or("browser script execution failed")
)));
}
Ok(Vec::new())
parse_hotlist_items_payload(response.data.get("text").unwrap_or(&response.data))
}
fn navigate_hotlist_with_retry<T: Transport + 'static>(
@@ -212,130 +210,71 @@ fn export_screen<T: Transport>(
Ok(format!("已生成知乎热榜大屏 {output_path}"))
}
fn parse_hotlist_items(text: &str, top_n: usize) -> Vec<HotlistItem> {
let mut items = parse_single_line_items(text, top_n);
if !items.is_empty() {
return items;
}
let lines = normalize_lines(text);
let mut seen_ranks = BTreeSet::new();
let mut idx = 0usize;
while idx < lines.len() && items.len() < top_n {
let Some(rank) = parse_rank(&lines[idx]) else {
idx += 1;
continue;
};
if !seen_ranks.insert(rank) {
idx += 1;
continue;
}
let mut title = None;
let mut heat = None;
for candidate in lines.iter().skip(idx + 1).take(6) {
if parse_rank(candidate).is_some() {
break;
}
if heat.is_none() && looks_like_heat(candidate) {
heat = Some(normalize_heat(candidate));
continue;
}
if title.is_none() && !is_noise_line(candidate) {
title = Some(candidate.clone());
}
}
if let (Some(title), Some(heat)) = (title, heat) {
items.push(HotlistItem { rank, title, heat });
}
idx += 1;
}
items.sort_by_key(|item| item.rank);
items.truncate(top_n);
items
/// Reads the `zhihu-hotlist` extractor script from disk and wraps it in an
/// immediately-invoked function expression ready to be sent to the browser.
///
/// The script is looked up at
/// `<parent of CARGO_MANIFEST_DIR>/skill_lib/skills/zhihu-hotlist/scripts/extract_hotlist.js`;
/// when the manifest directory has no parent, the manifest directory itself is
/// used as the base.
///
/// `top_n` is exposed to the script as `args.top_n`, serialized as a JSON
/// *string* (not a number).
///
/// # Errors
///
/// Returns `PipeError::Protocol` carrying the script path and the underlying
/// I/O error when the file cannot be read.
fn load_hotlist_extractor_script(top_n: usize) -> Result<String, PipeError> {
    // `CARGO_MANIFEST_DIR` is resolved at compile time via `env!`.
    let manifest_dir = Path::new(env!("CARGO_MANIFEST_DIR"));
    let base_dir = manifest_dir.parent().unwrap_or(manifest_dir);

    let script_path = [
        "skill_lib",
        "skills",
        "zhihu-hotlist",
        "scripts",
        "extract_hotlist.js",
    ]
    .iter()
    .fold(base_dir.to_path_buf(), |path, segment| path.join(segment));

    let script = match fs::read_to_string(&script_path) {
        Ok(contents) => contents,
        Err(err) => {
            return Err(PipeError::Protocol(format!(
                "failed to read zhihu hotlist extractor script {}: {err}",
                script_path.display()
            )));
        }
    };

    // The script body runs inside a function scope with `args` predefined.
    let args = json!({ "top_n": top_n.to_string() });
    Ok(format!(
        "(function() {{\nconst args = {};\n{}\n}})()",
        args, script
    ))
}
fn parse_single_line_items(text: &str, top_n: usize) -> Vec<HotlistItem> {
let re = Regex::new(
r"(?m)^\s*(\d{1,2})[\.、\s]+(.+?)\s+(\d+(?:\.\d+)?\s*[万亿kKmM]?)\s*(?:热度)?\s*$",
)
.expect("valid hotlist single-line regex");
let mut items = Vec::new();
let mut seen_ranks = BTreeSet::new();
fn parse_hotlist_items_payload(payload: &Value) -> Result<Vec<HotlistItem>, PipeError> {
let normalized_payload = if let Some(text) = payload.as_str() {
serde_json::from_str::<Value>(text).unwrap_or_else(|_| Value::String(text.to_string()))
} else {
payload.clone()
};
for capture in re.captures_iter(text) {
let rank = capture
.get(1)
.and_then(|value| value.as_str().parse::<u64>().ok())
.unwrap_or_default();
if rank == 0 || !seen_ranks.insert(rank) {
let rows = normalized_payload
.get("rows")
.and_then(Value::as_array)
.ok_or_else(|| {
PipeError::Protocol("知乎热榜采集失败:浏览器脚本未返回 rows".to_string())
})?;
let mut items = Vec::new();
for row in rows {
let Some(cells) = row.as_array() else {
continue;
};
if cells.len() != 3 {
continue;
}
let title = capture.get(2).map(|value| value.as_str().trim()).unwrap_or("");
let heat = capture.get(3).map(|value| value.as_str().trim()).unwrap_or("");
let rank = cells[0]
.as_u64()
.or_else(|| cells[0].as_str().and_then(|value| value.parse::<u64>().ok()))
.unwrap_or((items.len() + 1) as u64);
let title = cells[1].as_str().unwrap_or_default().trim().to_string();
let heat = cells[2].as_str().unwrap_or_default().trim().to_string();
if title.is_empty() || heat.is_empty() {
continue;
}
items.push(HotlistItem {
rank,
title: title.to_string(),
heat: normalize_heat(heat),
});
if items.len() >= top_n {
break;
}
items.push(HotlistItem { rank, title, heat });
}
items
}
/// Splits `text` into lines, discards lines that are empty after trimming,
/// and collapses every run of whitespace inside a kept line to a single
/// space.
fn normalize_lines(text: &str) -> Vec<String> {
    let mut normalized = Vec::new();
    for raw_line in text.lines() {
        let trimmed = raw_line.trim();
        if trimmed.is_empty() {
            continue;
        }
        let words: Vec<&str> = trimmed.split_whitespace().collect();
        normalized.push(words.join(" "));
    }
    normalized
}
fn parse_rank(line: &str) -> Option<u64> {
let trimmed = line.trim();
if trimmed.is_empty() {
return None;
}
if trimmed.chars().all(|ch| ch.is_ascii_digit()) {
return trimmed.parse::<u64>().ok().filter(|value| *value > 0);
if items.is_empty() {
return Err(PipeError::Protocol(
"知乎热榜采集失败:浏览器脚本未返回有效热榜条目".to_string(),
));
}
let rank_re = Regex::new(r"^(\d{1,2})[\.、\s]").expect("valid rank regex");
rank_re
.captures(trimmed)
.and_then(|capture| capture.get(1))
.and_then(|value| value.as_str().parse::<u64>().ok())
.filter(|value| *value > 0)
Ok(items)
}
/// Reports whether `line` looks like a heat value.
///
/// After every ASCII space is removed, the whole line must consist of a
/// number with an optional decimal part, an optional unit (`万`, `亿`, `k`,
/// `K`, `m` or `M`) and an optional trailing `热度`.
fn looks_like_heat(line: &str) -> bool {
    let pattern = Regex::new(r"^\d+(?:\.\d+)?(?:万|亿|k|K|m|M)?(?:热度)?$").expect("valid heat regex");
    // Only U+0020 is stripped; tabs and other whitespace are kept and will
    // make the match fail.
    let squeezed: String = line.chars().filter(|ch| *ch != ' ').collect();
    pattern.is_match(&squeezed)
}
/// Produces a compact heat string: every ASCII space is removed and any
/// trailing `热度` suffixes (repeated ones included) are stripped.
fn normalize_heat(line: &str) -> String {
    // Only U+0020 is dropped; other whitespace characters are preserved.
    let compact: String = line.chars().filter(|ch| *ch != ' ').collect();

    let mut value = compact.as_str();
    while let Some(rest) = value.strip_suffix("热度") {
        value = rest;
    }
    value.to_owned()
}
/// Reports whether `line` is exactly one of the fixed labels that are
/// treated as noise rather than as a hotlist title.
///
/// The comparison is exact: surrounding whitespace or partial matches do
/// not count.
fn is_noise_line(line: &str) -> bool {
    const NOISE_LABELS: [&str; 7] = [
        "知乎",
        "知乎热榜",
        "热榜",
        "首页",
        "发现",
        "等你来答",
        "更多内容",
    ];
    NOISE_LABELS.iter().any(|label| *label == line)
}
fn extract_top_n(instruction: &str) -> usize {
let re = Regex::new(r"(?:前|top\s*)(\d{1,2})").expect("valid top-n regex");
re.captures(&instruction.to_ascii_lowercase())