use std::collections::BTreeSet; use std::path::Path; use regex::Regex; use serde_json::{json, Value}; use zeroclaw::tools::Tool; use crate::compat::openxml_office_tool::OpenXmlOfficeTool; use crate::compat::screen_html_export_tool::ScreenHtmlExportTool; use crate::pipe::{Action, AgentMessage, BrowserPipeTool, PipeError, Transport}; const ZHIHU_DOMAIN: &str = "www.zhihu.com"; const ZHIHU_HOT_URL: &str = "https://www.zhihu.com/hot"; const HOTLIST_ROOT_SELECTORS: [&str; 3] = ["main", "body", "html"]; #[derive(Debug, Clone, PartialEq, Eq)] pub enum WorkflowRoute { ZhihuHotlistExportXlsx, ZhihuHotlistScreen, } #[derive(Debug, Clone, PartialEq, Eq)] struct HotlistItem { rank: u64, title: String, heat: String, } pub fn detect_route( instruction: &str, page_url: Option<&str>, page_title: Option<&str>, ) -> Option { if !crate::runtime::is_zhihu_hotlist_task(instruction, page_url, page_title) { return None; } let normalized = instruction.to_ascii_lowercase(); if normalized.contains("dashboard") || instruction.contains("大屏") || instruction.contains("新标签页") { return Some(WorkflowRoute::ZhihuHotlistScreen); } if normalized.contains("excel") || normalized.contains("xlsx") || instruction.contains("导出") { return Some(WorkflowRoute::ZhihuHotlistExportXlsx); } None } pub fn should_fallback_after_summary(summary: &str, route: &WorkflowRoute) -> bool { let normalized = summary.to_ascii_lowercase(); if normalized.contains(".xlsx") || normalized.contains(".html") { return false; } let looks_like_denial = summary.contains("拒绝") || normalized.contains("denied") || normalized.contains("failed") || summary.contains("失败") || summary.contains("无法"); looks_like_denial || matches!(route, WorkflowRoute::ZhihuHotlistExportXlsx | WorkflowRoute::ZhihuHotlistScreen) } pub fn execute_route( transport: &T, browser_tool: &BrowserPipeTool, workspace_root: &Path, instruction: &str, route: WorkflowRoute, ) -> Result { let top_n = extract_top_n(instruction); let items = collect_hotlist_items(transport, browser_tool, top_n)?; if items.is_empty() { return Err(PipeError::Protocol( "知乎热榜采集失败:未能从页面文本中解析到热榜条目".to_string(), )); } match route { WorkflowRoute::ZhihuHotlistExportXlsx => export_xlsx(transport, workspace_root, &items), WorkflowRoute::ZhihuHotlistScreen => export_screen(transport, workspace_root, &items), } } fn collect_hotlist_items( transport: &T, browser_tool: &BrowserPipeTool, top_n: usize, ) -> Result, PipeError> { navigate_hotlist_with_retry(transport, browser_tool)?; for selector in HOTLIST_ROOT_SELECTORS { transport.send(&AgentMessage::LogEntry { level: "info".to_string(), message: format!("getText {selector}"), })?; let response = browser_tool.invoke( Action::GetText, json!({ "selector": selector }), ZHIHU_DOMAIN, )?; if !response.success { continue; } let text = response.data["text"].as_str().unwrap_or_default(); let items = parse_hotlist_items(text, top_n); if !items.is_empty() { return Ok(items); } } Ok(Vec::new()) } fn navigate_hotlist_with_retry( transport: &T, browser_tool: &BrowserPipeTool, ) -> Result<(), PipeError> { let mut last_error = None; for _ in 0..2 { transport.send(&AgentMessage::LogEntry { level: "info".to_string(), message: format!("navigate {ZHIHU_HOT_URL}"), })?; match browser_tool.invoke( Action::Navigate, json!({ "url": ZHIHU_HOT_URL }), ZHIHU_DOMAIN, ) { Ok(response) if response.success => return Ok(()), Ok(response) => { last_error = Some(PipeError::Protocol(format!( "navigate failed: {}", response.data ))); } Err(err) => last_error = Some(err), } } Err(last_error.unwrap_or_else(|| { PipeError::Protocol("navigate failed without detailed error".to_string()) })) } fn export_xlsx( transport: &T, workspace_root: &Path, items: &[HotlistItem], ) -> Result { transport.send(&AgentMessage::LogEntry { level: "info".to_string(), message: "call openxml_office".to_string(), })?; let tool = OpenXmlOfficeTool::new(workspace_root.to_path_buf()); let rows = items .iter() .map(|item| json!([item.rank, item.title, item.heat])) .collect::>(); let runtime = tokio::runtime::Runtime::new() .map_err(|err| PipeError::Protocol(format!("failed to create tokio runtime: {err}")))?; let result = runtime .block_on(tool.execute(json!({ "sheet_name": "知乎热榜", "columns": ["rank", "title", "heat"], "rows": rows, }))) .map_err(|err| PipeError::Protocol(err.to_string()))?; if !result.success { return Err(PipeError::Protocol( result.error.unwrap_or_else(|| "openxml_office failed".to_string()), )); } let payload: Value = serde_json::from_str(&result.output) .map_err(|err| PipeError::Protocol(format!("invalid openxml_office output: {err}")))?; let output_path = payload["output_path"] .as_str() .ok_or_else(|| PipeError::Protocol("openxml_office did not return output_path".to_string()))?; Ok(format!("已导出知乎热榜 Excel {output_path}")) } fn export_screen( transport: &T, workspace_root: &Path, items: &[HotlistItem], ) -> Result { transport.send(&AgentMessage::LogEntry { level: "info".to_string(), message: "call screen_html_export".to_string(), })?; let tool = ScreenHtmlExportTool::new(workspace_root.to_path_buf()); let rows = items .iter() .map(|item| json!([item.rank, item.title, item.heat])) .collect::>(); let runtime = tokio::runtime::Runtime::new() .map_err(|err| PipeError::Protocol(format!("failed to create tokio runtime: {err}")))?; let result = runtime .block_on(tool.execute(json!({ "rows": rows }))) .map_err(|err| PipeError::Protocol(err.to_string()))?; if !result.success { return Err(PipeError::Protocol( result.error.unwrap_or_else(|| "screen_html_export failed".to_string()), )); } let payload: Value = serde_json::from_str(&result.output) .map_err(|err| PipeError::Protocol(format!("invalid screen_html_export output: {err}")))?; let output_path = payload["output_path"] .as_str() .ok_or_else(|| PipeError::Protocol("screen_html_export did not return output_path".to_string()))?; Ok(format!("已生成知乎热榜大屏 {output_path}")) } fn parse_hotlist_items(text: &str, top_n: usize) -> Vec { let mut items = parse_single_line_items(text, top_n); if !items.is_empty() { return items; } let lines = normalize_lines(text); let mut seen_ranks = BTreeSet::new(); let mut idx = 0usize; while idx < lines.len() && items.len() < top_n { let Some(rank) = parse_rank(&lines[idx]) else { idx += 1; continue; }; if !seen_ranks.insert(rank) { idx += 1; continue; } let mut title = None; let mut heat = None; for candidate in lines.iter().skip(idx + 1).take(6) { if parse_rank(candidate).is_some() { break; } if heat.is_none() && looks_like_heat(candidate) { heat = Some(normalize_heat(candidate)); continue; } if title.is_none() && !is_noise_line(candidate) { title = Some(candidate.clone()); } } if let (Some(title), Some(heat)) = (title, heat) { items.push(HotlistItem { rank, title, heat }); } idx += 1; } items.sort_by_key(|item| item.rank); items.truncate(top_n); items } fn parse_single_line_items(text: &str, top_n: usize) -> Vec { let re = Regex::new( r"(?m)^\s*(\d{1,2})[\.、\s]+(.+?)\s+(\d+(?:\.\d+)?\s*[万亿kKmM]?)\s*(?:热度)?\s*$", ) .expect("valid hotlist single-line regex"); let mut items = Vec::new(); let mut seen_ranks = BTreeSet::new(); for capture in re.captures_iter(text) { let rank = capture .get(1) .and_then(|value| value.as_str().parse::().ok()) .unwrap_or_default(); if rank == 0 || !seen_ranks.insert(rank) { continue; } let title = capture.get(2).map(|value| value.as_str().trim()).unwrap_or(""); let heat = capture.get(3).map(|value| value.as_str().trim()).unwrap_or(""); if title.is_empty() || heat.is_empty() { continue; } items.push(HotlistItem { rank, title: title.to_string(), heat: normalize_heat(heat), }); if items.len() >= top_n { break; } } items } fn normalize_lines(text: &str) -> Vec { text.lines() .map(str::trim) .filter(|line| !line.is_empty()) .map(|line| line.split_whitespace().collect::>().join(" ")) .collect() } fn parse_rank(line: &str) -> Option { let trimmed = line.trim(); if trimmed.is_empty() { return None; } if trimmed.chars().all(|ch| ch.is_ascii_digit()) { return trimmed.parse::().ok().filter(|value| *value > 0); } let rank_re = Regex::new(r"^(\d{1,2})[\.、\s]").expect("valid rank regex"); rank_re .captures(trimmed) .and_then(|capture| capture.get(1)) .and_then(|value| value.as_str().parse::().ok()) .filter(|value| *value > 0) } fn looks_like_heat(line: &str) -> bool { let compact = line.replace(' ', ""); let heat_re = Regex::new(r"^\d+(?:\.\d+)?(?:万|亿|k|K|m|M)?(?:热度)?$").expect("valid heat regex"); heat_re.is_match(compact.as_str()) } fn normalize_heat(line: &str) -> String { line.replace(' ', "") .trim_end_matches("热度") .to_string() } fn is_noise_line(line: &str) -> bool { matches!( line, "知乎" | "知乎热榜" | "热榜" | "首页" | "发现" | "等你来答" | "更多内容" ) } fn extract_top_n(instruction: &str) -> usize { let re = Regex::new(r"(?:前|top\s*)(\d{1,2})").expect("valid top-n regex"); re.captures(&instruction.to_ascii_lowercase()) .and_then(|capture| capture.get(1)) .and_then(|value| value.as_str().parse::().ok()) .filter(|value| *value > 0) .unwrap_or(10) }