acceptance: stabilize zhihu hotlist excel flow

2026-03-29 23:17:31 +08:00
parent e294fbb9b1
commit ef88487f4a
5 changed files with 287 additions and 59 deletions
--- a/tools/live_acceptance/run_zhihu_hotlist_excel_acceptance.py
+++ b/tools/live_acceptance/run_zhihu_hotlist_excel_acceptance.py
@@ -250,16 +250,18 @@ def read_json_line(output_queue: queue.Queue[str], timeout: int) -> dict:


 def score_acceptance(result: dict, items: list[HotItem]) -> dict:
-    logs = [entry.get("message", "") for entry in result["logs"]]
+    log_entries = result["logs"]
+    logs = [entry.get("message", "") for entry in log_entries]
    final_task = result.get("final_task") or {}
    exports = [Path(path) for path in result["exports"]]
    exported_path = resolve_exported_path(exports, final_task.get("summary", ""))
-
-    skill_selection = 0
-    executed_hotlist_collection = (
+    browser_path_exists = (
        "navigate https://www.zhihu.com/hot" in logs and
        any(message.startswith("getText ") for message in logs)
    )
+
+    skill_selection = 0
+    executed_hotlist_collection = browser_path_exists
    read_hotlist_skill = "read_skill zhihu-hotlist" in logs
    read_office_skill = "read_skill office-export-xlsx" in logs
    completed_office_export = "call openxml_office" in logs
@@ -302,12 +304,24 @@ def score_acceptance(result: dict, items: list[HotItem]) -> dict:

    final_response_quality = 0
    summary = final_task.get("summary", "")
-    if final_task.get("success") and summary.strip():
+    repeated_paragraphs = find_repeated_paragraphs(summary)
+    if final_task.get("success") and summary.strip() and not repeated_paragraphs:
        final_response_quality = 5

    deductions = []
+    planner_index = find_planner_log_index(log_entries)
+    first_tool_index = find_first_tool_execution_index(logs)
+    if planner_index is None or (first_tool_index is not None and planner_index > first_tool_index):
+        deductions.append("planner output missing before tool execution")
+    if repeated_paragraphs:
+        deductions.append("repeated assistant paragraphs detected")
    if not exported_path:
        deductions.append("export missing output path")
+    if browser_path_exists and (not exported_path or hotlist_data_correctness == 0):
+        deductions.append("hotlist rows were not exported as structured live data")
+    if logs.count("call openxml_office") > 1 or any(
+        "unsupported columns:" in message for message in logs):
+        deductions.append("structured handoff required export retries")

    total_score = (
        skill_selection
@@ -316,6 +330,7 @@ def score_acceptance(result: dict, items: list[HotItem]) -> dict:
        + xlsx_export_success
        + final_response_quality
    )
+    total_score = max(0, total_score - acceptance_penalty(deductions))

    return {
        "total_score": total_score,
@@ -333,6 +348,58 @@ def score_acceptance(result: dict, items: list[HotItem]) -> dict:
    }


+def find_planner_log_index(log_entries: list[dict]) -> int | None:
+    """Return the position of the first planner entry in *log_entries*.
+
+    An entry counts as planner output when its ``level`` is ``"plan"``, or
+    when its stripped message starts with ``"plan "`` or contains
+    ``"先规划再执行"``. Returns ``None`` when no entry qualifies.
+    """
+    def is_planner_entry(record: dict) -> bool:
+        # The message is read first and coerced to str so non-string payloads
+        # are still searchable; an empty message can never match by text.
+        text = str(record.get("message", "")).strip()
+        if record.get("level") == "plan":
+            return True
+        return text.startswith("plan ") or "先规划再执行" in text
+
+    matches = (
+        position
+        for position, record in enumerate(log_entries)
+        if is_planner_entry(record)
+    )
+    return next(matches, None)
+
+
+def find_first_tool_execution_index(logs: list[str]) -> int | None:
+    """Return the index of the first message in *logs* that starts a tool call.
+
+    A message is treated as a tool execution when it begins with one of the
+    prefixes listed below. Returns ``None`` when no message matches.
+    """
+    # str.startswith accepts a tuple, so one call checks every prefix.
+    known_tool_prefixes = (
+        "navigate ",
+        "click ",
+        "type ",
+        "getText ",
+        "call openxml_office",
+        "call screen_html_export",
+    )
+    hits = (
+        position
+        for position, line in enumerate(logs)
+        if line.startswith(known_tool_prefixes)
+    )
+    return next(hits, None)
+
+
+def find_repeated_paragraphs(summary: str) -> list[str]:
+    """Return the paragraphs of *summary* that occur more than once.
+
+    Paragraphs are separated by blank lines. Each one is compared after
+    collapsing every whitespace run to a single space and stripping the ends;
+    empty paragraphs are ignored. Each repeated paragraph is reported once, in
+    the order its first repetition is encountered.
+    """
+    duplicates: list[str] = []
+    encountered: set[str] = set()
+    for chunk in re.split(r"\n\s*\n", summary):
+        collapsed = re.sub(r"\s+", " ", chunk).strip()
+        if not collapsed:
+            continue
+        if collapsed not in encountered:
+            encountered.add(collapsed)
+        elif collapsed not in duplicates:
+            # Second or later sighting: record it only the first time.
+            duplicates.append(collapsed)
+    return duplicates
+
+
+def acceptance_penalty(deductions: list[str]) -> int:
+    """Return the total score penalty for the given deduction messages.
+
+    Each recognised message contributes its fixed weight; a message that is
+    not in the table contributes 0. A message listed more than once is
+    counted each time it appears.
+    """
+    weights = {
+        "planner output missing before tool execution": 10,
+        "repeated assistant paragraphs detected": 10,
+        "export missing output path": 10,
+        "hotlist rows were not exported as structured live data": 15,
+        "structured handoff required export retries": 10,
+    }
+    total = 0
+    for reason in deductions:
+        total += weights.get(reason, 0)
+    return total
+
+
 def resolve_exported_path(exports: list[Path], summary: str) -> Path | None:
    match = re.search(r"(/[^\s`]+\.xlsx)", summary)
    if match: