acceptance: stabilize zhihu hotlist excel flow
This commit is contained in:
@@ -250,16 +250,18 @@ def read_json_line(output_queue: queue.Queue[str], timeout: int) -> dict:
|
||||
|
||||
|
||||
def score_acceptance(result: dict, items: list[HotItem]) -> dict:
|
||||
logs = [entry.get("message", "") for entry in result["logs"]]
|
||||
log_entries = result["logs"]
|
||||
logs = [entry.get("message", "") for entry in log_entries]
|
||||
final_task = result.get("final_task") or {}
|
||||
exports = [Path(path) for path in result["exports"]]
|
||||
exported_path = resolve_exported_path(exports, final_task.get("summary", ""))
|
||||
|
||||
skill_selection = 0
|
||||
executed_hotlist_collection = (
|
||||
browser_path_exists = (
|
||||
"navigate https://www.zhihu.com/hot" in logs and
|
||||
any(message.startswith("getText ") for message in logs)
|
||||
)
|
||||
|
||||
skill_selection = 0
|
||||
executed_hotlist_collection = browser_path_exists
|
||||
read_hotlist_skill = "read_skill zhihu-hotlist" in logs
|
||||
read_office_skill = "read_skill office-export-xlsx" in logs
|
||||
completed_office_export = "call openxml_office" in logs
|
||||
@@ -302,12 +304,24 @@ def score_acceptance(result: dict, items: list[HotItem]) -> dict:
|
||||
|
||||
final_response_quality = 0
|
||||
summary = final_task.get("summary", "")
|
||||
if final_task.get("success") and summary.strip():
|
||||
repeated_paragraphs = find_repeated_paragraphs(summary)
|
||||
if final_task.get("success") and summary.strip() and not repeated_paragraphs:
|
||||
final_response_quality = 5
|
||||
|
||||
deductions = []
|
||||
planner_index = find_planner_log_index(log_entries)
|
||||
first_tool_index = find_first_tool_execution_index(logs)
|
||||
if planner_index is None or (first_tool_index is not None and planner_index > first_tool_index):
|
||||
deductions.append("planner output missing before tool execution")
|
||||
if repeated_paragraphs:
|
||||
deductions.append("repeated assistant paragraphs detected")
|
||||
if not exported_path:
|
||||
deductions.append("export missing output path")
|
||||
if browser_path_exists and (not exported_path or hotlist_data_correctness == 0):
|
||||
deductions.append("hotlist rows were not exported as structured live data")
|
||||
if logs.count("call openxml_office") > 1 or any(
|
||||
"unsupported columns:" in message for message in logs):
|
||||
deductions.append("structured handoff required export retries")
|
||||
|
||||
total_score = (
|
||||
skill_selection
|
||||
@@ -316,6 +330,7 @@ def score_acceptance(result: dict, items: list[HotItem]) -> dict:
|
||||
+ xlsx_export_success
|
||||
+ final_response_quality
|
||||
)
|
||||
total_score = max(0, total_score - acceptance_penalty(deductions))
|
||||
|
||||
return {
|
||||
"total_score": total_score,
|
||||
@@ -333,6 +348,58 @@ def score_acceptance(result: dict, items: list[HotItem]) -> dict:
|
||||
}
|
||||
|
||||
|
||||
def find_planner_log_index(log_entries: list[dict]) -> int | None:
|
||||
for index, entry in enumerate(log_entries):
|
||||
message = str(entry.get("message", "")).strip()
|
||||
if entry.get("level") == "plan":
|
||||
return index
|
||||
if not message:
|
||||
continue
|
||||
if message.startswith("plan ") or "先规划再执行" in message:
|
||||
return index
|
||||
return None
|
||||
|
||||
|
||||
def find_first_tool_execution_index(logs: list[str]) -> int | None:
|
||||
tool_prefixes = (
|
||||
"navigate ",
|
||||
"click ",
|
||||
"type ",
|
||||
"getText ",
|
||||
"call openxml_office",
|
||||
"call screen_html_export",
|
||||
)
|
||||
for index, message in enumerate(logs):
|
||||
if message.startswith(tool_prefixes):
|
||||
return index
|
||||
return None
|
||||
|
||||
|
||||
def find_repeated_paragraphs(summary: str) -> list[str]:
    """Return paragraphs that occur more than once in ``summary``.

    Paragraphs are separated by blank lines (a newline, optional whitespace,
    another newline). Each paragraph is compared after collapsing every run
    of whitespace to a single space and trimming the ends; paragraphs that
    are empty after normalization are ignored.

    Args:
        summary: The text to inspect.

    Returns:
        The normalized text of each repeated paragraph, listed once, in the
        order in which its first repetition was encountered.
    """
    seen: set[str] = set()
    reported: set[str] = set()
    repeated: list[str] = []
    for paragraph in re.split(r"\n\s*\n", summary):
        normalized = re.sub(r"\s+", " ", paragraph).strip()
        if not normalized:
            continue
        if normalized in seen:
            # Set membership keeps the "already reported" check O(1); the
            # list only preserves the reporting order.
            if normalized not in reported:
                reported.add(normalized)
                repeated.append(normalized)
            continue
        seen.add(normalized)
    return repeated
|
||||
|
||||
|
||||
def acceptance_penalty(deductions: list[str]) -> int:
    """Return the total score penalty for the given deduction reasons.

    Args:
        deductions: Deduction reason strings. Each one is looked up verbatim
            in the penalty table below.

    Returns:
        The sum of the penalties of all listed reasons. A reason that is not
        in the table contributes 0, and a reason listed more than once is
        counted each time it appears.
    """
    penalty_map = {
        "planner output missing before tool execution": 10,
        "repeated assistant paragraphs detected": 10,
        "export missing output path": 10,
        "hotlist rows were not exported as structured live data": 15,
        "structured handoff required export retries": 10,
    }
    return sum(penalty_map.get(item, 0) for item in deductions)
|
||||
|
||||
|
||||
def resolve_exported_path(exports: list[Path], summary: str) -> Path | None:
|
||||
match = re.search(r"(/[^\s`]+\.xlsx)", summary)
|
||||
if match:
|
||||
|
||||
Reference in New Issue
Block a user