Files
skill-lib/skills/zhihu-hotlist/scripts/extract_hotlist.js
木炎 51913555ad feat: add initial skill authoring workspace
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-02 18:34:56 +08:00

263 lines
6.2 KiB
JavaScript

// Maximum number of rows to return, derived from `args.top_n`.
// A missing, empty, or zero value falls back to 10 (via `||`); a non-numeric
// value also falls back to 10, because a NaN limit would make the
// `rows.length >= limit` cut-off never fire and `rows.slice(0, limit)` return
// nothing. Fractional values are floored so the `>=` cut-off and `slice` agree
// on the same count, and the result is never below 1.
const requestedLimit = Number(args.top_n || 10);
const limit = Number.isNaN(requestedLimit)
  ? 10
  : Math.max(1, Math.floor(requestedLimit));
/**
 * Normalizes a text fragment for comparison and output.
 *
 * Zero-width spaces (U+200B) are stripped BEFORE whitespace runs are
 * collapsed: `\s` does not match U+200B, so removing it afterwards could glue
 * two already-collapsed spaces together and leave a double space behind.
 *
 * @param {*} value - Text to normalize; any falsy value is treated as ''.
 * @returns {string} The text without zero-width spaces, with every whitespace
 *   run collapsed to a single space and leading/trailing whitespace trimmed.
 */
function cleanText(value) {
  return String(value || '')
    .replace(/\u200b/g, '')
    .replace(/\s+/g, ' ')
    .trim();
}
/**
 * Returns the cleaned text of the first selector that produces any.
 *
 * Selectors are tried in order; each is resolved with `root.querySelector`
 * and the match's `textContent` is passed through `cleanText`. Lookup stops at
 * the first non-empty result.
 *
 * @param {{querySelector: Function}} root - Node to search within.
 * @param {Iterable<string>} selectors - CSS selectors in priority order.
 * @returns {string} The first non-empty cleaned text, or '' if none matched.
 */
function pickText(root, selectors) {
  for (const selector of selectors) {
    const element = root.querySelector(selector);
    const content = cleanText(element ? element.textContent : element);
    if (!content) {
      continue;
    }
    return content;
  }
  return '';
}
/**
 * Pulls a heat value out of free-form text.
 *
 * The first number followed by a unit (万, 亿, k, K, m, M) anywhere in the
 * cleaned text wins and is returned as number + unit with K/M lower-cased.
 * When no such number exists, the first bare number in the text is returned.
 *
 * @param {*} text - Text to inspect; normalized with `cleanText` first.
 * @returns {string} e.g. '1234万', '3.5k', '987', or '' when no digits exist.
 */
function inferHeat(text) {
  const normalized = cleanText(text);
  const withUnit = /(\d+(?:\.\d+)?)\s*(万|亿|k|K|m|M)(?:热度)?/.exec(normalized);
  if (withUnit !== null) {
    const [, amount, unit] = withUnit;
    return `${amount}${unit}`.replace('K', 'k').replace('M', 'm');
  }
  const bare = /(\d+(?:\.\d+)?)(?:热度)?/.exec(normalized);
  return bare === null ? '' : bare[1];
}
/**
 * Extracts a heat token that sits at the very end of the text.
 *
 * Only a number followed by a unit (万, 亿, k, K, m, M), optionally followed
 * by "热度", at the end of the cleaned text is accepted.
 *
 * @param {*} text - Text to inspect; normalized with `cleanText` first.
 * @returns {string} Number + unit with K/M lower-cased, or '' when the text
 *   does not end in a heat token.
 */
function extractHeatToken(text) {
  const trailing = /(\d+(?:\.\d+)?)\s*(万|亿|k|K|m|M)(?:热度)?$/.exec(cleanText(text));
  if (trailing === null) {
    return '';
  }
  const [, amount, unit] = trailing;
  return `${amount}${unit}`.replace('K', 'k').replace('M', 'm');
}
/**
 * Determines the rank of a hot-list item.
 *
 * Resolution order:
 *   1. Text of a dedicated rank element inside the item, if it parses to a
 *      positive integer.
 *   2. A one- or two-digit number at the very start of the item's text, if it
 *      is positive.
 *   3. The item's position: `index + 1`.
 *
 * Step 2 applies the same "must be positive" rule as step 1, so a leading
 * "0" / "00" never produces rank 0.
 *
 * @param {{querySelector: Function, textContent: string}} item - Item node.
 * @param {number} index - Zero-based position used as the final fallback.
 * @returns {number} The inferred rank.
 */
function inferRank(item, index) {
  const direct = pickText(item, [
    '.HotList-item-index',
    '.HotItem-index',
    '[data-rank]',
    '.RankingIndex',
  ]);
  const directNumber = Number.parseInt(direct, 10);
  if (Number.isFinite(directNumber) && directNumber > 0) {
    return directNumber;
  }
  const leading = cleanText(item.textContent).match(/^(\d{1,2})\b/);
  if (leading) {
    const leadingNumber = Number.parseInt(leading[1], 10);
    // Mirror the direct path: zero is not a usable rank.
    if (leadingNumber > 0) {
      return leadingNumber;
    }
  }
  return index + 1;
}
/**
 * Builds hot-list rows from DOM candidate nodes.
 *
 * For each candidate (in the order returned by `collectDomCandidates`) a title
 * and a heat value are looked up. Candidates without a title, with a title
 * that was already emitted, or without any heat value are skipped. Collection
 * stops once `limit` rows have been gathered.
 *
 * @returns {Array<[number, string, string]>} Rows of [rank, title, heat].
 */
function collectRows() {
  const titleSelectors = [
    '.HotList-item-title',
    '.HotList-item-title a',
    '.HotItem-content a',
    'h2 a',
    'h2',
    'a[href*="/question/"]',
  ];
  const heatSelectors = [
    '.HotList-item-metrics',
    '.HotList-item-heat',
    '.HotItem-metrics',
    '.HotItem-hot',
    '[data-heat]',
  ];
  const usedTitles = new Set();
  const result = [];

  for (const node of collectDomCandidates()) {
    const title = pickText(node, titleSelectors);
    const isUsable = Boolean(title) && !usedTitles.has(title);
    if (!isUsable) {
      continue;
    }

    // Prefer a dedicated heat element; otherwise scan the node's whole text.
    const heat = pickText(node, heatSelectors) || inferHeat(node.textContent);
    if (!heat) {
      continue;
    }

    usedTitles.add(title);
    const rank = inferRank(node, result.length);
    result.push([rank, title, heat]);
    if (result.length >= limit) {
      break;
    }
  }

  return result;
}
/**
 * Collects every DOM node that might be a hot-list entry.
 *
 * Each selector is queried against `document` in the listed order (specific
 * class names first, broad structural selectors last). A node matched by
 * several selectors is kept once, at the position of its first match.
 *
 * @returns {Array} Unique matching nodes in first-seen order.
 */
function collectDomCandidates() {
  const unique = new Set();
  const selectorsInPriorityOrder = [
    '.HotList-item',
    '.HotItem',
    '.HotList-list > *',
    '[data-hot-item]',
    'section ol li',
    'main li',
    'main article',
    'main [class*="Hot"]',
  ];
  for (const selector of selectorsInPriorityOrder) {
    for (const element of Array.from(document.querySelectorAll(selector))) {
      // Set#add ignores repeats and keeps insertion order.
      unique.add(element);
    }
  }
  return Array.from(unique);
}
/**
 * Gathers raw page text from a series of container elements.
 *
 * For each selector the first matching element's `innerText` (or, failing
 * that, `textContent`) is taken. Texts that are empty after `cleanText`, or
 * whose cleaned form was already collected, are dropped. The remaining raw
 * texts are returned longest first.
 *
 * @returns {string[]} Raw (uncleaned) texts, sorted by descending length.
 */
function collectTextSources() {
  const knownTexts = new Set();
  const collected = [];
  for (const selector of ['.HotList-list', '.HotList', '#root', 'main', 'body']) {
    const element = document.querySelector(selector);
    const rawText = element
      ? String(element.innerText || element.textContent || '')
      : '';
    // Compare on the cleaned form so whitespace-only differences count as duplicates.
    const key = cleanText(rawText);
    if (key && !knownTexts.has(key)) {
      knownTexts.add(key);
      collected.push(rawText);
    }
  }
  collected.sort((a, b) => b.length - a.length);
  return collected;
}
/**
 * Tells whether the text contains any phrase that marks a login wall,
 * captcha, or access-restriction page.
 *
 * @param {string} text - Page text to inspect.
 * @returns {boolean} True when at least one marker phrase occurs in the text.
 */
function looksLikeBlockedPage(text) {
  const blockedMarkers = /安全验证|异常访问|请完成验证|登录后继续|登录即可查看|验证码|访问受限/;
  return blockedMarkers.test(text);
}
/**
 * Decides whether a line of page text should be skipped during parsing.
 *
 * A line is skipped when it is empty, when it equals one of the fixed page
 * heading strings, or when it starts with one of the fixed prefixes below.
 *
 * @param {string} line - A single line of text.
 * @returns {boolean} True when the line should be skipped.
 */
function shouldIgnoreTextLine(line) {
  if (!line) {
    return true;
  }
  switch (line) {
    case '知乎热榜':
    case '首页 - 知乎':
    case '首页-知乎':
      return true;
    default:
      break;
  }
  const ignoredPrefixes = ['/ ', '当前页面 ·', '继续输入任务'];
  return ignoredPrefixes.some((prefix) => line.startsWith(prefix));
}
/**
 * Text-based extraction: parses rows from the page's text sources.
 *
 * Sources are tried in the order `collectTextSources` returns them. The first
 * source that parses to at least one row wins and is truncated to `limit`
 * rows. A source that looks like a login/verification page aborts the whole
 * extraction, even if later sources might have parsed.
 *
 * @returns {Array<[number, string, string]>} Up to `limit` rows, or [] when
 *   no source yields any row.
 * @throws {Error} When a source looks like a blocked page.
 */
function collectRowsFromText() {
  const pageTexts = collectTextSources();
  for (let i = 0; i < pageTexts.length; i += 1) {
    const pageText = pageTexts[i];
    if (pageText) {
      if (looksLikeBlockedPage(pageText)) {
        throw new Error('知乎页面当前需要登录或完成安全验证,无法读取热榜条目');
      }
      const parsed = parseRowsFromText(pageText);
      if (parsed.length) {
        return parsed.slice(0, limit);
      }
    }
  }
  return [];
}
/**
 * Parses hot-list rows out of plain page text.
 *
 * The text is split into cleaned, non-ignored lines and scanned in order.
 * Recognized shapes:
 *   - a bare one- or two-digit line (while no title is being accumulated)
 *     sets the rank of the upcoming entry;
 *   - "<rank><. 、 or space><rest>" sets the rank and continues with <rest>;
 *   - "<title><number><unit>[热度]" emits a row immediately;
 *   - a line that ends in a heat token, following accumulated title lines,
 *     emits a row whose title is those lines joined by a space;
 *   - anything else is accumulated as part of the current title.
 *
 * Two safeguards keep entries from bleeding into each other:
 *   - a line that is nothing but a heat value (e.g. "85 万热度", "8.5 万热度")
 *     is never interpreted as "<rank> <title>", which would otherwise swallow
 *     one- and two-digit heats as ranks;
 *   - after every push attempt, including one rejected as a duplicate title,
 *     the pending rank and title parts are cleared so they cannot leak into
 *     the following entry.
 *
 * @param {*} text - Raw page text; falsy values are treated as ''.
 * @returns {Array<[number, string, string]>} Rows of [rank, title, heat], at
 *   most `limit` of them, with unique titles.
 */
function parseRowsFromText(text) {
  const lines = String(text || '')
    .split(/\n+/)
    .map(cleanText)
    .filter((line) => !!line && !shouldIgnoreTextLine(line));
  // Matches a line consisting solely of a heat value (number + unit [+ 热度]).
  const pureHeatLine = /^\d+(?:\.\d+)?\s*(?:万|亿|k|K|m|M)(?:热度)?$/;
  const seenTitles = new Set();
  const rows = [];
  let pendingRank = null;
  let titleParts = [];

  // Emits a row when the title is new, then always resets the per-entry state.
  function pushRow(title, heat) {
    const normalizedTitle = cleanText(title);
    if (normalizedTitle && heat && !seenTitles.has(normalizedTitle)) {
      seenTitles.add(normalizedTitle);
      rows.push([
        pendingRank || rows.length + 1,
        normalizedTitle,
        heat,
      ]);
    }
    // Reset even when the row was rejected; otherwise a duplicate's title
    // parts and rank would be prepended to the next entry.
    pendingRank = null;
    titleParts = [];
  }

  for (const rawLine of lines) {
    let line = rawLine;
    const rankOnly = line.match(/^(\d{1,2})$/);
    if (rankOnly && !titleParts.length) {
      pendingRank = Number(rankOnly[1]);
      continue;
    }
    // Skip rank parsing for heat-only lines: "85 万热度" is a heat, not
    // rank 85 with title "万热度".
    const rankedLine = pureHeatLine.test(line)
      ? null
      : line.match(/^(\d{1,2})[.、\s]+(.+)$/);
    if (rankedLine) {
      pendingRank = Number(rankedLine[1]);
      line = cleanText(rankedLine[2]);
    }
    const inlineMatch = line.match(/^(.*?)(\d+(?:\.\d+)?)\s*(万|亿|k|K|m|M)(?:热度)?$/);
    if (inlineMatch && cleanText(inlineMatch[1])) {
      pushRow(
        cleanText(inlineMatch[1]),
        `${inlineMatch[2]}${inlineMatch[3]}`.replace('K', 'k').replace('M', 'm'),
      );
      if (rows.length >= limit) {
        break;
      }
      continue;
    }
    const heatOnly = extractHeatToken(line);
    if (heatOnly && titleParts.length) {
      pushRow(titleParts.join(' '), heatOnly);
      if (rows.length >= limit) {
        break;
      }
      continue;
    }
    titleParts.push(line);
  }
  return rows;
}
const domRows = collectRows();
const rows = domRows.length ? domRows : collectRowsFromText();
if (!rows.length) {
throw new Error('未能从页面 DOM 中提取到知乎热榜条目');
}
return {
source: `${location.origin}${location.pathname}`,
sheet_name: '知乎热榜',
columns: ['rank', 'title', 'heat'],
rows,
};