chore: initial commit
This commit is contained in:
262
tmp/extract_hotlist/scripts/extract_hotlist.js
Normal file
262
tmp/extract_hotlist/scripts/extract_hotlist.js
Normal file
@@ -0,0 +1,262 @@
|
||||
// Maximum number of rows to return, taken from the caller-supplied
// `args.top_n` (default 10, minimum 1).
//
// A non-numeric value must not reach `Math.max`: Math.max(1, NaN) is NaN,
// which makes every `rows.length >= limit` check false and makes
// `rows.slice(0, limit)` return an empty array. Fractions are truncated so the
// loop cut-offs and the final slice agree on the same count.
const requestedLimit = Number(args.top_n || 10);
const limit = Number.isNaN(requestedLimit)
  ? 10
  : Math.max(1, Math.floor(requestedLimit));
|
||||
|
||||
/**
 * Normalizes a piece of scraped text into a single trimmed line.
 *
 * Falsy input (null, undefined, '', 0, ...) yields an empty string.
 *
 * @param {*} value - Raw value, typically a node's text content.
 * @returns {string} The text with zero-width spaces removed, whitespace runs
 *   collapsed to a single space, and leading/trailing whitespace trimmed.
 */
function cleanText(value) {
  return String(value || '')
    // U+200B is not matched by `\s`, so it has to be removed *before* the
    // whitespace collapse; otherwise "a \u200b b" would end up as "a  b".
    .replace(/\u200b/g, '')
    .replace(/\s+/g, ' ')
    .trim();
}
|
||||
|
||||
/**
 * Returns the cleaned text of the first selector that yields non-empty text.
 *
 * Selectors are tried in the order given; each one is resolved with
 * `root.querySelector`, so only the first matching descendant is considered.
 *
 * @param {{querySelector: Function}} root - Node to search within.
 * @param {string[]} selectors - CSS selectors in priority order.
 * @returns {string} The first non-empty cleaned text, or '' if none is found.
 */
function pickText(root, selectors) {
  for (const query of selectors) {
    const match = root.querySelector(query);
    if (!match) {
      continue;
    }
    const cleaned = cleanText(match.textContent);
    if (cleaned) {
      return cleaned;
    }
  }
  return '';
}
|
||||
|
||||
/**
 * Extracts a heat value from free-form text.
 *
 * The first number followed by a unit (万, 亿, k/K, m/M) wins and is returned
 * as "<number><unit>" with K/M lower-cased. If no such number exists, the
 * first plain number in the text is returned instead.
 *
 * @param {*} text - Raw text to inspect.
 * @returns {string} The heat token, or '' when the text contains no number.
 */
function inferHeat(text) {
  const normalized = cleanText(text);

  const withUnit = /(\d+(?:\.\d+)?)\s*(万|亿|k|K|m|M)(?:热度)?/.exec(normalized);
  if (withUnit !== null) {
    const [, amount, unit] = withUnit;
    return (amount + unit).replace('K', 'k').replace('M', 'm');
  }

  const bare = /(\d+(?:\.\d+)?)(?:热度)?/.exec(normalized);
  return bare === null ? '' : bare[1];
}
|
||||
|
||||
/**
 * Extracts a heat token that sits at the very end of the text.
 *
 * Only a number followed by a unit (万, 亿, k/K, m/M), optionally followed by
 * "热度", and anchored to the end of the cleaned text is accepted.
 *
 * @param {*} text - Raw text to inspect.
 * @returns {string} "<number><unit>" with K/M lower-cased, or '' if the text
 *   does not end with such a token.
 */
function extractHeatToken(text) {
  const trailing = /(\d+(?:\.\d+)?)\s*(万|亿|k|K|m|M)(?:热度)?$/.exec(cleanText(text));
  if (trailing === null) {
    return '';
  }
  const [, amount, unit] = trailing;
  return (amount + unit).replace('K', 'k').replace('M', 'm');
}
|
||||
|
||||
/**
 * Determines the rank to report for a hot-list candidate node.
 *
 * Resolution order:
 *   1. the text of a dedicated index element inside the item, if it parses to
 *      a positive integer;
 *   2. a leading 1–2 digit number at the start of the item's own text, if it
 *      is positive;
 *   3. the item's position in the output (`index + 1`).
 *
 * @param {{querySelector: Function, textContent: *}} item - Candidate node.
 * @param {number} index - Zero-based position the item will take in the output.
 * @returns {number} A positive rank.
 */
function inferRank(item, index) {
  const direct = pickText(item, [
    '.HotList-item-index',
    '.HotItem-index',
    '[data-rank]',
    '.RankingIndex',
  ]);
  const directNumber = Number.parseInt(direct, 10);
  if (Number.isFinite(directNumber) && directNumber > 0) {
    return directNumber;
  }

  const leading = cleanText(item.textContent).match(/^(\d{1,2})\b/);
  const leadingNumber = leading ? Number.parseInt(leading[1], 10) : 0;
  // Apply the same "> 0" guard as the direct branch: text that merely starts
  // with "0" (e.g. a title) must not produce rank 0.
  if (leadingNumber > 0) {
    return leadingNumber;
  }

  return index + 1;
}
|
||||
|
||||
/**
 * Builds result rows from DOM candidate nodes.
 *
 * For every candidate returned by `collectDomCandidates()` a title and a heat
 * value are looked up. Candidates without a title, with a title that was
 * already emitted, or without any heat value are skipped. Collection stops
 * once `limit` rows have been gathered.
 *
 * @returns {Array<[number, string, string]>} Rows of [rank, title, heat].
 */
function collectRows() {
  const titleSelectors = [
    '.HotList-item-title',
    '.HotList-item-title a',
    '.HotItem-content a',
    'h2 a',
    'h2',
    'a[href*="/question/"]',
  ];
  const heatSelectors = [
    '.HotList-item-metrics',
    '.HotList-item-heat',
    '.HotItem-metrics',
    '.HotItem-hot',
    '[data-heat]',
  ];

  const emittedTitles = new Set();
  const collected = [];

  for (const node of collectDomCandidates()) {
    const title = pickText(node, titleSelectors);
    if (!title || emittedTitles.has(title)) {
      continue;
    }

    // Prefer a dedicated heat element; otherwise try to infer the heat from
    // the node's whole text.
    const heat = pickText(node, heatSelectors) || inferHeat(node.textContent);
    if (!heat) {
      continue;
    }

    emittedTitles.add(title);
    const rank = inferRank(node, collected.length);
    collected.push([rank, title, heat]);

    if (collected.length >= limit) {
      break;
    }
  }

  return collected;
}
|
||||
|
||||
/**
 * Gathers every DOM node that might represent a hot-list entry.
 *
 * The selectors are queried in order against `document`; a node matched by
 * more than one selector is returned only once, at the position of its first
 * match.
 *
 * @returns {Array} Unique matching nodes in first-seen order.
 */
function collectDomCandidates() {
  const queries = [
    '.HotList-item',
    '.HotItem',
    '.HotList-list > *',
    '[data-hot-item]',
    'section ol li',
    'main li',
    'main article',
    'main [class*="Hot"]',
  ];

  // A Set keeps insertion order and ignores repeated additions, which gives
  // "unique, first-seen order" directly.
  const unique = new Set();
  for (const query of queries) {
    for (const element of Array.from(document.querySelectorAll(query))) {
      unique.add(element);
    }
  }
  return Array.from(unique);
}
|
||||
|
||||
/**
 * Collects raw text blobs from a fixed list of page containers.
 *
 * For each selector the first matching node's `innerText` (or `textContent`
 * when `innerText` is empty) is taken. Blobs that are empty after cleaning, or
 * whose cleaned form equals one already collected, are dropped.
 *
 * @returns {string[]} The raw (uncleaned) texts, longest first.
 */
function collectTextSources() {
  const containers = ['.HotList-list', '.HotList', '#root', 'main', 'body'];

  // Keyed by cleaned text so containers with identical content collapse to
  // the first one encountered; the stored value is the raw text.
  const rawByCleaned = new Map();
  for (const container of containers) {
    const node = document.querySelector(container);
    const raw = node ? String(node.innerText || node.textContent || '') : '';
    const key = cleanText(raw);
    if (key && !rawByCleaned.has(key)) {
      rawByCleaned.set(key, raw);
    }
  }

  const texts = Array.from(rawByCleaned.values());
  texts.sort((a, b) => b.length - a.length);
  return texts;
}
|
||||
|
||||
/**
 * Tells whether page text contains any of the known phrases that indicate a
 * login wall, captcha, or access restriction.
 *
 * @param {string} text - Page text to inspect.
 * @returns {boolean} True if at least one marker phrase occurs in the text.
 */
function looksLikeBlockedPage(text) {
  const blockedMarkers = /安全验证|异常访问|请完成验证|登录后继续|登录即可查看|验证码|访问受限/;
  return blockedMarkers.test(text);
}
|
||||
|
||||
/**
 * Decides whether a text line should be dropped before row parsing.
 *
 * A line is ignored when it is empty, when it exactly equals one of a few
 * known page labels, or when it starts with one of a few known prefixes.
 *
 * @param {string} line - A single cleaned line of page text.
 * @returns {boolean} True if the line should be skipped.
 */
function shouldIgnoreTextLine(line) {
  if (!line) {
    return true;
  }

  const ignoredLabels = ['知乎热榜', '首页 - 知乎', '首页-知乎'];
  if (ignoredLabels.includes(line)) {
    return true;
  }

  const ignoredPrefixes = ['/ ', '当前页面 ·', '继续输入任务'];
  return ignoredPrefixes.some((prefix) => line.startsWith(prefix));
}
|
||||
|
||||
/**
 * Fallback extraction that parses rows out of the page's text.
 *
 * Text sources are tried in the order returned by `collectTextSources()`; the
 * first source that yields any rows wins, truncated to `limit` rows.
 *
 * @returns {Array<[number, string, string]>} Rows of [rank, title, heat], or
 *   an empty array when no source produced rows.
 * @throws {Error} If a text source looks like a login/verification page.
 */
function collectRowsFromText() {
  for (const pageText of collectTextSources()) {
    if (!pageText) {
      continue;
    }
    if (looksLikeBlockedPage(pageText)) {
      throw new Error('知乎页面当前需要登录或完成安全验证,无法读取热榜条目');
    }

    const parsed = parseRowsFromText(pageText);
    if (parsed.length) {
      return parsed.slice(0, limit);
    }
  }
  return [];
}
|
||||
|
||||
/**
 * Parses hot-list rows out of multi-line page text.
 *
 * The text is split into cleaned, non-ignored lines that are consumed in
 * order. Recognized line shapes:
 *   - a bare 1–2 digit number (only when no title is being accumulated):
 *     remembered as the rank of the next row;
 *   - "<rank><. 、 or space><rest>": the rank is remembered and parsing
 *     continues with <rest>;
 *   - "<title><number><unit>[热度]": emitted as a row on its own;
 *   - "<number><unit>[热度]" after one or more title lines: closes the row,
 *     with the accumulated lines joined by spaces as the title;
 *   - anything else: accumulated as part of the pending title.
 *
 * Rows with a title that was already emitted are dropped. Parsing stops once
 * `limit` rows have been collected.
 *
 * @param {*} text - Raw page text.
 * @returns {Array<[number, string, string]>} Rows of [rank, title, heat].
 */
function parseRowsFromText(text) {
  // A line that consists of nothing but a heat token, e.g. "98 万热度".
  const heatOnlyPattern = /^\d+(?:\.\d+)?\s*(?:万|亿|k|K|m|M)(?:热度)?$/;

  const lines = String(text || '')
    .split(/\n+/)
    .map((line) => cleanText(line))
    .filter((line) => !!line && !shouldIgnoreTextLine(line));
  const seenTitles = new Set();
  const rows = [];
  let pendingRank = null;
  let titleParts = [];

  // Emits a row if the title is new, then clears the per-row state.
  function pushRow(title, heat) {
    const normalizedTitle = cleanText(title);
    if (normalizedTitle && heat && !seenTitles.has(normalizedTitle)) {
      seenTitles.add(normalizedTitle);
      rows.push([
        pendingRank || rows.length + 1,
        normalizedTitle,
        heat,
      ]);
    }
    // Reset even when the row was rejected (e.g. duplicate title); otherwise
    // the stale rank and title fragments would be glued onto the next row.
    pendingRank = null;
    titleParts = [];
  }

  for (const rawLine of lines) {
    let line = rawLine;

    const rankOnly = line.match(/^(\d{1,2})$/);
    if (rankOnly && !titleParts.length) {
      pendingRank = Number(rankOnly[1]);
      continue;
    }

    // A pure heat line such as "98 万热度" or "9.5 万热度" also fits the
    // "<rank><separator><rest>" shape; it must not be split into a rank and a
    // bogus title, so the ranked-line check is skipped for it.
    const rankedLine = heatOnlyPattern.test(line)
      ? null
      : line.match(/^(\d{1,2})[.、\s]+(.+)$/);
    if (rankedLine) {
      pendingRank = Number(rankedLine[1]);
      line = cleanText(rankedLine[2]);
    }

    const inlineMatch = line.match(/^(.*?)(\d+(?:\.\d+)?)\s*(万|亿|k|K|m|M)(?:热度)?$/);
    if (inlineMatch && cleanText(inlineMatch[1])) {
      pushRow(
        cleanText(inlineMatch[1]),
        `${inlineMatch[2]}${inlineMatch[3]}`.replace('K', 'k').replace('M', 'm'),
      );
      if (rows.length >= limit) {
        break;
      }
      continue;
    }

    const heatOnly = extractHeatToken(line);
    if (heatOnly && titleParts.length) {
      pushRow(titleParts.join(' '), heatOnly);
      if (rows.length >= limit) {
        break;
      }
      continue;
    }

    titleParts.push(line);
  }

  return rows;
}
|
||||
|
||||
const domRows = collectRows();
|
||||
const rows = domRows.length ? domRows : collectRowsFromText();
|
||||
if (!rows.length) {
|
||||
throw new Error('未能从页面 DOM 中提取到知乎热榜条目');
|
||||
}
|
||||
|
||||
return {
|
||||
source: `${location.origin}${location.pathname}`,
|
||||
sheet_name: '知乎热榜',
|
||||
columns: ['rank', 'title', 'heat'],
|
||||
rows,
|
||||
};
|
||||
Reference in New Issue
Block a user