♻️ refactor: 重构并优化文档抓取插件能力

This commit is contained in:
arvinxx
2023-07-30 00:44:28 +08:00
parent 425070fe2f
commit ff56348358
2 changed files with 21 additions and 55 deletions

View File

@@ -1,47 +1,40 @@
import { PluginRunner } from '@/plugins/type';
import { DataResults, Result } from './type';
import { Result } from './type';
// Apify `website-content-crawler` actor endpoint; the runner POSTs its input here with
// `?token=` appended.
// NOTE(review): BASE_URL is declared twice in this view. This looks like the removed and
// added sides of a diff rendered without +/- markers — confirm which declaration is live.
const BASE_URL =
'https://api.apify.com/v2/acts/apify~website-content-crawler/run-sync-get-dataset-items';
// Apify API token read from the environment; `undefined` when APIFY_API_KEY is not set.
const token = process.env.APIFY_API_KEY;
// Browserless base URL; defaults to the hosted endpoint when BROWSERLESS_URL is null/undefined.
// The runner appends `/content?token=...` to it.
const BASE_URL = process.env.BROWSERLESS_URL ?? 'https://chrome.browserless.io';
// Browserless API token read from the environment; `undefined` when BROWSERLESS_TOKEN is not set,
// in which case the query string built by the runner will contain the text "undefined".
const BROWSERLESS_TOKEN = process.env.BROWSERLESS_TOKEN;
// service from: https://github.com/lobehub/html-parser/tree/master
// URL the runner POSTs `{ html, url }` to. There is no fallback: when HTML_PARSER_URL is not
// set, the runner's template literal turns this into the string "undefined".
const HTML_PARSER_URL = process.env.HTML_PARSER_URL;
const runner: PluginRunner<{ url: string }, Result> = async ({ url }) => {
// Prepare Actor input
const input = {
aggressivePrune: false,
clickElementsCssSelector: '[aria-expanded="false"]',
debugMode: false,
dynamicContentWaitSecs: 3,
proxyConfiguration: {
useApifyProxy: true,
},
removeCookieWarnings: true,
removeElementsCssSelector:
'nav, footer, script, style, noscript, svg,\n[role="alert"],\n[role="banner"],\n[role="dialog"],\n[role="alertdialog"],\n[role="region"][aria-label*="skip" i],\n[aria-modal="true"]',
saveFiles: false,
saveHtml: false,
saveMarkdown: true,
saveScreenshots: false,
startUrls: [{ url }],
gotoOptions: { waitUntil: 'networkidle2' },
url,
};
try {
const data = await fetch(`${BASE_URL}?token=${token}`, {
const res = await fetch(`${BASE_URL}/content?token=${BROWSERLESS_TOKEN}`, {
body: JSON.stringify(input),
headers: {
'Content-Type': 'application/json',
},
method: 'POST',
});
const result = (await data.json()) as DataResults;
const html = await res.text();
const item = result[0];
return {
content: item.markdown,
title: item.metadata.title,
url: item.url,
};
const parserBody = { html, url };
const parseRes = await fetch(`${HTML_PARSER_URL}`, {
body: JSON.stringify(parserBody),
headers: {
'Content-Type': 'application/json',
},
method: 'POST',
});
return await parseRes.json();
} catch (error) {
console.error(error);
return { content: '抓取失败', errorMessage: (error as any).message };

View File

@@ -1,32 +1,5 @@
/**
 * A list of {@link DataItem} entries.
 *
 * The runner casts the parsed JSON response of the Apify crawler request to this type and
 * reads only the first element. The cast is unchecked; nothing validates the payload shape.
 */
export type DataResults = DataItem[];
/**
 * Value returned by the document-crawling plugin runner.
 *
 * Every field is optional because the runner produces different shapes:
 * a successful crawl carries page data, while the failure path returns a
 * placeholder `content` together with `errorMessage`.
 */
export type Result = {
  /** Extracted page content; on failure the runner puts a fixed failure notice here. */
  content?: string;
  /**
   * Message of the error caught by the runner. The runner already returns this field from
   * its `catch` branch; declaring it lets callers read it without a type assertion.
   */
  errorMessage?: string;
  /** Page title, when available. */
  title?: string;
  /** URL of the page the content was taken from. */
  url?: string;
};
/**
 * One element of {@link DataResults}.
 *
 * The runner reads `markdown`, `metadata.title` and `url` from the first item; the remaining
 * fields are declared but not used in the code visible here.
 * NOTE(review): presumably mirrors the Apify website-content-crawler dataset item — confirm
 * against the actor's output schema.
 */
export interface DataItem {
/** Crawl bookkeeping for this item; see {@link Crawl}. */
crawl: Crawl;
/** Markdown form of the page; the runner returns it as `content`. */
markdown: string;
/** Page metadata; the runner returns `metadata.title` as `title`. */
metadata: Metadata;
// NOTE(review): `any` disables type checking for this field; consider `unknown` or a
// concrete type once the real payload shape is confirmed.
screenshotUrl: any;
/** Text form of the page — presumably plain text as opposed to `markdown`; TODO confirm. */
text: string;
/** URL associated with this item; the runner returns it as `url`. */
url: string;
}
/**
 * Crawl details attached to a {@link DataItem} through its `crawl` field.
 *
 * None of these fields are read by the code visible alongside this declaration, so the
 * per-field notes below are inferred from names and types only.
 */
export interface Crawl {
/** Presumably the link depth at which the page was reached — TODO confirm. */
depth: number;
/** Presumably the HTTP status code of the page response — TODO confirm. */
httpStatusCode: number;
/** Typed as a string; presumably a timestamp, format not shown here — TODO confirm. */
loadedTime: string;
/** Presumably the URL actually loaded (e.g. after redirects) — TODO confirm. */
loadedUrl: string;
/** Presumably the URL of the page that linked here — TODO confirm. */
referrerUrl: string;
}
/**
 * Page metadata attached to a {@link DataItem} through its `metadata` field.
 *
 * Only `title` is read by the runner visible alongside this declaration; the notes on the
 * other fields are inferred from names and types only.
 */
export interface Metadata {
// NOTE(review): `any` disables type checking for this field; consider `unknown` or
// `string | null` once the real payload shape is confirmed.
author: any;
/** Presumably the page's canonical URL — TODO confirm. */
canonicalUrl: string;
/** Presumably the page description — TODO confirm. */
description: string;
/** Typed as a single string, not a list; separator format not shown here — TODO confirm. */
keywords: string;
/** Presumably a language tag for the page — TODO confirm format. */
languageCode: string;
/** Page title; the runner returns this as `Result.title`. */
title: string;
}