feat: 支持网页抓取 (#14)

*  feat: 添加网页内容提取插件

*  feat: 添加网页内容提取插件

* 💄 style: 修正样式

* 🐛 fix: 替换 plugins 为 nodejs 运行时,以解决调用插件会超时的问题

* 💬 style: 优化插件名称

* Revert "🐛 fix: 替换 plugins 为 nodejs 运行时,以解决调用插件会超时的问题"

This reverts commit 0f99edf216.
This commit is contained in:
Arvin Xu
2023-07-24 10:02:35 +08:00
committed by GitHub
parent 1f2b4a535a
commit 9e933b0009
5 changed files with 100 additions and 2 deletions

View File

@@ -37,6 +37,8 @@ export default {
'ok': '确定',
'plugin-realtimeWeather': '实时天气预报',
'plugin-searchEngine': '搜索引擎',
'plugin-undefined': '插件检测中...',
'plugin-websiteCrawler': '网页内容提取',
'pluginList': '插件列表',
'pluginLoading': '插件运行中...',
'profile': '助手身份',

View File

@@ -69,7 +69,7 @@ const MessageExtra = ({ role, extra, function_call }: ChatMessage): ReactNode =>
if (!(hasModelTag || hasFuncTag)) return;
return (
<Flexbox className={styles.container}>
<Flexbox className={styles.container} horizontal>
{hasFuncTag && funcTag}
{hasModelTag && modelTag}
</Flexbox>

View File

@@ -1,6 +1,7 @@
import searchEngine from './searchEngine';
import getWeather from './weather';
import webCrawler from './webCrawler';
const pluginList = [getWeather, searchEngine];
const pluginList = [getWeather, searchEngine, webCrawler];
export default pluginList;

View File

@@ -0,0 +1,20 @@
import runner from './runner';
/**
 * Plugin manifest for the website-crawler plugin: the JSON schema the model
 * uses to invoke it, plus display metadata and the runtime runner.
 */
const schema = {
  description: '提取网页内容并总结',
  name: 'websiteCrawler',
  parameters: {
    properties: {
      url: {
        // Fix: this parameter is the page URL to crawl, not the page content.
        description: '要抓取的网页 URL',
        type: 'string',
      },
    },
    required: ['url'],
    type: 'object',
  },
};

// Renamed from `getWeather` — a copy-paste leftover from the weather plugin;
// the default export is unchanged, so callers are unaffected.
const webCrawler = { avatar: '🕸', name: 'websiteCrawler', runner, schema };

export default webCrawler;

View File

@@ -0,0 +1,75 @@
// Shape of the dataset returned by Apify's website-content-crawler actor
// (run-sync-get-dataset-items returns an array of crawled pages).
export type DataResults = DataItem[];

// One crawled page as returned by the actor.
export interface DataItem {
  crawl: Crawl;
  // Page content converted to Markdown (requested via saveMarkdown: true).
  markdown: string;
  metadata: Metadata;
  screenshotUrl: any; // presumably a URL string or null — verify against the actor's dataset docs
  text: string;
  url: string;
}

// Crawl bookkeeping for a single page.
export interface Crawl {
  depth: number;
  httpStatusCode: number;
  loadedTime: string;
  loadedUrl: string;
  referrerUrl: string;
}

// Page metadata extracted by the actor (title, meta tags, etc.).
export interface Metadata {
  author: any; // presumably string | null — verify against the actor's dataset docs
  canonicalUrl: string;
  description: string;
  keywords: string;
  languageCode: string;
  title: string;
}
// Apify "run actor synchronously and return dataset items" endpoint for the
// website-content-crawler actor.
const BASE_URL =
  'https://api.apify.com/v2/acts/apify~website-content-crawler/run-sync-get-dataset-items';
// Read once at module load; undefined when APIFY_API_KEY is unset, in which
// case the request in `runner` will fail with an auth error from Apify.
const token = process.env.APIFY_API_KEY;
/**
 * Crawl a web page through Apify's website-content-crawler actor and return
 * its content as Markdown.
 *
 * @param url - The page URL to crawl.
 * @returns On success `{ content, title, url }`; on failure an object with
 *   `content: '抓取失败'` and an `errorMessage` describing what went wrong.
 */
const runner = async ({ url }: { url: string }) => {
  // Prepare Actor input — see the website-content-crawler README for fields.
  const input = {
    aggressivePrune: false,
    clickElementsCssSelector: '[aria-expanded="false"]',
    debugMode: false,
    dynamicContentWaitSecs: 3,
    proxyConfiguration: {
      useApifyProxy: true,
    },
    removeCookieWarnings: true,
    // Strip navigation/boilerplate elements before content extraction.
    removeElementsCssSelector:
      'nav, footer, script, style, noscript, svg,\n[role="alert"],\n[role="banner"],\n[role="dialog"],\n[role="alertdialog"],\n[role="region"][aria-label*="skip" i],\n[aria-modal="true"]',
    saveFiles: false,
    saveHtml: false,
    saveMarkdown: true,
    saveScreenshots: false,
    startUrls: [{ url }],
  };

  try {
    const res = await fetch(`${BASE_URL}?token=${token}`, {
      body: JSON.stringify(input),
      headers: {
        'Content-Type': 'application/json',
      },
      method: 'POST',
    });

    // Fix: surface HTTP-level failures (bad token, actor error) instead of
    // parsing an error payload as dataset items.
    if (!res.ok) {
      return {
        content: '抓取失败',
        errorMessage: `HTTP ${res.status} ${res.statusText}`,
      };
    }

    const result = (await res.json()) as DataResults;
    const item = result[0];

    // Fix: the actor can return an empty dataset (e.g. the page failed to
    // load); guard before dereferencing `result[0]`.
    if (!item) {
      return { content: '抓取失败', errorMessage: 'empty crawl result' };
    }

    return {
      content: item.markdown,
      title: item.metadata.title,
      url: item.url,
    };
  } catch (error) {
    console.error(error);
    // Narrow `unknown` instead of asserting `(error as any).message`.
    const errorMessage =
      error instanceof Error ? error.message : String(error);
    return { content: '抓取失败', errorMessage };
  }
};

export default runner;