feat: 支持网页抓取 (#14)

*  feat: 添加网页内容提取插件

*  feat: 添加网页内容提取插件

* 💄 style: 修正样式

* 🐛 fix: 替换 plugins 为 nodejs 运行时,以解决调用插件会超时的问题

* 💬 style: 优化插件名称

* Revert "🐛 fix: 替换 plugins 为 nodejs 运行时,以解决调用插件会超时的问题"

This reverts commit 0f99edf216.
This commit is contained in:
Arvin Xu
2023-07-24 10:02:35 +08:00
committed by GitHub
parent 1f2b4a535a
commit 9e933b0009
5 changed files with 100 additions and 2 deletions

View File

@@ -37,6 +37,8 @@ export default {
'ok': '确定',
'plugin-realtimeWeather': '实时天气预报',
'plugin-searchEngine': '搜索引擎',
'plugin-undefined': '插件检测中...',
'plugin-websiteCrawler': '网页内容提取',
'pluginList': '插件列表',
'pluginLoading': '插件运行中...',
'profile': '助手身份',

View File

@@ -69,7 +69,7 @@ const MessageExtra = ({ role, extra, function_call }: ChatMessage): ReactNode =>
if (!(hasModelTag || hasFuncTag)) return;
return (
<Flexbox className={styles.container}>
<Flexbox className={styles.container} horizontal>
{hasFuncTag && funcTag}
{hasModelTag && modelTag}
</Flexbox>

View File

@@ -1,6 +1,7 @@
import searchEngine from './searchEngine';
import getWeather from './weather';
import webCrawler from './webCrawler';
const pluginList = [getWeather, searchEngine];
const pluginList = [getWeather, searchEngine, webCrawler];
export default pluginList;

View File

@@ -0,0 +1,20 @@
import runner from './runner';
/**
 * Plugin manifest for the website-crawler plugin: the JSON schema the model
 * uses to invoke it, plus display metadata and the runtime runner.
 */
const schema = {
  description: '提取网页内容并总结',
  name: 'websiteCrawler',
  parameters: {
    properties: {
      url: {
        // Fix: this parameter is the page URL to crawl, not the page content.
        description: '要抓取的网页 URL',
        type: 'string',
      },
    },
    required: ['url'],
    type: 'object',
  },
};

// Renamed from `getWeather` — a copy-paste leftover from the weather plugin;
// the default export is unchanged, so callers are unaffected.
const webCrawler = { avatar: '🕸', name: 'websiteCrawler', runner, schema };

export default webCrawler;

View File

@@ -0,0 +1,75 @@
// Shape of the dataset returned by Apify's website-content-crawler actor
// (run-sync-get-dataset-items returns an array of crawled pages).
export type DataResults = DataItem[];

// One crawled page as returned by the actor.
export interface DataItem {
  crawl: Crawl;
  // Page content converted to Markdown (requested via saveMarkdown: true).
  markdown: string;
  metadata: Metadata;
  screenshotUrl: any; // presumably a URL string or null — verify against the actor's dataset docs
  text: string;
  url: string;
}

// Crawl bookkeeping for a single page.
export interface Crawl {
  depth: number;
  httpStatusCode: number;
  loadedTime: string;
  loadedUrl: string;
  referrerUrl: string;
}

// Page metadata extracted by the actor (title, meta tags, etc.).
export interface Metadata {
  author: any; // presumably string | null — verify against the actor's dataset docs
  canonicalUrl: string;
  description: string;
  keywords: string;
  languageCode: string;
  title: string;
}
// Apify "run actor synchronously and return dataset items" endpoint for the
// website-content-crawler actor.
const BASE_URL =
  'https://api.apify.com/v2/acts/apify~website-content-crawler/run-sync-get-dataset-items';
// Read once at module load; undefined when APIFY_API_KEY is unset, in which
// case the request in `runner` will fail with an auth error from Apify.
const token = process.env.APIFY_API_KEY;
/**
 * Crawl a web page through Apify's website-content-crawler actor and return
 * its content as Markdown.
 *
 * @param url - The page URL to crawl.
 * @returns On success `{ content, title, url }`; on failure an object with
 *   `content: '抓取失败'` and an `errorMessage` describing what went wrong.
 */
const runner = async ({ url }: { url: string }) => {
  // Prepare Actor input — see the website-content-crawler README for fields.
  const input = {
    aggressivePrune: false,
    clickElementsCssSelector: '[aria-expanded="false"]',
    debugMode: false,
    dynamicContentWaitSecs: 3,
    proxyConfiguration: {
      useApifyProxy: true,
    },
    removeCookieWarnings: true,
    // Strip navigation/boilerplate elements before content extraction.
    removeElementsCssSelector:
      'nav, footer, script, style, noscript, svg,\n[role="alert"],\n[role="banner"],\n[role="dialog"],\n[role="alertdialog"],\n[role="region"][aria-label*="skip" i],\n[aria-modal="true"]',
    saveFiles: false,
    saveHtml: false,
    saveMarkdown: true,
    saveScreenshots: false,
    startUrls: [{ url }],
  };

  try {
    const res = await fetch(`${BASE_URL}?token=${token}`, {
      body: JSON.stringify(input),
      headers: {
        'Content-Type': 'application/json',
      },
      method: 'POST',
    });

    // Fix: surface HTTP-level failures (bad token, actor error) instead of
    // parsing an error payload as dataset items.
    if (!res.ok) {
      return {
        content: '抓取失败',
        errorMessage: `HTTP ${res.status} ${res.statusText}`,
      };
    }

    const result = (await res.json()) as DataResults;
    const item = result[0];

    // Fix: the actor can return an empty dataset (e.g. the page failed to
    // load); guard before dereferencing `result[0]`.
    if (!item) {
      return { content: '抓取失败', errorMessage: 'empty crawl result' };
    }

    return {
      content: item.markdown,
      title: item.metadata.title,
      url: item.url,
    };
  } catch (error) {
    console.error(error);
    // Narrow `unknown` instead of asserting `(error as any).message`.
    const errorMessage =
      error instanceof Error ? error.message : String(error);
    return { content: '抓取失败', errorMessage };
  }
};

export default runner;