mirror of
https://github.com/lobehub/lobehub.git
synced 2026-03-26 13:19:34 +07:00
✨ feat: 支持网页抓取 (#14)
* ✨ feat: 添加网页内容提取插件
* ✨ feat: 添加网页内容提取插件
* 💄 style: 修正样式
* 🐛 fix: 替换 plugins 为 nodejs 运行时,以解决调用插件会超时的问题
* 💬 style: 优化插件名称
* Revert "🐛 fix: 替换 plugins 为 nodejs 运行时,以解决调用插件会超时的问题"
This reverts commit 0f99edf216.
This commit is contained in:
@@ -37,6 +37,8 @@ export default {
|
||||
'ok': '确定',
|
||||
'plugin-realtimeWeather': '实时天气预报',
|
||||
'plugin-searchEngine': '搜索引擎',
|
||||
'plugin-undefined': '插件检测中...',
|
||||
'plugin-websiteCrawler': '网页内容提取',
|
||||
'pluginList': '插件列表',
|
||||
'pluginLoading': '插件运行中...',
|
||||
'profile': '助手身份',
|
||||
|
||||
@@ -69,7 +69,7 @@ const MessageExtra = ({ role, extra, function_call }: ChatMessage): ReactNode =>
|
||||
if (!(hasModelTag || hasFuncTag)) return;
|
||||
|
||||
return (
|
||||
<Flexbox className={styles.container}>
|
||||
<Flexbox className={styles.container} horizontal>
|
||||
{hasFuncTag && funcTag}
|
||||
{hasModelTag && modelTag}
|
||||
</Flexbox>
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
import searchEngine from './searchEngine';
|
||||
import getWeather from './weather';
|
||||
import webCrawler from './webCrawler';
|
||||
|
||||
const pluginList = [getWeather, searchEngine];
|
||||
const pluginList = [getWeather, searchEngine, webCrawler];
|
||||
|
||||
export default pluginList;
|
||||
|
||||
20
src/plugins/webCrawler/index.ts
Normal file
20
src/plugins/webCrawler/index.ts
Normal file
@@ -0,0 +1,20 @@
|
||||
import runner from './runner';
|
||||
|
||||
/**
 * Function definition for the `websiteCrawler` plugin.
 *
 * It declares a single required string parameter, `url`. The descriptions are
 * runtime strings (presumably shown to the model that fills in the arguments),
 * so the `url` description must say that the value is a link, not page content.
 */
const schema = {
  description: '提取网页内容并总结',
  name: 'websiteCrawler',
  parameters: {
    properties: {
      url: {
        // Was '网页内容' ("page content"), which mislabels the argument.
        description: '网页链接',
        type: 'string',
      },
    },
    required: ['url'],
    type: 'object',
  },
};
|
||||
|
||||
/**
 * Plugin descriptor for the website crawler: bundles the display avatar, the
 * plugin name, the runner implementation and the function schema.
 *
 * The local binding was previously named `getWeather` (copied from the weather
 * plugin); the default export itself is unchanged for importers.
 */
const webCrawler = { avatar: '🕸', name: 'websiteCrawler', runner, schema };

export default webCrawler;
|
||||
75
src/plugins/webCrawler/runner.ts
Normal file
75
src/plugins/webCrawler/runner.ts
Normal file
@@ -0,0 +1,75 @@
|
||||
/** List of dataset items as returned by the crawler endpoint. */
export type DataResults = DataItem[];

/** One crawled page. */
export interface DataItem {
  /** Information about how the page was fetched. */
  crawl: Crawl;
  /** Page content rendered as Markdown. */
  markdown: string;
  /** Document-level metadata of the page. */
  metadata: Metadata;
  // NOTE(review): typed as `string | null` instead of `any`; assumes the API
  // sends a URL or null when no screenshot was saved — confirm against the
  // crawler's output schema.
  screenshotUrl: string | null;
  /** Page content as plain text. */
  text: string;
  /** URL of the page. */
  url: string;
}

/** Fetch details for a crawled page. */
export interface Crawl {
  depth: number;
  httpStatusCode: number;
  // NOTE(review): presumably a timestamp string — confirm the format.
  loadedTime: string;
  loadedUrl: string;
  referrerUrl: string;
}

/** Document metadata for a crawled page. */
export interface Metadata {
  // NOTE(review): typed as `string | null` instead of `any`; assumes the API
  // sends null when the page declares no author — confirm.
  author: string | null;
  canonicalUrl: string;
  description: string;
  keywords: string;
  languageCode: string;
  title: string;
}
|
||||
|
||||
/** Apify endpoint used to run the website-content-crawler Actor and read back its dataset items. */
const BASE_URL =
  'https://api.apify.com/v2/acts/apify~website-content-crawler/run-sync-get-dataset-items';

/**
 * Crawls a single page through the Apify website-content-crawler Actor and
 * returns its Markdown content.
 *
 * The API token is read from the `APIFY_API_KEY` environment variable.
 *
 * @param params.url - Address of the page to crawl.
 * @returns On success `{ content, title, url }` taken from the first dataset
 * item. On any failure (missing token, network error, non-2xx response, empty
 * or malformed dataset) the error is logged and
 * `{ content: '抓取失败', errorMessage }` is returned; this function does not throw.
 */
const runner = async ({ url }: { url: string }) => {
  // Actor input: crawl only the given URL and ask for Markdown output.
  const input = {
    aggressivePrune: false,
    clickElementsCssSelector: '[aria-expanded="false"]',
    debugMode: false,
    dynamicContentWaitSecs: 3,
    proxyConfiguration: {
      useApifyProxy: true,
    },
    removeCookieWarnings: true,
    removeElementsCssSelector:
      'nav, footer, script, style, noscript, svg,\n[role="alert"],\n[role="banner"],\n[role="dialog"],\n[role="alertdialog"],\n[role="region"][aria-label*="skip" i],\n[aria-modal="true"]',
    saveFiles: false,
    saveHtml: false,
    saveMarkdown: true,
    saveScreenshots: false,
    startUrls: [{ url }],
  };

  try {
    // Fail fast with a clear message instead of sending `token=undefined`.
    const token = process.env.APIFY_API_KEY;
    if (!token) throw new Error('APIFY_API_KEY is not configured');

    const response = await fetch(`${BASE_URL}?token=${encodeURIComponent(token)}`, {
      body: JSON.stringify(input),
      headers: {
        'Content-Type': 'application/json',
      },
      method: 'POST',
    });

    // Error responses carry a JSON object rather than a dataset array, so
    // report the HTTP status instead of failing later on `undefined`.
    if (!response.ok) {
      throw new Error(`Apify request failed: ${response.status} ${response.statusText}`);
    }

    const payload: unknown = await response.json();
    if (!Array.isArray(payload) || payload.length === 0) {
      throw new Error('Apify returned no content for the requested URL');
    }

    const [item] = payload as DataResults;

    return {
      content: item.markdown,
      title: item.metadata.title,
      url: item.url,
    };
  } catch (error) {
    console.error(error);
    // Non-Error throwables have no `.message`; stringify them instead.
    const errorMessage = error instanceof Error ? error.message : String(error);
    return { content: '抓取失败', errorMessage };
  }
};
|
||||
|
||||
export default runner;
|
||||
Reference in New Issue
Block a user