From 306c50704e5fa13d4e56d8423a0602378c2927e7 Mon Sep 17 00:00:00 2001 From: YuTengjing Date: Thu, 26 Feb 2026 22:59:10 +0800 Subject: [PATCH] =?UTF-8?q?=F0=9F=90=9B=20fix:=20improve=20crawler=20error?= =?UTF-8?q?=20handling=20and=20timeout=20cancellation=20(#12487)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .agents/skills/pr/SKILL.md | 55 ++++ .agents/skills/upstash-workflow/SKILL.md | 5 + AGENTS.md | 3 +- CLAUDE.md | 2 + GEMINI.md | 2 + docs/self-hosting/advanced/online-search.mdx | 18 ++ .../advanced/online-search.zh-CN.mdx | 18 ++ .../web-crawler/src/__tests__/crawler.test.ts | 21 +- .../crawImpl/__tests__/browserless.test.ts | 50 ++-- .../src/crawImpl/__tests__/exa.test.ts | 163 ++++++------ .../src/crawImpl/__tests__/firecrawl.test.ts | 207 +++++++-------- .../src/crawImpl/__tests__/jina.test.ts | 133 +++++----- .../src/crawImpl/__tests__/naive.test.ts | 53 ++-- .../src/crawImpl/__tests__/search1api.test.ts | 78 +++--- .../src/crawImpl/__tests__/tavily.test.ts | 244 ++++++++---------- .../web-crawler/src/crawImpl/browserless.ts | 91 ++++--- packages/web-crawler/src/crawImpl/exa.ts | 84 +++--- .../web-crawler/src/crawImpl/firecrawl.ts | 91 +++---- packages/web-crawler/src/crawImpl/jina.ts | 78 ++++-- packages/web-crawler/src/crawImpl/naive.ts | 34 ++- .../web-crawler/src/crawImpl/search1api.ts | 70 +++-- packages/web-crawler/src/crawImpl/tavily.ts | 84 +++--- packages/web-crawler/src/crawler.ts | 11 +- packages/web-crawler/src/test-utils.ts | 25 ++ .../utils/{ => __tests__}/appUrlRules.test.ts | 2 +- .../src/utils/__tests__/errorType.test.ts | 45 +++- .../src/utils/__tests__/response.test.ts | 102 ++++++++ .../src/utils/__tests__/withTimeout.test.ts | 80 ++++-- packages/web-crawler/src/utils/errorType.ts | 31 +++ .../src/utils/htmlToMarkdown.test.ts | 27 +- .../web-crawler/src/utils/htmlToMarkdown.ts | 6 +- packages/web-crawler/src/utils/response.ts | 49 ++++ packages/web-crawler/src/utils/withTimeout.ts | 
21 +- src/envs/tools.ts | 10 + src/server/routers/tools/search.test.ts | 21 ++ src/server/routers/tools/search.ts | 5 +- src/server/services/search/index.test.ts | 153 ++++++++++- src/server/services/search/index.ts | 74 +++++- 38 files changed, 1462 insertions(+), 784 deletions(-) create mode 100644 .agents/skills/pr/SKILL.md create mode 100644 packages/web-crawler/src/test-utils.ts rename packages/web-crawler/src/utils/{ => __tests__}/appUrlRules.test.ts (98%) create mode 100644 packages/web-crawler/src/utils/__tests__/response.test.ts create mode 100644 packages/web-crawler/src/utils/response.ts diff --git a/.agents/skills/pr/SKILL.md b/.agents/skills/pr/SKILL.md new file mode 100644 index 0000000000..b751d5a0eb --- /dev/null +++ b/.agents/skills/pr/SKILL.md @@ -0,0 +1,55 @@ +--- +name: pr +description: "Create a PR for the current branch. Use when the user asks to create a pull request, submit PR, or says 'pr'." +user_invocable: true +--- + +# Create Pull Request + +## Branch Strategy + +- **Target branch**: `canary` (development branch, cloud production) +- `main` is the release branch — never PR directly to main + +## Steps + +1. **Gather context** (run in parallel): + - `git branch --show-current` — current branch name + - `git rev-parse --abbrev-ref @{u} 2>/dev/null` — remote tracking status + - `git log --oneline origin/canary..HEAD` — unpushed commits + - `gh pr list --head "$(git branch --show-current)" --json number,title,state,url` — existing PR + - `git log --oneline origin/canary..HEAD` — commit history for PR title + - `git diff --stat --stat-count=20 origin/canary..HEAD` — change summary + +2. **Push if needed**: + - No upstream: `git push -u origin $(git branch --show-current)` + - Has upstream: `git push origin $(git branch --show-current)` + +3. **Search related GitHub issues**: + - `gh issue list --search "<keywords>" --state all --limit 10` + - Only link issues with matching scope (avoid large umbrella issues) + - Skip if no matching issue found + +4. 
**Create PR** with `gh pr create --base canary`: + - Title: `<gitmoji> <type>(<scope>): <description>` + - Body: based on PR template (`.github/PULL_REQUEST_TEMPLATE.md`), fill checkboxes + - Link related GitHub issues using magic keywords (`Fixes #123`, `Closes #123`) + - Link Linear issues if applicable (`Fixes LOBE-xxx`) + - Use HEREDOC for body to preserve formatting + +5. **Open in browser**: `gh pr view --web` + +## PR Template + +Use `.github/PULL_REQUEST_TEMPLATE.md` as the body structure. Key sections: + +- **Change Type**: Check the appropriate gitmoji type +- **Related Issue**: Link GitHub/Linear issues with magic keywords +- **Description of Change**: Summarize what and why +- **How to Test**: Describe test approach, check relevant boxes + +## Notes + +- **Release impact**: PR titles with `✨ feat/` or `🐛 fix` trigger releases — use carefully +- **Language**: All PR content must be in English +- If a PR already exists for the branch, inform the user instead of creating a duplicate diff --git a/.agents/skills/upstash-workflow/SKILL.md b/.agents/skills/upstash-workflow/SKILL.md index 1d2178302f..1f410280ce 100644 --- a/.agents/skills/upstash-workflow/SKILL.md +++ b/.agents/skills/upstash-workflow/SKILL.md @@ -1,3 +1,8 @@ +--- +name: upstash-workflow +description: 'Upstash Workflow implementation guide. Use when creating async workflows with QStash, implementing fan-out patterns, or building 3-layer workflow architecture (process → paginate → execute).' +--- + # Upstash Workflow Implementation Guide This guide covers the standard patterns for implementing Upstash Workflow + QStash async workflows in the LobeHub codebase. 
diff --git a/AGENTS.md b/AGENTS.md index 9df160c082..722c995404 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -38,7 +38,8 @@ lobe-chat/ ### Git Workflow -- The current release branch is `next` until v2.0.0 is officially released +- **Branch strategy**: `canary` is the development branch (cloud production); `main` is the release branch (periodically cherry-picks from canary) +- New branches should be created from `canary`; PRs should target `canary` - Use rebase for git pull - Git commit messages should prefix with gitmoji - Git branch name format: `username/feat/feature-name` diff --git a/CLAUDE.md b/CLAUDE.md index dc4a72b7d8..83f683670c 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -33,6 +33,8 @@ lobe-chat/ ### Git Workflow +- **Branch strategy**: `canary` is the development branch (cloud production); `main` is the release branch (periodically cherry-picks from canary) +- New branches should be created from `canary`; PRs should target `canary` - Use rebase for `git pull` - Commit messages: prefix with gitmoji - Branch format: `<type>/<feature-name>` diff --git a/GEMINI.md b/GEMINI.md index b4cd3a15b7..c1bdab8da2 100644 --- a/GEMINI.md +++ b/GEMINI.md @@ -33,6 +33,8 @@ lobe-chat/ ### Git Workflow +- **Branch strategy**: `canary` is the development branch (cloud production); `main` is the release branch (periodically cherry-picks from canary) +- New branches should be created from `canary`; PRs should target `canary` - Use rebase for `git pull` - Commit messages: prefix with gitmoji - Branch format: `<type>/<feature-name>` diff --git a/docs/self-hosting/advanced/online-search.mdx b/docs/self-hosting/advanced/online-search.mdx index 6e7862fdc5..23b9e7fa85 100644 --- a/docs/self-hosting/advanced/online-search.mdx +++ b/docs/self-hosting/advanced/online-search.mdx @@ -51,6 +51,24 @@ Supported crawler types are listed below: --- +## `CRAWL_CONCURRENCY` + +Controls crawler concurrency per crawl task. The default is `3`. On low-resource servers, use `1` to reduce CPU spikes. 
+ +```env +CRAWL_CONCURRENCY=3 +``` + +## `CRAWLER_RETRY` + +Controls retry attempts per URL on crawl failures. The default is `1` (up to 2 attempts total). + +```env +CRAWLER_RETRY=1 +``` + +--- + ## `SEARCH_PROVIDERS` Configure which search engine providers to use for web search. diff --git a/docs/self-hosting/advanced/online-search.zh-CN.mdx b/docs/self-hosting/advanced/online-search.zh-CN.mdx index a9af246822..3b599c34be 100644 --- a/docs/self-hosting/advanced/online-search.zh-CN.mdx +++ b/docs/self-hosting/advanced/online-search.zh-CN.mdx @@ -46,6 +46,24 @@ CRAWLER_IMPLS="naive,search1api" --- +## `CRAWL_CONCURRENCY` + +控制单次网页抓取任务的并发数量,默认值为 `3`。在低配置服务器上建议设置为 `1` 以降低 CPU 峰值。 + +```env +CRAWL_CONCURRENCY=3 +``` + +## `CRAWLER_RETRY` + +控制单个 URL 的抓取失败重试次数,默认值为 `1`(即最多尝试 2 次)。 + +```env +CRAWLER_RETRY=1 +``` + +--- + ## `SEARCH_PROVIDERS` 配置联网搜索使用的搜索引擎提供商。 diff --git a/packages/web-crawler/src/__tests__/crawler.test.ts b/packages/web-crawler/src/__tests__/crawler.test.ts index a7868f7762..ba971ca7e7 100644 --- a/packages/web-crawler/src/__tests__/crawler.test.ts +++ b/packages/web-crawler/src/__tests__/crawler.test.ts @@ -1,4 +1,4 @@ -import { describe, expect, it, vi } from 'vitest'; +import { beforeEach, describe, expect, it, vi } from 'vitest'; import { Crawler } from '../crawler'; @@ -19,6 +19,16 @@ vi.mock('../utils/appUrlRules', () => ({ })); describe('Crawler', () => { + beforeEach(async () => { + vi.clearAllMocks(); + // Reset applyUrlRules to default (no impls override) + const { applyUrlRules } = await import('../utils/appUrlRules'); + vi.mocked(applyUrlRules).mockReturnValue({ + transformedUrl: 'https://example.com', + filterOptions: {}, + }); + }); + const crawler = new Crawler(); it('should crawl successfully with default impls', async () => { @@ -194,11 +204,12 @@ describe('Crawler', () => { }); expect(result).toEqual({ - crawler: undefined, + crawler: 'browserless', data: { - content: 'Fail to crawl the page. 
Error type: UnknownError, error message: undefined', - errorMessage: undefined, - errorType: 'UnknownError', + content: + 'Fail to crawl the page. Error type: EmptyCrawlResultError, error message: browserless returned empty or short content', + errorMessage: 'browserless returned empty or short content', + errorType: 'EmptyCrawlResultError', }, originalUrl: 'https://example.com', transformedUrl: undefined, diff --git a/packages/web-crawler/src/crawImpl/__tests__/browserless.test.ts b/packages/web-crawler/src/crawImpl/__tests__/browserless.test.ts index 563c7e29f8..63006744bb 100644 --- a/packages/web-crawler/src/crawImpl/__tests__/browserless.test.ts +++ b/packages/web-crawler/src/crawImpl/__tests__/browserless.test.ts @@ -1,7 +1,13 @@ import { describe, expect, it, vi } from 'vitest'; +import * as withTimeoutModule from '../../utils/withTimeout'; import { browserless } from '../browserless'; +// Mock withTimeout to just call the factory function directly (bypassing real timeout) +vi.spyOn(withTimeoutModule, 'withTimeout').mockImplementation((fn) => + fn(new AbortController().signal), +); + describe('browserless', () => { it('should throw BrowserlessInitError when env vars not set', async () => { const originalEnv = { ...process.env }; @@ -16,17 +22,22 @@ describe('browserless', () => { process.env = originalEnv; }); - it('should return undefined on fetch error', async () => { + it('should throw NetworkConnectionError on fetch failed', async () => { process.env.BROWSERLESS_TOKEN = 'test-token'; - global.fetch = vi.fn().mockRejectedValue(new Error('Fetch error')); + global.fetch = vi.fn().mockRejectedValue(new TypeError('fetch failed')); - const result = await browserless('https://example.com', { filterOptions: {} }); - expect(result).toBeUndefined(); + const { NetworkConnectionError } = await import('../../utils/errorType'); + await expect(browserless('https://example.com', { filterOptions: {} })).rejects.toThrow( + NetworkConnectionError, + ); }); it('should 
return undefined when content is empty', async () => { process.env.BROWSERLESS_TOKEN = 'test-token'; global.fetch = vi.fn().mockResolvedValue({ + ok: true, + status: 200, + statusText: 'OK', text: vi.fn().mockResolvedValue(''), } as any); @@ -37,6 +48,9 @@ describe('browserless', () => { it('should return undefined when title is "Just a moment..."', async () => { process.env.BROWSERLESS_TOKEN = 'test-token'; global.fetch = vi.fn().mockResolvedValue({ + ok: true, + status: 200, + statusText: 'OK', text: vi.fn().mockResolvedValue('Just a moment...'), } as any); @@ -46,7 +60,12 @@ describe('browserless', () => { it('should return crawl result on successful fetch', async () => { process.env.BROWSERLESS_TOKEN = 'test-token'; + const longContent = + 'This is a test paragraph with enough content to pass the length check. '.repeat(3); global.fetch = vi.fn().mockResolvedValue({ + ok: true, + status: 200, + statusText: 'OK', text: vi.fn().mockResolvedValue(` @@ -54,7 +73,7 @@ describe('browserless', () => { -

Test Content

+

${longContent}

`), @@ -76,6 +95,9 @@ describe('browserless', () => { it('should include rejectRequestPattern in request payload', async () => { process.env.BROWSERLESS_TOKEN = 'test-token'; const fetchMock = vi.fn().mockResolvedValue({ + ok: true, + status: 200, + statusText: 'OK', text: vi.fn().mockResolvedValue('Test'), }); global.fetch = fetchMock; @@ -90,9 +112,7 @@ describe('browserless', () => { it('should allow requests to permitted file types', async () => { const allowedExtensions = ['html', 'css', 'js', 'json', 'xml', 'webmanifest', 'txt', 'md']; - const pattern = new RegExp( - '.*\\.(?!(html|css|js|json|xml|webmanifest|txt|md)(\\?|#|$))[\\w-]+(?:[?#].*)?$', - ); + const pattern = /.*\.(?!(html|css|js|json|xml|webmanifest|txt|md)(\?|#|$))[\w-]+(?:[?#].*)?$/; allowedExtensions.forEach((ext) => { expect(`file.${ext}`).not.toMatch(pattern); @@ -103,9 +123,7 @@ describe('browserless', () => { it('should reject requests to non-permitted file types', async () => { const rejectedExtensions = ['jpg', 'png', 'gif', 'pdf', 'doc', 'mp4', 'wav']; - const pattern = new RegExp( - '.*\\.(?!(html|css|js|json|xml|webmanifest|txt|md)(\\?|#|$))[\\w-]+(?:[?#].*)?$', - ); + const pattern = /.*\.(?!(html|css|js|json|xml|webmanifest|txt|md)(\?|#|$))[\w-]+(?:[?#].*)?$/; rejectedExtensions.forEach((ext) => { expect(`file.${ext}`).toMatch(pattern); @@ -114,14 +132,16 @@ describe('browserless', () => { }); }); - it('should use correct URL when BROWSERLESS_URL is provided', async () => { - const customUrl = 'https://custom.browserless.io'; + it('should call fetch with the base URL and content path', async () => { const originalEnv = { ...process.env }; process.env.BROWSERLESS_TOKEN = 'test-token'; - process.env.BROWSERLESS_URL = customUrl; global.fetch = vi.fn().mockImplementation((url) => { - expect(url).toContain(customUrl); + // BASE_URL is captured at module load time, so we verify fetch is called with /content path + expect(url).toContain('/content'); return Promise.resolve({ + ok: true, + 
status: 200, + statusText: 'OK', text: () => Promise.resolve('Test'), }); }); diff --git a/packages/web-crawler/src/crawImpl/__tests__/exa.test.ts b/packages/web-crawler/src/crawImpl/__tests__/exa.test.ts index 23dddb03e5..05872a823e 100644 --- a/packages/web-crawler/src/crawImpl/__tests__/exa.test.ts +++ b/packages/web-crawler/src/crawImpl/__tests__/exa.test.ts @@ -1,5 +1,6 @@ import { beforeEach, describe, expect, it, vi } from 'vitest'; +import { createMockResponse } from '../../test-utils'; import { NetworkConnectionError, PageNotFoundError, TimeoutError } from '../../utils/errorType'; import { exa } from '../exa'; @@ -18,23 +19,20 @@ describe('exa crawler', () => { it('should successfully crawl content with API key', async () => { process.env.EXA_API_KEY = 'test-api-key'; - const mockResponse = { - ok: true, - json: vi.fn().mockResolvedValue({ - requestId: 'test-request-id', - results: [ - { - id: 'test-id', - title: 'Test Article', - url: 'https://example.com', - text: 'This is a test article with enough content to pass the length check. '.repeat(3), - author: 'Test Author', - publishedDate: '2023-01-01', - summary: 'Test summary', - }, - ], - }), - }; + const mockResponse = createMockResponse({ + requestId: 'test-request-id', + results: [ + { + id: 'test-id', + title: 'Test Article', + url: 'https://example.com', + text: 'This is a test article with enough content to pass the length check. 
'.repeat(3), + author: 'Test Author', + publishedDate: '2023-01-01', + summary: 'Test summary', + }, + ], + }); const { withTimeout } = await import('../../utils/withTimeout'); vi.mocked(withTimeout).mockResolvedValue(mockResponse as any); @@ -51,23 +49,20 @@ describe('exa crawler', () => { url: 'https://example.com', }); - expect(withTimeout).toHaveBeenCalledWith(expect.any(Promise), 30000); + expect(withTimeout).toHaveBeenCalledWith(expect.any(Function), 30000); }); it('should handle missing API key', async () => { // API key is undefined - const mockResponse = { - ok: true, - json: vi.fn().mockResolvedValue({ - results: [ - { - title: 'Test Article', - url: 'https://example.com', - text: 'Test content with sufficient length. '.repeat(5), - }, - ], - }), - }; + const mockResponse = createMockResponse({ + results: [ + { + title: 'Test Article', + url: 'https://example.com', + text: 'Test content with sufficient length. '.repeat(5), + }, + ], + }); const { withTimeout } = await import('../../utils/withTimeout'); vi.mocked(withTimeout).mockResolvedValue(mockResponse as any); @@ -75,19 +70,16 @@ describe('exa crawler', () => { await exa('https://example.com', { filterOptions: {} }); // Check that fetch was called with empty API key header - expect(withTimeout).toHaveBeenCalledWith(expect.any(Promise), 30000); + expect(withTimeout).toHaveBeenCalledWith(expect.any(Function), 30000); }); it('should return undefined when no results are returned', async () => { process.env.EXA_API_KEY = 'test-api-key'; - const mockResponse = { - ok: true, - json: vi.fn().mockResolvedValue({ - requestId: 'test-request-id', - results: [], - }), - }; + const mockResponse = createMockResponse({ + requestId: 'test-request-id', + results: [], + }); const { withTimeout } = await import('../../utils/withTimeout'); vi.mocked(withTimeout).mockResolvedValue(mockResponse as any); @@ -108,18 +100,15 @@ describe('exa crawler', () => { it('should return undefined for short content', async () => { 
process.env.EXA_API_KEY = 'test-api-key'; - const mockResponse = { - ok: true, - json: vi.fn().mockResolvedValue({ - results: [ - { - title: 'Test Article', - url: 'https://example.com', - text: 'Short', // Content too short - }, - ], - }), - }; + const mockResponse = createMockResponse({ + results: [ + { + title: 'Test Article', + url: 'https://example.com', + text: 'Short', // Content too short + }, + ], + }); const { withTimeout } = await import('../../utils/withTimeout'); vi.mocked(withTimeout).mockResolvedValue(mockResponse as any); @@ -132,11 +121,11 @@ describe('exa crawler', () => { it('should throw PageNotFoundError for 404 status', async () => { process.env.EXA_API_KEY = 'test-api-key'; - const mockResponse = { + const mockResponse = createMockResponse('Not Found', { ok: false, status: 404, statusText: 'Not Found', - }; + }); const { withTimeout } = await import('../../utils/withTimeout'); vi.mocked(withTimeout).mockResolvedValue(mockResponse as any); @@ -149,11 +138,11 @@ describe('exa crawler', () => { it('should throw error for other HTTP errors', async () => { process.env.EXA_API_KEY = 'test-api-key'; - const mockResponse = { + const mockResponse = createMockResponse('', { ok: false, status: 500, statusText: 'Internal Server Error', - }; + }); const { withTimeout } = await import('../../utils/withTimeout'); vi.mocked(withTimeout).mockResolvedValue(mockResponse as any); @@ -167,7 +156,7 @@ describe('exa crawler', () => { process.env.EXA_API_KEY = 'test-api-key'; const { withTimeout } = await import('../../utils/withTimeout'); - vi.mocked(withTimeout).mockRejectedValue(new Error('fetch failed')); + vi.mocked(withTimeout).mockRejectedValue(new TypeError('fetch failed')); await expect(exa('https://example.com', { filterOptions: {} })).rejects.toThrow( NetworkConnectionError, @@ -198,42 +187,37 @@ describe('exa crawler', () => { ); }); - it('should return undefined when JSON parsing fails', async () => { + it('should throw ResponseBodyParseError when JSON 
parsing fails', async () => { process.env.EXA_API_KEY = 'test-api-key'; - const mockResponse = { - ok: true, + const mockResponse = createMockResponse('not json', { ok: true }); + mockResponse.json = vi.fn().mockRejectedValue(new Error('Invalid JSON')); + mockResponse.clone.mockReturnValue({ + ...mockResponse, json: vi.fn().mockRejectedValue(new Error('Invalid JSON')), - }; + text: vi.fn().mockResolvedValue('not json'), + }); const { withTimeout } = await import('../../utils/withTimeout'); vi.mocked(withTimeout).mockResolvedValue(mockResponse as any); - const consoleSpy = vi.spyOn(console, 'error').mockImplementation(() => {}); - - const result = await exa('https://example.com', { filterOptions: {} }); - - expect(result).toBeUndefined(); - expect(consoleSpy).toHaveBeenCalled(); - - consoleSpy.mockRestore(); + await expect(exa('https://example.com', { filterOptions: {} })).rejects.toThrow( + 'Exa returned non-JSON response: not json', + ); }); it('should use result URL when available', async () => { process.env.EXA_API_KEY = 'test-api-key'; - const mockResponse = { - ok: true, - json: vi.fn().mockResolvedValue({ - results: [ - { - title: 'Test Article', - url: 'https://redirected.example.com', - text: 'Test content with sufficient length. '.repeat(5), - }, - ], - }), - }; + const mockResponse = createMockResponse({ + results: [ + { + title: 'Test Article', + url: 'https://redirected.example.com', + text: 'Test content with sufficient length. '.repeat(5), + }, + ], + }); const { withTimeout } = await import('../../utils/withTimeout'); vi.mocked(withTimeout).mockResolvedValue(mockResponse as any); @@ -246,18 +230,15 @@ describe('exa crawler', () => { it('should fallback to original URL when result URL is missing', async () => { process.env.EXA_API_KEY = 'test-api-key'; - const mockResponse = { - ok: true, - json: vi.fn().mockResolvedValue({ - results: [ - { - title: 'Test Article', - text: 'Test content with sufficient length. 
'.repeat(5), - // url is missing - }, - ], - }), - }; + const mockResponse = createMockResponse({ + results: [ + { + title: 'Test Article', + text: 'Test content with sufficient length. '.repeat(5), + // url is missing + }, + ], + }); const { withTimeout } = await import('../../utils/withTimeout'); vi.mocked(withTimeout).mockResolvedValue(mockResponse as any); diff --git a/packages/web-crawler/src/crawImpl/__tests__/firecrawl.test.ts b/packages/web-crawler/src/crawImpl/__tests__/firecrawl.test.ts index 8cd3d27427..5b8216363e 100644 --- a/packages/web-crawler/src/crawImpl/__tests__/firecrawl.test.ts +++ b/packages/web-crawler/src/crawImpl/__tests__/firecrawl.test.ts @@ -1,5 +1,6 @@ import { beforeEach, describe, expect, it, vi } from 'vitest'; +import { createMockResponse } from '../../test-utils'; import { NetworkConnectionError, PageNotFoundError, TimeoutError } from '../../utils/errorType'; import { firecrawl } from '../firecrawl'; @@ -19,25 +20,23 @@ describe('firecrawl crawler', () => { it('should successfully crawl content with API key', async () => { process.env.FIRECRAWL_API_KEY = 'test-api-key'; - const mockResponse = { - ok: true, - json: vi.fn().mockResolvedValue({ - success: true, - data: { - markdown: - 'This is a test markdown content with enough length to pass validation. '.repeat(3), - metadata: { - title: 'Test Article', - description: 'Test description', - sourceURL: 'https://example.com', - statusCode: 200, - language: 'en', - keywords: 'test', - robots: 'index', - }, + const mockResponse = createMockResponse({ + success: true, + data: { + markdown: 'This is a test markdown content with enough length to pass validation. 
'.repeat( + 3, + ), + metadata: { + title: 'Test Article', + description: 'Test description', + sourceURL: 'https://example.com', + statusCode: 200, + language: 'en', + keywords: 'test', + robots: 'index', }, - }), - }; + }, + }); const { withTimeout } = await import('../../utils/withTimeout'); vi.mocked(withTimeout).mockResolvedValue(mockResponse as any); @@ -55,58 +54,52 @@ describe('firecrawl crawler', () => { url: 'https://example.com', }); - expect(withTimeout).toHaveBeenCalledWith(expect.any(Promise), 30000); + expect(withTimeout).toHaveBeenCalledWith(expect.any(Function), 30000); }); it('should handle missing API key', async () => { - const mockResponse = { - ok: true, - json: vi.fn().mockResolvedValue({ - success: true, - data: { - markdown: 'Test content with sufficient length. '.repeat(5), - metadata: { - title: 'Test', - description: 'Test', - sourceURL: 'https://example.com', - statusCode: 200, - language: 'en', - keywords: 'test', - robots: 'index', - }, + const mockResponse = createMockResponse({ + success: true, + data: { + markdown: 'Test content with sufficient length. 
'.repeat(5), + metadata: { + title: 'Test', + description: 'Test', + sourceURL: 'https://example.com', + statusCode: 200, + language: 'en', + keywords: 'test', + robots: 'index', }, - }), - }; + }, + }); const { withTimeout } = await import('../../utils/withTimeout'); vi.mocked(withTimeout).mockResolvedValue(mockResponse as any); await firecrawl('https://example.com', { filterOptions: {} }); - expect(withTimeout).toHaveBeenCalledWith(expect.any(Promise), 30000); + expect(withTimeout).toHaveBeenCalledWith(expect.any(Function), 30000); }); it('should return undefined for short content', async () => { process.env.FIRECRAWL_API_KEY = 'test-api-key'; - const mockResponse = { - ok: true, - json: vi.fn().mockResolvedValue({ - success: true, - data: { - markdown: 'Short', // Content too short - metadata: { - title: 'Test', - description: 'Test', - sourceURL: 'https://example.com', - statusCode: 200, - language: 'en', - keywords: 'test', - robots: 'index', - }, + const mockResponse = createMockResponse({ + success: true, + data: { + markdown: 'Short', // Content too short + metadata: { + title: 'Test', + description: 'Test', + sourceURL: 'https://example.com', + statusCode: 200, + language: 'en', + keywords: 'test', + robots: 'index', }, - }), - }; + }, + }); const { withTimeout } = await import('../../utils/withTimeout'); vi.mocked(withTimeout).mockResolvedValue(mockResponse as any); @@ -119,24 +112,21 @@ describe('firecrawl crawler', () => { it('should return undefined when markdown is missing', async () => { process.env.FIRECRAWL_API_KEY = 'test-api-key'; - const mockResponse = { - ok: true, - json: vi.fn().mockResolvedValue({ - success: true, - data: { - // markdown is missing - metadata: { - title: 'Test', - description: 'Test', - sourceURL: 'https://example.com', - statusCode: 200, - language: 'en', - keywords: 'test', - robots: 'index', - }, + const mockResponse = createMockResponse({ + success: true, + data: { + // markdown is missing + metadata: { + title: 'Test', 
+ description: 'Test', + sourceURL: 'https://example.com', + statusCode: 200, + language: 'en', + keywords: 'test', + robots: 'index', }, - }), - }; + }, + }); const { withTimeout } = await import('../../utils/withTimeout'); vi.mocked(withTimeout).mockResolvedValue(mockResponse as any); @@ -149,11 +139,11 @@ describe('firecrawl crawler', () => { it('should throw PageNotFoundError for 404 status', async () => { process.env.FIRECRAWL_API_KEY = 'test-api-key'; - const mockResponse = { + const mockResponse = createMockResponse('Not Found', { ok: false, status: 404, statusText: 'Not Found', - }; + }); const { withTimeout } = await import('../../utils/withTimeout'); vi.mocked(withTimeout).mockResolvedValue(mockResponse as any); @@ -166,11 +156,11 @@ describe('firecrawl crawler', () => { it('should throw error for other HTTP errors', async () => { process.env.FIRECRAWL_API_KEY = 'test-api-key'; - const mockResponse = { + const mockResponse = createMockResponse('', { ok: false, status: 500, statusText: 'Internal Server Error', - }; + }); const { withTimeout } = await import('../../utils/withTimeout'); vi.mocked(withTimeout).mockResolvedValue(mockResponse as any); @@ -184,7 +174,7 @@ describe('firecrawl crawler', () => { process.env.FIRECRAWL_API_KEY = 'test-api-key'; const { withTimeout } = await import('../../utils/withTimeout'); - vi.mocked(withTimeout).mockRejectedValue(new Error('fetch failed')); + vi.mocked(withTimeout).mockRejectedValue(new TypeError('fetch failed')); await expect(firecrawl('https://example.com', { filterOptions: {} })).rejects.toThrow( NetworkConnectionError, @@ -217,54 +207,49 @@ describe('firecrawl crawler', () => { ); }); - it('should return undefined when JSON parsing fails', async () => { + it('should throw ResponseBodyParseError when JSON parsing fails', async () => { process.env.FIRECRAWL_API_KEY = 'test-api-key'; - const mockResponse = { - ok: true, + const mockResponse = createMockResponse('not json', { ok: true }); + mockResponse.json = 
vi.fn().mockRejectedValue(new Error('Invalid JSON')); + mockResponse.clone.mockReturnValue({ + ...mockResponse, json: vi.fn().mockRejectedValue(new Error('Invalid JSON')), - }; + text: vi.fn().mockResolvedValue('not json'), + }); const { withTimeout } = await import('../../utils/withTimeout'); vi.mocked(withTimeout).mockResolvedValue(mockResponse as any); - const consoleSpy = vi.spyOn(console, 'error').mockImplementation(() => {}); - - const result = await firecrawl('https://example.com', { filterOptions: {} }); - - expect(result).toBeUndefined(); - expect(consoleSpy).toHaveBeenCalled(); - - consoleSpy.mockRestore(); + await expect(firecrawl('https://example.com', { filterOptions: {} })).rejects.toThrow( + 'Firecrawl returned non-JSON response: not json', + ); }); it('should handle metadata with all optional fields', async () => { process.env.FIRECRAWL_API_KEY = 'test-api-key'; - const mockResponse = { - ok: true, - json: vi.fn().mockResolvedValue({ - success: true, - data: { - markdown: 'Complete test content with all metadata fields provided. '.repeat(3), - metadata: { - title: 'Complete Test Article', - description: 'Complete test description', - keywords: 'test,complete,article', - language: 'en', - ogDescription: 'OG description', - ogImage: 'https://example.com/image.jpg', - ogLocaleAlternate: ['en-US', 'fr-FR'], - ogSiteName: 'Example Site', - ogTitle: 'OG Title', - ogUrl: 'https://example.com/og', - robots: 'index,follow', - statusCode: 200, - sourceURL: 'https://example.com', - }, + const mockResponse = createMockResponse({ + success: true, + data: { + markdown: 'Complete test content with all metadata fields provided. 
'.repeat(3), + metadata: { + title: 'Complete Test Article', + description: 'Complete test description', + keywords: 'test,complete,article', + language: 'en', + ogDescription: 'OG description', + ogImage: 'https://example.com/image.jpg', + ogLocaleAlternate: ['en-US', 'fr-FR'], + ogSiteName: 'Example Site', + ogTitle: 'OG Title', + ogUrl: 'https://example.com/og', + robots: 'index,follow', + statusCode: 200, + sourceURL: 'https://example.com', }, - }), - }; + }, + }); const { withTimeout } = await import('../../utils/withTimeout'); vi.mocked(withTimeout).mockResolvedValue(mockResponse as any); diff --git a/packages/web-crawler/src/crawImpl/__tests__/jina.test.ts b/packages/web-crawler/src/crawImpl/__tests__/jina.test.ts index 6798c2896c..fdf8efe4e4 100644 --- a/packages/web-crawler/src/crawImpl/__tests__/jina.test.ts +++ b/packages/web-crawler/src/crawImpl/__tests__/jina.test.ts @@ -1,29 +1,44 @@ import { beforeEach, describe, expect, it, vi } from 'vitest'; +import { createMockResponse } from '../../test-utils'; +import * as withTimeoutModule from '../../utils/withTimeout'; import { jina } from '../jina'; +// Mock withTimeout to just call the factory function directly (bypassing real timeout) +vi.spyOn(withTimeoutModule, 'withTimeout').mockImplementation((fn) => + fn(new AbortController().signal), +); + describe('jina crawler', () => { const mockFetch = vi.fn(); global.fetch = mockFetch; beforeEach(() => { vi.resetAllMocks(); + // Re-apply the withTimeout spy after resetAllMocks + vi.spyOn(withTimeoutModule, 'withTimeout').mockImplementation((fn) => + fn(new AbortController().signal), + ); }); it('should crawl url successfully', async () => { - const mockResponse = { - ok: true, - json: () => - Promise.resolve({ - code: 200, - data: { - content: 'test content', - description: 'test description', - siteName: 'test site', - title: 'test title', - }, - }), - }; + const testContent = + 'This is a test content that is long enough to pass the minimum length validation 
check. '.repeat( + 2, + ); + + const mockResponse = createMockResponse( + { + code: 200, + data: { + content: testContent, + description: 'test description', + siteName: 'test site', + title: 'test title', + }, + }, + { ok: true }, + ); mockFetch.mockResolvedValue(mockResponse); @@ -38,13 +53,14 @@ describe('jina crawler', () => { 'Authorization': 'Bearer test-key', 'x-send-from': 'LobeChat Community', }, + signal: expect.any(AbortSignal), }); expect(result).toEqual({ - content: 'test content', + content: testContent, contentType: 'text', description: 'test description', - length: 12, + length: testContent.length, siteName: 'test site', title: 'test title', url: 'https://example.com', @@ -54,16 +70,15 @@ describe('jina crawler', () => { it('should use JINA_READER_API_KEY from env if apiKey not provided', async () => { process.env.JINA_READER_API_KEY = 'env-reader-key'; - const mockResponse = { - ok: true, - json: () => - Promise.resolve({ - code: 200, - data: { - content: 'test content', - }, - }), - }; + const mockResponse = createMockResponse( + { + code: 200, + data: { + content: 'test content', + }, + }, + { ok: true }, + ); mockFetch.mockResolvedValue(mockResponse); @@ -75,6 +90,7 @@ describe('jina crawler', () => { 'Authorization': 'Bearer env-reader-key', 'x-send-from': 'LobeChat Community', }, + signal: expect.any(AbortSignal), }); delete process.env.JINA_READER_API_KEY; @@ -83,16 +99,15 @@ describe('jina crawler', () => { it('should use JINA_API_KEY from env if apiKey and JINA_READER_API_KEY not provided', async () => { process.env.JINA_API_KEY = 'env-key'; - const mockResponse = { - ok: true, - json: () => - Promise.resolve({ - code: 200, - data: { - content: 'test content', - }, - }), - }; + const mockResponse = createMockResponse( + { + code: 200, + data: { + content: 'test content', + }, + }, + { ok: true }, + ); mockFetch.mockResolvedValue(mockResponse); @@ -104,22 +119,22 @@ describe('jina crawler', () => { 'Authorization': 'Bearer env-key', 
'x-send-from': 'LobeChat Community', }, + signal: expect.any(AbortSignal), }); delete process.env.JINA_API_KEY; }); it('should send empty Authorization header if no api key provided', async () => { - const mockResponse = { - ok: true, - json: () => - Promise.resolve({ - code: 200, - data: { - content: 'test content', - }, - }), - }; + const mockResponse = createMockResponse( + { + code: 200, + data: { + content: 'test content', + }, + }, + { ok: true }, + ); mockFetch.mockResolvedValue(mockResponse); @@ -131,11 +146,14 @@ describe('jina crawler', () => { 'Authorization': '', 'x-send-from': 'LobeChat Community', }, + signal: expect.any(AbortSignal), }); }); it('should return undefined if response is not ok', async () => { - mockFetch.mockResolvedValue({ ok: false }); + mockFetch.mockResolvedValue( + createMockResponse(null, { ok: false, status: 500, statusText: 'Internal Server Error' }), + ); const result = await jina('https://example.com', { filterOptions: {} }); @@ -143,14 +161,13 @@ describe('jina crawler', () => { }); it('should return undefined if response code is not 200', async () => { - const mockResponse = { - ok: true, - json: () => - Promise.resolve({ - code: 400, - message: 'Bad Request', - }), - }; + const mockResponse = createMockResponse( + { + code: 400, + message: 'Bad Request', + }, + { ok: true }, + ); mockFetch.mockResolvedValue(mockResponse); @@ -159,11 +176,11 @@ describe('jina crawler', () => { expect(result).toBeUndefined(); }); - it('should return undefined if fetch throws error', async () => { + it('should throw error if fetch throws non-fetch-failed error', async () => { mockFetch.mockRejectedValue(new Error('Network error')); - const result = await jina('https://example.com', { filterOptions: {} }); - - expect(result).toBeUndefined(); + await expect(jina('https://example.com', { filterOptions: {} })).rejects.toThrow( + 'Network error', + ); }); }); diff --git a/packages/web-crawler/src/crawImpl/__tests__/naive.test.ts 
b/packages/web-crawler/src/crawImpl/__tests__/naive.test.ts index 1848c95b9d..464a5653de 100644 --- a/packages/web-crawler/src/crawImpl/__tests__/naive.test.ts +++ b/packages/web-crawler/src/crawImpl/__tests__/naive.test.ts @@ -22,9 +22,10 @@ describe('naive crawler', () => { vi.clearAllMocks(); }); - it('should return undefined for normal pages (due to cloudflare logic)', async () => { + it('should return content for normal pages', async () => { const mockResponse = { status: 200, + ok: true, headers: new Map([['content-type', 'text/html']]), text: vi.fn().mockResolvedValue('Test content'), }; @@ -34,8 +35,8 @@ describe('naive crawler', () => { const { htmlToMarkdown } = await import('../../utils/htmlToMarkdown'); vi.mocked(htmlToMarkdown).mockReturnValue({ - content: 'Test content'.padEnd(101, ' '), // Ensure length > 100 - title: 'Normal Page Title', // Not "Just a moment..." so it returns undefined + content: 'Test content'.padEnd(101, ' '), + title: 'Normal Page Title', description: 'Test description', siteName: 'Test Site', length: 101, @@ -43,13 +44,22 @@ describe('naive crawler', () => { const result = await naive('https://example.com', { filterOptions: {} }); - expect(result).toBeUndefined(); + expect(result).toEqual({ + content: 'Test content'.padEnd(101, ' '), + contentType: 'text', + description: 'Test description', + length: 101, + siteName: 'Test Site', + title: 'Normal Page Title', + url: 'https://example.com', + }); }); it('should successfully crawl JSON content', async () => { const mockJsonData = { message: 'Hello world', data: [1, 2, 3] }; const mockResponse = { status: 200, + ok: true, headers: new Map([['content-type', 'application/json']]), clone: () => ({ json: vi.fn().mockResolvedValue(mockJsonData), @@ -74,6 +84,7 @@ describe('naive crawler', () => { const mockText = '{"invalid": json}'; const mockResponse = { status: 200, + ok: true, headers: new Map([['content-type', 'application/json']]), clone: () => ({ json: 
vi.fn().mockRejectedValue(new Error('Invalid JSON')), @@ -97,6 +108,7 @@ describe('naive crawler', () => { it('should return undefined for short content', async () => { const mockResponse = { status: 200, + ok: true, headers: new Map([['content-type', 'text/html']]), text: vi.fn().mockResolvedValue('Short'), }; @@ -116,9 +128,10 @@ describe('naive crawler', () => { expect(result).toBeUndefined(); }); - it('should return content when NOT blocked by Cloudflare', async () => { + it('should return undefined when blocked by Cloudflare', async () => { const mockResponse = { status: 200, + ok: true, headers: new Map([['content-type', 'text/html']]), text: vi.fn().mockResolvedValue('Normal content'), }; @@ -129,7 +142,7 @@ describe('naive crawler', () => { const { htmlToMarkdown } = await import('../../utils/htmlToMarkdown'); vi.mocked(htmlToMarkdown).mockReturnValue({ content: 'Test content'.padEnd(101, ' '), - title: 'Just a moment...', // Cloudflare blocking page - this will cause return + title: 'Just a moment...', // Cloudflare blocking page description: 'Test description', siteName: 'Test Site', length: 101, @@ -137,15 +150,21 @@ describe('naive crawler', () => { const result = await naive('https://example.com', { filterOptions: {} }); - expect(result).toEqual({ - content: 'Test content'.padEnd(101, ' '), - contentType: 'text', - description: 'Test description', - length: 101, - siteName: 'Test Site', - title: 'Just a moment...', - url: 'https://example.com', - }); + expect(result).toBeUndefined(); + }); + + it('should throw error for non-ok status codes', async () => { + const mockResponse = { + status: 500, + ok: false, + statusText: 'Internal Server Error', + text: vi.fn().mockResolvedValue('Server Error'), + }; + + const { withTimeout } = await import('../../utils/withTimeout'); + vi.mocked(withTimeout).mockResolvedValue(mockResponse as any); + + await expect(naive('https://example.com', { filterOptions: {} })).rejects.toThrow(/500/); }); it('should throw 
PageNotFoundError for 404 status', async () => { @@ -164,7 +183,7 @@ describe('naive crawler', () => { it('should throw NetworkConnectionError for fetch failures', async () => { const { withTimeout } = await import('../../utils/withTimeout'); - vi.mocked(withTimeout).mockRejectedValue(new Error('fetch failed')); + vi.mocked(withTimeout).mockRejectedValue(new TypeError('fetch failed')); await expect(naive('https://example.com', { filterOptions: {} })).rejects.toThrow( NetworkConnectionError, @@ -194,6 +213,7 @@ describe('naive crawler', () => { it('should return undefined when HTML processing fails', async () => { const mockResponse = { status: 200, + ok: true, headers: new Map([['content-type', 'text/html']]), text: vi.fn().mockRejectedValue(new Error('Failed to read text')), }; @@ -209,6 +229,7 @@ describe('naive crawler', () => { it('should pass filter options to htmlToMarkdown', async () => { const mockResponse = { status: 200, + ok: true, headers: new Map([['content-type', 'text/html']]), text: vi.fn().mockResolvedValue('Test content'), }; diff --git a/packages/web-crawler/src/crawImpl/__tests__/search1api.test.ts b/packages/web-crawler/src/crawImpl/__tests__/search1api.test.ts index 7c22728154..2e2981f71c 100644 --- a/packages/web-crawler/src/crawImpl/__tests__/search1api.test.ts +++ b/packages/web-crawler/src/crawImpl/__tests__/search1api.test.ts @@ -1,5 +1,6 @@ import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'; +import { createMockResponse } from '../../test-utils'; import { NetworkConnectionError, PageNotFoundError, TimeoutError } from '../../utils/errorType'; import * as withTimeoutModule from '../../utils/withTimeout'; import { search1api } from '../search1api'; @@ -17,8 +18,10 @@ describe('search1api crawler', () => { originalEnv = { ...process.env }; process.env.SEARCH1API_API_KEY = 'test-api-key'; - // Mock withTimeout to directly return the promise - vi.spyOn(withTimeoutModule, 'withTimeout').mockImplementation((promise) => 
promise); + // Mock withTimeout to call the factory function directly (bypassing real timeout) + vi.spyOn(withTimeoutModule, 'withTimeout').mockImplementation((fn) => + fn(new AbortController().signal), + ); }); afterEach(() => { @@ -26,7 +29,7 @@ describe('search1api crawler', () => { }); it('should throw NetworkConnectionError when fetch fails', async () => { - mockFetch.mockRejectedValue(new Error('fetch failed')); + mockFetch.mockRejectedValue(new TypeError('fetch failed')); await expect(search1api('https://example.com', { filterOptions: {} })).rejects.toThrow( NetworkConnectionError, @@ -48,11 +51,13 @@ describe('search1api crawler', () => { }); it('should throw PageNotFoundError when status is 404', async () => { - mockFetch.mockResolvedValue({ - ok: false, - status: 404, - statusText: 'Not Found', - }); + mockFetch.mockResolvedValue( + createMockResponse('Not Found', { + ok: false, + status: 404, + statusText: 'Not Found', + }), + ); await expect(search1api('https://example.com', { filterOptions: {} })).rejects.toThrow( PageNotFoundError, @@ -60,11 +65,13 @@ describe('search1api crawler', () => { }); it('should throw error for other failed responses', async () => { - mockFetch.mockResolvedValue({ - ok: false, - status: 500, - statusText: 'Internal Server Error', - }); + mockFetch.mockResolvedValue( + createMockResponse('', { + ok: false, + status: 500, + statusText: 'Internal Server Error', + }), + ); await expect(search1api('https://example.com', { filterOptions: {} })).rejects.toThrow( 'Search1API request failed with status 500: Internal Server Error', @@ -72,18 +79,19 @@ describe('search1api crawler', () => { }); it('should return undefined when content is too short', async () => { - mockFetch.mockResolvedValue({ - ok: true, - json: () => - Promise.resolve({ + mockFetch.mockResolvedValue( + createMockResponse( + { crawlParameters: { url: 'https://example.com' }, results: { title: 'Test Title', link: 'https://example.com', content: 'Short', // Less than 
100 characters }, - }), - }); + }, + { ok: true }, + ), + ); const result = await search1api('https://example.com', { filterOptions: {} }); expect(result).toBeUndefined(); @@ -92,18 +100,19 @@ describe('search1api crawler', () => { it('should return crawl result on successful fetch', async () => { const mockContent = 'This is a test content that is longer than 100 characters. '.repeat(3); - mockFetch.mockResolvedValue({ - ok: true, - json: () => - Promise.resolve({ + mockFetch.mockResolvedValue( + createMockResponse( + { crawlParameters: { url: 'https://example.com' }, results: { title: 'Test Title', link: 'https://example.com', content: mockContent, }, - }), - }); + }, + { ok: true }, + ), + ); const result = await search1api('https://example.com', { filterOptions: {} }); @@ -116,6 +125,7 @@ describe('search1api crawler', () => { body: JSON.stringify({ url: 'https://example.com', }), + signal: expect.any(AbortSignal), }); expect(result).toEqual({ @@ -130,12 +140,18 @@ describe('search1api crawler', () => { }); it('should handle JSON parse errors', async () => { - mockFetch.mockResolvedValue({ - ok: true, - json: () => Promise.reject(new Error('Invalid JSON')), - }); + mockFetch.mockResolvedValue(createMockResponse('invalid json', { ok: true })); + // Override json to reject for this specific test + const response = createMockResponse('invalid json', { ok: true }); + response.json = () => Promise.reject(new Error('Invalid JSON')); + // clone should also return a response whose text() works for error reporting + response.clone = () => { + const cloned = createMockResponse('invalid json', { ok: true }); + cloned.json = () => Promise.reject(new Error('Invalid JSON')); + return cloned; + }; + mockFetch.mockResolvedValue(response); - const result = await search1api('https://example.com', { filterOptions: {} }); - expect(result).toBeUndefined(); + await expect(search1api('https://example.com', { filterOptions: {} })).rejects.toThrow(); }); }); diff --git 
a/packages/web-crawler/src/crawImpl/__tests__/tavily.test.ts b/packages/web-crawler/src/crawImpl/__tests__/tavily.test.ts index 3be81eab47..d8fee80e80 100644 --- a/packages/web-crawler/src/crawImpl/__tests__/tavily.test.ts +++ b/packages/web-crawler/src/crawImpl/__tests__/tavily.test.ts @@ -1,5 +1,6 @@ import { beforeEach, describe, expect, it, vi } from 'vitest'; +import { createMockResponse } from '../../test-utils'; import { NetworkConnectionError, PageNotFoundError, TimeoutError } from '../../utils/errorType'; import { tavily } from '../tavily'; @@ -19,21 +20,18 @@ describe('tavily crawler', () => { it('should successfully crawl content with API key', async () => { process.env.TAVILY_API_KEY = 'test-api-key'; - const mockResponse = { - ok: true, - json: vi.fn().mockResolvedValue({ - base_url: 'https://api.tavily.com', - response_time: 1.5, - results: [ - { - url: 'https://example.com', - raw_content: - 'This is a test raw content with sufficient length to pass validation. '.repeat(3), - images: ['https://example.com/image1.jpg', 'https://example.com/image2.jpg'], - }, - ], - }), - }; + const mockResponse = createMockResponse({ + base_url: 'https://api.tavily.com', + response_time: 1.5, + results: [ + { + url: 'https://example.com', + raw_content: + 'This is a test raw content with sufficient length to pass validation. 
'.repeat(3), + images: ['https://example.com/image1.jpg', 'https://example.com/image2.jpg'], + }, + ], + }); const { withTimeout } = await import('../../utils/withTimeout'); vi.mocked(withTimeout).mockResolvedValue(mockResponse as any); @@ -50,69 +48,60 @@ describe('tavily crawler', () => { url: 'https://example.com', }); - expect(withTimeout).toHaveBeenCalledWith(expect.any(Promise), 30000); + expect(withTimeout).toHaveBeenCalledWith(expect.any(Function), 30000); }); it('should use custom extract depth when provided', async () => { process.env.TAVILY_API_KEY = 'test-api-key'; process.env.TAVILY_EXTRACT_DEPTH = 'advanced'; - const mockResponse = { - ok: true, - json: vi.fn().mockResolvedValue({ - base_url: 'https://api.tavily.com', - response_time: 2.1, - results: [ - { - url: 'https://example.com', - raw_content: 'Advanced extraction content with more details. '.repeat(5), - }, - ], - }), - }; + const mockResponse = createMockResponse({ + base_url: 'https://api.tavily.com', + response_time: 2.1, + results: [ + { + url: 'https://example.com', + raw_content: 'Advanced extraction content with more details. '.repeat(5), + }, + ], + }); const { withTimeout } = await import('../../utils/withTimeout'); vi.mocked(withTimeout).mockResolvedValue(mockResponse as any); await tavily('https://example.com', { filterOptions: {} }); - expect(withTimeout).toHaveBeenCalledWith(expect.any(Promise), 30000); + expect(withTimeout).toHaveBeenCalledWith(expect.any(Function), 30000); }); it('should handle missing API key', async () => { - const mockResponse = { - ok: true, - json: vi.fn().mockResolvedValue({ - base_url: 'https://api.tavily.com', - response_time: 1.2, - results: [ - { - url: 'https://example.com', - raw_content: 'Test content with sufficient length. 
'.repeat(5), - }, - ], - }), - }; + const mockResponse = createMockResponse({ + base_url: 'https://api.tavily.com', + response_time: 1.2, + results: [ + { + url: 'https://example.com', + raw_content: 'Test content with sufficient length. '.repeat(5), + }, + ], + }); const { withTimeout } = await import('../../utils/withTimeout'); vi.mocked(withTimeout).mockResolvedValue(mockResponse as any); await tavily('https://example.com', { filterOptions: {} }); - expect(withTimeout).toHaveBeenCalledWith(expect.any(Promise), 30000); + expect(withTimeout).toHaveBeenCalledWith(expect.any(Function), 30000); }); it('should return undefined when no results are returned', async () => { process.env.TAVILY_API_KEY = 'test-api-key'; - const mockResponse = { - ok: true, - json: vi.fn().mockResolvedValue({ - base_url: 'https://api.tavily.com', - response_time: 0.8, - results: [], - }), - }; + const mockResponse = createMockResponse({ + base_url: 'https://api.tavily.com', + response_time: 0.8, + results: [], + }); const { withTimeout } = await import('../../utils/withTimeout'); vi.mocked(withTimeout).mockResolvedValue(mockResponse as any); @@ -133,19 +122,16 @@ describe('tavily crawler', () => { it('should return undefined for short content', async () => { process.env.TAVILY_API_KEY = 'test-api-key'; - const mockResponse = { - ok: true, - json: vi.fn().mockResolvedValue({ - base_url: 'https://api.tavily.com', - response_time: 1.1, - results: [ - { - url: 'https://example.com', - raw_content: 'Short', // Content too short - }, - ], - }), - }; + const mockResponse = createMockResponse({ + base_url: 'https://api.tavily.com', + response_time: 1.1, + results: [ + { + url: 'https://example.com', + raw_content: 'Short', // Content too short + }, + ], + }); const { withTimeout } = await import('../../utils/withTimeout'); vi.mocked(withTimeout).mockResolvedValue(mockResponse as any); @@ -158,20 +144,17 @@ describe('tavily crawler', () => { it('should return undefined when raw_content is missing', 
async () => { process.env.TAVILY_API_KEY = 'test-api-key'; - const mockResponse = { - ok: true, - json: vi.fn().mockResolvedValue({ - base_url: 'https://api.tavily.com', - response_time: 1, - results: [ - { - url: 'https://example.com', - // raw_content is missing - images: ['https://example.com/image.jpg'], - }, - ], - }), - }; + const mockResponse = createMockResponse({ + base_url: 'https://api.tavily.com', + response_time: 1, + results: [ + { + url: 'https://example.com', + // raw_content is missing + images: ['https://example.com/image.jpg'], + }, + ], + }); const { withTimeout } = await import('../../utils/withTimeout'); vi.mocked(withTimeout).mockResolvedValue(mockResponse as any); @@ -184,11 +167,11 @@ describe('tavily crawler', () => { it('should throw PageNotFoundError for 404 status', async () => { process.env.TAVILY_API_KEY = 'test-api-key'; - const mockResponse = { + const mockResponse = createMockResponse('Not Found', { ok: false, status: 404, statusText: 'Not Found', - }; + }); const { withTimeout } = await import('../../utils/withTimeout'); vi.mocked(withTimeout).mockResolvedValue(mockResponse as any); @@ -201,11 +184,11 @@ describe('tavily crawler', () => { it('should throw error for other HTTP errors', async () => { process.env.TAVILY_API_KEY = 'test-api-key'; - const mockResponse = { + const mockResponse = createMockResponse('', { ok: false, status: 500, statusText: 'Internal Server Error', - }; + }); const { withTimeout } = await import('../../utils/withTimeout'); vi.mocked(withTimeout).mockResolvedValue(mockResponse as any); @@ -219,7 +202,7 @@ describe('tavily crawler', () => { process.env.TAVILY_API_KEY = 'test-api-key'; const { withTimeout } = await import('../../utils/withTimeout'); - vi.mocked(withTimeout).mockRejectedValue(new Error('fetch failed')); + vi.mocked(withTimeout).mockRejectedValue(new TypeError('fetch failed')); await expect(tavily('https://example.com', { filterOptions: {} })).rejects.toThrow( NetworkConnectionError, @@ 
-252,43 +235,38 @@ describe('tavily crawler', () => { ); }); - it('should return undefined when JSON parsing fails', async () => { + it('should throw ResponseBodyParseError when JSON parsing fails', async () => { process.env.TAVILY_API_KEY = 'test-api-key'; - const mockResponse = { - ok: true, + const mockResponse = createMockResponse('not json', { ok: true }); + mockResponse.json = vi.fn().mockRejectedValue(new Error('Invalid JSON')); + mockResponse.clone.mockReturnValue({ + ...mockResponse, json: vi.fn().mockRejectedValue(new Error('Invalid JSON')), - }; + text: vi.fn().mockResolvedValue('not json'), + }); const { withTimeout } = await import('../../utils/withTimeout'); vi.mocked(withTimeout).mockResolvedValue(mockResponse as any); - const consoleSpy = vi.spyOn(console, 'error').mockImplementation(() => {}); - - const result = await tavily('https://example.com', { filterOptions: {} }); - - expect(result).toBeUndefined(); - expect(consoleSpy).toHaveBeenCalled(); - - consoleSpy.mockRestore(); + await expect(tavily('https://example.com', { filterOptions: {} })).rejects.toThrow( + 'Tavily returned non-JSON response: not json', + ); }); it('should use result URL when available', async () => { process.env.TAVILY_API_KEY = 'test-api-key'; - const mockResponse = { - ok: true, - json: vi.fn().mockResolvedValue({ - base_url: 'https://api.tavily.com', - response_time: 1.3, - results: [ - { - url: 'https://redirected.example.com', - raw_content: 'Test content with sufficient length. '.repeat(5), - }, - ], - }), - }; + const mockResponse = createMockResponse({ + base_url: 'https://api.tavily.com', + response_time: 1.3, + results: [ + { + url: 'https://redirected.example.com', + raw_content: 'Test content with sufficient length. 
'.repeat(5), + }, + ], + }); const { withTimeout } = await import('../../utils/withTimeout'); vi.mocked(withTimeout).mockResolvedValue(mockResponse as any); @@ -301,19 +279,16 @@ describe('tavily crawler', () => { it('should fallback to original URL when result URL is missing', async () => { process.env.TAVILY_API_KEY = 'test-api-key'; - const mockResponse = { - ok: true, - json: vi.fn().mockResolvedValue({ - base_url: 'https://api.tavily.com', - response_time: 1.4, - results: [ - { - raw_content: 'Test content with sufficient length. '.repeat(5), - // url is missing - }, - ], - }), - }; + const mockResponse = createMockResponse({ + base_url: 'https://api.tavily.com', + response_time: 1.4, + results: [ + { + raw_content: 'Test content with sufficient length. '.repeat(5), + // url is missing + }, + ], + }); const { withTimeout } = await import('../../utils/withTimeout'); vi.mocked(withTimeout).mockResolvedValue(mockResponse as any); @@ -326,20 +301,17 @@ describe('tavily crawler', () => { it('should handle failed results in response', async () => { process.env.TAVILY_API_KEY = 'test-api-key'; - const mockResponse = { - ok: true, - json: vi.fn().mockResolvedValue({ - base_url: 'https://api.tavily.com', - response_time: 1.6, - results: [], - failed_results: [ - { - url: 'https://example.com', - error: 'Page not accessible', - }, - ], - }), - }; + const mockResponse = createMockResponse({ + base_url: 'https://api.tavily.com', + response_time: 1.6, + results: [], + failed_results: [ + { + url: 'https://example.com', + error: 'Page not accessible', + }, + ], + }); const { withTimeout } = await import('../../utils/withTimeout'); vi.mocked(withTimeout).mockResolvedValue(mockResponse as any); diff --git a/packages/web-crawler/src/crawImpl/browserless.ts b/packages/web-crawler/src/crawImpl/browserless.ts index b07f11c6d1..44f4464983 100644 --- a/packages/web-crawler/src/crawImpl/browserless.ts +++ b/packages/web-crawler/src/crawImpl/browserless.ts @@ -2,7 +2,10 @@ import qs 
from 'query-string'; import urlJoin from 'url-join'; import type { CrawlImpl, CrawlSuccessResult } from '../type'; +import { PageNotFoundError, toFetchError } from '../utils/errorType'; import { htmlToMarkdown } from '../utils/htmlToMarkdown'; +import { createHTTPStatusError } from '../utils/response'; +import { DEFAULT_TIMEOUT, withTimeout } from '../utils/withTimeout'; const BASE_URL = process.env.BROWSERLESS_URL ?? 'https://chrome.browserless.io'; // Allowed file types: html, css, js, json, xml, webmanifest, txt, md @@ -31,46 +34,62 @@ export const browserless: CrawlImpl = async (url, { filterOptions }) => { url, }; + let res: Response; + try { - const res = await fetch( - qs.stringifyUrl({ - query: { - blockAds: BROWSERLESS_BLOCK_ADS, - launch: JSON.stringify({ stealth: BROWSERLESS_STEALTH_MODE }), - token: BROWSERLESS_TOKEN, - }, - url: urlJoin(BASE_URL, '/content'), - }), - { - body: JSON.stringify(input), - headers: { - 'Content-Type': 'application/json', - }, - method: 'POST', - }, + res = await withTimeout( + (signal) => + fetch( + qs.stringifyUrl({ + query: { + blockAds: BROWSERLESS_BLOCK_ADS, + launch: JSON.stringify({ stealth: BROWSERLESS_STEALTH_MODE }), + token: BROWSERLESS_TOKEN, + }, + url: urlJoin(BASE_URL, '/content'), + }), + { + body: JSON.stringify(input), + headers: { + 'Content-Type': 'application/json', + }, + method: 'POST', + signal, + }, + ), + DEFAULT_TIMEOUT, ); - const html = await res.text(); + } catch (e) { + throw toFetchError(e); + } - const result = htmlToMarkdown(html, { filterOptions, url }); - - if ( - !!result.content && - result.title && - // "Just a moment..." indicates being blocked by CloudFlare - result.title.trim() !== 'Just a moment...' 
- ) { - return { - content: result.content, - contentType: 'text', - description: result?.description, - length: result.length, - siteName: result?.siteName, - title: result?.title, - url, - } satisfies CrawlSuccessResult; + if (!res.ok) { + if (res.status === 404) { + throw new PageNotFoundError(res.statusText); } - } catch (error) { - console.error(error); + + throw await createHTTPStatusError(res, 'Browserless'); + } + + const html = await res.text(); + const result = htmlToMarkdown(html, { filterOptions, url }); + + if ( + !!result.content && + result.content.length > 100 && + result.title && + // "Just a moment..." indicates being blocked by CloudFlare + result.title.trim() !== 'Just a moment...' + ) { + return { + content: result.content, + contentType: 'text', + description: result?.description, + length: result.length, + siteName: result?.siteName, + title: result?.title, + url, + } satisfies CrawlSuccessResult; } return; diff --git a/packages/web-crawler/src/crawImpl/exa.ts b/packages/web-crawler/src/crawImpl/exa.ts index dd4e031b85..f1e15a770b 100644 --- a/packages/web-crawler/src/crawImpl/exa.ts +++ b/packages/web-crawler/src/crawImpl/exa.ts @@ -1,5 +1,6 @@ import type { CrawlImpl, CrawlSuccessResult } from '../type'; -import { NetworkConnectionError, PageNotFoundError, TimeoutError } from '../utils/errorType'; +import { PageNotFoundError, toFetchError } from '../utils/errorType'; +import { createHTTPStatusError, parseJSONResponse } from '../utils/response'; import { DEFAULT_TIMEOUT, withTimeout } from '../utils/withTimeout'; interface ExaResults { @@ -27,31 +28,24 @@ export const exa: CrawlImpl = async (url) => { try { res = await withTimeout( - fetch('https://api.exa.ai/contents', { - body: JSON.stringify({ - livecrawl: 'fallback', // always, fallback - text: true, - urls: [url], + (signal) => + fetch('https://api.exa.ai/contents', { + body: JSON.stringify({ + livecrawl: 'fallback', // always, fallback + text: true, + urls: [url], + }), + headers: { + 
'Content-Type': 'application/json', + 'x-api-key': !apiKey ? '' : apiKey, + }, + method: 'POST', + signal, }), - headers: { - 'Content-Type': 'application/json', - 'x-api-key': !apiKey ? '' : apiKey, - }, - method: 'POST', - }), DEFAULT_TIMEOUT, ); } catch (e) { - const error = e as Error; - if (error.message === 'fetch failed') { - throw new NetworkConnectionError(); - } - - if (error instanceof TimeoutError) { - throw error; - } - - throw e; + throw toFetchError(e); } if (!res.ok) { @@ -59,35 +53,29 @@ export const exa: CrawlImpl = async (url) => { throw new PageNotFoundError(res.statusText); } - throw new Error(`Exa request failed with status ${res.status}: ${res.statusText}`); + throw await createHTTPStatusError(res, 'Exa'); } - try { - const data = (await res.json()) as ExaResponse; + const data = await parseJSONResponse(res, 'Exa'); - if (!data.results || data.results.length === 0) { - console.warn('Exa API returned no results for URL:', url); - return; - } - - const firstResult = data.results[0]; - - // Check if content is empty or too short - if (!firstResult.text || firstResult.text.length < 100) { - return; - } - - return { - content: firstResult.text, - contentType: 'text', - length: firstResult.text.length, - siteName: new URL(url).hostname, - title: firstResult.title, - url: firstResult.url || url, - } satisfies CrawlSuccessResult; - } catch (error) { - console.error(error); + if (!data.results || data.results.length === 0) { + console.warn('Exa API returned no results for URL:', url); + return; } - return; + const firstResult = data.results[0]; + + // Check if content is empty or too short + if (!firstResult.text || firstResult.text.length < 100) { + return; + } + + return { + content: firstResult.text, + contentType: 'text', + length: firstResult.text.length, + siteName: new URL(url).hostname, + title: firstResult.title, + url: firstResult.url || url, + } satisfies CrawlSuccessResult; }; diff --git a/packages/web-crawler/src/crawImpl/firecrawl.ts 
b/packages/web-crawler/src/crawImpl/firecrawl.ts index e63e1b7338..74d9902be1 100644 --- a/packages/web-crawler/src/crawImpl/firecrawl.ts +++ b/packages/web-crawler/src/crawImpl/firecrawl.ts @@ -1,5 +1,6 @@ import type { CrawlImpl, CrawlSuccessResult } from '../type'; -import { NetworkConnectionError, PageNotFoundError, TimeoutError } from '../utils/errorType'; +import { PageNotFoundError, toFetchError } from '../utils/errorType'; +import { createHTTPStatusError, parseJSONResponse } from '../utils/response'; import { DEFAULT_TIMEOUT, withTimeout } from '../utils/withTimeout'; interface FirecrawlMetadata { @@ -57,30 +58,23 @@ export const firecrawl: CrawlImpl = async (url) => { try { res = await withTimeout( - fetch(`${baseUrl}/scrape`, { - body: JSON.stringify({ - formats: ['markdown'], // ["markdown", "html"] - url, + (signal) => + fetch(`${baseUrl}/scrape`, { + body: JSON.stringify({ + formats: ['markdown'], // ["markdown", "html"] + url, + }), + headers: { + 'Authorization': !apiKey ? '' : `Bearer ${apiKey}`, + 'Content-Type': 'application/json', + }, + method: 'POST', + signal, }), - headers: { - 'Authorization': !apiKey ? 
'' : `Bearer ${apiKey}`, - 'Content-Type': 'application/json', - }, - method: 'POST', - }), DEFAULT_TIMEOUT, ); } catch (e) { - const error = e as Error; - if (error.message === 'fetch failed') { - throw new NetworkConnectionError(); - } - - if (error instanceof TimeoutError) { - throw error; - } - - throw e; + throw toFetchError(e); } if (!res.ok) { @@ -88,37 +82,34 @@ export const firecrawl: CrawlImpl = async (url) => { throw new PageNotFoundError(res.statusText); } - throw new Error(`Firecrawl request failed with status ${res.status}: ${res.statusText}`); + throw await createHTTPStatusError(res, 'Firecrawl'); } - try { - const data = (await res.json()) as FirecrawlResponse; - - if (data.data.warning) { - console.warn('[Firecrawl] Warning:', data.data.warning); - } - - if (data.data.metadata.error) { - console.error('[Firecrawl] Metadata error:', data.data.metadata.error); - } - - // Check if content is empty or too short - if (!data.data.markdown || data.data.markdown.length < 100) { - return; - } - - return { - content: data.data.markdown, - contentType: 'text', - description: data.data.metadata.description || '', - length: data.data.markdown.length, - siteName: new URL(url).hostname, - title: data.data.metadata.title || '', - url: url, - } satisfies CrawlSuccessResult; - } catch (error) { - console.error('[Firecrawl] Parse error:', error); + const data = await parseJSONResponse(res, 'Firecrawl'); + if (!data.data) { + throw new Error('Firecrawl response missing data field'); } - return; + if (data.data.warning) { + console.warn('[Firecrawl] Warning:', data.data.warning); + } + + if (data.data.metadata.error) { + console.error('[Firecrawl] Metadata error:', data.data.metadata.error); + } + + // Check if content is empty or too short + if (!data.data.markdown || data.data.markdown.length < 100) { + return; + } + + return { + content: data.data.markdown, + contentType: 'text', + description: data.data.metadata.description || '', + length: 
data.data.markdown.length, + siteName: new URL(url).hostname, + title: data.data.metadata.title || '', + url, + } satisfies CrawlSuccessResult; }; diff --git a/packages/web-crawler/src/crawImpl/jina.ts b/packages/web-crawler/src/crawImpl/jina.ts index d4fe9ba7b8..ccdb9998ef 100644 --- a/packages/web-crawler/src/crawImpl/jina.ts +++ b/packages/web-crawler/src/crawImpl/jina.ts @@ -1,37 +1,59 @@ import type { CrawlImpl } from '../type'; +import { toFetchError } from '../utils/errorType'; +import { parseJSONResponse } from '../utils/response'; +import { DEFAULT_TIMEOUT, withTimeout } from '../utils/withTimeout'; export const jina: CrawlImpl<{ apiKey?: string }> = async (url, params) => { const token = params.apiKey ?? process.env.JINA_READER_API_KEY ?? process.env.JINA_API_KEY; + let res: Response; try { - const res = await fetch(`https://r.jina.ai/${url}`, { - headers: { - 'Accept': 'application/json', - 'Authorization': token ? `Bearer ${token}` : '', - 'x-send-from': 'LobeChat Community', - }, - }); - - if (res.ok) { - const json = await res.json(); - if (json.code === 200) { - const result = json.data; - return { - content: result.content, - contentType: 'text', - description: result?.description, - length: result.content.length, - siteName: result?.siteName, - title: result?.title, - url: url, - }; - } - - throw json; - } - } catch (error) { - console.error(error); + res = await withTimeout( + (signal) => + fetch(`https://r.jina.ai/${url}`, { + headers: { + 'Accept': 'application/json', + 'Authorization': token ? 
`Bearer ${token}` : '', + 'x-send-from': 'LobeChat Community', + }, + signal, + }), + DEFAULT_TIMEOUT, + ); + } catch (e) { + throw toFetchError(e); } - return; + if (!res.ok) { + return; + } + + const json = await parseJSONResponse<{ + code: number; + data: { + content: string; + description?: string; + siteName?: string; + title?: string; + }; + }>(res, 'Jina'); + + if (json.code !== 200) { + return; + } + + const result = json.data; + if (!result?.content || result.content.length < 100) { + return; + } + + return { + content: result.content, + contentType: 'text', + description: result?.description, + length: result.content.length, + siteName: result?.siteName, + title: result?.title, + url, + }; }; diff --git a/packages/web-crawler/src/crawImpl/naive.ts b/packages/web-crawler/src/crawImpl/naive.ts index f0af72f5ba..05f780b06e 100755 --- a/packages/web-crawler/src/crawImpl/naive.ts +++ b/packages/web-crawler/src/crawImpl/naive.ts @@ -1,8 +1,9 @@ import { ssrfSafeFetch } from '@lobechat/ssrf-safe-fetch'; import type { CrawlImpl, CrawlSuccessResult } from '../type'; -import { NetworkConnectionError, PageNotFoundError, TimeoutError } from '../utils/errorType'; +import { PageNotFoundError, toFetchError } from '../utils/errorType'; import { htmlToMarkdown } from '../utils/htmlToMarkdown'; +import { createHTTPStatusError } from '../utils/response'; import { DEFAULT_TIMEOUT, withTimeout } from '../utils/withTimeout'; const mixinHeaders = { @@ -39,28 +40,25 @@ export const naive: CrawlImpl = async (url, { filterOptions }) => { try { res = await withTimeout( - ssrfSafeFetch(url, { - headers: mixinHeaders, - signal: new AbortController().signal, - }), + (signal) => + ssrfSafeFetch(url, { + headers: mixinHeaders, + signal, + }), DEFAULT_TIMEOUT, ); } catch (e) { - const error = e as Error; - if (error.message === 'fetch failed') { - throw new NetworkConnectionError(); - } - - if (error instanceof TimeoutError) { - throw error; - } - - throw e; + throw toFetchError(e); } if 
(res.status === 404) { throw new PageNotFoundError(res.statusText); } + + if (!res.ok) { + throw await createHTTPStatusError(res, 'Naive'); + } + const type = res.headers.get('content-type'); if (type?.includes('application/json')) { @@ -74,7 +72,7 @@ export const naive: CrawlImpl = async (url, { filterOptions }) => { } return { - content: content, + content, contentType: 'json', length: content.length, url, @@ -91,8 +89,8 @@ export const naive: CrawlImpl = async (url, { filterOptions }) => { return; } - // it's blocked by cloudflare - if (result.title !== 'Just a moment...') { + // It's blocked by Cloudflare. + if (result.title === 'Just a moment...') { return; } diff --git a/packages/web-crawler/src/crawImpl/search1api.ts b/packages/web-crawler/src/crawImpl/search1api.ts index 11e826a5a1..edc6f690f1 100644 --- a/packages/web-crawler/src/crawImpl/search1api.ts +++ b/packages/web-crawler/src/crawImpl/search1api.ts @@ -1,5 +1,6 @@ import type { CrawlImpl, CrawlSuccessResult } from '../type'; -import { NetworkConnectionError, PageNotFoundError, TimeoutError } from '../utils/errorType'; +import { PageNotFoundError, toFetchError } from '../utils/errorType'; +import { createHTTPStatusError, parseJSONResponse } from '../utils/response'; import { DEFAULT_TIMEOUT, withTimeout } from '../utils/withTimeout'; interface Search1ApiResponse { @@ -21,29 +22,22 @@ export const search1api: CrawlImpl = async (url) => { try { res = await withTimeout( - fetch('https://api.search1api.com/crawl', { - body: JSON.stringify({ - url, + (signal) => + fetch('https://api.search1api.com/crawl', { + body: JSON.stringify({ + url, + }), + headers: { + 'Authorization': !apiKey ? '' : `Bearer ${apiKey}`, + 'Content-Type': 'application/json', + }, + method: 'POST', + signal, }), - headers: { - 'Authorization': !apiKey ? 
'' : `Bearer ${apiKey}`, - 'Content-Type': 'application/json', - }, - method: 'POST', - }), DEFAULT_TIMEOUT, ); } catch (e) { - const error = e as Error; - if (error.message === 'fetch failed') { - throw new NetworkConnectionError(); - } - - if (error instanceof TimeoutError) { - throw error; - } - - throw e; + throw toFetchError(e); } if (!res.ok) { @@ -51,30 +45,24 @@ export const search1api: CrawlImpl = async (url) => { throw new PageNotFoundError(res.statusText); } - throw new Error(`Search1API request failed with status ${res.status}: ${res.statusText}`); + throw await createHTTPStatusError(res, 'Search1API'); } - try { - const data = (await res.json()) as Search1ApiResponse; + const data = await parseJSONResponse(res, 'Search1API'); - // Check if content is empty or too short - if (!data.results.content || data.results.content.length < 100) { - return; - } - - return { - content: data.results.content, - contentType: 'text', - description: data.results.title, - // Using title as description since API doesn't provide a separate description - length: data.results.content.length, - siteName: new URL(url).hostname, - title: data.results.title, - url: data.results.link || url, - } satisfies CrawlSuccessResult; - } catch (error) { - console.error(error); + // Check if content is empty or too short + if (!data.results?.content || data.results.content.length < 100) { + return; } - return; + return { + content: data.results.content, + contentType: 'text', + description: data.results?.title, + // Using title as description since API doesn't provide a separate description + length: data.results.content.length, + siteName: new URL(url).hostname, + title: data.results?.title, + url: data.results?.link || url, + } satisfies CrawlSuccessResult; }; diff --git a/packages/web-crawler/src/crawImpl/tavily.ts b/packages/web-crawler/src/crawImpl/tavily.ts index 4adbb7eebc..95e6d56b2b 100644 --- a/packages/web-crawler/src/crawImpl/tavily.ts +++ 
b/packages/web-crawler/src/crawImpl/tavily.ts @@ -1,5 +1,6 @@ import type { CrawlImpl, CrawlSuccessResult } from '../type'; -import { NetworkConnectionError, PageNotFoundError, TimeoutError } from '../utils/errorType'; +import { PageNotFoundError, toFetchError } from '../utils/errorType'; +import { createHTTPStatusError, parseJSONResponse } from '../utils/response'; import { DEFAULT_TIMEOUT, withTimeout } from '../utils/withTimeout'; interface TavilyResults { @@ -28,31 +29,24 @@ export const tavily: CrawlImpl = async (url) => { try { res = await withTimeout( - fetch('https://api.tavily.com/extract', { - body: JSON.stringify({ - extract_depth: process.env.TAVILY_EXTRACT_DEPTH || 'basic', // basic or advanced - include_images: false, - urls: url, + (signal) => + fetch('https://api.tavily.com/extract', { + body: JSON.stringify({ + extract_depth: process.env.TAVILY_EXTRACT_DEPTH || 'basic', // basic or advanced + include_images: false, + urls: url, + }), + headers: { + 'Authorization': !apiKey ? '' : `Bearer ${apiKey}`, + 'Content-Type': 'application/json', + }, + method: 'POST', + signal, }), - headers: { - 'Authorization': !apiKey ? 
'' : `Bearer ${apiKey}`, - 'Content-Type': 'application/json', - }, - method: 'POST', - }), DEFAULT_TIMEOUT, ); } catch (e) { - const error = e as Error; - if (error.message === 'fetch failed') { - throw new NetworkConnectionError(); - } - - if (error instanceof TimeoutError) { - throw error; - } - - throw e; + throw toFetchError(e); } if (!res.ok) { @@ -60,35 +54,29 @@ export const tavily: CrawlImpl = async (url) => { throw new PageNotFoundError(res.statusText); } - throw new Error(`Tavily request failed with status ${res.status}: ${res.statusText}`); + throw await createHTTPStatusError(res, 'Tavily'); } - try { - const data = (await res.json()) as TavilyResponse; + const data = await parseJSONResponse(res, 'Tavily'); - if (!data.results || data.results.length === 0) { - console.warn('Tavily API returned no results for URL:', url); - return; - } - - const firstResult = data.results[0]; - - // Check if content is empty or too short - if (!firstResult.raw_content || firstResult.raw_content.length < 100) { - return; - } - - return { - content: firstResult.raw_content, - contentType: 'text', - length: firstResult.raw_content.length, - siteName: new URL(url).hostname, - title: new URL(url).hostname, - url: firstResult.url || url, - } satisfies CrawlSuccessResult; - } catch (error) { - console.error(error); + if (!data.results || data.results.length === 0) { + console.warn('Tavily API returned no results for URL:', url); + return; } - return; + const firstResult = data.results[0]; + + // Check if content is empty or too short + if (!firstResult.raw_content || firstResult.raw_content.length < 100) { + return; + } + + return { + content: firstResult.raw_content, + contentType: 'text', + length: firstResult.raw_content.length, + siteName: new URL(url).hostname, + title: new URL(url).hostname, + url: firstResult.url || url, + } satisfies CrawlSuccessResult; }; diff --git a/packages/web-crawler/src/crawler.ts b/packages/web-crawler/src/crawler.ts index d14669c3f6..14dcf1362f 
100644 --- a/packages/web-crawler/src/crawler.ts +++ b/packages/web-crawler/src/crawler.ts @@ -59,13 +59,18 @@ export class Crawler { try { const res = await crawlImpls[impl](transformedUrl, { filterOptions: mergedFilterOptions }); - if (res && res.content && res.content?.length > 100) + if (res && res.content && res.content.length > 100) { return { crawler: impl, data: res, originalUrl: url, transformedUrl: transformedUrl !== url ? transformedUrl : undefined, }; + } + + finalError = new Error(`${impl} returned empty or short content`); + finalError.name = 'EmptyCrawlResultError'; + finalCrawler = impl; } catch (error) { console.error(error); finalError = error as Error; @@ -77,10 +82,10 @@ export class Crawler { const errorMessage = finalError?.message; return { - crawler: finalCrawler!, + crawler: finalCrawler || finalImpls.at(-1) || 'unknown', data: { content: `Fail to crawl the page. Error type: ${errorType}, error message: ${errorMessage}`, - errorMessage: errorMessage, + errorMessage, errorType, }, originalUrl: url, diff --git a/packages/web-crawler/src/test-utils.ts b/packages/web-crawler/src/test-utils.ts new file mode 100644 index 0000000000..0fc80b4b64 --- /dev/null +++ b/packages/web-crawler/src/test-utils.ts @@ -0,0 +1,25 @@ +import { vi } from 'vitest'; + +/** + * Create a mock Response object for crawler tests. + * Uses `vi.fn()` for `json`, `text`, and `clone` so individual tests can override them. + */ +export const createMockResponse = ( + body: any, + opts: { ok: boolean; status?: number; statusText?: string } = { ok: true }, +) => { + const self: any = { + ok: opts.ok, + status: opts.status ?? (opts.ok ? 200 : 500), + statusText: opts.statusText ?? (opts.ok ? 'OK' : 'Internal Server Error'), + json: vi.fn().mockResolvedValue(body), + text: vi.fn().mockResolvedValue(typeof body === 'string' ? 
body : JSON.stringify(body)), + clone: vi.fn(), + }; + self.clone.mockReturnValue({ + ...self, + json: vi.fn().mockResolvedValue(body), + text: vi.fn().mockResolvedValue(typeof body === 'string' ? body : JSON.stringify(body)), + }); + return self; +}; diff --git a/packages/web-crawler/src/utils/appUrlRules.test.ts b/packages/web-crawler/src/utils/__tests__/appUrlRules.test.ts similarity index 98% rename from packages/web-crawler/src/utils/appUrlRules.test.ts rename to packages/web-crawler/src/utils/__tests__/appUrlRules.test.ts index abc1ff971a..e3ecb33a20 100644 --- a/packages/web-crawler/src/utils/appUrlRules.test.ts +++ b/packages/web-crawler/src/utils/__tests__/appUrlRules.test.ts @@ -1,6 +1,6 @@ import { describe, expect, it } from 'vitest'; -import { applyUrlRules } from './appUrlRules'; +import { applyUrlRules } from '../appUrlRules'; describe('applyUrlRules', () => { // @gru-agent github file rules 不要改 diff --git a/packages/web-crawler/src/utils/__tests__/errorType.test.ts b/packages/web-crawler/src/utils/__tests__/errorType.test.ts index ba06111b25..d2ad87c793 100644 --- a/packages/web-crawler/src/utils/__tests__/errorType.test.ts +++ b/packages/web-crawler/src/utils/__tests__/errorType.test.ts @@ -1,6 +1,12 @@ import { describe, expect, it } from 'vitest'; -import { NetworkConnectionError, PageNotFoundError, TimeoutError } from '../errorType'; +import { + isFetchNetworkError, + NetworkConnectionError, + PageNotFoundError, + TimeoutError, + toFetchError, +} from '../errorType'; describe('errorType', () => { describe('PageNotFoundError', () => { @@ -170,6 +176,43 @@ describe('errorType', () => { }); }); + describe('isFetchNetworkError', () => { + it('should return true for TypeError with "fetch failed" message', () => { + expect(isFetchNetworkError(new TypeError('fetch failed'))).toBe(true); + }); + + it('should return false for plain Error with "fetch failed" message', () => { + expect(isFetchNetworkError(new Error('fetch failed'))).toBe(false); + }); + + 
it('should return false for TypeError with different message', () => { + expect(isFetchNetworkError(new TypeError('something else'))).toBe(false); + }); + + it('should return false for non-error values', () => { + expect(isFetchNetworkError('fetch failed')).toBe(false); + expect(isFetchNetworkError(null)).toBe(false); + expect(isFetchNetworkError(undefined)).toBe(false); + }); + }); + + describe('toFetchError', () => { + it('should return NetworkConnectionError for fetch network errors', () => { + const result = toFetchError(new TypeError('fetch failed')); + expect(result).toBeInstanceOf(NetworkConnectionError); + }); + + it('should return TimeoutError as-is', () => { + const timeout = new TimeoutError('Request timeout after 10000ms'); + expect(toFetchError(timeout)).toBe(timeout); + }); + + it('should return unknown errors unchanged', () => { + const unknown = new Error('something unexpected'); + expect(toFetchError(unknown)).toBe(unknown); + }); + }); + describe('error catching scenarios', () => { it('should allow catching specific error types', () => { const testErrors = [ diff --git a/packages/web-crawler/src/utils/__tests__/response.test.ts b/packages/web-crawler/src/utils/__tests__/response.test.ts new file mode 100644 index 0000000000..cbd434e732 --- /dev/null +++ b/packages/web-crawler/src/utils/__tests__/response.test.ts @@ -0,0 +1,102 @@ +import { describe, expect, it } from 'vitest'; + +import { createHTTPStatusError, parseJSONResponse, ResponseBodyParseError } from '../response'; + +const createMockResponse = ( + body: string, + options: { ok?: boolean; status?: number; statusText?: string } = {}, +) => { + const { ok = true, status = 200, statusText = 'OK' } = options; + return new Response(body, { + status, + statusText, + headers: { 'Content-Type': ok ? 
'application/json' : 'text/html' }, + }); +}; + +describe('ResponseBodyParseError', () => { + it('should create error with provider and body snippet', () => { + const error = new ResponseBodyParseError('Jina', 'error'); + expect(error.message).toBe('Jina returned non-JSON response: error'); + expect(error.name).toBe('ResponseBodyParseError'); + }); + + it('should create error without body snippet', () => { + const error = new ResponseBodyParseError('Firecrawl'); + expect(error.message).toBe('Firecrawl returned non-JSON response'); + }); +}); + +describe('parseJSONResponse', () => { + it('should parse valid JSON response', async () => { + const data = { code: 200, results: ['a', 'b'] }; + const response = createMockResponse(JSON.stringify(data)); + + const result = await parseJSONResponse(response, 'TestProvider'); + + expect(result).toEqual(data); + }); + + it('should throw ResponseBodyParseError for non-JSON response', async () => { + const response = createMockResponse('Error'); + + await expect(parseJSONResponse(response, 'Jina')).rejects.toThrow(ResponseBodyParseError); + await expect( + parseJSONResponse(createMockResponse('Error'), 'Jina'), + ).rejects.toThrow('Jina returned non-JSON response'); + }); + + it('should include body snippet in error for non-JSON response', async () => { + const htmlBody = 'Internal Server Error'; + const response = createMockResponse(htmlBody); + + await expect(parseJSONResponse(response, 'Firecrawl')).rejects.toThrow( + /Firecrawl returned non-JSON response: .*Internal Server Error/, + ); + }); + + it('should handle empty response body', async () => { + const response = createMockResponse(''); + + await expect(parseJSONResponse(response, 'TestProvider')).rejects.toThrow( + 'TestProvider returned non-JSON response', + ); + }); +}); + +describe('createHTTPStatusError', () => { + it('should create error with status and body snippet', async () => { + const response = createMockResponse('Not Found', { + ok: false, + status: 404, + 
statusText: 'Not Found', + }); + + const error = await createHTTPStatusError(response, 'Exa'); + + expect(error.message).toContain('Exa request failed with status 404: Not Found'); + expect(error.message).toContain('Not Found'); + }); + + it('should create error without body when response text fails', async () => { + const response = createMockResponse('', { + ok: false, + status: 500, + statusText: 'Internal Server Error', + }); + + const error = await createHTTPStatusError(response, 'Tavily'); + + expect(error.message).toBe('Tavily request failed with status 500: Internal Server Error'); + }); + + it('should truncate long body snippets', async () => { + const longBody = 'x'.repeat(500); + const response = createMockResponse(longBody, { ok: false, status: 500, statusText: 'Error' }); + + const error = await createHTTPStatusError(response, 'Test'); + + // Body snippet should be truncated to 200 chars + expect(error.message.length).toBeLessThan(500 + 100); + }); +}); diff --git a/packages/web-crawler/src/utils/__tests__/withTimeout.test.ts b/packages/web-crawler/src/utils/__tests__/withTimeout.test.ts index 3b52aee4cf..2445b75522 100644 --- a/packages/web-crawler/src/utils/__tests__/withTimeout.test.ts +++ b/packages/web-crawler/src/utils/__tests__/withTimeout.test.ts @@ -12,18 +12,18 @@ describe('withTimeout', () => { vi.useRealTimers(); }); - it('should resolve when promise resolves before timeout', async () => { - const promise = Promise.resolve('success'); - const result = await withTimeout(promise, 1000); + it('should resolve when factory function resolves before timeout', async () => { + const result = await withTimeout(() => Promise.resolve('success'), 1000); expect(result).toBe('success'); }); - it('should reject with TimeoutError when promise takes too long', async () => { - const slowPromise = new Promise((resolve) => { - setTimeout(() => resolve('too late'), 200); - }); + it('should reject with TimeoutError when factory takes too long', async () => { + 
const fn = () => + new Promise((resolve) => { + setTimeout(() => resolve('too late'), 200); + }); - const timeoutPromise = withTimeout(slowPromise, 100); + const timeoutPromise = withTimeout(fn, 100); vi.advanceTimersByTime(100); await expect(timeoutPromise).rejects.toThrow(TimeoutError); @@ -31,32 +31,70 @@ describe('withTimeout', () => { }); it('should use DEFAULT_TIMEOUT when no timeout specified', async () => { - const slowPromise = new Promise((resolve) => { - setTimeout(() => resolve('success'), DEFAULT_TIMEOUT + 100); - }); + const fn = () => + new Promise((resolve) => { + setTimeout(() => resolve('success'), DEFAULT_TIMEOUT + 100); + }); - const timeoutPromise = withTimeout(slowPromise); + const timeoutPromise = withTimeout(fn); vi.advanceTimersByTime(DEFAULT_TIMEOUT); await expect(timeoutPromise).rejects.toThrow(TimeoutError); await expect(timeoutPromise).rejects.toThrow(`Request timeout after ${DEFAULT_TIMEOUT}ms`); }); - it('should reject with original error if promise rejects before timeout', async () => { + it('should reject with original error if factory rejects before timeout', async () => { const error = new Error('Original error'); - const failingPromise = Promise.reject(error); + const fn = () => Promise.reject(error); - await expect(withTimeout(failingPromise, 1000)).rejects.toThrow('Original error'); + await expect(withTimeout(fn, 1000)).rejects.toThrow('Original error'); }); - it('should abort controller when timeout occurs', async () => { - const slowPromise = new Promise((resolve) => { - setTimeout(() => resolve('too late'), 2000); - }); + it('should pass AbortSignal to the factory function', async () => { + const factoryFn = vi.fn().mockResolvedValue('result'); + await withTimeout(factoryFn, 1000); - const timeoutPromise = withTimeout(slowPromise, 1000); - vi.advanceTimersByTime(1000); + expect(factoryFn).toHaveBeenCalledTimes(1); + const signal = factoryFn.mock.calls[0][0]; + expect(signal).toBeInstanceOf(AbortSignal); + 
expect(signal.aborted).toBe(false); + }); + it('should abort the signal when timeout occurs', async () => { + let capturedSignal: AbortSignal | undefined; + const fn = (signal: AbortSignal) => { + capturedSignal = signal; + return new Promise((resolve) => { + setTimeout(() => resolve('too late'), 2000); + }); + }; + + const timeoutPromise = withTimeout(fn, 100); + expect(capturedSignal!.aborted).toBe(false); + + vi.advanceTimersByTime(100); await expect(timeoutPromise).rejects.toThrow(TimeoutError); + + expect(capturedSignal!.aborted).toBe(true); + }); + + it('should clear timeout timer when promise resolves successfully', async () => { + const clearTimeoutSpy = vi.spyOn(globalThis, 'clearTimeout'); + + await withTimeout(() => Promise.resolve('success'), 5000); + + expect(clearTimeoutSpy).toHaveBeenCalled(); + clearTimeoutSpy.mockRestore(); + }); + + it('should clear timeout timer when promise rejects', async () => { + const clearTimeoutSpy = vi.spyOn(globalThis, 'clearTimeout'); + + await expect(withTimeout(() => Promise.reject(new Error('fail')), 5000)).rejects.toThrow( + 'fail', + ); + + expect(clearTimeoutSpy).toHaveBeenCalled(); + clearTimeoutSpy.mockRestore(); }); }); diff --git a/packages/web-crawler/src/utils/errorType.ts b/packages/web-crawler/src/utils/errorType.ts index c3ade9c806..fc4e4fbb3d 100644 --- a/packages/web-crawler/src/utils/errorType.ts +++ b/packages/web-crawler/src/utils/errorType.ts @@ -17,3 +17,34 @@ export class TimeoutError extends Error { this.name = 'TimeoutError'; } } + +/** + * Check if an error is a Node.js fetch network failure. + * Node.js undici throws TypeError with message "fetch failed" on network errors. + */ +export const isFetchNetworkError = (error: unknown): boolean => + error instanceof TypeError && (error as Error).message === 'fetch failed'; + +/** + * Normalize a fetch error into a typed error for consistent handling. 
+ * Converts network failures to `NetworkConnectionError`, passes through `TimeoutError`, + * and returns any other error unchanged. Callers should `throw` the returned value. + * + * @example + * ```ts + * } catch (e) { + * throw toFetchError(e); + * } + * ``` + */ +export const toFetchError = (error: unknown): Error => { + if (isFetchNetworkError(error)) { + return new NetworkConnectionError(); + } + + if (error instanceof TimeoutError) { + return error; + } + + return error as Error; +}; diff --git a/packages/web-crawler/src/utils/htmlToMarkdown.test.ts b/packages/web-crawler/src/utils/htmlToMarkdown.test.ts index 7452541829..611d44743b 100644 --- a/packages/web-crawler/src/utils/htmlToMarkdown.test.ts +++ b/packages/web-crawler/src/utils/htmlToMarkdown.test.ts @@ -1,5 +1,5 @@ import { readFileSync } from 'node:fs'; -import * as path from 'node:path'; +import path from 'node:path'; import { describe, expect, it } from 'vitest'; @@ -33,4 +33,29 @@ describe('htmlToMarkdown', () => { expect(data).toMatchSnapshot(); }, 20000); }); + + it('should truncate HTML exceeding 1 MB', () => { + // Create HTML slightly over 1 MB + const maxSize = 1024 * 1024; + const largeContent = 'x'.repeat(maxSize + 1000); + const html = `

${largeContent}

`; + + // Should not throw - the function handles large HTML by truncating + const result = htmlToMarkdown(html, { url: 'https://example.com', filterOptions: {} }); + + // Verify content was produced (truncated HTML is still parseable) + expect(result).toBeDefined(); + expect(result.content).toBeDefined(); + // The output content should be smaller than the input due to truncation + expect(result.content.length).toBeLessThan(html.length); + }, 20000); + + it('should not truncate HTML under 1 MB', () => { + const html = '

Small content

'; + + const result = htmlToMarkdown(html, { url: 'https://example.com', filterOptions: {} }); + + expect(result).toBeDefined(); + expect(result.content).toContain('Small content'); + }); }); diff --git a/packages/web-crawler/src/utils/htmlToMarkdown.ts b/packages/web-crawler/src/utils/htmlToMarkdown.ts index 66a899f835..c964b352d4 100644 --- a/packages/web-crawler/src/utils/htmlToMarkdown.ts +++ b/packages/web-crawler/src/utils/htmlToMarkdown.ts @@ -5,6 +5,9 @@ import { NodeHtmlMarkdown } from 'node-html-markdown'; import type { FilterOptions } from '../type'; +/** Truncate HTML to 1 MB before DOM parsing to prevent CPU spikes on large pages */ +const MAX_HTML_SIZE = 1024 * 1024; + const cleanObj = ( obj: T, ): { @@ -24,9 +27,10 @@ interface HtmlToMarkdownOutput { } export const htmlToMarkdown = ( - html: string, + rawHtml: string, { url, filterOptions }: { filterOptions: FilterOptions; url: string }, ): HtmlToMarkdownOutput => { + const html = rawHtml.length > MAX_HTML_SIZE ? rawHtml.slice(0, MAX_HTML_SIZE) : rawHtml; const window = new Window({ url }); const document = window.document; diff --git a/packages/web-crawler/src/utils/response.ts b/packages/web-crawler/src/utils/response.ts new file mode 100644 index 0000000000..901ffffdcb --- /dev/null +++ b/packages/web-crawler/src/utils/response.ts @@ -0,0 +1,49 @@ +const ERROR_BODY_SNIPPET_LIMIT = 200; + +const normalizeBodySnippet = (body: string) => body.replaceAll(/\s+/g, ' ').trim(); + +export class ResponseBodyParseError extends Error { + constructor(provider: string, bodySnippet?: string) { + super( + bodySnippet + ? `${provider} returned non-JSON response: ${bodySnippet}` + : `${provider} returned non-JSON response`, + ); + this.name = 'ResponseBodyParseError'; + } +} + +const getBodySnippet = async (response: Response): Promise => { + try { + const body = await response.text(); + const snippet = normalizeBodySnippet(body).slice(0, ERROR_BODY_SNIPPET_LIMIT); + + return snippet.length > 0 ? 
snippet : undefined; + } catch { + return undefined; + } +}; + +export const parseJSONResponse = async (response: Response, provider: string): Promise => { + const clonedResponse = response.clone(); + + try { + return (await response.json()) as T; + } catch { + const bodySnippet = await getBodySnippet(clonedResponse); + throw new ResponseBodyParseError(provider, bodySnippet); + } +}; + +export const createHTTPStatusError = async ( + response: Response, + provider: string, +): Promise => { + const bodySnippet = await getBodySnippet(response); + + return new Error( + bodySnippet + ? `${provider} request failed with status ${response.status}: ${response.statusText}. Response: ${bodySnippet}` + : `${provider} request failed with status ${response.status}: ${response.statusText}`, + ); +}; diff --git a/packages/web-crawler/src/utils/withTimeout.ts b/packages/web-crawler/src/utils/withTimeout.ts index 9f295fa43b..bcc8d62829 100644 --- a/packages/web-crawler/src/utils/withTimeout.ts +++ b/packages/web-crawler/src/utils/withTimeout.ts @@ -3,19 +3,28 @@ import { TimeoutError } from './errorType'; export const DEFAULT_TIMEOUT = 10_000; /** - * Wraps a promise with a timeout - * @param promise Promise to wrap + * Wraps a factory function with a timeout and abort support. + * The factory receives an AbortSignal that is aborted on timeout, + * allowing the underlying request (e.g. fetch) to be properly cancelled. 
+ * @param fn Factory function that receives an AbortSignal and returns a Promise * @param ms Timeout in milliseconds - * @returns Promise that will be rejected if it takes longer than ms to resolve */ -export const withTimeout = (promise: Promise, ms: number = DEFAULT_TIMEOUT): Promise => { +export const withTimeout = ( + fn: (signal: AbortSignal) => Promise, + ms: number = DEFAULT_TIMEOUT, +): Promise => { const controller = new AbortController(); + let timeoutId: ReturnType; + const timeoutPromise = new Promise((_, reject) => { - setTimeout(() => { + timeoutId = setTimeout(() => { controller.abort(); reject(new TimeoutError(`Request timeout after ${ms}ms`)); }, ms); }); - return Promise.race([promise, timeoutPromise]); + return Promise.race([ + fn(controller.signal).finally(() => clearTimeout(timeoutId)), + timeoutPromise, + ]); }; diff --git a/src/envs/tools.ts b/src/envs/tools.ts index 3fc2054815..753a236622 100644 --- a/src/envs/tools.ts +++ b/src/envs/tools.ts @@ -1,15 +1,25 @@ import { createEnv } from '@t3-oss/env-nextjs'; import { z } from 'zod'; +const optionalNumberEnv = (min: number, max: number) => + z.preprocess( + (value) => (value === '' || value === null ? 
undefined : value), + z.coerce.number().int().max(max).min(min).optional(), + ); + export const getToolsConfig = () => { return createEnv({ runtimeEnv: { + CRAWL_CONCURRENCY: process.env.CRAWL_CONCURRENCY, + CRAWLER_RETRY: process.env.CRAWLER_RETRY, CRAWLER_IMPLS: process.env.CRAWLER_IMPLS, SEARCH_PROVIDERS: process.env.SEARCH_PROVIDERS, SEARXNG_URL: process.env.SEARXNG_URL, }, server: { + CRAWL_CONCURRENCY: optionalNumberEnv(1, 10), + CRAWLER_RETRY: optionalNumberEnv(0, 3), CRAWLER_IMPLS: z.string().optional(), SEARCH_PROVIDERS: z.string().optional(), SEARXNG_URL: z.string().url().optional(), diff --git a/src/server/routers/tools/search.test.ts b/src/server/routers/tools/search.test.ts index 42bef5b943..ba79581727 100644 --- a/src/server/routers/tools/search.test.ts +++ b/src/server/routers/tools/search.test.ts @@ -46,6 +46,27 @@ describe('searchRouter', () => { expect(result.results[1]).toEqual({ content: 'test content' }); }); + it('should accept all supported crawler implementations', async () => { + const caller = searchRouter.createCaller(mockContext as any); + + const allImpls = [ + 'browserless', + 'exa', + 'firecrawl', + 'jina', + 'naive', + 'search1api', + 'tavily', + ] as const; + for (const impl of allImpls) { + const result = await caller.crawlPages({ + urls: ['http://test.com'], + impls: [impl], + }); + expect(result.results).toHaveLength(1); + } + }); + it('should work without specifying impls', async () => { const caller = searchRouter.createCaller(mockContext as any); diff --git a/src/server/routers/tools/search.ts b/src/server/routers/tools/search.ts index 67c0f72315..682d4de58f 100644 --- a/src/server/routers/tools/search.ts +++ b/src/server/routers/tools/search.ts @@ -9,7 +9,10 @@ export const searchRouter = router({ crawlPages: searchProcedure .input( z.object({ - impls: z.enum(['jina', 'naive', 'browserless']).array().optional(), + impls: z + .enum(['browserless', 'exa', 'firecrawl', 'jina', 'naive', 'search1api', 'tavily']) + .array() + 
.optional(), urls: z.string().array(), }), ) diff --git a/src/server/services/search/index.test.ts b/src/server/services/search/index.test.ts index 50730a16c1..a7092c9252 100644 --- a/src/server/services/search/index.test.ts +++ b/src/server/services/search/index.test.ts @@ -3,7 +3,7 @@ import { beforeEach, describe, expect, it, vi } from 'vitest'; import { toolsEnv } from '@/envs/tools'; -import { createSearchServiceImpl,SearchImplType } from './impls'; +import { createSearchServiceImpl, SearchImplType } from './impls'; import { SearchService } from './index'; // Mock dependencies @@ -11,7 +11,9 @@ vi.mock('@lobechat/web-crawler'); vi.mock('./impls'); vi.mock('@/envs/tools', () => ({ toolsEnv: { + CRAWL_CONCURRENCY: undefined, CRAWLER_IMPLS: '', + CRAWLER_RETRY: undefined, SEARCH_PROVIDERS: '', }, })); @@ -279,10 +281,9 @@ describe('SearchService', () => { describe('crawlPages', () => { it('should crawl multiple pages concurrently', async () => { const mockCrawlResult = { - content: 'Page content', - description: 'Page description', - title: 'Page title', - url: 'https://example.com', + crawler: 'naive', + data: { content: 'Page content', contentType: 'text' }, + originalUrl: 'https://example.com', }; const mockCrawler = { @@ -304,8 +305,13 @@ describe('SearchService', () => { it('should use crawler implementations from env', async () => { vi.mocked(toolsEnv).CRAWLER_IMPLS = 'jina,reader'; + const mockSuccessResult = { + crawler: 'jina', + data: { content: 'ok', contentType: 'text' }, + originalUrl: 'https://example.com', + }; const mockCrawler = { - crawl: vi.fn().mockResolvedValue({}), + crawl: vi.fn().mockResolvedValue(mockSuccessResult), }; vi.mocked(Crawler).mockImplementation(() => mockCrawler as any); @@ -317,8 +323,13 @@ describe('SearchService', () => { }); it('should pass impls parameter to crawler.crawl', async () => { + const mockSuccessResult = { + crawler: 'jina', + data: { content: 'ok', contentType: 'text' }, + originalUrl: 'https://example.com', + 
}; const mockCrawler = { - crawl: vi.fn().mockResolvedValue({}), + crawl: vi.fn().mockResolvedValue(mockSuccessResult), }; vi.mocked(Crawler).mockImplementation(() => mockCrawler as any); @@ -334,5 +345,133 @@ describe('SearchService', () => { url: 'https://example.com', }); }); + + it('should use CRAWL_CONCURRENCY from env', async () => { + vi.mocked(toolsEnv).CRAWL_CONCURRENCY = 1; + + const mockCrawler = { + crawl: vi.fn().mockResolvedValue({ + crawler: 'naive', + data: { content: 'ok', contentType: 'text' }, + originalUrl: 'https://example.com', + }), + }; + vi.mocked(Crawler).mockImplementation(() => mockCrawler as any); + + searchService = new SearchService(); + const urls = ['https://a.com', 'https://b.com']; + await searchService.crawlPages({ urls }); + + // All URLs should still be crawled + expect(mockCrawler.crawl).toHaveBeenCalledTimes(2); + }); + + it('should retry on failed crawl results', async () => { + vi.mocked(toolsEnv).CRAWLER_RETRY = 1; + + const failedResult = { + crawler: 'naive', + data: { content: 'Fail', errorType: 'NetworkError', errorMessage: 'timeout' }, + originalUrl: 'https://example.com', + }; + const successResult = { + crawler: 'naive', + data: { content: 'Page content', contentType: 'text' }, + originalUrl: 'https://example.com', + }; + + const mockCrawler = { + crawl: vi.fn().mockResolvedValueOnce(failedResult).mockResolvedValueOnce(successResult), + }; + vi.mocked(Crawler).mockImplementation(() => mockCrawler as any); + + searchService = new SearchService(); + const result = await searchService.crawlPages({ urls: ['https://example.com'] }); + + expect(mockCrawler.crawl).toHaveBeenCalledTimes(2); + expect(result.results[0]).toBe(successResult); + }); + + it('should return last failed result after all retries exhausted', async () => { + vi.mocked(toolsEnv).CRAWLER_RETRY = 1; + + const failedResult = { + crawler: 'naive', + data: { content: 'Fail', errorType: 'NetworkError', errorMessage: 'timeout' }, + originalUrl: 
'https://example.com', + }; + + const mockCrawler = { + crawl: vi.fn().mockResolvedValue(failedResult), + }; + vi.mocked(Crawler).mockImplementation(() => mockCrawler as any); + + searchService = new SearchService(); + const result = await searchService.crawlPages({ urls: ['https://example.com'] }); + + expect(mockCrawler.crawl).toHaveBeenCalledTimes(2); // 1 + 1 retry + expect(result.results[0]).toBe(failedResult); + }); + + it('should not retry when CRAWLER_RETRY is 0', async () => { + vi.mocked(toolsEnv).CRAWLER_RETRY = 0; + + const failedResult = { + crawler: 'naive', + data: { content: 'Fail', errorType: 'Error', errorMessage: 'fail' }, + originalUrl: 'https://example.com', + }; + + const mockCrawler = { + crawl: vi.fn().mockResolvedValue(failedResult), + }; + vi.mocked(Crawler).mockImplementation(() => mockCrawler as any); + + searchService = new SearchService(); + const result = await searchService.crawlPages({ urls: ['https://example.com'] }); + + expect(mockCrawler.crawl).toHaveBeenCalledTimes(1); + expect(result.results[0]).toBe(failedResult); + }); + + it('should handle crawl exceptions during retry', async () => { + vi.mocked(toolsEnv).CRAWLER_RETRY = 1; + + const mockCrawler = { + crawl: vi.fn().mockRejectedValue(new Error('Network error')), + }; + vi.mocked(Crawler).mockImplementation(() => mockCrawler as any); + + searchService = new SearchService(); + const result = await searchService.crawlPages({ urls: ['https://example.com'] }); + + expect(mockCrawler.crawl).toHaveBeenCalledTimes(2); + expect(result.results[0].data).toMatchObject({ + errorType: 'Error', + errorMessage: 'Network error', + }); + }); + + it('should detect successful results by contentType presence', async () => { + vi.mocked(toolsEnv).CRAWLER_RETRY = 1; + + const successResult = { + crawler: 'naive', + data: { content: 'Page content', contentType: 'text' }, + originalUrl: 'https://example.com', + }; + + const mockCrawler = { + crawl: vi.fn().mockResolvedValue(successResult), + }; + 
vi.mocked(Crawler).mockImplementation(() => mockCrawler as any); + + searchService = new SearchService(); + const result = await searchService.crawlPages({ urls: ['https://example.com'] }); + + // Should not retry since result has contentType (successful) + expect(mockCrawler.crawl).toHaveBeenCalledTimes(1); + expect(result.results[0]).toBe(successResult); + }); }); }); diff --git a/src/server/services/search/index.ts b/src/server/services/search/index.ts index 2da4066656..fa18db53af 100644 --- a/src/server/services/search/index.ts +++ b/src/server/services/search/index.ts @@ -1,5 +1,5 @@ -import { type SearchParams, type SearchQuery } from '@lobechat/types'; -import { type CrawlImplType } from '@lobechat/web-crawler'; +import type { SearchParams, SearchQuery } from '@lobechat/types'; +import type { Crawler, CrawlImplType, CrawlUniformResult } from '@lobechat/web-crawler'; import pMap from 'p-map'; import { toolsEnv } from '@/envs/tools'; @@ -7,6 +7,9 @@ import { toolsEnv } from '@/envs/tools'; import { type SearchImplType, type SearchServiceImpl } from './impls'; import { createSearchServiceImpl } from './impls'; +const DEFAULT_CRAWL_CONCURRENCY = 3; +const DEFAULT_CRAWLER_RETRY = 1; + const parseImplEnv = (envString: string = '') => { // Handle full-width commas and extra whitespace const envValue = envString.replaceAll(',', ',').trim(); @@ -24,6 +27,14 @@ export class SearchService { return parseImplEnv(toolsEnv.CRAWLER_IMPLS); } + private get crawlConcurrency() { + return toolsEnv.CRAWL_CONCURRENCY ?? DEFAULT_CRAWL_CONCURRENCY; + } + + private get crawlerRetry() { + return toolsEnv.CRAWLER_RETRY ?? 
DEFAULT_CRAWLER_RETRY; + } + constructor() { const impls = this.searchImpls; // TODO: need use turn mode @@ -37,14 +48,59 @@ export class SearchService { const results = await pMap( input.urls, async (url) => { - return await crawler.crawl({ impls: input.impls, url }); + return await this.crawlWithRetry(crawler, url, input.impls); }, - { concurrency: 3 }, + { concurrency: this.crawlConcurrency }, ); return { results }; } + private async crawlWithRetry( + crawler: Crawler, + url: string, + impls?: CrawlImplType[], + ): Promise<CrawlUniformResult> { + const maxAttempts = this.crawlerRetry + 1; + let lastResult: CrawlUniformResult | undefined; + let lastError: Error | undefined; + + for (let attempt = 1; attempt <= maxAttempts; attempt++) { + try { + const result = await crawler.crawl({ impls, url }); + lastResult = result; + + if (!this.isFailedCrawlResult(result)) { + return result; + } + } catch (error) { + lastError = error as Error; + } + } + + if (lastResult) { + return lastResult; + } + + return { + crawler: 'unknown', + data: { + content: `Fail to crawl the page. Error type: ${lastError?.name || 'UnknownError'}, error message: ${lastError?.message}`, + errorMessage: lastError?.message, + errorType: lastError?.name || 'UnknownError', + }, + originalUrl: url, + }; + } + + /** + * A successful crawl result always includes `contentType` (e.g. 'text', 'json') + * in `result.data`, while a failed result contains `errorType`/`errorMessage` instead. 
+ */ + private isFailedCrawlResult(result: CrawlUniformResult): boolean { + return !('contentType' in result.data); + } + private get searchImpls() { return parseImplEnv(toolsEnv.SEARCH_PROVIDERS) as SearchImplType[]; } @@ -58,17 +114,17 @@ export class SearchService { async webSearch({ query, searchCategories, searchEngines, searchTimeRange }: SearchQuery) { let data = await this.query(query, { - searchCategories: searchCategories, - searchEngines: searchEngines, - searchTimeRange: searchTimeRange, + searchCategories, + searchEngines, + searchTimeRange, }); // First retry: remove search engine restrictions if no results found if (data.results.length === 0 && searchEngines && searchEngines?.length > 0) { const paramsExcludeSearchEngines = { - searchCategories: searchCategories, + searchCategories, searchEngines: undefined, - searchTimeRange: searchTimeRange, + searchTimeRange, }; data = await this.query(query, paramsExcludeSearchEngines); }