test: add unit tests for eval-dataset-parser (#13197)

Co-authored-by: claude[bot] <41898282+claude[bot]@users.noreply.github.com>
Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
LobeHub Bot
2026-03-25 10:55:58 +08:00
committed by GitHub
parent fed8b39957
commit afefe217db
3 changed files with 412 additions and 0 deletions

View File

@@ -0,0 +1,74 @@
import { describe, expect, it } from 'vitest';
import * as XLSX from 'xlsx';
import { detectFormat } from '../src/detect';
/**
 * ZIP local-file-header signature ("PK\x03\x04"). XLSX workbooks are ZIP
 * containers; presumably these are the bytes detectFormat sniffs for.
 */
const XLSX_MAGIC = new Uint8Array([0x50, 0x4b, 0x03, 0x04]);

/** Builds the smallest workbook these tests need: one sheet ('Sheet1') with a single cell. */
function createMinimalWorkbook() {
  const workbook = XLSX.utils.book_new();
  XLSX.utils.book_append_sheet(workbook, XLSX.utils.aoa_to_sheet([['a']]), 'Sheet1');
  return workbook;
}

// Edge cases for format detection: filename hints, binary sniffing, and the
// CSV fallback for content that only looks like JSON/JSONL.
describe('detectFormat - edge cases', () => {
  it('should detect XLS by filename extension', () => {
    expect(detectFormat('', 'data.xls')).toBe('xlsx');
  });

  it('should detect XLSX magic bytes from Uint8Array without filename', () => {
    // Serialise a real minimal XLSX binary rather than hand-crafting bytes
    const buf = XLSX.write(createMinimalWorkbook(), { bookType: 'xlsx', type: 'array' });
    const data = new Uint8Array(buf);

    expect(detectFormat(data)).toBe('xlsx');
  });

  it('should detect XLSX magic bytes from Buffer without filename', () => {
    const buf = XLSX.write(createMinimalWorkbook(), { bookType: 'xlsx', type: 'buffer' }) as Buffer;

    expect(detectFormat(buf)).toBe('xlsx');
  });

  it('should parse JSON from Uint8Array containing JSON array', () => {
    const json = '[{"a":1}]';
    const data = new TextEncoder().encode(json);

    expect(detectFormat(data)).toBe('json');
  });

  it('should parse JSONL from Uint8Array', () => {
    const jsonl = '{"a":1}\n{"b":2}';
    const data = new TextEncoder().encode(jsonl);

    expect(detectFormat(data)).toBe('jsonl');
  });

  it('should fall back to CSV from Uint8Array with CSV content', () => {
    const csv = 'col1,col2\nval1,val2';
    const data = new TextEncoder().encode(csv);

    expect(detectFormat(data)).toBe('csv');
  });

  it('should not detect XLSX from short Uint8Array (less than 4 bytes)', () => {
    // Only the first two signature bytes ("PK"): a truncated magic number
    const data = XLSX_MAGIC.slice(0, 2);

    // Not enough bytes for magic number → falls through to string detection
    expect(detectFormat(data)).toBe('csv');
  });

  it('filename extension takes precedence over content', () => {
    // Content looks like JSON but filename says CSV
    const json = '[{"a":1}]';

    expect(detectFormat(json, 'data.csv')).toBe('csv');
  });

  it('should treat a JSON-like string that fails parse as CSV', () => {
    // Starts with '[' but is not valid JSON
    const badJson = '[not valid json';

    expect(detectFormat(badJson)).toBe('csv');
  });

  it('should treat an object-like first line that fails parse as CSV', () => {
    // Starts with '{' on first line but is not valid JSON
    const badJsonL = '{not valid jsonl}\nmore data';

    expect(detectFormat(badJsonL)).toBe('csv');
  });
});

View File

@@ -0,0 +1,111 @@
import { describe, expect, it } from 'vitest';
import * as XLSX from 'xlsx';
import { parseDataset } from '../src';
/**
 * Serialises plain row objects into an in-memory XLSX workbook.
 *
 * @param rows - Row objects handed to `XLSX.utils.json_to_sheet` to build the worksheet.
 * @param sheetName - Name given to the single worksheet (defaults to 'Sheet1').
 * @returns The written workbook bytes wrapped in a Uint8Array.
 */
function makeXLSXBuffer(rows: Record<string, unknown>[], sheetName = 'Sheet1'): Uint8Array {
  const workbook = XLSX.utils.book_new();
  XLSX.utils.book_append_sheet(workbook, XLSX.utils.json_to_sheet(rows), sheetName);

  // type: 'array' makes XLSX.write return raw bytes that Uint8Array can wrap
  return new Uint8Array(XLSX.write(workbook, { bookType: 'xlsx', type: 'array' }));
}
// XLSX path of parseDataset: explicit format, auto-detection, string rejection, preview.
describe('parseDataset - XLSX', () => {
  const fixtureRows = [
    { question: 'Q1', answer: 'A1' },
    { question: 'Q2', answer: 'A2' },
  ];

  it('should parse XLSX from Uint8Array', () => {
    const parsed = parseDataset(makeXLSXBuffer(fixtureRows), { format: 'xlsx' });

    expect(parsed.format).toBe('xlsx');
    expect(parsed.headers).toEqual(['question', 'answer']);
    expect(parsed.totalCount).toBe(2);
  });

  it('should auto-detect XLSX from magic bytes', () => {
    // No format and no filename: detection can only rely on the bytes themselves
    const parsed = parseDataset(makeXLSXBuffer(fixtureRows));

    expect(parsed.format).toBe('xlsx');
  });

  it('should auto-detect XLSX by filename', () => {
    const parsed = parseDataset(makeXLSXBuffer(fixtureRows), { filename: 'test.xlsx' });

    expect(parsed.format).toBe('xlsx');
    expect(parsed.totalCount).toBe(2);
  });

  it('should throw when XLSX format is used with string input', () => {
    const parseStringAsXlsx = () => parseDataset('some string', { format: 'xlsx' });

    expect(parseStringAsXlsx).toThrow('XLSX format requires binary input');
  });

  it('should support preview for XLSX', () => {
    // Ten rows in, three requested: rows is truncated while totalCount is not
    const tenRows = Array.from({ length: 10 }, (_, i) => ({ id: i, val: `v${i}` }));
    const parsed = parseDataset(makeXLSXBuffer(tenRows), { format: 'xlsx', preview: 3 });

    expect(parsed.rows).toHaveLength(3);
    expect(parsed.totalCount).toBe(10);
  });
});
// Text formats supplied as a Node Buffer instead of a string.
describe('parseDataset - Buffer input', () => {
  it('should parse CSV from Buffer', () => {
    const input = Buffer.from('a,b\n1,2\n3,4', 'utf8');
    const { format, headers, totalCount } = parseDataset(input, { format: 'csv' });

    expect(format).toBe('csv');
    expect(headers).toEqual(['a', 'b']);
    expect(totalCount).toBe(2);
  });

  it('should parse JSON from Buffer', () => {
    const input = Buffer.from('[{"x":1},{"x":2}]', 'utf8');
    const { format, totalCount } = parseDataset(input, { format: 'json' });

    expect(format).toBe('json');
    expect(totalCount).toBe(2);
  });

  it('should parse JSONL from Buffer', () => {
    const input = Buffer.from('{"k":"v1"}\n{"k":"v2"}', 'utf8');
    const { format, totalCount } = parseDataset(input, { format: 'jsonl' });

    expect(format).toBe('jsonl');
    expect(totalCount).toBe(2);
  });
});
// Failure modes of parseDataset, plus how the `format` option interacts with auto-detection.
describe('parseDataset - error cases', () => {
  it('should throw for invalid JSON content', () => {
    const parseGarbage = () => parseDataset('not-json', { format: 'json' });

    expect(parseGarbage).toThrow();
  });

  it('should throw when JSON is not an array', () => {
    const parseObject = () => parseDataset('{"a":1}', { format: 'json' });

    expect(parseObject).toThrow('JSON file must contain an array of objects');
  });

  it('should throw on invalid JSONL line', () => {
    // Second line is not JSON, and the message is expected to name that line
    const parseBrokenLine = () => parseDataset('{"a":1}\nbad-line', { format: 'jsonl' });

    expect(parseBrokenLine).toThrow('Invalid JSON at line 2');
  });

  it('should use explicit format over auto-detection', () => {
    // Content looks like JSONL but format is forced to CSV
    const { format } = parseDataset('{"a":1}', { format: 'csv' });

    expect(format).toBe('csv');
  });

  it('should auto-detect when format is "auto"', () => {
    const { format } = parseDataset('[{"a":1}]', { format: 'auto' });

    expect(format).toBe('json');
  });
});

View File

@@ -0,0 +1,227 @@
import { describe, expect, it } from 'vitest';
import * as XLSX from 'xlsx';
import { parseCSV } from '../src/parsers/csv';
import { parseJSON } from '../src/parsers/json';
import { parseJSONL } from '../src/parsers/jsonl';
import { parseXLSX } from '../src/parsers/xlsx';
// ─── CSV ────────────────────────────────────────────────────────────────────
// parseCSV: header extraction, preview, delimiters, quoting and numeric typing.
describe('parseCSV', () => {
  const basicCSV = 'name,age,city\nAlice,30,NYC\nBob,25,LA\nCarol,35,Chicago';

  it('should parse headers and rows correctly', () => {
    const { format, headers, rows, totalCount } = parseCSV(basicCSV);

    expect(format).toBe('csv');
    expect(headers).toEqual(['name', 'age', 'city']);
    expect(totalCount).toBe(3);
    expect(rows).toHaveLength(3);
    expect(rows[0]).toEqual({ name: 'Alice', age: 30, city: 'NYC' });
  });

  it('should apply preview limit', () => {
    // preview caps the returned rows, while totalCount still reports all three
    const { rows, totalCount } = parseCSV(basicCSV, { preview: 2 });

    expect(rows).toHaveLength(2);
    expect(totalCount).toBe(3);
  });

  it('should handle custom delimiter', () => {
    const { headers, rows } = parseCSV('name\tage\nAlice\t30\nBob\t25', { csvDelimiter: '\t' });

    expect(headers).toEqual(['name', 'age']);
    expect(rows[0]).toMatchObject({ name: 'Alice', age: 30 });
  });

  it('should handle empty CSV (only headers)', () => {
    const { headers, rows, totalCount } = parseCSV('name,age\n');

    expect(headers).toEqual(['name', 'age']);
    expect(totalCount).toBe(0);
    expect(rows).toHaveLength(0);
  });

  it('should handle CSV with quoted fields', () => {
    // The comma inside the quotes must stay part of the field value
    const { rows } = parseCSV('name,bio\nAlice,"She said, hello"\nBob,Simple');

    expect(rows[0].bio).toBe('She said, hello');
  });

  it('should dynamically type numeric values', () => {
    const [firstRow] = parseCSV('id,score\n1,9.5\n2,8.0').rows;

    expect(typeof firstRow.id).toBe('number');
    expect(typeof firstRow.score).toBe('number');
  });
});
// ─── JSON ────────────────────────────────────────────────────────────────────
// parseJSON: array parsing, preview, rejection of non-arrays, header derivation.
describe('parseJSON', () => {
  const validJSON = JSON.stringify([
    { question: 'Q1', answer: 'A1' },
    { question: 'Q2', answer: 'A2' },
    { question: 'Q3', answer: 'A3' },
  ]);

  it('should parse a JSON array', () => {
    const { format, headers, rows, totalCount } = parseJSON(validJSON);

    expect(format).toBe('json');
    expect(headers).toEqual(['question', 'answer']);
    expect(totalCount).toBe(3);
    expect(rows).toHaveLength(3);
    expect(rows[1]).toEqual({ question: 'Q2', answer: 'A2' });
  });

  it('should apply preview limit', () => {
    const { rows, totalCount } = parseJSON(validJSON, { preview: 2 });

    expect(rows).toHaveLength(2);
    expect(totalCount).toBe(3);
  });

  it('should throw on invalid JSON', () => {
    const parseGarbage = () => parseJSON('not json at all');

    expect(parseGarbage).toThrow();
  });

  it('should throw when JSON is not an array', () => {
    const parseObject = () => parseJSON('{"key":"value"}');

    expect(parseObject).toThrow('JSON file must contain an array of objects');
  });

  it('should handle empty JSON array', () => {
    const { headers, rows, totalCount } = parseJSON('[]');

    expect(headers).toEqual([]);
    expect(totalCount).toBe(0);
    expect(rows).toHaveLength(0);
  });

  it('should extract headers from first object only', () => {
    const mixedKeys = JSON.stringify([
      { a: 1, b: 2 },
      { a: 3, c: 4 }, // 'c' is extra
    ]);

    // Only the first object's keys are expected; 'c' must not appear
    expect(parseJSON(mixedKeys).headers).toEqual(['a', 'b']);
  });
});
// ─── JSONL ────────────────────────────────────────────────────────────────────
// parseJSONL: line-by-line parsing, preview, error line numbers, blank and empty input.
describe('parseJSONL', () => {
  const validJSONL = '{"id":1,"text":"first"}\n{"id":2,"text":"second"}\n{"id":3,"text":"third"}';

  it('should parse JSONL lines', () => {
    const { format, headers, rows, totalCount } = parseJSONL(validJSONL);

    expect(format).toBe('jsonl');
    expect(headers).toEqual(['id', 'text']);
    expect(totalCount).toBe(3);
    expect(rows).toHaveLength(3);
    expect(rows[0]).toEqual({ id: 1, text: 'first' });
  });

  it('should apply preview limit', () => {
    const { rows, totalCount } = parseJSONL(validJSONL, { preview: 2 });

    expect(rows).toHaveLength(2);
    expect(totalCount).toBe(3);
  });

  it('should throw on invalid JSON line with line number', () => {
    // The malformed entry sits on the second line of the input
    const parseBroken = () => parseJSONL('{"id":1}\nnot-json\n{"id":3}');

    expect(parseBroken).toThrow('Invalid JSON at line 2');
  });

  it('should skip blank lines', () => {
    // One empty line in the middle and a trailing newline at the end
    const { rows, totalCount } = parseJSONL('{"id":1}\n\n{"id":2}\n');

    expect(totalCount).toBe(2);
    expect(rows).toHaveLength(2);
  });

  it('should handle single-line JSONL', () => {
    const { rows, totalCount } = parseJSONL('{"only":"one"}');

    expect(totalCount).toBe(1);
    expect(rows[0]).toEqual({ only: 'one' });
  });

  it('should handle empty JSONL input', () => {
    const { headers, rows, totalCount } = parseJSONL('');

    expect(totalCount).toBe(0);
    expect(rows).toHaveLength(0);
    expect(headers).toEqual([]);
  });
});
// ─── XLSX ────────────────────────────────────────────────────────────────────
/**
 * Serialises plain row objects into an in-memory XLSX workbook.
 *
 * @param rows - Row objects handed to `XLSX.utils.json_to_sheet` to build the worksheet.
 * @param sheetName - Name given to the single worksheet (defaults to 'Sheet1').
 * @returns The written workbook bytes wrapped in a Uint8Array.
 */
function makeXLSXBuffer(rows: Record<string, unknown>[], sheetName = 'Sheet1'): Uint8Array {
  const workbook = XLSX.utils.book_new();
  XLSX.utils.book_append_sheet(workbook, XLSX.utils.json_to_sheet(rows), sheetName);

  // type: 'array' makes XLSX.write return raw bytes that Uint8Array can wrap
  return new Uint8Array(XLSX.write(workbook, { bookType: 'xlsx', type: 'array' }));
}
// parseXLSX: basic parsing, preview, and sheet selection by name, index and default.
describe('parseXLSX', () => {
  const sampleRows = [
    { name: 'Alice', score: 95 },
    { name: 'Bob', score: 87 },
    { name: 'Carol', score: 72 },
  ];

  /** Two-sheet workbook bytes: 'First' holds { x: 1 }, 'Second' holds { y: 2 }. */
  const makeTwoSheetBuffer = (): Uint8Array => {
    const workbook = XLSX.utils.book_new();
    XLSX.utils.book_append_sheet(workbook, XLSX.utils.json_to_sheet([{ x: 1 }]), 'First');
    XLSX.utils.book_append_sheet(workbook, XLSX.utils.json_to_sheet([{ y: 2 }]), 'Second');
    return new Uint8Array(XLSX.write(workbook, { bookType: 'xlsx', type: 'array' }));
  };

  it('should parse XLSX data from Uint8Array', () => {
    const { format, headers, metadata, rows, totalCount } = parseXLSX(makeXLSXBuffer(sampleRows));

    expect(format).toBe('xlsx');
    expect(headers).toEqual(['name', 'score']);
    expect(totalCount).toBe(3);
    expect(rows).toHaveLength(3);
    expect(rows[0].name).toBe('Alice');
    expect(metadata?.sheetName).toBe('Sheet1');
  });

  it('should apply preview limit', () => {
    const { rows, totalCount } = parseXLSX(makeXLSXBuffer(sampleRows), { preview: 2 });

    expect(rows).toHaveLength(2);
    expect(totalCount).toBe(3);
  });

  it('should select sheet by name', () => {
    const { headers, metadata, rows } = parseXLSX(makeTwoSheetBuffer(), { sheet: 'Second' });

    expect(metadata?.sheetName).toBe('Second');
    expect(headers).toEqual(['y']);
    // NOTE(review): the cell was written as the number 2 but is asserted as the
    // string '2' — presumably parseXLSX returns formatted cell text; confirm.
    expect(rows[0].y).toBe('2');
  });

  it('should select sheet by index', () => {
    // Index 1 is the second sheet appended by makeTwoSheetBuffer
    const { headers, metadata } = parseXLSX(makeTwoSheetBuffer(), { sheet: 1 });

    expect(metadata?.sheetName).toBe('Second');
    expect(headers).toEqual(['y']);
  });

  it('should return empty result for nonexistent sheet name', () => {
    const workbookBytes = makeXLSXBuffer(sampleRows, 'Data');
    const { headers, metadata, rows, totalCount } = parseXLSX(workbookBytes, {
      sheet: 'NonExistent',
    });

    expect(rows).toHaveLength(0);
    expect(headers).toEqual([]);
    expect(totalCount).toBe(0);
    expect(metadata?.sheetName).toBe('NonExistent');
  });

  it('should default to first sheet when no sheet option provided', () => {
    const { metadata, totalCount } = parseXLSX(makeXLSXBuffer(sampleRows, 'MySheet'));

    expect(metadata?.sheetName).toBe('MySheet');
    expect(totalCount).toBe(3);
  });
});