✅ test: add unit tests for eval-dataset-parser (#13197)

Co-authored-by: claude[bot] <41898282+claude[bot]@users.noreply.github.com> Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-26 13:19:34 +07:00 · 2026-03-25 10:55:58 +08:00
parent fed8b39957
commit afefe217db
3 changed files with 412 additions and 0 deletions
--- a/packages/eval-dataset-parser/tests/detectFormat.edge.test.ts
+++ b/packages/eval-dataset-parser/tests/detectFormat.edge.test.ts
@@ -0,0 +1,74 @@
+import { describe, expect, it } from 'vitest';
+import * as XLSX from 'xlsx';
+
+import { detectFormat } from '../src/detect';
+
+// Bytes "PK\x03\x04": the ZIP signature that XLSX files (ZIP containers) begin with.
+const XLSX_MAGIC = new Uint8Array([0x50, 0x4b, 0x03, 0x04]);
+
+describe('detectFormat - edge cases', () => {
+  it('should detect XLS by filename extension', () => {
+    expect(detectFormat('', 'data.xls')).toBe('xlsx');
+  });
+
+  it('should detect XLSX magic bytes from Uint8Array without filename', () => {
+    // Build a real, minimal workbook so the input is a genuine XLSX archive.
+    const workbook = XLSX.utils.book_new();
+    XLSX.utils.book_append_sheet(workbook, XLSX.utils.aoa_to_sheet([['a']]), 'Sheet1');
+    const bytes = new Uint8Array(XLSX.write(workbook, { bookType: 'xlsx', type: 'array' }));
+
+    expect(detectFormat(bytes)).toBe('xlsx');
+  });
+
+  it('should detect XLSX magic bytes from Buffer without filename', () => {
+    const workbook = XLSX.utils.book_new();
+    XLSX.utils.book_append_sheet(workbook, XLSX.utils.aoa_to_sheet([['a']]), 'Sheet1');
+    // `type: 'buffer'` asks XLSX.write for a Node Buffer, hence the cast.
+    const buf = XLSX.write(workbook, { bookType: 'xlsx', type: 'buffer' }) as Buffer;
+
+    expect(detectFormat(buf)).toBe('xlsx');
+  });
+
+  it('should detect JSON from Uint8Array containing JSON array', () => {
+    const json = '[{"a":1}]';
+    const data = new TextEncoder().encode(json);
+    expect(detectFormat(data)).toBe('json');
+  });
+
+  it('should detect JSONL from Uint8Array', () => {
+    const jsonl = '{"a":1}\n{"b":2}';
+    const data = new TextEncoder().encode(jsonl);
+    expect(detectFormat(data)).toBe('jsonl');
+  });
+
+  it('should fall back to CSV from Uint8Array with CSV content', () => {
+    const csv = 'col1,col2\nval1,val2';
+    const data = new TextEncoder().encode(csv);
+    expect(detectFormat(data)).toBe('csv');
+  });
+
+  it('should not detect XLSX from short Uint8Array (less than 4 bytes)', () => {
+    // Only the first two signature bytes: too short to match the 4-byte magic,
+    // so detection is expected to fall through to the text-based checks.
+    const data = XLSX_MAGIC.slice(0, 2);
+    expect(detectFormat(data)).toBe('csv');
+  });
+
+  it('filename extension takes precedence over content', () => {
+    // Content looks like JSON but filename says CSV
+    const json = '[{"a":1}]';
+    expect(detectFormat(json, 'data.csv')).toBe('csv');
+  });
+
+  it('should treat a JSON-like string that fails parse as CSV', () => {
+    // Starts with '[' but is not valid JSON
+    const badJson = '[not valid json';
+    expect(detectFormat(badJson)).toBe('csv');
+  });
+
+  it('should treat an object-like first line that fails parse as CSV', () => {
+    // Starts with '{' on first line but is not valid JSON
+    const badJsonL = '{not valid jsonl}\nmore data';
+    expect(detectFormat(badJsonL)).toBe('csv');
+  });
+});
--- a/packages/eval-dataset-parser/tests/parseDataset.edge.test.ts
+++ b/packages/eval-dataset-parser/tests/parseDataset.edge.test.ts
@@ -0,0 +1,111 @@
+import { describe, expect, it } from 'vitest';
+import * as XLSX from 'xlsx';
+
+import { parseDataset } from '../src';
+
+/** Serialises `rows` into a one-sheet XLSX workbook and returns the file's bytes. */
+function makeXLSXBuffer(rows: Record<string, unknown>[], sheetName = 'Sheet1'): Uint8Array {
+  const workbook = XLSX.utils.book_new();
+  XLSX.utils.book_append_sheet(workbook, XLSX.utils.json_to_sheet(rows), sheetName);
+  // With `type: 'array'` XLSX.write hands back raw bytes; expose them as a Uint8Array.
+  return new Uint8Array(XLSX.write(workbook, { bookType: 'xlsx', type: 'array' }));
+}
+
+describe('parseDataset - XLSX', () => {
+  // Small two-record fixture reused by every test that needs a workbook.
+  const sampleRows = [
+    { question: 'Q1', answer: 'A1' },
+    { question: 'Q2', answer: 'A2' },
+  ];
+
+  it('should parse XLSX from Uint8Array', () => {
+    const parsed = parseDataset(makeXLSXBuffer(sampleRows), { format: 'xlsx' });
+    expect(parsed.format).toBe('xlsx');
+    expect(parsed.headers).toEqual(['question', 'answer']);
+    expect(parsed.totalCount).toBe(2);
+  });
+
+  it('should auto-detect XLSX from magic bytes', () => {
+    // Neither a format nor a filename is supplied: only the bytes are available.
+    const parsed = parseDataset(makeXLSXBuffer(sampleRows));
+    expect(parsed.format).toBe('xlsx');
+  });
+
+  it('should auto-detect XLSX by filename', () => {
+    const workbookBytes = makeXLSXBuffer(sampleRows);
+    const parsed = parseDataset(workbookBytes, { filename: 'test.xlsx' });
+    expect(parsed.format).toBe('xlsx');
+    expect(parsed.totalCount).toBe(2);
+  });
+
+  it('should throw when XLSX format is used with string input', () => {
+    // A string is not binary input, so the call is expected to throw.
+    const parseText = () => parseDataset('some string', { format: 'xlsx' });
+    expect(parseText).toThrow('XLSX format requires binary input');
+  });
+
+  it('should support preview for XLSX', () => {
+    // Ten rows in, preview of three: `rows` is truncated, `totalCount` is not.
+    const tenRows = Array.from({ length: 10 }, (_, i) => ({ id: i, val: `v${i}` }));
+    const parsed = parseDataset(makeXLSXBuffer(tenRows), { format: 'xlsx', preview: 3 });
+    expect(parsed.rows).toHaveLength(3);
+    expect(parsed.totalCount).toBe(10);
+  });
+});
+
+describe('parseDataset - Buffer input', () => {
+  it('should parse CSV from Buffer', () => {
+    // One header line followed by two data lines, supplied as a Node Buffer.
+    const input = Buffer.from('a,b\n1,2\n3,4', 'utf8');
+    const { format, headers, totalCount } = parseDataset(input, { format: 'csv' });
+    expect(format).toBe('csv');
+    expect(headers).toEqual(['a', 'b']);
+    expect(totalCount).toBe(2);
+  });
+
+  it('should parse JSON from Buffer', () => {
+    // A two-element array, so two rows are expected.
+    const input = Buffer.from('[{"x":1},{"x":2}]', 'utf8');
+    const { format, totalCount } = parseDataset(input, { format: 'json' });
+    expect(format).toBe('json');
+    expect(totalCount).toBe(2);
+  });
+
+  it('should parse JSONL from Buffer', () => {
+    // One object per line: two lines, two rows.
+    const input = Buffer.from('{"k":"v1"}\n{"k":"v2"}', 'utf8');
+    const { format, totalCount } = parseDataset(input, { format: 'jsonl' });
+    expect(format).toBe('jsonl');
+    expect(totalCount).toBe(2);
+  });
+});
+
+describe('parseDataset - error cases', () => {
+  it('should throw for invalid JSON content', () => {
+    expect(() => parseDataset('not-json', { format: 'json' })).toThrow();
+  });
+
+  it('should throw when JSON is not an array', () => {
+    const parseObject = () => parseDataset('{"a":1}', { format: 'json' });
+    expect(parseObject).toThrow('JSON file must contain an array of objects');
+  });
+
+  it('should throw on invalid JSONL line', () => {
+    const parseBroken = () => parseDataset('{"a":1}\nbad-line', { format: 'jsonl' });
+    expect(parseBroken).toThrow('Invalid JSON at line 2');
+  });
+});
+
+// Non-throwing cases: how the `format` option interacts with auto-detection.
+describe('parseDataset - format selection', () => {
+  it('should use explicit format over auto-detection', () => {
+    // Content looks like JSONL but format is forced to CSV
+    const result = parseDataset('{"a":1}', { format: 'csv' });
+    expect(result.format).toBe('csv');
+  });
+
+  it('should auto-detect when format is "auto"', () => {
+    const result = parseDataset('[{"a":1}]', { format: 'auto' });
+    expect(result.format).toBe('json');
+  });
+});
--- a/packages/eval-dataset-parser/tests/parsers.test.ts
+++ b/packages/eval-dataset-parser/tests/parsers.test.ts
@@ -0,0 +1,227 @@
+import { describe, expect, it } from 'vitest';
+import * as XLSX from 'xlsx';
+
+import { parseCSV } from '../src/parsers/csv';
+import { parseJSON } from '../src/parsers/json';
+import { parseJSONL } from '../src/parsers/jsonl';
+import { parseXLSX } from '../src/parsers/xlsx';
+
+// ─── CSV ────────────────────────────────────────────────────────────────────
+
+describe('parseCSV', () => {
+  const basicCSV = 'name,age,city\nAlice,30,NYC\nBob,25,LA\nCarol,35,Chicago';
+
+  it('should parse headers and rows correctly', () => {
+    const { format, headers, rows, totalCount } = parseCSV(basicCSV);
+    expect(format).toBe('csv');
+    expect(headers).toEqual(['name', 'age', 'city']);
+    expect(totalCount).toBe(3);
+    expect(rows).toHaveLength(3);
+    expect(rows[0]).toEqual({ name: 'Alice', age: 30, city: 'NYC' });
+  });
+
+  it('should apply preview limit', () => {
+    const { rows, totalCount } = parseCSV(basicCSV, { preview: 2 });
+    expect(rows).toHaveLength(2);
+    expect(totalCount).toBe(3);
+  });
+
+  it('should handle custom delimiter', () => {
+    // Tab-separated input, parsed by overriding the delimiter.
+    const { headers, rows } = parseCSV('name\tage\nAlice\t30\nBob\t25', { csvDelimiter: '\t' });
+    expect(headers).toEqual(['name', 'age']);
+    expect(rows[0]).toMatchObject({ name: 'Alice', age: 30 });
+  });
+
+  it('should handle empty CSV (only headers)', () => {
+    const { headers, rows, totalCount } = parseCSV('name,age\n');
+    expect(headers).toEqual(['name', 'age']);
+    expect(totalCount).toBe(0);
+    expect(rows).toHaveLength(0);
+  });
+
+  it('should handle CSV with quoted fields', () => {
+    // The quoted value contains the delimiter and is expected to stay one field.
+    const [firstRow] = parseCSV('name,bio\nAlice,"She said, hello"\nBob,Simple').rows;
+    expect(firstRow.bio).toBe('She said, hello');
+  });
+
+  it('should dynamically type numeric values', () => {
+    // Both the integer column and the decimal column should come back as numbers.
+    const [firstRow] = parseCSV('id,score\n1,9.5\n2,8.0').rows;
+    expect(typeof firstRow.id).toBe('number');
+    expect(typeof firstRow.score).toBe('number');
+  });
+});
+
+// ─── JSON ────────────────────────────────────────────────────────────────────
+
+describe('parseJSON', () => {
+  // Serialised array of three flat question/answer records (Q1/A1 … Q3/A3).
+  const validJSON = JSON.stringify(
+    [1, 2, 3].map((n) => ({ question: `Q${n}`, answer: `A${n}` })),
+  );
+
+  it('should parse a JSON array', () => {
+    const { format, headers, rows, totalCount } = parseJSON(validJSON);
+    expect(format).toBe('json');
+    expect(headers).toEqual(['question', 'answer']);
+    expect(totalCount).toBe(3);
+    expect(rows).toHaveLength(3);
+    expect(rows[1]).toEqual({ question: 'Q2', answer: 'A2' });
+  });
+
+  it('should apply preview limit', () => {
+    const { rows, totalCount } = parseJSON(validJSON, { preview: 2 });
+    expect(rows).toHaveLength(2);
+    expect(totalCount).toBe(3);
+  });
+
+  it('should throw on invalid JSON', () => {
+    // No message is asserted here: any thrown error is accepted.
+    expect(() => parseJSON('not json at all')).toThrow();
+  });
+
+  it('should throw when JSON is not an array', () => {
+    // A top-level object is valid JSON, but the expected error demands an array.
+    const parseObject = () => parseJSON('{"key":"value"}');
+    expect(parseObject).toThrow('JSON file must contain an array of objects');
+  });
+
+  it('should handle empty JSON array', () => {
+    const { headers, rows, totalCount } = parseJSON('[]');
+    expect(headers).toEqual([]);
+    expect(totalCount).toBe(0);
+    expect(rows).toHaveLength(0);
+  });
+
+  it('should extract headers from first object only', () => {
+    // The second record introduces key 'c', which must not show up in the headers.
+    const mixedKeys = [
+      { a: 1, b: 2 },
+      { a: 3, c: 4 },
+    ];
+    expect(parseJSON(JSON.stringify(mixedKeys)).headers).toEqual(['a', 'b']);
+  });
+});
+
+// ─── JSONL ────────────────────────────────────────────────────────────────────
+
+describe('parseJSONL', () => {
+  const validJSONL = '{"id":1,"text":"first"}\n{"id":2,"text":"second"}\n{"id":3,"text":"third"}';
+
+  it('should parse JSONL lines', () => {
+    const { format, headers, rows, totalCount } = parseJSONL(validJSONL);
+    expect(format).toBe('jsonl');
+    expect(headers).toEqual(['id', 'text']);
+    expect(totalCount).toBe(3);
+    expect(rows).toHaveLength(3);
+    expect(rows[0]).toEqual({ id: 1, text: 'first' });
+  });
+
+  it('should apply preview limit', () => {
+    const { rows, totalCount } = parseJSONL(validJSONL, { preview: 2 });
+    expect(rows).toHaveLength(2);
+    expect(totalCount).toBe(3);
+  });
+
+  it('should throw on invalid JSON line with line number', () => {
+    const parseBroken = () => parseJSONL('{"id":1}\nnot-json\n{"id":3}');
+    expect(parseBroken).toThrow('Invalid JSON at line 2');
+  });
+
+  it('should skip blank lines', () => {
+    // An empty line between records and a trailing newline must not count as rows.
+    const { rows, totalCount } = parseJSONL('{"id":1}\n\n{"id":2}\n');
+    expect(totalCount).toBe(2);
+    expect(rows).toHaveLength(2);
+  });
+
+  it('should handle single-line JSONL', () => {
+    const { rows, totalCount } = parseJSONL('{"only":"one"}');
+    expect(totalCount).toBe(1);
+    expect(rows[0]).toEqual({ only: 'one' });
+  });
+
+  it('should handle empty JSONL input', () => {
+    const { headers, rows, totalCount } = parseJSONL('');
+    expect(totalCount).toBe(0);
+    expect(rows).toHaveLength(0);
+    expect(headers).toEqual([]);
+  });
+});
+
+// ─── XLSX ────────────────────────────────────────────────────────────────────
+
+/** Test helper: writes `rows` to a single sheet named `sheetName` and returns the XLSX bytes. */
+function makeXLSXBuffer(rows: Record<string, unknown>[], sheetName = 'Sheet1'): Uint8Array {
+  const workbook = XLSX.utils.book_new();
+  XLSX.utils.book_append_sheet(workbook, XLSX.utils.json_to_sheet(rows), sheetName);
+  // `type: 'array'` makes XLSX.write return raw bytes, wrapped here in a Uint8Array.
+  return new Uint8Array(XLSX.write(workbook, { bookType: 'xlsx', type: 'array' }));
+}
+
+describe('parseXLSX', () => {
+  const sampleRows = [
+    { name: 'Alice', score: 95 },
+    { name: 'Bob', score: 87 },
+    { name: 'Carol', score: 72 },
+  ];
+
+  // Workbook bytes with two sheets, in order: 'First' ({ x: 1 }) and 'Second' ({ y: 2 }).
+  const makeTwoSheetWorkbook = (): Uint8Array => {
+    const workbook = XLSX.utils.book_new();
+    XLSX.utils.book_append_sheet(workbook, XLSX.utils.json_to_sheet([{ x: 1 }]), 'First');
+    XLSX.utils.book_append_sheet(workbook, XLSX.utils.json_to_sheet([{ y: 2 }]), 'Second');
+    return new Uint8Array(XLSX.write(workbook, { bookType: 'xlsx', type: 'array' }));
+  };
+
+  it('should parse XLSX data from Uint8Array', () => {
+    const { format, headers, metadata, rows, totalCount } = parseXLSX(makeXLSXBuffer(sampleRows));
+    expect(format).toBe('xlsx');
+    expect(headers).toEqual(['name', 'score']);
+    expect(totalCount).toBe(3);
+    expect(rows).toHaveLength(3);
+    expect(rows[0].name).toBe('Alice');
+    expect(metadata?.sheetName).toBe('Sheet1');
+  });
+
+  it('should apply preview limit', () => {
+    const data = makeXLSXBuffer(sampleRows);
+    const { rows, totalCount } = parseXLSX(data, { preview: 2 });
+    expect(rows).toHaveLength(2);
+    expect(totalCount).toBe(3);
+  });
+
+  it('should select sheet by name', () => {
+    const { headers, metadata, rows } = parseXLSX(makeTwoSheetWorkbook(), { sheet: 'Second' });
+    expect(metadata?.sheetName).toBe('Second');
+    expect(headers).toEqual(['y']);
+    // NOTE(review): the cell is written as the number 2 but the string '2' is asserted;
+    // confirm parseXLSX really returns cell values as strings, else this should be 2.
+    expect(rows[0].y).toBe('2');
+  });
+
+  it('should select sheet by index', () => {
+    // `sheet: 1` is expected to resolve to the second sheet (zero-based index).
+    const { headers, metadata } = parseXLSX(makeTwoSheetWorkbook(), { sheet: 1 });
+    expect(metadata?.sheetName).toBe('Second');
+    expect(headers).toEqual(['y']);
+  });
+
+  it('should return empty result for nonexistent sheet name', () => {
+    const data = makeXLSXBuffer(sampleRows, 'Data');
+    const { headers, metadata, rows, totalCount } = parseXLSX(data, { sheet: 'NonExistent' });
+    expect(rows).toHaveLength(0);
+    expect(headers).toEqual([]);
+    expect(totalCount).toBe(0);
+    expect(metadata?.sheetName).toBe('NonExistent');
+  });
+
+  it('should default to first sheet when no sheet option provided', () => {
+    // No `sheet` option is passed; the workbook's only sheet is 'MySheet'.
+    const { metadata, totalCount } = parseXLSX(makeXLSXBuffer(sampleRows, 'MySheet'));
+    expect(metadata?.sheetName).toBe('MySheet');
+    expect(totalCount).toBe(3);
+  });
+});