From 216f80fc85b2f2c8fdd321ec87da7e5a0731ecaa Mon Sep 17 00:00:00 2001 From: Ronald Veth Date: Thu, 9 Apr 2026 13:16:45 +0200 Subject: [PATCH] Adds fine-grained empty value preservation options Introduces two new configuration options to control how empty CSV values are handled in the parsed output: - `preserveEmptyColumnAsEmptyString`: Controls unquoted empty cells (e.g., `,,`) - `preserveEmptyString`: Controls explicitly quoted empty strings (e.g., `""`) By default, unquoted empties are omitted while quoted empties are preserved as empty strings. This allows users to differentiate between "no value provided" and "explicitly empty" in their CSV data. Implements a clear precedence chain: defaultValues > nullValues > preservation options > omit. Both parser implementations (string-based and stream-based) support these options consistently. The implementation uses internal provenance tracking to distinguish between quoted and unquoted empty values during parsing, ensuring accurate preservation behavior without breaking existing functionality. --- .changeset/late-planets-study.md | 11 +++ README.md | 60 +++++++++++- src/csv-parser.ts | 2 +- src/csv-reader.ts | 156 ++++++++++++++++++++++--------- src/csv-stream-parser.ts | 128 +++++++++++++++++++------ src/internal-empty-cell.ts | 16 ++++ src/nested-json-converter.ts | 122 +++++++++++++++++------- src/types.ts | 21 +++++ tests/csv-stream-parser.test.ts | 53 ++++++++++- tests/parser-options.test.ts | 123 ++++++++++++++++++++++++ 10 files changed, 581 insertions(+), 111 deletions(-) create mode 100644 .changeset/late-planets-study.md create mode 100644 src/internal-empty-cell.ts diff --git a/.changeset/late-planets-study.md b/.changeset/late-planets-study.md new file mode 100644 index 0000000..9c83421 --- /dev/null +++ b/.changeset/late-planets-study.md @@ -0,0 +1,11 @@ +--- +"@cerios/csv-nested-json": patch +--- + +Add empty-value preservation controls for nested output in both `CsvParser` and `CsvStreamParser`. 
+ +- Add `preserveEmptyColumnAsEmptyString` to preserve unquoted empty cells (for example `,,`). +- Add `preserveEmptyString` to preserve explicitly quoted empty cells (for example `""`, or the equivalent with the configured quote character), enabled by default. +- Keep empty-value behavior aligned between sync and streaming parsing paths. +- Apply empty-value precedence consistently: `defaultValues`, then `nullValues` + `nullRepresentation`, then preserve options, then omit. +- Treat quoted-empty identifier values as continuation rows and prevent continuation rows from overriding the active grouping identifier. \ No newline at end of file diff --git a/README.md b/README.md index ad456a3..9ba06c1 100644 --- a/README.md +++ b/README.md @@ -636,6 +636,42 @@ const result = CsvParser.parseString(csvContent, { // ] ``` +### Empty Value Preservation + +By default, unquoted empty values are omitted and explicitly quoted empty values are preserved. You can control each case independently. + +```typescript +const csvContent = `id,emptyColumn,emptyQuoted +1,,""`; + +// Preserve only unquoted empties: ,, -> '' +const preserveColumns = CsvParser.parseString(csvContent, { + preserveEmptyColumnAsEmptyString: true, + preserveEmptyString: false +}); +// [{ id: "1", emptyColumn: "" }] + +// Preserve only quoted empties: "" -> '' +const preserveQuoted = CsvParser.parseString(csvContent, { + preserveEmptyString: true +}); +// [{ id: "1", emptyQuoted: "" }] + +// Preserve both +const preserveBoth = CsvParser.parseString(csvContent, { + preserveEmptyColumnAsEmptyString: true, + preserveEmptyString: true +}); +// [{ id: "1", emptyColumn: "", emptyQuoted: "" }] +``` + +When multiple options apply, precedence is: + +1. `defaultValues` +2. `nullValues` + `nullRepresentation` +3. `preserveEmptyColumnAsEmptyString` / `preserveEmptyString` +4. 
Omit + ### Null Value Handling ```typescript @@ -929,6 +965,10 @@ interface CsvParserOptions { // Default values defaultValues?: Record; // Default values for empty cells + // Empty value preservation + preserveEmptyColumnAsEmptyString?: boolean; // Preserve unquoted empties: ,, + preserveEmptyString?: boolean; // Preserve quoted empties: "" + // Row grouping identifierColumn?: string; // Column for grouping continuation rows } @@ -1139,6 +1179,20 @@ Default values for columns when cells are empty. defaultValues: { status: 'pending', country: 'Unknown' } ``` +#### `preserveEmptyColumnAsEmptyString` + +Preserve unquoted empty columns (for example `,,`) as `''` in nested output. + +- Default: `false` + +#### `preserveEmptyString` + +Preserve explicitly quoted empty strings (for example `""` with the default quote character) as `''` in nested output. + +- Default: `true` + +Both options work for `CsvParser` and `CsvStreamParser`. Set `preserveEmptyString: false` if you want quoted empties omitted. 
+ ### Complete Example with All Options ```typescript @@ -1188,7 +1242,11 @@ const result = await CsvParser.parseFile('./data.csv', { nullRepresentation: 'null', // Defaults - defaultValues: { status: 'pending' } + defaultValues: { status: 'pending' }, + + // Empty value preservation + preserveEmptyColumnAsEmptyString: true, + preserveEmptyString: true }); ``` diff --git a/src/csv-parser.ts b/src/csv-parser.ts index e6ef3fa..faba96f 100644 --- a/src/csv-parser.ts +++ b/src/csv-parser.ts @@ -155,7 +155,7 @@ export abstract class CsvParser { * ``` */ static parseString(csvContent: string, options: CsvParserOptions = {}): T[] { - const records = CsvReader.parse(csvContent, options); + const records = CsvReader.parseWithQuotedEmptyProvenance(csvContent, options); return NestedJsonConverter.convert(records, options) as T[]; } } diff --git a/src/csv-reader.ts b/src/csv-reader.ts index 95243ff..c6d3241 100644 --- a/src/csv-reader.ts +++ b/src/csv-reader.ts @@ -1,4 +1,11 @@ import { CsvDuplicateHeaderError, CsvValidationError } from "./errors"; +import { + type InternalCsvCellValue, + type InternalCsvRecord, + isEmptyCsvCellValue, + QUOTED_EMPTY_CELL, + toPublicCsvCellValue, +} from "./internal-empty-cell"; import type { CsvParserOptions, CsvRecord, DuplicateHeaderStrategy, ValidationMode } from "./types"; /** @@ -35,6 +42,23 @@ export class CsvReader { * ``` */ static parse(content: string, options: CsvParserOptions = {}): CsvRecord[] { + return this.parseInternal(content, options, false) as CsvRecord[]; + } + + /** + * Parse CSV content with internal quoted-empty provenance tracking. 
+ * + * @internal + */ + static parseWithQuotedEmptyProvenance(content: string, options: CsvParserOptions = {}): InternalCsvRecord[] { + return this.parseInternal(content, options, true); + } + + private static parseInternal( + content: string, + options: CsvParserOptions, + preserveQuotedEmpty: boolean + ): InternalCsvRecord[] { if (!content || content.trim() === "") { return []; } @@ -91,13 +115,13 @@ export class CsvReader { headers = processedHeaders; // Parse data rows - const records: CsvRecord[] = []; + const records: InternalCsvRecord[] = []; let dataRowIndex = 0; for (let i = dataStartIndex + 1; i < lines.length; i++) { const line = lines[i].trim(); if (line === "") continue; // Skip empty lines - const values = this.parseLine(lines[i], delimiter, quote); + const values = this.parseLine(lines[i], delimiter, quote, preserveQuotedEmpty); // Validate column count if (values.length > headers.length && validationMode !== "ignore") { @@ -113,48 +137,11 @@ export class CsvReader { } } - const record: CsvRecord = {}; - for (let j = 0; j < headers.length; j++) { - // Get the original column index for this filtered header - const originalIndex = includedIndices[j]; - let value = originalIndex < values.length ? values[originalIndex] : ""; - - // Apply default value if cell is empty - if (value === "" && options.defaultValues?.[headers[j]] !== undefined) { - value = options.defaultValues[headers[j]]; - } - - const header = headers[j]; - - // Handle duplicate values based on strategy - if (duplicateIndices.has(j)) { - switch (dupStrategy) { - case "first": - // Only set if not already present - if (!(header in record)) { - record[header] = value; - } - break; - case "combine": - // Combine into comma-separated values (arrays handled by NestedJsonConverter) - if (header in record) { - record[header] = record[header] ? 
`${record[header]},${value}` : value; - } else { - record[header] = value; - } - break; - default: - // Just overwrite (default behavior) - record[header] = value; - break; - } - } else { - record[header] = value; - } - } + const record = this.createRecord(values, headers, includedIndices, duplicateIndices, dupStrategy, options); // Apply row filter if specified - if (options.rowFilter && !options.rowFilter(record, dataRowIndex)) { + const rowFilterRecord = preserveQuotedEmpty ? this.toPublicRecord(record) : (record as CsvRecord); + if (options.rowFilter && !options.rowFilter(rowFilterRecord, dataRowIndex)) { dataRowIndex++; continue; } @@ -166,6 +153,61 @@ export class CsvReader { return records; } + private static createRecord( + values: InternalCsvCellValue[], + headers: string[], + includedIndices: number[], + duplicateIndices: Set, + dupStrategy: DuplicateHeaderStrategy, + options: CsvParserOptions + ): InternalCsvRecord { + const record: InternalCsvRecord = {}; + + for (let j = 0; j < headers.length; j++) { + const originalIndex = includedIndices[j]; + let value: InternalCsvCellValue = originalIndex < values.length ? values[originalIndex] : ""; + + if (isEmptyCsvCellValue(value) && options.defaultValues?.[headers[j]] !== undefined) { + value = options.defaultValues[headers[j]]; + } + + const header = headers[j]; + + if (duplicateIndices.has(j)) { + switch (dupStrategy) { + case "first": + if (!(header in record)) { + record[header] = value; + } + break; + case "combine": { + const existing = header in record ? toPublicCsvCellValue(record[header]) : ""; + const incoming = toPublicCsvCellValue(value); + record[header] = existing ? 
`${existing},${incoming}` : incoming; + break; + } + default: + record[header] = value; + break; + } + } else { + record[header] = value; + } + } + + return record; + } + + private static toPublicRecord(record: InternalCsvRecord): CsvRecord { + const publicRecord: CsvRecord = {}; + + for (const [header, value] of Object.entries(record)) { + publicRecord[header] = toPublicCsvCellValue(value); + } + + return publicRecord; + } + /** * Strip BOM (Byte Order Mark) from the beginning of content. * Handles UTF-8 and UTF-16 BOMs. @@ -266,10 +308,18 @@ export class CsvReader { * // ['1', 'Say "Hello"', 'test'] * ``` */ - static parseLine(line: string, delimiter = ",", quote = '"'): string[] { - const values: string[] = []; + static parseLine(line: string, delimiter?: string, quote?: string): string[]; + static parseLine( + line: string, + delimiter: string, + quote: string, + preserveQuotedEmpty: boolean + ): InternalCsvCellValue[]; + static parseLine(line: string, delimiter = ",", quote = '"', preserveQuotedEmpty = false): InternalCsvCellValue[] { + const values: InternalCsvCellValue[] = []; let currentValue = ""; let insideQuotes = false; + let fieldWasQuoted = false; for (let i = 0; i < line.length; i++) { const char = line[i]; @@ -283,22 +333,36 @@ export class CsvReader { } else { // Toggle quote state insideQuotes = !insideQuotes; + fieldWasQuoted = true; } } else if (char === delimiter && !insideQuotes) { // Field delimiter - values.push(currentValue); + values.push(this.finalizeParsedCellValue(currentValue, fieldWasQuoted, preserveQuotedEmpty)); currentValue = ""; + fieldWasQuoted = false; } else { currentValue += char; } } // Don't forget the last value - values.push(currentValue); + values.push(this.finalizeParsedCellValue(currentValue, fieldWasQuoted, preserveQuotedEmpty)); return values; } + private static finalizeParsedCellValue( + value: string, + fieldWasQuoted: boolean, + preserveQuotedEmpty: boolean + ): InternalCsvCellValue { + if (preserveQuotedEmpty && 
fieldWasQuoted && value === "") { + return QUOTED_EMPTY_CELL; + } + + return value; + } + /** * Detect duplicate headers and process them according to the strategy. * Returns the processed headers and a set of indices that are duplicates. diff --git a/src/csv-stream-parser.ts b/src/csv-stream-parser.ts index 7bdf2be..93d5369 100644 --- a/src/csv-stream-parser.ts +++ b/src/csv-stream-parser.ts @@ -1,5 +1,13 @@ import { Transform, type TransformCallback, type TransformOptions } from "node:stream"; import { CsvDuplicateHeaderError, CsvParseError } from "./errors"; +import { + type InternalCsvCellValue, + type InternalCsvRecord, + isEmptyCsvCellValue, + isQuotedEmptyCell, + QUOTED_EMPTY_CELL, + toPublicCsvCellValue, +} from "./internal-empty-cell"; import { NestedJsonConverter } from "./nested-json-converter"; import type { CsvParserOptions, CsvRecord, DuplicateHeaderStrategy, NestedObject, ProgressCallback } from "./types"; @@ -133,7 +141,7 @@ export class CsvStreamParser extends Transform { private recordBatch: NestedObject[] = []; // Continuation grouping - private groupedRecords: CsvRecord[] = []; + private groupedRecords: InternalCsvRecord[] = []; private groupingIdentifierColumn: string | null = null; private maxContinuationGroupSize: number; @@ -401,10 +409,9 @@ export class CsvStreamParser extends Transform { return; } - const values = this.parseLine(line); - // First non-skipped line is the header if (!this.headersProcessed) { + const values = this.parseLine(line, false); const originalHeaders = values; // Apply column filtering FIRST (based on original header names) @@ -436,11 +443,13 @@ export class CsvStreamParser extends Transform { return; } + const values = this.parseLine(line, true); + // Create record from values const record = this.createRecord(values); // Apply row filter if specified - if (this.options.rowFilter && !this.options.rowFilter(record, this.dataRowIndex)) { + if (this.options.rowFilter && 
!this.options.rowFilter(this.toPublicRecord(record), this.dataRowIndex)) { this.dataRowIndex++; return; } @@ -495,7 +504,7 @@ export class CsvStreamParser extends Transform { /** * Add a record to the active continuation group with memory guardrails. */ - private pushGroupedRecord(record: CsvRecord): void { + private pushGroupedRecord(record: InternalCsvRecord): void { if (this.groupedRecords.length >= this.maxContinuationGroupSize) { const identifierColumn = this.groupingIdentifierColumn ?? this.options.identifierColumn ?? "(unknown)"; throw new CsvParseError( @@ -509,11 +518,10 @@ export class CsvStreamParser extends Transform { /** * Buffer records and flush groups when a new identifier value is encountered. */ - private bufferGroupedRecord(record: CsvRecord): void { + private bufferGroupedRecord(record: InternalCsvRecord): void { const identifierColumn = this.groupingIdentifierColumn; const identifierValue = identifierColumn ? record[identifierColumn] : undefined; - const startsNewGroup = - identifierValue !== undefined && identifierValue !== null && String(identifierValue).trim() !== ""; + const startsNewGroup = this.hasIdentifierValue(identifierValue); if (this.groupedRecords.length === 0) { if (!startsNewGroup) { @@ -535,6 +543,14 @@ export class CsvStreamParser extends Transform { this.pushGroupedRecord(record); } + private hasIdentifierValue(value: InternalCsvCellValue | undefined): boolean { + if (isEmptyCsvCellValue(value)) { + return false; + } + + return String(value).trim() !== ""; + } + /** * Flush buffered grouped records through NestedJsonConverter. */ @@ -599,10 +615,13 @@ export class CsvStreamParser extends Transform { /** * Parse a single line into values. 
*/ - private parseLine(line: string): string[] { - const values: string[] = []; + private parseLine(line: string, preserveQuotedEmpty: false): string[]; + private parseLine(line: string, preserveQuotedEmpty: true): InternalCsvCellValue[]; + private parseLine(line: string, preserveQuotedEmpty: boolean): InternalCsvCellValue[] { + const values: InternalCsvCellValue[] = []; let currentValue = ""; let insideQuotes = false; + let fieldWasQuoted = false; for (let i = 0; i < line.length; i++) { const char = line[i]; @@ -615,31 +634,45 @@ export class CsvStreamParser extends Transform { i++; } else { insideQuotes = !insideQuotes; + fieldWasQuoted = true; } } else if (char === this.delimiter && !insideQuotes) { - values.push(currentValue); + values.push(this.finalizeParsedCellValue(currentValue, fieldWasQuoted, preserveQuotedEmpty)); currentValue = ""; + fieldWasQuoted = false; } else { currentValue += char; } } - values.push(currentValue); + values.push(this.finalizeParsedCellValue(currentValue, fieldWasQuoted, preserveQuotedEmpty)); return values; } + private finalizeParsedCellValue( + value: string, + fieldWasQuoted: boolean, + preserveQuotedEmpty: boolean + ): InternalCsvCellValue { + if (preserveQuotedEmpty && fieldWasQuoted && value === "") { + return QUOTED_EMPTY_CELL; + } + + return value; + } + /** * Create a record object from values array. */ - private createRecord(values: string[]): Record { - const record: Record = {}; + private createRecord(values: InternalCsvCellValue[]): InternalCsvRecord { + const record: InternalCsvRecord = {}; for (let i = 0; i < this.headers.length; i++) { // Get the original column index for this filtered header const originalIndex = this.includedIndices[i]; - let value = originalIndex < values.length ? values[originalIndex] : ""; + let value: InternalCsvCellValue = originalIndex < values.length ? 
values[originalIndex] : ""; // Apply default value if cell is empty - if (value === "" && this.options.defaultValues?.[this.headers[i]] !== undefined) { + if (isEmptyCsvCellValue(value) && this.options.defaultValues?.[this.headers[i]] !== undefined) { value = this.options.defaultValues[this.headers[i]]; } @@ -654,14 +687,17 @@ export class CsvStreamParser extends Transform { record[header] = value; } break; - case "combine": + case "combine": { // Combine into comma-separated values + const incoming = toPublicCsvCellValue(value); if (header in record) { - record[header] = record[header] ? `${record[header]},${value}` : value; + const existing = toPublicCsvCellValue(record[header]); + record[header] = existing ? `${existing},${incoming}` : incoming; } else { - record[header] = value; + record[header] = incoming; } break; + } default: // Just overwrite (default behavior) record[header] = value; @@ -674,12 +710,22 @@ export class CsvStreamParser extends Transform { return record; } + private toPublicRecord(record: InternalCsvRecord): CsvRecord { + const publicRecord: CsvRecord = {}; + + for (const [header, value] of Object.entries(record)) { + publicRecord[header] = toPublicCsvCellValue(value); + } + + return publicRecord; + } + /** * Apply value transformations to a record. 
*/ private applyTransformations( - record: Record - ): Record { + record: InternalCsvRecord + ): Record { const { autoParseNumbers, preserveUnsafeIntegersAsString, @@ -694,10 +740,23 @@ export class CsvStreamParser extends Transform { return record; } - const transformed: Record = {}; + const transformed: Record = + {}; for (const [header, value] of Object.entries(record)) { - let transformedValue: string | number | boolean | Date | null | undefined = value; + let transformedValue: string | number | boolean | Date | null | undefined | typeof QUOTED_EMPTY_CELL = value; + + if (isQuotedEmptyCell(value)) { + if (nullValues !== undefined && this.nullSet.has("")) { + transformedValue = this.applyNullRepresentation(nullRepresentation); + if (nullRepresentation === "omit") { + continue; + } + } + + transformed[header] = transformedValue; + continue; + } // Handle empty values if (value === "") { @@ -826,12 +885,27 @@ export class CsvStreamParser extends Transform { /** * Unflatten a record with dot-notation keys into a nested object. */ - private unflatten(record: Record): NestedObject { + private unflatten( + record: Record + ): NestedObject { const result: NestedObject = {}; + const preserveEmptyColumns = this.options.preserveEmptyColumnAsEmptyString === true; + const preserveEmptyStrings = this.options.preserveEmptyString !== false; for (const [key, value] of Object.entries(record)) { - // Skip empty strings and undefined, but preserve null as a valid value - if (value === "" || value === undefined) continue; + if (value === undefined) continue; + + let normalizedValue: string | number | boolean | Date | null; + + if (isQuotedEmptyCell(value)) { + if (!preserveEmptyStrings) continue; + normalizedValue = ""; + } else if (value === "") { + if (!preserveEmptyColumns) continue; + normalizedValue = ""; + } else { + normalizedValue = value; + } // Remove array suffix if present const arraySuffix = this.options.arraySuffixIndicator ?? 
"[]"; @@ -851,7 +925,7 @@ export class CsvStreamParser extends Transform { current = current[part] as NestedObject; } - current[parts[parts.length - 1]] = value; + current[parts[parts.length - 1]] = normalizedValue; } return result; diff --git a/src/internal-empty-cell.ts b/src/internal-empty-cell.ts new file mode 100644 index 0000000..535e250 --- /dev/null +++ b/src/internal-empty-cell.ts @@ -0,0 +1,16 @@ +export const QUOTED_EMPTY_CELL = Symbol("quoted-empty-cell"); + +export type InternalCsvCellValue = string | typeof QUOTED_EMPTY_CELL; +export type InternalCsvRecord = Record; + +export function isQuotedEmptyCell(value: unknown): value is typeof QUOTED_EMPTY_CELL { + return value === QUOTED_EMPTY_CELL; +} + +export function isEmptyCsvCellValue(value: InternalCsvCellValue | null | undefined): boolean { + return value === undefined || value === null || value === "" || isQuotedEmptyCell(value); +} + +export function toPublicCsvCellValue(value: InternalCsvCellValue): string { + return isQuotedEmptyCell(value) ? "" : value; +} diff --git a/src/nested-json-converter.ts b/src/nested-json-converter.ts index b86c15d..e5bb65b 100644 --- a/src/nested-json-converter.ts +++ b/src/nested-json-converter.ts @@ -1,4 +1,5 @@ import { CsvParseError } from "./errors"; +import { type InternalCsvRecord, isQuotedEmptyCell, QUOTED_EMPTY_CELL } from "./internal-empty-cell"; import type { CsvParserOptions, CsvRecord, @@ -9,6 +10,10 @@ import type { RowContext, } from "./types"; +type ConvertibleCsvRecord = CsvRecord | InternalCsvRecord; +type TransformedRecordValue = string | number | boolean | Date | null | undefined | typeof QUOTED_EMPTY_CELL; +type TransformedRecord = Record; + /** * Nested JSON conversion utilities. * Converts flat CSV records into nested JSON structures with automatic array detection. 
@@ -57,7 +62,7 @@ export class NestedJsonConverter { * // [{ id: 1, active: true, score: 95.5 }] * ``` */ - static convert(records: CsvRecord[], options: CsvParserOptions = {}): NestedObject[] { + static convert(records: ConvertibleCsvRecord[], options: CsvParserOptions = {}): NestedObject[] { if (records.length === 0) return []; const arraySuffix = options.arraySuffixIndicator ?? "[]"; @@ -101,21 +106,20 @@ export class NestedJsonConverter { const identifierColumn = configuredIdentifier ?? availableColumns[0]; // Group by the identifier column - const groups: NestedObject[][] = []; - let currentGroup: NestedObject[] = []; + const groups: TransformedRecord[][] = []; + let currentGroup: TransformedRecord[] = []; for (let rowIndex = 0; rowIndex < transformedRecords.length; rowIndex++) { const row = transformedRecords[rowIndex]; const identifierValue = row[identifierColumn]; - const hasIdentifierValue = - identifierValue !== undefined && identifierValue !== null && String(identifierValue).trim() !== ""; + const hasIdentifierValue = this.hasIdentifierValue(identifierValue); // Check if the identifier column has a value if (hasIdentifierValue) { if (currentGroup.length > 0) { groups.push(currentGroup); } - currentGroup = [row as NestedObject]; + currentGroup = [row]; } else { if (currentGroup.length === 0) { throw new CsvParseError( @@ -123,7 +127,12 @@ export class NestedJsonConverter { ); } - currentGroup.push(row as NestedObject); + // Continuation rows should never overwrite the grouping identifier value. 
+ const continuationRow: TransformedRecord = { + ...row, + [identifierColumn]: undefined, + }; + currentGroup.push(continuationRow); } } if (currentGroup.length > 0) { @@ -131,7 +140,9 @@ export class NestedJsonConverter { } // First pass: process all groups with hierarchy-aware merging - const processedGroups = groups.map(group => this.processGroupWithHierarchy(group, hierarchy, forcedArrayFields)); + const processedGroups = groups.map(group => + this.processGroupWithHierarchy(group, hierarchy, forcedArrayFields, options) + ); // Second pass: detect which fields are arrays in any group (auto-detected) const autoArrayFields = this.detectArrayFields(processedGroups); @@ -145,14 +156,22 @@ export class NestedJsonConverter { ); } + private static hasIdentifierValue(value: TransformedRecordValue): boolean { + if (value === undefined || value === null || isQuotedEmptyCell(value)) { + return false; + } + + return String(value).trim() !== ""; + } + /** * Apply value transformations (null detection, auto-parse numbers, booleans, dates, custom transformer). 
* Transformation order: nullValues → autoParseNumbers → autoParseBooleans → autoParseDates → valueTransformer */ private static applyValueTransformations( - records: CsvRecord[], + records: InternalCsvRecord[], options: CsvParserOptions - ): Record[] { + ): TransformedRecord[] { const { autoParseNumbers, preserveUnsafeIntegersAsString, @@ -168,14 +187,26 @@ export class NestedJsonConverter { // If no transformations are needed, return records as-is if (!autoParseNumbers && !autoParseBooleans && !autoParseDates && !valueTransformer && nullValues === undefined) { - return records; + return records as TransformedRecord[]; } return records.map(record => { - const transformed: Record = {}; + const transformed: TransformedRecord = {}; for (const [header, value] of Object.entries(record)) { - let transformedValue: string | number | boolean | Date | null | undefined = value; + let transformedValue: TransformedRecordValue = value; + + if (isQuotedEmptyCell(value)) { + if (nullValues !== undefined && nullSet.has("")) { + transformedValue = this.applyNullRepresentation(nullRepresentation); + if (nullRepresentation === "omit") { + continue; + } + } + + transformed[header] = transformedValue; + continue; + } // Skip empty values (unless they match nullValues) if (value === "") { @@ -312,7 +343,7 @@ export class NestedJsonConverter { return null; } - private static detectForcedArrayFields(records: CsvRecord[], arraySuffix: string): Set { + private static detectForcedArrayFields(records: ConvertibleCsvRecord[], arraySuffix: string): Set { if (records.length === 0 || !arraySuffix) return new Set(); const forcedFields = new Set(); @@ -341,11 +372,11 @@ export class NestedJsonConverter { return forcedFields; } - private static normalizeHeaders(records: CsvRecord[], arraySuffix: string): CsvRecord[] { - if (!arraySuffix) return records; + private static normalizeHeaders(records: ConvertibleCsvRecord[], arraySuffix: string): InternalCsvRecord[] { + if (!arraySuffix) return records as 
InternalCsvRecord[]; return records.map(record => { - const normalized: CsvRecord = {}; + const normalized: InternalCsvRecord = {}; for (const [key, value] of Object.entries(record)) { // Remove all occurrences of the array suffix from the key const normalizedKey = key @@ -358,10 +389,10 @@ export class NestedJsonConverter { }); } - private static processGroup(rows: NestedObject[]): NestedObject { + private static processGroup(rows: TransformedRecord[], options: CsvParserOptions): NestedObject { const result: NestedObject = {}; for (const row of rows) { - const rowObj = this.unflatten(row); + const rowObj = this.unflatten(row, options); this.deepMerge(result, rowObj); } return result; @@ -452,7 +483,7 @@ export class NestedJsonConverter { /** * Analyze a row to determine merge behavior based on which fields have values. */ - private static analyzeRowContext(row: NestedObject, hierarchy: ForcedArrayHierarchy): RowContext { + private static analyzeRowContext(row: TransformedRecord, hierarchy: ForcedArrayHierarchy): RowContext { const context: RowContext = { populatedPaths: new Set(), hasSiblingValues: new Map(), @@ -460,7 +491,7 @@ export class NestedJsonConverter { // Find all populated paths (normalized flat paths with values) for (const [key, value] of Object.entries(row)) { - if (value !== "" && value !== undefined && value !== null) { + if (!this.isEffectivelyEmptyValue(value)) { context.populatedPaths.add(key); } } @@ -501,15 +532,16 @@ export class NestedJsonConverter { * "create new parent item" vs "append to nested array in existing item". 
*/ private static processGroupWithHierarchy( - rows: NestedObject[], + rows: TransformedRecord[], hierarchy: ForcedArrayHierarchy, - forcedArrayFields: Set + forcedArrayFields: Set, + options: CsvParserOptions ): NestedObject { if (rows.length === 0) return {}; // If no forced array fields, use the simple merge if (forcedArrayFields.size === 0) { - return this.processGroup(rows); + return this.processGroup(rows, options); } const result: NestedObject = {}; @@ -518,7 +550,7 @@ export class NestedJsonConverter { for (let i = 0; i < rows.length; i++) { const row = rows[i]; const isFirstRow = i === 0; - const unflattened = this.unflatten(row); + const unflattened = this.unflatten(row, options); if (isFirstRow) { // First row: merge normally and track last items for each forced array @@ -633,7 +665,7 @@ export class NestedJsonConverter { rowContext: RowContext, mergeState: MergeState, forcedArrayFields: Set, - flatRow: NestedObject + flatRow: TransformedRecord ): void { // Determine which array paths need new items vs append to existing const createNewItemAt = new Set(); @@ -684,7 +716,11 @@ export class NestedJsonConverter { * Check if the only data under this path is in child forced array fields. * Returns false if there are no child forced arrays. 
*/ - private static hasOnlyChildArrayData(flatRow: NestedObject, path: string, hierarchy: ForcedArrayHierarchy): boolean { + private static hasOnlyChildArrayData( + flatRow: TransformedRecord, + path: string, + hierarchy: ForcedArrayHierarchy + ): boolean { const prefix = `${path}.`; const childArrayPaths = hierarchy.childrenMap.get(path) || new Set(); @@ -697,7 +733,7 @@ export class NestedJsonConverter { for (const [key, value] of Object.entries(flatRow)) { if (!key.startsWith(prefix)) continue; - if (value === "" || value === undefined) continue; + if (this.isEffectivelyEmptyValue(value)) continue; hasAnyData = true; @@ -724,10 +760,10 @@ export class NestedJsonConverter { /** * Check if there's any data under a given path in the flat row. */ - private static hasDataUnderPath(flatRow: NestedObject, path: string): boolean { + private static hasDataUnderPath(flatRow: TransformedRecord, path: string): boolean { const prefix = `${path}.`; for (const key of Object.keys(flatRow)) { - if ((key === path || key.startsWith(prefix)) && flatRow[key] !== "" && flatRow[key] !== undefined) { + if ((key === path || key.startsWith(prefix)) && !this.isEffectivelyEmptyValue(flatRow[key])) { return true; } } @@ -971,11 +1007,25 @@ export class NestedJsonConverter { return this.ensureArrayAtPath(obj, relativePath); } - private static unflatten(row: NestedObject): NestedObject { + private static unflatten(row: TransformedRecord, options: CsvParserOptions): NestedObject { const result: NestedObject = {}; + const preserveEmptyColumns = options.preserveEmptyColumnAsEmptyString === true; + const preserveEmptyStrings = options.preserveEmptyString !== false; + for (const [key, value] of Object.entries(row)) { - // Skip empty strings and undefined, but preserve null as a valid value - if (value === "" || value === undefined) continue; + if (value === undefined) continue; + + let normalizedValue: NestedValue; + + if (isQuotedEmptyCell(value)) { + if (!preserveEmptyStrings) continue; + 
normalizedValue = ""; + } else if (value === "") { + if (!preserveEmptyColumns) continue; + normalizedValue = ""; + } else { + normalizedValue = value as NestedValue; + } const parts = key.split("."); let current: NestedObject = result; @@ -984,11 +1034,15 @@ export class NestedJsonConverter { if (!current[part]) current[part] = {}; current = current[part] as NestedObject; } - current[parts[parts.length - 1]] = value; + current[parts[parts.length - 1]] = normalizedValue; } return result; } + private static isEffectivelyEmptyValue(value: unknown): boolean { + return value === "" || value === undefined || value === null || isQuotedEmptyCell(value); + } + private static deepMerge(target: NestedObject, source: NestedObject): void { for (const key of Object.keys(source)) { const sourceValue = source[key]; diff --git a/src/types.ts b/src/types.ts index 911b151..661cd4e 100644 --- a/src/types.ts +++ b/src/types.ts @@ -462,6 +462,27 @@ export interface CsvParserOptions { */ nullRepresentation?: NullRepresentation; + /** + * Preserve unquoted empty cells as empty strings in nested output. + * + * @remarks + * This option only applies to unquoted empty columns such as `,,`. + * Explicit quoted empty strings are controlled by `preserveEmptyString`. + * + * @default false + */ + preserveEmptyColumnAsEmptyString?: boolean; + + /** + * Preserve explicitly quoted empty strings as empty strings in nested output. + * + * @remarks + * This option applies to values such as `""` (or the configured quote character equivalent). + * + * @default true + */ + preserveEmptyString?: boolean; + /** * Maximum number of records to parse. * Parsing stops after this limit is reached. 
diff --git a/tests/csv-stream-parser.test.ts b/tests/csv-stream-parser.test.ts index ef05348..77788f5 100644 --- a/tests/csv-stream-parser.test.ts +++ b/tests/csv-stream-parser.test.ts @@ -506,8 +506,8 @@ Line 2"`; records.push(record as NestedObject); } - // Empty values are omitted - expect(records).toEqual([{ id: "1" }]); + // Explicit quoted empties are preserved by default + expect(records).toEqual([{ id: "1", value: "" }]); }); }); @@ -674,6 +674,55 @@ g1,2,c`; expect(records).toEqual([{ id: "1", name: "Alice" }]); }); + it("should preserve only unquoted empty columns when configured", async () => { + const csvContent = 'id,emptyColumn,emptyQuoted\n1,,""'; + const stream = Readable.from([csvContent]); + const parser = new CsvStreamParser({ + preserveEmptyColumnAsEmptyString: true, + preserveEmptyString: false, + }); + + const records: NestedObject[] = []; + for await (const record of stream.pipe(parser)) { + records.push(record as NestedObject); + } + + expect(records).toEqual([{ id: "1", emptyColumn: "" }]); + }); + + it("should preserve only quoted empty strings when configured", async () => { + const csvContent = 'id,emptyColumn,emptyQuoted\n1,,""'; + const stream = Readable.from([csvContent]); + const parser = new CsvStreamParser({ + preserveEmptyString: true, + }); + + const records: NestedObject[] = []; + for await (const record of stream.pipe(parser)) { + records.push(record as NestedObject); + } + + expect(records).toEqual([{ id: "1", emptyQuoted: "" }]); + }); + + it("should treat quoted-empty identifier values as continuation rows", async () => { + const csvContent = 'id,tags[]\n1,a\n"",b\n2,c'; + const stream = Readable.from([csvContent]); + const parser = new CsvStreamParser({ + preserveEmptyString: true, + }); + + const records: NestedObject[] = []; + for await (const record of stream.pipe(parser)) { + records.push(record as NestedObject); + } + + expect(records).toEqual([ + { id: "1", tags: ["a", "b"] }, + { id: "2", tags: ["c"] }, + ]); + }); + 
it("should throw when row with all empty values starts a group", async () => { const csvContent = `id,name,email ,,`; diff --git a/tests/parser-options.test.ts b/tests/parser-options.test.ts index b57ce7e..1d826da 100644 --- a/tests/parser-options.test.ts +++ b/tests/parser-options.test.ts @@ -1,3 +1,4 @@ +import { Readable } from "node:stream"; import type { HeaderTransformer, RowFilter } from "../src"; import { CsvParser, CsvStreamParser } from "../src"; @@ -496,6 +497,128 @@ describe("Parser Options - New Features", () => { }); }); + describe("empty value preservation", () => { + it("should omit unquoted empty values and preserve quoted empties by default", () => { + const csv = 'id,emptyColumn,emptyQuoted\n1,,""'; + + const result = CsvParser.parseString(csv); + + expect(result).toEqual([{ id: "1", emptyQuoted: "" }]); + }); + + it("should preserve only unquoted empty columns", () => { + const csv = 'id,emptyColumn,emptyQuoted\n1,,""'; + + const result = CsvParser.parseString(csv, { + preserveEmptyColumnAsEmptyString: true, + preserveEmptyString: false, + }); + + expect(result).toEqual([{ id: "1", emptyColumn: "" }]); + }); + + it("should preserve only quoted empty strings", () => { + const csv = 'id,emptyColumn,emptyQuoted\n1,,""'; + + const result = CsvParser.parseString(csv, { + preserveEmptyString: true, + }); + + expect(result).toEqual([{ id: "1", emptyQuoted: "" }]); + }); + + it("should preserve both kinds of empty values when both options are enabled", () => { + const csv = 'id,emptyColumn,emptyQuoted\n1,,""'; + + const result = CsvParser.parseString(csv, { + preserveEmptyColumnAsEmptyString: true, + preserveEmptyString: true, + }); + + expect(result).toEqual([{ id: "1", emptyColumn: "", emptyQuoted: "" }]); + }); + + it("should keep defaultValues precedence over preserve options", () => { + const csv = 'id,emptyColumn,emptyQuoted\n1,,""'; + + const result = CsvParser.parseString(csv, { + defaultValues: { + emptyColumn: "fallback-column", + emptyQuoted: 
"fallback-quoted", + }, + preserveEmptyColumnAsEmptyString: true, + preserveEmptyString: true, + }); + + expect(result).toEqual([ + { + id: "1", + emptyColumn: "fallback-column", + emptyQuoted: "fallback-quoted", + }, + ]); + }); + + it("should apply null handling before preserve options", () => { + const csv = 'id,emptyColumn,emptyQuoted\n1,,""'; + + const result = CsvParser.parseString(csv, { + nullValues: [""], + nullRepresentation: "null", + preserveEmptyColumnAsEmptyString: true, + preserveEmptyString: true, + }); + + expect(result).toEqual([ + { + id: "1", + emptyColumn: null, + emptyQuoted: null, + }, + ]); + }); + + it("should keep quoted-empty identifier values as continuation rows", () => { + const csv = 'id,tags[]\n1,a\n"",b\n2,c'; + + const result = CsvParser.parseString(csv, { + preserveEmptyString: true, + }); + + expect(result).toEqual([ + { id: "1", tags: ["a", "b"] }, + { id: "2", tags: ["c"] }, + ]); + }); + + it("should support quoted-empty preservation with custom quote character", () => { + const csv = "id,emptyQuoted,emptyColumn\n1,'',\n2,'value',"; + + const result = CsvParser.parseString(csv, { + quote: "'", + preserveEmptyString: true, + }); + + expect(result).toEqual([ + { id: "1", emptyQuoted: "" }, + { id: "2", emptyQuoted: "value" }, + ]); + }); + + it("should match CsvStreamParser output for empty preservation scenarios", async () => { + const csv = 'id,emptyColumn,emptyQuoted\n1,,""\n2,filled,""'; + const options = { + preserveEmptyString: true, + preserveEmptyColumnAsEmptyString: false, + }; + + const fromParser = CsvParser.parseString(csv, options); + const fromStream = await CsvStreamParser.parseStream(Readable.from([csv]), options); + + expect(fromStream).toEqual(fromParser); + }); + }); + // ============================================================================= // Combined Features Tests // =============================================================================