Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions .changeset/late-planets-study.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
---
"@cerios/csv-nested-json": patch
---

Add empty-value preservation controls for nested output in both `CsvParser` and `CsvStreamParser`.

- Add `preserveEmptyColumnAsEmptyString` to preserve unquoted empty cells (for example `,,`).
- Add `preserveEmptyString` (enabled by default) to preserve explicitly quoted empty cells, i.e. an opening and a closing quote character with nothing between them (for example `""` with the default quote character).
- Keep empty-value behavior aligned between sync and streaming parsing paths.
- Apply empty-value precedence consistently: `defaultValues`, then `nullValues` + `nullRepresentation`, then preserve options, then omit.
- Treat quoted-empty identifier values as continuation rows and prevent continuation rows from overriding the active grouping identifier.
60 changes: 59 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -636,6 +636,42 @@ const result = CsvParser.parseString(csvContent, {
// ]
```

### Empty Value Preservation

By default, unquoted empty values are omitted and explicitly quoted empty values are preserved. You can control each case independently.

```typescript
const csvContent = `id,emptyColumn,emptyQuoted
1,,""`;

// Preserve only unquoted empties: ,, -> ''
const preserveColumns = CsvParser.parseString(csvContent, {
preserveEmptyColumnAsEmptyString: true,
preserveEmptyString: false
});
// [{ id: "1", emptyColumn: "" }]

// Preserve only quoted empties: "" -> ''
const preserveQuoted = CsvParser.parseString(csvContent, {
preserveEmptyString: true
});
// [{ id: "1", emptyQuoted: "" }]

// Preserve both
const preserveBoth = CsvParser.parseString(csvContent, {
preserveEmptyColumnAsEmptyString: true,
preserveEmptyString: true
});
// [{ id: "1", emptyColumn: "", emptyQuoted: "" }]
```

When more than one option applies to the same empty cell, the first applicable entry in this list wins:

1. `defaultValues`
2. `nullValues` + `nullRepresentation`
3. `preserveEmptyColumnAsEmptyString` / `preserveEmptyString`
4. Omit

### Null Value Handling

```typescript
Expand Down Expand Up @@ -929,6 +965,10 @@ interface CsvParserOptions {
// Default values
defaultValues?: Record<string, string>; // Default values for empty cells

// Empty value preservation
preserveEmptyColumnAsEmptyString?: boolean; // Preserve unquoted empties: ,,
preserveEmptyString?: boolean; // Preserve quoted empties: ""

// Row grouping
identifierColumn?: string; // Column for grouping continuation rows
}
Expand Down Expand Up @@ -1139,6 +1179,20 @@ Default values for columns when cells are empty.
defaultValues: { status: 'pending', country: 'Unknown' }
```

#### `preserveEmptyColumnAsEmptyString`

Preserve unquoted empty columns (for example `,,`) as `''` in nested output.

- Default: `false`

#### `preserveEmptyString`

Preserve explicitly quoted empty strings (for example `""` with the default quote character) as `''` in nested output.

- Default: `true`

Both options work for `CsvParser` and `CsvStreamParser`. Set `preserveEmptyString: false` if you want quoted empties omitted.

### Complete Example with All Options

```typescript
Expand Down Expand Up @@ -1188,7 +1242,11 @@ const result = await CsvParser.parseFile('./data.csv', {
nullRepresentation: 'null',

// Defaults
defaultValues: { status: 'pending' }
defaultValues: { status: 'pending' },

// Empty value preservation
preserveEmptyColumnAsEmptyString: true,
preserveEmptyString: true
});
```

Expand Down
2 changes: 1 addition & 1 deletion src/csv-parser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,7 @@ export abstract class CsvParser {
* ```
*/
static parseString<T = NestedObject>(csvContent: string, options: CsvParserOptions = {}): T[] {
const records = CsvReader.parse(csvContent, options);
const records = CsvReader.parseWithQuotedEmptyProvenance(csvContent, options);
return NestedJsonConverter.convert(records, options) as T[];
}
}
156 changes: 110 additions & 46 deletions src/csv-reader.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,11 @@
import { CsvDuplicateHeaderError, CsvValidationError } from "./errors";
import {
type InternalCsvCellValue,
type InternalCsvRecord,
isEmptyCsvCellValue,
QUOTED_EMPTY_CELL,
toPublicCsvCellValue,
} from "./internal-empty-cell";
import type { CsvParserOptions, CsvRecord, DuplicateHeaderStrategy, ValidationMode } from "./types";

/**
Expand Down Expand Up @@ -35,6 +42,23 @@ export class CsvReader {
* ```
*/
static parse(content: string, options: CsvParserOptions = {}): CsvRecord[] {
  // Public entry point: run the shared parser with quoted-empty tracking switched off
  // (third argument `false`) and expose the result under the public record type.
  const records = this.parseInternal(content, options, false);
  return records as CsvRecord[];
}

/**
 * Parse CSV content through the shared internal parser with quoted-empty
 * provenance tracking enabled, returning internal records instead of the
 * public `CsvRecord` shape.
 *
 * @param content - Raw CSV text.
 * @param options - Parser options, forwarded unchanged.
 * @returns The parsed rows as internal records.
 *
 * @internal
 */
static parseWithQuotedEmptyProvenance(content: string, options: CsvParserOptions = {}): InternalCsvRecord[] {
  const trackQuotedEmpty = true;
  return this.parseInternal(content, options, trackQuotedEmpty);
}

private static parseInternal(
content: string,
options: CsvParserOptions,
preserveQuotedEmpty: boolean
): InternalCsvRecord[] {
if (!content || content.trim() === "") {
return [];
}
Expand Down Expand Up @@ -91,13 +115,13 @@ export class CsvReader {
headers = processedHeaders;

// Parse data rows
const records: CsvRecord[] = [];
const records: InternalCsvRecord[] = [];
let dataRowIndex = 0;
for (let i = dataStartIndex + 1; i < lines.length; i++) {
const line = lines[i].trim();
if (line === "") continue; // Skip empty lines

const values = this.parseLine(lines[i], delimiter, quote);
const values = this.parseLine(lines[i], delimiter, quote, preserveQuotedEmpty);

// Validate column count
if (values.length > headers.length && validationMode !== "ignore") {
Expand All @@ -113,48 +137,11 @@ export class CsvReader {
}
}

const record: CsvRecord = {};
for (let j = 0; j < headers.length; j++) {
// Get the original column index for this filtered header
const originalIndex = includedIndices[j];
let value = originalIndex < values.length ? values[originalIndex] : "";

// Apply default value if cell is empty
if (value === "" && options.defaultValues?.[headers[j]] !== undefined) {
value = options.defaultValues[headers[j]];
}

const header = headers[j];

// Handle duplicate values based on strategy
if (duplicateIndices.has(j)) {
switch (dupStrategy) {
case "first":
// Only set if not already present
if (!(header in record)) {
record[header] = value;
}
break;
case "combine":
// Combine into comma-separated values (arrays handled by NestedJsonConverter)
if (header in record) {
record[header] = record[header] ? `${record[header]},${value}` : value;
} else {
record[header] = value;
}
break;
default:
// Just overwrite (default behavior)
record[header] = value;
break;
}
} else {
record[header] = value;
}
}
const record = this.createRecord(values, headers, includedIndices, duplicateIndices, dupStrategy, options);

// Apply row filter if specified
if (options.rowFilter && !options.rowFilter(record, dataRowIndex)) {
const rowFilterRecord = preserveQuotedEmpty ? this.toPublicRecord(record) : (record as CsvRecord);
if (options.rowFilter && !options.rowFilter(rowFilterRecord, dataRowIndex)) {
dataRowIndex++;
continue;
}
Expand All @@ -166,6 +153,61 @@ export class CsvReader {
return records;
}

/**
 * Build the record for one data row from its parsed cell values.
 *
 * For every header position `j`, the cell is read from `values` at
 * `includedIndices[j]`; a position past the end of a short row yields `""`.
 * A cell that `isEmptyCsvCellValue` reports as empty is replaced by
 * `options.defaultValues[header]` when that entry is defined. Header positions
 * listed in `duplicateIndices` are then merged according to `dupStrategy`:
 *
 * - `"first"`: keep the value already stored for the header, if any.
 * - `"combine"`: join the stored and incoming values with a comma, after passing
 *   both through `toPublicCsvCellValue` (presumably the public string form of an
 *   internal cell value — confirm against `internal-empty-cell`).
 * - anything else: the later value overwrites the earlier one.
 *
 * @param values - Parsed cell values of the row, in original column order.
 * @param headers - Headers to emit.
 * @param includedIndices - For each header position, the index of its cell in `values`.
 * @param duplicateIndices - Header positions flagged as duplicates.
 * @param dupStrategy - How values of duplicate headers are merged.
 * @param options - Parser options; only `defaultValues` is read here.
 * @returns The record keyed by header.
 */
private static createRecord(
  values: InternalCsvCellValue[],
  headers: string[],
  includedIndices: number[],
  duplicateIndices: Set<number>,
  dupStrategy: DuplicateHeaderStrategy,
  options: CsvParserOptions
): InternalCsvRecord {
  const record: InternalCsvRecord = {};

  // Own-property test only. `header in record` also matches names inherited from
  // Object.prototype ("constructor", "toString", ...), which made a duplicated header
  // with such a name look "already present" before any value had been stored.
  const hasOwnHeader = (key: string): boolean => Object.prototype.hasOwnProperty.call(record, key);

  for (let j = 0; j < headers.length; j++) {
    const header = headers[j];
    const originalIndex = includedIndices[j];
    let value: InternalCsvCellValue = originalIndex < values.length ? values[originalIndex] : "";

    // Defaults take effect only for empty cells and only when one is configured.
    if (isEmptyCsvCellValue(value) && options.defaultValues?.[header] !== undefined) {
      value = options.defaultValues[header];
    }

    if (!duplicateIndices.has(j)) {
      record[header] = value;
      continue;
    }

    switch (dupStrategy) {
      case "first":
        if (!hasOwnHeader(header)) {
          record[header] = value;
        }
        break;
      case "combine": {
        const existing = hasOwnHeader(header) ? toPublicCsvCellValue(record[header]) : "";
        const incoming = toPublicCsvCellValue(value);
        // A falsy (empty) existing value is replaced rather than prefixed with a comma.
        record[header] = existing ? `${existing},${incoming}` : incoming;
        break;
      }
      default:
        record[header] = value;
        break;
    }
  }

  return record;
}

/**
 * Copy an internal record into a new `CsvRecord`, passing every cell value
 * through `toPublicCsvCellValue`. The input record is not modified.
 *
 * @param record - Internal record to convert.
 * @returns A new record with the same headers and converted values.
 */
private static toPublicRecord(record: InternalCsvRecord): CsvRecord {
  return Object.entries(record).reduce<CsvRecord>((converted, [key, cell]) => {
    converted[key] = toPublicCsvCellValue(cell);
    return converted;
  }, {});
}

/**
* Strip BOM (Byte Order Mark) from the beginning of content.
* Handles UTF-8 and UTF-16 BOMs.
Expand Down Expand Up @@ -266,10 +308,18 @@ export class CsvReader {
* // ['1', 'Say "Hello"', 'test']
* ```
*/
static parseLine(line: string, delimiter = ",", quote = '"'): string[] {
const values: string[] = [];
static parseLine(line: string, delimiter?: string, quote?: string): string[];
static parseLine(
line: string,
delimiter: string,
quote: string,
preserveQuotedEmpty: boolean
): InternalCsvCellValue[];
static parseLine(line: string, delimiter = ",", quote = '"', preserveQuotedEmpty = false): InternalCsvCellValue[] {
const values: InternalCsvCellValue[] = [];
let currentValue = "";
let insideQuotes = false;
let fieldWasQuoted = false;

for (let i = 0; i < line.length; i++) {
const char = line[i];
Expand All @@ -283,22 +333,36 @@ export class CsvReader {
} else {
// Toggle quote state
insideQuotes = !insideQuotes;
fieldWasQuoted = true;
}
} else if (char === delimiter && !insideQuotes) {
// Field delimiter
values.push(currentValue);
values.push(this.finalizeParsedCellValue(currentValue, fieldWasQuoted, preserveQuotedEmpty));
currentValue = "";
fieldWasQuoted = false;
} else {
currentValue += char;
}
}

// Don't forget the last value
values.push(currentValue);
values.push(this.finalizeParsedCellValue(currentValue, fieldWasQuoted, preserveQuotedEmpty));

return values;
}

/**
 * Decide the value stored for a field once it has been fully read.
 *
 * @param value - Accumulated text of the field.
 * @param fieldWasQuoted - Whether a quote character was seen while reading the field.
 * @param preserveQuotedEmpty - Whether quoted-empty fields should be marked.
 * @returns `QUOTED_EMPTY_CELL` when marking is enabled and the field was quoted
 *   and empty; otherwise `value` unchanged.
 */
private static finalizeParsedCellValue(
  value: string,
  fieldWasQuoted: boolean,
  preserveQuotedEmpty: boolean
): InternalCsvCellValue {
  const isQuotedEmpty = fieldWasQuoted && value === "";
  return preserveQuotedEmpty && isQuotedEmpty ? QUOTED_EMPTY_CELL : value;
}

/**
* Detect duplicate headers and process them according to the strategy.
* Returns the processed headers and a set of indices that are duplicates.
Expand Down
Loading
Loading