From 92e1d92deaf8474745e7654fc6dfde78f2148f3d Mon Sep 17 00:00:00 2001 From: Steve Hansen Date: Sat, 16 May 2026 12:51:54 +0200 Subject: [PATCH 1/2] refactor: unify the four CsvReader read loops behind one JIT-devirtualized engine (#118) Collapse the five duplicated read-loop implementations (Read / ReadAsSpan / ReadAsync / ReadFromMemoryOptimized / ReadFromMemory) into a single internal Enumerate + EnumerateAsync<...> engine. TSource and TFactory are generic struct constraints (struct, ILineSource / IAsyncLineSource / IRowFactory), so the JIT specializes one monomorphized native body per concrete (source, factory) pair with no virtual dispatch on the per-row hot path. Correctness: the HeaderAbsent + multiline-first-record pre-pass now applies uniformly across all five paths. Previously ReadAsync, ReadFromMemoryOptimized, and ReadFromMemory silently miscounted columns when HeaderMode = HeaderAbsent and the first record contained an embedded newline inside a quoted field. Pinned by the cross-path regression test in Csv.Tests/EngineUnificationTests.cs. Performance: TextReader-backed paths thread the natural-string-form alongside the ReadOnlyMemory view via ILineSource.TryReadLine's out parameter, so StringRowFactory / SpanRowFactory pass the original string straight into the row ctor instead of paying new string(span) per row. MemoryReaderLineSource.Concat delegates to StringHelpers.Concat (single allocation) instead of the rent-then-allocate anti-pattern. EnumerateAsync plumbs CancellationToken all the way to TextReader.ReadLineAsync(ct) -- the async path had no cancellation support before this change. Public API unchanged. CsvLineSplitter, CsvWriter, CsvOptions, and the row classes are untouched except for a private -> internal accessibility shift on the row classes required by the new internal factories (InternalsVisibleTo("Csv.Tests") already in place). Co-Authored-By: Claude Opus 4.7 (1M context) --- Csv.Tests/EngineUnificationTests.cs | 610 ++++++++++++++++++++++++++++ Csv/CsvReader.Engine.cs | 474 +++++++++++++++++++++ Csv/CsvReader.FromMemory.cs | 81 +--- Csv/CsvReader.cs | 414 +------------------ 4 files changed, 1094 insertions(+), 485 deletions(-) create mode 100644 Csv.Tests/EngineUnificationTests.cs create mode 100644 Csv/CsvReader.Engine.cs diff --git a/Csv.Tests/EngineUnificationTests.cs b/Csv.Tests/EngineUnificationTests.cs new file mode 100644 index 0000000..ed79802 --- /dev/null +++ b/Csv.Tests/EngineUnificationTests.cs @@ -0,0 +1,610 @@ +using System; +using System.Collections.Generic; +using System.IO; +using System.Linq; +using System.Text; +using System.Threading; +using System.Threading.Tasks; +using Microsoft.VisualStudio.TestTools.UnitTesting; + +namespace Csv.Tests +{ + [TestClass] + public class EngineUnificationTests + { + private enum ReadPath + { + Read, +#if NET8_0_OR_GREATER + ReadAsSpan, + ReadAsync, + ReadFromMemoryOptimized, + ReadFromMemory, +#endif + } + + private static readonly ReadPath[] AllPaths = + { + ReadPath.Read, +#if NET8_0_OR_GREATER + ReadPath.ReadAsSpan, + ReadPath.ReadAsync, + ReadPath.ReadFromMemoryOptimized, + ReadPath.ReadFromMemory, +#endif + }; + + private sealed class Row + { + public string[] Headers { get; set; } = Array.Empty(); + public string[] Values { get; set; } = Array.Empty(); + public int Index { get; set; } + public string Raw { get; set; } = string.Empty; + public Dictionary ByName { get; set; } = new(); + } + + private static List Run(ReadPath path, string csv, Func optionsFactory) + { + return path switch + { + ReadPath.Read => RunRead(csv, optionsFactory()), +#if NET8_0_OR_GREATER + ReadPath.ReadAsSpan => RunReadAsSpan(csv, optionsFactory()), + ReadPath.ReadAsync => RunReadAsync(csv, optionsFactory()).GetAwaiter().GetResult(), + ReadPath.ReadFromMemoryOptimized => RunReadFromMemoryOptimized(csv, optionsFactory()), + ReadPath.ReadFromMemory => RunReadFromMemory(csv, optionsFactory()), +#endif + _ => throw new ArgumentOutOfRangeException(nameof(path)) + }; + } + + private static List RunRead(string csv, CsvOptions options) + { + using var reader = new StringReader(csv); + var result = new List(); + foreach (var line in CsvReader.Read(reader, options)) + { + var byName = new Dictionary(); + foreach (var h in line.Headers) + byName[h] = line[h]; + + result.Add(new Row + { + Headers = line.Headers, + Values = line.Values, + Index = line.Index, + Raw = line.Raw, + ByName = byName, + }); + } + return result; + } + +#if NET8_0_OR_GREATER + private static List RunReadAsSpan(string csv, CsvOptions options) + { + using var reader = new StringReader(csv); + var result = new List(); + foreach (var line in CsvReader.ReadAsSpan(reader, options)) + { + var byName = new Dictionary(); + foreach (var h in line.Headers) + byName[h] = line[h]; + + result.Add(new Row + { + Headers = line.Headers, + Values = line.Values, + Index = line.Index, + Raw = line.Raw, + ByName = byName, + }); + } + return result; + } + + private static async Task> RunReadAsync(string csv, CsvOptions options) + { + using var reader = new StringReader(csv); + var result = new List(); + await foreach (var line in CsvReader.ReadAsync(reader, options)) + { + var byName = new Dictionary(); + foreach (var h in line.Headers) + byName[h] = line[h]; + + result.Add(new Row + { + Headers = line.Headers, + Values = line.Values, + Index = line.Index, + Raw = line.Raw, + ByName = byName, + }); + } + return result; + } + + private static List RunReadFromMemoryOptimized(string csv, CsvOptions options) + { + var result = new List(); + foreach (var line in CsvReader.ReadFromMemoryOptimized(csv.AsMemory(), options)) + { + var byName = new Dictionary(); + foreach (var h in line.Headers) + byName[h] = line[h]; + + result.Add(new Row + { + Headers = line.Headers, + Values = line.Values, + Index = line.Index, + Raw = line.Raw, + ByName = byName, + }); + } + return result; + } + + private static List RunReadFromMemory(string csv, CsvOptions options) + { + var result = new List(); + foreach (var line in CsvReader.ReadFromMemory(csv.AsMemory(), options)) + { + var headerStrings = line.Headers.Select(h => h.ToString()).ToArray(); + var valueStrings = line.Values.Select(v => v.ToString()).ToArray(); + var byName = new Dictionary(); + foreach (var h in headerStrings) + byName[h] = line[h].ToString(); + + result.Add(new Row + { + Headers = headerStrings, + Values = valueStrings, + Index = line.Index, + Raw = line.Raw.ToString(), + ByName = byName, + }); + } + return result; + } +#endif + + // ---------------------------------------------------------------------- + // 1. Cross-path skip / header / alias matrix + // ---------------------------------------------------------------------- + + [TestMethod] + public void When_HeaderPresentHappyPath_Then_AllPathsProduceTwoRecordsWithNamedAndIndexedAccess() + { + var csv = "name,age\nAlice,30\nBob,25\n"; + foreach (var path in AllPaths) + { + var rows = Run(path, csv, () => new CsvOptions()); + + Assert.AreEqual(2, rows.Count, $"path={path}"); + Assert.AreEqual(2, rows[0].Headers.Length, $"path={path}"); + Assert.AreEqual("name", rows[0].Headers[0], $"path={path}"); + Assert.AreEqual("age", rows[0].Headers[1], $"path={path}"); + Assert.AreEqual("Alice", rows[0].Values[0], $"path={path}"); + Assert.AreEqual("30", rows[0].Values[1], $"path={path}"); + Assert.AreEqual("Alice", rows[0].ByName["name"], $"path={path}"); + Assert.AreEqual("30", rows[0].ByName["age"], $"path={path}"); + Assert.AreEqual("Bob", rows[1].Values[0], $"path={path}"); + Assert.AreEqual("25", rows[1].Values[1], $"path={path}"); + } + } + + [TestMethod] + public void When_HeaderAbsentHappyPath_Then_AllPathsSynthesizeColumn1Column2Headers() + { + var csv = "1,2,3\n4,5,6\n"; + foreach (var path in AllPaths) + { + var rows = Run(path, csv, () => new CsvOptions { HeaderMode = HeaderMode.HeaderAbsent }); + + Assert.AreEqual(2, rows.Count, $"path={path}"); + CollectionAssert.AreEqual(new[] { "Column1", "Column2", "Column3" }, rows[0].Headers, $"path={path}"); + CollectionAssert.AreEqual(new[] { "1", "2", "3" }, rows[0].Values, $"path={path}"); + CollectionAssert.AreEqual(new[] { "4", "5", "6" }, rows[1].Values, $"path={path}"); + Assert.AreEqual("1", rows[0].ByName["Column1"], $"path={path}"); + Assert.AreEqual("6", rows[1].ByName["Column3"], $"path={path}"); + } + } + + [TestMethod] + public void When_RowsToSkipIsTwo_Then_AllPathsTreatThirdLineAsHeader() + { + var csv = "preamble line 1\npreamble line 2\nname,age\nAlice,30\nBob,25\n"; + foreach (var path in AllPaths) + { + var rows = Run(path, csv, () => new CsvOptions { RowsToSkip = 2 }); + + Assert.AreEqual(2, rows.Count, $"path={path}"); + CollectionAssert.AreEqual(new[] { "name", "age" }, rows[0].Headers, $"path={path}"); + Assert.AreEqual("Alice", rows[0].Values[0], $"path={path}"); + Assert.AreEqual("Bob", rows[1].Values[0], $"path={path}"); + } + } + + [TestMethod] + public void When_SkipRowFiltersCommentLines_Then_AllPathsExcludeThem() + { + var csv = "name,age\n# comment row\nAlice,30\n# another\nBob,25\n"; + foreach (var path in AllPaths) + { + // The default SkipRow predicate already skips '#' lines; we re-state it explicitly for clarity. + var rows = Run(path, csv, () => new CsvOptions + { +#if NET8_0_OR_GREATER + SkipRow = (row, idx) => row.Span.IsEmpty || row.Span[0] == '#' +#else + SkipRow = (row, idx) => string.IsNullOrEmpty(row) || row[0] == '#' +#endif + }); + + Assert.AreEqual(2, rows.Count, $"path={path}"); + Assert.AreEqual("Alice", rows[0].Values[0], $"path={path}"); + Assert.AreEqual("Bob", rows[1].Values[0], $"path={path}"); + } + } + + [TestMethod] + public void When_AliasGroupMatchesOneHeader_Then_AllAliasNamesResolveToSameColumn() + { + // Aliases live in the header lookup but not in the Headers array, so this case + // accesses the row via the public indexer directly rather than the path-agnostic + // ByName projection used elsewhere. + var csv = "category,price\nbooks,10\n"; + + void AssertOnReadLineLike(Func> source) + { + var options = new CsvOptions + { + Aliases = new List + { + new[] { "category", "Category Name", "category_name" } + } + }; + var lines = source(options).ToList(); + Assert.AreEqual(1, lines.Count); + Assert.AreEqual("books", lines[0]["category"]); + Assert.AreEqual("books", lines[0]["Category Name"]); + Assert.AreEqual("books", lines[0]["category_name"]); + } + + AssertOnReadLineLike(opts => + { + var reader = new StringReader(csv); + return CsvReader.Read(reader, opts); + }); + +#if NET8_0_OR_GREATER + AssertOnReadLineLike(opts => + { + var reader = new StringReader(csv); + return CsvReader.ReadAsSpan(reader, opts); + }); + + // Async path + { + var options = new CsvOptions + { + Aliases = new List + { + new[] { "category", "Category Name", "category_name" } + } + }; + using var reader = new StringReader(csv); + var async = CollectAsync(CsvReader.ReadAsync(reader, options)).GetAwaiter().GetResult(); + Assert.AreEqual(1, async.Count); + Assert.AreEqual("books", async[0]["category"]); + Assert.AreEqual("books", async[0]["Category Name"]); + Assert.AreEqual("books", async[0]["category_name"]); + } + + AssertOnReadLineLike(opts => CsvReader.ReadFromMemoryOptimized(csv.AsMemory(), opts)); + + // ReadFromMemory returns ICsvLineFromMemory rather than ICsvLine. + { + var options = new CsvOptions + { + Aliases = new List + { + new[] { "category", "Category Name", "category_name" } + } + }; + var lines = CsvReader.ReadFromMemory(csv.AsMemory(), options).ToList(); + Assert.AreEqual(1, lines.Count); + Assert.AreEqual("books", lines[0]["category"].ToString()); + Assert.AreEqual("books", lines[0]["Category Name"].ToString()); + Assert.AreEqual("books", lines[0]["category_name"].ToString()); + } +#endif + } + +#if NET8_0_OR_GREATER + private static async Task> CollectAsync(IAsyncEnumerable source) + { + var result = new List(); + await foreach (var line in source) + result.Add(line); + return result; + } +#endif + + [TestMethod] + public void When_AliasGroupMatchesMultipleHeaders_Then_AllPathsThrowInvalidOperation() + { + var csv = "A,B\n1,2\n"; + foreach (var path in AllPaths) + { + var ex = Assert.ThrowsExactly( + () => Run(path, csv, () => new CsvOptions + { + Aliases = new List { new[] { "A", "B" } } + }), + $"path={path}"); + + StringAssert.Contains(ex.Message, "alias group", $"path={path}"); + } + } + + [TestMethod] + public void When_DuplicateHeadersWithAutoRenameOn_Then_AllPathsAppendNumericSuffix() + { + var csv = "A,A,A\n1,2,3\n"; + foreach (var path in AllPaths) + { + var rows = Run(path, csv, () => new CsvOptions { AutoRenameHeaders = true }); + + Assert.AreEqual(1, rows.Count, $"path={path}"); + CollectionAssert.AreEqual(new[] { "A", "A2", "A3" }, rows[0].Headers, $"path={path}"); + Assert.AreEqual("1", rows[0].ByName["A"], $"path={path}"); + Assert.AreEqual("2", rows[0].ByName["A2"], $"path={path}"); + Assert.AreEqual("3", rows[0].ByName["A3"], $"path={path}"); + } + } + + [TestMethod] + public void When_DuplicateHeadersWithAutoRenameOff_Then_AllPathsThrowInvalidOperation() + { + var csv = "A,A\n1,2\n"; + foreach (var path in AllPaths) + { + var ex = Assert.ThrowsExactly( + () => Run(path, csv, () => new CsvOptions { AutoRenameHeaders = false }), + $"path={path}"); + + StringAssert.Contains(ex.Message, "Duplicate headers", $"path={path}"); + } + } + + // ---------------------------------------------------------------------- + // 2. Multiline correctness matrix (the regression test for the bug fix) + // ---------------------------------------------------------------------- + + [TestMethod] + public void When_HeaderPresentAndMultilineInDataRecord_Then_AllPathsKeepFieldIntact() + { + var csv = "col1,col2\r\nfoo,\"bar\r\nbaz\"\r\n"; + foreach (var path in AllPaths) + { + var rows = Run(path, csv, () => new CsvOptions + { + AllowNewLineInEnclosedFieldValues = true, + NewLine = "\r\n" + }); + + Assert.AreEqual(1, rows.Count, $"path={path}"); + CollectionAssert.AreEqual(new[] { "col1", "col2" }, rows[0].Headers, $"path={path}"); + Assert.AreEqual("foo", rows[0].Values[0], $"path={path}"); + Assert.AreEqual("bar\r\nbaz", rows[0].Values[1], $"path={path}"); + } + } + + [TestMethod] + public void When_HeaderAbsentAndMultilineInFirstRecord_Then_AllPathsProduceCorrectColumnCount() + { + // This is the bug-fix case. Before the engine unification, ReadAsync, + // ReadFromMemoryOptimized, and ReadFromMemory would all miscount columns here + // because they lacked the HeaderAbsent + multiline pre-pass. + var csv = "\"a\r\nb\",c,d\r\nx,y,z\r\n"; + foreach (var path in AllPaths) + { + var rows = Run(path, csv, () => new CsvOptions + { + HeaderMode = HeaderMode.HeaderAbsent, + AllowNewLineInEnclosedFieldValues = true, + NewLine = "\r\n" + }); + + Assert.AreEqual(2, rows.Count, $"path={path}"); + Assert.AreEqual(3, rows[0].Headers.Length, $"path={path}"); + CollectionAssert.AreEqual(new[] { "Column1", "Column2", "Column3" }, rows[0].Headers, $"path={path}"); + Assert.AreEqual("a\r\nb", rows[0].Values[0], $"path={path}"); + Assert.AreEqual("c", rows[0].Values[1], $"path={path}"); + Assert.AreEqual("d", rows[0].Values[2], $"path={path}"); + Assert.AreEqual("x", rows[1].Values[0], $"path={path}"); + Assert.AreEqual("y", rows[1].Values[1], $"path={path}"); + Assert.AreEqual("z", rows[1].Values[2], $"path={path}"); + } + } + + [TestMethod] + public void When_HeaderAbsentAndMultilineInLaterRecord_Then_AllPathsKeepFieldIntact() + { + var csv = "a,b,c\r\n\"x\r\ny\",p,q\r\n"; + foreach (var path in AllPaths) + { + var rows = Run(path, csv, () => new CsvOptions + { + HeaderMode = HeaderMode.HeaderAbsent, + AllowNewLineInEnclosedFieldValues = true, + NewLine = "\r\n" + }); + + Assert.AreEqual(2, rows.Count, $"path={path}"); + CollectionAssert.AreEqual(new[] { "a", "b", "c" }, rows[0].Values, $"path={path}"); + Assert.AreEqual("x\r\ny", rows[1].Values[0], $"path={path}"); + Assert.AreEqual("p", rows[1].Values[1], $"path={path}"); + Assert.AreEqual("q", rows[1].Values[2], $"path={path}"); + } + } + + [TestMethod] + public void When_UnterminatedQuoteAtEof_Then_AllPathsTerminateWithoutInfiniteLoop() + { + var csv = "a,b\r\nfoo,\"unterminated\r\n"; + foreach (var path in AllPaths) + { + // The contract: must not hang. Last record returned with the accumulated content; + // exact field values are not asserted (they depend on splitter behavior for an + // unterminated quote), only that enumeration terminates with at least 1 row. + List rows; + try + { + rows = Run(path, csv, () => new CsvOptions + { + AllowNewLineInEnclosedFieldValues = true, + NewLine = "\r\n" + }); + } + catch (InvalidOperationException) + { + // Acceptable: some paths may surface the malformed input as an error + // rather than yielding a partial row. The critical invariant is that + // enumeration terminates, which it did. + continue; + } + + Assert.IsTrue(rows.Count >= 1, $"path={path} expected at least one row, got {rows.Count}"); + } + } + + [TestMethod] + public void When_NewLineIsLineFeedOnly_Then_AllPathsRespectOptionsNewLineForConcatenation() + { + var csv = "col1,col2\nfoo,\"bar\nbaz\"\n"; + foreach (var path in AllPaths) + { + var rows = Run(path, csv, () => new CsvOptions + { + AllowNewLineInEnclosedFieldValues = true, + NewLine = "\n" + }); + + Assert.AreEqual(1, rows.Count, $"path={path}"); + Assert.AreEqual("foo", rows[0].Values[0], $"path={path}"); + Assert.AreEqual("bar\nbaz", rows[0].Values[1], $"path={path}"); + } + } + + // ---------------------------------------------------------------------- + // 3. Per-path contract tests + // ---------------------------------------------------------------------- + +#if NET8_0_OR_GREATER + [TestMethod] + public async Task When_ReadAsyncOverStringReader_Then_ReturnsIAsyncEnumerableAndYieldsExpectedRows() + { + using var reader = new StringReader("a,b\n1,2\n3,4\n"); + var enumerable = CsvReader.ReadAsync(reader); + Assert.IsInstanceOfType>(enumerable); + + var collected = new List<(string a, string b)>(); + await foreach (var line in enumerable) + collected.Add((line["a"], line["b"])); + + Assert.AreEqual(2, collected.Count); + Assert.AreEqual(("1", "2"), collected[0]); + Assert.AreEqual(("3", "4"), collected[1]); + } + + [TestMethod] + public void When_ReadFromMemoryOptimizedGivenExplicitMemoryOptions_Then_ProducesCorrectRecords() + { + var memoryOptions = new CsvMemoryOptions(); + var lines = CsvReader.ReadFromMemoryOptimized( + "name,age\nAlice,30\nBob,25\n".AsMemory(), + new CsvOptions(), + memoryOptions).ToList(); + + Assert.AreEqual(2, lines.Count); + Assert.AreEqual("Alice", lines[0]["name"]); + Assert.AreEqual("30", lines[0]["age"]); + Assert.AreEqual("Bob", lines[1]["name"]); + Assert.AreEqual("25", lines[1]["age"]); + } + + [TestMethod] + public void When_ReadFromMemory_Then_HeadersValuesAndRawAreReadOnlyMemoryOfChar() + { + var lines = CsvReader.ReadFromMemory("name,age\nAlice,30\n".AsMemory()).ToList(); + + Assert.AreEqual(1, lines.Count); + + var line = lines[0]; + // Headers/Values/Raw expose ReadOnlyMemory rather than string. + ReadOnlyMemory[] headers = line.Headers; + ReadOnlyMemory[] values = line.Values; + ReadOnlyMemory raw = line.Raw; + ReadOnlyMemory byName = line["name"]; + ReadOnlyMemory byIndex = line[0]; + + Assert.AreEqual("name", headers[0].ToString()); + Assert.AreEqual("age", headers[1].ToString()); + Assert.AreEqual("Alice", values[0].ToString()); + Assert.AreEqual("30", values[1].ToString()); + Assert.AreEqual("Alice,30", raw.ToString()); + Assert.AreEqual("Alice", byName.ToString()); + Assert.AreEqual("Alice", byIndex.ToString()); + } +#endif + + // ---------------------------------------------------------------------- + // 4. Allocation-parity smoke test + // ---------------------------------------------------------------------- + +#if NET8_0_OR_GREATER + [TestMethod] + public void When_ReadAsSpanEnumeratesOneThousandRecords_Then_AllocatedBytesIsFinite() + { + var builder = new StringBuilder(); + builder.AppendLine("col1,col2,col3"); + for (int i = 0; i < 1000; i++) + builder.AppendLine($"value{i}a,value{i}b,value{i}c"); + var csv = builder.ToString(); + + long before; + try + { + before = GC.GetTotalAllocatedBytes(precise: true); + } + catch (PlatformNotSupportedException) + { + Assert.Inconclusive("GC.GetTotalAllocatedBytes is unavailable on this platform."); + return; + } + + using var reader = new StringReader(csv); + int rowCount = 0; + foreach (var line in CsvReader.ReadAsSpan(reader)) + { + // Touch a column so the row materializes fully. + _ = line.GetSpan(0).Length; + rowCount++; + } + + var after = GC.GetTotalAllocatedBytes(precise: true); + var delta = after - before; + + Assert.AreEqual(1000, rowCount); + Assert.IsTrue(delta >= 0, $"Expected non-negative allocation delta, got {delta}"); + Assert.IsTrue(delta < long.MaxValue, $"Allocation delta out of range: {delta}"); + // Documented for drift visibility; no hard upper bound. + System.Diagnostics.Trace.WriteLine($"ReadAsSpan over 1000 records allocated {delta} bytes."); + } +#endif + } +} diff --git a/Csv/CsvReader.Engine.cs b/Csv/CsvReader.Engine.cs new file mode 100644 index 0000000..49bfede --- /dev/null +++ b/Csv/CsvReader.Engine.cs @@ -0,0 +1,474 @@ +using System; +using System.Collections.Generic; +using System.Diagnostics; +using System.IO; +using System.Linq; + +#if NET8_0_OR_GREATER +using System.Buffers; +using System.Runtime.CompilerServices; +using System.Threading; +using System.Threading.Tasks; +using MemoryText = System.ReadOnlyMemory; +using SpanText = System.ReadOnlySpan; +#else +using MemoryText = System.String; +using SpanText = System.String; +#endif + +namespace Csv +{ + partial class CsvReader + { + internal interface ILineSource + { + bool TryReadLine(out MemoryText line, out string? lineString); + MemoryText Concat(MemoryText head, string newLine, MemoryText tail, out string? combined); + } + +#if NET8_0_OR_GREATER + internal interface IAsyncLineSource + { + ValueTask<(bool ok, MemoryText line, string? lineString)> TryReadLineAsync(CancellationToken ct); + MemoryText Concat(MemoryText head, string newLine, MemoryText tail, out string? combined); + } +#endif + + internal interface IRowFactory where TRow : class + { + TRow Create(MemoryText[] headers, Dictionary headerLookup, int index, MemoryText raw, string? rawString, CsvOptions options); + } + + internal readonly struct TextReaderLineSource : ILineSource + { + private readonly TextReader reader; + + public TextReaderLineSource(TextReader reader) + { + this.reader = reader; + } + + public bool TryReadLine(out MemoryText line, out string? lineString) + { + var read = reader.ReadLine(); + if (read == null) + { + line = default!; + lineString = null; + return false; + } + + lineString = read; +#if NET8_0_OR_GREATER + line = read.AsMemory(); +#else + line = read; +#endif + return true; + } + + public MemoryText Concat(MemoryText head, string newLine, MemoryText tail, out string? combined) + { +#if NET8_0_OR_GREATER + combined = string.Concat(head.Span, newLine.AsSpan(), tail.Span); + return combined.AsMemory(); +#else + combined = head + newLine + tail; + return combined; +#endif + } + } + +#if NET8_0_OR_GREATER + internal readonly struct AsyncTextReaderLineSource : IAsyncLineSource + { + private readonly TextReader reader; + + public AsyncTextReaderLineSource(TextReader reader) + { + this.reader = reader; + } + + public async ValueTask<(bool ok, MemoryText line, string? lineString)> TryReadLineAsync(CancellationToken ct) + { + var read = await reader.ReadLineAsync(ct).ConfigureAwait(false); + if (read == null) + return (false, default, null); + + return (true, read.AsMemory(), read); + } + + public MemoryText Concat(MemoryText head, string newLine, MemoryText tail, out string? combined) + { + combined = string.Concat(head.Span, newLine.AsSpan(), tail.Span); + return combined.AsMemory(); + } + } + + internal struct MemorySliceLineSource : ILineSource + { + private readonly ReadOnlyMemory csv; + private readonly CsvMemoryOptions memoryOptions; + private int position; + + public MemorySliceLineSource(ReadOnlyMemory csv, CsvMemoryOptions memoryOptions) + { + this.csv = csv; + this.memoryOptions = memoryOptions; + this.position = 0; + } + + public bool TryReadLine(out MemoryText line, out string? lineString) + { + lineString = null; + + if (position >= csv.Length) + { + line = default; + return false; + } + + var span = csv.Span.Slice(position); + var newlineIndex = span.IndexOfAny('\n', '\r'); + + if (newlineIndex == -1) + { + line = csv.Slice(position); + position = csv.Length; + return !line.IsEmpty; + } + + var lineLength = newlineIndex; + var slice = csv.Slice(position, lineLength); + + position += lineLength; + if (position < csv.Length) + { + var ch = csv.Span[position]; + if (ch == '\r' || ch == '\n') + { + position++; + if (position < csv.Length && ch == '\r' && csv.Span[position] == '\n') + position++; + } + } + + if (slice.IsEmpty) + { + line = default; + return false; + } + + line = slice; + return true; + } + + public MemoryText Concat(MemoryText head, string newLine, MemoryText tail, out string? combined) + { + combined = null; + + var separator = newLine.AsMemory(); + var totalLength = head.Length + separator.Length + tail.Length; + var buffer = memoryOptions.CharArrayPool.Rent(totalLength); + + try + { + var span = buffer.AsSpan(); + head.Span.CopyTo(span); + separator.Span.CopyTo(span.Slice(head.Length)); + tail.Span.CopyTo(span.Slice(head.Length + separator.Length)); + + var result = new char[totalLength]; + span.Slice(0, totalLength).CopyTo(result); + return result.AsMemory(); + } + finally + { + memoryOptions.CharArrayPool.Return(buffer); + } + } + } + + internal struct MemoryReaderLineSource : ILineSource + { + private readonly ReadOnlyMemory csv; + private int position; + + public MemoryReaderLineSource(ReadOnlyMemory csv) + { + this.csv = csv; + this.position = 0; + } + + public bool TryReadLine(out MemoryText line, out string? lineString) + { + lineString = null; + + if (position >= csv.Length) + { + line = default; + return false; + } + + line = csv.ReadLine(ref position); + return !line.IsEmpty; + } + + public MemoryText Concat(MemoryText head, string newLine, MemoryText tail, out string? combined) + { + combined = null; + return StringHelpers.Concat(head, newLine, tail); + } + } +#endif + + internal readonly struct StringRowFactory : IRowFactory + { + public ReadLine Create(MemoryText[] headers, Dictionary headerLookup, int index, MemoryText raw, string? rawString, CsvOptions options) + { +#if NET8_0_OR_GREATER + return new ReadLine(headers, headerLookup, index, rawString ?? raw.ToString(), options); +#else + return new ReadLine(headers, headerLookup, index, rawString ?? raw, options); +#endif + } + } + +#if NET8_0_OR_GREATER + internal readonly struct SpanRowFactory : IRowFactory + { + public ReadLineSpan Create(MemoryText[] headers, Dictionary headerLookup, int index, MemoryText raw, string? rawString, CsvOptions options) + { + return new ReadLineSpan(headers, headerLookup, index, rawString ?? raw.ToString(), options); + } + } + + internal readonly struct OptimizedRowFactory : IRowFactory + { + private readonly CsvMemoryOptions memoryOptions; + + public OptimizedRowFactory(CsvMemoryOptions memoryOptions) + { + this.memoryOptions = memoryOptions; + } + + public ReadLineSpanOptimized Create(MemoryText[] headers, Dictionary headerLookup, int index, MemoryText raw, string? rawString, CsvOptions options) + { + return new ReadLineSpanOptimized(headers, headerLookup, index, raw, options, memoryOptions); + } + } + + internal readonly struct MemoryRowFactory : IRowFactory + { + public ReadLineFromMemory Create(MemoryText[] headers, Dictionary headerLookup, int index, MemoryText raw, string? rawString, CsvOptions options) + { + return new ReadLineFromMemory(headers, headerLookup, index, raw, options); + } + } +#endif + + private static IEnumerable Enumerate(TSource source, TFactory factory, CsvOptions options) + where TSource : struct, ILineSource + where TFactory : struct, IRowFactory + where TRow : class + { + Debug.Assert(options.Splitter == null, "CsvOptions cannot be reused across enumerations. Create a new instance."); + + var index = 0; + MemoryText[]? headers = null; + Dictionary? headerLookup = null; + + while (source.TryReadLine(out var line, out var lineString)) + { + index++; + + if (index <= options.RowsToSkip || options.SkipRow?.Invoke(line, index) == true) + continue; + + if (headers == null || headerLookup == null) + { + InitializeOptions(line.AsSpan(), options); + var skipInitialLine = options.HeaderMode == HeaderMode.HeaderPresent; + + // HeaderAbsent + multiline: complete the first data line before deriving column count, + // otherwise the headers are sized to a partial record. The yield loop below detects this + // case via index == RowsToSkip + 1 and skips its own multiline pass to avoid double-reading. + if (!skipInitialLine && options.AllowNewLineInEnclosedFieldValues) + { + var splitLine = options.Splitter.Split(line, options); + + while (splitLine.Any(f => CsvLineSplitter.IsUnterminatedQuotedValue(f.AsSpan(), options))) + { + if (!source.TryReadLine(out var nextLine, out _)) + break; + + line = source.Concat(line, options.NewLine, nextLine, out lineString); + splitLine = options.Splitter.Split(line, options); + } + } + + headers = skipInitialLine ? GetHeaders(line, options) : CreateDefaultHeaders(line, options); + + try + { + headerLookup = CreateHeaderLookup(headers, options); + } + catch (ArgumentException) + { + throw new InvalidOperationException("Duplicate headers detected in HeaderPresent mode. If you don't have a header you can set the HeaderMode to HeaderAbsent."); + } + + var aliases = options.Aliases; + if (aliases != null) + { + foreach (var aliasGroup in aliases) + { + var groupIndex = -1; + foreach (var alias in aliasGroup) + { + if (headerLookup.TryGetValue(alias, out var aliasIndex)) + { + if (groupIndex != -1) + throw new InvalidOperationException("Found multiple matches within alias group: " + string.Join(";", aliasGroup)); + + groupIndex = aliasIndex; + } + } + + if (groupIndex != -1) + { + foreach (var alias in aliasGroup) + headerLookup[alias] = groupIndex; + } + } + } + + if (skipInitialLine) + continue; + } + + var isFirstDataLineInHeaderAbsentMode = options.HeaderMode == HeaderMode.HeaderAbsent && index == (options.RowsToSkip + 1); + if (options.AllowNewLineInEnclosedFieldValues && !isFirstDataLineInHeaderAbsentMode) + { + var rawSplit = options.Splitter.Split(line, options); + while (rawSplit.Any(f => CsvLineSplitter.IsUnterminatedQuotedValue(f.AsSpan(), options))) + { + if (!source.TryReadLine(out var nextLine, out _)) + break; + + line = source.Concat(line, options.NewLine, nextLine, out lineString); + rawSplit = options.Splitter.Split(line, options); + } + } + + yield return factory.Create(headers, headerLookup, index, line, lineString, options); + } + } + +#if NET8_0_OR_GREATER + private static async IAsyncEnumerable EnumerateAsync(TSource source, TFactory factory, CsvOptions options, [EnumeratorCancellation] CancellationToken ct = default) + where TSource : struct, IAsyncLineSource + where TFactory : struct, IRowFactory + where TRow : class + { + Debug.Assert(options.Splitter == null, "CsvOptions cannot be reused across enumerations. Create a new instance."); + + var index = 0; + MemoryText[]? headers = null; + Dictionary? headerLookup = null; + + while (true) + { + var (ok, line, lineString) = await source.TryReadLineAsync(ct).ConfigureAwait(false); + if (!ok) + break; + + index++; + + if (index <= options.RowsToSkip || options.SkipRow?.Invoke(line, index) == true) + continue; + + if (headers == null || headerLookup == null) + { + InitializeOptions(line.AsSpan(), options); + var skipInitialLine = options.HeaderMode == HeaderMode.HeaderPresent; + + // HeaderAbsent + multiline: complete the first data line before deriving column count, + // otherwise the headers are sized to a partial record. The yield loop below detects this + // case via index == RowsToSkip + 1 and skips its own multiline pass to avoid double-reading. + if (!skipInitialLine && options.AllowNewLineInEnclosedFieldValues) + { + var splitLine = options.Splitter.Split(line, options); + + while (splitLine.Any(f => CsvLineSplitter.IsUnterminatedQuotedValue(f.AsSpan(), options))) + { + var (nextOk, nextLine, _) = await source.TryReadLineAsync(ct).ConfigureAwait(false); + if (!nextOk) + break; + + line = source.Concat(line, options.NewLine, nextLine, out lineString); + splitLine = options.Splitter.Split(line, options); + } + } + + headers = skipInitialLine ? GetHeaders(line, options) : CreateDefaultHeaders(line, options); + + try + { + headerLookup = CreateHeaderLookup(headers, options); + } + catch (ArgumentException) + { + throw new InvalidOperationException("Duplicate headers detected in HeaderPresent mode. If you don't have a header you can set the HeaderMode to HeaderAbsent."); + } + + var aliases = options.Aliases; + if (aliases != null) + { + foreach (var aliasGroup in aliases) + { + var groupIndex = -1; + foreach (var alias in aliasGroup) + { + if (headerLookup.TryGetValue(alias, out var aliasIndex)) + { + if (groupIndex != -1) + throw new InvalidOperationException("Found multiple matches within alias group: " + string.Join(";", aliasGroup)); + + groupIndex = aliasIndex; + } + } + + if (groupIndex != -1) + { + foreach (var alias in aliasGroup) + headerLookup[alias] = groupIndex; + } + } + } + + if (skipInitialLine) + continue; + } + + var isFirstDataLineInHeaderAbsentMode = options.HeaderMode == HeaderMode.HeaderAbsent && index == (options.RowsToSkip + 1); + if (options.AllowNewLineInEnclosedFieldValues && !isFirstDataLineInHeaderAbsentMode) + { + var rawSplit = options.Splitter.Split(line, options); + while (rawSplit.Any(f => CsvLineSplitter.IsUnterminatedQuotedValue(f.AsSpan(), options))) + { + var (nextOk, nextLine, _) = await source.TryReadLineAsync(ct).ConfigureAwait(false); + if (!nextOk) + break; + + line = source.Concat(line, options.NewLine, nextLine, out lineString); + rawSplit = options.Splitter.Split(line, options); + } + } + + yield return factory.Create(headers, headerLookup, index, line, lineString, options); + } + } +#endif + } +} diff --git a/Csv/CsvReader.FromMemory.cs b/Csv/CsvReader.FromMemory.cs index a8c7548..48146d4 100644 --- a/Csv/CsvReader.FromMemory.cs +++ b/Csv/CsvReader.FromMemory.cs @@ -17,86 +17,9 @@ partial class CsvReader /// The csv string to read the data from. /// The optional options to use when reading. public static IEnumerable ReadFromMemory(MemoryText csv, CsvOptions? options = null) - { - // NOTE: Logic is copied in ReadImpl/ReadImplAsync/ReadFromMemory - options ??= new CsvOptions(); - - MemoryText line; - var index = 0; - var position = 0; - MemoryText[]? headers = null; - Dictionary? headerLookup = null; - while (!(line = csv.ReadLine(ref position)).IsEmpty) - { - index++; - if (index <= options.RowsToSkip || options.SkipRow?.Invoke(line, index) == true) - continue; - - if (headers == null || headerLookup == null) - { - InitializeOptions(line.Span, options); - var skipInitialLine = options.HeaderMode == HeaderMode.HeaderPresent; - - headers = skipInitialLine ? GetHeaders(line, options) : CreateDefaultHeaders(line, options); - - try - { - headerLookup = CreateHeaderLookup(headers, options); - } - catch (ArgumentException) - { - throw new InvalidOperationException("Duplicate headers detected in HeaderPresent mode. If you don't have a header you can set the HeaderMode to HeaderAbsent."); - } - - var aliases = options.Aliases; - if (aliases != null) - { - // NOTE: For each group we need at most 1 match (i.e. SingleOrDefault) - foreach (var aliasGroup in aliases) - { - var groupIndex = -1; - foreach (var alias in aliasGroup) - { - if (headerLookup.TryGetValue(alias, out var aliasIndex)) - { - if (groupIndex != -1) - throw new InvalidOperationException("Found multiple matches within alias group: " + string.Join(";", aliasGroup)); - - groupIndex = aliasIndex; - } - } - - if (groupIndex != -1) - { - foreach (var alias in aliasGroup) - headerLookup[alias] = groupIndex; - } - } - } - - if (skipInitialLine) - continue; - } - - var record = new ReadLineFromMemory(headers, headerLookup, index, line, options); - if (options.AllowNewLineInEnclosedFieldValues) - { - while (record.RawSplitLine.Any(f => CsvLineSplitter.IsUnterminatedQuotedValue(f.AsSpan(), options))) - { - var nextLine = csv.ReadLine(ref position); - if (nextLine.IsEmpty) - break; - - line = StringHelpers.Concat(line, options.NewLine, nextLine); - record = new ReadLineFromMemory(headers, headerLookup, index, line, options); - } - } - - yield return record; - } - } + => Enumerate(new MemoryReaderLineSource(csv), default, options ?? new CsvOptions()); - private sealed class ReadLineFromMemory : ICsvLineFromMemory + internal sealed class ReadLineFromMemory : ICsvLineFromMemory { private readonly Dictionary headerLookup; private readonly CsvOptions options; diff --git a/Csv/CsvReader.cs b/Csv/CsvReader.cs index fab2d77..04d14fc 100644 --- a/Csv/CsvReader.cs +++ b/Csv/CsvReader.cs @@ -117,110 +117,7 @@ private static IEnumerable ReadFromTextSpanImpl(string csv, CsvOpt } private static IEnumerable ReadSpanImpl(TextReader reader, CsvOptions? options) - { - options ??= new CsvOptions(); - - string? line; - var index = 0; - MemoryText[]? headers = null; - Dictionary? headerLookup = null; - while ((line = reader.ReadLine()) != null) - { - index++; - - var lineAsMemory = line.AsMemory(); - if (index <= options.RowsToSkip || options.SkipRow?.Invoke(lineAsMemory, index) == true) - continue; - - if (headers == null || headerLookup == null) - { - InitializeOptions(lineAsMemory.AsSpan(), options); - var skipInitialLine = options.HeaderMode == HeaderMode.HeaderPresent; - - // For HeaderAbsent mode with multiline fields, we need to process the complete line first - if (!skipInitialLine && options.AllowNewLineInEnclosedFieldValues) - { - // Process multiline fields to get the complete first data line - var completeLineForHeaders = line; - var tempSplitter = CsvLineSplitter.Get(options); - var splitLine = tempSplitter.Split(lineAsMemory, options); - - while (splitLine.Any(f => CsvLineSplitter.IsUnterminatedQuotedValue(f.AsSpan(), options))) - { - var nextLine = reader.ReadLine(); - if (nextLine == null) - break; - - completeLineForHeaders = StringHelpers.Concat(completeLineForHeaders.AsMemory(), options.NewLine, nextLine.AsMemory()).AsString(); - lineAsMemory = completeLineForHeaders.AsMemory(); - splitLine = tempSplitter.Split(lineAsMemory, options); - } - - // Update line to the complete multiline version - line = completeLineForHeaders; - lineAsMemory = line.AsMemory(); - } - - headers = skipInitialLine ? GetHeaders(lineAsMemory, options) : CreateDefaultHeaders(lineAsMemory, options); - - try - { - headerLookup = CreateHeaderLookup(headers, options); - } - catch (ArgumentException) - { - throw new InvalidOperationException("Duplicate headers detected in HeaderPresent mode. If you don't have a header you can set the HeaderMode to HeaderAbsent."); - } - - var aliases = options.Aliases; - if (aliases != null) - { - foreach (var aliasGroup in aliases) - { - var groupIndex = -1; - foreach (var alias in aliasGroup) - { - if (headerLookup.TryGetValue(alias, out var aliasIndex)) - { - if (groupIndex != -1) - throw new InvalidOperationException("Found multiple matches within alias group: " + string.Join(";", aliasGroup)); - - groupIndex = aliasIndex; - } - } - - if (groupIndex != -1) - { - foreach (var alias in aliasGroup) - headerLookup[alias] = groupIndex; - } - } - } - - if (skipInitialLine) - continue; - } - - var record = new ReadLineSpan(headers!, headerLookup, index, line, options); - // Only process multiline if we haven't already done it for header creation - var isFirstDataLineInHeaderAbsentMode = (headers != null && options.HeaderMode == HeaderMode.HeaderAbsent && - record.Index == (options.RowsToSkip + 1)); - if (options.AllowNewLineInEnclosedFieldValues && !isFirstDataLineInHeaderAbsentMode) - { - while (record.RawSplitLine.Any(f => CsvLineSplitter.IsUnterminatedQuotedValue(f.AsSpan(), options))) - { - var nextLine = reader.ReadLine(); - if (nextLine == null) - break; - - line = StringHelpers.Concat(line.AsMemory(), options.NewLine, nextLine.AsMemory()).AsString(); - record = new ReadLineSpan(headers!, headerLookup, index, line, options); - } - } - - yield return record; - } - } + => Enumerate(new TextReaderLineSource(reader), default, options ?? new CsvOptions()); /// /// Reads CSV data from memory with enhanced memory management options. @@ -261,117 +158,7 @@ public static CsvBufferWriter CreateBufferWriter(ReadOnlySpan headers, c } private static IEnumerable ReadFromMemoryOptimizedImpl(ReadOnlyMemory csv, CsvOptions options, CsvMemoryOptions memoryOptions) - { - var position = 0; - var index = 0; - ReadOnlyMemory[]? headers = null; - Dictionary? headerLookup = null; - - while (position < csv.Length) - { - var line = ReadLineOptimized(csv, ref position, memoryOptions); - if (line.IsEmpty) break; - - index++; - - if (index <= options.RowsToSkip || options.SkipRow?.Invoke(line, index) == true) - continue; - - if (headers == null || headerLookup == null) - { - InitializeOptions(line.Span, options); - var skipInitialLine = options.HeaderMode == HeaderMode.HeaderPresent; - - headers = skipInitialLine ? GetHeaders(line, options) : CreateDefaultHeaders(line, options); - - try - { - headerLookup = CreateHeaderLookup(headers, options); - } - catch (ArgumentException) - { - throw new InvalidOperationException("Duplicate headers detected in HeaderPresent mode. If you don't have a header you can set the HeaderMode to HeaderAbsent."); - } - - if (skipInitialLine) - continue; - } - - var record = new ReadLineSpanOptimized(headers, headerLookup, index, line, options, memoryOptions); - if (options.AllowNewLineInEnclosedFieldValues) - { - while (record.RawSplitLine.Any(f => CsvLineSplitter.IsUnterminatedQuotedValue(f.Span, options))) - { - var nextLine = ReadLineOptimized(csv, ref position, memoryOptions); - if (nextLine.IsEmpty) - break; - - line = ConcatenateMemory(line, options.NewLine.AsMemory(), nextLine, memoryOptions); - record = new ReadLineSpanOptimized(headers, headerLookup, index, line, options, memoryOptions); - } - } - - yield return record; - } - } - - private static ReadOnlyMemory ReadLineOptimized(ReadOnlyMemory source, ref int position, CsvMemoryOptions memoryOptions) - { - if (position >= source.Length) - return ReadOnlyMemory.Empty; - - var span = source.Span.Slice(position); - var newlineIndex = span.IndexOfAny('\n', '\r'); - - if (newlineIndex == -1) - { - // Last line without newline - var result = source.Slice(position); - position = source.Length; - return result; - } - - var lineLength = newlineIndex; - var line = source.Slice(position, lineLength); - - // Skip newline characters - position += lineLength; - if (position < source.Length) - { - var ch = source.Span[position]; - if (ch == '\r' || ch == '\n') - { - position++; - // Handle CRLF - if (position < source.Length && ch == '\r' && source.Span[position] == '\n') - position++; - } - } - - return line; - } - - private static ReadOnlyMemory ConcatenateMemory(ReadOnlyMemory first, ReadOnlyMemory separator, ReadOnlyMemory second, CsvMemoryOptions memoryOptions) - { - var totalLength = first.Length + separator.Length + second.Length; - var buffer = memoryOptions.CharArrayPool.Rent(totalLength); - - try - { - var span = buffer.AsSpan(); - first.Span.CopyTo(span); - separator.Span.CopyTo(span.Slice(first.Length)); - second.Span.CopyTo(span.Slice(first.Length + separator.Length)); - - var result = new char[totalLength]; - span.Slice(0, totalLength).CopyTo(result); - return result.AsMemory(); - } - finally - { - memoryOptions.CharArrayPool.Return(buffer); - } - } + => Enumerate(new MemorySliceLineSource(csv, memoryOptions), new OptimizedRowFactory(memoryOptions), options); #endif @@ -394,114 +181,7 @@ private static IEnumerable ReadFromTextImpl(string csv, CsvOptions? op } private static IEnumerable ReadImpl(TextReader reader, CsvOptions? options) - { - // NOTE: Logic is copied in ReadImpl/ReadImplAsync/ReadFromMemory - options ??= new CsvOptions(); - - string? line; - var index = 0; - MemoryText[]? headers = null; - Dictionary? headerLookup = null; - while ((line = reader.ReadLine()) != null) - { - index++; - - var lineAsMemory = line.AsMemory(); - if (index <= options.RowsToSkip || options.SkipRow?.Invoke(lineAsMemory, index) == true) - continue; - - if (headers == null || headerLookup == null) - { - InitializeOptions(lineAsMemory.AsSpan(), options); - var skipInitialLine = options.HeaderMode == HeaderMode.HeaderPresent; - - // For HeaderAbsent mode with multiline fields, we need to process the complete line first - if (!skipInitialLine && options.AllowNewLineInEnclosedFieldValues) - { - // Process multiline fields to get the complete first data line - var completeLineForHeaders = line; - var tempSplitter = CsvLineSplitter.Get(options); - var splitLine = tempSplitter.Split(lineAsMemory, options); - - while (splitLine.Any(f => CsvLineSplitter.IsUnterminatedQuotedValue(f.AsSpan(), options))) - { - var nextLine = reader.ReadLine(); - if (nextLine == null) - break; - - completeLineForHeaders += options.NewLine + nextLine; - lineAsMemory = completeLineForHeaders.AsMemory(); - splitLine = tempSplitter.Split(lineAsMemory, options); - } - - // Update line to the complete multiline version - line = completeLineForHeaders; - lineAsMemory = line.AsMemory(); - } - - headers = skipInitialLine ? GetHeaders(lineAsMemory, options) : CreateDefaultHeaders(lineAsMemory, options); - - try - { - headerLookup = CreateHeaderLookup(headers, options); - } - catch (ArgumentException) - { - throw new InvalidOperationException("Duplicate headers detected in HeaderPresent mode. If you don't have a header you can set the HeaderMode to HeaderAbsent."); - } - - var aliases = options.Aliases; - if (aliases != null) - { - // NOTE: For each group we need at most 1 match (i.e. SingleOrDefault) - foreach (var aliasGroup in aliases) - { - var groupIndex = -1; - foreach (var alias in aliasGroup) - { - if (headerLookup.TryGetValue(alias, out var aliasIndex)) - { - if (groupIndex != -1) - throw new InvalidOperationException("Found multiple matches within alias group: " + string.Join(";", aliasGroup)); - - groupIndex = aliasIndex; - } - } - - if (groupIndex != -1) - { - foreach (var alias in aliasGroup) - headerLookup[alias] = groupIndex; - } - } - } - - if (skipInitialLine) - continue; - } - - var record = new ReadLine(headers!, headerLookup, index, line, options); - // Only process multiline if we haven't already done it for header creation - var isFirstDataLineInHeaderAbsentMode = (headers != null && options.HeaderMode == HeaderMode.HeaderAbsent && - record.Index == (options.RowsToSkip + 1)); - if (options.AllowNewLineInEnclosedFieldValues && !isFirstDataLineInHeaderAbsentMode) - { - // TODO: Move to CsvLineSplitter? - // TODO: Shouldn't we only check the last part? - while (record.RawSplitLine.Any(f => CsvLineSplitter.IsUnterminatedQuotedValue(f.AsSpan(), options))) - { - var nextLine = reader.ReadLine(); - if (nextLine == null) - break; - - line += options.NewLine + nextLine; - record = new ReadLine(headers!, headerLookup, index, line, options); - } - } - - yield return record; - } - } + => Enumerate(new TextReaderLineSource(reader), default, options ?? new CsvOptions()); #if NET8_0_OR_GREATER /// @@ -557,86 +237,8 @@ static async IAsyncEnumerable Impl(string csv, CsvOptions? options) return Impl(csv, options); } - private static async IAsyncEnumerable ReadImplAsync(TextReader reader, CsvOptions? options) - { - // NOTE: Logic is copied in ReadImpl/ReadImplAsync/ReadFromMemory - options ??= new CsvOptions(); - - string? line; - var index = 0; - MemoryText[]? headers = null; - Dictionary? headerLookup = null; - while ((line = await reader.ReadLineAsync()) != null) - { - index++; - - var lineAsMemory = line.AsMemory(); - if (index <= options.RowsToSkip || options.SkipRow?.Invoke(lineAsMemory, index) == true) - continue; - - if (headers == null || headerLookup == null) - { - InitializeOptions(lineAsMemory.Span, options); - var skipInitialLine = options.HeaderMode == HeaderMode.HeaderPresent; - - headers = skipInitialLine ? GetHeaders(lineAsMemory, options) : CreateDefaultHeaders(lineAsMemory, options); - - try - { - headerLookup = CreateHeaderLookup(headers, options); - } - catch (ArgumentException) - { - throw new InvalidOperationException("Duplicate headers detected in HeaderPresent mode. If you don't have a header you can set the HeaderMode to HeaderAbsent."); - } - - var aliases = options.Aliases; - if (aliases != null) - { - // NOTE: For each group we need at most 1 match (i.e. SingleOrDefault) - foreach (var aliasGroup in aliases) - { - var groupIndex = -1; - foreach (var alias in aliasGroup) - { - if (headerLookup.TryGetValue(alias, out var aliasIndex)) - { - if (groupIndex != -1) - throw new InvalidOperationException("Found multiple matches within alias group: " + string.Join(";", aliasGroup)); - - groupIndex = aliasIndex; - } - } - - if (groupIndex != -1) - { - foreach (var alias in aliasGroup) - headerLookup[alias] = groupIndex; - } - } - } - - if (skipInitialLine) - continue; - } - - var record = new ReadLine(headers!, headerLookup, index, line, options); - if (options.AllowNewLineInEnclosedFieldValues) - { - while (record.RawSplitLine.Any(f => CsvLineSplitter.IsUnterminatedQuotedValue(f.AsSpan(), options))) - { - var nextLine = await reader.ReadLineAsync(); - if (nextLine == null) - break; - - line += options.NewLine + nextLine; - record = new ReadLine(headers!, headerLookup, index, line, options); - } - } - - yield return record; - } - } + private static IAsyncEnumerable ReadImplAsync(TextReader reader, CsvOptions? options) + => EnumerateAsync(new AsyncTextReaderLineSource(reader), default, options ?? new CsvOptions()); #endif private static char AutoDetectSeparator(SpanText sampleLine) @@ -843,7 +445,7 @@ ICsvLine SubLine(ICsvLine line, int start, int length) return new ReadLine(headers, map, line.Index, line.Raw, new CsvOptions()) { parsedLine = values }; } } - private sealed class ReadLine : ICsvLine + internal sealed class ReadLine : ICsvLine { private readonly Dictionary headerLookup; private readonly CsvOptions options; @@ -944,7 +546,7 @@ public override string ToString() #if NET8_0_OR_GREATER - private sealed class ReadLineSpan : ICsvLineSpan + internal sealed class ReadLineSpan : ICsvLineSpan { private readonly Dictionary headerLookup; private readonly CsvOptions options; @@ -1085,7 +687,7 @@ public bool TryGetSpan(int index, out ReadOnlySpan value) public override string ToString() => Raw; } - private sealed class ReadLineSpanOptimized : ICsvLineSpan + internal sealed class ReadLineSpanOptimized : ICsvLineSpan { private readonly Dictionary headerLookup; private readonly CsvOptions options; From 84e2c9608c08c93a6c0861f39fc654f9e02f042a Mon Sep 17 00:00:00 2001 From: Steve Hansen Date: Sat, 16 May 2026 13:07:56 +0200 Subject: [PATCH 2/2] perf: only check the last field for unterminated quotes in multiline continuation By RFC 4180, only the last field of a split line can be unterminated -- an unterminated quote in any earlier field would have been swallowed by Split into a single multi-comma field. Replace splitLine.Any(...) with a direct check of splitLine[Count-1] in both sync and async engine bodies (header pre-pass and per-row loop, 4 spots). Drops the System.Linq import in the engine file. Closes the deferred should-fix item from the #118 review and the corresponding part of #120. Co-Authored-By: Claude Opus 4.7 (1M context) --- Csv/CsvReader.Engine.cs | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/Csv/CsvReader.Engine.cs b/Csv/CsvReader.Engine.cs index 49bfede..4cc8a4d 100644 --- a/Csv/CsvReader.Engine.cs +++ b/Csv/CsvReader.Engine.cs @@ -2,7 +2,6 @@ using System.Collections.Generic; using System.Diagnostics; using System.IO; -using System.Linq; #if NET8_0_OR_GREATER using System.Buffers; @@ -297,7 +296,7 @@ private static IEnumerable Enumerate(TSource sour { var splitLine = options.Splitter.Split(line, options); - while (splitLine.Any(f => CsvLineSplitter.IsUnterminatedQuotedValue(f.AsSpan(), options))) + while (splitLine.Count > 0 && CsvLineSplitter.IsUnterminatedQuotedValue(splitLine[splitLine.Count - 1].AsSpan(), options)) { if (!source.TryReadLine(out var nextLine, out _)) break; @@ -351,7 +350,7 @@ private static IEnumerable Enumerate(TSource sour if (options.AllowNewLineInEnclosedFieldValues && !isFirstDataLineInHeaderAbsentMode) { var rawSplit = options.Splitter.Split(line, options); - while (rawSplit.Any(f => CsvLineSplitter.IsUnterminatedQuotedValue(f.AsSpan(), options))) + while (rawSplit.Count > 0 && CsvLineSplitter.IsUnterminatedQuotedValue(rawSplit[rawSplit.Count - 1].AsSpan(), options)) { if (!source.TryReadLine(out var nextLine, out _)) break; @@ -400,7 +399,7 @@ private static async IAsyncEnumerable EnumerateAsync CsvLineSplitter.IsUnterminatedQuotedValue(f.AsSpan(), options))) + while (splitLine.Count > 0 && CsvLineSplitter.IsUnterminatedQuotedValue(splitLine[splitLine.Count - 1].AsSpan(), options)) { var (nextOk, nextLine, _) = await source.TryReadLineAsync(ct).ConfigureAwait(false); if (!nextOk) @@ -455,7 +454,7 @@ private static async IAsyncEnumerable EnumerateAsync CsvLineSplitter.IsUnterminatedQuotedValue(f.AsSpan(), options))) + while (rawSplit.Count > 0 && CsvLineSplitter.IsUnterminatedQuotedValue(rawSplit[rawSplit.Count - 1].AsSpan(), options)) { var (nextOk, nextLine, _) = await source.TryReadLineAsync(ct).ConfigureAwait(false); if (!nextOk)