From 5431ca449936a8069bde905948ce6e851b45e2cc Mon Sep 17 00:00:00 2001 From: Martin Raifer Date: Mon, 14 Mar 2016 18:17:13 +0100 Subject: [PATCH 1/6] use global offset for buffered chunks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For split matchers that occur rarely in a stream with many chunks, resetting the search offset inside the handler makes for very bad performance (e.g. for a matcher that doesn't occur at all, the runtime is O(N²) where N is the total number of chunks in the stream). Moving the offset outside of the stream-chunk-handler restores linear runtime. --- index.js | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/index.js b/index.js index 59a2ad7..73e7f85 100644 --- a/index.js +++ b/index.js @@ -9,11 +9,11 @@ function BinarySplit (matcher) { matcher = bops.from(matcher || os.EOL) var buffered var bufcount = 0 + var offset = 0 return through(write, end) function write (buf, enc, done) { bufcount++ - var offset = 0 if (buffered) { buf = bops.join([buffered, buf]) @@ -37,6 +37,7 @@ function BinarySplit (matcher) { } else { if (offset >= buf.length) { buffered = undefined + offset = 0 } else { buffered = buf } From 8a9bccd97466ba2008e633e8d62062f3bd258834 Mon Sep 17 00:00:00 2001 From: Martin Raifer Date: Mon, 14 Mar 2016 19:59:08 +0100 Subject: [PATCH 2/6] add (failing) test for chunked input data (#8) --- test.js | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/test.js b/test.js index dda3c05..4d16170 100644 --- a/test.js +++ b/test.js @@ -65,3 +65,14 @@ test('matcher at index 0 check', function (t) { splitStream.write(new Buffer('\nhello\nmax')) splitStream.end() }) + +test('chunked input', function (t) { + fs.createReadStream('test.json') + .pipe(split('\n')) + .pipe(split('i')) + .pipe(splitTest(':', function (err, items) { + if (err) throw err + t.equals(items.length, 4) + t.end() + })) +}) From d0641be733b1fc2b444d454b5a1f906d23fe286a Mon Sep 17 00:00:00 2001 From: Martin Raifer Date: Mon, 14 Mar 2016 20:03:46 +0100 Subject: [PATCH 3/6] fix handling of buffered chunks --- index.js | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/index.js b/index.js index 73e7f85..28ff6a1 100644 --- a/index.js +++ b/index.js @@ -23,14 +23,14 @@ function BinarySplit (matcher) { while (buf) { var idx = firstMatch(buf, offset) if (idx) { - var line = bops.subarray(buf, offset, idx) if (idx === buf.length) { - buffered = line + buffered = buf buf = undefined offset = idx } else { - this.push(line) - offset = idx + matcher.length + this.push(bops.subarray(buf, 0, idx)) + buf = bops.subarray(buf, idx) + offset = 0 } } else if (idx === 0) { buf = bops.subarray(buf, offset + matcher.length) From 873500231875383a8aae29801462a3dd3f0d8046 Mon Sep 17 00:00:00 2001 From: Martin Raifer Date: Mon, 14 Mar 2016 20:32:21 +0100 Subject: [PATCH 4/6] compactify code, drop unnecessary stuff --- index.js | 26 +++++++------------------- 1 file changed, 7 insertions(+), 19 deletions(-) diff --git a/index.js b/index.js index 28ff6a1..e5d4d10 100644 --- a/index.js +++ b/index.js @@ -22,25 +22,13 @@ function BinarySplit (matcher) { while (buf) { var idx = firstMatch(buf, offset) - if (idx) { - if (idx === buf.length) { - buffered = buf - buf = undefined - offset = idx - } else { - this.push(bops.subarray(buf, 0, idx)) - buf = bops.subarray(buf, idx) - offset = 0 - } - } else if (idx === 0) { - buf = bops.subarray(buf, offset + matcher.length) + if (idx !== -1 && idx < buf.length) { + this.push(bops.subarray(buf, 0, idx)) + buf = bops.subarray(buf, idx + matcher.length) + offset = 0 } else { - if (offset >= buf.length) { - buffered = undefined - offset = 0 - } else { - buffered = buf - } + buffered = buf + offset = buf.length buf = undefined } } @@ -55,7 +43,7 @@ function BinarySplit (matcher) { } function firstMatch (buf, offset) { - if (offset >= buf.length) return false + if (offset >= buf.length) return -1 for (var i = offset; i < buf.length; i++) { if (buf[i] === matcher[0]) { if (matcher.length > 1) { From ee24ff09fa4babada127447c038663ee3c3e90ec Mon Sep 17 00:00:00 2001 From: Martin Raifer Date: Mon, 14 Mar 2016 20:42:01 +0100 Subject: [PATCH 5/6] add another test for chunked input (this one fails before #8) --- test.js | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/test.js b/test.js index 4d16170..9e18bb0 100644 --- a/test.js +++ b/test.js @@ -76,3 +76,14 @@ test('chunked input', function (t) { t.end() })) }) + +test('chunked input with long matcher', function (t) { + fs.createReadStream('test.json') + .pipe(split('\n')) + .pipe(splitTest('hello', function (err, items) { + if (err) throw err + t.equals(items.length, 2) + t.equals(items[0].toString(), '{"') + t.end() + })) +}) From d3f8149a03701ba06599d9d8dc95c75eadcb1f84 Mon Sep 17 00:00:00 2001 From: Martin Raifer Date: Tue, 15 Mar 2016 13:26:38 +0100 Subject: [PATCH 6/6] make it fast again for input with many short lines see https://github.com/maxogden/binary-split/pull/8#issuecomment-196769583 --- index.js | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/index.js b/index.js index e5d4d10..3a63fcc 100644 --- a/index.js +++ b/index.js @@ -9,27 +9,27 @@ function BinarySplit (matcher) { matcher = bops.from(matcher || os.EOL) var buffered var bufcount = 0 - var offset = 0 return through(write, end) function write (buf, enc, done) { bufcount++ - + var offset = 0 + var lastMatch = 0 if (buffered) { buf = bops.join([buffered, buf]) + offset = buffered.length buffered = undefined } - while (buf) { + while (true) { var idx = firstMatch(buf, offset) if (idx !== -1 && idx < buf.length) { - this.push(bops.subarray(buf, 0, idx)) - buf = bops.subarray(buf, idx + matcher.length) - offset = 0 + this.push(bops.subarray(buf, lastMatch, idx)) + offset = idx + matcher.length + lastMatch = offset } else { - buffered = buf - offset = buf.length - buf = undefined + buffered = bops.subarray(buf, lastMatch) + break } }