Skip to content

Commit 09d23db

Browse files
committed
feat: fix BlockReader#SkipNext & add SourceOffset property
* SkipNext's metadata calculation wasn't working properly in the case of a CARv2, as it didn't take into account the V1 header in the original offset calculation.
* Add a SourceOffset property and some docs to be absolutely clear about which offsets we're returning in the metadata.

Ref: filecoin-project/boost#1651
1 parent 72edbd2 commit 09d23db

File tree

2 files changed

+206
-36
lines changed

2 files changed

+206
-36
lines changed

v2/block_reader.go

Lines changed: 57 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package car
22

33
import (
4+
"errors"
45
"fmt"
56
"io"
67

@@ -23,6 +24,7 @@ type BlockReader struct {
2324
// Used internally only, by BlockReader.Next during iteration over blocks.
2425
r io.Reader
2526
offset uint64
27+
v1offset uint64
2628
readerSize int64
2729
opts Options
2830
}
@@ -80,7 +82,8 @@ func NewBlockReader(r io.Reader, opts ...Option) (*BlockReader, error) {
8082
if _, err := rs.Seek(int64(v2h.DataOffset)-PragmaSize-HeaderSize, io.SeekCurrent); err != nil {
8183
return nil, err
8284
}
83-
br.offset = uint64(v2h.DataOffset)
85+
br.v1offset = uint64(v2h.DataOffset)
86+
br.offset = br.v1offset
8487
br.readerSize = int64(v2h.DataOffset + v2h.DataSize)
8588

8689
// Set br.r to a LimitReader reading from r limited to dataSize.
@@ -96,6 +99,8 @@ func NewBlockReader(r io.Reader, opts ...Option) (*BlockReader, error) {
9699
return nil, fmt.Errorf("invalid data payload header version; expected 1, got %v", header.Version)
97100
}
98101
br.Roots = header.Roots
102+
hs, _ := carv1.HeaderSize(header)
103+
br.offset += hs
99104
default:
100105
// Otherwise, error out with invalid version since only versions 1 or 2 are expected.
101106
return nil, fmt.Errorf("invalid car version: %d", br.Version)
@@ -136,10 +141,22 @@ func (br *BlockReader) Next() (blocks.Block, error) {
136141
return blocks.NewBlockWithCid(data, c)
137142
}
138143

144+
// BlockMetadata contains metadata about a block's section in a CAR file/stream.
145+
//
146+
// There are two offsets for the block data which will be the same if the
147+
// original CAR is a CARv1, but will differ if the original CAR is a CARv2. In
148+
// the case of a CARv2, SourceOffset will be the offset from the beginning of
149+
// the file/stream, and Offset will be the offset from the beginning of the CARv1
150+
// payload container within the CARv2.
151+
//
152+
// Offset is useful for index generation which requires an offset from the CARv1
153+
// payload; while SourceOffset is useful for direct block reads out of the
154+
// source file/stream regardless of version.
139155
type BlockMetadata struct {
140156
cid.Cid
141-
Offset uint64
142-
Size uint64
157+
Offset uint64 // Offset of the block data in the container CARv1
158+
SourceOffset uint64 // SourceOffset is the offset of block data in the source file/stream
159+
Size uint64
143160
}
144161

145162
// SkipNext jumps over the next block, returning metadata about what it is (the CID, offset, and size).
@@ -148,24 +165,33 @@ type BlockMetadata struct {
148165
// If the underlying reader used by the BlockReader is actually a ReadSeeker, this method will attempt to
149166
// seek over the underlying data rather than reading it into memory.
150167
func (br *BlockReader) SkipNext() (*BlockMetadata, error) {
151-
sctSize, err := util.LdReadSize(br.r, br.opts.ZeroLengthSectionAsEOF, br.opts.MaxAllowedSectionSize)
168+
sectionSize, err := util.LdReadSize(br.r, br.opts.ZeroLengthSectionAsEOF, br.opts.MaxAllowedSectionSize)
152169
if err != nil {
153170
return nil, err
154171
}
155-
156-
if sctSize == 0 {
157-
_, _, err := cid.CidFromBytes([]byte{})
172+
if sectionSize == 0 {
173+
_, _, err := cid.CidFromBytes([]byte{}) // generate zero-byte CID error
174+
if err == nil {
175+
panic("expected zero-byte CID error")
176+
}
158177
return nil, err
159178
}
160179

161-
cidSize, c, err := cid.CidFromReader(io.LimitReader(br.r, int64(sctSize)))
180+
lenSize := uint64(varint.UvarintSize(sectionSize))
181+
182+
cidSize, c, err := cid.CidFromReader(io.LimitReader(br.r, int64(sectionSize)))
162183
if err != nil {
163184
return nil, err
164185
}
165186

166-
blkSize := sctSize - uint64(cidSize)
187+
blockSize := sectionSize - uint64(cidSize)
188+
blockOffset := br.offset + lenSize + uint64(cidSize)
189+
190+
// move our reader forward; either by seeking or slurping
191+
167192
if brs, ok := br.r.(io.ReadSeeker); ok {
168-
// carv1 and we don't know the size, so work it out and cache it
193+
// carv1 and we don't know the size, so work it out and cache it so we
194+
// can use it to determine over-reads
169195
if br.readerSize == -1 {
170196
cur, err := brs.Seek(0, io.SeekCurrent)
171197
if err != nil {
@@ -180,42 +206,37 @@ func (br *BlockReader) SkipNext() (*BlockMetadata, error) {
180206
return nil, err
181207
}
182208
}
183-
// seek.
184-
finalOffset, err := brs.Seek(int64(blkSize), io.SeekCurrent)
209+
210+
// seek forward past the block data
211+
finalOffset, err := brs.Seek(int64(blockSize), io.SeekCurrent)
185212
if err != nil {
186213
return nil, err
187214
}
188-
if finalOffset != int64(br.offset)+int64(sctSize)+int64(varint.UvarintSize(sctSize)) {
189-
return nil, fmt.Errorf("unexpected length")
215+
if finalOffset != int64(br.offset)+int64(lenSize)+int64(sectionSize) {
216+
return nil, errors.New("unexpected length")
190217
}
191218
if finalOffset > br.readerSize {
192219
return nil, io.ErrUnexpectedEOF
193220
}
194-
br.offset = uint64(finalOffset)
195-
return &BlockMetadata{
196-
c,
197-
uint64(finalOffset) - sctSize - uint64(varint.UvarintSize(sctSize)),
198-
blkSize,
199-
}, nil
200-
}
201-
202-
// read to end.
203-
readCnt, err := io.CopyN(io.Discard, br.r, int64(blkSize))
204-
if err != nil {
205-
if err == io.EOF {
206-
return nil, io.ErrUnexpectedEOF
221+
} else { // just a reader, we need to slurp the block bytes
222+
readCnt, err := io.CopyN(io.Discard, br.r, int64(blockSize))
223+
if err != nil {
224+
if err == io.EOF {
225+
return nil, io.ErrUnexpectedEOF
226+
}
227+
return nil, err
228+
}
229+
if readCnt != int64(blockSize) {
230+
return nil, errors.New("unexpected length")
207231
}
208-
return nil, err
209-
}
210-
if readCnt != int64(blkSize) {
211-
return nil, fmt.Errorf("unexpected length")
212232
}
213-
origOffset := br.offset
214-
br.offset += uint64(varint.UvarintSize(sctSize)) + sctSize
233+
234+
br.offset = blockOffset + blockSize
215235

216236
return &BlockMetadata{
217-
c,
218-
origOffset,
219-
blkSize,
237+
Cid: c,
238+
Offset: blockOffset - br.v1offset,
239+
SourceOffset: blockOffset,
240+
Size: blockSize,
220241
}, nil
221242
}

v2/block_reader_test.go

Lines changed: 149 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,14 @@ package car_test
22

33
import (
44
"bytes"
5+
"crypto/rand"
56
"encoding/hex"
67
"fmt"
78
"io"
89
"os"
910
"testing"
1011

12+
blocks "github.com/ipfs/go-block-format"
1113
"github.com/ipfs/go-cid"
1214
carv2 "github.com/ipld/go-car/v2"
1315
"github.com/ipld/go-car/v2/internal/carv1"
@@ -229,6 +231,153 @@ func TestMaxHeaderLength(t *testing.T) {
229231
require.EqualError(t, err, "invalid header data, length of read beyond allowable maximum")
230232
}
231233

234+
func TestBlockReader(t *testing.T) {
235+
req := require.New(t)
236+
237+
// prepare a CARv1 with 100 blocks
238+
roots := []cid.Cid{cid.MustParse("bafyrgqhai26anf3i7pips7q22coa4sz2fr4gk4q4sqdtymvvjyginfzaqewveaeqdh524nsktaq43j65v22xxrybrtertmcfxufdam3da3hbk")}
239+
blks := make([]struct {
240+
block blocks.Block
241+
dataOffset uint64
242+
}, 100)
243+
v1buf := new(bytes.Buffer)
244+
carv1.WriteHeader(&carv1.CarHeader{Roots: roots, Version: 1}, v1buf)
245+
vb := make([]byte, 2)
246+
for i := 0; i < 100; i++ {
247+
blk := randBlock(100 + i) // we should cross the varint two-byte boundary in here somewhere
248+
vn := varint.PutUvarint(vb, uint64(len(blk.Cid().Bytes())+len(blk.RawData())))
249+
n, err := v1buf.Write(vb[:vn])
250+
req.NoError(err)
251+
req.Equal(n, vn)
252+
n, err = v1buf.Write(blk.Cid().Bytes())
253+
req.NoError(err)
254+
req.Equal(len(blk.Cid().Bytes()), n)
255+
blks[i] = struct {
256+
block blocks.Block
257+
dataOffset uint64
258+
}{block: blk, dataOffset: uint64(v1buf.Len())}
259+
n, err = v1buf.Write(blk.RawData())
260+
req.NoError(err)
261+
req.Equal(len(blk.RawData()), n)
262+
}
263+
264+
v2buf := new(bytes.Buffer)
265+
n, err := v2buf.Write(carv2.Pragma)
266+
req.NoError(err)
267+
req.Equal(len(carv2.Pragma), n)
268+
v2Header := carv2.NewHeader(uint64(v1buf.Len()))
269+
ni, err := v2Header.WriteTo(v2buf)
270+
req.NoError(err)
271+
req.Equal(carv2.HeaderSize, int(ni))
272+
n, err = v2buf.Write(v1buf.Bytes())
273+
req.NoError(err)
274+
req.Equal(v1buf.Len(), n)
275+
276+
v2padbuf := new(bytes.Buffer)
277+
n, err = v2padbuf.Write(carv2.Pragma)
278+
req.NoError(err)
279+
req.Equal(len(carv2.Pragma), n)
280+
v2Header = carv2.NewHeader(uint64(v1buf.Len()))
281+
// pad with 100 bytes
282+
v2Header.DataOffset += 100
283+
ni, err = v2Header.WriteTo(v2padbuf)
284+
req.NoError(err)
285+
req.Equal(carv2.HeaderSize, int(ni))
286+
v2padbuf.Write(make([]byte, 100))
287+
n, err = v2padbuf.Write(v1buf.Bytes())
288+
req.NoError(err)
289+
req.Equal(v1buf.Len(), n)
290+
291+
for _, testCase := range []struct {
292+
name string
293+
reader func() io.Reader
294+
v1offset uint64
295+
}{
296+
{
297+
name: "v1",
298+
reader: func() io.Reader { return &readerOnly{bytes.NewReader(v1buf.Bytes())} },
299+
},
300+
{
301+
name: "v2",
302+
reader: func() io.Reader { return &readerOnly{bytes.NewReader(v2buf.Bytes())} },
303+
v1offset: uint64(carv2.PragmaSize + carv2.HeaderSize),
304+
},
305+
{
306+
name: "v2 padded",
307+
reader: func() io.Reader { return &readerOnly{bytes.NewReader(v2padbuf.Bytes())} },
308+
v1offset: uint64(carv2.PragmaSize+carv2.HeaderSize) + 100,
309+
},
310+
{
311+
name: "v1 w/ReadSeeker",
312+
reader: func() io.Reader { return bytes.NewReader(v1buf.Bytes()) },
313+
},
314+
{
315+
name: "v2 w/ReadSeeker",
316+
reader: func() io.Reader { return bytes.NewReader(v2buf.Bytes()) },
317+
v1offset: uint64(carv2.PragmaSize + carv2.HeaderSize),
318+
},
319+
{
320+
name: "v2 padded w/ReadSeeker",
321+
reader: func() io.Reader { return bytes.NewReader(v2padbuf.Bytes()) },
322+
v1offset: uint64(carv2.PragmaSize+carv2.HeaderSize) + 100,
323+
},
324+
} {
325+
t.Run(testCase.name, func(t *testing.T) {
326+
req := require.New(t)
327+
328+
car, err := carv2.NewBlockReader(testCase.reader())
329+
req.NoError(err)
330+
req.ElementsMatch(roots, car.Roots)
331+
332+
for i := 0; i < 100; i++ {
333+
blk, err := car.Next()
334+
req.NoError(err)
335+
req.Equal(blks[i].block.Cid(), blk.Cid())
336+
req.Equal(blks[i].block.RawData(), blk.RawData())
337+
}
338+
_, err = car.Next()
339+
req.ErrorIs(err, io.EOF)
340+
341+
car, err = carv2.NewBlockReader(testCase.reader())
342+
req.NoError(err)
343+
req.ElementsMatch(roots, car.Roots)
344+
345+
for i := 0; i < 100; i++ {
346+
blk, err := car.SkipNext()
347+
req.NoError(err)
348+
req.Equal(blks[i].block.Cid(), blk.Cid)
349+
req.Equal(uint64(len(blks[i].block.RawData())), blk.Size)
350+
req.Equal(blks[i].dataOffset, blk.Offset, "block #%d", i)
351+
req.Equal(blks[i].dataOffset+testCase.v1offset, blk.SourceOffset)
352+
}
353+
_, err = car.Next()
354+
req.ErrorIs(err, io.EOF)
355+
})
356+
}
357+
}
358+
359+
type readerOnly struct {
360+
r io.Reader
361+
}
362+
363+
func (r readerOnly) Read(b []byte) (int, error) {
364+
return r.r.Read(b)
365+
}
366+
367+
func randBlock(l int) blocks.Block {
368+
data := make([]byte, l)
369+
rand.Read(data)
370+
h, err := mh.Sum(data, mh.SHA2_512, -1)
371+
if err != nil {
372+
panic(err)
373+
}
374+
blk, err := blocks.NewBlockWithCid(data, cid.NewCidV1(cid.Raw, h))
375+
if err != nil {
376+
panic(err)
377+
}
378+
return blk
379+
}
380+
232381
func requireReaderFromPath(t *testing.T, path string) io.Reader {
233382
f, err := os.Open(path)
234383
require.NoError(t, err)

0 commit comments

Comments
 (0)