Skip to content
/ fq Public
forked from wader/fq

Commit efc59a8

Browse files
committed
ldb: uncompression support
1 parent fb910bd commit efc59a8

File tree

19 files changed

+295
-57
lines changed

19 files changed

+295
-57
lines changed

format/ldb/ldb.go

Lines changed: 130 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,14 @@ package ldb
55
// https://github.com/google/leveldb/blob/main/doc/index.md
66

77
import (
8+
"bytes"
89
"encoding/binary"
910
"fmt"
1011
"hash/crc32"
1112

13+
"github.com/golang/snappy"
1214
"github.com/wader/fq/format"
15+
"github.com/wader/fq/pkg/bitio"
1316
"github.com/wader/fq/pkg/decode"
1417
"github.com/wader/fq/pkg/interp"
1518
"github.com/wader/fq/pkg/scalar"
@@ -39,10 +42,16 @@ const (
3942
)
4043

4144
// https://github.com/google/leveldb/blob/main/include/leveldb/options.h#L25
45+
const (
46+
compressionTypeNone = 0x0
47+
compressionTypeSnappy = 0x1
48+
compressionTypeZstandard = 0x2
49+
)
50+
4251
var compressionTypes = scalar.UintMapSymStr{
43-
0x0: "none",
44-
0x1: "Snappy",
45-
0x2: "Zstandard",
52+
compressionTypeNone: "none",
53+
compressionTypeSnappy: "Snappy",
54+
compressionTypeZstandard: "Zstandard",
4655
}
4756

4857
// https://github.com/google/leveldb/blob/main/db/dbformat.h#L54
@@ -59,15 +68,15 @@ type BlockHandle struct {
5968
func ldbDecode(d *decode.D) any {
6069
d.Endian = decode.LittleEndian
6170

62-
// Read the footer (last 48 bytes)
71+
// footer
72+
6373
d.SeekAbs(d.Len() - footerEncodedLength*8)
6474
var indexOffset int64
6575
var indexSize int64
6676
var metaIndexOffset int64
6777
var metaIndexSize int64
6878

6979
d.FieldStruct("footer", func(d *decode.D) {
70-
// Extract varints for metaindex offset and size, index offset and size
7180
d.FieldStruct("metaindex_handle", func(d *decode.D) {
7281
metaIndexOffset = int64(d.FieldUintFn("offset", decodeVarInt))
7382
metaIndexSize = int64(d.FieldUintFn("size", decodeVarInt))
@@ -80,12 +89,25 @@ func ldbDecode(d *decode.D) any {
8089
d.FieldU64("magic_number", d.UintAssert(tableMagicNumber), scalar.UintHex)
8190
})
8291

92+
// metaindex
93+
8394
d.SeekAbs(metaIndexOffset * 8)
84-
fieldStructBlock("metaindex_block", metaIndexSize, nil, d)
95+
var metaHandles []BlockHandle
96+
fieldStructBlock("metaindex", metaIndexSize, readKeyValueContent, func(d *decode.D) {
97+
// BlockHandle
98+
// https://github.com/google/leveldb/blob/main/table/format.cc#L24
99+
handle := BlockHandle{
100+
Offset: d.FieldUintFn("offset", decodeVarInt),
101+
Size: d.FieldUintFn("size", decodeVarInt),
102+
}
103+
metaHandles = append(metaHandles, handle)
104+
}, d)
105+
106+
// index
85107

86108
d.SeekAbs(indexOffset * 8)
87109
var dataHandles []BlockHandle
88-
fieldStructBlock("index_block", indexSize, func(d *decode.D) {
110+
fieldStructBlock("index", indexSize, readKeyValueContent, func(d *decode.D) {
89111
// BlockHandle
90112
// https://github.com/google/leveldb/blob/main/table/format.cc#L24
91113
handle := BlockHandle{
@@ -95,77 +117,128 @@ func ldbDecode(d *decode.D) any {
95117
dataHandles = append(dataHandles, handle)
96118
}, d)
97119

98-
fmt.Println("total handles", len(dataHandles))
99-
d.FieldArray("data_blocks", func(d *decode.D) {
100-
for _, handle := range dataHandles {
101-
d.SeekAbs(int64(handle.Offset) * 8)
102-
fieldStructBlock("data_block", int64(handle.Size), nil, d)
103-
}
104-
})
120+
// meta
121+
122+
if len(metaHandles) > 0 {
123+
d.FieldArray("meta", func(d *decode.D) {
124+
for _, handle := range metaHandles {
125+
d.SeekAbs(int64(handle.Offset) * 8)
126+
fieldStructBlock("meta_block", int64(handle.Size), readMetaContent, nil, d)
127+
}
128+
})
129+
}
130+
131+
// data
132+
133+
if len(dataHandles) > 0 {
134+
d.FieldArray("data", func(d *decode.D) {
135+
for _, handle := range dataHandles {
136+
d.SeekAbs(int64(handle.Offset) * 8)
137+
fieldStructBlock("data_block", int64(handle.Size), readKeyValueContent, nil, d)
138+
}
139+
})
140+
}
105141

106142
return nil
107143
}
108144

109145
// Helpers
110146

111-
func fieldStructBlock(name string, size int64, valueCallbackFn func(d *decode.D), d *decode.D) *decode.D {
147+
func fieldStructBlock(name string, size int64, readBlockContent func(size int64, valueCallbackFn func(d *decode.D), d *decode.D), valueCallbackFn func(d *decode.D), d *decode.D) *decode.D {
112148
// ReadBlock: https://github.com/google/leveldb/blob/main/table/format.cc#L69
113-
uint32Size := int64(32)
114-
uint64Size := int64(64)
115149
return d.FieldStruct(name, func(d *decode.D) {
116150
start := d.Pos()
117151
br := d.RawLen(size * 8)
118-
end := d.Pos()
119152
compressionType := d.FieldU8("compression", compressionTypes, scalar.UintHex)
120153
// validate crc
121154
data := d.ReadAllBits(br)
122155
bytesToCheck := append(data, uint8(compressionType))
123156
maskedCRCInt := maskedCrc32(bytesToCheck)
124157
d.FieldU32("crc", d.UintAssert(uint64(maskedCRCInt)), scalar.UintHex)
125-
d.FieldStruct("data", func(d *decode.D) {
126-
// https://github.com/google/leveldb/blob/main/table/block_builder.cc#L16
127-
// https://github.com/google/leveldb/blob/main/table/block.cc
128-
var restartOffset int64
129-
d.SeekAbs(end - uint32Size)
130-
d.FieldStruct("trailer", func(d *decode.D) {
131-
numRestarts := int64(d.FieldU32("num_restarts"))
132-
restartOffset = size*8 - (1+numRestarts)*uint32Size
133-
d.SeekAbs(start + restartOffset)
134-
d.FieldArray("restarts", func(d *decode.D) {
135-
for i := 0; i < int(numRestarts); i++ {
136-
d.FieldU32("restart")
137-
}
138-
})
158+
159+
d.SeekAbs(start)
160+
if compressionType == compressionTypeNone {
161+
d.FieldStruct("uncompressed", func(d *decode.D) {
162+
readBlockContent(size, valueCallbackFn, d)
139163
})
140-
// TK: how do you make an empty entries-array appear _above_ the trailer?
141-
// Right now, it's omitted if empty.
142-
if restartOffset <= 0 {
143-
return
144-
}
145-
d.SeekAbs(start)
146-
d.FieldArray("entries", func(d *decode.D) {
147-
for d.Pos() < start+restartOffset {
148-
d.FieldStruct("entry", func(d *decode.D) {
149-
d.FieldUintFn("shared_bytes", decodeVarInt)
150-
unshared := int64(d.FieldUintFn("unshared_bytes", decodeVarInt))
151-
valueLength := d.FieldUintFn("value_length", decodeVarInt)
152-
// InternalKey
153-
// https://github.com/google/leveldb/blob/main/db/dbformat.h#L171
154-
d.FieldStruct("key_delta", func(d *decode.D) {
155-
d.FieldUTF8("user_key", int(unshared-uint64Size/8))
156-
d.FieldU8("type", valueTypes, scalar.UintHex)
157-
d.FieldU56("sequence_number")
158-
})
159-
if valueCallbackFn == nil {
160-
d.FieldUTF8("value", int(valueLength))
161-
} else {
162-
d.FieldStruct("value", valueCallbackFn)
163-
}
164-
})
164+
} else {
165+
compressedSize := size
166+
compressed := data
167+
bb := &bytes.Buffer{}
168+
_ = bb
169+
switch compressionType {
170+
case compressionTypeSnappy:
171+
decompressed, err := snappy.Decode(nil, compressed)
172+
if err != nil {
173+
d.Fatalf("failed decompressing data: %v", err)
165174
}
175+
d.Copy(bb, bytes.NewReader(decompressed))
176+
default:
177+
d.Fatalf("Unsupported compression type: %x", compressionType)
178+
}
179+
d.FieldStructRootBitBufFn("uncompressed", bitio.NewBitReader(bb.Bytes(), -1), func(d *decode.D) {
180+
readBlockContent(int64(bb.Len()), valueCallbackFn, d)
166181
})
182+
d.FieldRawLen("compressed", compressedSize*8)
183+
}
184+
185+
})
186+
}
187+
188+
func readKeyValueContent(size int64, valueCallbackFn func(d *decode.D), d *decode.D) {
189+
// https://github.com/google/leveldb/blob/main/table/block_builder.cc#L16
190+
// https://github.com/google/leveldb/blob/main/table/block.cc
191+
uint32Size := int64(32)
192+
uint64Size := int64(64)
193+
start := d.Pos()
194+
end := start + size*8
195+
196+
var restartOffset int64
197+
d.SeekAbs(end - uint32Size)
198+
d.FieldStruct("trailer", func(d *decode.D) {
199+
numRestarts := int64(d.FieldU32("num_restarts"))
200+
restartOffset = size*8 - (1+numRestarts)*uint32Size
201+
d.SeekAbs(start + restartOffset)
202+
d.FieldArray("restarts", func(d *decode.D) {
203+
for i := 0; i < int(numRestarts); i++ {
204+
d.FieldU32("restart")
205+
}
167206
})
168207
})
208+
// TK: how do you make an empty entries-array appear _above_ the trailer?
209+
// Right now, it's omitted if empty.
210+
if restartOffset <= 0 {
211+
return
212+
}
213+
d.SeekAbs(start)
214+
d.FieldArray("entries", func(d *decode.D) {
215+
for d.Pos() < start+restartOffset {
216+
d.FieldStruct("entry", func(d *decode.D) {
217+
d.FieldUintFn("shared_bytes", decodeVarInt)
218+
unshared := int64(d.FieldUintFn("unshared_bytes", decodeVarInt))
219+
valueLength := d.FieldUintFn("value_length", decodeVarInt)
220+
// InternalKey
221+
// https://github.com/google/leveldb/blob/main/db/dbformat.h#L171
222+
d.FieldStruct("key_delta", func(d *decode.D) {
223+
d.FieldUTF8("user_key", int(unshared-uint64Size/8))
224+
d.FieldU8("type", valueTypes, scalar.UintHex)
225+
d.FieldU56("sequence_number")
226+
})
227+
if valueCallbackFn == nil {
228+
d.FieldUTF8("value", int(valueLength))
229+
} else {
230+
d.FieldStruct("value", valueCallbackFn)
231+
}
232+
})
233+
}
234+
})
235+
}
236+
237+
func readMetaContent(size int64, valueCallbackFn func(d *decode.D), d *decode.D) {
238+
// TK(2023-12-04)
239+
// https://github.com/google/leveldb/blob/main/doc/table_format.md#filter-meta-block
240+
// https://github.com/google/leveldb/blob/main/table/filter_block.cc
241+
d.FieldRawLen("raw", size*8)
169242
}
170243

171244
func decodeVarInt(d *decode.D) uint64 {

format/ldb/testdata/000005.ldb

-464 Bytes
Binary file not shown.

format/ldb/testdata/ldb.fqtest

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
$ fq -d ldb dv uncompressed.ldb/000005.ldb
2+
|00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f|0123456789abcdef|.{}: uncompressed.ldb/000005.ldb (ldb) 0x0-0x65a (1626)
3+
| | | data[0:1]: 0x0-0x601 (1537)
4+
| | | [0]{}: data_block 0x0-0x601 (1537)
5+
| | | uncompressed{}: 0x0-0x5fc (1532)
6+
| | | entries[0:4]: 0x0-0x5f4 (1524)
7+
| | | [0]{}: entry 0x0-0x1d4 (468)
8+
0x000|00 |. | shared_bytes: 0 0x0-0x1 (1)
9+
0x000| 13 | . | unshared_bytes: 19 0x1-0x2 (1)
10+
0x000| bd 03 | .. | value_length: 445 0x2-0x4 (2)
11+
| | | key_delta{}: 0x4-0x17 (19)
12+
0x000| 6c 6f 72 65 6d 2e 64 6f 6c 6f 72 | lorem.dolor | user_key: "lorem.dolor" 0x4-0xf (11)
13+
0x000| 01| .| type: "value" (0x1) 0xf-0x10 (1)
14+
0x010|03 00 00 00 00 00 00 |....... | sequence_number: 3 0x10-0x17 (7)
15+
0x010| 4c 6f 72 65 6d 20 69 70 73| Lorem ips| value: "Lorem ipsum dolor sit amet, consectetur adipisc..." 0x17-0x1d4 (445)
16+
0x020|75 6d 20 64 6f 6c 6f 72 20 73 69 74 20 61 6d 65|um dolor sit ame|
17+
* |until 0x1d3.7 (445) | |
18+
| | | [1]{}: entry 0x1d4-0x3a2 (462)
19+
0x1d0| 06 | . | shared_bytes: 6 0x1d4-0x1d5 (1)
20+
0x1d0| 0d | . | unshared_bytes: 13 0x1d5-0x1d6 (1)
21+
0x1d0| bd 03 | .. | value_length: 445 0x1d6-0x1d8 (2)
22+
| | | key_delta{}: 0x1d8-0x1e5 (13)
23+
0x1d0| 69 70 73 75 6d | ipsum | user_key: "ipsum" 0x1d8-0x1dd (5)
24+
0x1d0| 01 | . | type: "value" (0x1) 0x1dd-0x1de (1)
25+
0x1d0| 02 00| ..| sequence_number: 2 0x1de-0x1e5 (7)
26+
0x1e0|00 00 00 00 00 |..... |
27+
0x1e0| 4c 6f 72 65 6d 20 69 70 73 75 6d| Lorem ipsum| value: "Lorem ipsum dolor sit amet, consectetur adipisc..." 0x1e5-0x3a2 (445)
28+
0x1f0|20 64 6f 6c 6f 72 20 73 69 74 20 61 6d 65 74 2c| dolor sit amet,|
29+
* |until 0x3a1.7 (445) | |
30+
| | | [2]{}: entry 0x3a2-0x570 (462)
31+
0x3a0| 06 | . | shared_bytes: 6 0x3a2-0x3a3 (1)
32+
0x3a0| 0d | . | unshared_bytes: 13 0x3a3-0x3a4 (1)
33+
0x3a0| bd 03 | .. | value_length: 445 0x3a4-0x3a6 (2)
34+
| | | key_delta{}: 0x3a6-0x3b3 (13)
35+
0x3a0| 6c 6f 72 65 6d | lorem | user_key: "lorem" 0x3a6-0x3ab (5)
36+
0x3a0| 01 | . | type: "value" (0x1) 0x3ab-0x3ac (1)
37+
0x3a0| 01 00 00 00| ....| sequence_number: 1 0x3ac-0x3b3 (7)
38+
0x3b0|00 00 00 |... |
39+
0x3b0| 4c 6f 72 65 6d 20 69 70 73 75 6d 20 64| Lorem ipsum d| value: "Lorem ipsum dolor sit amet, consectetur adipisc..." 0x3b3-0x570 (445)
40+
0x3c0|6f 6c 6f 72 20 73 69 74 20 61 6d 65 74 2c 20 63|olor sit amet, c|
41+
* |until 0x56f.7 (445) | |
42+
| | | [3]{}: entry 0x570-0x5f4 (132)
43+
0x570|00 |. | shared_bytes: 0 0x570-0x571 (1)
44+
0x570| 0b | . | unshared_bytes: 11 0x571-0x572 (1)
45+
0x570| 76 | v | value_length: 118 0x572-0x573 (1)
46+
| | | key_delta{}: 0x573-0x57e (11)
47+
0x570| 72 6f 77 | row | user_key: "row" 0x573-0x576 (3)
48+
0x570| 01 | . | type: "value" (0x1) 0x576-0x577 (1)
49+
0x570| 04 00 00 00 00 00 00 | ....... | sequence_number: 4 0x577-0x57e (7)
50+
0x570| 52 6f| Ro| value: "Row, row, row your boat\nGently down the stream...." 0x57e-0x5f4 (118)
51+
0x580|77 2c 20 72 6f 77 2c 20 72 6f 77 20 79 6f 75 72|w, row, row your|
52+
* |until 0x5f3.7 (118) | |
53+
| | | trailer{}: 0x5f4-0x5fc (8)
54+
| | | restarts[0:1]: 0x5f4-0x5f8 (4)
55+
0x5f0| 00 00 00 00 | .... | [0]: 0 restart 0x5f4-0x5f8 (4)
56+
0x5f0| 01 00 00 00 | .... | num_restarts: 1 0x5f8-0x5fc (4)
57+
0x5f0| 00 | . | compression: "none" (0x0) 0x5fc-0x5fd (1)
58+
0x5f0| 6f 99 1d| o..| crc: 0xb31d996f (valid) 0x5fd-0x601 (4)
59+
0x600|b3 |. |
60+
| | | metaindex{}: 0x601-0x60e (13)
61+
| | | uncompressed{}: 0x601-0x609 (8)
62+
| | | trailer{}: 0x601-0x609 (8)
63+
| | | restarts[0:1]: 0x601-0x605 (4)
64+
0x600| 00 00 00 00 | .... | [0]: 0 restart 0x601-0x605 (4)
65+
0x600| 01 00 00 00 | .... | num_restarts: 1 0x605-0x609 (4)
66+
0x600| 00 | . | compression: "none" (0x0) 0x609-0x60a (1)
67+
0x600| c0 f2 a1 b0 | .... | crc: 0xb0a1f2c0 (valid) 0x60a-0x60e (4)
68+
| | | index{}: 0x60e-0x62a (28)
69+
| | | uncompressed{}: 0x60e-0x625 (23)
70+
| | | entries[0:1]: 0x60e-0x61d (15)
71+
| | | [0]{}: entry 0x60e-0x61d (15)
72+
0x600| 00 | . | shared_bytes: 0 0x60e-0x60f (1)
73+
0x600| 09| .| unshared_bytes: 9 0x60f-0x610 (1)
74+
0x610|03 |. | value_length: 3 0x610-0x611 (1)
75+
| | | key_delta{}: 0x611-0x61a (9)
76+
0x610| 73 | s | user_key: "s" 0x611-0x612 (1)
77+
0x610| 01 | . | type: "value" (0x1) 0x612-0x613 (1)
78+
0x610| ff ff ff ff ff ff ff | ....... | sequence_number: 72057594037927935 0x613-0x61a (7)
79+
| | | value{}: 0x61a-0x61d (3)
80+
0x610| 00 | . | offset: 0 0x61a-0x61b (1)
81+
0x610| fc 0b | .. | size: 1532 0x61b-0x61d (2)
82+
| | | trailer{}: 0x61d-0x625 (8)
83+
| | | restarts[0:1]: 0x61d-0x621 (4)
84+
0x610| 00 00 00| ...| [0]: 0 restart 0x61d-0x621 (4)
85+
0x620|00 |. |
86+
0x620| 01 00 00 00 | .... | num_restarts: 1 0x621-0x625 (4)
87+
0x620| 00 | . | compression: "none" (0x0) 0x625-0x626 (1)
88+
0x620| 68 e2 bf 46 | h..F | crc: 0x46bfe268 (valid) 0x626-0x62a (4)
89+
| | | footer{}: 0x62a-0x65a (48)
90+
| | | metaindex_handle{}: 0x62a-0x62d (3)
91+
0x620| 81 0c | .. | offset: 1537 0x62a-0x62c (2)
92+
0x620| 08 | . | size: 8 0x62c-0x62d (1)
93+
| | | index_handle{}: 0x62d-0x630 (3)
94+
0x620| 8e 0c | .. | offset: 1550 0x62d-0x62f (2)
95+
0x620| 17| .| size: 23 0x62f-0x630 (1)
96+
0x630|00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00|................| padding: raw bits 0x630-0x652 (34)
97+
* |until 0x651.7 (34) | |
98+
0x650| 57 fb 80 8b 24 75 47 db| | W...$uG.| | magic_number: 0xdb4775248b80fb57 (valid) 0x652-0x65a (8)

0 commit comments

Comments
 (0)