-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathvis.py
More file actions
56 lines (38 loc) · 1.54 KB
/
vis.py
File metadata and controls
56 lines (38 loc) · 1.54 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
#!/usr/bin/env python
"""
Inspect a Megatron IndexedDataset.
For the first 100 documents it prints
• token IDs / pieces / detokenised text
"""
import sys
from typing import List
from tqdm import tqdm
import numpy as np
import sentencepiece as spm
from megatron.data.indexed_dataset_hacked import MMapIndexedDataset
# --- helpers ----
def load_indexed_dset(path_to_bin: str) -> MMapIndexedDataset:
    """Open <prefix>.idx / <prefix>.bin as a Megatron IndexedDataset.

    Args:
        path_to_bin: Path to the ``.bin`` file. A path that does not end in
            ``.bin`` is passed through unchanged and treated as the prefix.

    Returns:
        An ``MMapIndexedDataset`` constructed from the path prefix.
    """
    suffix = ".bin"
    # Strip only a *trailing* ".bin". A blanket str.replace would also delete
    # ".bin" from the middle of the path (e.g. a directory "tokens.bin.d/").
    if path_to_bin.endswith(suffix):
        prefix = path_to_bin[: -len(suffix)]
    else:
        prefix = path_to_bin
    return MMapIndexedDataset(prefix)
def show(
    ids: List[int],
    label: str,
    doc_idx: int,
    sp: "spm.SentencePieceProcessor",
) -> None:
    """Print one token sequence as raw IDs, as pieces, and as decoded text.

    Args:
        ids: Token IDs to display.
        label: Short tag included in every printed line.
        doc_idx: Document index included in every printed line.
        sp: Tokenizer providing ``id_to_piece(id)`` and ``decode(ids)``.
    """
    # Shared line prefix so the three views of a document line up in the log.
    prefix = f"[doc {doc_idx}] {label}"
    pieces = [sp.id_to_piece(token_id) for token_id in ids]
    print(f"{prefix} IDs : {ids}")
    print(f"{prefix} pieces: {pieces}")
    print(f"{prefix} text : {sp.decode(ids)}")
def main(input_bin: str, sp_model: str, max_docs: int = 100) -> None:
    """Print IDs, pieces and decoded text for the leading documents of a dataset.

    Args:
        input_bin: Path to the dataset ``.bin`` file, handed to
            ``load_indexed_dset``.
        sp_model: Path to the SentencePiece model file.
        max_docs: Upper bound on the number of documents printed
            (defaults to 100).
    """
    # --- SentencePiece --------
    sp = spm.SentencePieceProcessor(model_file=sp_model)
    print(f"Loaded SentencePiece model ({sp.get_piece_size()} pieces) from {sp_model!r}")

    dset = load_indexed_dset(input_bin)
    # Clamp to the dataset size so a dataset with fewer than `max_docs`
    # documents is not indexed past its end.
    # NOTE(review): relies on the dataset class implementing __len__ — confirm
    # for the "hacked" MMapIndexedDataset variant.
    num_docs = min(max_docs, len(dset))
    for doc_idx in tqdm(range(num_docs), desc="docs"):
        ids: np.ndarray = dset.get(doc_idx)
        show(ids.tolist(), "full document", doc_idx, sp)
# --- main ----------------------------------------------------------
if __name__ == "__main__":
    # Everything after the program name; the first two entries are required:
    # the dataset .bin path and the SentencePiece model path.
    cli_args = sys.argv[1:]
    if len(cli_args) < 2:
        print(f"Usage: {sys.argv[0]} /path/to/file.bin /path/to/model.model")
        sys.exit(1)
    # Extra trailing arguments are ignored.
    bin_path, model_path = cli_args[:2]
    main(bin_path, model_path)