Skip to content
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion convert-baichuan-hf-to-gguf.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ def parse_args() -> argparse.Namespace:
"ftype", type=int, choices=[0, 1], default=1, nargs='?',
help="output format - use 0 for float32, 1 for float16",
)
parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine")
return parser.parse_args()

args = parse_args()
Expand All @@ -86,6 +87,11 @@ def parse_args() -> argparse.Namespace:
print(f'Error: {args.model} is not a directory', file = sys.stderr)
sys.exit(1)

# Select the target byte order for the output GGUF file (little-endian by default).
endianess = gguf.GGUFEndian.LITTLE
if args.bigendian:
    endianess = gguf.GGUFEndian.BIG
# Fix: endianess_str was computed but never used; the print showed the raw enum
# value instead of the human-readable name it was prepared for.
endianess_str = "Big Endian" if args.bigendian else "Little Endian"
print(f"gguf: Conversion Endianess {endianess_str}")
# possible tensor data types
# ftype == 0 -> float32
# ftype == 1 -> float16
Expand Down Expand Up @@ -113,7 +119,7 @@ def parse_args() -> argparse.Namespace:
num_parts = count_model_parts(dir_model)
print(f"num_parts:{num_parts}\n")
ARCH=gguf.MODEL_ARCH.BAICHUAN
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)

print("gguf: get model metadata")

Expand Down
22 changes: 14 additions & 8 deletions convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -803,8 +803,8 @@ def check_vocab_size(params: Params, vocab: Vocab) -> None:


class OutputFile:
def __init__(self, fname_out: Path) -> None:
self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian=gguf.GGUFEndian.LITTLE) -> None:
self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)

def add_meta_arch(self, params: Params) -> None:
name = "LLaMA"
Expand Down Expand Up @@ -875,10 +875,10 @@ def close(self) -> None:
self.gguf.close()

@staticmethod
def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab) -> None:
def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, endianess:gguf.GGUFEndian=gguf.GGUFEndian.LITTLE) -> None:
check_vocab_size(params, vocab)

of = OutputFile(fname_out)
of = OutputFile(fname_out, endianess=endianess)

# meta data
of.add_meta_arch(params)
Expand All @@ -903,10 +903,10 @@ def maybe_do_quantize(item: tuple[DataType, NDArray]) -> NDArray:
return dt.quantize(arr)

@staticmethod
def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, concurrency: int = DEFAULT_CONCURRENCY) -> None:
def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, concurrency: int = DEFAULT_CONCURRENCY, endianess=gguf.GGUFEndian.LITTLE) -> None:
check_vocab_size(params, vocab)

of = OutputFile(fname_out)
of = OutputFile(fname_out, endianess=endianess)

# meta data
of.add_meta_arch(params)
Expand All @@ -932,6 +932,8 @@ def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyM
elapsed = time.time() - start
size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape)
padi = len(str(len(model)))
if endianess==gguf.GGUFEndian.BIG:
ndarray.byteswap(inplace=True)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should be handled in GGUFWriter.write_tensor_data, just like you do in add_tensor. The conversion script should have no responsibility for handling endianness other than setting it in the constructor.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@monatis updated as your comments

print(f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}")
of.gguf.write_tensor_data(ndarray)

Expand Down Expand Up @@ -1123,8 +1125,9 @@ def main(args_in: list[str] | None = None) -> None:
parser.add_argument("--vocabtype", choices=["spm", "bpe"], help="vocab format (default: spm)", default="spm")
parser.add_argument("--ctx", type=int, help="model training context (default: based on input)")
parser.add_argument("--concurrency", type=int, help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default = DEFAULT_CONCURRENCY)
args = parser.parse_args(args_in)
parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine")

args = parser.parse_args(args_in)
if args.dump_single:
model_plus = lazy_load_file(args.model)
do_dump_model(model_plus)
Expand All @@ -1138,6 +1141,9 @@ def main(args_in: list[str] | None = None) -> None:
if args.dump:
do_dump_model(model_plus)
return
endianess = gguf.GGUFEndian.LITTLE
if args.bigendian:
endianess = gguf.GGUFEndian.BIG

params = Params.load(model_plus)
if params.n_ctx == -1:
Expand Down Expand Up @@ -1185,7 +1191,7 @@ def main(args_in: list[str] | None = None) -> None:
params.ftype = ftype
print(f"Writing {outfile}, format {ftype}")

OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, concurrency = args.concurrency)
OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, concurrency = args.concurrency, endianess=endianess)
print(f"Wrote {outfile}")


Expand Down
15 changes: 13 additions & 2 deletions ggml.h
Original file line number Diff line number Diff line change
Expand Up @@ -231,8 +231,19 @@
#define GGML_EXIT_SUCCESS 0
#define GGML_EXIT_ABORTED 1

#define GGUF_MAGIC 0x46554747 // "GGUF"
#define GGUF_VERSION 2
#if defined(__linux__)
#include <endian.h>
#if BYTE_ORDER == LITTLE_ENDIAN
#define GGUF_MAGIC 0x46554747
#elif BYTE_ORDER == BIG_ENDIAN
#define GGUF_MAGIC 0x47475546
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should either have a comment here to explain it's the same byte sequence in the file or (maybe even better) read raw bytes as Georgi suggested.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Now I changed it to char string.

#endif
#else
// Use little endian magic uint_32 value
#define GGUF_MAGIC 0x46554747
#endif

#define GGUF_VERSION 3

#define GGUF_DEFAULT_ALIGNMENT 32

Expand Down
71 changes: 44 additions & 27 deletions gguf-py/gguf/gguf.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,10 @@
#

GGUF_MAGIC = 0x46554747
GGUF_VERSION = 2
GGUF_VERSION = 3
GGUF_DEFAULT_ALIGNMENT = 32


# general
KEY_GENERAL_ARCHITECTURE = "general.architecture"
KEY_GENERAL_QUANTIZATION_VERSION = "general.quantization_version"
Expand Down Expand Up @@ -569,6 +570,10 @@ class GGMLQuantizationType(IntEnum):
Q6_K = 14
Q8_K = 15

class GGUFEndian(IntEnum):
    """Byte order of a GGUF file's multi-byte fields (header, KV data, tensor data)."""
    LITTLE = 0
    BIG = 1


class GGUFValueType(IntEnum):
UINT8 = 0
Expand Down Expand Up @@ -616,18 +621,41 @@ class GGUFWriter:
temp_file: tempfile.SpooledTemporaryFile[bytes] | None = None
tensors: list[tuple[np.ndarray[Any, Any], int]]

def __init__(self, path: os.PathLike[str] | str, arch: str, use_temp_file = True):
@property
def pack_prefix(self):
    """Return the struct byte-order prefix matching this writer's endianness.

    "<" packs little-endian, ">" packs big-endian (see the struct module's
    format-prefix table).
    """
    return "<" if self.endianess == GGUFEndian.LITTLE else ">"

def __init__(self, path: os.PathLike[str] | str, arch: str, use_temp_file = True, endianess=GGUFEndian.LITTLE):
    """Open *path* for binary writing and prepare a GGUF writer.

    path: destination file for the GGUF output.
    arch: model architecture name recorded in the metadata.
    use_temp_file: buffer tensor data in a spooled temp file before writing.
    endianess: target byte order of the produced file (default little-endian).
    """
    self.fout = open(path, "wb")
    self.arch = arch
    # endianess must be set before touching self.pack_prefix, which reads it.
    self.endianess = endianess
    prefix = self.pack_prefix
    # struct format string per scalar KV type; BOOL is a single byte, so it
    # carries no byte-order prefix.
    self._simple_value_packing = {
        GGUFValueType.UINT8:   prefix + "B",
        GGUFValueType.INT8:    prefix + "b",
        GGUFValueType.UINT16:  prefix + "H",
        GGUFValueType.INT16:   prefix + "h",
        GGUFValueType.UINT32:  prefix + "I",
        GGUFValueType.INT32:   prefix + "i",
        GGUFValueType.FLOAT32: prefix + "f",
        GGUFValueType.UINT64:  prefix + "Q",
        GGUFValueType.INT64:   prefix + "q",
        GGUFValueType.FLOAT64: prefix + "d",
        GGUFValueType.BOOL:    "?",
    }
    self.add_architecture()
    self.use_temp_file = use_temp_file
    self.tensors = []
    order_name = "Big Endian" if self.endianess == GGUFEndian.BIG else "Little Endian"
    print(f"This gguf file is for {order_name} only")

def write_header_to_file(self):
    """Write the GGUF file header: magic, version, tensor count, KV count.

    Fix: the version, tensor-info count, and KV count were each written twice —
    once with the stale hard-coded little-endian format and once with the
    endianness-aware pack_prefix format. Only the pack_prefix writes are kept.
    The magic number deliberately stays "<I": 0x46554747 packed little-endian
    yields the byte sequence b"GGUF" regardless of the target endianness.
    """
    self.fout.write(struct.pack("<I", GGUF_MAGIC))
    self.fout.write(struct.pack(f"{self.pack_prefix}I", GGUF_VERSION))
    self.fout.write(struct.pack(f"{self.pack_prefix}Q", self.ti_data_count))
    self.fout.write(struct.pack(f"{self.pack_prefix}Q", self.kv_data_count))
    self.flush()
    # print("tensors " + str(self.ti_data_count) + " kv " + str(self.kv_data_count))

Expand Down Expand Up @@ -699,40 +727,27 @@ def add_array(self, key: str, val: Sequence[Any]):
self.add_key(key)
self.add_val(val, GGUFValueType.ARRAY)

_simple_value_packing = {
GGUFValueType.UINT8: "<B",
GGUFValueType.INT8: "<b",
GGUFValueType.UINT16: "<H",
GGUFValueType.INT16: "<h",
GGUFValueType.UINT32: "<I",
GGUFValueType.INT32: "<i",
GGUFValueType.FLOAT32: "<f",
GGUFValueType.UINT64: "<Q",
GGUFValueType.INT64: "<q",
GGUFValueType.FLOAT64: "<d",
GGUFValueType.BOOL: "?" ,
}
def add_val(self, val: Any, vtype: GGUFValueType | None = None, add_vtype: bool = True):
if vtype is None:
vtype = GGUFValueType.get_type(val)

if add_vtype:
self.kv_data += struct.pack("<I", vtype)
self.kv_data += struct.pack(f"{self.pack_prefix}I", vtype)
self.kv_data_count += 1

pack_fmt = self._simple_value_packing.get(vtype)
if pack_fmt is not None:
self.kv_data += struct.pack(pack_fmt, val)
elif vtype == GGUFValueType.STRING:
encoded_val = val.encode("utf8") if isinstance(val, str) else val
self.kv_data += struct.pack("<Q", len(encoded_val))
self.kv_data += struct.pack(f"{self.pack_prefix}Q", len(encoded_val))
self.kv_data += encoded_val
elif vtype == GGUFValueType.ARRAY and isinstance(val, Sequence) and len(val) > 0:
ltype = GGUFValueType.get_type(val[0])
if not all(GGUFValueType.get_type(i) is ltype for i in val[1:]):
raise ValueError("All items in a GGUF array should be of the same type")
self.kv_data += struct.pack("<I", ltype)
self.kv_data += struct.pack("<Q", len(val))
self.kv_data += struct.pack(f"{self.pack_prefix}I", ltype)
self.kv_data += struct.pack(f"{self.pack_prefix}Q", len(val))
for item in val:
self.add_val(item, add_vtype=False)
else:
Expand All @@ -746,22 +761,24 @@ def add_tensor_info(self, name: str, tensor_shape: Sequence[int], tensor_dtype:
assert raw_dtype is not None or tensor_dtype in (np.float32, np.float16), "Only F32 and F16 tensors are supported for now"

encoded_name = name.encode("utf8")
self.ti_data += struct.pack("<Q", len(encoded_name))
self.ti_data += struct.pack(f"{self.pack_prefix}Q", len(encoded_name))
self.ti_data += encoded_name
n_dims = len(tensor_shape)
self.ti_data += struct.pack("<I", n_dims)
self.ti_data += struct.pack(f"{self.pack_prefix}I", n_dims)
for i in range(n_dims):
self.ti_data += struct.pack("<Q", tensor_shape[n_dims - 1 - i])
self.ti_data += struct.pack(f"{self.pack_prefix}Q", tensor_shape[n_dims - 1 - i])
if raw_dtype is None:
dtype = GGMLQuantizationType.F32 if tensor_dtype == np.float32 else GGMLQuantizationType.F16
else:
dtype = raw_dtype
self.ti_data += struct.pack("<I", dtype)
self.ti_data += struct.pack("<Q", self.offset_tensor)
self.ti_data += struct.pack(f"{self.pack_prefix}I", dtype)
self.ti_data += struct.pack(f"{self.pack_prefix}Q", self.offset_tensor)
self.offset_tensor += GGUFWriter.ggml_pad(tensor_nbytes, self.data_alignment)
self.ti_data_count += 1

def add_tensor(self, name: str, tensor: np.ndarray[Any, Any], raw_shape: Sequence[int] | None = None, raw_dtype: GGMLQuantizationType | None = None):
if self.endianess == GGUFEndian.BIG:
tensor.byteswap(inplace=True)
if self.use_temp_file and self.temp_file is None:
fp = tempfile.SpooledTemporaryFile(mode="w+b", max_size=256*1024*1024)
fp.seek(0)
Expand Down
2 changes: 1 addition & 1 deletion gguf-py/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "gguf"
version = "0.4.4"
version = "0.4.5"
description = "Write ML models in GGUF for GGML"
authors = ["GGML <ggml@ggml.ai>"]
packages = [
Expand Down
2 changes: 1 addition & 1 deletion k_quants.c
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ inline static int32_t vaddvq_s32(int32x4_t v) {
#if defined(_MSC_VER) || defined(__MINGW32__)
#include <intrin.h>
#else
#if !defined(__riscv)
#if !defined(__riscv) && !defined(__s390__)
#include <immintrin.h>
#endif
#endif
Expand Down
2 changes: 2 additions & 0 deletions tests/test-double-float.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@

#undef NDEBUG
#include <cassert>
#if !defined(__riscv) && !defined(__s390__)
#include <immintrin.h>
#endif
#include <cmath>
#include <cstdint>
#include <cstring>
Expand Down