From 74fc633b073b8c0ee54ce5c98851dc2ac606ea9a Mon Sep 17 00:00:00 2001 From: Zhao Chen Date: Mon, 27 Apr 2026 17:25:49 +0800 Subject: [PATCH] fix: classify Hugging Face model artifacts Signed-off-by: Zhao Chen --- pkg/modelfile/constants.go | 32 ++++++++++++++++++--------- pkg/modelfile/constants_test.go | 39 +++++++++++++++++++++++++++++++++ pkg/modelfile/modelfile_test.go | 38 +++++++++++++++++++++++++++++++- 3 files changed, 98 insertions(+), 11 deletions(-) diff --git a/pkg/modelfile/constants.go b/pkg/modelfile/constants.go index bc09feff..9b686f22 100644 --- a/pkg/modelfile/constants.go +++ b/pkg/modelfile/constants.go @@ -48,13 +48,22 @@ var ( "*.modelcard", // Model card metadata "*.meta", // Model metadata "*tokenizer.model*", // Tokenizer files (e.g., Mistral v3) - "config.json.*", // Model configuration variants - "*.hparams", // Hyperparameter files - "*.params", // Parameter files - "*.hyperparams", // Hyperparameter configuration - "*.wandb", // Weights & Biases configuration - "*.mlflow", // MLflow configuration - "*.tensorboard", // TensorBoard configuration + "*.tiktoken", // TikToken vocabulary files + "vocab.txt", // Tokenizer vocabulary files + "merges.txt", // Tokenizer merge rules + "added_tokens.txt", // Additional tokenizer tokens + "spiece.model", // SentencePiece tokenizer files + "sentencepiece*.model", + "sentencepiece*.vocab", + "tiktoken.model", + "chat_template.jinja", + "config.json.*", // Model configuration variants + "*.hparams", // Hyperparameter files + "*.params", // Parameter files + "*.hyperparams", // Hyperparameter configuration + "*.wandb", // Weights & Biases configuration + "*.mlflow", // MLflow configuration + "*.tensorboard", // TensorBoard configuration } // Model file patterns - supported model file extensions. @@ -97,9 +106,10 @@ var ( "*.f32", // GGML F32 format // checkpoint formats. - "*.ckpt", // Checkpoint format - "*.checkpoint", // Checkpoint format (alternative extension) - "*.dist_ckpt", // Distributed checkpoint format + "*.ckpt", // Checkpoint format + "*.checkpoint", // Checkpoint format (alternative extension) + "*.dist_ckpt", // Distributed checkpoint format + "tensor[0-9]*_[0-9]*", // Sharded checkpoint tensor files // Semantics-specific formats "*.tensor", // Generic tensor format @@ -113,6 +123,7 @@ var ( "*.engine", // TensorRT format "*.trt", // TensorRT format (alternative extension) "*.onnx", // Open Neural Network Exchange format + "*.onnx_data*", // ONNX external data files "*.msgpack", // MessagePack serialization "*.model", // Some NLP frameworks "*.pkl", // Pickle format @@ -124,6 +135,7 @@ var ( "*.nc", // NetCDF format "*.mlmodel", // Apple Core ML format "*.coreml", // Apple Core ML format (alternative) + "*.mil", // Core ML intermediate language files "*.mleap", // MLeap format (Spark ML) "*.surml", // SurrealML format "*.llamafile", // Llamafile format diff --git a/pkg/modelfile/constants_test.go b/pkg/modelfile/constants_test.go index 77fa50e6..8a5c7004 100644 --- a/pkg/modelfile/constants_test.go +++ b/pkg/modelfile/constants_test.go @@ -49,6 +49,9 @@ func TestIsFileTypeModelPatterns(t *testing.T) { {"model.bin.part2", true}, {"model.gguf.part1", true}, {"model.gguf.00001-of-00003", true}, + {"model.onnx_data", true}, + {"model.onnx_data_1", true}, + {"ckpt-0/tensor00001_000", true}, {"model.llamafile.zip", true}, {"model.llamafile.gz", true}, @@ -56,9 +59,11 @@ func TestIsFileTypeModelPatterns(t *testing.T) { {"model.safetensors", true}, {"model.bin", true}, {"model.gguf", true}, + {"model.mil", true}, {"model.llamafile", true}, // Non-matching files. + {"merges.txt", false}, {"readme.txt", false}, {"script.py", false}, {"events.out.tfevents.1679012345.hostname", false}, // tfevents moved to DocFilePatterns @@ -70,6 +75,30 @@ func TestIsFileTypeModelPatterns(t *testing.T) { } } +func TestIsFileTypeConfigPatterns(t *testing.T) { + testCases := []struct { + filename string + expected bool + }{ + {"vocab.txt", true}, + {"merges.txt", true}, + {"added_tokens.txt", true}, + {"chat_template.jinja", true}, + {"tokenizer.tiktoken", true}, + {"spiece.model", true}, + {"sentencepiece.bpe.model", true}, + {"sentencepiece.bpe.vocab", true}, + {"tiktoken.model", true}, + {"weights.model", false}, + {"readme.txt", false}, + } + + assert := assert.New(t) + for _, tc := range testCases { + assert.Equal(tc.expected, IsFileType(tc.filename, ConfigFilePatterns), "filename: %s", tc.filename) + } +} + func TestIsFileTypeDocPatternsTfevents(t *testing.T) { testCases := []struct { filename string @@ -98,10 +127,20 @@ func TestInferFileType(t *testing.T) { {"config yaml", "settings.yaml", 1024, FileTypeConfig}, {"model safetensors", "model.safetensors", 1024, FileTypeModel}, {"model bin", "weights.bin", 1024, FileTypeModel}, + {"model onnx external data", "model.onnx_data_1", 1024, FileTypeModel}, + {"model coreml mil", "model.mil", 1024, FileTypeModel}, + {"checkpoint tensor shard", "ckpt-0/tensor00001_000", 1024, FileTypeModel}, {"code python", "script.py", 1024, FileTypeCode}, {"code go", "main.go", 1024, FileTypeCode}, {"doc markdown", "README.md", 1024, FileTypeDoc}, {"doc pdf", "guide.pdf", 1024, FileTypeDoc}, + {"tokenizer vocab txt", "vocab.txt", 1024, FileTypeConfig}, + {"tokenizer merges txt", "merges.txt", 1024, FileTypeConfig}, + {"tokenizer added tokens txt", "added_tokens.txt", 1024, FileTypeConfig}, + {"sentencepiece spiece model", "spiece.model", 1024, FileTypeConfig}, + {"sentencepiece bpe model", "sentencepiece.bpe.model", 1024, FileTypeConfig}, + {"tiktoken model", "tiktoken.model", 1024, FileTypeConfig}, + {"chat template jinja", "chat_template.jinja", 1024, FileTypeConfig}, // Dotfile with known secondary extension {".cache.json is config", ".cache.json", 1024, FileTypeConfig}, diff --git a/pkg/modelfile/modelfile_test.go b/pkg/modelfile/modelfile_test.go index b9c3b084..0f3812ad 100644 --- a/pkg/modelfile/modelfile_test.go +++ b/pkg/modelfile/modelfile_test.go @@ -606,6 +606,7 @@ func TestNewModelfileByWorkspace(t *testing.T) { "tokenizer.json", "special_tokens_map.json", "vocab.json", + "merges.txt", }, expectModels: []string{ "pytorch_model.bin", @@ -617,7 +618,7 @@ func TestNewModelfileByWorkspace(t *testing.T) { "scripts/convert_weights.py", "scripts/preprocessing/prep.py", }, - expectDocs: []string{"merges.txt", "README.md"}, + expectDocs: []string{"README.md"}, expectName: "llama-7b", expectArch: "transformer", expectFamily: "llama", @@ -1891,6 +1892,41 @@ func TestFileTypeClassification(t *testing.T) { expectedCodes: []string{"script.py", "inference.py"}, expectedDocs: []string{"README.md", "LICENSE"}, }, + { + name: "huggingface tokenizer and runtime artifacts", + files: map[string]int64{ + "config.json": 1024, + "vocab.txt": 1024, + "merges.txt": 1024, + "added_tokens.txt": 1024, + "tokenizer/spiece.model": 1024, + "tokenizer/sentencepiece.bpe.model": 1024, + "tokenizer/tiktoken.model": 1024, + "chat_template.jinja": 1024, + "onnx/model.onnx_data_1": 1024, + "coreml/model.mil": 1024, + "ckpt-0/tensor00001_000": 1024, + "scripts/inference.py": 1024, + "README.md": 1024, + }, + expectedConfigs: []string{ + "config.json", + "vocab.txt", + "merges.txt", + "added_tokens.txt", + "tokenizer/spiece.model", + "tokenizer/sentencepiece.bpe.model", + "tokenizer/tiktoken.model", + "chat_template.jinja", + }, + expectedModels: []string{ + "onnx/model.onnx_data_1", + "coreml/model.mil", + "ckpt-0/tensor00001_000", + }, + expectedCodes: []string{"scripts/inference.py"}, + expectedDocs: []string{"README.md"}, + }, { name: "small unknown files treated as code files", files: map[string]int64{