Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 22 additions & 10 deletions pkg/modelfile/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,13 +48,22 @@ var (
"*.modelcard", // Model card metadata
"*.meta", // Model metadata
"*tokenizer.model*", // Tokenizer files (e.g., Mistral v3)
"config.json.*", // Model configuration variants
"*.hparams", // Hyperparameter files
"*.params", // Parameter files
"*.hyperparams", // Hyperparameter configuration
"*.wandb", // Weights & Biases configuration
"*.mlflow", // MLflow configuration
"*.tensorboard", // TensorBoard configuration
"*.tiktoken", // TikToken vocabulary files
"vocab.txt", // Tokenizer vocabulary files
"merges.txt", // Tokenizer merge rules
"added_tokens.txt", // Additional tokenizer tokens
"spiece.model", // SentencePiece tokenizer files
"sentencepiece*.model",
"sentencepiece*.vocab",
"tiktoken.model",
"chat_template.jinja",
Comment thread
chlins marked this conversation as resolved.
"config.json.*", // Model configuration variants
"*.hparams", // Hyperparameter files
"*.params", // Parameter files
"*.hyperparams", // Hyperparameter configuration
"*.wandb", // Weights & Biases configuration
"*.mlflow", // MLflow configuration
"*.tensorboard", // TensorBoard configuration
}

// Model file patterns - supported model file extensions.
Expand Down Expand Up @@ -97,9 +106,10 @@ var (
"*.f32", // GGML F32 format

// checkpoint formats.
"*.ckpt", // Checkpoint format
"*.checkpoint", // Checkpoint format (alternative extension)
"*.dist_ckpt", // Distributed checkpoint format
"*.ckpt", // Checkpoint format
"*.checkpoint", // Checkpoint format (alternative extension)
"*.dist_ckpt", // Distributed checkpoint format
"tensor[0-9]*_[0-9]*", // Sharded checkpoint tensor files

// Semantics-specific formats
"*.tensor", // Generic tensor format
Expand All @@ -113,6 +123,7 @@ var (
"*.engine", // TensorRT format
"*.trt", // TensorRT format (alternative extension)
"*.onnx", // Open Neural Network Exchange format
"*.onnx_data*", // ONNX external data files
"*.msgpack", // MessagePack serialization
"*.model", // Some NLP frameworks
"*.pkl", // Pickle format
Expand All @@ -124,6 +135,7 @@ var (
"*.nc", // NetCDF format
"*.mlmodel", // Apple Core ML format
"*.coreml", // Apple Core ML format (alternative)
"*.mil", // Core ML intermediate language files
"*.mleap", // MLeap format (Spark ML)
"*.surml", // SurrealML format
"*.llamafile", // Llamafile format
Expand Down
39 changes: 39 additions & 0 deletions pkg/modelfile/constants_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,16 +49,21 @@ func TestIsFileTypeModelPatterns(t *testing.T) {
{"model.bin.part2", true},
{"model.gguf.part1", true},
{"model.gguf.00001-of-00003", true},
{"model.onnx_data", true},
{"model.onnx_data_1", true},
{"ckpt-0/tensor00001_000", true},
{"model.llamafile.zip", true},
{"model.llamafile.gz", true},

// Existing patterns still work.
{"model.safetensors", true},
{"model.bin", true},
{"model.gguf", true},
{"model.mil", true},
{"model.llamafile", true},

// Non-matching files.
{"merges.txt", false},
{"readme.txt", false},
{"script.py", false},
{"events.out.tfevents.1679012345.hostname", false}, // tfevents moved to DocFilePatterns
Expand All @@ -70,6 +75,30 @@ func TestIsFileTypeModelPatterns(t *testing.T) {
}
}

// TestIsFileTypeConfigPatterns runs IsFileType with ConfigFilePatterns over a
// table of file names and asserts the expected match result for each one.
func TestIsFileTypeConfigPatterns(t *testing.T) {
	type configCase struct {
		filename string
		expected bool
	}

	cases := []configCase{
		// Names the test expects to match the config patterns.
		{filename: "vocab.txt", expected: true},
		{filename: "merges.txt", expected: true},
		{filename: "added_tokens.txt", expected: true},
		{filename: "chat_template.jinja", expected: true},
		{filename: "tokenizer.tiktoken", expected: true},
		{filename: "spiece.model", expected: true},
		{filename: "sentencepiece.bpe.model", expected: true},
		{filename: "sentencepiece.bpe.vocab", expected: true},
		{filename: "tiktoken.model", expected: true},

		// Names the test expects NOT to match the config patterns.
		{filename: "weights.model", expected: false},
		{filename: "readme.txt", expected: false},
	}

	check := assert.New(t)
	for i := range cases {
		c := cases[i]
		matched := IsFileType(c.filename, ConfigFilePatterns)
		check.Equal(c.expected, matched, "filename: %s", c.filename)
	}
}

func TestIsFileTypeDocPatternsTfevents(t *testing.T) {
testCases := []struct {
filename string
Expand Down Expand Up @@ -98,10 +127,20 @@ func TestInferFileType(t *testing.T) {
{"config yaml", "settings.yaml", 1024, FileTypeConfig},
{"model safetensors", "model.safetensors", 1024, FileTypeModel},
{"model bin", "weights.bin", 1024, FileTypeModel},
{"model onnx external data", "model.onnx_data_1", 1024, FileTypeModel},
{"model coreml mil", "model.mil", 1024, FileTypeModel},
{"checkpoint tensor shard", "ckpt-0/tensor00001_000", 1024, FileTypeModel},
{"code python", "script.py", 1024, FileTypeCode},
{"code go", "main.go", 1024, FileTypeCode},
{"doc markdown", "README.md", 1024, FileTypeDoc},
{"doc pdf", "guide.pdf", 1024, FileTypeDoc},
{"tokenizer vocab txt", "vocab.txt", 1024, FileTypeConfig},
{"tokenizer merges txt", "merges.txt", 1024, FileTypeConfig},
{"tokenizer added tokens txt", "added_tokens.txt", 1024, FileTypeConfig},
{"sentencepiece spiece model", "spiece.model", 1024, FileTypeConfig},
{"sentencepiece bpe model", "sentencepiece.bpe.model", 1024, FileTypeConfig},
{"tiktoken model", "tiktoken.model", 1024, FileTypeConfig},
{"chat template jinja", "chat_template.jinja", 1024, FileTypeConfig},

// Dotfile with known secondary extension
{".cache.json is config", ".cache.json", 1024, FileTypeConfig},
Expand Down
38 changes: 37 additions & 1 deletion pkg/modelfile/modelfile_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -606,6 +606,7 @@ func TestNewModelfileByWorkspace(t *testing.T) {
"tokenizer.json",
"special_tokens_map.json",
"vocab.json",
"merges.txt",
},
expectModels: []string{
"pytorch_model.bin",
Expand All @@ -617,7 +618,7 @@ func TestNewModelfileByWorkspace(t *testing.T) {
"scripts/convert_weights.py",
"scripts/preprocessing/prep.py",
},
expectDocs: []string{"merges.txt", "README.md"},
expectDocs: []string{"README.md"},
expectName: "llama-7b",
expectArch: "transformer",
expectFamily: "llama",
Expand Down Expand Up @@ -1891,6 +1892,41 @@ func TestFileTypeClassification(t *testing.T) {
expectedCodes: []string{"script.py", "inference.py"},
expectedDocs: []string{"README.md", "LICENSE"},
},
{
name: "huggingface tokenizer and runtime artifacts",
files: map[string]int64{
"config.json": 1024,
"vocab.txt": 1024,
"merges.txt": 1024,
"added_tokens.txt": 1024,
"tokenizer/spiece.model": 1024,
"tokenizer/sentencepiece.bpe.model": 1024,
"tokenizer/tiktoken.model": 1024,
"chat_template.jinja": 1024,
"onnx/model.onnx_data_1": 1024,
"coreml/model.mil": 1024,
"ckpt-0/tensor00001_000": 1024,
"scripts/inference.py": 1024,
"README.md": 1024,
},
expectedConfigs: []string{
"config.json",
"vocab.txt",
"merges.txt",
"added_tokens.txt",
"tokenizer/spiece.model",
"tokenizer/sentencepiece.bpe.model",
"tokenizer/tiktoken.model",
"chat_template.jinja",
},
expectedModels: []string{
"onnx/model.onnx_data_1",
"coreml/model.mil",
"ckpt-0/tensor00001_000",
},
expectedCodes: []string{"scripts/inference.py"},
expectedDocs: []string{"README.md"},
},
{
name: "small unknown files treated as code files",
files: map[string]int64{
Expand Down
Loading