"""
Example: Quantize a model with auto_run and run inference with vLLM
Performs the following steps:
1. Quantize with AutoBit (mixed-precision) + QEP using auto_run
2. Load the quantized model with vLLM's offline LLM interface
3. Generate text
Requirements:
pip install vllm
Copyright 2025-2026 Fujitsu Ltd.
Author: Keiji Kimura
"""

import gc

import torch
from onecomp import Runner
from vllm import LLM, SamplingParams


def main():
    # Step 1: Quantize and save the model
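    # Runner.auto_run (as used in this example) handles calibration, the
    # AutoBit mixed-precision search, and saving in a single call;
    # evaluate=False is assumed to skip the built-in accuracy evaluation
    # so the demo runs quickly.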
    save_dir = "./TinyLlama-1.1B-autobit"
    runner = Runner.auto_run(
        model_id="TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T",
        save_dir=save_dir,
        evaluate=False,
    )

    # Free GPU memory used by quantization before loading vLLM; vLLM
    # pre-allocates most of the remaining GPU memory for its KV cache,
    # so leftover allocations can cause out-of-memory errors.
    del runner
    gc.collect()
    torch.cuda.empty_cache()

    # Step 2: Load the quantized model with vLLM and generate text
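    # enforce_eager=True disables vLLM's CUDA graph capture (lower memory
    # use and faster startup at some throughput cost), and max_model_len=512
    # keeps the KV-cache allocation small; both are reasonable choices for a
    # quick functional check rather than tuned serving settings.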
    llm = LLM(
        model=save_dir,
        max_model_len=512,
        dtype="float16",
        enforce_eager=True,
    )

    prompts = [
        "Explain what post-training quantization is in one sentence:",
        "The capital of France is",
    ]
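    # temperature=0.0 makes vLLM decode greedily, so outputs are
    # deterministic and easy to compare before and after quantization.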
    outputs = llm.generate(prompts, SamplingParams(max_tokens=64, temperature=0.0))

    for output in outputs:
        print(f"Prompt: {output.prompt}")
        print(f"Response: {output.outputs[0].text}")
        print()


if __name__ == "__main__":
    main()