# infer.py (forked from InfiniTensor/go-llama-go)

import argparse
import json
import time
import torch
from transformers import AutoTokenizer
import llama
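
# Example invocation (paths and prompt text below are illustrative, not taken
# from the repository):
#   python infer.py --model /path/to/llama-checkpoint \
#       --prompts "Hello, world" --max-new-tokens 64 --device cuda \
#       --num-warmup-iterations 2 --num-profiling-iterations 5
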
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Text generation with a Llama model.")
    parser.add_argument("--model", type=str, required=True, help="Path to the model.")
    parser.add_argument(
        "--prompts",
        type=str,
        nargs="+",
        required=True,
        help="List of prompts for text generation.",
    )
    parser.add_argument(
        "--max-new-tokens",
        type=int,
        default=64,
        help="Maximum number of new tokens to generate.",
    )
    parser.add_argument(
        "--device",
        type=str,
        default="cpu",
        help='Device to use for inference (e.g., "cuda", "cpu").',
    )
    parser.add_argument(
        "--num-warmup-iterations",
        type=int,
        default=0,
        help="For profiling. The number of warmup iterations to run before measuring performance.",
    )
    parser.add_argument(
        "--num-profiling-iterations",
        type=int,
        default=1,
        help="For profiling. The number of iterations to run for performance measurement.",
    )
    args = parser.parse_args()

    model_path = args.model
    prompts = args.prompts
    max_new_tokens = args.max_new_tokens
    device = args.device
    num_warmup_iterations = args.num_warmup_iterations
    num_profiling_iterations = args.num_profiling_iterations

    tokenizer = AutoTokenizer.from_pretrained(model_path)
    # Set the padding token; Llama models typically use eos_token as pad_token.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    inputs = tokenizer(prompts, padding=True, return_tensors="pt").to(device)
    model = llama.ModelForCausalLM.from_pretrained(model_path).to(device)

    texts = []
    for _ in range(num_warmup_iterations):
        outputs = model.generate(inputs.input_ids, max_new_tokens=max_new_tokens)
        texts.append(tokenizer.batch_decode(outputs, skip_special_tokens=True))
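    # CUDA kernels launch asynchronously; synchronizing here keeps any pending
    # warmup work out of the timed region below.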
    if device == "cuda":
        torch.cuda.synchronize()

    elapsed_time = 0
    for _ in range(num_profiling_iterations):
        start_time = time.time()
        outputs = model.generate(inputs.input_ids, max_new_tokens=max_new_tokens)
        if device == "cuda":
            torch.cuda.synchronize()
        end_time = time.time()
        elapsed_time += end_time - start_time
        texts.append(tokenizer.batch_decode(outputs, skip_special_tokens=True))
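
    # Report the mean latency over the timed iterations and the throughput for
    # newly generated tokens only (prompt tokens are excluded).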
    average_time = elapsed_time / num_profiling_iterations
    num_input_tokens = inputs["input_ids"].size(-1)
    num_output_tokens = outputs.size(-1) - num_input_tokens
    num_tokens_per_second = num_output_tokens / average_time

    print(
        json.dumps(
            {
                "texts": texts,
                "average_time": average_time,
                "num_input_tokens": num_input_tokens,
                "num_output_tokens": num_output_tokens,
                "num_tokens_per_second": num_tokens_per_second,
            }
        )
    )
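
# The script prints a single JSON object to stdout, for example (values are
# illustrative):
# {"texts": [["Hello, world ..."]], "average_time": 1.23, "num_input_tokens": 5,
#  "num_output_tokens": 64, "num_tokens_per_second": 52.0}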