# infer.py (forked from InfiniTensor/go-llama-go)

import argparse
import json
import time
import torch
from transformers import AutoTokenizer
import llama
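
# Example invocation (paths and prompt text below are illustrative, not taken
# from the repository):
#   python infer.py --model /path/to/llama-checkpoint \
#       --prompts "Hello, world" --max-new-tokens 64 --device cuda \
#       --num-warmup-iterations 2 --num-profiling-iterations 5
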
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Text generation with a Llama model.")
    parser.add_argument("--model", type=str, required=True, help="Path to the model.")
    parser.add_argument(
        "--prompts",
        type=str,
        nargs="+",
        required=True,
        help="List of prompts for text generation.",
    )
    parser.add_argument(
        "--max-new-tokens",
        type=int,
        default=64,
        help="Maximum number of new tokens to generate.",
    )
    parser.add_argument(
        "--device",
        type=str,
        default="cpu",
        help='Device to use for inference (e.g., "cuda", "cpu").',
    )
    parser.add_argument(
        "--num-warmup-iterations",
        type=int,
        default=0,
        help="For profiling. The number of warmup iterations to run before measuring performance.",
    )
    parser.add_argument(
        "--num-profiling-iterations",
        type=int,
        default=1,
        help="For profiling. The number of iterations to run for performance measurement.",
    )
    args = parser.parse_args()

    model_path = args.model
    prompts = args.prompts
    max_new_tokens = args.max_new_tokens
    device = args.device
    num_warmup_iterations = args.num_warmup_iterations
    num_profiling_iterations = args.num_profiling_iterations

    tokenizer = AutoTokenizer.from_pretrained(model_path)
    # Set the padding token; Llama models typically use eos_token as pad_token.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    inputs = tokenizer(prompts, padding=True, return_tensors="pt").to(device)
    model = llama.ModelForCausalLM.from_pretrained(model_path).to(device)

    texts = []
    for _ in range(num_warmup_iterations):
        outputs = model.generate(inputs.input_ids, max_new_tokens=max_new_tokens)
        texts.append(tokenizer.batch_decode(outputs, skip_special_tokens=True))
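    # CUDA kernels launch asynchronously; synchronizing here keeps any pending
    # warmup work out of the timed region below.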
    if device == "cuda":
        torch.cuda.synchronize()

    elapsed_time = 0
    for _ in range(num_profiling_iterations):
        start_time = time.time()
        outputs = model.generate(inputs.input_ids, max_new_tokens=max_new_tokens)
        if device == "cuda":
            torch.cuda.synchronize()
        end_time = time.time()
        elapsed_time += end_time - start_time
        texts.append(tokenizer.batch_decode(outputs, skip_special_tokens=True))
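
    # Report the mean latency over the timed iterations and the throughput for
    # newly generated tokens only (prompt tokens are excluded).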
    average_time = elapsed_time / num_profiling_iterations
    num_input_tokens = inputs["input_ids"].size(-1)
    num_output_tokens = outputs.size(-1) - num_input_tokens
    num_tokens_per_second = num_output_tokens / average_time

    print(
        json.dumps(
            {
                "texts": texts,
                "average_time": average_time,
                "num_input_tokens": num_input_tokens,
                "num_output_tokens": num_output_tokens,
                "num_tokens_per_second": num_tokens_per_second,
            }
        )
    )
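
# The script prints a single JSON object to stdout, for example (values are
# illustrative):
# {"texts": [["Hello, world ..."]], "average_time": 1.23, "num_input_tokens": 5,
#  "num_output_tokens": 64, "num_tokens_per_second": 52.0}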