-
Notifications
You must be signed in to change notification settings - Fork 58
Expand file tree
/
Copy pathsetup_from_source_Llama-3.1-8B.sh
More file actions
82 lines (56 loc) · 2.47 KB
/
setup_from_source_Llama-3.1-8B.sh
File metadata and controls
82 lines (56 loc) · 2.47 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
#!/bin/bash
# Build and install TensorRT-LLM v0.13 from source for Llama-3.1-8B on Jetson
# Orin (SM 8.7), then build the TRT engine and run a smoke-test inference.
#
# Expects in the launch directory:
#   Llama-3.1-8B_int4_awq/   quantized checkpoint (with config.json)
#   Llama-3.1-8B/            HF model dir (used as tokenizer)
#   json_modifier.py, build_from_source_changes.patch,
#   batch_manager/, executor/, nvrtcWrapper/, TensorRT-10.4.0.11/
#
# Fail fast: any failed clone/build/install aborts instead of cascading.
set -euo pipefail

# Remember the launch directory so later absolute paths don't depend on cwd.
root_dir=$PWD

# Setup virtual environment
python3.8 -m venv trtllm_0.13
# shellcheck disable=SC1091  # created just above, not visible to shellcheck
source trtllm_0.13/bin/activate

# Setup model/engine path
export path_engine="$root_dir/Llama-3.1-8B_int4_awq"
export path_model="$root_dir/Llama-3.1-8B"

# Disable rotary_scaling in Llama 3.1 for Orin deployment
python json_modifier.py "$path_engine/config.json"

# git clone TensorRT-LLM codebase
git clone -b v0.13.0 https://github.com/NVIDIA/TensorRT-LLM.git "$root_dir/TensorRT-LLM"
cd "$root_dir/TensorRT-LLM"
git submodule update --init --recursive
# Overlay the prebuilt aarch64 components and the Orin patch onto the checkout.
cp -r ../batch_manager/ ./cpp/tensorrt_llm/
cp -r ../executor/ ./cpp/tensorrt_llm/
cp -r ../nvrtcWrapper/aarch64-linux-gnu/libtensorrt_llm_nvrtc_wrapper.so \
  ./cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/aarch64-linux-gnu/
git apply ../build_from_source_changes.patch

# setup dependency
## NCCL
cd "$root_dir"
git clone https://github.com/NVIDIA/nccl.git
cd nccl
git checkout v2.22.3-1
make -j8 src.build

# Install pytorch (NVIDIA JetPack 5.1 wheel for Python 3.8 / aarch64)
cd "$root_dir"
wget https://developer.download.nvidia.cn/compute/redist/jp/v51/pytorch/torch-1.14.0a0+44dac51c.nv23.01-cp38-cp38-linux_aarch64.whl
python -m pip install --upgrade pip
# Expand the wheel glob now (exactly one match expected) instead of storing
# an unexpanded pattern in the variable.
TORCH_INSTALL=$(ls "$root_dir"/torch*cp38-linux_aarch64.whl)
export TORCH_INSTALL
python3 -m pip install --no-cache "$TORCH_INSTALL"

# Install TensorRT
pip install "$root_dir/TensorRT-10.4.0.11/python/tensorrt-10.4.0b11-cp38-none-linux_aarch64.whl"
# Copy libnvinfer.so file so the system linker resolves TRT 10.4
sudo cp "$root_dir/TensorRT-10.4.0.11/lib/libnvinfer.so.10.4.0" /usr/lib/aarch64-linux-gnu/libnvinfer.so

# Build TensorRT-LLM 0.13 from scratch on Orin
## Setup paths for build
export PATH_TRT="$root_dir/TensorRT-10.4.0.11"
export PATH_NCCL="$root_dir/nccl/build"
# Keep BOTH the TRT lib dir and the OpenMPI lib dir on LD_LIBRARY_PATH.
# (The original script overwrote LD_LIBRARY_PATH and dropped the OpenMPI
# path; the build needs both at runtime.)
export LD_LIBRARY_PATH="$PATH_TRT/lib:/usr/lib/aarch64-linux-gnu/openmpi/lib:${LD_LIBRARY_PATH:-}"

## Build from source (Orin is compute capability 8.7)
cd "$root_dir/TensorRT-LLM"
python3 ./scripts/build_wheel.py --trt_root "$PATH_TRT" --nccl_root "$PATH_NCCL" --clean --cuda_architectures "87"

# Install TensorRT-LLM
pip install ./build/tensorrt_llm-0.13.0-cp38-cp38-linux_aarch64.whl
# Pin transformers to the version compatible with TRT-LLM 0.13
pip install transformers==4.45.0

# Try import TensorRT-LLM
cd "$root_dir"
python3 -c "import tensorrt_llm; print(tensorrt_llm.__version__)"

# Create Engine for Llama v3.1
cd TensorRT-LLM/examples/llama
trtllm-build --checkpoint_dir "$path_engine" --output_dir "$path_engine/1-gpu/" --gemm_plugin auto --max_batch_size 1

# Inference for Llama v3.1
python ../run.py --max_output_len 128 --engine_dir "$path_engine/1-gpu/" --tokenizer_dir "$path_model"