-
Notifications
You must be signed in to change notification settings - Fork 58
Expand file tree
/
Copy pathsetup_from_source_Llama-3.1-8B.sh
More file actions
82 lines (56 loc) · 2.47 KB
/
setup_from_source_Llama-3.1-8B.sh
File metadata and controls
82 lines (56 loc) · 2.47 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
#!/bin/bash
# Build and install TensorRT-LLM v0.13 from source for Llama-3.1-8B on Jetson
# Orin (SM 8.7), then build the TRT engine and run a smoke-test inference.
#
# Expects in the launch directory:
#   Llama-3.1-8B_int4_awq/   quantized checkpoint (with config.json)
#   Llama-3.1-8B/            HF model dir (used as tokenizer)
#   json_modifier.py, build_from_source_changes.patch,
#   batch_manager/, executor/, nvrtcWrapper/, TensorRT-10.4.0.11/
#
# Fail fast: any failed clone/build/install aborts instead of cascading.
set -euo pipefail

# Remember the launch directory so later absolute paths don't depend on cwd.
root_dir=$PWD

# Setup virtual environment
python3.8 -m venv trtllm_0.13
# shellcheck disable=SC1091  # created just above, not visible to shellcheck
source trtllm_0.13/bin/activate

# Setup model/engine path
export path_engine="$root_dir/Llama-3.1-8B_int4_awq"
export path_model="$root_dir/Llama-3.1-8B"

# Disable rotary_scaling in Llama 3.1 for Orin deployment
python json_modifier.py "$path_engine/config.json"

# git clone TensorRT-LLM codebase
git clone -b v0.13.0 https://github.com/NVIDIA/TensorRT-LLM.git "$root_dir/TensorRT-LLM"
cd "$root_dir/TensorRT-LLM"
git submodule update --init --recursive
# Overlay the prebuilt aarch64 components and the Orin patch onto the checkout.
cp -r ../batch_manager/ ./cpp/tensorrt_llm/
cp -r ../executor/ ./cpp/tensorrt_llm/
cp -r ../nvrtcWrapper/aarch64-linux-gnu/libtensorrt_llm_nvrtc_wrapper.so \
  ./cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/aarch64-linux-gnu/
git apply ../build_from_source_changes.patch

# setup dependency
## NCCL
cd "$root_dir"
git clone https://github.com/NVIDIA/nccl.git
cd nccl
git checkout v2.22.3-1
make -j8 src.build

# Install pytorch (NVIDIA JetPack 5.1 wheel for Python 3.8 / aarch64)
cd "$root_dir"
wget https://developer.download.nvidia.cn/compute/redist/jp/v51/pytorch/torch-1.14.0a0+44dac51c.nv23.01-cp38-cp38-linux_aarch64.whl
python -m pip install --upgrade pip
# Expand the wheel glob now (exactly one match expected) instead of storing
# an unexpanded pattern in the variable.
TORCH_INSTALL=$(ls "$root_dir"/torch*cp38-linux_aarch64.whl)
export TORCH_INSTALL
python3 -m pip install --no-cache "$TORCH_INSTALL"

# Install TensorRT
pip install "$root_dir/TensorRT-10.4.0.11/python/tensorrt-10.4.0b11-cp38-none-linux_aarch64.whl"
# Copy libnvinfer.so file so the system linker resolves TRT 10.4
sudo cp "$root_dir/TensorRT-10.4.0.11/lib/libnvinfer.so.10.4.0" /usr/lib/aarch64-linux-gnu/libnvinfer.so

# Build TensorRT-LLM 0.13 from scratch on Orin
## Setup paths for build
export PATH_TRT="$root_dir/TensorRT-10.4.0.11"
export PATH_NCCL="$root_dir/nccl/build"
# Keep BOTH the TRT lib dir and the OpenMPI lib dir on LD_LIBRARY_PATH.
# (The original script overwrote LD_LIBRARY_PATH and dropped the OpenMPI
# path; the build needs both at runtime.)
export LD_LIBRARY_PATH="$PATH_TRT/lib:/usr/lib/aarch64-linux-gnu/openmpi/lib:${LD_LIBRARY_PATH:-}"

## Build from source (Orin is compute capability 8.7)
cd "$root_dir/TensorRT-LLM"
python3 ./scripts/build_wheel.py --trt_root "$PATH_TRT" --nccl_root "$PATH_NCCL" --clean --cuda_architectures "87"

# Install TensorRT-LLM
pip install ./build/tensorrt_llm-0.13.0-cp38-cp38-linux_aarch64.whl
# Pin transformers to the version compatible with TRT-LLM 0.13
pip install transformers==4.45.0

# Try import TensorRT-LLM
cd "$root_dir"
python3 -c "import tensorrt_llm; print(tensorrt_llm.__version__)"

# Create Engine for Llama v3.1
cd TensorRT-LLM/examples/llama
trtllm-build --checkpoint_dir "$path_engine" --output_dir "$path_engine/1-gpu/" --gemm_plugin auto --max_batch_size 1

# Inference for Llama v3.1
python ../run.py --max_output_len 128 --engine_dir "$path_engine/1-gpu/" --tokenizer_dir "$path_model"