-
Notifications
You must be signed in to change notification settings - Fork 90
126 lines (114 loc) · 4.69 KB
/
internode.yml
File metadata and controls
126 lines (114 loc) · 4.69 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
name: 'internode (Ascend NPU)'

# Reusable workflow: it only runs when another workflow invokes it through
# `workflow_call` and supplies the inputs declared below.
on:
  workflow_call:
    inputs:
      # Also used as part of the concurrency group name.
      soc_version:
        description: use a2 or a3
        type: string
        required: true
      # Label of the runner that hosts the job.
      runner:
        type: string
        required: false
        default: linux-amd64-cpu-0
      # Image value handed to the k8s manifest template.
      image:
        description: image for pods
        type: string
        required: false
        default: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/dockerhub/lmsysorg/sglang:main-cann8.3.rc1-a3'
      # Shown as the job's display name.
      test_config_name:
        description: test config name
        type: string
        required: true
      node_size:
        description: number of nodes
        type: number
        required: true
      test_case:
        description: path of test case file
        type: string
        required: true
# Runs for the same git ref and soc_version share one group; starting a new
# run in that group cancels the one already in progress.
concurrency:
  cancel-in-progress: true
  group: ascend-nightly-multi-node-${{ github.ref }}-${{ inputs.soc_version }}
jobs:
  # Single job: renders a k8s manifest from a jinja2 template, removes leftover
  # pods from earlier runs, launches run_ascend_ci.py, then deletes the
  # resources described by the manifest.
  multi-node:
    name: ${{ inputs.test_config_name }}
    runs-on: ${{ inputs.runner }}
    container:
      image: swr.ap-southeast-1.myhuaweicloud.com/base_image/ascend-ci/sglang:main-x86
    env:
      # Paths as seen from inside the job container.
      KUBECONFIG: /root/.cache/.cache/kube.yaml
      KUBECTL: /root/.cache/.cache/kubectl
      NAMESPACE: sgl-kernel-npu
      # Directory (relative to the checkout) holding the manifest template
      # and run_ascend_ci.py.
      ASCEND_TEST_CASE_PATH: tests/python/deepep
      # NOTE(review): KUBE_JOB_TYPE is not referenced by any step here;
      # presumably read by run_ascend_ci.py - confirm.
      KUBE_JOB_TYPE: multi
      KUBE_JOB_NAME: sglang-npu-multi
      KUBE_CONFIG_MAP: sglang-info
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Install system dependencies
        run: |
          pip3 install -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple jinja2-cli

      - name: Install kubernetes
        run: |
          # Install kubernetes
          pip3 install -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple kubernetes
          cp "$KUBECTL" /usr/local/sbin/

      - name: Create K8s Namespace
        run: |
          # Client-side dry run piped into apply: succeeds whether or not the
          # namespace already exists.
          kubectl create namespace "$NAMESPACE" --dry-run=client -o yaml | kubectl apply -f -

      - name: Prepare scripts
        env:
          # Workflow inputs reach the script as environment variables instead
          # of being expanded into the script text, so their content is never
          # parsed as shell code.
          INPUT_IMAGE: ${{ inputs.image }}
          INPUT_NODE_SIZE: ${{ inputs.node_size }}
          INPUT_TEST_CASE: ${{ inputs.test_case }}
        run: |
          # prepare for test code
          sglang_source_path=/root/.cache/tests/sglang
          mkdir -p "$sglang_source_path" && chmod -R 777 "$sglang_source_path"
          rm -rf "$sglang_source_path"/*
          cp -r "$GITHUB_WORKSPACE"/* "$sglang_source_path"/
          ls -l "$sglang_source_path/tests/python/deepep/"
          echo "Code copied to $sglang_source_path, files in deepep: $(ls $sglang_source_path/tests/python/deepep/)"
          # prepare for k8s_multi.yaml: the JSON document below is fed to
          # jinja2 on stdin as the template's variables.
          jinja2 "${ASCEND_TEST_CASE_PATH}/k8s_multi.yaml.jinja2" \
            -o "${ASCEND_TEST_CASE_PATH}/k8s_multi.yaml" <<EOF
          {
            "image": "${INPUT_IMAGE}",
            "name_space": "${NAMESPACE}",
            "kube_job_name": "${KUBE_JOB_NAME}",
            "kube_config": "${KUBECONFIG}",
            "kube_config_map": "${KUBE_CONFIG_MAP}",
            "node_size": ${INPUT_NODE_SIZE},
            "sglang_source_path": "${sglang_source_path}",
            "test_case": "${INPUT_TEST_CASE}"
          }
          EOF
          echo "Kubectl config file is generated: ${ASCEND_TEST_CASE_PATH}/k8s_multi.yaml"

      - name: Clear resources
        run: |
          cd "$ASCEND_TEST_CASE_PATH"
          kubectl delete -f ./k8s_multi.yaml --ignore-not-found=true || true
          pod_name_prefix="${KUBE_JOB_NAME}-sglang"
          echo "kube name space: $NAMESPACE, pod name prefix: ${pod_name_prefix}"
          # Wait until no pod with the job prefix is left in $NAMESPACE.
          # Both the probe and the delete are scoped to $NAMESPACE: `-A`
          # would override `-n`, and omitting `-n` would list the context's
          # default namespace instead.
          # NOTE(review): the loop has no upper bound of its own; consider a
          # step-level timeout-minutes if stuck pods are a concern.
          while true; do
            if kubectl get pods -n "$NAMESPACE" | grep -q "${pod_name_prefix}"; then
              echo "Found exist sglang job, sleeping for 30 seconds..."
              sleep 30
              # `xargs -r` skips the delete when the pods vanished meanwhile.
              kubectl get pods -n "$NAMESPACE" | grep "${pod_name_prefix}" | awk '{print $1}' | xargs -r kubectl delete pod -n "$NAMESPACE" || true
            else
              echo "No sglang job exist, start test case..."
              break
            fi
          done

      - name: Run test
        timeout-minutes: 300
        env:
          # Quoted so the values are explicit strings.
          SGLANG_USE_MODELSCOPE: "true"
          SGLANG_IS_IN_CI: "true"
          HF_ENDPOINT: https://hf-mirror.com
        run: |
          cd "$ASCEND_TEST_CASE_PATH"
          python3 -u run_ascend_ci.py

      - name: Post process
        # Runs even when an earlier step failed or was cancelled.
        if: always()
        run: |
          # The listing is diagnostic only; a failure here must not prevent
          # the cleanup below from running.
          kubectl get pods -n "$NAMESPACE" || true
          cd "$ASCEND_TEST_CASE_PATH"
          kubectl delete -f ./k8s_multi.yaml --ignore-not-found=true || true