forked from microsoft/pai
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathservices-configuration.yaml.template
More file actions
312 lines (279 loc) · 11.7 KB
/
services-configuration.yaml.template
File metadata and controls
312 lines (279 loc) · 11.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
cluster:
common:
cluster-id: pai
cluster-type: k8s
k8s-rbac: "true"
job-history: "true"
data-path: "/datastorage"
qos-switch: "{{ env["cfg"]["qos-switch"] | default('false') }}"
docker-data-root: "{{ env['cfg']['docker_data_root'] | default('/mnt/docker') }}"
marketplace: "{{ env["cfg"]["enable_marketplace"] | default('false') }}"
# the docker registry to store docker images that contain system services like frameworklauncher, hadoop, etc.
docker-registry:
# The namespace in your registry. If the registry is docker.io, the namespace will be your user account.
namespace: {{ env["cfg"]['docker_registry_namespace'] | default('openpai') }}
# E.g., gcr.io.
# if the registry is hub.docker, please fill this value with docker.io
domain: {{ env["cfg"]['docker_registry_domain'] | default('docker.io') }}
# If the docker registry doesn't require authentication, please comment username and password
{%- if "docker_registry_username" in env["cfg"] %}
username: {{ env["cfg"]["docker_registry_username"] }}
{%- else %}
#username: <username>
{%- endif %}
{%- if "docker_registry_password" in env["cfg"] %}
password: {{ env["cfg"]["docker_registry_password"] }}
{%- else %}
#password: <password>
{%- endif %}
tag: {{ env["cfg"]['docker_image_tag'] }}
# The name of the secret in kubernetes will be created in your cluster
# Must be lower case, e.g., regsecret.
secret-name: pai-secret
{% if env["cfg"]["enable_docker_cache"]|default(false) is sameas true %}
docker-cache:
enabled: {{ env["cfg"]["enable_docker_cache"] }}
{%- if "docker_cache_storage_backend" in env["cfg"] %}
storage_backend: {{ env["cfg"]["docker_cache_storage_backend"]}}
{%- if env["cfg"]["docker_cache_storage_backend"] == "azure" %}
azure_account_name: {{ env["cfg"]["docker_cache_azure_account_name"] }}
azure_account_key: {{ env["cfg"]["docker_cache_azure_account_key"] }}
{%- if env["cfg"]["docker_cache_azure_container_name"] %}
azure_container_name: {{ env["cfg"]["docker_cache_azure_container_name"] }}
{%- endif %}
{%- elif env["cfg"]["docker_cache_storage_backend"] == "filesystem" and env["cfg"]["docker_cache_fs_mount_path"]%}
fs_mount_path: {{ env["cfg"]["docker_cache_fs_mount_path"]}}
{%- else %}
{%- endif %}
{%- endif %}
{%- if "docker_cache_remote_url" in env["cfg"] %}
remote_url: {{ env["cfg"]["docker_cache_remote_url"] }}
{%- endif %}
{%- if "docker_cache_registry_htpasswd" in env["cfg"] %}
registry-htpasswd: {{ env["cfg"]["docker_cache_registry_htpasswd"] }}
{%- endif %}
{%- else %}
{%- endif %}
rest-server:
# # launcher type. k8s or yarn
# launcher-type: k8s
# database admin username
default-pai-admin-username: admin
# database admin password
default-pai-admin-password: admin-password
webportal:
server-port: 9286
# If you want to customize the scheduling config, such add more virtual clusters or more gpu types, check:
# https://github.com/microsoft/pai/blob/master/docs/manual/cluster-admin/how-to-set-up-virtual-clusters.md
{% if env["cfg"]["enable_hived_scheduler"]|default(true) is sameas true %}
hivedscheduler:
config: |
physicalCluster:
skuTypes:
{%- for sku_name, sku_spec in env["hived"]["skus"].items() %}
{{ sku_name }}:
{%- if sku_spec.gpu %}
gpu: 1
cpu: {{ sku_spec.cpu }}
{%- else %}
cpu: 1
{%- endif %}
memory: {{ sku_spec.memory }}Mi
{%- endfor %}
cellTypes:
{%- for sku_name, sku_spec in env["hived"]["skus"].items() %}
{{ sku_name }}-NODE:
childCellType: {{ sku_name }}
{%- if sku_spec.gpu %}
childCellNumber: {{ sku_spec.gpuCount }}
{%- else %}
childCellNumber: {{ sku_spec.cpu }}
{%- endif %}
isNodeLevel: true
{{ sku_name }}-NODE-POOL:
childCellType: {{ sku_name }}-NODE
childCellNumber: {{ sku_spec.workers|length }}
{%- endfor %}
physicalCells:
{%- for sku_name, sku_spec in env["hived"]["skus"].items() %}
- cellType: {{ sku_name }}-NODE-POOL
cellChildren:
{%- for worker in sku_spec.workers %}
- cellAddress: {{ worker }}
{%- endfor %}
{%- endfor %}
virtualClusters:
{%- for sku_name, sku_spec in env["hived"]["skus"].items() %}
{%- if loop.index0 == 0 %}
default:
{%- else %}
vc{{loop.index0 }}:
{%- endif %}
virtualCells:
- cellType: {{ sku_name }}-NODE-POOL.{{ sku_name }}-NODE
cellNumber: {{ sku_spec.workers|length }}
{%- endfor %}
{% else %}
{% endif %}
# uncomment following section, if you want to customize the authentication solution.
authentication:
OIDC: false
# If OIDC is set as the value true, you will have to configure the following properties.
#OIDC-type: AAD
#
#AAD:
# # If you wanna configure AAD-OIDC for OpenPAI, the following configuration is mandatory.
# # National Clouds endpoint list https://docs.microsoft.com/en-us/azure/active-directory/develop/authentication-national-cloud
# # AZURE: https://login.microsoftonline.com/{tenantID}/v2.0/.well-known/openid-configuration
# # China: https://login.partner.microsoftonline.cn/{tenantID}/v2.0/.well-known/openid-configuration
# # Germany: https://login.microsoftonline.de/{tenantID}/v2.0/.well-known/openid-configuration
# wellKnownURL: https://login.microsoftonline.com/{tenantID}/v2.0/.well-known/openid-configuration
#
# # Required
# tenantID: ${tenat_id}
#
# # Required, the client ID of your app in AAD
# clientID: ${your_client_id}
#
# # Required if `responseType` is 'code', 'id_token code' or 'code id_token'.
# # If app key contains '\', replace it with '\\'.
# clientSecret: '${your_client_secret}'
#
# # Optional. The lifetime of nonce in session or cookie, the default value is 3600 (seconds).
# nonceLifetime: null
#
# # Optional. The max amount of nonce saved in session or cookie, the default value is 10.
# nonceMaxAmount: 5
#
# # Optional. The clock skew allowed in token validation, the default value is 300 seconds.
# clockSkew: null
#
# # Optional.
# # If you want to get access_token for a specific resource, you can provide the resource here; otherwise,
# # set the value to null.
# # Note that in order to get access_token, the responseType must be 'code', 'code id_token' or 'id_token code'.
# resourceURL: 'https://graph.windows.net'
group-manager:
# basic: If you set group-data-source as the value basic, admin should manually modify user's grouplist.
# winbind: If you set group-data-source as the value winbind, the user's grouplist will get from winbind server based on your configuration.
group-data-source: basic
# If you set winbind as your data source, you should configure this configuration.
# winbind-server-address: xxxxxxx
# Admin group name and its user list
admin-group:
groupname: admingroup
description: "admin's group"
externalName: ""
extension:
acls:
virtualClusters: []
admin: true
# Group for default vc.
# For yarn default queue hack.
default-group:
groupname: default
description: "group for default vc"
externalName: ""
extension:
acls:
virtualClusters: ["default"]
admin: false
# # If the following groups are not in the data store, it will be created by default.
# grouplist:
# - groupname: forexample
# # internal name
# description: forexample
# # description of the group
# externalName: ""
# # external name, it should be set if your group-data-source is winbind. And the name will be used to query and match the group from
# # the result of winbind. If the group-data-source is basic, this field is useless.
# uncomment following section if you want to customize the port of pylon
# pylon:
# port: 80
# if you want to enable alert-handler actions, uncomment following lines and fill the right values.
# alert-manager:
# port: 9093 # optional, do not modify this if you do not want to change the port alert-manager is listening on
# pai-bearer-token: 'your-application-token-for-pai-rest-server'
# alert-handler: # alert-handler will only be enabled when this field is not empty
# port: 9095 # optional, do not modify this if you do not want to change the port alert-handler is listening on
# log-level: "info" # optional
# email-configs: # email-notification will only be enabled when this field is not empty
# admin-receiver: addr-of-admin-receiver@example.com
# smtp-host: smtp.office365.com
# smtp-port: 587
# smtp-from: alert-sender@example.com
# smtp-auth-username: alert-sender@example.com
# smtp-auth-password: password-for-alert-sender
# cluster-utilization: # cluster-utilization is a k8s CronJob which reports the GPU utilization of the cluster
# # for schedule syntax, refer to https://kubernetes.io/docs/concepts/workloads/controllers/cron-jobs/#cron-schedule-syntax
# schedule: "0 0 * * *" # daily report at UTC 00:00
# customized-routes:
# routes:
# - receiver: pai-email-admin-user-and-stop-job
# match:
# alertname: PAIJobGpuPercentLowerThan0_3For1h
# - receiver: pai-email-admin-and-fix-nvidia-gpu-low-perf
# match:
# alertname: NodeGpuLowPerfState
# customized-receivers: # receivers are combination of several actions
# - name: "pai-email-admin-user-and-stop-job"
# actions:
# # the email template for `email-admin` and `email-user `can be chosen from ['general-template', 'kill-low-efficiency-job-alert']
# # if no template specified, 'general-template' will be used.
# email-admin:
# email-user:
# template: 'kill-low-efficiency-job-alert'
# stop-jobs: # no parameters required for stop-jobs action
# tag-jobs:
# tags:
# - 'stopped-by-alert-manager'
# - name: "pai-email-admin-and-fix-nvidia-gpu-low-perf"
# actions:
# email-admin:
# fix-nvidia-gpu-low-perf:
# uncomment following if you want to customize prometheus
# prometheus:
# port: 9091 # optional, do not modify this if you do not want to change the port prometheus is listening on
# # How frequently to scrape targets
# scrape_interval: 30
# customized-alerts: |
# groups:
# - name: customized-alerts
# rules:
# - alert: PAIJobGpuPercentLowerThan0_3For1h
# expr: avg(task_gpu_percent{virtual_cluster=~"default"}) by (job_name) < 0.3
# for: 1h
# labels:
# severity: warn
# annotations:
# summary: "{% raw %}{{$labels.job_name}}{% endraw %} has a job gpu percent lower than 30% for 1 hour"
# description: Monitor job level gpu utilization in certain virtual clusters.
# uncomment following if you want to change customize grafana
# grafana:
# port: 3000
# uncomment following if you want node-exporter listen to different port
# node-exporter:
# port: 9100
# uncomment following if you want to customeize job-exporter
# job-exporter:
# port: 9102
# logging-level: INFO
# interface: eth0,eno2
# uncomment following section if you want to customize the threshold of cleaner
# cleaner:
# threshold: 94
# interval: 60
# uncomment following section if you want to customize the port of log-manager
# log-manager:
# port: 9103
# admin_name: "admin"
# admin_password: "admin"
# jwt_secret: "jwt_secret"
# token_expired_second: 120
# uncomment following section if you want to customize the port of storage-manager
# storage-manager:
# localpath: /share
# security-type: AUTO
# workgroup: WORKGROUP
# smbuser: smbuser
# smbpwd: smbpwd