forked from microsoft/pai
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathservices-configuration.yaml
More file actions
253 lines (240 loc) · 10.2 KB
/
services-configuration.yaml
File metadata and controls
253 lines (240 loc) · 10.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
# Copyright (c) Microsoft Corporation
# All rights reserved.
#
# MIT License
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
# documentation files (the "Software"), to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
# Uncomment the following if you want to customize the overall cluster configuration
#cluster:
#
# common:
# cluster-id: pai
# cluster-type: k8s
#
# # HDFS, zookeeper data path on your cluster machine.
# data-path: "/datastorage"
#
# # Enable job history feature. Default value is "true"
# job-history: "true"
#
# # Enable QoS feature or not. Default value is "true"
# qos-switch: "true"
#
# # If your cluster is created by Azure and the machine is rdma enabled.
# # Set this configuration as "true", the rdma environment will be set into your container.
# az-rdma: "false"
# # If RBAC is enabled in your cluster, you should set this value to true.
# # If RBAC is enabled in your cluster, please ensure the kubeconfig for paictl has enough permission.
# k8s-rbac: "false"
#
# # If Pai will be deployed in aks, you should set this value to true.
# deploy-in-aks: "false"
#
# # the docker registry to store docker images that contain system services like frameworklauncher, hadoop, etc.
# docker-registry:
#
# # domain/namespace/image-name:tag
#
# # If the registry is docker.io, set this to the same value as your username
# namespace: openpai
#
# # E.g., gcr.io.
# # If your registry is at hub.docker, please fill it as docker.io
# domain: docker.io
#
# # If the docker registry doesn't require authentication, please comment out username and password
# #username: your_registry_username
# #password: your_registry_password
#
# tag: latest
#
# # The name of the secret in kubernetes will be created in your cluster
# # Must be lower case, e.g., regsecret.
# secret-name: regsecret
#Uncomment following and config it if you want to use hivedscheduler
#Check ../../docs/hivedscheduler/devops.md to config it.
#hivedscheduler:
# config: |
# Settings for the rest-server service. Every option below must stay indented
# under `rest-server:` so that it is parsed as a child of this mapping.
rest-server:
  # secret for signing authentication tokens, e.g., hello
  #jwt-secret: HelloPAI
  #jwt-expire-time: '7d'
  # database admin username
  # NOTE(review): the value below looks like a placeholder - replace before deploying
  default-pai-admin-username: your_default_pai_admin_username
  # database admin password
  # NOTE(review): the value below looks like a placeholder - replace before deploying
  default-pai-admin-password: your_default_pai_admin_password
  # rest server retrieves marketplace templates from the github repository configured below
  #github-owner: Microsoft
  #github-repository: pai
  #github-path: marketplace
  # Job Debugging Reservation Seconds.
  #debugging-reservation-seconds: 604800
# uncomment following section if you want to customize the port of web portal
# webportal:
# server-port: 9286
# log-type: yarn
# uncomment following if you want to customize grafana
# grafana:
# port: 3000
# uncomment following if you want to customize device plugin
# device-plugin:
# devices:
# - "nvidia.com/gpu"
# - "amd.com/gpu"
# - "rdma/hca"
# uncomment following if you want node-exporter listen to different port
# node-exporter:
# port: 9100
# if you want to enable alert-handler actions, uncomment following lines and fill the right values.
# alert-manager:
# port: 9093 # optional, do not modify this if you do not want to change the port alert-manager is listening on
# pai-bearer-token: 'your-application-token-for-pai-rest-server'
# alert-handler: # alert-handler will only be enabled when this field is not empty
# port: 9095 # optional, do not modify this if you do not want to change the port alert-handler is listening on
# log-level: "info" # optional
# email-configs: # email-notification will only be enabled when this field is not empty
# admin-receiver: addr-of-admin-receiver@example.com
# smtp-host: smtp.office365.com
# smtp-port: 587
# smtp-from: alert-sender@example.com
# smtp-auth-username: alert-sender@example.com
# smtp-auth-password: password-for-alert-sender
# cluster-utilization: # cluster-utilization is a k8s CronJob which reports the GPU utilization of the cluster
# # for schedule syntax, refer to https://kubernetes.io/docs/concepts/workloads/controllers/cron-jobs/#cron-schedule-syntax
# schedule: "0 0 * * *" # daily report at UTC 00:00
# customized-routes:
# routes:
# - receiver: pai-email-admin-user-and-stop-job
# match:
# alertname: PAIJobGpuPercentLowerThan0_3For1h
# - receiver: pai-email-admin-and-fix-nvidia-gpu-low-perf
# match:
# alertname: NodeGpuLowPerfState
# customized-receivers: # receivers are combination of several actions
# - name: "pai-email-admin-user-and-stop-job"
# actions:
# # the email template for `email-admin` and `email-user` can be chosen from ['general-template', 'kill-low-efficiency-job-alert']
# # if no template specified, 'general-template' will be used.
# email-admin:
# email-user:
# template: 'kill-low-efficiency-job-alert'
# stop-jobs: # no parameters required for stop-jobs action
# tag-jobs:
# tags:
# - 'stopped-by-alert-manager'
# - name: "pai-email-admin-and-fix-nvidia-gpu-low-perf"
# actions:
# email-admin:
# fix-nvidia-gpu-low-perf:
# uncomment following if you want to customize prometheus
# prometheus:
# port: 9091 # optional, do not modify this if you do not want to change the port prometheus is listening on
# # How frequently to scrape targets
# scrape_interval: 30
# customized-alerts: |
# groups:
# - name: customized-alerts
# rules:
# - alert: PAIJobGpuPercentLowerThan0_3For1h
# expr: avg(task_gpu_percent{virtual_cluster=~"default"}) by (job_name) < 0.3
# for: 1h
# labels:
# severity: warn
# annotations:
# summary: "{{$labels.job_name}} has a job gpu percent lower than 30% for 1 hour"
# description: Monitor job level gpu utilization in certain virtual clusters.
# uncomment following section if you want to customize the port of pylon
# pylon:
# port: 80
# webhdfs-legacy-port: 50070
# deploy-service: false
# # By default, master's ipaddress will be treated as the pylon's domain
# # If you wanna configure pylon with other domain, you should config it here.
# domain: ip-address
# # If you wanna integrate OpenPAI with aad, you should configure this.
# ssl:
# # self-sign
# # crt_name: xxxxxx
# # crt_path: /path/to/xxxxxx
# # key_name: yyyyyy
# # key_path: /path/to/yyyyyy
# uncomment following section if you want to customize the threshold of cleaner
# cleaner:
# threshold: 90
# interval: 60
# uncomment following section if you want to customize the port of log-manager
# log-manager:
# port: 9103
# uncomment following section, if you want to customize the authentication solution.
#authentication:
#OIDC: false
# If OIDC is set as the value true, you will have to configure the following properties.
#OIDC-type: AAD
#
#AAD:
# # If you wanna configure AAD-OIDC for OpenPAI, the following configuration is mandatory.
# # National Clouds endpoint list https://docs.microsoft.com/en-us/azure/active-directory/develop/authentication-national-cloud
# # AZURE: https://login.microsoftonline.com/{tenantID}/v2.0/.well-known/openid-configuration
# # China: https://login.partner.microsoftonline.cn/{tenantID}/v2.0/.well-known/openid-configuration
# # Germany: https://login.microsoftonline.de/{tenantID}/v2.0/.well-known/openid-configuration
# wellKnownURL: https://login.microsoftonline.com/{tenantID}/v2.0/.well-known/openid-configuration
#
# # If you wanna configure AAD-OIDC for OpenPAI, the following configuration is mandatory.
# tenantID: ${tenant_id}
#
# # Required, the client ID of your app in AAD
# clientID: ${your_client_id}
#
# # Required if `responseType` is 'code', 'id_token code' or 'code id_token'.
# # If app key contains '\', replace it with '\\'.
# clientSecret: '${your_client_secret}'
#
# # Optional. The lifetime of nonce in session or cookie, the default value is 3600 (seconds).
# nonceLifetime: null
#
# # Optional. The max amount of nonce saved in session or cookie, the default value is 10.
# nonceMaxAmount: 5
#
# # Optional. The clock skew allowed in token validation, the default value is 300 seconds.
# clockSkew: null
#
#group-manager:
# # basic: If you set group-data-source as the value basic, admin should manually modify user's grouplist.
# # winbind: If you set group-data-source as the value winbind, the user's grouplist will be retrieved from the winbind server based on your configuration.
# group-data-source: basic
#
# # If you set winbind as your data source, you should configure the following setting.
# # winbind-server-address: xxxxxxx
#
# # Admin group name and its user list
# admin-group:
# groupname: admingroup
# description: "admin's group"
# externalName: ""
#
# # Group for default vc.
# # For yarn default queue hack.
# default-group:
# groupname: default
# description: "group for default vc"
# externalName: ""
#
# # If the following groups are not in the data store, they will be created by default.
# grouplist:
# - groupname: forexample
# # internal name
# description: forexample
# # description of the group
# externalName: ""
# # external name, it should be set if your group-data-source is winbind. And the name will be used to query and match the group from
# # the result of winbind. If the group-data-source is basic, this field is useless.