forked from microsoft/pai
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathservices-configuration.yaml.template
More file actions
228 lines (204 loc) · 9.11 KB
/
services-configuration.yaml.template
File metadata and controls
228 lines (204 loc) · 9.11 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
# Copyright (c) Microsoft Corporation
# All rights reserved.
#
# MIT License
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
# documentation files (the "Software"), to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
cluster:
  #common:
  #  cluster-id: pai-example
  #  cluster-type: k8s
  #  # HDFS, zookeeper data path on your cluster machine.
  #  data-path: "/datastorage"
  #  marketplace: "true"

  # The docker registry that stores the docker images containing system services such as frameworklauncher, hadoop, etc.
  docker-registry:
    # The namespace in your registry. If the registry is docker.io, the namespace will be your user account.
    namespace: openpai

    # The registry domain, e.g., gcr.io.
    # If the registry is Docker Hub, set this value to docker.io.
    domain: docker.io

    # If the docker registry doesn't require authentication, leave username and password commented out.
    #username: <username>
    #password: <password>

    # Image tag, substituted from env['pai-version'] by the template engine.
    # Quoted so the rendered value is always read as a YAML string (never a number, boolean or null).
    tag: "{{ env['pai-version'] }}"

    # The name of the kubernetes secret that will be created in your cluster.
    # Must be lower case, e.g., regsecret.
    secret-name: pai-secret
rest-server:
  # Username of the database admin account.
  default-pai-admin-username: 'admin'
  # Password of the database admin account.
  default-pai-admin-password: 'admin-password'
# uncomment following section if you want to customize the port of web portal
# webportal:
# server-port: 9286
# uncomment following if you want to customize grafana
# grafana:
# port: 3000
# uncomment following if you want node-exporter to listen on a different port
# node-exporter:
# port: 9100
# uncomment following if you want to customize job-exporter
# job-exporter:
# port: 9102
# logging-level: INFO
# interface: eth0,eno2
# if you want to enable alert-handler actions, uncomment following lines and fill the right values.
# alert-manager:
# port: 9093 # optional, do not modify this if you do not want to change the port alert-manager is listening on
# pai-bearer-token: 'your-application-token-for-pai-rest-server'
# alert-handler: # alert-handler will only be enabled when this field is not empty
# port: 9095 # optional, do not modify this if you do not want to change the port alert-handler is listening on
# log-level: "info" # optional
# email-configs: # email-notification will only be enabled when this field is not empty
# admin-receiver: addr-of-admin-receiver@example.com
# smtp-host: smtp.office365.com
# smtp-port: 587
# smtp-from: alert-sender@example.com
# smtp-auth-username: alert-sender@example.com
# smtp-auth-password: password-for-alert-sender
# cluster-utilization: # cluster-utilization is a k8s CronJob which reports the GPU utilization of the cluster
# # for schedule syntax, refer to https://kubernetes.io/docs/concepts/workloads/controllers/cron-jobs/#cron-schedule-syntax
# schedule: "0 0 * * *" # daily report at UTC 00:00
# customized-routes:
# routes:
# - receiver: pai-email-admin-user-and-stop-job
# match:
# alertname: PAIJobGpuPercentLowerThan0_3For1h
# - receiver: pai-email-admin-and-fix-nvidia-gpu-low-perf
# match:
# alertname: NodeGpuLowPerfState
# customized-receivers: # receivers are combination of several actions
# - name: "pai-email-admin-user-and-stop-job"
# actions:
# # the email template for `email-admin` and `email-user` can be chosen from ['general-template', 'kill-low-efficiency-job-alert']
# # if no template specified, 'general-template' will be used.
# email-admin:
# email-user:
# template: 'kill-low-efficiency-job-alert'
# stop-jobs: # no parameters required for stop-jobs action
# tag-jobs:
# tags:
# - 'stopped-by-alert-manager'
# - name: "pai-email-admin-and-fix-nvidia-gpu-low-perf"
# actions:
# email-admin:
# fix-nvidia-gpu-low-perf:
# uncomment following if you want to customize prometheus
# prometheus:
# port: 9091 # optional, do not modify this if you do not want to change the port prometheus is listening on
# # How frequently to scrape targets
# scrape_interval: 30
# customized-alerts: |
# groups:
# - name: customized-alerts
# rules:
# - alert: PAIJobGpuPercentLowerThan0_3For1h
# expr: avg(task_gpu_percent{virtual_cluster=~"default"}) by (job_name) < 0.3
# for: 1h
# labels:
# severity: warn
# annotations:
# summary: "{{$labels.job_name}} has a job gpu percent lower than 30% for 1 hour"
# description: Monitor job level gpu utilization in certain virtual clusters.
# uncomment following section if you want to customize the port of pylon
# pylon:
# port: 80
# uncomment following section if you want to customize the threshold of cleaner
# cleaner:
# threshold: 94
# interval: 60
# uncomment following section if you want to customize the port of log-manager
# log-manager:
# port: 9103
# admin_name: "admin"
# admin_password: "admin"
# jwt_secret: "jwt_secret"
# token_expired_second: 120
# uncomment following section if you want to customize storage-manager
# storage-manager:
# localpath: /share
# security-type: AUTO
# workgroup: WORKGROUP
# smbuser: smbuser
# smbpwd: smbpwd
# uncomment following section, if you want to customize the authentication solution.
#authentication:
#OIDC: false
# If OIDC is set to true, you must configure the following properties.
#OIDC-type: AAD
#
#AAD:
# # If you want to configure AAD-OIDC for OpenPAI, the following configuration is mandatory.
# # National Clouds endpoint list https://docs.microsoft.com/en-us/azure/active-directory/develop/authentication-national-cloud
# # AZURE: https://login.microsoftonline.com/{tenantID}/v2.0/.well-known/openid-configuration
# # China: https://login.partner.microsoftonline.cn/{tenantID}/v2.0/.well-known/openid-configuration
# # Germany: https://login.microsoftonline.de/{tenantID}/v2.0/.well-known/openid-configuration
# wellKnownURL: https://login.microsoftonline.com/{tenantID}/v2.0/.well-known/openid-configuration
#
# # Required
# tenantID: ${tenant_id}
#
# # Required, the client ID of your app in AAD
# clientID: ${your_client_id}
#
# # Required if `responseType` is 'code', 'id_token code' or 'code id_token'.
# # If app key contains '\', replace it with '\\'.
# clientSecret: '${your_client_secret}'
#
# # Optional. The lifetime of nonce in session or cookie, the default value is 3600 (seconds).
# nonceLifetime: null
#
# # Optional. The max amount of nonce saved in session or cookie, the default value is 10.
# nonceMaxAmount: 5
#
# # Optional. The clock skew allowed in token validation, the default value is 300 seconds.
# clockSkew: null
#
# # Optional.
# # If you want to get access_token for a specific resource, you can provide the resource here; otherwise,
# # set the value to null.
# # Note that in order to get access_token, the responseType must be 'code', 'code id_token' or 'id_token code'.
# resourceURL: 'https://graph.windows.net'
#
#group-manager:
# # basic: If group-data-source is set to basic, the admin must manually modify each user's grouplist.
# # winbind: If group-data-source is set to winbind, each user's grouplist is retrieved from the winbind server based on your configuration.
# group-data-source: basic
#
# # If you set winbind as your data source, you must configure the following setting.
# # winbind-server-address: xxxxxxx
#
# # Admin group name and its user list
# admin-group:
# groupname: admingroup
# description: "admin's group"
# externalName: ""
#
# # Group for default vc.
# # For yarn default queue hack.
# default-group:
# groupname: default
# description: "group for default vc"
# externalName: ""
#
# # If the following groups are not in the data store, they will be created by default.
# grouplist:
# - groupname: forexample
# # internal name
# description: forexample
# # description of the group
# externalName: ""
# # external name; it should be set if your group-data-source is winbind. The name is used to query and match the group in
# # the result returned by winbind. If the group-data-source is basic, this field is not used.