Skip to content
This repository was archived by the owner on Jun 6, 2024. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/rest-server/src/config/v2/hived.js
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ const hivedSchema = {
default: null,
},
},
additionalProperties: false,
additionalProperties: true,
},
},
minProperties: 1,
Expand Down
36 changes: 23 additions & 13 deletions src/rest-server/src/middlewares/v2/hived.js
Original file line number Diff line number Diff line change
Expand Up @@ -44,14 +44,14 @@ const getCellStatus = async (virtualCluster) => {
logger.warn('Failed to inspect vc from hived scheduler: ', error.response ? error.response.data : error);
return {
cellQuota: Number.MAX_SAFE_INTEGER,
cellUnits: Object.values(resourceUnits),
cellUnits: {...resourceUnits},
};
}

let cellQuota = 0;
const cellUnits = [...new Set(vcStatus.map((cell) => cell.gpuType))]
.filter((key) => key in resourceUnits)
.reduce((arr, key) => ([...arr, resourceUnits[key]]), []);
.reduce((dict, key) => ({...dict, [key]: resourceUnits[key]}), {});
const cellQueue = [...vcStatus];
while (cellQueue.length > 0) {
const curr = cellQueue.shift();
Expand All @@ -78,6 +78,8 @@ const hivedValidate = async (protocolObj, username) => {
const virtualCluster = ('defaults' in protocolObj && protocolObj.defaults.virtualCluster != null) ?
protocolObj.defaults.virtualCluster : 'default';

const {cellQuota, cellUnits} = await getCellStatus(virtualCluster);

if ('extras' in protocolObj && 'hivedScheduler' in protocolObj.extras) {
hivedConfig = protocolObj.extras.hivedScheduler;
if (hivedConfig && hivedConfig.jobPriorityClass === 'oppo') {
Expand All @@ -104,12 +106,21 @@ const hivedValidate = async (protocolObj, username) => {
);
}

if (taskRoleConfig.skuType !== null && !(taskRoleConfig.skuType in resourceUnits)) {
throw createError(
'Bad Request',
'InvalidProtocolError',
`Taskrole ${taskRole} has unknown skuType ${taskRoleConfig.skuType}, allow ${Object.keys(resourceUnits)}.`
);
if (taskRoleConfig.skuType !== null) {
if (!(taskRoleConfig.skuType in resourceUnits)) {
throw createError(
'Bad Request',
'InvalidProtocolError',
`Taskrole ${taskRole} has unknown skuType ${taskRoleConfig.skuType}, allow ${Object.keys(resourceUnits)}.`
);
}
if (!opportunistic && !(taskRoleConfig.skuType in cellUnits)) {
throw createError(
'Bad Request',
'InvalidProtocolError',
`Taskrole ${taskRole} has skuType ${taskRoleConfig.skuType}, VC ${virtualCluster} only allows ${Object.keys(cellUnits)}.`
);
}
}

const affinityGroupName = taskRoleConfig.affinityGroupName;
Expand Down Expand Up @@ -173,12 +184,11 @@ const hivedValidate = async (protocolObj, username) => {

// generate podSpec for every taskRole
let requestCellNumber = 0;
const {cellQuota, cellUnits} = await getCellStatus(virtualCluster);
for (let taskRole of Object.keys(protocolObj.taskRoles)) {
const resourcePerCell = {};
for (const t of ['gpu', 'cpu', 'memory']) {
resourcePerCell[t] = Math.min(
...Array.from(opportunistic ? Object.values(resourceUnits) : cellUnits, (v) => v[t]));
...Array.from(Object.values(opportunistic ? resourceUnits: cellUnits), (v) => v[t]));
}

const podSpec = {
Expand Down Expand Up @@ -235,9 +245,9 @@ const hivedValidate = async (protocolObj, username) => {
}

const cellNumber = Math.max(
Math.ceil(gpu / resourcePerCell.gpu),
Math.ceil(cpu / resourcePerCell.cpu),
Math.ceil(memoryMB / resourcePerCell.memory),
gpu === 0 ? 0 : Math.ceil(gpu / resourcePerCell.gpu),
cpu === 0 ? 0 : Math.ceil(cpu / resourcePerCell.cpu),
memoryMB === 0 ? 0 : Math.ceil(memoryMB / resourcePerCell.memory),
);
podSpec.gpuNumber = cellNumber;
requestCellNumber += protocolObj.taskRoles[taskRole].instances * cellNumber;
Expand Down
6 changes: 6 additions & 0 deletions src/webportal/deploy/webportal.yaml.template
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,12 @@ spec:
env:
- name: LAUNCHER_TYPE
value: {{ cluster_cfg["cluster"]["common"]["cluster-type"] }}
- name: LAUNCHER_SCHEDULER
{% if cluster_cfg['hivedscheduler']['config']|length > 1 %}
value: hivedscheduler
{% else %}
value: defaultscheduler
{% endif %}
- name: REST_SERVER_URI
value: {{ cluster_cfg['rest-server']['uri'] }}
- name: PROMETHEUS_URI
Expand Down
1 change: 1 addition & 0 deletions src/webportal/src/app/env.js.template
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ window.ENV = {
logType: '${LOG_TYPE}',
alertManagerUri: '${ALERT_MANAGER_URI}/alert-manager',
launcherType: '${LAUNCHER_TYPE}',
launcherScheduler: '${LAUNCHER_SCHEDULER}',
jobHistory: '${JOB_HISTORY}',
};

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import React from 'react';

export default React.createContext({
vcNames: [],
hivedSkuTypes: {},
errorMessages: {},
setErrorMessage: (id, msg) => {},
});
100 changes: 100 additions & 0 deletions src/webportal/src/app/job-submission/components/hived-sku-section.jsx
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

import React, { useCallback, useContext, useMemo } from 'react';
import { get } from 'lodash';
import { Dropdown, Stack } from 'office-ui-fabric-react';
import PropTypes from 'prop-types';
import { BasicSection } from './basic-section';
import { CSpinButton } from './customized-components';
import { FormShortSection } from './form-page';
import { PROTOCOL_TOOLTIPS } from '../utils/constants';
import Context from './context';

export const HivedSkuSection = React.memo(props => {
const { value, onChange } = props;
const { skuNum, skuType } = value;
const { hivedSkuTypes } = useContext(Context);

const skuOptions = useMemo(
() =>
Object.entries(hivedSkuTypes).reduce((options, skuType) => {
const [name, { gpu, cpu, memory }] = skuType;
return [
...options,
{
key: name,
sku: { gpu, cpu, memory },
text: `${name} (${gpu} GPU, ${cpu} CPU, ${memory} memory)`,
},
];
}, []),
[hivedSkuTypes],
);

const _setSku = () => {
if (value.skuType != null) {
const selected = skuOptions.find(option => option.key === value.skuType);
if (selected == null) {
onChange({ ...value, skuType: null, sku: null });
} else if (value.sku == null) {
const sku = get(selected, 'sku', null);
onChange({ ...value, sku });
}
}
};

const _onSkuNumChange = useCallback(
num => {
onChange({
...value,
skuNum: num,
});
},
[onChange],
);

const _onSkuTypeChange = useCallback(
(_, item) => {
onChange({
...value,
skuType: item.key,
sku: item.sku,
});
},
[onChange],
);

_setSku();
return (
<BasicSection
sectionLabel='Resources SKU'
sectionTooltip={PROTOCOL_TOOLTIPS.hivedSkuType}
>
<FormShortSection gap='m'>
<Stack horizontal verticalAlign='baseline'>
<div style={{ width: '20%' }}>SKU count</div>
<Stack.Item grow>
<CSpinButton value={skuNum} min={1} onChange={_onSkuNumChange} />
</Stack.Item>
</Stack>
<Stack horizontal verticalAlign='baseline'>
<div style={{ width: '20%' }}>SKU type</div>
<Stack.Item grow>
<Dropdown
placeholder='Select SKU type'
options={skuOptions}
onChange={_onSkuTypeChange}
selectedKey={skuType}
/>
</Stack.Item>
</Stack>
</FormShortSection>
</BasicSection>
);
});

// Prop contract for HivedSkuSection:
//  - value: required object holding the current selection (the component
//    reads `skuNum`, `skuType` and `sku` from it).
//  - onChange: function that receives the updated selection object.
HivedSkuSection.propTypes = {
value: PropTypes.object.isRequired,
onChange: PropTypes.func,
};
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,13 @@ import { FormPage } from './form-page';
import { JobTaskRole } from '../models/job-task-role';
import { FormSpinButton } from './form-spin-button';
import { ContainerSizeSection } from './container-size-section';
import { HivedSkuSection } from './hived-sku-section';
import { CommandSection } from './command-section';
import { CompletionPolicy } from './task-role/completion-policy';
import { PortsList } from './task-role/ports-list';
import { getDefaultContainerSize } from '../models/container-size';
import { PROTOCOL_TOOLTIPS } from '../utils/constants';
import config from '../../config/webportal.config';

export const TabFormContent = ({
jobTaskRole,
Expand Down Expand Up @@ -85,21 +87,28 @@ export const TabFormContent = ({
onChange={value => _onValueChange('instances', value)}
/>
)}
<ContainerSizeSection
value={jobTaskRole.containerSize}
onEnable={checked =>
_onValuesChange({
isContainerSizeEnabled: checked,
containerSize: getDefaultContainerSize(
jobTaskRole.containerSize.gpu,
),
})
}
onChange={containerSize =>
_onValueChange('containerSize', containerSize)
}
isContainerSizeEnabled={jobTaskRole.isContainerSizeEnabled}
/>
{(config.launcherScheduler === 'hivedscheduler' && (
<HivedSkuSection
value={jobTaskRole.hivedSku}
onChange={hivedSku => _onValueChange('hivedSku', hivedSku)}
/>
)) || (
<ContainerSizeSection
value={jobTaskRole.containerSize}
onEnable={checked =>
_onValuesChange({
isContainerSizeEnabled: checked,
containerSize: getDefaultContainerSize(
jobTaskRole.containerSize.gpu,
),
})
}
onChange={containerSize =>
_onValueChange('containerSize', containerSize)
}
isContainerSizeEnabled={jobTaskRole.isContainerSizeEnabled}
/>
)}
<DockerSection
sectionTooltip={PROTOCOL_TOOLTIPS.dockerImage}
value={jobTaskRole.dockerInfo}
Expand Down
13 changes: 12 additions & 1 deletion src/webportal/src/app/job-submission/job-submission-page.jsx
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import { TaskRoles } from './components/task-roles';
import Context from './components/context';
import {
fetchJobConfig,
listHivedSkuTypes,
listUserVirtualClusters,
listUserStorageConfigs,
fetchStorageDetails,
Expand Down Expand Up @@ -97,6 +98,7 @@ export const JobSubmissionPage = ({

// Context variables
const [vcNames, setVcNames] = useState([]);
const [hivedSkuTypes, setHivedSkuTypes] = useState({});
const [errorMessages, setErrorMessages] = useState({});

const setJobTaskRoles = useCallback(
Expand Down Expand Up @@ -169,10 +171,11 @@ export const JobSubmissionPage = ({
const contextValue = useMemo(
() => ({
vcNames,
hivedSkuTypes,
errorMessages,
setErrorMessage,
}),
[vcNames, errorMessages, setErrorMessage],
[vcNames, hivedSkuTypes, errorMessages, setErrorMessage],
);

useEffect(() => {
Expand Down Expand Up @@ -345,6 +348,14 @@ export const JobSubmissionPage = ({
.catch(alert);
}, []);

useEffect(() => {
listHivedSkuTypes()
.then(hivedSkuTypes => {
setHivedSkuTypes(hivedSkuTypes);
})
.catch(alert);
}, []);

const onToggleAdvanceFlag = useCallback(() => {
setAdvanceFlag(!advanceFlag);
}, [advanceFlag, setAdvanceFlag]);
Expand Down
31 changes: 23 additions & 8 deletions src/webportal/src/app/job-submission/models/job-protocol.js
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ import yaml from 'js-yaml';
import Joi from 'joi-browser';
import { removeEmptyProperties } from '../utils/utils';
import { TaskRolesManager } from '../utils/task-roles-manager';
import config from '../../config/webportal.config';

export class JobProtocol {
constructor(props) {
Expand Down Expand Up @@ -131,7 +132,12 @@ export class JobProtocol {
return { ...oriPre, ...curPre };
});

const taskRoles = this._updateAndConvertTaskRoles(jobTaskRoles);
const { taskRoles, hivedTaskRoles } = this._updateAndConvertTaskRoles(
jobTaskRoles,
);
if (config.launcherScheduler === 'hivedscheduler') {
protocolExtras.hivedScheduler = { taskRoles: hivedTaskRoles };
}
const secrets = removeEmptyProperties(
jobSecrets.reduce((res, secret) => {
res[secret.key] = secret.value;
Expand All @@ -157,13 +163,22 @@ export class JobProtocol {
}

_updateAndConvertTaskRoles(jobTaskRoles) {
return jobTaskRoles.reduce(
(res, taskRole) => ({
...res,
...taskRole.convertToProtocolFormat(),
}),
{},
);
return {
taskRoles: jobTaskRoles.reduce(
(res, taskRole) => ({
...res,
...taskRole.convertToProtocolFormat().taskRole,
}),
{},
),
hivedTaskRoles: jobTaskRoles.reduce(
(res, taskRole) => ({
...res,
...taskRole.convertToProtocolFormat().hivedTaskRole,
}),
{},
),
};
}

_generateDeployments(jobTaskRoles) {
Expand Down
Loading