diff --git a/CHANGELOG.md b/CHANGELOG.md index 0848fd772..aac3aa00a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,7 +14,10 @@ The automatic allocator has been finally reimplemented, and is now much better: * It now uses information from the scheduler to determine how many allocations to spawn, and thus it can react to the current computational load much more accurately. It should also be less "eager". * It properly supports multi-node tasks. -* It considers computational load across all allocation queues (before, each queue was treated separately, which led to creating too many submissions). +* It considers computational load across all allocation queues (before, each queue was treated separately, which led to + creating too many submissions). +* It now exposes a `min-utilization` parameter, which can be used to avoid spawning an allocation that couldn't be utilized + enough. As this is a large behavioral change, we would be happy to hear your feedback! diff --git a/crates/hyperqueue/src/client/commands/autoalloc.rs b/crates/hyperqueue/src/client/commands/autoalloc.rs index a2f90887c..2da88eb13 100644 --- a/crates/hyperqueue/src/client/commands/autoalloc.rs +++ b/crates/hyperqueue/src/client/commands/autoalloc.rs @@ -140,6 +140,15 @@ The limit must not be larger than the allocation time limit."#) )] worker_time_limit: Option, + /// Minimal expected utilization required to submit an allocation into this queue + /// + /// Autoalloc will not spawn an allocation unless the scheduler thinks it could use at least + /// a `min_utilization` fraction (from 0.0 to 1.0) of the resources of workers in the allocation. + /// + /// The default is 0.0. 
+ #[arg(long)] + min_utilization: Option, + /// Additional arguments passed to the submit command #[arg(trailing_var_arg(true))] additional_args: Vec, @@ -241,11 +250,20 @@ fn args_to_params( worker_start_cmd, worker_stop_cmd, worker_time_limit, + min_utilization, additional_args, on_server_lost, no_dry_run: _, } = args; + if let Some(min_utilization) = min_utilization { + if !(0.0..=1.0).contains(&min_utilization) { + return Err(anyhow::anyhow!( + "Minimal utilization has to be in the interval [0.0, 1.0]." + )); + } + } + if let Some(ref idle_timeout) = worker_args.idle_timeout { if *idle_timeout > Duration::from_secs(60 * 10) { log::warn!( @@ -317,6 +335,7 @@ wasted allocation duration." max_workers_per_alloc, backlog, timelimit: time_limit, + min_utilization, name, additional_args, worker_start_cmd, diff --git a/crates/hyperqueue/src/server/autoalloc/process.rs b/crates/hyperqueue/src/server/autoalloc/process.rs index f71db3b2c..e861b7b77 100644 --- a/crates/hyperqueue/src/server/autoalloc/process.rs +++ b/crates/hyperqueue/src/server/autoalloc/process.rs @@ -174,6 +174,7 @@ pub fn create_queue_info(params: AllocationQueueParams) -> QueueInfo { max_workers_per_alloc, backlog, timelimit, + min_utilization, additional_args, max_worker_count, worker_start_cmd, @@ -192,6 +193,7 @@ pub fn create_queue_info(params: AllocationQueueParams) -> QueueInfo { idle_timeout, worker_start_cmd, worker_stop_cmd, + min_utilization, ) } @@ -430,8 +432,7 @@ fn create_queue_worker_query(queue: &AllocationQueue) -> WorkerTypeQuery { // How many workers can we provide at the moment max_sn_workers: info.backlog() * info.max_workers_per_alloc(), max_workers_per_allocation: info.max_workers_per_alloc(), - // TODO: expose this through the CLI - min_utilization: 0.0, + min_utilization: info.min_utilization().unwrap_or(0.0), } } @@ -2010,6 +2011,7 @@ mod tests { timelimit: queue_info.timelimit(), name: Some("Queue".to_string()), max_worker_count: queue_info.max_worker_count(), + 
min_utilization: None, additional_args: vec![], worker_start_cmd: None, worker_stop_cmd: None, @@ -2424,6 +2426,8 @@ mod tests { limiter_max_submit_fails: u64, #[builder(default = "vec![Duration::ZERO]")] limiter_delays: Vec, + #[builder(default)] + min_utilization: Option, } impl QueueBuilder { @@ -2437,6 +2441,7 @@ mod tests { limiter_max_alloc_fails, limiter_max_submit_fails, limiter_delays, + min_utilization, } = self.finish().unwrap(); ( QueueInfo::new( @@ -2450,6 +2455,7 @@ mod tests { None, None, None, + min_utilization, ), RateLimiter::new( limiter_delays, diff --git a/crates/hyperqueue/src/server/autoalloc/queue/mod.rs b/crates/hyperqueue/src/server/autoalloc/queue/mod.rs index 60d6a45f5..7183a7325 100644 --- a/crates/hyperqueue/src/server/autoalloc/queue/mod.rs +++ b/crates/hyperqueue/src/server/autoalloc/queue/mod.rs @@ -24,6 +24,7 @@ pub struct QueueInfo { idle_timeout: Option, worker_start_cmd: Option, worker_stop_cmd: Option, + min_utilization: Option, } impl QueueInfo { @@ -39,6 +40,7 @@ impl QueueInfo { idle_timeout: Option, worker_start_cmd: Option, worker_stop_cmd: Option, + min_utilization: Option, ) -> Self { Self { manager, @@ -51,6 +53,7 @@ impl QueueInfo { idle_timeout, worker_start_cmd, worker_stop_cmd, + min_utilization, } } @@ -81,6 +84,10 @@ impl QueueInfo { pub fn worker_args(&self) -> &[String] { &self.worker_args } + + pub fn min_utilization(&self) -> Option { + self.min_utilization + } } #[derive(Debug)] diff --git a/crates/hyperqueue/src/server/autoalloc/state.rs b/crates/hyperqueue/src/server/autoalloc/state.rs index d96cf9ea8..d06eba059 100644 --- a/crates/hyperqueue/src/server/autoalloc/state.rs +++ b/crates/hyperqueue/src/server/autoalloc/state.rs @@ -509,6 +509,7 @@ mod tests { None, None, None, + None, ), None, Box::new(NullHandler), diff --git a/crates/hyperqueue/src/transfer/messages.rs b/crates/hyperqueue/src/transfer/messages.rs index af85fda1f..fbf270ba1 100644 --- a/crates/hyperqueue/src/transfer/messages.rs +++ 
b/crates/hyperqueue/src/transfer/messages.rs @@ -311,6 +311,7 @@ pub struct AllocationQueueParams { pub timelimit: Duration, pub name: Option, pub max_worker_count: Option, + pub min_utilization: Option, pub additional_args: Vec, pub worker_start_cmd: Option, diff --git a/docs/deployment/allocation.md b/docs/deployment/allocation.md index a22873310..35ee19ff7 100644 --- a/docs/deployment/allocation.md +++ b/docs/deployment/allocation.md @@ -107,6 +107,15 @@ Maximum number of workers that can be queued or running in the allocation queue. limited by the manager (PBS/Slurm), but you can use this parameter to make the limit smaller, for example if you also want to manage allocations outside HyperQueue. +#### Minimal utilization +Format: `--min-utilization ` + +Minimal utilization determines how well the scheduler must be able to utilize workers from a submitted allocation. If the scheduler thinks that it can make use of `N%` of worker resources in a single allocation of this queue, `min-utilization` has to be at most `N%` (expressed as a fraction, e.g. `0.5` for 50%), otherwise the allocation will not be created. + +It has to be a floating point number between 0.0 and 1.0. + +The default minimal utilization is `0`, which means that an allocation will be created if the scheduler thinks that it can use any (non-zero) amount of resources of worker(s) in the allocation. + #### Worker resources You can specify [CPU](../jobs/cresources.md) and [generic](../jobs/resources.md) resources of workers spawned by the allocation queue. The name and syntax of these parameters is the same as when you create a