Skip to content

Commit

Permalink
Avoid using sinfo -N to support cluster creation with large compute…
Browse files Browse the repository at this point in the history
… nodes

Prior to this commit, cluster creation was only supported when the total number of dynamic compute nodes did not exceed 130k. After this commit, that limit becomes 180k.

Signed-off-by: Hanwen <[email protected]>
  • Loading branch information
hanwen-cluster committed Aug 29, 2024
1 parent d39131d commit ca27a9c
Show file tree
Hide file tree
Showing 3 changed files with 18 additions and 12 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,13 @@
mode '0755'
end

# Render the is_fleet_ready.sh helper from its ERB template as a
# root-owned script that is executable by everyone (mode 0755).
template '/usr/local/bin/is_fleet_ready.sh' do
  source 'compute_fleet_status/is_fleet_ready.erb'
  owner 'root'
  group 'root'
  mode '0755'
end

template "#{node['cluster']['etc_dir']}/clusterstatusmgtd.conf" do
source 'clusterstatusmgtd/clusterstatusmgtd.conf.erb'
owner 'root'
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#!/bin/bash
#
# Prints, one per line, the hostname of every static compute node (name
# containing "-st-") whose Slurm state is not idle, alloc, mix or maint.
# Empty output with exit status 0 means no such node was found.
# A non-zero exit status means Slurm could not be queried, so the result
# must not be interpreted as "every node is ready".
#
# Example of the sinfo output parsed below (node lists are compressed):
#   queue-0-dy-compute-resource-g4dn-0-[1-10],queue-1-dy-compute-resource-g4dn-1-[1-10] idle~
#   queue-2-dy-compute-resource-g4dn-2-[1-10],queue-3-dy-compute-resource-g4dn-3-[1-10] idle

# Make a failing sinfo/scontrol fail the whole pipeline instead of being
# hidden behind the exit status of the grep that follows it.
set -o pipefail

# Keep only the lines whose state is not one of the accepted ones.
# grep exits 1 when it selects nothing; that is the "all ready" case, not an error.
sinfo_output=$(<%= node['cluster']['slurm']['install_dir'] %>/bin/sinfo -h -o '%N %t' | { grep -v -E '(idle|alloc|mix|maint)$' || true; }) || exit $?

# read splits each line on whitespace: first field is the node list, the rest (the state) is discarded.
while read -r nodelist _; do
  # A here-string always yields at least one (possibly empty) line; skip it.
  [ -n "$nodelist" ] || continue

  # Expand the compressed node list into individual hostnames and keep the static ones.
  <%= node['cluster']['slurm']['install_dir'] %>/bin/scontrol show hostnames "$nodelist" | { grep -E '^[a-z0-9\-]+\-st\-[a-z0-9\-]+\-[0-9]+.*' || true; } || exit $?
done <<< "$sinfo_output"
16 changes: 4 additions & 12 deletions cookbooks/aws-parallelcluster-slurm/libraries/helpers.rb
Original file line number Diff line number Diff line change
Expand Up @@ -204,18 +204,10 @@ def check_for_protected_mode(fleet_status_command) # rubocop:disable Lint/Nested
"/usr/local/bin/get-compute-fleet-status.sh"
)
# Example output for sinfo
# $ /opt/slurm/bin/sinfo -N -h -o '%N %t'
# ondemand-dy-c52xlarge-1 idle~
# ondemand-dy-c52xlarge-2 idle~
# spot-dy-c5xlarge-1 idle~
# spot-st-t2large-1 down
# spot-st-t2large-2 idle
# capacity-block-st-t2micro-1 maint
# capacity-block-dy-t2micro-1 maint
is_fleet_ready_command = Shellwords.escape(
"set -o pipefail && #{node['cluster']['slurm']['install_dir']}/bin/sinfo -N -h -o '%N %t' | { grep -E '^[a-z0-9\\-]+\\-st\\-[a-z0-9\\-]+\\-[0-9]+ .*' || true; } | { grep -v -E '(idle|alloc|mix|maint)$' || true; }"
)
until shell_out!("/bin/bash -c #{is_fleet_ready_command}").stdout.strip.empty?
# sinfo -h -o '%N %t'
# queue-0-dy-compute-resource-g4dn-0-[1-10],queue-1-dy-compute-resource-g4dn-1-[1-10] idle~
# queue-2-dy-compute-resource-g4dn-2-[1-10],queue-3-dy-compute-resource-g4dn-3-[1-10] idle
until shell_out!("/bin/bash -c /usr/local/bin/is_fleet_ready.sh").stdout.strip.empty?
check_for_protected_mode(fleet_status_command)

Chef::Log.info("Waiting for static fleet capacity provisioning")
Expand Down

0 comments on commit ca27a9c

Please sign in to comment.