diff --git a/cookbooks/aws-parallelcluster-computefleet/recipes/config/fleet_status.rb b/cookbooks/aws-parallelcluster-computefleet/recipes/config/fleet_status.rb
index 10197ac84..e579f7599 100644
--- a/cookbooks/aws-parallelcluster-computefleet/recipes/config/fleet_status.rb
+++ b/cookbooks/aws-parallelcluster-computefleet/recipes/config/fleet_status.rb
@@ -38,6 +38,13 @@
   mode '0755'
 end

+template "/usr/local/bin/is_fleet_ready.sh" do
+  source 'compute_fleet_status/is_fleet_ready.erb'
+  owner 'root'
+  group 'root'
+  mode '0755'
+end
+
 template "#{node['cluster']['etc_dir']}/clusterstatusmgtd.conf" do
   source 'clusterstatusmgtd/clusterstatusmgtd.conf.erb'
   owner 'root'
diff --git a/cookbooks/aws-parallelcluster-computefleet/templates/compute_fleet_status/is_fleet_ready.erb b/cookbooks/aws-parallelcluster-computefleet/templates/compute_fleet_status/is_fleet_ready.erb
new file mode 100644
index 000000000..0cf4d4a05
--- /dev/null
+++ b/cookbooks/aws-parallelcluster-computefleet/templates/compute_fleet_status/is_fleet_ready.erb
@@ -0,0 +1,24 @@
+#!/bin/bash
+#
+# Prints the hostname of every static compute node whose Slurm state is not
+# idle, alloc, mix or maint. Empty output means the static fleet is ready.
+
+slurm_bin="<%= node['cluster']['slurm']['install_dir'] %>/bin"
+
+# Exit non-zero when sinfo itself fails, so that an empty listing is never
+# mistaken for "every node is ready" by the caller.
+if ! sinfo_output=$("${slurm_bin}/sinfo" -h -o '%N %t'); then
+  echo "sinfo failed; cannot determine the static fleet state" >&2
+  exit 1
+fi
+
+# Keep only the node groups that are not in one of the ready states.
+not_ready=$(grep -v -E '(idle|alloc|mix|maint)$' <<< "$sinfo_output")
+
+while IFS= read -r line; do
+  # A here-string always yields one line, even when there is nothing to check.
+  [[ -z "$line" ]] && continue
+  nodelist=${line%% *}
+  # Expand the compressed hostlist and print the static (-st-) nodes only.
+  "${slurm_bin}/scontrol" show hostnames "$nodelist" | { grep -E '^[a-z0-9-]+-st-[a-z0-9-]+-[0-9]+' || true; }
+done <<< "$not_ready"
diff --git a/cookbooks/aws-parallelcluster-slurm/libraries/helpers.rb b/cookbooks/aws-parallelcluster-slurm/libraries/helpers.rb
index adf72705a..c012db4cf 100644
--- a/cookbooks/aws-parallelcluster-slurm/libraries/helpers.rb
+++ b/cookbooks/aws-parallelcluster-slurm/libraries/helpers.rb
@@ -204,18 +204,10 @@ def check_for_protected_mode(fleet_status_command) # rubocop:disable Lint/Nested
         "/usr/local/bin/get-compute-fleet-status.sh"
       )
       # Example output for sinfo
-      # $ /opt/slurm/bin/sinfo -N -h -o '%N %t'
-      # ondemand-dy-c52xlarge-1 idle~
-      # ondemand-dy-c52xlarge-2 idle~
-      # spot-dy-c5xlarge-1 idle~
-      # spot-st-t2large-1 down
-      # spot-st-t2large-2 idle
-      # capacity-block-st-t2micro-1 maint
-      # capacity-block-dy-t2micro-1 maint
-      is_fleet_ready_command = Shellwords.escape(
-        "set -o pipefail && #{node['cluster']['slurm']['install_dir']}/bin/sinfo -N -h -o '%N %t' | { grep -E '^[a-z0-9\\-]+\\-st\\-[a-z0-9\\-]+\\-[0-9]+ .*' || true; } | { grep -v -E '(idle|alloc|mix|maint)$' || true; }"
-      )
-      until shell_out!("/bin/bash -c #{is_fleet_ready_command}").stdout.strip.empty?
+      # sinfo -h -o '%N %t'
+      # queue-0-dy-compute-resource-g4dn-0-[1-10],queue-1-dy-compute-resource-g4dn-1-[1-10] idle~
+      # queue-2-dy-compute-resource-g4dn-2-[1-10],queue-3-dy-compute-resource-g4dn-3-[1-10] idle
+      until shell_out!("/bin/bash -c /usr/local/bin/is_fleet_ready.sh").stdout.strip.empty?
         check_for_protected_mode(fleet_status_command)

         Chef::Log.info("Waiting for static fleet capacity provisioning")