Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 3 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -1067,20 +1067,15 @@ is a free and open-source job scheduler for Linux and Unix-like kernels,
used by many of the world's supercomputers and computer clusters.
[reference](https://en.wikipedia.org/wiki/Slurm_Workload_Manager)

> [MUNGE](https://github.com/dun/munge) (MUNGE Uid 'N' Gid Emporium) is
an authentication service for creating and validating credentials. It is
designed to be highly scalable for use in an HPC cluster environment.
[reference](https://dun.github.io/munge/)

This class installs base packages and config files that are essential
to all Slurm's roles. It also installs and configure Munge service.
to all Slurm's roles.

### parameters

| Variable | Description | Type |
| :---------------------- | :----------------------- | :------ |
| `cluster_name` | Name of the cluster | String |
| `munge_key` | Base64 encoded Munge key | String |
| `auth_key` | Base64 encoded Slurm auth key | String |
| `slurm_version` | Slurm version to install | Enum['24.05', '24.11', '25.05', '25.11'] |
| `os_reserved_memory` | Memory in MB reserved for the operating system on the compute nodes | Integer |
| `suspend_time` | Idle time (seconds) before a node becomes eligible for suspension. | Integer |
Expand All @@ -1098,7 +1093,7 @@ to all Slurm's roles. It also installs and configure Munge service.

```yaml
profile::slurm::base::cluster_name: "%{alias('terraform.data.cluster_name')}"
profile::slurm::base::munge_key: ENC[PKCS7, ...]
profile::slurm::base::auth_key: ENC[PKCS7, ...]
profile::slurm::base::slurm_version: '24.11'
profile::slurm::base::os_reserved_memory: 512
profile::slurm::base::suspend_time: 3600
Expand Down
2 changes: 1 addition & 1 deletion bootstrap.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ ENC_CMD="eyaml encrypt -o block --pkcs7-public-key=${PKCS7_KEY}"
(
$ENC_CMD -l 'jupyterhub::prometheus_token' -s $(uuidgen)
$ENC_CMD -l 'profile::consul::acl_api_token' -s $(uuidgen)
$ENC_CMD -l 'profile::slurm::base::munge_key' -s $(openssl rand 1024 | openssl enc -A -base64)
$ENC_CMD -l 'profile::slurm::base::auth_key' -s $(openssl rand 1024 | openssl enc -A -base64)
$ENC_CMD -l 'profile::slurm::accounting::password' -s $(openssl rand -base64 9)
$ENC_CMD -l 'profile::freeipa::mokey::password' -s $(openssl rand -base64 9)
$ENC_CMD -l 'profile::freeipa::server::ds_password' -s $(openssl rand -base64 9)
Expand Down
1 change: 0 additions & 1 deletion site/profile/files/base/prepare4image.sh
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@ systemctl daemon-reload

systemctl stop rsyslog
: > /var/log/messages
test -d /var/log/munge && : > /var/log/munge/munged.log
: > /var/log/secure
: > /var/log/cron
test -d /var/log/audit && : > /var/log/audit/audit.log
Expand Down
Binary file removed site/profile/files/slurm/munge_socket.pp
Binary file not shown.
10 changes: 0 additions & 10 deletions site/profile/files/slurm/munge_socket.te

This file was deleted.

122 changes: 31 additions & 91 deletions site/profile/manifests/slurm.pp
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
# Slurm base class that is included in each different profile.
# The class configures the slurm and munge users, install the
# The class configures the slurm user, installs the
# base slurm packages and configures everything that is required
# on all types of nodes.
# @param cluster_name Specifies the name of the cluster as it appears in slurm.conf
# @param munge_key Specifies the munge secret key that allows slurm nodes to communicate
# @param auth_key Specifies the auth secret key that allows slurm nodes to communicate
# @param slurm_version Specifies which version of Slurm to install
# @param os_reserved_memory Specifies the amount of memory reserved for the operating system on compute nodes
class profile::slurm::base (
String $cluster_name,
String $munge_key,
String $auth_key,
Enum['24.05', '24.11', '25.05', '25.11'] $slurm_version,
Integer $os_reserved_memory,
Integer $suspend_time = 3600,
Expand Down Expand Up @@ -39,64 +39,10 @@
before => Package['slurm']
}

group { 'munge':
ensure => 'present',
gid => '2002'
}

user { 'munge':
ensure => 'present',
groups => 'munge',
uid => '2002',
home => '/var/lib/munge',
comment => 'MUNGE Uid N Gid Emporium',
shell => '/sbin/nologin',
before => Package['munge']
}

package { 'xauth':
ensure => 'installed',
}

package { 'munge':
ensure => 'installed',
}

# Sometime /var/run/munge is not created.
# Munge RPM provides /usr/lib/tmpfiles.d/munge.conf
# tmpfiles.d config was replaced with RuntimeDirectory as of munge 0.5.14
# but we are stuck with 0.5.13 as upstream has not updated munge
# since 2021. The next 2 file_lines make sure munge does not rely on
# systemd-tmpfiles-setup.service.
# Ref: https://github.com/dun/munge/commit/3eed37e3ca73c14b679394df7be151d27566b0fe
# Ref: https://github.com/dun/munge/issues/75
file_line { 'munge_runtimedirectory':
path => '/usr/lib/systemd/system/munge.service',
match => '^RuntimeDirectory=',
line => 'RuntimeDirectory=munge',
after => 'Group=munge',
require => Package['munge'],
notify => Service['munge'],
}

file_line { 'munge_runtimedirectorymode':
path => '/usr/lib/systemd/system/munge.service',
match => '^RuntimeDirectoryMode=',
line => 'RuntimeDirectoryMode=0755',
after => 'Group=munge',
require => Package['munge'],
notify => Service['munge'],
}

# Fix a warning in systemctl status munge about the location of the PID file.
file_line { 'munge_pidfile':
path => '/usr/lib/systemd/system/munge.service',
match => '^PIDFile=',
line => 'PIDFile=/run/munge/munged.pid',
require => Package['munge'],
notify => Service['munge'],
}

file { '/var/log/slurm':
ensure => 'directory',
owner => 'slurm',
Expand All @@ -116,12 +62,6 @@
seltype => 'usr_t'
}

file { '/etc/munge':
ensure => 'directory',
owner => 'munge',
group => 'munge'
}

file { '/etc/slurm/cgroup.conf':
ensure => 'present',
owner => 'slurm',
Expand Down Expand Up @@ -155,20 +95,15 @@
content => $slurm_path,
}

file { '/etc/munge/munge.key':
file { '/etc/slurm/slurm.key':
ensure => 'present',
owner => 'munge',
group => 'munge',
mode => '0400',
content => $munge_key,
before => Service['munge']
}

service { 'munge':
ensure => 'running',
enable => true,
subscribe => File['/etc/munge/munge.key'],
require => Package['munge']
owner => 'slurm',
group => 'slurm',
mode => '0600',
content => $auth_key,
require => [
File['/etc/slurm'],
]
}

$yumrepo_prefix = "https://download.copr.fedorainfracloud.org/results/cmdntrf/Slurm${slurm_version}/"
Expand All @@ -187,7 +122,6 @@
name => "slurm-${slurm_version}*",
require => [
Exec['enable_powertools'],
Package['munge'],
Yumrepo['slurm-copr-repo'],
Yumrepo['epel'],
],
Expand All @@ -197,7 +131,6 @@
ensure => 'installed',
require => [
Package['slurm'],
Package['munge'],
Yumrepo['slurm-copr-repo']],
}

Expand All @@ -211,7 +144,6 @@
ensure => 'installed',
require => [
Package['slurm'],
Package['munge'],
Yumrepo['slurm-copr-repo']
],
}
Expand Down Expand Up @@ -261,16 +193,6 @@
require => File['/etc/slurm'],
}

# SELinux policy required to allow confined users to submit job with Slurm 19, 20, 21.
# Slurm commands tries to write to a socket in /var/run/munge.
# Confined users cannot stat this file, neither write to it. The policy
# allows user_t to getattr and write var_run_t sock file.
# To get the policy, we had to disable dontaudit rules with : sudo semanage -DB
selinux::module { 'munge_socket':
ensure => 'present',
source_pp => 'puppet:///modules/profile/slurm/munge_socket.pp',
}

file {'/etc/slurm/nodes.conf':
ensure => 'present',
owner => 'slurm',
Expand Down Expand Up @@ -325,7 +247,6 @@
name => "slurm-slurmdbd-${slurm_version}*",
require => [
Package['slurm'],
Package['munge'],
Yumrepo['slurm-copr-repo']
],
}
Expand All @@ -338,6 +259,7 @@
File['/etc/slurm/slurmdbd.conf'],
],
subscribe => [
File['/etc/slurm/slurm.key'],
Mysql::Db['slurm_acct_db'],
],
before => Service['slurmctld']
Expand Down Expand Up @@ -515,7 +437,6 @@
package { 'slurm-slurmctld':
ensure => 'installed',
require => [
Package['munge'],
Package['slurm'],
],
}
Expand All @@ -531,6 +452,7 @@
File['/etc/slurm/slurm-addendum.conf'],
File['/etc/slurm/gres.conf'],
File['/etc/slurm/nodes.conf'],
File['/etc/slurm/slurm.key'],
]
}

Expand Down Expand Up @@ -757,6 +679,7 @@
File['/etc/slurm/slurm.conf'],
File['/etc/slurm/slurm-addendum.conf'],
File['/etc/slurm/nodes.conf'],
File['/etc/slurm/slurm.key'],
],
require => [
Package['slurm-slurmd'],
Expand Down Expand Up @@ -799,4 +722,21 @@
# controller through Slurm command-line tools.
class profile::slurm::submitter {
  # Submitter nodes get everything the base Slurm profile sets up,
  # including File['/etc/slurm/slurm.key'] referenced below.
  contain profile::slurm::base

  # sackd comes from its own package, installed after the base 'slurm'
  # package. 'ensure' is spelled out to match the other package resources
  # in this manifest.
  package { 'slurm-sackd':
    ensure  => 'installed',
    require => [
      Package['slurm'],
    ],
  }

  # Keep sackd running and enabled at boot. Subscribing to the key file
  # makes Puppet restart the daemon whenever /etc/slurm/slurm.key changes.
  service { 'sackd':
    ensure    => 'running',
    enable    => true,
    subscribe => [
      File['/etc/slurm/slurm.key'],
    ],
    require   => [
      Package['slurm-sackd'],
    ],
  }
}
4 changes: 2 additions & 2 deletions site/profile/templates/slurm/slurm.conf.epp
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@ JobAcctGatherParams=NoOverMemoryKill

# MANAGEMENT POLICIES
ClusterName=<%= $cluster_name %>
AuthType=auth/munge
CryptoType=crypto/munge
AuthType=auth/slurm
CredType=cred/slurm
SlurmUser=slurm
# SCHEDULER CONFIGURATIONS
SchedulerType=sched/backfill
Expand Down
2 changes: 1 addition & 1 deletion site/profile/templates/slurm/slurmdbd.conf.epp
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
AuthType=auth/munge
AuthType=auth/slurm
LogFile=/var/log/slurm/slurmdbd.log
PidFile=/var/run/slurmdbd/slurmdbd.pid
DbdHost=<%= $dbd_host %>
Expand Down
Loading