Sure. Here is our lua script.
-Paul Edmon-
On 09/13/2018 07:28 AM, Andre Torres wrote:
That's interesting using AD to maintain uid consistency across all the nodes.
Like Loris, I'm also interested in your Lua script.
-
André
On 13/09/2018, 11:42, "slurm-users on behalf of Loris Bennett"
<slurm-users-boun...@lists.schedmd.com on behalf of loris.benn...@fu-berlin.de> wrote:
Hi Paul,
I'd be interested in seeing your Lua submit script, if you're willing to
share.
Until now I had thought that the most elegant way of setting up Slurm
users would be via a PAM module analogous to pam_mkhomedir, the simplest
option being to use pam_script.
However, given that we do have users who somehow never get round to
submitting a job before their HPC access expires, setting up the Slurm
account when the first job is submitted seems quite appealing.
Cheers,
Loris
Paul Edmon <ped...@cfa.harvard.edu> writes:
> So useradd adds a Linux user, while sacctmgr creates a Slurm user.
>
> What we do is that we run AD for our Linux user management. We then, in
our job submit Lua script, look to see whether the user has an account in Slurm, and if
they don't we create it.
>
> Another way would be to make all your Linux users and then map them into
Slurm using sacctmgr.
>
> It really depends on whether your Slurm users are a subset of your regular
users or not.
>
> -Paul Edmon-
>
> On 9/12/2018 12:21 PM, Andre Torres wrote:
>
> Hi all,
>
> I’m new to Slurm and I’m confused regarding user creation. I have an
installation with 1 login node and 5 compute nodes. If I create a user across all
the nodes with the same uid and gid, I can execute jobs, but
> I can’t understand the difference between user creation with the “useradd”
command and the “sacctmgr” command:
>
> sacctmgr create account name=test
>
> sacctmgr create user jdoe account=test
>
> Also, is there any way of creating a user on the login node and replicating it to
the compute nodes? What is the best practice for user creation?
>
> Thanks in advance
>
>
--
Dr. Loris Bennett (Mr.)
ZEDAT, Freie Universität Berlin
Email loris.benn...@fu-berlin.de
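-- Site job_submit script for Slurm's job_submit/lua plugin. (Deployment
-- assumption, not stated in the thread: JobSubmitPlugins=lua in slurm.conf,
-- with this file installed as job_submit.lua alongside slurm.conf.)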
-- Cache association existence for this many seconds, with the caveat
-- that if you manually remove a user's association, it will take SLURM
-- up to this long to recognize that it's gone.
local ASSOC_CACHE_LIFETIME = 24 * 60 * 60
-- Cache the start time of a downtime (if any) for this many minutes, so we
-- don't have to call 'scontrol show reservations' for every job submission.
local DOWNTIME_RESERVATION_CACHE = 30
-- Demand that jobs end this many minutes before a scheduled downtime.
local SUGGESTED_JOB_END_BEFORE_DOWNTIME = 60
local posix = require "posix"
-- Return true if 'user' is listed as a member of 'group' in the group database.
function in_group(user, group)
local grp = posix.getgroup(group)
local i = 0
while grp[i] do
if grp[i] == user then
return true
end
i = i + 1
end
return false
end
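-- Illustrative usage (hypothetical names): in_group("jdoe", "hpc_users") is
-- true only when "jdoe" appears in the member list that posix.getgroup()
-- returns for "hpc_users". Note that this helper is not referenced elsewhere
-- in this script.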
-- Compatibility: Lua-5.1
-- Got this from: http://lua-users.org/wiki/SplitJoin
function split(str, pat)
local t = {} -- NOTE: use {n = 0} in Lua-5.0
local fpat = "(.-)" .. pat
local last_end = 1
local s, e, cap = str:find(fpat, 1)
while s do
if s ~= 1 or cap ~= "" then
table.insert(t,cap)
end
last_end = e+1
s, e, cap = str:find(fpat, last_end)
end
if last_end <= #str then
cap = str:sub(last_end)
table.insert(t, cap)
end
return t
end
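-- Illustrative usage: split("gpgpu,serial_requeue", ",") returns
-- { "gpgpu", "serial_requeue" }.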
local existing_assocs = {}
local existing_assocs_last_expired = os.time()
function assoc_exists(user, account)
-- Expire the cache if it's old.
if existing_assocs_last_expired + ASSOC_CACHE_LIFETIME < os.time() then
existing_assocs = {}
existing_assocs_last_expired = os.time()
end
if existing_assocs[user .. "\n" .. account] then
return true
end
-- Unfortunately, filehandles returned by io.popen() don't have a
-- way to return their exitstatuses until lua 5.2. We should be
-- reasonably safe here, since if we erroneously conclude the
-- association doesn't exist, we'll just try to add it.
--
-- http://lua-users.org/lists/lua-l/2012-01/msg00364.html
local fp = io.popen(string.format(
"sacctmgr --parsable2 --noheader list associations " ..
"format=account users='%s'",
user
))
for line in fp:lines() do
if line == account then
existing_assocs[user .. "\n" .. account] = true
fp:close()
return true
end
end
fp:close()
return false
end
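-- Illustrative usage (hypothetical names): assoc_exists("jdoe", "jdoe_lab")
-- shells out to
--   sacctmgr --parsable2 --noheader list associations format=account users='jdoe'
-- and returns true (caching the result) if any output line is exactly "jdoe_lab".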
function ensure_assoc_exists(user, account)
-- See if the user already exists and is associated with this account.
local oldaccount = assoc_exists(user, account)
if oldaccount then
return true
end
-- Otherwise, find whatever association the user currently has (if any).
local fp = io.popen(string.format(
"sacctmgr --parsable2 --noheader list associations " ..
"format=account users='%s'",
user
))
oldaccount = 0
local ret
for line in fp:lines() do
oldaccount = line
end
fp:close()
if oldaccount == 0 then
--Create new group if needed
ret = os.execute(string.format(
"sacctmgr -i create account name='%s' " ..
"fairshare=45",
account
))
if ret ~= 0 then
slurm.log_info("sacctmgr failed to add account %s " ..
"with exit status %d",
account, ret)
end
--Create new users if needed
ret = os.execute(string.format(
"sacctmgr -i create user name='%s' defaultaccount='%s' " ..
"fairshare=parent maxsubmitjobs=10100",
user, account
))
if ret == 0 then
existing_assocs[user .. "\n" .. account] = true
slurm.log_info("added association to %s for %s",
account, user)
return true
end
slurm.log_info("sacctmgr failed to add association to " ..
"%s for %s with exit status %d",
account, user, ret)
return false
end
-- If we get here, the user does exist but is in the wrong group, so we
-- need to reassociate them.
-- Create the new group (account) if needed.
ret = os.execute(string.format(
"sacctmgr -i create account name='%s' " ..
"fairshare=45",
account
))
if ret ~= 0 then
slurm.log_info("sacctmgr failed to add account %s " ..
"with exit status %d",
account, ret)
end
-- Now move the user to their new group.
ret = os.execute(string.format(
"sacctmgr -i add user name='%s' defaultaccount='%s'",
user, account
))
if ret == 0 then
slurm.log_info("added association to %s for %s",
account, user)
end
if ret ~= 0 then
slurm.log_info("sacctmgr failed to add account %s " ..
"with exit status %d",
account, ret)
end
-- Remove the old association.
ret = os.execute(string.format(
"sacctmgr -i delete user name='%s' account='%s'",
user, oldaccount
))
if ret == 0 then
slurm.log_info("removed association of %s for %s",
oldaccount, user)
end
if ret ~= 0 then
slurm.log_info("sacctmgr failed to remove account %s " ..
"with exit status %d",
oldaccount, ret)
end
--Setting other parameters for the user
ret = os.execute(string.format(
"sacctmgr -i modify user where name='%s' " ..
"cluster=odyssey defaultaccount='%s' set " ..
"fairshare=parent maxsubmitjobs=10100",
user, account
))
if ret == 0 then
existing_assocs[user .. "\n" .. account] = true
slurm.log_info("added association to %s for %s",
account, user)
return true
end
if ret ~= 0 then
slurm.log_info("sacctmgr failed to modify user %s " ..
"with exit status %d",
user, ret)
return false
end
end
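-- Illustrative flow (hypothetical names): ensure_assoc_exists("jdoe", "jdoe_lab")
-- is a no-op if the association is cached or already known to sacctmgr. If the
-- user has no association at all, the account and user are created; if the user
-- exists under a different account, the new account is created if needed, the
-- user is added to it, the old association is removed, and the default account,
-- fairshare and maxsubmitjobs are set.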
_downtime_start = -1
_downtime_start_last_fetched = -1
function downtime_reservation_start()
if _downtime_start_last_fetched + DOWNTIME_RESERVATION_CACHE * 60 >= os.time() then
-- Return the unexpired cached value.
return _downtime_start
end
local fp = io.popen("scontrol -o show reservations downtime")
for line in fp:lines() do
if line:sub(1, string.len("ReservationName=downtime ")) == "ReservationName=downtime " then
local result, count = line:gsub(".* StartTime=(%d+-%d+-%d+)T(%d+:%d+:%d+) .*", "%1 %2")
if count > 0 then
_downtime_start_last_fetched = os.time()
-- el6's lua-posix is too old to have strptime(),
-- so we'll use date(1) to parse times instead.
_downtime_start = tonumber(io.popen(
string.format("date -d '%s' +%%s", result)):read())
return _downtime_start
-- parsed, _ = posix.strptime(result, "%Y-%m-%d %H:%M:%S")
-- -- luaposix puts the day of the month in 'monthday', but
-- -- os.time() wants it in 'day'.
-- parsed['day'] = parsed['monthday']
-- return os.time(parsed)
end
end
end
-- No scheduled downtime.
_downtime_start_last_fetched = os.time()
_downtime_start = -1
return _downtime_start
end
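-- The loop above expects scontrol's one-line-per-reservation output, which
-- begins (illustratively) like:
--   ReservationName=downtime StartTime=2018-09-17T08:00:00 EndTime=... Nodes=...
-- i.e. a reservation literally named "downtime"; if none is found, -1 is
-- cached and returned to mean "no scheduled downtime".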
function seconds_to_slurm_timespec(seconds)
local days = math.floor(seconds / 60 / 60 / 24)
local leftover = seconds - (days * 60 * 60 * 24)
local hours = math.floor(leftover / 60 / 60)
leftover = (leftover - (hours * 60 * 60))
local minutes = math.floor(leftover / 60)
return string.format("%02i-%02i:%02i", days, hours, minutes)
end
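-- Illustrative usage: seconds_to_slurm_timespec(90000) returns "01-01:00"
-- (1 day, 1 hour, 0 minutes); leftover seconds are truncated.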
function slurm_timespec_to_seconds(timespec)
local days, hours, minutes, seconds
minutes = timespec:match("^(%d+)$")
if minutes then
return tonumber(timespec) * 60
end
minutes, seconds = timespec:match("^(%d+):(%d+)$")
if minutes and seconds then
return minutes * 60 + seconds
end
hours, minutes, seconds = timespec:match("^(%d+):(%d+):(%d+)$")
if hours and minutes and seconds then
return hours * 60 * 60 + minutes * 60 + seconds
end
days, hours = timespec:match("^(%d+)-(%d+)$")
if days and hours then
return days * 24 * 60 * 60 + hours * 60 * 60
end
days, hours, minutes = timespec:match("^(%d+)-(%d+):(%d+)$")
if days and hours and minutes then
return days * 24 * 60 * 60 + hours * 60 * 60 + minutes * 60
end
days, hours, minutes, seconds = timespec:match("^(%d+)-(%d+):(%d+):(%d+)$")
if days and hours and minutes and seconds then
return days * 24 * 60 * 60 + hours * 60 * 60 + minutes * 60 + seconds
end
return -1
end
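-- Illustrative usage: slurm_timespec_to_seconds("1-12:30") returns 131400,
-- slurm_timespec_to_seconds("90") returns 5400 (a bare number is minutes),
-- and any string that matches none of the formats returns -1.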
function friendly_error_if_time_limit_intersects_downtime(time_limit)
local reservation_start = downtime_reservation_start()
if reservation_start < 0 then
-- No scheduled downtime.
return 0
end
local now = os.time(os.date("*t"))
-- time_limit is in minutes.
local job_end_time = now + time_limit * 60
if job_end_time <= reservation_start then
-- The job is scheduled to end before the downtime.
return 0
end
-- Suggest a runtime that ends SUGGESTED_JOB_END_BEFORE_DOWNTIME
-- minutes before the downtime, so slurmctld has time to process
-- all the RPCs before the downtime actually starts.
--
-- This also gives the user a little leeway to update their job
-- with the suggested downtime, so the suggested time isn't
-- rejected if they take a few minutes to update their submission.
local secs_until_downtime = reservation_start -
now - SUGGESTED_JOB_END_BEFORE_DOWNTIME * 60
local suggested_timespec = seconds_to_slurm_timespec(
secs_until_downtime)
local downtime_start = posix.localtime(reservation_start)
slurm.log_user(string.format([[
==============================================================
Your job has not been submitted.
The Odyssey cluster has a scheduled maintenance downtime
starting at %04d-%02d-%02d %02d:%02d:%02d %s.
Your job will not end before the downtime. Please specify
a shorter time limit for your job, such as:
-t %s
This will give your job the most possible time to run before
the downtime. If your job does not finish before the downtime
starts, it will be terminated then.
==============================================================
]],
downtime_start.year, downtime_start.month,
downtime_start.monthday, downtime_start.hour,
downtime_start.min, downtime_start.sec,
os.date("%Z"), suggested_timespec))
return 2051
-- return slurm.ESLURM_INVALID_TIME_LIMIT
end
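-- Illustrative outcome: with the downtime reservation starting 3 hours from
-- now and a requested time_limit of 720 minutes, the check above fails, the
-- user sees the message with a suggested "-t 00-02:00" (3 hours minus the
-- 60-minute buffer), and the function returns 2051.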
-- This function checks to make sure that the gres field is filled in, to ensure that there is an actual request for gres resources.
-- Otherwise it rejects the job and sends a message to the user. gres is the job's gres string, type is the type of gres to check against,
-- part is the partition that we want to check, and newpart is the partition the user should use if they don't have that gres defined.
function gres_check(gres, type, part, newpart)
if (gres == nil) then
gres = "none"
end
if not string.match(gres, type) then
slurm.log_user("You must request a %s using the --gres option to use the %s partition; if you have CPU work for this hardware please use %s", type, part, newpart)
-- Reject the job: hand an error code back to the caller.
return 2052
end
return 0
end
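-- Illustrative usage: gres_check("gpu:2", "gpu", "gpgpu", "gpgpu_requeue")
-- returns 0 because the request names a gpu, while gres_check(nil, "gpu",
-- "gpgpu", "gpgpu_requeue") logs the message and returns 2052 so the caller
-- can reject the job.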
function slurm_job_submit(job_desc, part_list, submit_uid)
-- Need to exclude root as root is not subject to this.
if job_desc.user_id == 0 then
return 0
end
-- Silently override --exclusive, forcing jobs to share
-- nodes if other resources allow.
if job_desc.shared == 0 then
job_desc.shared = 1
end
local submit_user = posix.getpasswd(submit_uid)
local submit_user_primary_group = posix.getgroup(submit_user.gid).name
ensure_assoc_exists(submit_user.name, submit_user_primary_group)
-- Per RT 1185187, we are going to put a gate on the gpgpu and aagk80 partitions to ensure that only jobs that request GPUs get access.
-- This table maps each gated partition to the partition the user should use if they don't request GPUs.
local gres_part = {["gpu_requeue"] = "serial_requeue", ["gpgpu"] = "gpgpu_requeue", ["aagk80"] = "aagk80_requeue"}
local gres_type = {["gpu_requeue"] = "gpu", ["gpgpu"] = "gpu", ["aagk80"] = "gpu"}
-- Need to test to see if there is actually anything set
if (job_desc.partition ~= nil) then
-- Need to get partitions that the user is requesting.
local partition = split(job_desc.partition,",")
-- Now we need to check if we are submitting to any of the partitions that we have limits on.
for i,part in ipairs(partition)
do
-- As it turns out, when --mem-per-cpu is invoked, Slurm takes the value for mem-per-cpu and adds 2147483647 to it (half of 4294967294, which is the largest int).
-- We don't want to permit --mem-per-cpu for these nodes as those jobs can submit to normal hardware. Also, we can't know a priori right now if someone is
-- using mem-per-cpu versus mem, as they may just be asking for a ridiculous amount of memory.
-- Suffice it to say this puts an upper limit on memory requests for bigmem at 2 PB.
if part == "bigmem" then
if (job_desc.pn_min_memory < 250000 or job_desc.pn_min_memory > 2147483646) then
slurm.log_user("You must request more than 250GB for jobs in bigmem partition")
return 2052
end
end
if part == "ncf_bigmem" then
if (job_desc.pn_min_memory < 30000 or job_desc.pn_min_memory > 2147483646) then
slurm.log_user("You must request more than 30GB for jobs in ncf_bigmem partition")
return 2052
end
end
if part == "interact" then
slurm.log_user("The interact partition has been replaced with the test partition. Please use the test partition.")
return 2052
end
-- Check GRES requests for gated partitions.
if gres_part[part] then
local rc = gres_check(job_desc.gres, gres_type[part], part, gres_part[part])
if rc ~= 0 then
return rc
end
end
end
end
--Needs to be fixed for the new lua bindings that allow us to query reservations
--return friendly_error_if_time_limit_intersects_downtime(job_desc.time_limit)
return 0
end
function slurm_job_modify(job_desc, job_rec, part_list, modify_uid)
return 0
end
slurm.log_info("initialized")
return slurm.SUCCESS