#!/usr/bin/env bash
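# atlas-slurm: submit a command string to a SLURM cluster and, optionally,
# monitor the resulting job until it completes.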
scriptDir=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
source "$scriptDir/generic_routines.sh"
source "$scriptDir/slurm.sh"
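# The sourced helpers are expected to provide die/warn and the slurm_*
# functions used below (slurm_maxtime_for_partition, slurm_submit,
# slurm_monitor_job).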
usageOpts="[ -c <command string, mandatory field> ] \
    [ -w <working directory, mandatory field> ] \
    [ -m <memory, an integer in MB or <size>[K|M|G|T], defaults to cluster default> ] \
    [ -p <number of cores, defaults to cluster default> ] \
    [ -t <allocated max time in a SLURM-accepted format, defaults to max allowed time for partition> ] \
    [ -j <job name, defaults to cluster default> ] \
    [ -l <log prefix, no logs written by default> ] \
    [ -e <clean up log files after monitored run? Defaults to no> ] \
    [ -n <monitor submitted job? Defaults to yes> ] \
    [ -o <with -l, print standard output content? Defaults to no> ] \
    [ -s <monitor style: 'status' for job status updates on polling, 'std_out_err' to report ongoing log content (where used). Defaults to std_out_err> ] \
    [ -f <poll frequency in seconds if job is monitored, defaults to 60> ] \
    [ -q <SLURM queue, defaults to the 'production' partition> ] \
    [ -r <prioritise job? Defaults to no> ] \
    [ -u <suppress logging output? Defaults to no> ] \
    [ -v <name of the conda environment in which to run the job> ]"
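# Squeeze the whitespace left by the line continuations so usage prints cleanly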
usageOpts=$(echo -e "$usageOpts" | tr -s " ")
usage() { echo "Usage: $0 $usageOpts"; }
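# Example invocation (hypothetical command and paths):
#   atlas-slurm -c "samtools sort input.bam" -w /scratch/myrun -m 4G -p 4 -t 02:00:00 -j sort_bam -l sort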
# Parse arguments
commandString=
workingDir=
memory=
cores=
maxTime=
jobName=
queue=
logPrefix=
returnStdout=no
logCleanup=no
monitorJob=yes
pollFreqSecs=60
monitorStyle=std_out_err
prioritise=no
quiet=no
condaEnv=
while getopts ":c:w:m:p:t:j:l:o:e:n:f:q:s:r:v:u:" o; do
    case "${o}" in
        c)
            commandString=${OPTARG}
            ;;
        w)
            workingDir=${OPTARG}
            ;;
        m)
            memory=${OPTARG}
            ;;
        p)
            cores=${OPTARG}
            ;;
        t)
            maxTime=${OPTARG}
            ;;
        j)
            jobName=${OPTARG}
            ;;
        q)
            queue=${OPTARG}
            ;;
        l)
            logPrefix=${OPTARG}
            ;;
        e)
            logCleanup=${OPTARG}
            ;;
        n)
            monitorJob=${OPTARG}
            ;;
        f)
            pollFreqSecs=${OPTARG}
            ;;
        s)
            monitorStyle=${OPTARG}
            ;;
        r)
            prioritise=${OPTARG}
            ;;
        o)
            returnStdout=${OPTARG}
            ;;
        v)
            condaEnv=${OPTARG}
            ;;
        u)
            quiet=${OPTARG}
            ;;
        *)
            usage
            exit 1
            ;;
    esac
done
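# Validate mandatory arguments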
if [ -z "$commandString" ]; then
    die "Need at least a command string (-c) for SLURM submission"
elif [ -z "$workingDir" ]; then
    die "Need a working directory (-w) for SLURM submission"
fi
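# Create the working directory if it does not already exist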
if [[ ! -d "${workingDir}" ]]; then
    mkdir -p "$workingDir" || die "Failed to create directory: $workingDir"
fi
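# Fall back to the 'production' partition and its maximum allowed run time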
if [ -z "$queue" ]; then
    queue="production"
fi
if [ -z "$maxTime" ]; then
    maxTime=$(slurm_maxtime_for_partition "$queue")
fi
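# slurm_maxtime_for_partition (defined in slurm.sh) is assumed to resolve the
# partition's MaxTime, conceptually along the lines of:
#   scontrol show partition "$queue" | grep -oP 'MaxTime=\K\S+'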
# Submit the job
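# slurm_submit prints the new job ID on stdout; $? below is the exit status
# of slurm_submit itself, not of the submitted job.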
slurmJobId=$(slurm_submit "$commandString" "$queue" "$jobName" "$memory" "$cores" "$maxTime" "$workingDir" "$logPrefix" "$prioritise" "$condaEnv" "$quiet")
submitStatus=$?
if [ "$submitStatus" -ne 0 ] || [ -z "$slurmJobId" ]; then
    die "Job submission failed, status is $submitStatus"
else
    warn "Job submission succeeded, received job ID $slurmJobId" "$quiet"
    if [ "$monitorJob" = 'yes' ]; then
        logFile=
        if [ -n "$logPrefix" ]; then
            logFile=${logPrefix}.out
        fi
        # Give the scheduler a moment to register the job before the first poll
        sleep 10
        slurm_monitor_job "$slurmJobId" "$pollFreqSecs" "$logFile" "$monitorStyle" "$logCleanup" "$returnStdout" "$quiet"
        slurmExitCode=$?
        if [ "$slurmExitCode" -ne 0 ]; then
            die "Command \"$commandString\" failed" "$slurmExitCode" "$quiet"
        fi
    fi
fi