4067 |
02 Sep 16 |
nicklas |
1 |
package net.sf.basedb.opengrid; |
4067 |
02 Sep 16 |
nicklas |
2 |
|
4067 |
02 Sep 16 |
nicklas |
3 |
import java.io.Closeable; |
4130 |
26 Sep 16 |
nicklas |
4 |
import java.util.ArrayList; |
4197 |
31 Oct 16 |
nicklas |
5 |
import java.util.Arrays; |
4212 |
08 Nov 16 |
nicklas |
6 |
import java.util.Date; |
4130 |
26 Sep 16 |
nicklas |
7 |
import java.util.List; |
4067 |
02 Sep 16 |
nicklas |
8 |
|
4067 |
02 Sep 16 |
nicklas |
9 |
import org.slf4j.LoggerFactory; |
4067 |
02 Sep 16 |
nicklas |
10 |
|
4067 |
02 Sep 16 |
nicklas |
11 |
import net.schmizz.sshj.SSHClient; |
4203 |
02 Nov 16 |
nicklas |
12 |
import net.sf.basedb.core.DbControl; |
4203 |
02 Nov 16 |
nicklas |
13 |
import net.sf.basedb.core.Job; |
4203 |
02 Nov 16 |
nicklas |
14 |
import net.sf.basedb.core.signal.ExtensionSignalTransporter; |
5984 |
10 Jul 20 |
nicklas |
15 |
import net.sf.basedb.opengrid.CmdResult.DateResult; |
4130 |
26 Sep 16 |
nicklas |
16 |
import net.sf.basedb.opengrid.JobDefinition.UploadSourceWithPermission; |
4254 |
25 Nov 16 |
nicklas |
17 |
import net.sf.basedb.opengrid.config.ClusterConfig; |
4254 |
25 Nov 16 |
nicklas |
18 |
import net.sf.basedb.opengrid.config.JobConfig; |
5982 |
07 Jul 20 |
nicklas |
19 |
import net.sf.basedb.opengrid.engine.ClusterEngine; |
5982 |
07 Jul 20 |
nicklas |
20 |
import net.sf.basedb.opengrid.engine.JobSubmission; |
4270 |
16 Dec 16 |
nicklas |
21 |
import net.sf.basedb.opengrid.filetransfer.ByteArrayDownloadTarget; |
4121 |
22 Sep 16 |
nicklas |
22 |
import net.sf.basedb.opengrid.filetransfer.FilePermission; |
4121 |
22 Sep 16 |
nicklas |
23 |
import net.sf.basedb.opengrid.filetransfer.UploadSource; |
7075 |
27 Mar 23 |
nicklas |
24 |
import net.sf.basedb.opengrid.service.OpenGridService; |
4203 |
02 Nov 16 |
nicklas |
25 |
import net.sf.basedb.opengrid.service.OpenGridSignalHandlerFactory; |
7075 |
27 Mar 23 |
nicklas |
26 |
import net.sf.basedb.util.extensions.logging.ExtensionsLog; |
7075 |
27 Mar 23 |
nicklas |
27 |
import net.sf.basedb.util.extensions.logging.ExtensionsLogger; |
4067 |
02 Sep 16 |
nicklas |
28 |
|
4067 |
02 Sep 16 |
nicklas |
29 |
/** |
4278 |
20 Dec 16 |
nicklas |
An open session to an Open Grid cluster. It provides a lot |
4278 |
20 Dec 16 |
nicklas |
of extra functionality for submitting jobs, checking job status, |
4278 |
20 Dec 16 |
nicklas |
aborting jobs, etc. Do not forget to {@link #close()} |
4067 |
02 Sep 16 |
nicklas |
the session after use. |
4067 |
02 Sep 16 |
nicklas |
34 |
|
4067 |
02 Sep 16 |
nicklas |
A session is not thread-safe and needs external synchronization if |
4067 |
02 Sep 16 |
nicklas |
used with multiple threads. |
4067 |
02 Sep 16 |
nicklas |
37 |
|
4067 |
02 Sep 16 |
nicklas |
@author nicklas |
4067 |
02 Sep 16 |
nicklas |
@since 1.0 |
4067 |
02 Sep 16 |
nicklas |
40 |
*/ |
4067 |
02 Sep 16 |
nicklas |
41 |
public class OpenGridSession |
4278 |
20 Dec 16 |
nicklas |
42 |
extends AbstractSession<OpenGridCluster> |
4067 |
02 Sep 16 |
nicklas |
43 |
implements Closeable |
4067 |
02 Sep 16 |
nicklas |
44 |
{ |
4067 |
02 Sep 16 |
nicklas |
45 |
|
7075 |
27 Mar 23 |
nicklas |
46 |
private static final ExtensionsLogger logger = |
7075 |
27 Mar 23 |
nicklas |
47 |
ExtensionsLog.getLogger(OpenGridService.ID, true).wrap(LoggerFactory.getLogger(OpenGridSession.class)); |
4067 |
02 Sep 16 |
nicklas |
48 |
|
5982 |
07 Jul 20 |
nicklas |
49 |
private final ClusterEngine engine; |
5982 |
07 Jul 20 |
nicklas |
50 |
|
4212 |
08 Nov 16 |
nicklas |
51 |
private Integer localTimeAdjustment; |
4067 |
02 Sep 16 |
nicklas |
52 |
|
4222 |
09 Nov 16 |
nicklas |
53 |
|
4067 |
02 Sep 16 |
nicklas |
54 |
/**
	Create a new session. Sessions are typically created from
	{@link OpenGridCluster#connect(int)}. The cluster engine used
	by this session is created from the type found in the
	configuration of the cluster.

	@param cluster The cluster this session is connected to
	@param ssh The SSH client, passed on to the superclass together with the cluster
*/
OpenGridSession(OpenGridCluster cluster, SSHClient ssh)
{
	super(cluster, ssh);
	engine = cluster.getConfig().getType().createEngine();
}
4067 |
02 Sep 16 |
nicklas |
63 |
|
4067 |
02 Sep 16 |
nicklas |
64 |
/** |
4275 |
19 Dec 16 |
nicklas |
Execute a command on a node in the cluster. |
4275 |
19 Dec 16 |
nicklas |
@param node The name of the node, or null to execute on the master |
4275 |
19 Dec 16 |
nicklas |
@see #executeCmd(String, int) |
4275 |
19 Dec 16 |
nicklas |
68 |
*/ |
4275 |
19 Dec 16 |
nicklas |
69 |
public CmdResult<String> executeOnNode(String cmd, String node, int timeout) |
4275 |
19 Dec 16 |
nicklas |
70 |
{ |
4275 |
19 Dec 16 |
nicklas |
71 |
String nodeCmd = cmd; |
4275 |
19 Dec 16 |
nicklas |
72 |
if (node != null) |
4275 |
19 Dec 16 |
nicklas |
73 |
{ |
4275 |
19 Dec 16 |
nicklas |
74 |
nodeCmd = "ssh -T " + node + " << 'EOF'\n"; |
4275 |
19 Dec 16 |
nicklas |
75 |
nodeCmd += cmd + "\n"; |
4275 |
19 Dec 16 |
nicklas |
76 |
nodeCmd += "EOF\n"; |
4275 |
19 Dec 16 |
nicklas |
77 |
} |
4275 |
19 Dec 16 |
nicklas |
78 |
return executeCmd(nodeCmd, timeout); |
4275 |
19 Dec 16 |
nicklas |
79 |
} |
4275 |
19 Dec 16 |
nicklas |
80 |
|
6659 |
01 Apr 22 |
nicklas |
81 |
/** |
6659 |
01 Apr 22 |
nicklas |
Execute a command on a node in the cluster with support for uploading files |
6659 |
01 Apr 22 |
nicklas |
before the command is executed. The job definition is used for the file |
6659 |
01 Apr 22 |
nicklas |
upload only. The files are removed again after the command has been executed, |
6659 |
01 Apr 22 |
nicklas |
unless the debug flag is set. If there are no files, this method is |
6659 |
01 Apr 22 |
nicklas |
equivalent to {@link #executeOnNode(String, String, int)} |
6659 |
01 Apr 22 |
nicklas |
@since 1.5 |
6659 |
01 Apr 22 |
nicklas |
88 |
*/ |
6659 |
01 Apr 22 |
nicklas |
89 |
public CmdResult<String> executeNow(JobDefinition job, String node, int timeout) |
6659 |
01 Apr 22 |
nicklas |
90 |
{ |
6659 |
01 Apr 22 |
nicklas |
91 |
ScriptBuilder script = new ScriptBuilder(); |
6659 |
01 Apr 22 |
nicklas |
92 |
List<UploadSourceWithPermission> files = job.getFiles(); |
6659 |
01 Apr 22 |
nicklas |
93 |
if (files.size() > 0) |
6659 |
01 Apr 22 |
nicklas |
94 |
{ |
6659 |
01 Apr 22 |
nicklas |
95 |
OpenGridCluster cluster = getHost(); |
6659 |
01 Apr 22 |
nicklas |
// Create working directory for the command and ensure it is empty |
6659 |
01 Apr 22 |
nicklas |
97 |
ScriptBuilder mkdir = new ScriptBuilder(); |
6659 |
01 Apr 22 |
nicklas |
98 |
String workFolder = cluster.getWorkFolder(job.getName()); |
6659 |
01 Apr 22 |
nicklas |
99 |
mkdir.cmd("rm -rf " + workFolder); |
6659 |
01 Apr 22 |
nicklas |
100 |
mkdir.cmd("mkdir -p " + workFolder); |
6659 |
01 Apr 22 |
nicklas |
101 |
|
6659 |
01 Apr 22 |
nicklas |
102 |
CmdResult<String> cmd = execute(new CmdResult<>(mkdir.toString()), 10); |
6659 |
01 Apr 22 |
nicklas |
103 |
if (cmd.getExitStatus() != 0) return cmd; |
6659 |
01 Apr 22 |
nicklas |
104 |
|
6659 |
01 Apr 22 |
nicklas |
// Upload the job script and files to the cluster |
6659 |
01 Apr 22 |
nicklas |
106 |
if (logger.isDebugEnabled()) |
6659 |
01 Apr 22 |
nicklas |
107 |
{ |
6659 |
01 Apr 22 |
nicklas |
108 |
logger.debug("Uploading files for job '" + job.getName() + "' to cluster " + cluster); |
6659 |
01 Apr 22 |
nicklas |
109 |
} |
6659 |
01 Apr 22 |
nicklas |
110 |
for (UploadSourceWithPermission upload : files) |
6659 |
01 Apr 22 |
nicklas |
111 |
{ |
6659 |
01 Apr 22 |
nicklas |
112 |
UploadSource file = upload.getUploadSource(); |
6659 |
01 Apr 22 |
nicklas |
113 |
String filePath = workFolder + "/" + OpenGrid.checkValidFilename(file.getName()); |
6659 |
01 Apr 22 |
nicklas |
114 |
uploadFile(file, filePath, upload.getFilePermission()); |
6659 |
01 Apr 22 |
nicklas |
115 |
} |
6659 |
01 Apr 22 |
nicklas |
116 |
script.cmd("cleanupOnExit() { rm -rf ${WD}; }"); |
6659 |
01 Apr 22 |
nicklas |
117 |
if (!job.getDebug()) |
6659 |
01 Apr 22 |
nicklas |
118 |
{ |
6659 |
01 Apr 22 |
nicklas |
119 |
script.cmd("trap cleanupOnExit EXIT"); |
6659 |
01 Apr 22 |
nicklas |
120 |
} |
6659 |
01 Apr 22 |
nicklas |
121 |
script.export("WD", workFolder); |
6659 |
01 Apr 22 |
nicklas |
122 |
script.cmd("cd ${WD}"); |
6659 |
01 Apr 22 |
nicklas |
123 |
} |
6659 |
01 Apr 22 |
nicklas |
124 |
script.cmd(job.getCmd()); |
6659 |
01 Apr 22 |
nicklas |
125 |
if (files.size() > 0) |
6659 |
01 Apr 22 |
nicklas |
126 |
{ |
6659 |
01 Apr 22 |
nicklas |
127 |
script.cmd("cd ~"); |
6659 |
01 Apr 22 |
nicklas |
128 |
} |
6659 |
01 Apr 22 |
nicklas |
129 |
return executeOnNode(script.toString(), node, timeout); |
6659 |
01 Apr 22 |
nicklas |
130 |
} |
4067 |
02 Sep 16 |
nicklas |
131 |
|
4212 |
08 Nov 16 |
nicklas |
132 |
/** |
4212 |
08 Nov 16 |
nicklas |
Get the current date and time of the cluster node. |
4212 |
08 Nov 16 |
nicklas |
134 |
*/ |
4212 |
08 Nov 16 |
nicklas |
135 |
public CmdResult<Date> getClusterDate() |
4212 |
08 Nov 16 |
nicklas |
136 |
{ |
4278 |
20 Dec 16 |
nicklas |
137 |
String dateCommand = getHost().getConfig().getDateCommand(); |
5984 |
10 Jul 20 |
nicklas |
138 |
return execute(new DateResult(dateCommand), 2); |
4067 |
02 Sep 16 |
nicklas |
139 |
} |
4212 |
08 Nov 16 |
nicklas |
140 |
|
4212 |
08 Nov 16 |
nicklas |
141 |
/** |
4257 |
30 Nov 16 |
nicklas |
Utility method for getting information about the cluster |
4257 |
30 Nov 16 |
nicklas |
hardware and operating system by executing the |
4257 |
30 Nov 16 |
nicklas |
{@link ClusterConfig#getHostInfoCommand()}. |
4257 |
30 Nov 16 |
nicklas |
145 |
*/ |
4257 |
30 Nov 16 |
nicklas |
146 |
public CmdResult<String> getHostInfo() |
4257 |
30 Nov 16 |
nicklas |
147 |
{ |
4278 |
20 Dec 16 |
nicklas |
148 |
return executeCmd(getHost().getConfig().getHostInfoCommand(), 2); |
4257 |
30 Nov 16 |
nicklas |
149 |
} |
4257 |
30 Nov 16 |
nicklas |
150 |
|
4257 |
30 Nov 16 |
nicklas |
151 |
/** |
4257 |
30 Nov 16 |
nicklas |
Utility method for getting information about the Open |
4257 |
30 Nov 16 |
nicklas |
Grid Scheduler software version by executing the |
4257 |
30 Nov 16 |
nicklas |
{@link ClusterConfig#getOpenGridInfoCommand()}. |
4257 |
30 Nov 16 |
nicklas |
155 |
*/ |
4257 |
30 Nov 16 |
nicklas |
156 |
public CmdResult<String> getOpenGridInfo() |
4257 |
30 Nov 16 |
nicklas |
157 |
{ |
4278 |
20 Dec 16 |
nicklas |
158 |
return executeCmd(getHost().getConfig().getOpenGridInfoCommand(), 2); |
4257 |
30 Nov 16 |
nicklas |
159 |
} |
4257 |
30 Nov 16 |
nicklas |
160 |
|
4257 |
30 Nov 16 |
nicklas |
161 |
/** |
4212 |
08 Nov 16 |
nicklas |
Get the time different in seconds between the BASE server and the remote |
4212 |
08 Nov 16 |
nicklas |
host. A positive value means the remote host is ahead of BASE, a negative value |
4212 |
08 Nov 16 |
nicklas |
that the remote host is behind. |
4212 |
08 Nov 16 |
nicklas |
NOTE! The value returned by this method is cached for the length of the session |
4212 |
08 Nov 16 |
nicklas |
166 |
*/ |
4212 |
08 Nov 16 |
nicklas |
167 |
public int getTimeAdjustment() |
4212 |
08 Nov 16 |
nicklas |
168 |
{ |
4212 |
08 Nov 16 |
nicklas |
169 |
if (localTimeAdjustment == null) |
4212 |
08 Nov 16 |
nicklas |
170 |
{ |
4212 |
08 Nov 16 |
nicklas |
171 |
Date clusterDate = getClusterDate().getResult(); |
4212 |
08 Nov 16 |
nicklas |
172 |
if (clusterDate != null) |
4212 |
08 Nov 16 |
nicklas |
173 |
{ |
4212 |
08 Nov 16 |
nicklas |
174 |
Date localDate = new Date(); |
4212 |
08 Nov 16 |
nicklas |
175 |
localTimeAdjustment = (int)((clusterDate.getTime() - localDate.getTime()) / 1000); |
4212 |
08 Nov 16 |
nicklas |
176 |
} |
4212 |
08 Nov 16 |
nicklas |
177 |
else |
4212 |
08 Nov 16 |
nicklas |
178 |
{ |
4212 |
08 Nov 16 |
nicklas |
179 |
localTimeAdjustment = 0; |
4212 |
08 Nov 16 |
nicklas |
180 |
} |
4212 |
08 Nov 16 |
nicklas |
181 |
if (logger.isDebugEnabled()) |
4212 |
08 Nov 16 |
nicklas |
182 |
{ |
4278 |
20 Dec 16 |
nicklas |
183 |
logger.debug("Time adjustment for cluster " + getHost() + ": " + localTimeAdjustment); |
4212 |
08 Nov 16 |
nicklas |
184 |
} |
4212 |
08 Nov 16 |
nicklas |
185 |
} |
4212 |
08 Nov 16 |
nicklas |
186 |
return localTimeAdjustment; |
4212 |
08 Nov 16 |
nicklas |
187 |
} |
4067 |
02 Sep 16 |
nicklas |
188 |
|
4212 |
08 Nov 16 |
nicklas |
189 |
|
4197 |
31 Oct 16 |
nicklas |
190 |
/** |
4203 |
02 Nov 16 |
nicklas |
Submit jobs to the cluster. If any of the jobs are connected to |
4203 |
02 Nov 16 |
nicklas |
BASE jobs a DbControl is required. The BASE jobs will be updated |
4203 |
02 Nov 16 |
nicklas |
with information about the jobs on the cluster. If the transaction |
4203 |
02 Nov 16 |
nicklas |
is not committed the jobs on the cluster will automatically be |
4203 |
02 Nov 16 |
nicklas |
aborted. |
4197 |
31 Oct 16 |
nicklas |
196 |
|
4203 |
02 Nov 16 |
nicklas |
@param dc An open DbControl connection. This is needed if any of the |
4203 |
02 Nov 16 |
nicklas |
job definitions are connected to a BASE job but can be null otherwise |
4203 |
02 Nov 16 |
nicklas |
@param jobs A list with job definitions that should be submitted to the |
4203 |
02 Nov 16 |
nicklas |
cluster |
4197 |
31 Oct 16 |
nicklas |
@return A list with job status information. The order |
4197 |
31 Oct 16 |
nicklas |
is the same as the order of the job definitions |
4197 |
31 Oct 16 |
nicklas |
203 |
*/ |
4212 |
08 Nov 16 |
nicklas |
204 |
public CmdResult<List<JobStatus>> qsub(DbControl dc, List<JobDefinition> jobs) |
4130 |
26 Sep 16 |
nicklas |
205 |
{ |
4197 |
31 Oct 16 |
nicklas |
206 |
if (logger.isDebugEnabled()) |
4197 |
31 Oct 16 |
nicklas |
207 |
{ |
4278 |
20 Dec 16 |
nicklas |
208 |
logger.debug("Submitting " + jobs.size() + " jobs to cluster: " + getHost()); |
4197 |
31 Oct 16 |
nicklas |
209 |
} |
4130 |
26 Sep 16 |
nicklas |
// Create working directories for all jobs and ensure they are empty |
4278 |
20 Dec 16 |
nicklas |
211 |
OpenGridCluster cluster = getHost(); |
4130 |
26 Sep 16 |
nicklas |
212 |
ScriptBuilder mkdir = new ScriptBuilder(); |
4130 |
26 Sep 16 |
nicklas |
213 |
for (JobDefinition job : jobs) |
4130 |
26 Sep 16 |
nicklas |
214 |
{ |
4270 |
16 Dec 16 |
nicklas |
215 |
String workFolder = cluster.getWorkFolder(job.getName()); |
4130 |
26 Sep 16 |
nicklas |
216 |
mkdir.cmd("rm -rf " + workFolder); |
4130 |
26 Sep 16 |
nicklas |
217 |
mkdir.cmd("mkdir -p " + workFolder); |
4130 |
26 Sep 16 |
nicklas |
218 |
} |
4130 |
26 Sep 16 |
nicklas |
219 |
|
5984 |
10 Jul 20 |
nicklas |
220 |
CmdResult<List<JobStatus>> cmd = execute(new CmdResult<>(mkdir.toString()), 10 + jobs.size()); |
4212 |
08 Nov 16 |
nicklas |
221 |
if (cmd.getExitStatus() != 0) return cmd; |
4212 |
08 Nov 16 |
nicklas |
222 |
|
4130 |
26 Sep 16 |
nicklas |
// Upload the job script and files to the cluster |
4130 |
26 Sep 16 |
nicklas |
// And generate the 'qsub' script for registering the jobs |
4130 |
26 Sep 16 |
nicklas |
225 |
ScriptBuilder qsub = new ScriptBuilder(); |
4130 |
26 Sep 16 |
nicklas |
226 |
for (JobDefinition job : jobs) |
4130 |
26 Sep 16 |
nicklas |
227 |
{ |
4197 |
31 Oct 16 |
nicklas |
228 |
if (logger.isDebugEnabled()) |
4197 |
31 Oct 16 |
nicklas |
229 |
{ |
4203 |
02 Nov 16 |
nicklas |
230 |
logger.debug("Uploading files for job '" + job.getName() + "' to cluster " + cluster); |
4197 |
31 Oct 16 |
nicklas |
231 |
} |
4197 |
31 Oct 16 |
nicklas |
232 |
|
4270 |
16 Dec 16 |
nicklas |
233 |
String workFolder = cluster.getWorkFolder(job.getName()); |
4270 |
16 Dec 16 |
nicklas |
234 |
String tmpFolder = cluster.getTmpFolder(job.getName(), job.getDebug()); |
4130 |
26 Sep 16 |
nicklas |
235 |
for (UploadSourceWithPermission upload : job.getFiles()) |
4130 |
26 Sep 16 |
nicklas |
236 |
{ |
4130 |
26 Sep 16 |
nicklas |
237 |
UploadSource file = upload.getUploadSource(); |
4130 |
26 Sep 16 |
nicklas |
238 |
String filePath = workFolder + "/" + OpenGrid.checkValidFilename(file.getName()); |
4130 |
26 Sep 16 |
nicklas |
239 |
uploadFile(file, filePath, upload.getFilePermission()); |
4130 |
26 Sep 16 |
nicklas |
240 |
} |
4130 |
26 Sep 16 |
nicklas |
241 |
|
5982 |
07 Jul 20 |
nicklas |
242 |
|
6614 |
28 Feb 22 |
nicklas |
243 |
JobSubmission jobSubmission = engine.createJobSubmission(this, job, workFolder, tmpFolder); |
5982 |
07 Jul 20 |
nicklas |
244 |
for (UploadSource file : jobSubmission.getJobScripts()) |
5982 |
07 Jul 20 |
nicklas |
245 |
{ |
5982 |
07 Jul 20 |
nicklas |
246 |
String filePath = workFolder + "/" + OpenGrid.checkValidFilename(file.getName()); |
5982 |
07 Jul 20 |
nicklas |
247 |
uploadFile(file, filePath, FilePermission.USER_RWX); |
5982 |
07 Jul 20 |
nicklas |
248 |
} |
5982 |
07 Jul 20 |
nicklas |
249 |
|
5982 |
07 Jul 20 |
nicklas |
250 |
qsub.cmd(jobSubmission.getSubmitCmd()); |
4130 |
26 Sep 16 |
nicklas |
251 |
} |
4130 |
26 Sep 16 |
nicklas |
252 |
|
4130 |
26 Sep 16 |
nicklas |
// Execute 'qsub' for all jobs |
5984 |
10 Jul 20 |
nicklas |
254 |
cmd = execute(new CmdResult<>(qsub.toString()), 10 + jobs.size()); |
4212 |
08 Nov 16 |
nicklas |
255 |
if (cmd.getExitStatus() != 0) return cmd; |
4130 |
26 Sep 16 |
nicklas |
256 |
|
4197 |
31 Oct 16 |
nicklas |
// Parse result from qsub which should be one line with the job id per job submitted |
4197 |
31 Oct 16 |
nicklas |
258 |
String[] jobIds = cmd.getStdout().split("\n"); |
4197 |
31 Oct 16 |
nicklas |
259 |
if (jobIds.length != jobs.size()) |
4197 |
31 Oct 16 |
nicklas |
260 |
{ |
4212 |
08 Nov 16 |
nicklas |
261 |
cmd.setException(new RuntimeException("Expected cluster to return "+ jobs.size() + " job-id:s but only got " + jobIds.length + ": " + Arrays.asList(jobIds))); |
4212 |
08 Nov 16 |
nicklas |
262 |
return cmd; |
4197 |
31 Oct 16 |
nicklas |
263 |
} |
4130 |
26 Sep 16 |
nicklas |
264 |
|
4197 |
31 Oct 16 |
nicklas |
265 |
long submissionTime = System.currentTimeMillis(); |
4130 |
26 Sep 16 |
nicklas |
266 |
List<JobStatus> status = new ArrayList<>(jobs.size()); |
4197 |
31 Oct 16 |
nicklas |
267 |
for (int jobNo = 0; jobNo < jobs.size(); ++jobNo) |
4130 |
26 Sep 16 |
nicklas |
268 |
{ |
4197 |
31 Oct 16 |
nicklas |
269 |
JobDefinition job = jobs.get(jobNo); |
4203 |
02 Nov 16 |
nicklas |
270 |
Job baseJob = job.getBaseJob(); |
4203 |
02 Nov 16 |
nicklas |
271 |
String clusterJobId = jobIds[jobNo].trim(); |
4197 |
31 Oct 16 |
nicklas |
272 |
|
4203 |
02 Nov 16 |
nicklas |
273 |
JobIdentifier jobId = new JobIdentifier(cluster.getId(), clusterJobId, baseJob); |
4203 |
02 Nov 16 |
nicklas |
274 |
|
4197 |
31 Oct 16 |
nicklas |
275 |
if (logger.isDebugEnabled()) |
4197 |
31 Oct 16 |
nicklas |
276 |
{ |
4197 |
31 Oct 16 |
nicklas |
277 |
logger.debug("Job submitted " + job.getName() + ": " + jobId); |
4197 |
31 Oct 16 |
nicklas |
278 |
} |
4197 |
31 Oct 16 |
nicklas |
279 |
|
4130 |
26 Sep 16 |
nicklas |
280 |
JobStatus jobStatus = new JobStatus(jobId); |
4197 |
31 Oct 16 |
nicklas |
281 |
jobStatus.setSubmitted(submissionTime, job.getName()); |
4130 |
26 Sep 16 |
nicklas |
282 |
status.add(jobStatus); |
4203 |
02 Nov 16 |
nicklas |
283 |
|
4203 |
02 Nov 16 |
nicklas |
284 |
if (baseJob != null) |
4203 |
02 Nov 16 |
nicklas |
285 |
{ |
4234 |
11 Nov 16 |
nicklas |
286 |
JobConfig jobConfig = job.getConfig(); |
4234 |
11 Nov 16 |
nicklas |
287 |
baseJob.setPriority(jobConfig.getBASEPriority()); |
4203 |
02 Nov 16 |
nicklas |
288 |
baseJob.setScheduled(jobId.getClusterId(), null); |
4203 |
02 Nov 16 |
nicklas |
289 |
baseJob.setExternalId(jobId.getClusterJobId()); |
4203 |
02 Nov 16 |
nicklas |
290 |
String signalURI = OpenGridSignalHandlerFactory.getSignalUri(jobId); |
4203 |
02 Nov 16 |
nicklas |
291 |
baseJob.setSignalTransporter(ExtensionSignalTransporter.class, signalURI); |
4203 |
02 Nov 16 |
nicklas |
292 |
dc.addTransactionalAction(new AbortJobIfTransactionFails(jobId)); |
4203 |
02 Nov 16 |
nicklas |
293 |
} |
4203 |
02 Nov 16 |
nicklas |
294 |
|
4130 |
26 Sep 16 |
nicklas |
295 |
} |
4130 |
26 Sep 16 |
nicklas |
296 |
|
4212 |
08 Nov 16 |
nicklas |
297 |
cmd.setResult(status); |
4212 |
08 Nov 16 |
nicklas |
298 |
return cmd; |
4130 |
26 Sep 16 |
nicklas |
299 |
} |
4130 |
26 Sep 16 |
nicklas |
300 |
|
4197 |
31 Oct 16 |
nicklas |
301 |
/** |
4197 |
31 Oct 16 |
nicklas |
Abort the job with the specified job id. |
4197 |
31 Oct 16 |
nicklas |
303 |
*/ |
4222 |
09 Nov 16 |
nicklas |
304 |
public CmdResult<String> qdel(JobIdentifier jobId) |
4197 |
31 Oct 16 |
nicklas |
305 |
{ |
4222 |
09 Nov 16 |
nicklas |
306 |
if (jobId == null) throw new NullPointerException("jobId"); |
4197 |
31 Oct 16 |
nicklas |
307 |
if (logger.isDebugEnabled()) |
4197 |
31 Oct 16 |
nicklas |
308 |
{ |
4222 |
09 Nov 16 |
nicklas |
309 |
logger.debug("Aborting job: " + jobId); |
4197 |
31 Oct 16 |
nicklas |
310 |
} |
5984 |
10 Jul 20 |
nicklas |
311 |
return engine.cancelJob(this, jobId); |
4197 |
31 Oct 16 |
nicklas |
312 |
} |
4197 |
31 Oct 16 |
nicklas |
313 |
|
4197 |
31 Oct 16 |
nicklas |
314 |
|
4212 |
08 Nov 16 |
nicklas |
315 |
/** |
5984 |
10 Jul 20 |
nicklas |
Issue a command, parse the result and return |
4222 |
09 Nov 16 |
nicklas |
the information as a JobStatus object. If the command |
4222 |
09 Nov 16 |
nicklas |
returns exit code 1 it means that the job was not found, |
4222 |
09 Nov 16 |
nicklas |
other non-zero exit code probably means that there is some |
4222 |
09 Nov 16 |
nicklas |
kind of error. A JobStatus instance is only returned if |
4222 |
09 Nov 16 |
nicklas |
the exit status is 0. |
4222 |
09 Nov 16 |
nicklas |
322 |
|
4222 |
09 Nov 16 |
nicklas |
Note! The cluster is only queried the first time this method is called which |
4222 |
09 Nov 16 |
nicklas |
will retrieve and cache information about all queued/running jobs. Calling |
4222 |
09 Nov 16 |
nicklas |
this method more than once will use the cached information. |
4222 |
09 Nov 16 |
nicklas |
326 |
|
4222 |
09 Nov 16 |
nicklas |
@param jobId Identifier for the job |
4212 |
08 Nov 16 |
nicklas |
@param useLocalTimeAdjustment TRUE if the times from the cluster should be adjusted |
4212 |
08 Nov 16 |
nicklas |
to match local time on this server |
4212 |
08 Nov 16 |
nicklas |
330 |
*/ |
4222 |
09 Nov 16 |
nicklas |
331 |
public CmdResult<JobStatus> qstat(JobIdentifier jobId, boolean useLocalTimeAdjustment) |
4212 |
08 Nov 16 |
nicklas |
332 |
{ |
4222 |
09 Nov 16 |
nicklas |
333 |
if (jobId == null) throw new NullPointerException("jobId"); |
4212 |
08 Nov 16 |
nicklas |
334 |
|
5984 |
10 Jul 20 |
nicklas |
335 |
int timeAdjustment = useLocalTimeAdjustment ? -getTimeAdjustment() : 0; |
5984 |
10 Jul 20 |
nicklas |
336 |
return engine.getStatusInQueue(this, jobId, timeAdjustment); |
4222 |
09 Nov 16 |
nicklas |
337 |
} |
4222 |
09 Nov 16 |
nicklas |
338 |
|
4222 |
09 Nov 16 |
nicklas |
339 |
|
4222 |
09 Nov 16 |
nicklas |
340 |
/** |
5984 |
10 Jul 20 |
nicklas |
Issue a command to find out information about |
4222 |
09 Nov 16 |
nicklas |
a job that has ended. The result is parsed and returned |
4222 |
09 Nov 16 |
nicklas |
as a JobStatus object. If the command |
4222 |
09 Nov 16 |
nicklas |
returns exit code 1 it means that the job was not found, |
4222 |
09 Nov 16 |
nicklas |
other non-zero exit code probably means that there is some |
4222 |
09 Nov 16 |
nicklas |
kind of error. A JobStatus instance is only returned if |
4222 |
09 Nov 16 |
nicklas |
the exit status is 0. |
4222 |
09 Nov 16 |
nicklas |
348 |
|
4222 |
09 Nov 16 |
nicklas |
@param jobId Identifier for the job |
4222 |
09 Nov 16 |
nicklas |
@param useLocalTimeAdjustment TRUE if the times from the cluster should be adjusted |
4222 |
09 Nov 16 |
nicklas |
to match local time on this server |
4222 |
09 Nov 16 |
nicklas |
352 |
*/ |
4222 |
09 Nov 16 |
nicklas |
353 |
public CmdResult<JobStatus> qacct(JobIdentifier jobId, boolean useLocalTimeAdjustment) |
4222 |
09 Nov 16 |
nicklas |
354 |
{ |
4222 |
09 Nov 16 |
nicklas |
355 |
if (jobId == null) throw new NullPointerException("jobId"); |
4222 |
09 Nov 16 |
nicklas |
356 |
|
5984 |
10 Jul 20 |
nicklas |
357 |
int timeAdjustment = useLocalTimeAdjustment ? -getTimeAdjustment() : 0; |
5984 |
10 Jul 20 |
nicklas |
358 |
return engine.getStatusIfFinished(this, jobId, timeAdjustment); |
4222 |
09 Nov 16 |
nicklas |
359 |
} |
4222 |
09 Nov 16 |
nicklas |
360 |
|
4222 |
09 Nov 16 |
nicklas |
361 |
/** |
4270 |
16 Dec 16 |
nicklas |
Read the 'progress' information for the job and update job status instance. |
4222 |
09 Nov 16 |
nicklas |
@param jobStatus The job status instance to update with the progress |
4222 |
09 Nov 16 |
nicklas |
information |
4301 |
13 Jan 17 |
nicklas |
@return A CmdResult with the same JobStatus instance in the result |
4222 |
09 Nov 16 |
nicklas |
366 |
*/ |
4229 |
10 Nov 16 |
nicklas |
367 |
public CmdResult<JobStatus> readProgress(JobStatus jobStatus) |
4222 |
09 Nov 16 |
nicklas |
368 |
{ |
4278 |
20 Dec 16 |
nicklas |
369 |
String progressFile = getHost().getWorkFolder(jobStatus.getName())+"/progress"; |
5984 |
10 Jul 20 |
nicklas |
370 |
CmdResult<JobStatus> progress = execute(new CmdResult<>("cat " + progressFile), 5); |
4229 |
10 Nov 16 |
nicklas |
371 |
progress.setResult(jobStatus); |
4222 |
09 Nov 16 |
nicklas |
372 |
if (progress.getExitStatus() == 0) |
4222 |
09 Nov 16 |
nicklas |
373 |
{ |
4222 |
09 Nov 16 |
nicklas |
374 |
jobStatus.readFromProgress(progress.getStdout()); |
4222 |
09 Nov 16 |
nicklas |
375 |
} |
4222 |
09 Nov 16 |
nicklas |
376 |
return progress; |
4212 |
08 Nov 16 |
nicklas |
377 |
} |
4222 |
09 Nov 16 |
nicklas |
378 |
|
4229 |
10 Nov 16 |
nicklas |
379 |
/** |
4270 |
16 Dec 16 |
nicklas |
Read the 'stderr' file and update the message in the job status instance. |
4229 |
10 Nov 16 |
nicklas |
@param jobStatus The job status instance to update with the progress |
4229 |
10 Nov 16 |
nicklas |
information |
4301 |
13 Jan 17 |
nicklas |
@return A CmdResult with the same JobStatus instance in the result |
4229 |
10 Nov 16 |
nicklas |
384 |
*/ |
4229 |
10 Nov 16 |
nicklas |
385 |
public CmdResult<JobStatus> readStderr(JobStatus jobStatus) |
4229 |
10 Nov 16 |
nicklas |
386 |
{ |
4278 |
20 Dec 16 |
nicklas |
387 |
String stderrFile = getHost().getWorkFolder(jobStatus.getName())+"/stderr"; |
5984 |
10 Jul 20 |
nicklas |
388 |
CmdResult<JobStatus> stderr = execute(new CmdResult<>("cat " + stderrFile), 5); |
4229 |
10 Nov 16 |
nicklas |
389 |
stderr.setResult(jobStatus); |
4229 |
10 Nov 16 |
nicklas |
390 |
jobStatus.setMessage(stderr.getExitStatus() == 0 ? stderr.getStdout() : stderr.getStderr()); |
4229 |
10 Nov 16 |
nicklas |
391 |
return stderr; |
4229 |
10 Nov 16 |
nicklas |
392 |
} |
4229 |
10 Nov 16 |
nicklas |
393 |
|
4270 |
16 Dec 16 |
nicklas |
394 |
/** |
4270 |
16 Dec 16 |
nicklas |
Utility method for downloading a result file from a job as a text file. |
4270 |
16 Dec 16 |
nicklas |
396 |
|
4270 |
16 Dec 16 |
nicklas |
@param jobName The name of the job |
4270 |
16 Dec 16 |
nicklas |
@param filename The name of the file |
4270 |
16 Dec 16 |
nicklas |
@param encoding Encoding to use (UTF-8 is recommended, but it depends on the program the created it) |
4270 |
16 Dec 16 |
nicklas |
@return The file data as a string |
4270 |
16 Dec 16 |
nicklas |
401 |
*/ |
4270 |
16 Dec 16 |
nicklas |
402 |
public String getJobFileAsString(String jobName, String filename, String encoding) |
4270 |
16 Dec 16 |
nicklas |
403 |
{ |
4278 |
20 Dec 16 |
nicklas |
404 |
String path = getHost().getWorkFolder(jobName) + "/" + filename; |
4270 |
16 Dec 16 |
nicklas |
405 |
ByteArrayDownloadTarget target = new ByteArrayDownloadTarget(filename); |
4270 |
16 Dec 16 |
nicklas |
406 |
downloadFile(path, target); |
4270 |
16 Dec 16 |
nicklas |
407 |
return target.getString(encoding); |
4270 |
16 Dec 16 |
nicklas |
408 |
} |
5984 |
10 Jul 20 |
nicklas |
409 |
|
4229 |
10 Nov 16 |
nicklas |
410 |
|
5984 |
10 Jul 20 |
nicklas |
411 |
|
4067 |
02 Sep 16 |
nicklas |
412 |
} |