source: trunk/gridctl.php@ 40

Last change on this file since 40 was 40, checked in by gegorbet, 6 years ago

more time information in error,completion emails

File size: 32.0 KB
RevLine 
<?php

// gridctl.php start-up: load the site configuration, open the global GFAC
// database and read every row of the analysis table.  The per-row work is
// done by the loop that follows this section.

// Locate the LIMS bin directory and pull in the shared configuration.
// listen-config.php is expected to define $dbhost, $guser, $gpasswd, $gDB,
// $self, $class_dir and helpers such as write_log() (not visible here).
$us3bin = exec( "ls -d ~us3/lims/bin" );
include_once "$us3bin/listen-config.php";
//include "$us3bin/cleanup_aira.php";
//include "$us3bin/cleanup_gfac.php";

// Global variables
$gfac_message = "";
$updateTime   = 0;
$submittime   = 0;
$cluster      = '';

//global $self;
global $status_ex, $status_gw;

// Produce some output temporarily, so cron will send me message
$now = time();
echo "Time started: " . date( 'Y-m-d H:i:s', $now ) . "\n";

// Get data from global GFAC DB
$gLink = mysqli_connect( $dbhost, $guser, $gpasswd, $gDB );

if ( ! $gLink )
{
   // There is no link object when the connection fails, so the reason has
   // to come from mysqli_connect_error(); mysqli_error() needs a valid link.
   write_log( "$self: Could not select DB $gDB - " . mysqli_connect_error() );
   //mail_to_admin( "fail", "Internal Error: Could not select DB $gDB" );
   mail_to_admin( "fail",
      "Internal Error: Could not select DB $gDB $dbhost $guser " );
   //sleep(300);
   sleep(3);
   exit();
}

// Column order matters: the processing loop unpacks the row positionally.
$query  = "SELECT gfacID, us3_db, cluster, status, queue_msg, " .
          "UNIX_TIMESTAMP(time), time from analysis";
$result = mysqli_query( $gLink, $query );

if ( ! $result )
{
   write_log( "$self: Query failed $query - " . mysqli_error( $gLink ) );
   mail_to_admin( "fail", "Query failed $query\n" . mysqli_error( $gLink ) );
   exit();
}

if ( mysqli_num_rows( $result ) == 0 )
{
//write_log( "$self: analysis read got numrows==0" );
   exit();  // Nothing to do
}
//write_log( "$loghdr gfac-analysis rows $nrows" );

// Non-zero when this copy of gridctl runs from a class_devel directory
$me_devel = preg_match( "/class_devel/", $class_dir );
//echo "me_devel=$me_devel class_dir=$class_dir\n";
[14]55
// Walk every analysis row.  The list() targets are ordinary globals; the
// handler functions below pick them up through `global` declarations, so
// the names must not be changed.  Positional mapping of the SELECT:
// $time receives UNIX_TIMESTAMP(time) and $updateTime the raw time column.
while ( list( $gfacID, $us3_db, $cluster, $status, $queue_msg, $time, $updateTime )
            = mysqli_fetch_array( $result ) )
{
   // If this entry does not match class/class_devel, skip processing
//echo " gfacID=$gfacID gf_status=$status\n";

   if ( preg_match( "/US3-A/i", $gfacID ) )
   {  // For thrift, job and gridctl must match
      $job_devel = preg_match( "/US3-ADEV/i", $gfacID );
//echo " THR: job_devel=$job_devel\n";
      if ( (  $me_devel  &&  !$job_devel )  ||
           ( !$me_devel  &&   $job_devel ) )
      {  // Job type and Airavata server mismatch: skip processing
         continue;
      }
   }

   else if ( $me_devel )
   {  // Local (us3iab/-local) and class_devel: skip processing
//echo " LOC: me_devel=$me_devel\n";
      continue;
   }

   // Checking we need to do for each entry
echo "us3db=$us3_db gfid=$gfacID\n";
//write_log( " us3db=$us3_db gfid=$gfacID" );
   // Only the listed database names assign $serviceURL here; any other
   // name leaves it at whatever value it already has.
   // NOTE(review): the leading 'X' on each name presumably disables these
   // entries - confirm.
   switch ( $us3_db )
   {
      case 'Xuslims3_cauma3' :
      case 'Xuslims3_cauma3d' :
      case 'Xuslims3_HHU' :
      case 'Xuslims3_Uni_KN' :
         $serviceURL = "http://gridfarm005.ucs.indiana.edu:9090/ogce-rest/job";
         break;

      default :
//         $serviceURL = "http://gridfarm005.ucs.indiana.edu:8080/ogce-rest/job";
         break;
   }

//   $awork = array();
//   $awork = explode( "-", $gfacID );
//   $gfacLabl = $awork[0] . "-" . $awork[1] . "-" . $awork[2];
   $gfacLabl = $gfacID;
   // Prefix used by the log messages written for this job
   $loghdr = $self . ":" . $gfacLabl . "...:";
   $status_ex = $status;

   // If entry is for Airvata/Thrift, get the true current status

   if ( is_aira_job( $gfacID ) )
   {
      $status_in = $status;
//write_log( "$loghdr status_in=$status_in" );
      $status = aira_status( $gfacID, $status_in );
//echo "$loghdr status_in=$status_in status_ex=$status\n";
if($status != $status_in )
      write_log( "$loghdr Set to $status from $status_in" );
//write_log( "$loghdr aira status=$status" );
   }
   else if ( is_gfac_job( $gfacID ) )
   {
      // A stored COMPLETE status takes precedence over the queried status
      $status_gw = $status;
      $status = get_gfac_status( $gfacID );
      //if ( $status == 'FINISHED' )
      if ( $status_gw == 'COMPLETE' )
         $status = $status_gw;
//echo "$loghdr status_gw=$status_gw status=$status\n";
//write_log( "$loghdr non-AThrift status=$status status_gw=$status_gw" );
   }
   else
   {
//write_log( "$loghdr Local gfacID=$gfacID" );
      // Local job: keep the stored status when it is already COMPLETE or
      // when the local status lookup answers UNKNOWN
      $status_gw = $status;
      $status = get_local_status( $gfacID );
      if ( $status_gw == 'COMPLETE' || $status == 'UNKNOWN' )
         $status = $status_gw;
//echo "$loghdr status_lo=$status\n";
//write_log( "$loghdr Local status=$status status_gw=$status_gw" );
   }

   // Sometimes during testing, the us3_db entry is not set
   // If $status == 'ERROR' then the condition has been processed before
   if ( strlen( $us3_db ) == 0 && $status != 'ERROR' )
   {
      write_log( "$loghdr GFAC DB is NULL - $gfacID" );
      mail_to_admin( "fail", "GFAC DB is NULL\n$gfacID" );

      $query2 = "UPDATE analysis SET status='ERROR' WHERE gfacID='$gfacID'";
      $result2 = mysqli_query( $gLink, $query2 );
      $status = 'ERROR';

      if ( ! $result2 )
         write_log( "$loghdr Query failed $query2 - " . mysqli_error( $gLink ) );

   }

//echo " st=$status\n";
//write_log( "$loghdr switch status=$status" );
   // Dispatch on the effective status.  The handlers that take $time are
   // given the UNIX timestamp of the row's time column.
   switch ( $status )
   {
      // Already been handled
      // Later update this condition to search for gfacID?
      case "ERROR":
         cleanup();
         break;

      case "SUBMITTED":
         submitted( $time );
         break;

      case "SUBMIT_TIMEOUT":
         submit_timeout( $time );
         break;

      case "RUNNING":
      case "STARTED":
      case "STAGING":
      case "ACTIVE":
         running( $time );
         break;

      case "RUN_TIMEOUT":
         run_timeout($time );
         break;

      case "DATA":
      case "RESULTS_GEN":
         wait_data( $time );
         break;

      case "DATA_TIMEOUT":
         data_timeout( $time );
         break;

      case "COMPLETED":
      case "COMPLETE":
//write_log( "$loghdr COMPLETE gfacID=$gfacID" );
         complete();
         break;

      case "CANCELLED":
      case "CANCELED":
      case "FAILED":
         failed();
         break;

      // FINISHED/DONE: only non-Airavata jobs are completed here.  There is
      // no break, so control falls through to the empty default below.
      case "FINISHED":
      case "DONE":
         if ( ! is_aira_job( $gfacID ) )
         {
            complete();
         }
write_log( "$loghdr FINISHED gfacID=$gfacID" );
      case "PROCESSING":
      default:
         break;
   }
}
mysqli_close( $gLink );

exit();
217
function submitted( $updatetime )
{
   // Handler for a job whose stored status is SUBMITTED.
   //   $updatetime : UNIX timestamp compared against the current time
   // Less than 10 minutes old: nothing is done.  Less than 24 hours old:
   // the job status is re-queried and stored if the job has left the
   // queue.  Otherwise the job is flagged SUBMIT_TIMEOUT.
   global $self;
   global $gLink;
   global $gfacID;
   global $loghdr;

   $elapsed = time() - $updatetime;

   if ( $elapsed < 600 ) return;           // < 10 minutes ago

   if ( $elapsed >= 86400 )
   {  // Submitted for 24 hours or more: flag the time-out
      $message = "Job listed submitted longer than 24 hours";
      write_log( "$self: $message - id: $gfacID" );
      mail_to_admin( "hang", "$message - id: $gfacID" );

      $sql = "UPDATE analysis SET status='SUBMIT_TIMEOUT' WHERE gfacID='$gfacID'";
      if ( ! mysqli_query( $gLink, $sql ) )
         write_log( "$self: Query failed $sql - " . mysqli_error( $gLink ) );

      update_queue_messages( $message );
      update_db( $message );
      return;
   }

   // Within the first 24 hours: ask GFAC first, then the local cluster
   $job_status = get_gfac_status( $gfacID );
   if ( $job_status === false )
      $job_status = get_local_status( $gfacID );

   if ( $job_status == 'GFAC_STATUS_UNAVAILABLE' )
      return;

   $still_queued = in_array( $job_status,
                             array( 'SUBMITTED', 'INITIALIZED', 'PENDING' ) );
   if ( $still_queued )
      return;

write_log( "$loghdr submitted:job_status=$job_status" );
   update_job_status( $job_status, $gfacID );
}
258
function submit_timeout( $updatetime )
{
   // Handler for a job whose stored status is SUBMIT_TIMEOUT.
   //   $updatetime : UNIX timestamp compared against the current time
   // If the job has left the queued states its new status is stored.
   // If it is still queued 24 hours after $updatetime it is marked FAILED.
   global $self;
   global $gLink;
   global $gfacID;
   global $loghdr;

   // Ask GFAC first, then the local cluster
   $job_status = get_gfac_status( $gfacID );
   if ( $job_status === false )
      $job_status = get_local_status( $gfacID );

   if ( $job_status == 'GFAC_STATUS_UNAVAILABLE' )
      return;

   $queued_states = array( 'SUBMITTED', 'INITIALIZED', 'PENDING' );
   if ( ! in_array( $job_status, $queued_states ) )
   {
      update_job_status( $job_status, $gfacID );
      return;
   }

   // < 24 hours ago ( 48 total submitted )
   if ( time() - $updatetime < 86400 ) return;

   $message = "Job listed submitted longer than 48 hours";
   write_log( "$self: $message - id: $gfacID" );
   mail_to_admin( "hang", "$message - id: $gfacID" );

   $sql = "UPDATE analysis SET status='FAILED' WHERE gfacID='$gfacID'";
   if ( ! mysqli_query( $gLink, $sql ) )
      write_log( "$self: Query failed $sql - " . mysqli_error( $gLink ) );

   update_queue_messages( $message );
   update_db( $message );
}
294
function running( $updatetime )
{
   // Handler for a job whose status is one of the running states.
   //   $updatetime : UNIX timestamp compared against the current time
   // get_us3_data() is always called first.  Less than 10 minutes old:
   // nothing more is done.  Less than 24 hours old: the status is
   // re-queried and stored if the job is no longer running.  Otherwise the
   // job is flagged RUN_TIMEOUT.
   global $self;
   global $gLink;
   global $gfacID;
   global $loghdr;

   // Sample the clock before the database round trip
   $elapsed = time() - $updatetime;

   get_us3_data();

   if ( $elapsed < 600 ) return;   // message received < 10 minutes ago

   if ( $elapsed >= 86400 )
   {  // Running for 24 hours or more: flag the time-out
      $message = "Job listed running longer than 24 hours";
      write_log( "$self: $message - id: $gfacID" );
      mail_to_admin( "hang", "$message - id: $gfacID" );

      $sql = "UPDATE analysis SET status='RUN_TIMEOUT' WHERE gfacID='$gfacID'";
      if ( ! mysqli_query( $gLink, $sql ) )
         write_log( "$self: Query failed $sql - " . mysqli_error( $gLink ) );

      update_queue_messages( $message );
      update_db( $message );
      return;
   }

   // Within the first 24 hours: ask GFAC first, then the local cluster
   $job_status = get_gfac_status( $gfacID );
   if ( $job_status === false )
      $job_status = get_local_status( $gfacID );

   if ( $job_status == 'GFAC_STATUS_UNAVAILABLE' )
      return;

   $running_states = array( 'ACTIVE', 'RUNNING', 'STARTED' );
   if ( ! in_array( $job_status, $running_states ) )
      update_job_status( $job_status, $gfacID );
}
334
function run_timeout( $updatetime )
{
   // Handler for a job whose stored status is RUN_TIMEOUT.
   //   $updatetime : UNIX timestamp compared against the current time
   // If the job is no longer in a running state its new status is stored.
   // If it is still running 48 hours after $updatetime it is marked FAILED.
   global $self;
   global $gLink;
   global $gfacID;
   global $loghdr;

   // Ask GFAC first, then the local cluster
   $job_status = get_gfac_status( $gfacID );
   if ( $job_status === false )
      $job_status = get_local_status( $gfacID );

   if ( $job_status == 'GFAC_STATUS_UNAVAILABLE' )
      return;

   $running_states = array( 'ACTIVE', 'RUNNING', 'STARTED' );
   if ( ! in_array( $job_status, $running_states ) )
   {
      update_job_status( $job_status, $gfacID );
      return;
   }

   // Sample the clock before the database round trip
   $elapsed = time() - $updatetime;

   get_us3_data();

   if ( $elapsed < 172800 ) return;   // < 48 hours ago

   $message = "Job listed running longer than 48 hours";
   write_log( "$self: $message - id: $gfacID" );
   mail_to_admin( "hang", "$message - id: $gfacID" );

   $sql = "UPDATE analysis SET status='FAILED' WHERE gfacID='$gfacID'";
   if ( ! mysqli_query( $gLink, $sql ) )
      write_log( "$self: Query failed $sql - " . mysqli_error( $gLink ) );

   update_queue_messages( $message );
   update_db( $message );
}
372
function wait_data( $updatetime )
{
   // Handler for a job that is waiting for its result data.
   //   $updatetime : UNIX timestamp compared against the current time
   // During the first hour the status is re-checked and, on minutes that
   // are a multiple of five, get_gfac_outputs() is called.  After an hour
   // the job is flagged DATA_TIMEOUT.
   global $self;
   global $gLink;
   global $gfacID;
   global $loghdr;

   $waited = time() - $updatetime;

   if ( $waited >= 3600 )
   {  // Waiting for an hour or more: flag the time-out
      $message = "Waiting for data longer than 1 hour";
      write_log( "$self: $message - id: $gfacID" );
      mail_to_admin( "hang", "$message - id: $gfacID" );

      $sql = "UPDATE analysis SET status='DATA_TIMEOUT' WHERE gfacID='$gfacID'";
      if ( ! mysqli_query( $gLink, $sql ) )
         write_log( "$self: Query failed $sql - " . mysqli_error( $gLink ) );

      update_queue_messages( $message );
      update_db( $message );
      return;
   }

   // Within the first hour: ask GFAC first, then the local cluster
   $job_status = get_gfac_status( $gfacID );
   if ( $job_status === false )
      $job_status = get_local_status( $gfacID );

   if ( $job_status == 'GFAC_STATUS_UNAVAILABLE' )
      return;

   if ( $job_status != 'DATA' )
   {
      update_job_status( $job_status, $gfacID );
      return;
   }

   // Request to resend data, but only request every 5 minutes
   $minute = (int) date( 'i' );
   if ( $minute % 5 != 0 ) return;

   $output_status = get_gfac_outputs( $gfacID );

   if ( $output_status !== false )
      mail_to_admin( "debug", "wait_data/$gfacID/$output_status" );
}
420
function data_timeout( $updatetime )
{
   // Handler for a job whose stored status is DATA_TIMEOUT.
   //   $updatetime : UNIX timestamp compared against the current time
   // If the job is no longer in the DATA state its new status is stored.
   // For the first 24 hours get_gfac_outputs() is called on minutes that
   // are a multiple of fifteen; after that the job is marked FAILED.
   global $self;
   global $gLink;
   global $gfacID;
   global $loghdr;

   // Ask GFAC first, then the local cluster
   $job_status = get_gfac_status( $gfacID );
   if ( $job_status === false )
      $job_status = get_local_status( $gfacID );

   if ( $job_status == 'GFAC_STATUS_UNAVAILABLE' )
      return;

   if ( $job_status != 'DATA' )
   {
      update_job_status( $job_status, $gfacID );
      return;
   }

   $waited = time() - $updatetime;

   if ( $waited >= 86400 )
   {  // Waiting for 24 hours or more: give up
      $message = "Waiting for data longer than 24 hours";
      write_log( "$self: $message - id: $gfacID" );
      mail_to_admin( "hang", "$message - id: $gfacID" );

      $sql = "UPDATE analysis SET status='FAILED' WHERE gfacID='$gfacID'";
      if ( ! mysqli_query( $gLink, $sql ) )
         write_log( "$self: Query failed $sql - " . mysqli_error( $gLink ) );

      update_queue_messages( $message );
      update_db( $message );
      return;
   }

   // Request to resend data, but only request every 15 minutes
   $minute = (int) date( 'i' );
   if ( $minute % 15 != 0 ) return;

   $output_status = get_gfac_outputs( $gfacID );

   if ( $output_status !== false )
      mail_to_admin( "debug", "data_timeout/$gfacID/$output_status" );
}
468
function complete()
{
   // A completed job needs no work beyond the common clean-up step,
   // which operates on the job held in the global $gfacID.
   cleanup();
}
474
function failed()
{
   // A failed or cancelled job is handled exactly like a completed one:
   // the common clean-up step is all that is run.
   cleanup();
}
480
function cleanup()
{
   // Run the clean-up routine for the job held in the global $gfacID.
   // Nothing happens unless the job still has a row in the analysis table
   // and get_us3_data() yields a non-zero request ID.  Airavata jobs
   // (ID contains "US3-A") go to aira_cleanup(), and only when the job and
   // this server are both production or both devel; every other job goes
   // to gfac_cleanup().  Both routines are defined outside this file.
   global $self;
   global $gLink;
   global $gfacID;
   global $us3_db;
   global $loghdr;
   global $class_dir;

   // Double check that the gfacID exists
   $sql    = "SELECT count(*) FROM analysis WHERE gfacID='$gfacID'";
   $answer = mysqli_query( $gLink, $sql );

   if ( ! $answer )
   {
      write_log( "$self: Query failed $sql - " . mysqli_error( $gLink ) );
      mail_to_admin( "fail", "Query failed $sql\n" . mysqli_error( $gLink ) );
      return;
   }

   list( $count ) = mysqli_fetch_array( $answer );
   if ( $count == 0 ) return;

   // Now check the us3 instance
   $requestID = get_us3_data();
   if ( $requestID == 0 ) return;

   if ( ! preg_match( "/US3-A/i", $gfacID ) )
   {  // Non-airavata job: clean up in a non-aira way
write_log( "$loghdr calling gfac_cleanup() reqID=$requestID" );
      gfac_cleanup( $us3_db, $requestID, $gLink );
      return;
   }

   // Airavata job: clean up if prod/devel match
   $server_is_devel = (bool) preg_match( "/class_devel/", $class_dir );
   $job_is_devel    = (bool) preg_match( "/US3-ADEV/i", $gfacID );

   if ( $server_is_devel == $job_is_devel )
   {  // Job is of same type (prod/devel) as Server: process it
      aira_cleanup( $us3_db, $requestID, $gLink );
   }
}
532
533// Function to update status of job
function update_job_status( $job_status, $gfacID )
{
   // Store a freshly reported job status in the analysis table and record
   // a matching message.
   //   $job_status : status string reported for the job
   //   $gfacID     : ID of the job whose row is updated
   // Variants of a status are folded onto the single value kept in the
   // table.  EXECUTING only produces a message; it has no table status,
   // so no UPDATE is issued for it.  Statuses whose message is 'NONE'
   // update the table without queueing a message.
   global $gLink;
   global $self;
   global $loghdr;

   // Status to write into analysis.status; null means "leave the row alone".
   // The query is built locally on purpose: sharing it through a global let
   // a status without a query re-run whatever statement ran last.
   $new_status = null;

   switch ( $job_status )
   {
      case 'SUBMITTED'   :
      case 'SUBMITED'    :
      case 'INITIALIZED' :
      case 'UPDATING'    :
      case 'PENDING'     :
         $new_status = 'SUBMITTED';
         $message    = "Job status request reports job is SUBMITTED";
         break;

      case 'STARTED'     :
      case 'RUNNING'     :
      case 'ACTIVE'      :
         $new_status = 'RUNNING';
         $message    = "Job status request reports job is RUNNING";
         break;

      case 'EXECUTING'   :
         $message    = "Job status request reports job is EXECUTING";
         break;

      case 'FINISHED'    :
         $new_status = 'FINISHED';
         $message    = "NONE";
         break;

      case 'DONE'        :
         $new_status = 'DONE';
         $message    = "NONE";
         break;

      case 'COMPLETED'   :
      case 'COMPLETE'    :
         $new_status = 'COMPLETE';
         $message    = "Job status request reports job is COMPLETED";
         break;

      case 'DATA'        :
         $new_status = 'DATA';
         $message    = "Job status request reports job is COMPLETE, waiting for data";
         break;

      case 'CANCELED'    :
      case 'CANCELLED'   :
         $new_status = 'CANCELED';
         $message    = "Job status request reports job is CANCELED";
         break;

      case 'FAILED'      :
         $new_status = 'FAILED';
         $message    = "Job status request reports job is FAILED";
         break;

      case 'UNKNOWN'     :
write_log( "$loghdr job_status='UNKNOWN', reset to 'ERROR' " );
         $new_status = 'ERROR';
         $message    = "Job status request reports job is not in the queue";
         break;

      default            :
         // We shouldn't ever get here
         $new_status = 'ERROR';
         $message    = "Job status was not recognized - $job_status";
         write_log( "$loghdr update_job_status: " .
                    "Job status was not recognized - $job_status\n" .
                    "gfacID = $gfacID\n" );
         break;
   }

   if ( $new_status !== null )
   {
      // The ID is escaped because it is spliced into the statement text
      $query  = "UPDATE analysis SET status='$new_status' WHERE gfacID='" .
                mysqli_real_escape_string( $gLink, $gfacID ) . "'";
      $result = mysqli_query( $gLink, $query );

      if ( ! $result )
         write_log( "$loghdr Query failed $query - " . mysqli_error( $gLink ) );
   }

   if ( $message != 'NONE' )
   {
      update_queue_messages( $message );
      update_db( $message );
   }
}
622
function get_us3_data()
{
   // Look up the current job (global $gfacID) in the HPCAnalysisResult
   // table of its own database (global $us3_db).
   // Returns the HPCAnalysisRequestID of the matching row, or 0 when the
   // connection or query fails or no row matches.
   // Side effect: the global $updateTime is set to the row's updateTime as
   // a UNIX timestamp, or to 0 when no row matches.
   global $self;
   global $gfacID;
   global $dbhost;
   global $user;
   global $passwd;
   global $us3_db;
   global $updateTime;
   global $loghdr;

   $us3_link = mysqli_connect( $dbhost, $user, $passwd, $us3_db );

   if ( ! $us3_link )
   {
      // The password is deliberately kept out of the log file
      write_log( "$loghdr could not connect: $dbhost, $user, $us3_db" );
      mail_to_admin( "fail", "Could not connect to $dbhost : $us3_db" );
      return 0;
   }

   $query  = "SELECT HPCAnalysisRequestID, UNIX_TIMESTAMP(updateTime) " .
             "FROM HPCAnalysisResult WHERE gfacID='" .
             mysqli_real_escape_string( $us3_link, $gfacID ) . "'";
   $result = mysqli_query( $us3_link, $query );

   if ( ! $result )
   {
      // Fetch the error text before the link is closed
      $error = mysqli_error( $us3_link );
      mysqli_close( $us3_link );

      write_log( "$self: Query failed $query - " . $error );
      mail_to_admin( "fail", "Query failed $query\n" . $error );
      return 0;
   }

   $row = mysqli_fetch_array( $result );
   mysqli_free_result( $result );
   mysqli_close( $us3_link );

   if ( ! $row )
   {  // No result row for this job: report "none" without leaving nulls
      $updateTime = 0;
      return 0;
   }

   list( $requestID, $updateTime ) = $row;

   return $requestID;
}
659
[6]660// Function to determine if this is a gfac job or not
function is_gfac_job( $gfacID )
{
   // Returns true when the ID has one of the two GFAC forms:
   //   "US3-Experiment..."                       (matched case-insensitively)
   //   "US3-" followed by an 8-4-4-4-12 hex group ID (prefix case-sensitive)
   // and false for anything else.
   $hex = "[0-9a-fA-F]";

   if ( preg_match( "/^US3-Experiment/i", $gfacID ) )
      return true;

   if ( preg_match( "/^US3-$hex{8}-$hex{4}-$hex{4}-$hex{4}-$hex{12}$/", $gfacID ) )
      return true;

   // Then it's not a GFAC job
   return false;
}
673
[6]674// Function to determine if this is an airavata/thrift job or not
function is_aira_job( $gfacID )
{
   // Returns true when the ID contains "US3-A" anywhere (case-insensitive),
   // which marks an Airavata/Thrift job; false otherwise.
   return preg_match( "/US3-A/i", $gfacID ) ? true : false;
}
687
[1]688// Function to get the current job status from GFAC
function get_gfac_status( $gfacID )
{
   // Obtain the current status of a job.
   //   $gfacID : ID of the job
   // Returns
   //   - for an Airavata job: the experiment status, passed through
   //     standard_status() (the global $status_ex is overwritten)
   //   - false when the ID is neither an Airavata nor a GFAC job
   //   - 'GFAC_STATUS_UNAVAILABLE' when the HTTP request throws
   //   - otherwise the status parsed from the service's XML reply, with
   //     DONE / DATA / RESULTS_GEN all reported as 'DATA'
   global $serviceURL;
   global $self;
   global $loghdr;
   global $cluster;
   global $status_ex, $status_gw;

   if ( is_aira_job( $gfacID ) )
   {
      $status_ex = getExperimentStatus( $gfacID );

      // EXECUTING is split according to the status held in $status_gw
      if ( $status_ex == 'EXECUTING' )
         $status_ex = ( $status_gw == 'RUNNING' ) ? 'ACTIVE' : 'QUEUED';

      return standard_status( $status_ex );
   }

   if ( ! is_gfac_job( $gfacID ) )
      return false;

   $url = "$serviceURL/jobstatus/$gfacID";
   try
   {
      $request  = new HttpRequest( $url, HttpRequest::METH_GET );
      $response = $request->send();
      $xml      = $request->getResponseBody();
   }
   catch ( HttpException $e )
   {
      write_log( "$loghdr Status not available - marking failed - $gfacID" );
      return 'GFAC_STATUS_UNAVAILABLE';
   }

   // Parse the result
   $gfac_status = parse_response( $xml );

   // This may not seem like the best place to do this, but here we have
   // the xml straight from GFAC
   $known_statuses = array(
      'SUBMITTED', 'SUBMITED',  'INITIALIZED', 'PENDING',
      'RUNNING',   'ACTIVE',    'STARTED',     'COMPLETED',
      'FINISHED',  'DONE',      'DATA',        'RESULTS_GEN',
      'CANCELED',  'CANCELLED', 'FAILED',      'STAGING',
      'UNKNOWN' );

   if ( ! in_array( $gfac_status, $known_statuses ) )
   {  // Unexpected status: send the raw reply to the administrator
      mail_to_admin( 'debug', "gfacID: /$gfacID/\n" .
                              "XML:    /$xml/\n"    .
                              "Status: /$gfac_status/\n" );
   }

   $data_states = array( 'DONE', 'DATA', 'RESULTS_GEN' );

   return in_array( $gfac_status, $data_states ) ? 'DATA' : $gfac_status;
}
763
764// Function to request data outputs from GFAC
function get_gfac_outputs( $gfacID )
{
   // Ask the GFAC service to register the outputs of a job.
   //   $gfacID : ID of the job
   // Returns
   //   - the local status when get_gfac_status() reports false
   //   - false when the job status is not DONE, FAILED, COMPLETE or
   //     FINISHED, or when the HTTP request throws
   //   - otherwise the status parsed from the service's XML reply
   global $serviceURL;
   global $self;

   // Make sure it's a GFAC job and status is appropriate for this call
   $job_status = get_gfac_status( $gfacID );

   if ( $job_status === false )
   {  // Then it's not a GFAC job
      return get_local_status( $gfacID );
   }

   $requestable = array( 'DONE', 'FAILED', 'COMPLETE', 'FINISHED' );
   if ( ! in_array( $job_status, $requestable ) )
   {  // Then it's not appropriate to request data
      return false;
   }

   $url = "$serviceURL/registeroutput/$gfacID";
   try
   {
      $request  = new HttpRequest( $url, HttpRequest::METH_GET );
      $response = $request->send();
      $xml      = $request->getResponseBody();
   }
   catch ( HttpException $e )
   {
      write_log( "$self: Data not available - request failed - $gfacID" );
      return false;
   }

   // Temporary, to see what the xml looks like, if we ever get one
   mail_to_admin( "debug", "get_gfac_outputs/\n$xml/" );

   // Parse the result
   return parse_response( $xml );
}
805
function parse_response( $xml )
{
   // Extract the job status from an XML reply.
   //   $xml : XML document as a string
   // Returns the text of the <status> element, or "" when there is none or
   // when $xml is empty / cannot be opened.
   // Side effect: the global $gfac_message is reset to "" and then set to
   // the last text found inside any element other than <status>.
   global $gfac_message;

   $status       = "";
   $gfac_message = "";

   // XMLReader refuses an empty document, so treat it as "no status"
   if ( ! is_string( $xml ) || trim( $xml ) == '' )
      return $status;

   $parser = new XMLReader();
   if ( ! $parser->xml( $xml ) )
      return $status;

   // Name of the most recently opened element; text is attributed to it
   $name = "";

   while( $parser->read() )
   {
      $type = $parser->nodeType;

      if ( $type == XMLReader::ELEMENT )
         $name = $parser->name;

      else if ( $type == XMLReader::TEXT )
      {
         if ( $name == "status" )
            $status       = $parser->value;
         else
            $gfac_message = $parser->value;
      }
   }

   $parser->close();
   return $status;
}
835
836// Function to get status from local cluster
function get_local_status( $gfacID )
{
   // Query the batch system of the job's cluster (global $cluster) and
   // translate the scheduler's state code into a gateway status.
   //   $gfacID : job ID handed to squeue / qstat
   // Returns one of 'ACTIVE', 'COMPLETED', 'SUBMITTED', 'CANCELED',
   // 'FAILED' or 'UNKNOWN'.
   // squeue is used when the cluster name contains "jetstream", qstat
   // otherwise.  Unless the cluster name contains "us3iab" the command is
   // run through ssh as user us3 on the cluster host.
   global $cluster;
   global $self;

   $is_jetstr = preg_match( "/jetstream/", $cluster );
   if ( $is_jetstr )
      $cmd    = "squeue -j $gfacID 2>&1|tail -n 1";
   else
      $cmd    = "/usr/bin/qstat -a $gfacID 2>&1|tail -n 1";

   if ( ! preg_match( "/us3iab/", $cluster ) )
   {
      $system = "$cluster.uthscsa.edu";
      if ( $is_jetstr )
         $system = "$cluster";
      $system = preg_replace( "/\-local/", "", $system );
      $cmd    = "/usr/bin/ssh -x us3@$system " . $cmd;
   }

   $result = exec( $cmd );

   $secwait = 2;
   $num_try = 0;
   // Sleep and retry up to 3 times if ssh has "ssh_exchange_identification" error
   while ( preg_match( "/ssh_exchange_id/", $result )  &&  $num_try < 3 )
   {
      sleep( $secwait );
      $num_try++;
      $secwait *= 2;
      write_log( "$self: num_try=$num_try secwait=$secwait" );

      // Run the command again; without this the wait could never help
      $result = exec( $cmd );
   }

   if ( preg_match( "/^qstat: Unknown/", $result ) ||
        preg_match( "/ssh_exchange_id/", $result ) )
   {
      write_log( "$self get_local_status: Local job $gfacID unknown result=$result" );
      return 'UNKNOWN';
   }

   // The state code is the 10th whitespace-separated field of the qstat
   // line and the 6th of the squeue line.  A line that is too short is
   // read as an empty code, which is handled as a vanished job below.
   $values = preg_split( "/\s+/", $result );
   $column = ( $is_jetstr == 0 ) ? 9 : 5;
   $jstat  = isset( $values[ $column ] ) ? $values[ $column ] : '';

   switch ( $jstat )
   {
      case "W" :                      // Waiting for execution time to be reached
      case "E" :                      // Job is exiting after having run
      case "R" :                      // Still running
      case "CG" :                     // Job is completing
         $status = 'ACTIVE';
         break;

      case "C" :                      // Job has completed
      case "ST" :                     // Job has disappeared
      case "CD" :                     // Job has completed
         $status = 'COMPLETED';
         break;

      case "T" :                      // Job is being moved
      case "H" :                      // Held
      case "Q" :                      // Queued
      case "PD" :                     // Queued
      case "CF" :                     // Queued
         $status = 'SUBMITTED';
         break;

      case "CA" :                     // Job has been canceled
         $status = 'CANCELED';
         break;

      case "F" :                      // Job has failed
      case "BF" :                     // Job has failed
      case "NF" :                     // Job has failed
      case "TO" :                     // Job has timed out
      case "" :                       // Job has disappeared
         $status = 'FAILED';
         break;

      default :
         $status = 'UNKNOWN';         // This should not occur
         break;
   }

   return $status;
}
927
function update_queue_messages( $message )
{
   // Append a message to the queue_messages table for the current job
   // (global $gfacID).
   //   $message : text to store; it is escaped before use
   // Nothing is inserted when the job has no row in the analysis table or
   // when a query fails; both situations are written to the log.
   global $self;
   global $gLink;
   global $gfacID;

   // Get analysis table ID
   $query  = "SELECT id FROM analysis " .
             "WHERE gfacID = '" . mysqli_real_escape_string( $gLink, $gfacID ) . "' ";
   $result = mysqli_query( $gLink, $query );
   if ( ! $result )
   {
      write_log( "$self: Query failed $query - " . mysqli_error( $gLink ) );
      return;
   }

   $row = mysqli_fetch_array( $result );
   if ( ! $row )
   {  // Without an analysis row the message would have no owner
      write_log( "$self: No analysis entry for gfacID $gfacID - message not queued" );
      return;
   }
   $analysisID = $row[ 0 ];

   // Insert message into queue_message table
   $query  = "INSERT INTO queue_messages SET " .
             "message = '" . mysqli_real_escape_string( $gLink, $message ) . "', " .
             "analysisID = '$analysisID' ";
   $result = mysqli_query( $gLink, $query );
   if ( ! $result )
   {
      write_log( "$self: Query failed $query - " . mysqli_error( $gLink ) );
      return;
   }
}
956
function update_db( $message )
{
   // Store a message as lastMessage of the current job (global $gfacID) in
   // the HPCAnalysisResult table of the job's own database (global $us3_db).
   //   $message : text to store; it is escaped before use
   // Returns 0 when the database connection fails; otherwise nothing.
   global $self;
   global $gfacID;
   global $dbhost;
   global $user;
   global $passwd;
   global $us3_db;

   $us3_link = mysqli_connect( $dbhost, $user, $passwd, $us3_db );

   if ( ! $us3_link )
   {
      // The password is deliberately kept out of the log file
      write_log( "$self: could not connect: $dbhost, $user, $us3_db" );
      mail_to_admin( "fail", "Could not connect to $dbhost : $us3_db" );
      return 0;
   }

   $query = "UPDATE HPCAnalysisResult SET " .
            "lastMessage='" . mysqli_real_escape_string( $us3_link, $message ) . "' " .
            "WHERE gfacID = '" . mysqli_real_escape_string( $us3_link, $gfacID ) . "' ";

   if ( ! mysqli_query( $us3_link, $query ) )
      write_log( "$self: Query failed $query - " . mysqli_error( $us3_link ) );

   mysqli_close( $us3_link );
}
982
function mail_to_admin( $type, $msg )
{
   // Send an error notification e-mail to the address in $admin_email.
   //   $type : category tag supplied by the caller ("fail", "hang",
   //           "debug"); it is not used anywhere in this function
   //   $msg  : text placed after "Error Message : " at the end of the mail
   // The body also reports the globals $updateTime, $status and $cluster
   // as they stand at the moment of the call.
   global $updateTime;
   global $status;
   global $cluster;
   global $org_name;
   global $admin_email;
   global $dbhost;
   // NOTE(review): no code in this file assigns a global $requestID
   // (cleanup() only has a local one) - confirm it is set elsewhere.
   global $requestID;

   $headers  = "From: $org_name Admin<$admin_email>"     . "\n";
   $headers .= "Cc: $org_name Admin<$admin_email>"       . "\n";
   $headers .= "Bcc: Gary Gorbet<gegorbet@gmail.com>"    . "\n";     // make sure

   // Set the reply address
   $headers .= "Reply-To: $org_name<$admin_email>"      . "\n";
   $headers .= "Return-Path: $org_name<$admin_email>"   . "\n";

   // Try to avoid spam filters
   $now      = time();
   // Current wall-clock time, shown next to the update time in the body
   $tnow     = date( 'Y-m-d H:i:s' );
   // $requestID is appended after the closing ">" of the Message-ID value
   $headers .= "Message-ID: <" . $now . "gridctl@$dbhost>$requestID\n";
   $headers .= "X-Mailer: PHP v" . phpversion()         . "\n";
   $headers .= "MIME-Version: 1.0"                      . "\n";
   $headers .= "Content-Transfer-Encoding: 8bit"        . "\n";

   $subject       = "US3 Error Notification";
   // The line breaks and leading blanks inside this literal are part of
   // the mail body
   $message       = "
 UltraScan job error notification from gridctl.php:

 Update Time : $updateTime [ now=$tnow ]
 GFAC Status : $status
 Cluster : $cluster
 ";

   $message .= "Error Message : $msg\n";

   mail( $admin_email, $subject, $message, $headers );
}
[6]1022
1023// Convert a status string to one of the standard DB status strings
function standard_status( $status_in )
{
   // Map a status string onto one of the standard DB status strings.
   //   $status_in : status as reported by a service or stored in the DB
   // Returns the standard value for every known variation; any other
   // value (including the ones that are already standard) is returned
   // unchanged.  Note that 'COMPLETED' becomes 'COMPLETE' while
   // 'COMPLETE' becomes 'DONE'.

   // Standard value => variations that map onto it.  The groups are
   // tried in this order and compared loosely, first match wins.
   $variations = array(
      'SUBMITTED' => array( 'QUEUED', 'LAUNCHED', 'CREATED', 'VALIDATED',
                            'SCHEDULED', 'submitted', 'SUBMITTED', '' ),
      'RUNNING'   => array( 'EXECUTING', 'ACTIVE', 'running', 'executing' ),
      'UPDATING'  => array( 'PENDING', 'CANCELING' ),
      'CANCELED'  => array( 'CANCELLED', 'canceled' ),
      'COMPLETE'  => array( 'COMPLETED', 'completed' ),
      'FAILED'    => array( 'FAILED_DATA', 'SUBMIT_TIMEOUT',
                            'RUN_TIMEOUT', 'DATA_TIMEOUT' ),
      'DONE'      => array( 'COMPLETE' ),
      'ERROR'     => array( 'UNKNOWN' )
   );

   foreach ( $variations as $standard => $aliases )
   {
      if ( in_array( $status_in, $aliases ) )
         return $standard;
   }

   // Where already standard value, retain value
   return $status_in;
}
1096
function aira_status( $gfacID, $status_in )
{
   // Work out the effective status of an Airavata job by combining the
   // status stored in the analysis table with the experiment status.
   //   $gfacID    : ID of the job
   //   $status_in : status currently stored for the job
   // Returns the resulting status.  $status_in is returned unchanged when
   // the ID is not an Airavata ID or when the job and this server are not
   // both production or both devel.  When the result differs from
   // $status_in it is also stored through update_job_status().
   // $status_gw and $status_ex are local here; the globals of the same
   // names are not touched by this function.
   global $self;
   global $loghdr;
   global $class_dir;
//echo "a_st: st_in$status_in : $gfacID\n";
   //$status_gw = standard_status( $status_in );
   $status_gw = $status_in;
//echo "a_st:  st_db=$status_gw\n";
   $status    = $status_gw;
   $me_devel  = preg_match( "/class_devel/", $class_dir );
   $job_devel = preg_match( "/US3-ADEV/i", $gfacID );
   // True when job and server are of the same kind (both prod or both devel)
   $devmatch  = ( ( !$me_devel  &&  !$job_devel )  ||
                  (  $me_devel  &&   $job_devel ) );

   if ( preg_match( "/US3-A/i", $gfacID )  &&  $devmatch )
   {
//write_log( "$loghdr status_in=$status_in status=$status gfacID=$gfacID" );
      $status_ex = getExperimentStatus( $gfacID );
//write_log( "$loghdr status_ex=$status_ex" );

      if ( $status_ex == 'COMPLETED' )
      {  // Experiment is COMPLETED: check for 'FINISHED' or 'DONE'
         if ( $status_gw == 'FINISHED'  ||  $status_gw == 'DONE' )
         {  // COMPLETED + FINISHED/DONE : gateway status is now COMPLETE
            $status = 'COMPLETE';
         }

         else
         {  // COMPLETED + NOT-FINISHED/DONE: gw status now DONE
            $status = 'DONE';
         }
      }

      else if ( $status_gw == 'FINISHED'  ||  $status_gw == 'DONE' )
      {  // Gfac status == FINISHED/DONE:  leave as is (unless FAILED)
         $status = $status_gw;
         if ( $status_ex == 'FAILED' )
         {
            // The experiment status is re-read after 10 and, if still
            // FAILED, after 20 seconds.  The retries only affect what is
            // logged: the status becomes COMPLETE whatever they report.
            sleep( 10 );
            $status_ex = getExperimentStatus( $gfacID );
            if ( $status_ex == 'FAILED' )
            {
               write_log( "$loghdr status still 'FAILED' after 10-second delay" );
               sleep( 10 );
               $status_ex = getExperimentStatus( $gfacID );
               if ( $status_ex == 'FAILED' )
                  write_log( "$loghdr status still 'FAILED' after 20-second delay" );
               else
                  write_log( "$loghdr status is $status_ex after 20-second delayed retry" );
            }
            write_log( "$loghdr status reset to 'COMPLETE'" );
            $status = 'COMPLETE';
         }
      }

      else if ( $status_ex == 'EXECUTING' )
      {
         // While the experiment is EXECUTING the stored status is kept,
         // only normalised through standard_status()
         $status = standard_status( $status_gw );
write_log( "$loghdr status/_in/_gw/_ex=$status/$status_in/$status_gw/$status_ex" );
      }

      else
      {  // Experiment not COMPLETED/FINISHED/DONE: use experiment status
         $status = standard_status( $status_ex );
      }

//if ( $status != 'SUBMITTED' )
//write_log( "$loghdr status/_in/_gw/_ex=$status/$status_in/$status_gw/$status_ex" );
      if ( $status != $status_gw )
      {
         update_job_status( $status, $gfacID );
      }
   }

   return $status;
}
1174
[1]1175?>
Note: See TracBrowser for help on using the repository browser.