source: trunk/gridctl.php

Last change on this file was 41, checked in by gegorbet, 6 years ago

gridctl mods for uslims.uleth.ca demeler3 cluster

File size: 32.2 KB
RevLine 
<?php

// gridctl.php -- cron-driven monitor for the global GFAC "analysis" table.
//
// This first section loads the site configuration, opens the GFAC database
// and reads every row of the analysis table.  The settings used below
// ($dbhost, $guser, $gpasswd, $gDB, $self, $class_dir) and the helper
// write_log() are not defined in this file; presumably they come from
// listen-config.php -- confirm against that file.

$us3bin = exec( "ls -d ~us3/lims/bin" );
include_once "$us3bin/listen-config.php";
//include "$us3bin/cleanup_aira.php";
//include "$us3bin/cleanup_gfac.php";
//include "$us3bin/cleanup.php";

// Global variables
$gfac_message = "";
$updateTime   = 0;
$submittime   = 0;
$cluster      = '';

//global $self;
global $status_ex, $status_gw;

// Produce some output temporarily, so cron will send me message
$now = time();
echo "Time started: " . date( 'Y-m-d H:i:s', $now ) . "\n";

// Get data from global GFAC DB
$gLink = mysqli_connect( $dbhost, $guser, $gpasswd, $gDB );

if ( ! $gLink )
{
  // A failed connect yields no link object, so the reason must be read with
  // mysqli_connect_error(); mysqli_error() requires a valid link.
  write_log( "$self: Could not select DB $gDB - " . mysqli_connect_error() );
  mail_to_admin( "fail",
     "Internal Error: Could not select DB $gDB $dbhost $guser " );
  sleep(3);
  exit();
}

// The column order matters: the main loop unpacks the rows positionally.
$query  = "SELECT gfacID, us3_db, cluster, status, queue_msg, " .
          "UNIX_TIMESTAMP(time), time from analysis";
$result = mysqli_query( $gLink, $query );

if ( ! $result )
{
  write_log( "$self: Query failed $query - " . mysqli_error( $gLink ) );
  mail_to_admin( "fail", "Query failed $query\n" . mysqli_error( $gLink ) );
  exit();
}

if ( mysqli_num_rows( $result ) == 0 )
{
  exit();  // Nothing to do
}

// Non-zero when this copy of the script lives in a "class_devel" tree
$me_devel = preg_match( "/class_devel/", $class_dir );
[14]57
// Main loop: examine each row of the GFAC analysis table and dispatch on its
// status.  The row is unpacked positionally, so $time receives
// UNIX_TIMESTAMP(time) and $updateTime the raw time column.  The loop
// variables are globals that the handler functions below read.
while ( list( $gfacID, $us3_db, $cluster, $status, $queue_msg, $time, $updateTime )
            = mysqli_fetch_array( $result ) )
{
  if ( preg_match( "/US3-A/i", $gfacID ) )
  { // For thrift, job and gridctl must match (both devel or both non-devel)
    $job_devel = preg_match( "/US3-ADEV/i", $gfacID );

    if ( (bool) $me_devel !== (bool) $job_devel )
      continue;   // Job type and Airavata server mismatch: skip processing
  }
  else if ( $me_devel )
  { // Local (us3iab/-local) and class_devel: skip processing
    continue;
  }

  // Checking we need to do for each entry
  echo "us3db=$us3_db gfid=$gfacID\n";

  // Only the listed databases get a service URL here.
  // NOTE(review): for any other database $serviceURL keeps whatever value it
  // had before (possibly unset) -- confirm this is intended.
  $rest_dbs = array( 'Xuslims3_cauma3', 'Xuslims3_cauma3d',
                     'Xuslims3_HHU', 'Xuslims3_Uni_KN' );
  if ( in_array( $us3_db, $rest_dbs ) )
  {
    $serviceURL = "http://gridfarm005.ucs.indiana.edu:9090/ogce-rest/job";
  }

  $gfacLabl  = $gfacID;
  $loghdr    = "{$self}:{$gfacLabl}...:";
  $status_ex = $status;

  // Replace the stored status by the job's true current status
  if ( is_aira_job( $gfacID ) )
  { // Airavata/Thrift entry
    // NOTE(review): $status_gw is not assigned on this path, yet
    // get_gfac_status() reads it for Airavata jobs -- it may hold a value
    // left over from an earlier row.  Verify.
    $status_in = $status;
    $status    = aira_status( $gfacID, $status_in );

    if ( $status != $status_in )
      write_log( "$loghdr Set to $status from $status_in" );
  }
  else
  {
    $status_gw = $status;

    if ( is_gfac_job( $gfacID ) )
    {
      $status = get_gfac_status( $gfacID );

      if ( $status_gw == 'COMPLETE' )
        $status = $status_gw;
    }
    else
    { // Local job
      $status = get_local_status( $gfacID );

      if ( $status_gw == 'COMPLETE' || $status == 'UNKNOWN' )
        $status = $status_gw;
    }
  }

  // Sometimes during testing, the us3_db entry is not set
  // If $status == 'ERROR' then the condition has been processed before
  if ( strlen( $us3_db ) == 0 && $status != 'ERROR' )
  {
    write_log( "$loghdr GFAC DB is NULL - $gfacID" );
    mail_to_admin( "fail", "GFAC DB is NULL\n$gfacID" );

    $query2  = "UPDATE analysis SET status='ERROR' WHERE gfacID='$gfacID'";
    $result2 = mysqli_query( $gLink, $query2 );
    $status  = 'ERROR';

    if ( ! $result2 )
      write_log( "$loghdr Query failed $query2 - " . mysqli_error( $gLink ) );
  }

  // Dispatch to the handler for the (possibly refreshed) status
  switch ( $status )
  {
    // Already been handled
    // Later update this condition to search for gfacID?
    case "ERROR":
      cleanup();
      break;

    case "SUBMITTED":
      submitted( $time );
      break;

    case "SUBMIT_TIMEOUT":
      submit_timeout( $time );
      break;

    case "RUNNING":
    case "STARTED":
    case "STAGING":
    case "ACTIVE":
      running( $time );
      break;

    case "RUN_TIMEOUT":
      run_timeout( $time );
      break;

    case "DATA":
    case "RESULTS_GEN":
      wait_data( $time );
      break;

    case "DATA_TIMEOUT":
      data_timeout( $time );
      break;

    case "COMPLETED":
    case "COMPLETE":
      complete();
      break;

    case "CANCELLED":
    case "CANCELED":
    case "FAILED":
      failed();
      break;

    case "FINISHED":
    case "DONE":
      // Airavata jobs in these states are only logged here
      if ( ! is_aira_job( $gfacID ) )
      {
        complete();
      }
      write_log( "$loghdr FINISHED gfacID=$gfacID" );
      break;

    case "PROCESSING":
    default:
      break;
  }
}

mysqli_close( $gLink );

exit();
219
// Handler for a job whose status is SUBMITTED.
//
// $updatetime is a Unix timestamp.  Nothing is done for the first 10
// minutes after it.  During the rest of the first 24 hours the job's actual
// status is looked up and, if it is no longer a queued state, written back
// through update_job_status().  After 24 hours the analysis row is set to
// SUBMIT_TIMEOUT, the admin is mailed, and the message is recorded via
// update_queue_messages() and update_db().
function submitted( $updatetime )
{
  global $self;
  global $gLink;
  global $gfacID;
  global $loghdr;

  $elapsed = time() - $updatetime;

  if ( $elapsed < 600 ) return;            // < 10 minutes ago

  if ( $elapsed < 86400 )                  // Within the first 24 hours
  {
    // get_gfac_status() answers false for jobs it does not handle
    $job_status = get_gfac_status( $gfacID );
    if ( $job_status === false )
      $job_status = get_local_status( $gfacID );

    if ( $job_status == 'GFAC_STATUS_UNAVAILABLE' )
      return;

    $queued_states = array( 'SUBMITTED', 'INITIALIZED', 'PENDING' );
    if ( ! in_array( $job_status, $queued_states ) )
    {
      write_log( "$loghdr submitted:job_status=$job_status" );
      update_job_status( $job_status, $gfacID );
    }

    return;
  }

  $message = "Job listed submitted longer than 24 hours";
  write_log( "$self: $message - id: $gfacID" );
  mail_to_admin( "hang", "$message - id: $gfacID" );

  $query  = "UPDATE analysis SET status='SUBMIT_TIMEOUT' WHERE gfacID='$gfacID'";
  $result = mysqli_query( $gLink, $query );

  if ( ! $result )
    write_log( "$self: Query failed $query - " . mysqli_error( $gLink ) );

  update_queue_messages( $message );
  update_db( $message );
}
260
// Handler for a job whose status is SUBMIT_TIMEOUT.
//
// The job's actual status is looked up first; if it has left the queued
// states it is written back through update_job_status().  Otherwise, once
// $updatetime (a Unix timestamp) is at least 24 hours old, the row is set to
// FAILED, the admin is mailed and the message is recorded in both databases.
function submit_timeout( $updatetime )
{
  global $self;
  global $gLink;
  global $gfacID;
  global $loghdr;

  // get_gfac_status() answers false for jobs it does not handle
  $job_status = get_gfac_status( $gfacID );
  if ( $job_status === false )
    $job_status = get_local_status( $gfacID );

  if ( $job_status == 'GFAC_STATUS_UNAVAILABLE' )
    return;

  $queued_states = array( 'SUBMITTED', 'INITIALIZED', 'PENDING' );
  if ( ! in_array( $job_status, $queued_states ) )
  {
    update_job_status( $job_status, $gfacID );
    return;
  }

  // < 24 hours ago ( 48 total submitted )
  if ( time() - $updatetime < 86400 ) return;

  $message = "Job listed submitted longer than 48 hours";
  write_log( "$self: $message - id: $gfacID" );
  mail_to_admin( "hang", "$message - id: $gfacID" );

  $query  = "UPDATE analysis SET status='FAILED' WHERE gfacID='$gfacID'";
  $result = mysqli_query( $gLink, $query );

  if ( ! $result )
    write_log( "$self: Query failed $query - " . mysqli_error( $gLink ) );

  update_queue_messages( $message );
  update_db( $message );
}
296
// Handler for a job whose status is one of the running states.
//
// get_us3_data() is always called first.  Nothing more is done for the first
// 10 minutes after $updatetime (a Unix timestamp).  During the rest of the
// first 24 hours the actual status is looked up and, if it is no longer a
// running state, written back through update_job_status().  After 24 hours
// the row is set to RUN_TIMEOUT, the admin is mailed and the message is
// recorded in both databases.
function running( $updatetime )
{
  global $self;
  global $gLink;
  global $gfacID;
  global $loghdr;

  $elapsed = time() - $updatetime;

  get_us3_data();

  if ( $elapsed < 600 ) return;            // message received < 10 minutes ago

  if ( $elapsed < 86400 )                  // Within the first 24 hours
  {
    // get_gfac_status() answers false for jobs it does not handle
    $job_status = get_gfac_status( $gfacID );
    if ( $job_status === false )
      $job_status = get_local_status( $gfacID );

    if ( $job_status == 'GFAC_STATUS_UNAVAILABLE' )
      return;

    $running_states = array( 'ACTIVE', 'RUNNING', 'STARTED' );
    if ( ! in_array( $job_status, $running_states ) )
      update_job_status( $job_status, $gfacID );

    return;
  }

  $message = "Job listed running longer than 24 hours";
  write_log( "$self: $message - id: $gfacID" );
  mail_to_admin( "hang", "$message - id: $gfacID" );

  $query  = "UPDATE analysis SET status='RUN_TIMEOUT' WHERE gfacID='$gfacID'";
  $result = mysqli_query( $gLink, $query );

  if ( ! $result )
    write_log( "$self: Query failed $query - " . mysqli_error( $gLink ) );

  update_queue_messages( $message );
  update_db( $message );
}
336
// Handler for a job whose status is RUN_TIMEOUT.
//
// The job's actual status is looked up first; if it has left the running
// states it is written back through update_job_status().  Otherwise
// get_us3_data() is called and, once $updatetime (a Unix timestamp) is at
// least 48 hours old, the row is set to FAILED, the admin is mailed and the
// message is recorded in both databases.
function run_timeout( $updatetime )
{
  global $self;
  global $gLink;
  global $gfacID;
  global $loghdr;

  // get_gfac_status() answers false for jobs it does not handle
  $job_status = get_gfac_status( $gfacID );
  if ( $job_status === false )
    $job_status = get_local_status( $gfacID );

  if ( $job_status == 'GFAC_STATUS_UNAVAILABLE' )
    return;

  $running_states = array( 'ACTIVE', 'RUNNING', 'STARTED' );
  if ( ! in_array( $job_status, $running_states ) )
  {
    update_job_status( $job_status, $gfacID );
    return;
  }

  $elapsed = time() - $updatetime;

  get_us3_data();

  if ( $elapsed < 172800 ) return;         // < 48 hours ago

  $message = "Job listed running longer than 48 hours";
  write_log( "$self: $message - id: $gfacID" );
  mail_to_admin( "hang", "$message - id: $gfacID" );

  $query  = "UPDATE analysis SET status='FAILED' WHERE gfacID='$gfacID'";
  $result = mysqli_query( $gLink, $query );

  if ( ! $result )
    write_log( "$self: Query failed $query - " . mysqli_error( $gLink ) );

  update_queue_messages( $message );
  update_db( $message );
}
374
// Handler for a job that is waiting for its result data.
//
// Within the first hour after $updatetime (a Unix timestamp) the actual
// status is looked up; anything other than DATA is written back through
// update_job_status().  While it stays DATA, the outputs are re-requested,
// but only on wall-clock minutes divisible by 5.  After one hour the row is
// set to DATA_TIMEOUT, the admin is mailed and the message is recorded in
// both databases.
function wait_data( $updatetime )
{
  global $self;
  global $gLink;
  global $gfacID;
  global $loghdr;

  if ( time() - $updatetime < 3600 )       // Within the first hour
  {
    // get_gfac_status() answers false for jobs it does not handle
    $job_status = get_gfac_status( $gfacID );
    if ( $job_status === false )
      $job_status = get_local_status( $gfacID );

    if ( $job_status == 'GFAC_STATUS_UNAVAILABLE' )
      return;

    if ( $job_status != 'DATA' )
    {
      update_job_status( $job_status, $gfacID );
      return;
    }

    // Request to resend data, but only request every 5 minutes
    $minute = (int) date( 'i' );
    if ( $minute % 5 ) return;

    $output_status = get_gfac_outputs( $gfacID );

    if ( $output_status !== false )
      mail_to_admin( "debug", "wait_data/$gfacID/$output_status" );

    return;
  }

  $message = "Waiting for data longer than 1 hour";
  write_log( "$self: $message - id: $gfacID" );
  mail_to_admin( "hang", "$message - id: $gfacID" );

  $query  = "UPDATE analysis SET status='DATA_TIMEOUT' WHERE gfacID='$gfacID'";
  $result = mysqli_query( $gLink, $query );

  if ( ! $result )
    write_log( "$self: Query failed $query - " . mysqli_error( $gLink ) );

  update_queue_messages( $message );
  update_db( $message );
}
422
// Handler for a job whose status is DATA_TIMEOUT.
//
// The actual status is looked up first; anything other than DATA is written
// back through update_job_status().  While it stays DATA and $updatetime (a
// Unix timestamp) is less than 24 hours old, the outputs are re-requested on
// wall-clock minutes divisible by 15.  After 24 hours the row is set to
// FAILED, the admin is mailed and the message is recorded in both databases.
function data_timeout( $updatetime )
{
  global $self;
  global $gLink;
  global $gfacID;
  global $loghdr;

  // get_gfac_status() answers false for jobs it does not handle
  $job_status = get_gfac_status( $gfacID );
  if ( $job_status === false )
    $job_status = get_local_status( $gfacID );

  if ( $job_status == 'GFAC_STATUS_UNAVAILABLE' )
    return;

  if ( $job_status != 'DATA' )
  {
    update_job_status( $job_status, $gfacID );
    return;
  }

  if ( time() - $updatetime < 86400 )      // < 24 hours ago
  {
    // Request to resend data, but only request every 15 minutes
    $minute = (int) date( 'i' );
    if ( $minute % 15 ) return;

    $output_status = get_gfac_outputs( $gfacID );

    if ( $output_status !== false )
      mail_to_admin( "debug", "data_timeout/$gfacID/$output_status" );

    return;
  }

  $message = "Waiting for data longer than 24 hours";
  write_log( "$self: $message - id: $gfacID" );
  mail_to_admin( "hang", "$message - id: $gfacID" );

  $query  = "UPDATE analysis SET status='FAILED' WHERE gfacID='$gfacID'";
  $result = mysqli_query( $gLink, $query );

  if ( ! $result )
    write_log( "$self: Query failed $query - " . mysqli_error( $gLink ) );

  update_queue_messages( $message );
  update_db( $message );
}
470
// Handler for a completed job.  It takes no arguments and returns nothing;
// all of the work is delegated to cleanup(), which operates on the job
// currently held in the global variables.
function complete()
{
  cleanup();   // Nothing else to do for a completed job
}
476
// Handler for a failed or cancelled job.  It takes no arguments and returns
// nothing; all of the work is delegated to cleanup(), which operates on the
// job currently held in the global variables.
function failed()
{
  cleanup();   // Nothing else to do for a failed job
}
482
// Run the final clean-up for the job identified by the global $gfacID.
//
// Returns nothing.  The function gives up early when the analysis row no
// longer exists, when the lookup query fails, or when get_us3_data() yields
// no request ID.  Airavata jobs (gfacID containing "US3-A") are passed to
// aira_cleanup() only when the job and this server are of the same flavour
// (both devel or both non-devel); every other job goes to gfac_cleanup().
// Both clean-up functions are defined outside this file.
function cleanup()
{
  global $self;
  global $gLink;
  global $gfacID;
  global $us3_db;
  global $loghdr;
  global $class_dir;

  // Double check that the gfacID exists
  $query  = "SELECT count(*) FROM analysis WHERE gfacID='$gfacID'";
  $result = mysqli_query( $gLink, $query );

  if ( ! $result )
  {
    write_log( "$self: Query failed $query - " . mysqli_error( $gLink ) );
    mail_to_admin( "fail", "Query failed $query\n" . mysqli_error( $gLink ) );
    return;
  }

  list( $count ) = mysqli_fetch_array( $result );

  if ( $count == 0 ) return;

  // Now check the us3 instance
  $requestID = get_us3_data();
  if ( $requestID == 0 ) return;

  if ( preg_match( "/US3-A/i", $gfacID ) )
  { // Airavata job: clean up if prod/devel match
    $me_devel  = preg_match( "/class_devel/", $class_dir );
    $job_devel = preg_match( "/US3-ADEV/i", $gfacID );

    if ( (bool) $me_devel === (bool) $job_devel )
    { // Job is of same type (prod/devel) as Server: process it
      aira_cleanup( $us3_db, $requestID, $gLink );
    }
  }
  else
  { // Non-airavata job: clean up in a non-aira way
    write_log( "$loghdr calling gfac_cleanup() reqID=$requestID" );
    gfac_cleanup( $us3_db, $requestID, $gLink );
  }
}
534
// Function to update status of job
//
// Maps $job_status onto the status stored in the GFAC analysis table for
// $gfacID, runs the matching UPDATE, and (unless the message is 'NONE')
// records a human-readable message via update_queue_messages() and
// update_db().  Unrecognised or UNKNOWN statuses set the row to ERROR.
//
// 'EXECUTING' deliberately changes nothing in the analysis table; only the
// message is recorded.  Previously that case fell through to mysqli_query()
// with whatever statement the global $query still held from an earlier
// caller, so an unrelated query was executed.
//
// $query stays a global because the original declared it so; other code
// may read it -- TODO confirm whether that is still needed.
function update_job_status( $job_status, $gfacID )
{
  global $gLink;
  global $query;
  global $self;
  global $loghdr;

  $run_query = true;   // cleared for statuses that need no table update

  switch ( $job_status )
  {
    case 'SUBMITTED'   :
    case 'SUBMITED'    :
    case 'INITIALIZED' :
    case 'UPDATING'    :
    case 'PENDING'     :
      $query   = "UPDATE analysis SET status='SUBMITTED' WHERE gfacID='$gfacID'";
      $message = "Job status request reports job is SUBMITTED";
      break;

    case 'STARTED'     :
    case 'RUNNING'     :
    case 'ACTIVE'      :
      $query   = "UPDATE analysis SET status='RUNNING' WHERE gfacID='$gfacID'";
      $message = "Job status request reports job is RUNNING";
      break;

    case 'EXECUTING'   :
      $run_query = false;
      $message   = "Job status request reports job is EXECUTING";
      break;

    case 'FINISHED'    :
      $query   = "UPDATE analysis SET status='FINISHED' WHERE gfacID='$gfacID'";
      $message = "NONE";
      break;

    case 'DONE'        :
      $query   = "UPDATE analysis SET status='DONE' WHERE gfacID='$gfacID'";
      $message = "NONE";
      break;

    case 'COMPLETED'   :
    case 'COMPLETE'    :
      $query   = "UPDATE analysis SET status='COMPLETE' WHERE gfacID='$gfacID'";
      $message = "Job status request reports job is COMPLETED";
      break;

    case 'DATA'        :
      $query   = "UPDATE analysis SET status='DATA' WHERE gfacID='$gfacID'";
      $message = "Job status request reports job is COMPLETE, waiting for data";
      break;

    case 'CANCELED'    :
    case 'CANCELLED'   :
      $query   = "UPDATE analysis SET status='CANCELED' WHERE gfacID='$gfacID'";
      $message = "Job status request reports job is CANCELED";
      break;

    case 'FAILED'      :
      $query   = "UPDATE analysis SET status='FAILED' WHERE gfacID='$gfacID'";
      $message = "Job status request reports job is FAILED";
      break;

    case 'UNKNOWN'     :
      write_log( "$loghdr job_status='UNKNOWN', reset to 'ERROR' " );
      $query   = "UPDATE analysis SET status='ERROR' WHERE gfacID='$gfacID'";
      $message = "Job status request reports job is not in the queue";
      break;

    default            :
      // We shouldn't ever get here
      $query   = "UPDATE analysis SET status='ERROR' WHERE gfacID='$gfacID'";
      $message = "Job status was not recognized - $job_status";
      write_log( "$loghdr update_job_status: " .
                 "Job status was not recognized - $job_status\n" .
                 "gfacID = $gfacID\n" );
      break;
  }

  if ( $run_query )
  {
    $result = mysqli_query( $gLink, $query );
    if ( ! $result )
      write_log( "$loghdr Query failed $query - " . mysqli_error( $gLink ) );
  }

  if ( $message != 'NONE' )
  {
    update_queue_messages( $message );
    update_db( $message );
  }
}
624
// Look up the current job (global $gfacID) in the HPCAnalysisResult table of
// the job's own database (global $us3_db).
//
// Returns the HPCAnalysisRequestID of the matching row, or 0 when the
// database cannot be reached or the query fails.  As a side effect the
// global $updateTime is overwritten with UNIX_TIMESTAMP(updateTime) of that
// row.  When no row matches, both the return value and $updateTime end up
// null, which callers that compare with "== 0" treat like 0.
//
// The connection is closed on every path after a successful connect, and the
// database password is no longer written to the log.
function get_us3_data()
{
  global $self;
  global $gfacID;
  global $dbhost;
  global $user;
  global $passwd;
  global $us3_db;
  global $updateTime;
  global $loghdr;

  $us3_link = mysqli_connect( $dbhost, $user, $passwd, $us3_db );

  if ( ! $us3_link )
  {
    write_log( "$loghdr could not connect: $dbhost, $user, $us3_db" );
    mail_to_admin( "fail", "Could not connect to $dbhost : $us3_db" );
    return 0;
  }

  $query  = "SELECT HPCAnalysisRequestID, UNIX_TIMESTAMP(updateTime) " .
            "FROM HPCAnalysisResult WHERE gfacID='$gfacID'";
  $result = mysqli_query( $us3_link, $query );

  if ( ! $result )
  {
    // Read the error text before the link is closed
    write_log( "$self: Query failed $query - " . mysqli_error( $us3_link ) );
    mail_to_admin( "fail", "Query failed $query\n" . mysqli_error( $us3_link ) );
    mysqli_close( $us3_link );
    return 0;
  }

  list( $requestID, $updateTime ) = mysqli_fetch_array( $result );
  mysqli_close( $us3_link );

  return $requestID;
}
661
// Function to determine if this is a gfac job or not
//
// Returns true when $gfacID either starts with "US3-Experiment" (matched
// case-insensitively) or consists of "US3-" followed by five groups of
// 8-4-4-4-12 hexadecimal digits; returns false otherwise.
function is_gfac_job( $gfacID )
{
  $hex = "[0-9a-fA-F]";

  $experiment_form = "/^US3-Experiment/i";
  $uuid_form       = "/^US3-{$hex}{8}-{$hex}{4}-{$hex}{4}-{$hex}{4}-{$hex}{12}\$/";

  if ( preg_match( $experiment_form, $gfacID ) )
    return true;

  return preg_match( $uuid_form, $gfacID ) ? true : false;
}
675
// Function to determine if this is an airavata/thrift job or not
//
// Returns true when $gfacID contains "US3-A" anywhere (matched
// case-insensitively), false otherwise.
function is_aira_job( $gfacID )
{
  global $cluster;   // declared by the original code; not used here

  return preg_match( "/US3-A/i", $gfacID ) ? true : false;
}
689
// Function to get the current job status from GFAC
//
// Returns:
//   - for an Airavata job: getExperimentStatus() mapped through
//     standard_status(); an 'EXECUTING' experiment is first turned into
//     'ACTIVE' when the global $status_gw is 'RUNNING' and into 'QUEUED'
//     otherwise.  The global $status_ex is overwritten on this path.
//   - false when $gfacID is neither an Airavata nor a GFAC job;
//   - 'GFAC_STATUS_UNAVAILABLE' when the HTTP status request throws;
//   - otherwise the status parsed from the service response, with DONE,
//     DATA and RESULTS_GEN all reported as 'DATA'.
// getExperimentStatus() and the HttpRequest class are defined outside this
// file.
function get_gfac_status( $gfacID )
{
  global $serviceURL;
  global $self;
  global $loghdr;
  global $cluster;
  global $status_ex, $status_gw;

  if ( is_aira_job( $gfacID ) )
  {
    $status_ex = getExperimentStatus( $gfacID );

    if ( $status_ex == 'EXECUTING' )
      $status_ex = ( $status_gw == 'RUNNING' ) ? 'ACTIVE' : 'QUEUED';

    return standard_status( $status_ex );
  }

  if ( ! is_gfac_job( $gfacID ) )
    return false;

  $url = "$serviceURL/jobstatus/$gfacID";

  try
  {
    $request = new HttpRequest( $url, HttpRequest::METH_GET );
    $request->send();
    $xml = $request->getResponseBody();
  }
  catch ( HttpException $e )
  {
    write_log( "$loghdr Status not available - marking failed - $gfacID" );
    return 'GFAC_STATUS_UNAVAILABLE';
  }

  // Parse the result
  $gfac_status = parse_response( $xml );

  // This may not seem like the best place to do this, but here we have
  // the xml straight from GFAC
  $known_statuses = array(
    'SUBMITTED', 'SUBMITED', 'INITIALIZED', 'PENDING',
    'RUNNING', 'ACTIVE', 'STARTED',
    'COMPLETED', 'FINISHED', 'DONE', 'DATA', 'RESULTS_GEN',
    'CANCELED', 'CANCELLED', 'FAILED', 'STAGING', 'UNKNOWN' );

  if ( ! in_array( $gfac_status, $known_statuses ) )
  {
    mail_to_admin( 'debug', "gfacID: /$gfacID/\n" .
                            "XML: /$xml/\n" .
                            "Status: /$gfac_status/\n" );
  }

  // All three of these mean the job is at the data stage
  $data_statuses = array( 'DONE', 'DATA', 'RESULTS_GEN' );
  if ( in_array( $gfac_status, $data_statuses ) )
    return 'DATA';

  return $gfac_status;
}
765
// Function to request data outputs from GFAC
//
// Returns:
//   - the result of get_local_status() when get_gfac_status() reports that
//     $gfacID is not a GFAC job;
//   - false when the job's status is not DONE, FAILED, COMPLETE or FINISHED,
//     or when the HTTP request throws;
//   - otherwise the status parsed from the service's response.
// The HttpRequest class is defined outside this file.
function get_gfac_outputs( $gfacID )
{
  global $serviceURL;
  global $self;

  // Make sure it's a GFAC job and status is appropriate for this call
  $job_status = get_gfac_status( $gfacID );

  if ( $job_status === false )
  {
    // Then it's not a GFAC job
    return get_local_status( $gfacID );
  }

  $requestable = array( 'DONE', 'FAILED', 'COMPLETE', 'FINISHED' );
  if ( ! in_array( $job_status, $requestable ) )
  {
    // Then it's not appropriate to request data
    return false;
  }

  $url = "$serviceURL/registeroutput/$gfacID";

  try
  {
    $request = new HttpRequest( $url, HttpRequest::METH_GET );
    $request->send();
    $xml = $request->getResponseBody();
  }
  catch ( HttpException $e )
  {
    write_log( "$self: Data not available - request failed - $gfacID" );
    return false;
  }

  // Temporary, to see what the xml looks like, if we ever get one
  mail_to_admin( "debug", "get_gfac_outputs/\n$xml/" );

  // Parse the result
  return parse_response( $xml );
}
807
// Extract the job status from an XML response.
//
// Walks the document with XMLReader.  The text of a <status> element becomes
// the return value; the text of any other element is stored in the global
// $gfac_message (the last such text wins).  Both start out as "".
//
// An empty or non-string $xml, or a document XMLReader refuses to open,
// yields "" without touching the reader.  Previously an empty body reached
// XMLReader::xml(), and $name could be read before it had been assigned.
function parse_response( $xml )
{
  global $gfac_message;

  $status       = "";
  $gfac_message = "";
  $name         = "";   // name of the most recently opened element

  if ( ! is_string( $xml ) || $xml === "" )
    return $status;

  $parser = new XMLReader();
  if ( ! $parser->xml( $xml ) )
    return $status;

  while ( $parser->read() )
  {
    $type = $parser->nodeType;

    if ( $type == XMLReader::ELEMENT )
    {
      $name = $parser->name;
    }
    else if ( $type == XMLReader::TEXT )
    {
      if ( $name == "status" )
        $status = $parser->value;
      else
        $gfac_message = $parser->value;
    }
  }

  $parser->close();
  return $status;
}
837
// Function to get status from local cluster
//
// Runs the queue-status command for $gfacID (squeue when the global $cluster
// name contains "jetstream", /usr/bin/qstat otherwise), through ssh as user
// us3 unless the cluster name contains "us3iab", and maps the scheduler's
// state code to a gateway status.
//
// Returns 'ACTIVE', 'COMPLETED', 'SUBMITTED', 'CANCELED', 'FAILED' or
// 'UNKNOWN'.  'UNKNOWN' is returned when qstat reports an unknown job, when
// ssh still fails with "ssh_exchange_identification" after the retries, or
// when the state code is not recognised.
//
// Fixes relative to the earlier version: the retry loop now actually re-runs
// the command after each sleep (before, it only waited); the retry log line
// uses $self instead of an undefined variable; and a missing state column is
// treated as an empty code instead of reading an undefined array element.
//
// NOTE(review): $gfacID is placed into the shell command unescaped; it is
// assumed to come from the analysis table -- confirm it can never hold shell
// metacharacters.
function get_local_status( $gfacID )
{
  global $cluster;
  global $self;

  $is_jetstr   = preg_match( "/jetstream/", $cluster );
  $is_demeler3 = preg_match( "/demeler3/", $cluster );

  if ( $is_jetstr )
    $cmd = "squeue -j $gfacID 2>&1|tail -n 1";
  else
    $cmd = "/usr/bin/qstat -a $gfacID 2>&1|tail -n 1";

  if ( ! preg_match( "/us3iab/", $cluster ) )
  {
    // Work out the host to ssh to
    $system = "$cluster.uthscsa.edu";
    if ( $is_jetstr )
      $system = "$cluster";
    $system = preg_replace( "/\-local/", "", $system );

    if ( $is_demeler3 )
    {
      $system = "demeler3.uleth.ca";
    }

    $cmd = "/usr/bin/ssh -x us3@$system " . $cmd;
  }

  $result = exec( $cmd );

  $secwait = 2;
  $num_try = 0;
  // Sleep and retry up to 3 times if ssh has "ssh_exchange_identification" error
  while ( preg_match( "/ssh_exchange_id/", $result ) && $num_try < 3 )
  {
    sleep( $secwait );
    $num_try++;
    $secwait *= 2;
    write_log( "$self: num_try=$num_try secwait=$secwait" );
    $result = exec( $cmd );
  }

  if ( preg_match( "/^qstat: Unknown/", $result ) ||
       preg_match( "/ssh_exchange_id/", $result ) )
  {
    write_log( "$self get_local_status: Local job $gfacID unknown result=$result" );
    return 'UNKNOWN';
  }

  // The state code is the 6th field of squeue output, the 10th of qstat -a
  $values = preg_split( "/\s+/", $result );
  $column = $is_jetstr ? 5 : 9;
  $jstat  = isset( $values[ $column ] ) ? $values[ $column ] : "";

  switch ( $jstat )
  {
    case "W"  :                       // Waiting for execution time to be reached
    case "E"  :                       // Job is exiting after having run
    case "R"  :                       // Still running
    case "CG" :                       // Job is completing
      $status = 'ACTIVE';
      break;

    case "C"  :                       // Job has completed
    case "ST" :                       // Job has disappeared
    case "CD" :                       // Job has completed
      $status = 'COMPLETED';
      break;

    case "T"  :                       // Job is being moved
    case "H"  :                       // Held
    case "Q"  :                       // Queued
    case "PD" :                       // Queued
    case "CF" :                       // Queued
      $status = 'SUBMITTED';
      break;

    case "CA" :                       // Job has been canceled
      $status = 'CANCELED';
      break;

    case "F"  :                       // Job has failed
    case "BF" :                       // Job has failed
    case "NF" :                       // Job has failed
    case "TO" :                       // Job has timed out
    case ""   :                       // Job has disappeared
      $status = 'FAILED';
      break;

    default   :
      $status = 'UNKNOWN';            // This should not occur
      break;
  }

  return $status;
}
937
// Append $message to the queue_messages table for the current job.
//
// The job is identified by the global $gfacID: its analysis-table id is
// looked up first and then used as analysisID of the new row.  Nothing is
// returned; a failing query is written to the log and ends the function.
function update_queue_messages( $message )
{
  global $self;
  global $gLink;
  global $gfacID;

  // Get analysis table ID
  $lookup = "SELECT id FROM analysis WHERE gfacID = '$gfacID' ";
  $found  = mysqli_query( $gLink, $lookup );

  if ( ! $found )
  {
    write_log( "$self: Query failed $lookup - " . mysqli_error( $gLink ) );
    return;
  }

  list( $analysisID ) = mysqli_fetch_array( $found );

  // Insert message into queue_message table
  $safe_message = mysqli_real_escape_string( $gLink, $message );
  $insert       = "INSERT INTO queue_messages SET " .
                  "message = '" . $safe_message . "', " .
                  "analysisID = '$analysisID' ";

  if ( ! mysqli_query( $gLink, $insert ) )
    write_log( "$self: Query failed $insert - " . mysqli_error( $gLink ) );
}
966
// Store $message as lastMessage of the current job's HPCAnalysisResult row.
//
// The job is identified by the global $gfacID and the row lives in the
// job's own database (global $us3_db).  Returns 0 when the database cannot
// be reached and nothing (null) otherwise.
//
// Fixes relative to the earlier version: the statement now has a space
// before WHERE, a failing UPDATE is written to the log instead of being
// ignored, and the database password is no longer written to the log.
function update_db( $message )
{
  global $self;
  global $gfacID;
  global $dbhost;
  global $user;
  global $passwd;
  global $us3_db;

  $us3_link = mysqli_connect( $dbhost, $user, $passwd, $us3_db );

  if ( ! $us3_link )
  {
    write_log( "$self: could not connect: $dbhost, $user" );
    mail_to_admin( "fail", "Could not connect to $dbhost : $us3_db" );
    return 0;
  }

  $query = "UPDATE HPCAnalysisResult SET " .
           "lastMessage='" . mysqli_real_escape_string( $us3_link, $message ) . "' " .
           "WHERE gfacID = '$gfacID' ";

  if ( ! mysqli_query( $us3_link, $query ) )
    write_log( "$self: Query failed $query - " . mysqli_error( $us3_link ) );

  mysqli_close( $us3_link );
}
992
// Send an error-notification e-mail to the administrators.
//
// $type is accepted for the callers' benefit but is not used in the mail.
// $msg is appended to the body as the error message.  The body also reports
// the globals $updateTime, $status and $cluster as they stand at call time.
// Nothing is returned; the result of mail() is not checked.
//
// Fixes relative to the earlier version: both copy recipients are listed in
// a single Cc header (a message may carry only one), and the optional
// request ID is placed inside the angle brackets of the Message-ID instead
// of after them.
function mail_to_admin( $type, $msg )
{
  global $updateTime;
  global $status;
  global $cluster;
  global $org_name;
  global $admin_email;
  global $dbhost;
  global $requestID;

  $headers  = "From: $org_name Admin<$admin_email>" . "\n";
  $headers .= "Cc: $org_name Admin<$admin_email>, " .
              "$org_name Admin<alexsav.science@gmail.com>" . "\n";
  $headers .= "Bcc: Gary Gorbet<gegorbet@gmail.com>" . "\n";     // make sure

  // Set the reply address
  $headers .= "Reply-To: $org_name<$admin_email>" . "\n";
  $headers .= "Return-Path: $org_name<$admin_email>" . "\n";

  // Try to avoid spam filters
  $now  = time();
  $tnow = date( 'Y-m-d H:i:s' );

  // The request ID is only part of the Message-ID when one is set
  $id_tag = ( strlen( (string) $requestID ) > 0 ) ? ".$requestID" : "";

  $headers .= "Message-ID: <" . $now . "gridctl" . $id_tag . "@$dbhost>\n";
  $headers .= "X-Mailer: PHP v" . phpversion() . "\n";
  $headers .= "MIME-Version: 1.0" . "\n";
  $headers .= "Content-Transfer-Encoding: 8bit" . "\n";

  $subject = "US3 Error Notification";
  $message = "
 UltraScan job error notification from gridctl.php:

 Update Time : $updateTime [ now=$tnow ]
 GFAC Status : $status
 Cluster : $cluster
 ";

  $message .= "Error Message : $msg\n";

  mail( $admin_email, $subject, $message, $headers );
}
[6]1034
// Convert a status string to one of the standard DB status strings
//
// Each group below lists the variants that map onto one standard value.
// The groups are tried in order using loose comparison and the first match
// wins.  A value that appears in no group is returned unchanged, which
// covers the values that are already standard (ERROR, RUNNING, UPDATING,
// CANCELED, DATA, FAILED, DONE, FINISHED).
function standard_status( $status_in )
{
  $groups = array(
    array( 'SUBMITTED',
           array( 'QUEUED', 'LAUNCHED', 'CREATED', 'VALIDATED',
                  'SCHEDULED', 'submitted', 'SUBMITTED', '' ) ),
    array( 'RUNNING',
           array( 'EXECUTING', 'ACTIVE', 'running', 'executing' ) ),
    array( 'UPDATING',
           array( 'PENDING', 'CANCELING' ) ),
    array( 'CANCELED',
           array( 'CANCELLED', 'canceled' ) ),
    array( 'COMPLETE',
           array( 'COMPLETED', 'completed' ) ),
    array( 'FAILED',
           array( 'FAILED_DATA', 'SUBMIT_TIMEOUT',
                  'RUN_TIMEOUT', 'DATA_TIMEOUT' ) ),
    array( 'DONE',
           array( 'COMPLETE' ) ),
    array( 'ERROR',
           array( 'UNKNOWN' ) ),
  );

  foreach ( $groups as $group )
  {
    list( $standard, $variants ) = $group;

    if ( in_array( $status_in, $variants ) )
      return $standard;
  }

  // Where already standard value (or not recognised), retain value
  return $status_in;
}
1108
// Work out the current status of an Airavata job.
//
// $gfacID is the job ID and $status_in the status currently stored for it.
// $status_in is returned untouched unless $gfacID contains "US3-A" and the
// job is of the same flavour (devel / non-devel) as this server.  Otherwise
// the experiment status from getExperimentStatus() (defined outside this
// file) is combined with the stored status:
//   - experiment COMPLETED: 'COMPLETE' if the stored status is already
//     FINISHED or DONE, else 'DONE';
//   - stored status FINISHED/DONE: kept, except that an experiment reported
//     as FAILED is re-polled (up to two 10-second waits) and the status is
//     then set to 'COMPLETE' whatever the re-polls say;
//   - experiment EXECUTING: the stored status, standardised;
//   - anything else: the experiment status, standardised.
// When the result differs from the stored status it is written back through
// update_job_status().  The resulting status is returned.
function aira_status( $gfacID, $status_in )
{
  global $self;
  global $loghdr;
  global $class_dir;

  $status_gw = $status_in;

  $me_devel     = preg_match( "/class_devel/", $class_dir );
  $job_devel    = preg_match( "/US3-ADEV/i", $gfacID );
  $same_flavour = ( (bool) $me_devel === (bool) $job_devel );

  if ( ! preg_match( "/US3-A/i", $gfacID ) || ! $same_flavour )
    return $status_gw;

  $status_ex   = getExperimentStatus( $gfacID );
  $gw_finished = ( $status_gw == 'FINISHED' || $status_gw == 'DONE' );

  if ( $status_ex == 'COMPLETED' )
  { // COMPLETED + FINISHED/DONE: gateway status is now COMPLETE; else DONE
    $status = $gw_finished ? 'COMPLETE' : 'DONE';
  }
  else if ( $gw_finished )
  { // Gfac status == FINISHED/DONE: leave as is (unless FAILED)
    $status = $status_gw;

    if ( $status_ex == 'FAILED' )
    {
      sleep( 10 );
      $status_ex = getExperimentStatus( $gfacID );

      if ( $status_ex == 'FAILED' )
      {
        write_log( "$loghdr status still 'FAILED' after 10-second delay" );
        sleep( 10 );
        $status_ex = getExperimentStatus( $gfacID );

        if ( $status_ex == 'FAILED' )
          write_log( "$loghdr status still 'FAILED' after 20-second delay" );
        else
          write_log( "$loghdr status is $status_ex after 20-second delayed retry" );
      }

      write_log( "$loghdr status reset to 'COMPLETE'" );
      $status = 'COMPLETE';
    }
  }
  else if ( $status_ex == 'EXECUTING' )
  {
    $status = standard_status( $status_gw );
    write_log( "$loghdr status/_in/_gw/_ex=$status/$status_in/$status_gw/$status_ex" );
  }
  else
  { // Experiment not COMPLETED/FINISHED/DONE: use experiment status
    $status = standard_status( $status_ex );
  }

  if ( $status != $status_gw )
  {
    update_job_status( $status, $gfacID );
  }

  return $status;
}
1186
[1]1187?>
Note: See TracBrowser for help on using the repository browser.