source: trunk/gridctl.php@ 33

Last change on this file since 33 was 32, checked in by gegorbet, 7 years ago

gridctl updates, mostly for jetstream and 4.0

File size: 32.1 KB
Line 
1<?php
2
3$us3bin = exec( "ls -d ~us3/lims/bin" );
4include_once "$us3bin/listen-config.php";
5//include "$us3bin/cleanup_aira.php";
6//include "$us3bin/cleanup_gfac.php";
7
// ---------------------------------------------------------------------------
// Main script: walk every row of the global GFAC "analysis" table and, based
// on each job's current status, dispatch to the matching handler function
// (submitted(), running(), wait_data(), cleanup(), ...).
//
// Relies on values expected to come from listen-config.php ($dbhost, $guser,
// $gpasswd, $gDB, $self, $class_dir, write_log(), ...) -- TODO confirm, that
// file is not visible here.
// ---------------------------------------------------------------------------

// Global variables
$gfac_message = "";
$updateTime = 0;
$submittime = 0;
$cluster = '';

//global $self;
global $status_ex, $status_gw;

// Produce some output temporarily, so cron will send me message
$now = time();
echo "Time started: " . date( 'Y-m-d H:i:s', $now ) . "\n";

// Get data from global GFAC DB
$gLink = mysql_connect( $dbhost, $guser, $gpasswd );

// Without a connection every later mysql_* call would fail with a confusing
// message, so report the real cause and stop here.
if ( ! $gLink )
{
   write_log( "$self: Could not connect to $dbhost - " . mysql_error() );
   mail_to_admin( "fail", "Internal Error: Could not connect to $dbhost" );
   exit();
}

if ( ! mysql_select_db( $gDB, $gLink ) )
{
   write_log( "$self: Could not select DB $gDB - " . mysql_error( $gLink ) );
   mail_to_admin( "fail", "Internal Error: Could not select DB $gDB" );
   exit();
}

$query  = "SELECT gfacID, us3_db, cluster, status, queue_msg, " .
          "UNIX_TIMESTAMP(time), time from analysis";
$result = mysql_query( $query, $gLink );

if ( ! $result )
{
   write_log( "$self: Query failed $query - " . mysql_error( $gLink ) );
   mail_to_admin( "fail", "Query failed $query\n" . mysql_error( $gLink ) );
   exit();
}

if ( mysql_num_rows( $result ) == 0 )
{
   exit();  // Nothing to do
}

$me_devel  = preg_match( "/class_devel/", $class_dir );

// Column order matters: $time receives UNIX_TIMESTAMP(time) (seconds, used
// for the age checks in the handlers) and $updateTime receives the formatted
// "time" column (only reported in admin mail).
while ( list( $gfacID, $us3_db, $cluster, $status, $queue_msg, $time, $updateTime )
            = mysql_fetch_array( $result ) )
{
   // If this entry does not match class/class_devel, skip processing

   if ( preg_match( "/US3-A/i", $gfacID ) )
   {  // For thrift, job and gridctl must match
      $job_devel = preg_match( "/US3-ADEV/i", $gfacID );
      if ( (  $me_devel  &&  !$job_devel )  ||
           ( !$me_devel  &&   $job_devel ) )
      {  // Job type and Airavata server mismatch: skip processing
         continue;
      }
   }

   else if ( $me_devel )
   {  // Local (us3iab/-local) and class_devel: skip processing
      continue;
   }

   // Checking we need to do for each entry
echo "us3db=$us3_db  gfid=$gfacID\n";
   switch ( $us3_db )
   {
      // NOTE(review): the leading "X" looks like a deliberate way of
      // disabling these entries -- confirm before removing.
      case 'Xuslims3_cauma3' :
      case 'Xuslims3_cauma3d' :
      case 'Xuslims3_HHU' :
      case 'Xuslims3_Uni_KN' :
         $serviceURL  = "http://gridfarm005.ucs.indiana.edu:9090/ogce-rest/job";
         break;

      default :
//         $serviceURL  = "http://gridfarm005.ucs.indiana.edu:8080/ogce-rest/job";
         break;
   }

   $gfacLabl  = $gfacID;
   $loghdr    = $self . ":" . $gfacLabl . "...:";

   // Record the status stored in the gateway DB for THIS job before it is
   // refreshed below. get_gfac_status() reads the global $status_gw, so it
   // must never carry over from a previously processed job.
   $status_ex = $status;
   $status_gw = $status;

   // If entry is for Airvata/Thrift, get the true current status

   if ( is_aira_job( $gfacID ) )
   {
      $status_in  = $status;
      $status     = aira_status( $gfacID, $status_in );
      if ( $status != $status_in )
         write_log( "$loghdr Set to $status from $status_in" );
   }
   else if ( is_gfac_job( $gfacID ) )
   {
      $status     = get_gfac_status( $gfacID );
      // A job the gateway already marked COMPLETE stays COMPLETE
      if ( $status_gw == 'COMPLETE' )
         $status     = $status_gw;
   }
   else
   {
      $status     = get_local_status( $gfacID );
      // Keep the gateway status when it is final or the queue knows nothing
      if ( $status_gw == 'COMPLETE'  ||  $status == 'UNKNOWN' )
         $status     = $status_gw;
   }

   // Sometimes during testing, the us3_db entry is not set
   // If $status == 'ERROR' then the condition has been processed before
   if ( strlen( $us3_db ) == 0 && $status != 'ERROR' )
   {
      write_log( "$loghdr GFAC DB is NULL - $gfacID" );
      mail_to_admin( "fail", "GFAC DB is NULL\n$gfacID" );

      $query2  = "UPDATE analysis SET status='ERROR' WHERE gfacID='$gfacID'";
      $result2 = mysql_query( $query2, $gLink );
      $status  = 'ERROR';

      if ( ! $result2 )
         write_log( "$loghdr Query failed $query2 - " . mysql_error( $gLink ) );

   }

   switch ( $status )
   {
      // Already been handled
      // Later update this condition to search for gfacID?
      case "ERROR":
         cleanup();
         break;

      case "SUBMITTED":
         submitted( $time );
         break;

      case "SUBMIT_TIMEOUT":
         submit_timeout( $time );
         break;

      case "RUNNING":
      case "STARTED":
      case "STAGING":
      case "ACTIVE":
         running( $time );
         break;

      case "RUN_TIMEOUT":
         run_timeout($time );
         break;

      case "DATA":
      case "RESULTS_GEN":
         wait_data( $time );
         break;

      case "DATA_TIMEOUT":
         data_timeout( $time );
         break;

      case "COMPLETED":
      case "COMPLETE":
         complete();
         break;

      case "CANCELLED":
      case "CANCELED":
      case "FAILED":
         failed();
         break;

      case "FINISHED":
      case "DONE":
         // Airavata jobs are finalised through aira_status(); only
         // non-Airavata jobs are cleaned up at this point.
         if ( ! is_aira_job( $gfacID ) )
         {
            complete();
         }
write_log( "$loghdr FINISHED gfacID=$gfacID" );
         break;

      case "PROCESSING":
      default:
         break;
   }
}

exit();
204
/**
 * Handle a job whose gateway status is SUBMITTED.
 *
 * Nothing is done during the first 10 minutes. During the first 24 hours
 * the real status is polled and, if the job has moved on, stored via
 * update_job_status(). After 24 hours the job is flagged SUBMIT_TIMEOUT.
 *
 * @param int $updatetime  Unix timestamp of the job's last status update
 */
function submitted( $updatetime )
{
   global $self;
   global $gLink;
   global $gfacID;
   global $loghdr;

   $elapsed = time() - $updatetime;

   if ( $elapsed < 600 )        // < 10 minutes ago
      return;

   if ( $elapsed >= 86400 )     // Past the first 24 hours
   {
      $message = "Job listed submitted longer than 24 hours";
      write_log( "$self: $message - id: $gfacID" );
      mail_to_admin( "hang", "$message - id: $gfacID" );

      $query   = "UPDATE analysis SET status='SUBMIT_TIMEOUT' WHERE gfacID='$gfacID'";
      if ( ! mysql_query( $query, $gLink ) )
         write_log( "$self: Query failed $query - " . mysql_error( $gLink ) );

      update_queue_messages( $message );
      update_db( $message );
      return;
   }

   // Within the first 24 hours: ask GFAC first, fall back to the local queue
   $job_status = get_gfac_status( $gfacID );
   if ( $job_status === false )
      $job_status = get_local_status( $gfacID );

   if ( $job_status == 'GFAC_STATUS_UNAVAILABLE' )
      return;

   $still_queued = in_array( $job_status,
                             array( 'SUBMITTED', 'INITIALIZED', 'PENDING' ) );
   if ( ! $still_queued )
   {
write_log( "$loghdr submitted:job_status=$job_status" );
      update_job_status( $job_status, $gfacID );
   }
}
245
/**
 * Handle a job already flagged SUBMIT_TIMEOUT.
 *
 * If the job has left the queued states its new status is recorded.
 * Otherwise, once another 24 hours have passed (48 hours submitted in
 * total), the job is marked FAILED and the admin is notified.
 *
 * @param int $updatetime  Unix timestamp of the job's last status update
 */
function submit_timeout( $updatetime )
{
   global $self;
   global $gLink;
   global $gfacID;
   global $loghdr;

   // Ask GFAC first; a false return means "not a GFAC job", so go local
   $job_status = get_gfac_status( $gfacID );
   if ( $job_status === false )
      $job_status = get_local_status( $gfacID );

   if ( $job_status == 'GFAC_STATUS_UNAVAILABLE' )
      return;

   $queued_states = array( 'SUBMITTED', 'INITIALIZED', 'PENDING' );
   if ( ! in_array( $job_status, $queued_states ) )
   {
      update_job_status( $job_status, $gfacID );
      return;
   }

   $elapsed = time() - $updatetime;
   if ( $elapsed < 86400 )      // < 24 hours ago ( 48 total submitted )
      return;

   $message = "Job listed submitted longer than 48 hours";
   write_log( "$self: $message - id: $gfacID" );
   mail_to_admin( "hang", "$message - id: $gfacID" );

   $query   = "UPDATE analysis SET status='FAILED' WHERE gfacID='$gfacID'";
   if ( ! mysql_query( $query, $gLink ) )
      write_log( "$self: Query failed $query - " . mysql_error( $gLink ) );

   update_queue_messages( $message );
   update_db( $message );
}
281
/**
 * Handle a job whose status is one of the running states.
 *
 * Always refreshes the US3 data via get_us3_data(). Nothing else happens
 * for the first 10 minutes; within 24 hours the live status is polled and
 * recorded if it is no longer a running state; after 24 hours the job is
 * flagged RUN_TIMEOUT.
 *
 * @param int $updatetime  Unix timestamp of the job's last status update
 */
function running( $updatetime )
{
   global $self;
   global $gLink;
   global $gfacID;
   global $loghdr;

   $now     = time();

   get_us3_data();

   $elapsed = $now - $updatetime;

   if ( $elapsed < 600 )        // message received < 10 minutes ago
      return;

   if ( $elapsed >= 86400 )     // Past the first 24 hours
   {
      $message = "Job listed running longer than 24 hours";
      write_log( "$self: $message - id: $gfacID" );
      mail_to_admin( "hang", "$message - id: $gfacID" );

      $query   = "UPDATE analysis SET status='RUN_TIMEOUT' WHERE gfacID='$gfacID'";
      if ( ! mysql_query( $query, $gLink ) )
         write_log( "$self: Query failed $query - " . mysql_error( $gLink ) );

      update_queue_messages( $message );
      update_db( $message );
      return;
   }

   // Within the first 24 hours: ask GFAC first, fall back to the local queue
   $job_status = get_gfac_status( $gfacID );
   if ( $job_status === false )
      $job_status = get_local_status( $gfacID );

   if ( $job_status == 'GFAC_STATUS_UNAVAILABLE' )
      return;

   $still_running = in_array( $job_status,
                              array( 'ACTIVE', 'RUNNING', 'STARTED' ) );
   if ( ! $still_running )
      update_job_status( $job_status, $gfacID );
}
321
/**
 * Handle a job already flagged RUN_TIMEOUT.
 *
 * Records the live status if the job is no longer running. Otherwise the
 * US3 data is refreshed and, once 48 hours (172800 s) have passed since
 * $updatetime, the job is marked FAILED and the admin is notified.
 *
 * @param int $updatetime  Unix timestamp of the job's last status update
 */
function run_timeout( $updatetime )
{
   global $self;
   global $gLink;
   global $gfacID;
   global $loghdr;

   // Ask GFAC first; a false return means "not a GFAC job", so go local
   $job_status = get_gfac_status( $gfacID );
   if ( $job_status === false )
      $job_status = get_local_status( $gfacID );

   if ( $job_status == 'GFAC_STATUS_UNAVAILABLE' )
      return;

   $running_states = array( 'ACTIVE', 'RUNNING', 'STARTED' );
   if ( ! in_array( $job_status, $running_states ) )
   {
      update_job_status( $job_status, $gfacID );
      return;
   }

   $now     = time();

   get_us3_data();

   $elapsed = $now - $updatetime;
   if ( $elapsed < 172800 )     // < 48 hours ago
      return;

   $message = "Job listed running longer than 48 hours";
   write_log( "$self: $message - id: $gfacID" );
   mail_to_admin( "hang", "$message - id: $gfacID" );

   $query   = "UPDATE analysis SET status='FAILED' WHERE gfacID='$gfacID'";
   if ( ! mysql_query( $query, $gLink ) )
      write_log( "$self: Query failed $query - " . mysql_error( $gLink ) );

   update_queue_messages( $message );
   update_db( $message );
}
359
/**
 * Handle a job that is waiting for its result data.
 *
 * During the first hour the live status is polled; if it is still DATA,
 * the outputs are re-requested, but only on minutes divisible by 5.
 * After one hour the job is flagged DATA_TIMEOUT.
 *
 * @param int $updatetime  Unix timestamp of the job's last status update
 */
function wait_data( $updatetime )
{
   global $self;
   global $gLink;
   global $gfacID;
   global $loghdr;

   $elapsed = time() - $updatetime;

   if ( $elapsed >= 3600 )      // Past the first hour
   {
      $message = "Waiting for data longer than 1 hour";
      write_log( "$self: $message - id: $gfacID" );
      mail_to_admin( "hang", "$message - id: $gfacID" );

      $query   = "UPDATE analysis SET status='DATA_TIMEOUT' WHERE gfacID='$gfacID'";
      if ( ! mysql_query( $query, $gLink ) )
         write_log( "$self: Query failed $query - " . mysql_error( $gLink ) );

      update_queue_messages( $message );
      update_db( $message );
      return;
   }

   // Within the first hour: ask GFAC first, fall back to the local queue
   $job_status = get_gfac_status( $gfacID );
   if ( $job_status === false )
      $job_status = get_local_status( $gfacID );

   if ( $job_status == 'GFAC_STATUS_UNAVAILABLE' )
      return;

   if ( $job_status != 'DATA' )
   {
      update_job_status( $job_status, $gfacID );
      return;
   }

   // Request to resend data, but only request every 5 minutes
   $minute = (int) date( 'i' );
   if ( $minute % 5 != 0 )
      return;

   $output_status = get_gfac_outputs( $gfacID );

   if ( $output_status !== false )
      mail_to_admin( "debug", "wait_data/$gfacID/$output_status" );
}
407
/**
 * Handle a job already flagged DATA_TIMEOUT.
 *
 * Records the live status if it is no longer DATA. Otherwise, for up to
 * 24 hours, the outputs are re-requested on minutes divisible by 15; after
 * that the job is marked FAILED and the admin is notified.
 *
 * @param int $updatetime  Unix timestamp of the job's last status update
 */
function data_timeout( $updatetime )
{
   global $self;
   global $gLink;
   global $gfacID;
   global $loghdr;

   // Ask GFAC first; a false return means "not a GFAC job", so go local
   $job_status = get_gfac_status( $gfacID );
   if ( $job_status === false )
      $job_status = get_local_status( $gfacID );

   if ( $job_status == 'GFAC_STATUS_UNAVAILABLE' )
      return;

   if ( $job_status != 'DATA' )
   {
      update_job_status( $job_status, $gfacID );
      return;
   }

   $elapsed = time() - $updatetime;

   if ( $elapsed >= 86400 )     // 24 hours or more
   {
      $message = "Waiting for data longer than 24 hours";
      write_log( "$self: $message - id: $gfacID" );
      mail_to_admin( "hang", "$message - id: $gfacID" );

      $query   = "UPDATE analysis SET status='FAILED' WHERE gfacID='$gfacID'";
      if ( ! mysql_query( $query, $gLink ) )
         write_log( "$self: Query failed $query - " . mysql_error( $gLink ) );

      update_queue_messages( $message );
      update_db( $message );
      return;
   }

   // Request to resend data, but only request every 15 minutes
   $minute = (int) date( 'i' );
   if ( $minute % 15 != 0 )
      return;

   $output_status = get_gfac_outputs( $gfacID );

   if ( $output_status !== false )
      mail_to_admin( "debug", "data_timeout/$gfacID/$output_status" );
}
455
/**
 * Handler for a completed job; all the work is delegated to cleanup().
 */
function complete()
{
   cleanup();
}
461
/**
 * Handler for a failed or canceled job; all the work is delegated to
 * cleanup().
 */
function failed()
{
   cleanup();
}
467
/**
 * Clean up the current job (global $gfacID) after it has ended.
 *
 * Verifies that the job still exists in the analysis table and that a
 * request ID can be found in the US3 database, then dispatches to
 * aira_cleanup() for Airavata jobs (only when the job's prod/devel flavour
 * matches this server's) or to gfac_cleanup() for every other job.
 */
function cleanup()
{
   global $self;
   global $gLink;
   global $gfacID;
   global $us3_db;
   global $loghdr;
   global $class_dir;

   // Double check that the gfacID exists
   $query  = "SELECT count(*) FROM analysis WHERE gfacID='$gfacID'";
   $result = mysql_query( $query, $gLink );

   if ( ! $result )
   {
      $db_error = mysql_error( $gLink );
      write_log( "$self: Query failed $query - " . $db_error );
      mail_to_admin( "fail", "Query failed $query\n" . mysql_error( $gLink ) );
      return;
   }

   list( $count ) = mysql_fetch_array( $result );

   if ( $count == 0 )
   {
write_log( "$loghdr count = $count  gfacID = $gfacID" );
      return;
   }

   // Now check the us3 instance
   $requestID = get_us3_data();
   if ( $requestID == 0 )
      return;

   if ( ! preg_match( "/US3-A/i", $gfacID ) )
   {  // Non-airavata job: clean up in a non-aira way
write_log( "$loghdr calling gfac_cleanup() reqID=$requestID" );
      gfac_cleanup( $us3_db, $requestID, $gLink );
      return;
   }

   // Airavata job: clean up only if job and server are the same flavour
   $server_is_devel = (bool) preg_match( "/class_devel/", $class_dir );
   $job_is_devel    = (bool) preg_match( "/US3-ADEV/i", $gfacID );

   if ( $server_is_devel === $job_is_devel )
   {  // Job is of same type (prod/devel) as Server: process it
      aira_cleanup( $us3_db, $requestID, $gLink );
   }
}
519
/**
 * Store a freshly obtained job status in the analysis table and record a
 * matching message in the queue_messages and US3 tables.
 *
 * The incoming status is mapped onto the gateway's standard status values.
 * EXECUTING only produces a message; it does not change the stored status.
 * FINISHED and DONE update the status without producing a message.
 * Unrecognised values are stored as ERROR.
 *
 * @param string $job_status  Status reported by GFAC/Airavata/local queue
 * @param string $gfacID      ID of the job to update
 */
function update_job_status( $job_status, $gfacID )
{
   global $gLink;
   global $self;
   global $loghdr;

   // Local on purpose: previously this was the *global* $query, so a branch
   // that set no query (EXECUTING) silently re-ran whatever statement was
   // last stored in the global.
   $query = null;

   switch ( $job_status )
   {
      case 'SUBMITTED'   :
      case 'SUBMITED'    :
      case 'INITIALIZED' :
      case 'UPDATING'    :
      case 'PENDING'     :
        $query   = "UPDATE analysis SET status='SUBMITTED' WHERE gfacID='$gfacID'";
        $message = "Job status request reports job is SUBMITTED";
        break;

      case 'STARTED'     :
      case 'RUNNING'     :
      case 'ACTIVE'      :
        $query   = "UPDATE analysis SET status='RUNNING' WHERE gfacID='$gfacID'";
        $message = "Job status request reports job is RUNNING";
        break;

      case 'EXECUTING'   :
        // Message only; the stored status is left untouched
        $message = "Job status request reports job is EXECUTING";
        break;

      case 'FINISHED'    :
        $query   = "UPDATE analysis SET status='FINISHED' WHERE gfacID='$gfacID'";
        $message = "NONE";
        break;

      case 'DONE'        :
        $query   = "UPDATE analysis SET status='DONE' WHERE gfacID='$gfacID'";
        $message = "NONE";
        break;

      case 'COMPLETED'   :
      case 'COMPLETE'   :
        $query   = "UPDATE analysis SET status='COMPLETE' WHERE gfacID='$gfacID'";
        $message = "Job status request reports job is COMPLETED";
        break;

      case 'DATA'        :
        $query   = "UPDATE analysis SET status='DATA' WHERE gfacID='$gfacID'";
        $message = "Job status request reports job is COMPLETE, waiting for data";
        break;

      case 'CANCELED'    :
      case 'CANCELLED'   :
        $query   = "UPDATE analysis SET status='CANCELED' WHERE gfacID='$gfacID'";
        $message = "Job status request reports job is CANCELED";
        break;

      case 'FAILED'      :
        $query   = "UPDATE analysis SET status='FAILED' WHERE gfacID='$gfacID'";
        $message = "Job status request reports job is FAILED";
        break;

      case 'UNKNOWN'     :
write_log( "$loghdr job_status='UNKNOWN', reset to 'ERROR' " );
        $query   = "UPDATE analysis SET status='ERROR' WHERE gfacID='$gfacID'";
        $message = "Job status request reports job is not in the queue";
        break;

      default            :
        // We shouldn't ever get here
        $query   = "UPDATE analysis SET status='ERROR' WHERE gfacID='$gfacID'";
        $message = "Job status was not recognized - $job_status";
        write_log( "$loghdr update_job_status: " .
                   "Job status was not recognized - $job_status\n" .
                   "gfacID = $gfacID\n" );
        break;

   }

   if ( $query !== null )
   {
      $result = mysql_query( $query, $gLink );
      if ( ! $result )
         write_log( "$loghdr Query failed $query - " . mysql_error( $gLink ) );
   }

   if ( $message != 'NONE' )
   {
      update_queue_messages( $message );
      update_db( $message );
   }
}
609
/**
 * Look up the current job (global $gfacID) in the HPCAnalysisResult table
 * of the job's US3 database (global $us3_db).
 *
 * Side effect: the global $updateTime is set to the row's updateTime as a
 * Unix timestamp, or to null when no row exists.
 *
 * @return int|string  The HPCAnalysisRequestID, or 0 on any failure or
 *                     when no matching row exists
 */
function get_us3_data()
{
   global $self;
   global $gfacID;
   global $dbhost;
   global $user;
   global $passwd;
   global $us3_db;
   global $updateTime;
   global $loghdr;

   $us3_link = mysql_connect( $dbhost, $user, $passwd );

   if ( ! $us3_link )
   {
      // The password is deliberately kept out of logs and mail
      write_log( "$loghdr could not connect: $dbhost, $user" );
      mail_to_admin( "fail", "Could not connect to $dbhost" );
      return 0;
   }

   $result = mysql_select_db( $us3_db, $us3_link );

   if ( ! $result )
   {
      write_log( "$loghdr could not select DB $us3_db" );
      mail_to_admin( "fail", "Could not select DB $us3_db, $dbhost, $user" );
      mysql_close( $us3_link );   // do not leak the connection on failure
      return 0;
   }

   $query  = "SELECT HPCAnalysisRequestID, UNIX_TIMESTAMP(updateTime) " .
             "FROM HPCAnalysisResult WHERE gfacID='$gfacID'";
   $result = mysql_query( $query, $us3_link );

   if ( ! $result )
   {
      write_log( "$self: Query failed $query - " . mysql_error( $us3_link ) );
      mail_to_admin( "fail", "Query failed $query\n" . mysql_error( $us3_link ) );
      mysql_close( $us3_link );
      return 0;
   }

   $row = mysql_fetch_array( $result );
   mysql_close( $us3_link );

   if ( ! $row )
   {  // No result row for this job: same outcome as before (null update
      // time, a request ID that compares equal to 0), but made explicit
      $updateTime = null;
      return 0;
   }

   list( $requestID, $updateTime ) = $row;

   return $requestID;
}
656
/**
 * Decide whether a job ID belongs to a GFAC job.
 *
 * An ID qualifies when it starts with "US3-Experiment" (case-insensitive)
 * or has the exact form "US3-" followed by a UUID-style group of hex digits
 * (8-4-4-4-12).
 *
 * @param  string $gfacID  Job ID to classify
 * @return bool            true for a GFAC job, false otherwise
 */
function is_gfac_job( $gfacID )
{
   if ( preg_match( "/^US3-Experiment/i", $gfacID ) )
      return true;

   $hex          = "[0-9a-fA-F]";
   $uuid_pattern = "/^US3-" . $hex . "{8}-" . $hex . "{4}-" . $hex . "{4}-"
                 . $hex . "{4}-" . $hex . "{12}$/";

   if ( preg_match( $uuid_pattern, $gfacID ) )
      return true;

   // Then it's not a GFAC job
   return false;
}
670
/**
 * Decide whether a job ID belongs to an Airavata/Thrift job, i.e. whether
 * it contains "US3-A" anywhere (case-insensitive).
 *
 * @param  string $gfacID  Job ID to classify
 * @return bool            true for an Airavata/Thrift job, false otherwise
 */
function is_aira_job( $gfacID )
{
   $found = preg_match( "/US3-A/i", $gfacID );

   return $found ? true : false;
}
684
/**
 * Get the current status of a job from Airavata or from the GFAC service.
 *
 * Airavata jobs: the experiment status is fetched, stored in the global
 * $status_ex and mapped through standard_status(). An EXECUTING experiment
 * counts as ACTIVE when the global $status_gw is RUNNING, else as QUEUED.
 *
 * GFAC jobs: the status is requested from "$serviceURL/jobstatus/<id>" and
 * parsed from the XML reply; DONE, DATA and RESULTS_GEN are all reported
 * as DATA. Unexpected status values are mailed to the admin for debugging.
 *
 * @param  string $gfacID  Job ID
 * @return string|false    Status string, 'GFAC_STATUS_UNAVAILABLE' when the
 *                         HTTP request fails, or false when the ID is
 *                         neither an Airavata nor a GFAC job
 */
function get_gfac_status( $gfacID )
{
   global $serviceURL;
   global $self;
   global $loghdr;
   global $cluster;
   global $status_ex, $status_gw;

   if ( is_aira_job( $gfacID ) )
   {
      $status_ex = getExperimentStatus( $gfacID );

      if ( $status_ex == 'EXECUTING' )
         $status_ex = ( $status_gw == 'RUNNING' ) ? 'ACTIVE' : 'QUEUED';

      return standard_status( $status_ex );
   }

   if ( ! is_gfac_job( $gfacID ) )
      return false;

   $url = "$serviceURL/jobstatus/$gfacID";
   try
   {
      $request = new HttpRequest( $url, HttpRequest::METH_GET );
      $http    = $request->send();
      $xml     = $request->getResponseBody();
   }
   catch ( HttpException $e )
   {
      write_log( "$loghdr Status not available - marking failed -  $gfacID" );
      return 'GFAC_STATUS_UNAVAILABLE';
   }

   // Parse the result
   $gfac_status = parse_response( $xml );

   // This may not seem like the best place to do this, but here we have
   // the xml straight from GFAC
   $known_statuses = array(
      'SUBMITTED', 'SUBMITED', 'INITIALIZED', 'PENDING',
      'RUNNING',   'ACTIVE',   'STARTED',
      'COMPLETED', 'FINISHED', 'DONE',
      'DATA',      'RESULTS_GEN',
      'CANCELED',  'CANCELLED',
      'FAILED',    'STAGING',  'UNKNOWN' );

   if ( ! in_array( $gfac_status, $known_statuses ) )
   {
      mail_to_admin( 'debug', "gfacID: /$gfacID/\n" .
                              "XML:    /$xml/\n"    .
                              "Status: /$gfac_status/\n" );
   }

   $data_states = array( 'DONE', 'DATA', 'RESULTS_GEN' );
   if ( in_array( $gfac_status, $data_states ) )
      return 'DATA';

   return $gfac_status;
}
760
/**
 * Ask GFAC to (re)register the outputs of a job.
 *
 * For non-GFAC jobs the local queue status is returned instead. The
 * request is only made when the job status is DONE, FAILED, COMPLETE or
 * FINISHED.
 *
 * @param  string $gfacID  Job ID
 * @return string|false    Status parsed from the GFAC reply (or the local
 *                         status for non-GFAC jobs); false when the request
 *                         is not appropriate or the HTTP call fails
 */
function get_gfac_outputs( $gfacID )
{
   global $serviceURL;
   global $self;

   // Make sure it's a GFAC job and status is appropriate for this call
   $job_status = get_gfac_status( $gfacID );

   if ( $job_status === false )
   {  // Then it's not a GFAC job
      return get_local_status( $gfacID );
   }

   $requestable = array( 'DONE', 'FAILED', 'COMPLETE', 'FINISHED' );
   if ( ! in_array( $job_status, $requestable ) )
   {  // Then it's not appropriate to request data
      return false;
   }

   $url = "$serviceURL/registeroutput/$gfacID";
   try
   {
      $request = new HttpRequest( $url, HttpRequest::METH_GET );
      $http    = $request->send();
      $xml     = $request->getResponseBody();
   }
   catch ( HttpException $e )
   {
      write_log( "$self: Data not available - request failed -  $gfacID" );
      return false;
   }

   // Temporary, to see what the xml looks like, if we ever get one
   mail_to_admin( "debug", "get_gfac_outputs/\n$xml/" );

   // Parse the result
   return parse_response( $xml );
}
802
/**
 * Extract the job status from a GFAC XML reply.
 *
 * The text of the <status> element is returned. The text of any other
 * element is stored in the global $gfac_message (the last one wins).
 * Both are reset to "" on every call, and an empty or non-string reply
 * yields "" without touching the XML parser.
 *
 * @param  string $xml  Raw XML response body
 * @return string       Content of the <status> element, or "" if none
 */
function parse_response( $xml )
{
   global $gfac_message;

   $status       = "";
   $gfac_message = "";

   // XMLReader rejects empty input (warnings on old PHP, ValueError on
   // PHP 8), so there is nothing to parse here.
   if ( ! is_string( $xml )  ||  trim( $xml ) === '' )
      return $status;

   $parser = new XMLReader();

   if ( ! $parser->xml( $xml ) )
      return $status;

   // Name of the most recently opened element; initialised so that a text
   // node can never be compared against an undefined variable.
   $name = "";

   while( $parser->read() )
   {
      $type = $parser->nodeType;

      if ( $type == XMLReader::ELEMENT )
         $name = $parser->name;

      else if ( $type == XMLReader::TEXT )
      {
         if ( $name == "status" )
            $status       = $parser->value;
         else
            $gfac_message = $parser->value;
      }
   }

   $parser->close();
   return $status;
}
832
/**
 * Get the status of a job from the local cluster's batch queue.
 *
 * Runs "squeue" on clusters whose name contains "jetstream" and
 * "/usr/bin/qstat -a" otherwise. Unless the cluster name contains "us3iab",
 * the command is run through ssh on the cluster host. The job state code
 * is taken from the last output line and mapped onto a gateway status.
 *
 * @param  string $gfacID  Job ID as known to the batch system
 * @return string          'ACTIVE', 'COMPLETED', 'SUBMITTED', 'CANCELED',
 *                         'FAILED' or 'UNKNOWN'
 */
function get_local_status( $gfacID )
{
   global $cluster;
   global $self;

   $is_jetstr = preg_match( "/jetstream/", $cluster );
   if ( $is_jetstr )
      $cmd    = "squeue -j $gfacID 2>&1|tail -n 1";
   else
      $cmd    = "/usr/bin/qstat -a $gfacID 2>&1|tail -n 1";

   if ( ! preg_match( "/us3iab/", $cluster ) )
   {
      $system = "$cluster.uthscsa.edu";
      if ( $is_jetstr )
         $system = "$cluster";
      $system = preg_replace( "/\-local/", "", $system );
      $cmd    = "/usr/bin/ssh -x us3@$system " . $cmd;
   }

   $result = exec( $cmd );

   $secwait = 2;
   $num_try = 0;
   // Sleep and retry up to 3 times if ssh has "ssh_exchange_identification" error
   while ( preg_match( "/ssh_exchange_id/", $result )  &&  $num_try < 3 )
   {
      sleep( $secwait );
      $num_try++;
      $secwait *= 2;
write_log( "$self: num_try=$num_try  secwait=$secwait" );
      // Actually re-run the command; without this the loop only slept and
      // then tested the same stale output again.
      $result = exec( $cmd );
   }

   if ( preg_match( "/^qstat: Unknown/", $result )  ||
        preg_match( "/ssh_exchange_id/", $result ) )
   {
      write_log( "$self get_local_status: Local job $gfacID unknown" );
      return 'UNKNOWN';
   }

   // State column: index 5 for squeue output, index 9 for qstat -a output.
   // A line with too few columns is treated as an empty state (-> FAILED),
   // which is what the undefined offset used to evaluate to.
   $values = preg_split( "/\s+/", $result );
   $column = ( $is_jetstr == 0 ) ? 9 : 5;
   $jstat  = isset( $values[ $column ] ) ? $values[ $column ] : "";

   switch ( $jstat )
   {
      case "W" :                      // Waiting for execution time to be reached
      case "E" :                      // Job is exiting after having run
      case "R" :                      // Still running
      case "CG" :                     // Job is completing
        $status = 'ACTIVE';
        break;

      case "C" :                      // Job has completed
      case "ST" :                     // Job has disappeared
      case "CD" :                     // Job has completed
        $status = 'COMPLETED';
        break;

      case "T" :                      // Job is being moved
      case "H" :                      // Held
      case "Q" :                      // Queued
      case "PD" :                     // Queued
      case "CF" :                     // Queued
        $status = 'SUBMITTED';
        break;

      case "CA" :                     // Job has been canceled
        $status = 'CANCELED';
        break;

      case "F" :                      // Job has failed
      case "BF" :                     // Job has failed
      case "NF" :                     // Job has failed
      case "TO" :                     // Job has timed out
      case "" :                       // Job has disappeared
        $status = 'FAILED';
        break;

      default :
        $status = 'UNKNOWN';          // This should not occur
        break;
   }

   return $status;
}
929
/**
 * Append a message to the queue_messages table for the current job
 * (global $gfacID).
 *
 * The job's analysis-table id is looked up first. If the lookup fails or
 * no such job exists, nothing is inserted and the problem is logged.
 *
 * @param string $message  Text to record; it is SQL-escaped here
 */
function update_queue_messages( $message )
{
   global $self;
   global $gLink;
   global $gfacID;

   // Get analysis table ID
   $query  = "SELECT id FROM analysis " .
             "WHERE gfacID = '$gfacID' ";
   $result = mysql_query( $query, $gLink );
   if ( ! $result )
   {
      write_log( "$self: Query failed $query - " . mysql_error( $gLink ) );
      return;
   }

   $row = mysql_fetch_array( $result );
   if ( ! $row )
   {  // No such job: do not insert a message tied to an empty analysisID
      write_log( "$self: No analysis entry for gfacID $gfacID - message not queued" );
      return;
   }
   list( $analysisID ) = $row;

   // Insert message into queue_message table
   $query  = "INSERT INTO queue_messages SET " .
             "message = '" . mysql_real_escape_string( $message, $gLink ) . "', " .
             "analysisID = '$analysisID' ";
   $result = mysql_query( $query, $gLink );
   if ( ! $result )
   {
      write_log( "$self: Query failed $query - " . mysql_error( $gLink ) );
      return;
   }
}
958
/**
 * Store a message as lastMessage of the current job (global $gfacID) in
 * the HPCAnalysisResult table of the job's US3 database (global $us3_db).
 *
 * @param  string $message  Text to record; it is SQL-escaped here
 * @return int|null         0 when the connection or DB selection fails,
 *                          otherwise no value
 */
function update_db( $message )
{
   global $self;
   global $gfacID;
   global $dbhost;
   global $user;
   global $passwd;
   global $us3_db;

   $us3_link = mysql_connect( $dbhost, $user, $passwd );

   if ( ! $us3_link )
   {
      // The password is deliberately kept out of logs and mail
      write_log( "$self: could not connect: $dbhost, $user" );
      mail_to_admin( "fail", "Could not connect to $dbhost" );
      return 0;
   }

   $result = mysql_select_db( $us3_db, $us3_link );

   if ( ! $result )
   {
      write_log( "$self: could not select DB $us3_db" );
      mail_to_admin( "fail", "Could not select DB $us3_db, $dbhost, $user" );
      mysql_close( $us3_link );   // do not leak the connection on failure
      return 0;
   }

   $query = "UPDATE HPCAnalysisResult SET " .
            "lastMessage='" . mysql_real_escape_string( $message, $us3_link ) . "' " .
            "WHERE gfacID = '$gfacID' ";

   if ( ! mysql_query( $query, $us3_link ) )
      write_log( "$self: Query failed $query - " . mysql_error( $us3_link ) );

   mysql_close( $us3_link );
}
994
/**
 * E-mail a notification about the current job to the administrator.
 *
 * The mail goes to the global $admin_email and reports the globals
 * $updateTime, $status and $cluster followed by the given message.
 *
 * @param string $type  Notification category passed by callers ("fail",
 *                      "hang", "debug"); not used in the mail itself
 * @param string $msg   Error text appended to the mail body
 */
function mail_to_admin( $type, $msg )
{
   global $updateTime;
   global $status;
   global $cluster;
   global $org_name;
   global $admin_email;
   global $dbhost;
   global $requestID;

   $now = time();

   $header_lines = array(
      "From: $org_name Admin<$admin_email>",
      "Cc: $org_name Admin<$admin_email>",
      "Bcc: Gary Gorbet<gegorbet@gmail.com>",               // make sure

      // Set the reply address
      "Reply-To: $org_name<$admin_email>",
      "Return-Path: $org_name<$admin_email>",

      // Try to avoid spam filters
      "Message-ID: <" . $now . "gridctl@$dbhost>$requestID",
      "X-Mailer: PHP v" . phpversion(),
      "MIME-Version: 1.0",
      "Content-Transfer-Encoding: 8bit" );

   // Every header line, including the last one, ends with a newline
   $headers = implode( "\n", $header_lines ) . "\n";

   $subject = "US3 Error Notification";

   $message = "\n"
            . " UltraScan job error notification from gridctl.php:\n"
            . "\n"
            . " Update Time : $updateTime\n"
            . " GFAC Status : $status\n"
            . " Cluster : $cluster\n"
            . " "
            . "Error Message : $msg\n";

   mail( $admin_email, $subject, $message, $headers );
}
1033
/**
 * Convert a status string to one of the standard DB status strings.
 *
 * Values that have no mapping (including the already-standard ERROR,
 * RUNNING, UPDATING, CANCELED, DATA, FAILED, DONE and FINISHED) are
 * returned unchanged.
 *
 * @param  string $status_in  Status reported by Airavata/GFAC/gateway
 * @return string             Standard gateway status
 */
function standard_status( $status_in )
{
   switch ( $status_in )
   {  // Map variations to standard gateway status values
      case 'QUEUED' :
      case 'LAUNCHED' :
      case 'CREATED' :
      case 'VALIDATED' :
      case 'SCHEDULED' :
      case 'submitted' :
      case 'SUBMITTED' :
      case '' :
         return 'SUBMITTED';

      case 'EXECUTING' :
      case 'ACTIVE' :
      case 'running' :
      case 'executing' :
         return 'RUNNING';

      case 'PENDING' :
      case 'CANCELING' :
         return 'UPDATING';

      case 'CANCELLED' :
      case 'canceled' :
         return 'CANCELED';

      // An unreachable "$status = 'DATA'" assignment with no case label
      // used to sit here; it never executed, so it has been removed.

      case 'COMPLETED' :
      case 'completed' :
         return 'COMPLETE';

      case 'FAILED_DATA' :
      case 'SUBMIT_TIMEOUT' :
      case 'RUN_TIMEOUT' :
      case 'DATA_TIMEOUT' :
         return 'FAILED';

      case 'COMPLETE' :
         return 'DONE';

      case 'UNKNOWN' :
         return 'ERROR';

      // Where already standard value (ERROR, RUNNING, UPDATING, CANCELED,
      // DATA, FAILED, DONE, FINISHED) or unrecognised, retain value
      default :
         return $status_in;
   }
}
1107
/**
 * Determine the true current status of an Airavata/Thrift job.
 *
 * Jobs that are not Airavata jobs, or whose prod/devel flavour does not
 * match this server's ($class_dir), keep the status passed in. Otherwise
 * the experiment status is fetched and combined with the gateway status:
 *   - experiment COMPLETED: COMPLETE if the gateway already says
 *     FINISHED/DONE, else DONE;
 *   - gateway FINISHED/DONE: kept, except that an experiment status of
 *     FAILED is re-checked after 10 and 20 seconds and the job is then
 *     set to COMPLETE;
 *   - experiment EXECUTING: the standardised gateway status;
 *   - anything else: the standardised experiment status.
 * A changed status is stored through update_job_status().
 *
 * @param  string $gfacID     Job ID
 * @param  string $status_in  Status currently stored in the gateway DB
 * @return string             Resulting status
 */
function aira_status( $gfacID, $status_in )
{
   global $self;
   global $loghdr;
   global $class_dir;

   $status_gw        = $status_in;
   $status           = $status_in;

   $server_is_devel  = (bool) preg_match( "/class_devel/", $class_dir );
   $job_is_devel     = (bool) preg_match( "/US3-ADEV/i", $gfacID );
   $is_aira          = preg_match( "/US3-A/i", $gfacID );

   // Only jobs of this server's own flavour (prod/devel) are examined
   if ( ! $is_aira  ||  $server_is_devel !== $job_is_devel )
      return $status;

   $status_ex   = getExperimentStatus( $gfacID );
   $gw_finished = ( $status_gw == 'FINISHED'  ||  $status_gw == 'DONE' );

   if ( $status_ex == 'COMPLETED' )
   {  // COMPLETED + FINISHED/DONE     : gateway status is now COMPLETE
      // COMPLETED + NOT-FINISHED/DONE : gateway status is now DONE
      $status = $gw_finished ? 'COMPLETE' : 'DONE';
   }

   else if ( $gw_finished )
   {  // Gfac status == FINISHED/DONE:  leave as is (unless FAILED)
      if ( $status_ex == 'FAILED' )
      {
         sleep( 10 );
         $status_ex = getExperimentStatus( $gfacID );

         if ( $status_ex == 'FAILED' )
         {
            write_log( "$loghdr status still 'FAILED' after 10-second delay" );
            sleep( 10 );
            $status_ex = getExperimentStatus( $gfacID );

            $retry_note = ( $status_ex == 'FAILED' )
               ? "$loghdr status still 'FAILED' after 20-second delay"
               : "$loghdr status is $status_ex after 20-second delayed retry";
            write_log( $retry_note );
         }

         write_log( "$loghdr status reset to 'COMPLETE'" );
         $status = 'COMPLETE';
      }
   }

   else if ( $status_ex == 'EXECUTING' )
   {
      $status = standard_status( $status_gw );
write_log( "$loghdr status/_in/_gw/_ex=$status/$status_in/$status_gw/$status_ex" );
   }

   else
   {  // Experiment not COMPLETED/FINISHED/DONE: use experiment status
      $status = standard_status( $status_ex );
   }

   if ( $status != $status_gw )
      update_job_status( $status, $gfacID );

   return $status;
}
1184
1185?>
Note: See TracBrowser for help on using the repository browser.