source: trunk/gridctl.php@ 3

Last change on this file since 3 was 3, checked in by us3, 12 years ago

Added DONE status, check for other unknown statuses that are added

File size: 20.3 KB
RevLine 
[1]1<?php
2
3include_once "/home/us3/bin/listen-config.php";
4include "/home/us3/bin/cleanup.php";
5
// Global variables
// These are shared with the functions below through `global` declarations.
$gfac_message = "";   // set by parse_response()
$updateTime   = 0;    // overwritten per row below and by get_us3_data()
$submittime   = 0;
$cluster      = '';   // overwritten per row below; read by get_local_status()

// Produce some output temporarily, so cron will send me message
$now = time();
//echo "Time started: " . date( 'Y-m-d H:i:s', $now ) . "\n";

// Get data from global GFAC DB
$gLink = mysql_connect( $dbhost, $guser, $gpasswd );

// Report a failed connection as such, instead of letting it surface
// later as a misleading "Could not select DB" error
if ( ! $gLink )
{
  write_log( "$self: Could not connect to $dbhost - " . mysql_error() );
  mail_to_admin( "fail", "Internal Error: Could not connect to $dbhost" );
  exit();
}

if ( ! mysql_select_db( $gDB, $gLink ) )
{
  write_log( "$self: Could not select DB $gDB - " . mysql_error( $gLink ) );
  mail_to_admin( "fail", "Internal Error: Could not select DB $gDB" );
  exit();
}

$query  = "SELECT gfacID, us3_db, cluster, status, queue_msg, " .
          "UNIX_TIMESTAMP(time), time from analysis";
$result = mysql_query( $query, $gLink );

if ( ! $result )
{
  write_log( "$self: Query failed $query - " . mysql_error( $gLink ) );
  mail_to_admin( "fail", "Query failed $query\n" . mysql_error( $gLink ) );
  exit();
}

if ( mysql_num_rows( $result ) == 0 )
  exit();  // Nothing to do

// Each row is unpacked into script-level variables; the handler functions
// pick up $gfacID, $us3_db, $cluster and $status via `global`.
// $time is the UNIX_TIMESTAMP(time) column, $updateTime the raw time column.
while ( list( $gfacID, $us3_db, $cluster, $status, $queue_msg, $time, $updateTime )
          = mysql_fetch_array( $result ) )
{
  // Checking we need to do for each entry

  // Sometimes during testing, the us3_db entry is not set
  // If $status == 'ERROR' then the condition has been processed before
  if ( strlen( $us3_db ) == 0 && $status != 'ERROR' )
  {
    write_log( "$self: GFAC DB is NULL - $gfacID" );
    mail_to_admin( "fail", "GFAC DB is NULL\n$gfacID" );

    $query2  = "UPDATE analysis SET status='ERROR' WHERE gfacID='$gfacID'";
    $result2 = mysql_query( $query2, $gLink );
    $status  = 'ERROR';

    if ( ! $result2 )
      write_log( "$self: Query failed $query2 - " . mysql_error( $gLink ) );
  }

  switch ( $status )
  {
    // Already been handled
    // Later update this condition to search for gfacID?
    case "ERROR":
      cleanup();
      break;

    case "SUBMITTED":
      submitted( $time );
      break;

    case "SUBMIT_TIMEOUT":
      submit_timeout( $time );
      break;

    case "RUNNING":
      running( $time );
      break;

    case "RUN_TIMEOUT":
      run_timeout( $time );
      break;

    case "DATA":
      wait_data( $time );
      break;

    case "DATA_TIMEOUT":
      data_timeout( $time );
      break;

    case "COMPLETE":
      complete();
      break;

    case "CANCELLED":
    case "CANCELED":
    case "FAILED":
      failed();
      break;

    // Any other status is left untouched
    default:
      break;
  }
}

exit();
109
// Handle an analysis entry whose status is SUBMITTED.
//   $updatetime - Unix timestamp of the entry, as passed by the caller
// Nothing is done during the first 10 minutes.  Up to 24 hours the job
// status is polled and, if it is no longer a "submitted" state, recorded
// via update_job_status().  After 24 hours the entry is marked
// SUBMIT_TIMEOUT and the admin is notified.
function submitted( $updatetime )
{
  global $self, $gLink, $gfacID;

  $age = time() - $updatetime;

  // < 10 minutes ago
  if ( $age < 600 )
    return;

  if ( $age >= 86400 )
  {
    // Listed as submitted for 24 hours or more
    $message = "Job listed submitted longer than 24 hours";
    write_log( "$self: $message - id: $gfacID" );
    mail_to_admin( "hang", "$message - id: $gfacID" );

    $query = "UPDATE analysis SET status='SUBMIT_TIMEOUT' WHERE gfacID='$gfacID'";
    if ( ! mysql_query( $query, $gLink ) )
      write_log( "$self: Query failed $query - " . mysql_error( $gLink ) );

    update_queue_messages( $message );
    update_db( $message );
    return;
  }

  // Within the first 24 hours: ask GFAC, falling back to the local cluster
  // when get_gfac_status() reports this is not a GFAC job
  $job_status = get_gfac_status( $gfacID );
  if ( $job_status === false )
    $job_status = get_local_status( $gfacID );

  if ( $job_status == 'GFAC_STATUS_UNAVAILABLE' )
    return;

  $still_submitted = array( 'SUBMITTED', 'INITIALIZED', 'PENDING' );
  if ( ! in_array( $job_status, $still_submitted ) )
    update_job_status( $job_status, $gfacID );
}
146
// Handle an analysis entry whose status is SUBMIT_TIMEOUT.
//   $updatetime - Unix timestamp of the entry, as passed by the caller
// The job status is polled first; any status other than a "submitted"
// state is recorded via update_job_status().  If the job is still
// submitted and the timestamp is at least 24 hours old, the entry is
// marked FAILED and the admin is notified.
function submit_timeout( $updatetime )
{
  global $self, $gLink, $gfacID;

  // Ask GFAC, falling back to the local cluster for non-GFAC jobs
  $job_status = get_gfac_status( $gfacID );
  if ( $job_status === false )
    $job_status = get_local_status( $gfacID );

  if ( $job_status == 'GFAC_STATUS_UNAVAILABLE' )
    return;

  $still_submitted = array( 'SUBMITTED', 'INITIALIZED', 'PENDING' );
  if ( ! in_array( $job_status, $still_submitted ) )
  {
    update_job_status( $job_status, $gfacID );
    return;
  }

  // < 24 hours ago ( 48 total submitted )
  if ( time() - $updatetime < 86400 )
    return;

  $message = "Job listed submitted longer than 48 hours";
  write_log( "$self: $message - id: $gfacID" );
  mail_to_admin( "hang", "$message - id: $gfacID" );

  $query = "UPDATE analysis SET status='FAILED' WHERE gfacID='$gfacID'";
  if ( ! mysql_query( $query, $gLink ) )
    write_log( "$self: Query failed $query - " . mysql_error( $gLink ) );

  update_queue_messages( $message );
  update_db( $message );
}
181
// Handle an analysis entry whose status is RUNNING.
//   $updatetime - Unix timestamp of the entry, as passed by the caller
// get_us3_data() is always called first (it refreshes the global
// $updateTime).  Nothing further is done during the first 10 minutes.
// Up to 24 hours the job status is polled and anything other than ACTIVE
// is recorded via update_job_status().  After 24 hours the entry is
// marked RUN_TIMEOUT and the admin is notified.
function running( $updatetime )
{
  global $self, $gLink, $gfacID;

  $started_check = time();

  get_us3_data();

  $age = $started_check - $updatetime;

  // message received < 10 minutes ago
  if ( $age < 600 )
    return;

  if ( $age >= 86400 )
  {
    // Listed as running for 24 hours or more
    $message = "Job listed running longer than 24 hours";
    write_log( "$self: $message - id: $gfacID" );
    mail_to_admin( "hang", "$message - id: $gfacID" );

    $query = "UPDATE analysis SET status='RUN_TIMEOUT' WHERE gfacID='$gfacID'";
    if ( ! mysql_query( $query, $gLink ) )
      write_log( "$self: Query failed $query - " . mysql_error( $gLink ) );

    update_queue_messages( $message );
    update_db( $message );
    return;
  }

  // Within the first 24 hours: ask GFAC, falling back to the local cluster
  $job_status = get_gfac_status( $gfacID );
  if ( $job_status === false )
    $job_status = get_local_status( $gfacID );

  if ( $job_status == 'GFAC_STATUS_UNAVAILABLE' )
    return;

  if ( $job_status != 'ACTIVE' )
    update_job_status( $job_status, $gfacID );
}
220
// Handle an analysis entry whose status is RUN_TIMEOUT.
//   $updatetime - Unix timestamp of the entry, as passed by the caller
// The job status is polled first; anything other than ACTIVE is recorded
// via update_job_status().  If the job is still ACTIVE and the timestamp
// is at least 48 hours (172800 s) old, the entry is marked FAILED and the
// admin is notified.
function run_timeout( $updatetime )
{
  global $self, $gLink, $gfacID;

  // Ask GFAC, falling back to the local cluster for non-GFAC jobs
  $job_status = get_gfac_status( $gfacID );
  if ( $job_status === false )
    $job_status = get_local_status( $gfacID );

  if ( $job_status == 'GFAC_STATUS_UNAVAILABLE' )
    return;

  if ( $job_status != 'ACTIVE' )
  {
    update_job_status( $job_status, $gfacID );
    return;
  }

  $checked_at = time();

  get_us3_data();

  // < 48 hours ago
  if ( $checked_at - $updatetime < 172800 )
    return;

  $message = "Job listed running longer than 48 hours";
  write_log( "$self: $message - id: $gfacID" );
  mail_to_admin( "hang", "$message - id: $gfacID" );

  $query = "UPDATE analysis SET status='FAILED' WHERE gfacID='$gfacID'";
  if ( ! mysql_query( $query, $gLink ) )
    write_log( "$self: Query failed $query - " . mysql_error( $gLink ) );

  update_queue_messages( $message );
  update_db( $message );
}
257
// Handle an analysis entry whose status is DATA (waiting for results).
//   $updatetime - Unix timestamp of the entry, as passed by the caller
// During the first hour the job status is polled; anything other than
// DATA is recorded via update_job_status().  While still DATA, a resend
// of the outputs is requested when the current minute is a multiple of 5.
// After one hour the entry is marked DATA_TIMEOUT and the admin is
// notified.
function wait_data( $updatetime )
{
  global $self, $gLink, $gfacID;

  if ( time() - $updatetime >= 3600 )
  {
    // Waited an hour or more
    $message = "Waiting for data longer than 1 hour";
    write_log( "$self: $message - id: $gfacID" );
    mail_to_admin( "hang", "$message - id: $gfacID" );

    $query = "UPDATE analysis SET status='DATA_TIMEOUT' WHERE gfacID='$gfacID'";
    if ( ! mysql_query( $query, $gLink ) )
      write_log( "$self: Query failed $query - " . mysql_error( $gLink ) );

    update_queue_messages( $message );
    update_db( $message );
    return;
  }

  // Within the first hour: ask GFAC, falling back to the local cluster
  $job_status = get_gfac_status( $gfacID );
  if ( $job_status === false )
    $job_status = get_local_status( $gfacID );

  if ( $job_status == 'GFAC_STATUS_UNAVAILABLE' )
    return;

  if ( $job_status != 'DATA' )
  {
    update_job_status( $job_status, $gfacID );
    return;
  }

  // Request to resend data, but only request every 5 minutes
  $minute = (int) date( 'i' );
  if ( $minute % 5 != 0 )
    return;

  $output_status = get_gfac_outputs( $gfacID );

  if ( $output_status !== false )
    mail_to_admin( "debug", "wait_data/$gfacID/$output_status" );
}
304
// Handle an analysis entry whose status is DATA_TIMEOUT.
//   $updatetime - Unix timestamp of the entry, as passed by the caller
// The job status is polled first; anything other than DATA is recorded
// via update_job_status().  While still DATA and less than 24 hours old,
// a resend of the outputs is requested when the current minute is a
// multiple of 15.  After 24 hours the entry is marked FAILED and the
// admin is notified.
function data_timeout( $updatetime )
{
  global $self, $gLink, $gfacID;

  // Ask GFAC, falling back to the local cluster for non-GFAC jobs
  $job_status = get_gfac_status( $gfacID );
  if ( $job_status === false )
    $job_status = get_local_status( $gfacID );

  if ( $job_status == 'GFAC_STATUS_UNAVAILABLE' )
    return;

  if ( $job_status != 'DATA' )
  {
    update_job_status( $job_status, $gfacID );
    return;
  }

  if ( time() - $updatetime >= 86400 )
  {
    // Waited 24 hours or more
    $message = "Waiting for data longer than 24 hours";
    write_log( "$self: $message - id: $gfacID" );
    mail_to_admin( "hang", "$message - id: $gfacID" );

    $query = "UPDATE analysis SET status='FAILED' WHERE gfacID='$gfacID'";
    if ( ! mysql_query( $query, $gLink ) )
      write_log( "$self: Query failed $query - " . mysql_error( $gLink ) );

    update_queue_messages( $message );
    update_db( $message );
    return;
  }

  // < 24 hours ago
  // Request to resend data, but only request every 15 minutes
  $minute = (int) date( 'i' );
  if ( $minute % 15 != 0 )
    return;

  $output_status = get_gfac_outputs( $gfacID );

  if ( $output_status !== false )
    mail_to_admin( "debug", "data_timeout/$gfacID/$output_status" );
}
351
// Handle an analysis entry whose status is COMPLETE.
function complete()
{
  // Nothing to do here except hand the entry over to cleanup()
  cleanup();
}
357
// Handle an analysis entry whose status is FAILED, CANCELED or CANCELLED.
function failed()
{
  // Nothing to do here except hand the entry over to cleanup()
  cleanup();
}
363
// Hand the current entry (global $gfacID) over to gfac_cleanup().
// The call is skipped when the entry is no longer in the analysis table,
// or when get_us3_data() yields no request ID for it.
function cleanup()
{
  global $self, $gLink, $gfacID, $us3_db;

  // Double check that the gfacID exists
  $query  = "SELECT count(*) FROM analysis WHERE gfacID='$gfacID'";
  $result = mysql_query( $query, $gLink );

  if ( ! $result )
  {
    $db_error = mysql_error( $gLink );
    write_log( "$self: Query failed $query - " . $db_error );
    mail_to_admin( "fail", "Query failed $query\n" . mysql_error( $gLink ) );
    return;
  }

  list( $matching_rows ) = mysql_fetch_row( $result );

  if ( $matching_rows != 0 )
  {
    // Now check the us3 instance
    $requestID = get_us3_data();

    if ( $requestID != 0 )
      gfac_cleanup( $us3_db, $requestID, $gLink );
  }
}
392
393// Function to update status of job
// Records a polled job status for an entry.
//   $job_status - status string as returned by get_gfac_status() or
//                 get_local_status()
//   $gfacID     - ID of the entry in the analysis table
// Maps the polled status onto the status values stored in the analysis
// table, runs the UPDATE (when there is one), and passes a descriptive
// message to update_queue_messages() and update_db().
// Note: those two helpers read the global $gfacID, not this parameter.
function update_job_status( $job_status, $gfacID )
{
  global $self;   // used in the log messages below
  global $gLink;

  // Stays empty for statuses that must not change the analysis table
  $query = "";

  switch ( $job_status )
  {
    case 'SUBMITTED'   :
    case 'SUBMITED'    :
    case 'INITIALIZED' :
    case 'PENDING'     :
      $query   = "UPDATE analysis SET status='SUBMITTED' WHERE gfacID='$gfacID'";
      $message = "Job status request reports job is SUBMITTED";
      break;

    case 'ACTIVE'      :
      $query   = "UPDATE analysis SET status='RUNNING' WHERE gfacID='$gfacID'";
      $message = "Job status request reports job is RUNNING";
      break;

    case 'COMPLETED'   :
    case 'DONE'        :
      $query   = "UPDATE analysis SET status='COMPLETE' WHERE gfacID='$gfacID'";
      $message = "Job status request reports job is COMPLETE";
      break;

    case 'DATA'        :
      $query   = "UPDATE analysis SET status='DATA' WHERE gfacID='$gfacID'";
      $message = "Job status request reports job is COMPLETE, waiting for data";
      break;

    case 'CANCELED'    :
    case 'CANCELLED'   :
      $query   = "UPDATE analysis SET status='CANCELED' WHERE gfacID='$gfacID'";
      $message = "Job status request reports job is CANCELED";
      break;

    case 'FAILED'      :
      $query   = "UPDATE analysis SET status='FAILED' WHERE gfacID='$gfacID'";
      $message = "Job status request reports job is FAILED";
      break;

    case 'UNKNOWN'     :
      // No status change for now, only a message
      // $query = "UPDATE analysis SET status='ERROR' WHERE gfacID='$gfacID'";
      $message = "Job status request reports job is not in the queue";
      break;

    default            :
      // We shouldn't ever get here
      $message = "Job status was not recognized - $job_status";
      write_log( "$self - update_job_status: " .
                 "Job status was not recognized - $job_status\n" .
                 "gfacID = $gfacID\n" );
      break;
  }

  // Only touch the database when a status change was actually selected;
  // running an empty query would just log a spurious "Query failed"
  if ( $query != "" )
  {
    $result = mysql_query( $query, $gLink );
    if ( ! $result )
      write_log( "$self: Query failed $query - " . mysql_error( $gLink ) );
  }

  update_queue_messages( $message );
  update_db( $message );
}
458
// Looks up the current entry (global $gfacID) in the HPCAnalysisResult
// table of the us3 database named by the global $us3_db.
// Side effect: the global $updateTime is set to UNIX_TIMESTAMP(updateTime)
// of the row that was found.
// Returns the HPCAnalysisRequestID of that row, or 0 when the database
// could not be reached or queried.  When no row matches, the returned
// value is empty (callers compare it with == 0).
function get_us3_data()
{
  global $self;
  global $gfacID;
  global $dbhost;
  global $user;
  global $passwd;
  global $us3_db;
  global $updateTime;

  $us3_link = mysql_connect( $dbhost, $user, $passwd );

  if ( ! $us3_link )
  {
    // The password is deliberately kept out of logs and e-mail
    write_log( "$self: could not connect: $dbhost, $user" );
    mail_to_admin( "fail", "Could not connect to $dbhost" );
    return 0;
  }

  if ( ! mysql_select_db( $us3_db, $us3_link ) )
  {
    write_log( "$self: could not select DB $us3_db" );
    mail_to_admin( "fail", "Could not select DB $us3_db, $dbhost, $user" );
    mysql_close( $us3_link );   // don't leak the link on failure
    return 0;
  }

  $query  = "SELECT HPCAnalysisRequestID, UNIX_TIMESTAMP(updateTime) " .
            "FROM HPCAnalysisResult WHERE gfacID='$gfacID'";
  $result = mysql_query( $query, $us3_link );

  if ( ! $result )
  {
    // Fetch the error text before the link is closed
    $db_error = mysql_error( $us3_link );
    write_log( "$self: Query failed $query - " . $db_error );
    mail_to_admin( "fail", "Query failed $query\n" . $db_error );
    mysql_close( $us3_link );   // don't leak the link on failure
    return 0;
  }

  list( $requestID, $updateTime ) = mysql_fetch_array( $result );
  mysql_close( $us3_link );

  return $requestID;
}
504
505// Function to determine if this is a gfac job or a local job
// Returns true when $gfacID looks like a GFAC job ID, false otherwise.
// Two forms are accepted: anything starting with "US3-Experiment"
// (case-insensitive), or "US3-" followed by five hyphen-separated groups
// of 8, 4, 4, 4 and 12 hexadecimal digits.
function is_gfac_job( $gfacID )
{
  if ( preg_match( "/^US3-Experiment/i", $gfacID ) )
    return true;

  $h          = "[0-9a-fA-F]";
  $id_pattern = "/^US3-{$h}{8}-{$h}{4}-{$h}{4}-{$h}{4}-{$h}{12}$/";

  if ( preg_match( $id_pattern, $gfacID ) )
    return true;

  // Then it's not a GFAC job
  return false;
}
518
519// Function to get the current job status from GFAC
// Asks the GFAC service for the status of a job.
//   $gfacID - job ID to look up
// Returns
//   false                      when is_gfac_job() rejects the ID,
//   'GFAC_STATUS_UNAVAILABLE'  when the HTTP request throws, or
//   the status string parsed from the XML response by parse_response().
// A status outside the known list is still returned, but the raw response
// is also mailed to the admin as a "debug" message.
function get_gfac_status( $gfacID )
{
  global $self;        // used in the log message below
  global $serviceURL;

  if ( ! is_gfac_job( $gfacID ) )
    return false;

  $url = "$serviceURL/jobstatus/$gfacID";
  try
  {
    $post = new HttpRequest( $url, HttpRequest::METH_GET );
    $http = $post->send();
    $xml  = $post->getResponseBody();
  }
  catch ( HttpException $e )
  {
    write_log( "$self: Status not available - marking failed - $gfacID" );
    return 'GFAC_STATUS_UNAVAILABLE';
  }

  // Parse the result
  $gfac_status = parse_response( $xml );

  // This may not seem like the best place to do this, but here we have
  // the xml straight from GFAC
  $status_types = array( 'SUBMITTED',
                         'SUBMITED',
                         'INITIALIZED',
                         'PENDING',
                         'ACTIVE',
                         'COMPLETED',
                         'DONE',
                         'DATA',
                         'CANCELED',
                         'CANCELLED',
                         'FAILED',
                         'UNKNOWN' );

  if ( ! in_array( $gfac_status, $status_types ) )
    mail_to_admin( 'debug', "gfacID: /$gfacID/\n" .
                            "XML:    /$xml/\n"    .
                            "Status: /$gfac_status/\n" );

  return $gfac_status;
}
564
565// Function to request data outputs from GFAC
// Asks the GFAC service to register (resend) the outputs of a job.
//   $gfacID - job ID whose outputs are requested
// Returns false when the ID is not a GFAC job, when the current job status
// is not one for which data may be requested, or when the HTTP request
// throws.  Otherwise returns the status string parsed from the XML
// response by parse_response().
function get_gfac_outputs( $gfacID )
{
  global $self;        // used in the log message below
  global $serviceURL;

  // Make sure it's a GFAC job and status is appropriate for this call
  if ( ( $job_status = get_gfac_status( $gfacID ) ) === false )
  {
    // Then it's not a GFAC job
    return false;
  }

  // NOTE(review): get_gfac_status() lists 'COMPLETED' among its known
  // statuses, not 'COMPLETE' -- confirm which spelling is intended here.
  if ( ! in_array( $job_status, array( 'DONE', 'FAILED', 'COMPLETE' ) ) )
  {
    // Then it's not appropriate to request data
    return false;
  }

  $url = "$serviceURL/registeroutput/$gfacID";
  try
  {
    $post = new HttpRequest( $url, HttpRequest::METH_GET );
    $http = $post->send();
    $xml  = $post->getResponseBody();
  }
  catch ( HttpException $e )
  {
    write_log( "$self: Data not available - request failed - $gfacID" );
    return false;
  }

  // Temporary, to see what the xml looks like, if we ever get one
  mail_to_admin( "debug", "get_gfac_outputs/\n$xml/" );

  // Parse the result
  $gfac_status = parse_response( $xml );

  return $gfac_status;
}
604
// Extracts the job status from an XML response.
//   $xml - XML document as a string
// Returns the text of the last <status> element, or "" when there is none
// or when $xml is empty or cannot be loaded.
// Side effect: the global $gfac_message is reset to "" and then set to the
// text of the last element that is not named "status".
function parse_response( $xml )
{
  global $gfac_message;

  $status       = "";
  $gfac_message = "";

  // XMLReader cannot load an empty document; there is nothing to parse
  if ( ! is_string( $xml ) || $xml === "" )
    return $status;

  $parser = new XMLReader();
  if ( ! $parser->xml( $xml ) )
    return $status;

  // Name of the most recently opened element
  $name = "";

  while ( $parser->read() )
  {
    $type = $parser->nodeType;

    if ( $type == XMLReader::ELEMENT )
      $name = $parser->name;

    else if ( $type == XMLReader::TEXT )
    {
      if ( $name == "status" )
        $status       = $parser->value;
      else
        $gfac_message = $parser->value;
    }
  }

  $parser->close();
  return $status;
}
634
635// Function to get status from local cluster
// Asks the local cluster (global $cluster) for the status of a job by
// running `qstat -a` over ssh.
//   $gfacID - job ID to look up
// Returns 'ACTIVE', 'COMPLETED', 'SUBMITTED' or 'UNKNOWN', derived from
// the tenth whitespace-separated field of the last line of output.
function get_local_status( $gfacID )
{
  global $self;      // used in the log message below
  global $cluster;

  $system = "$cluster.uthscsa.edu";
  $system = preg_replace( "/\-local/", "", $system );

  // Both values come from the database, so quote them for the local shell
  $cmd    = "/usr/bin/ssh -x " . escapeshellarg( "us3@$system" ) .
            " qstat -a " . escapeshellarg( $gfacID ) . " 2>&1";

  // exec() returns the last line of the command's output
  $result = exec( $cmd );

  if ( $result == "" || preg_match( "/^qstat: Unknown/", $result ) )
  {
    write_log( "$self get_local_status: Local job $gfacID unknown" );
    return 'UNKNOWN';
  }

  $values = preg_split( "/\s+/", $result );
  // A line with fewer than ten fields carries no state column
  $state  = isset( $values[ 9 ] ) ? $values[ 9 ] : "";
//  write_log( "$self: get_local_status: job status = /$state/");

  switch ( $state )
  {
    case "W" :                      // Waiting for execution time to be reached
    case "E" :                      // Job is exiting after having run
    case "R" :                      // Still running
      $status = 'ACTIVE';
      break;

    case "C" :                      // Job has completed
      $status = 'COMPLETED';
      break;

    case "T" :                      // Job is being moved
    case "H" :                      // Held
    case "Q" :                      // Queued
      $status = 'SUBMITTED';
      break;

    default :
      $status = 'UNKNOWN';          // This should not occur
      break;
  }

  return $status;
}
679
// Appends a message to the queue_messages table for the current entry
// (global $gfacID).
//   $message - text to store
// Failures are logged; nothing is returned.
function update_queue_messages( $message )
{
  global $self;
  global $gLink;
  global $gfacID;

  // Get analysis table ID
  $query  = "SELECT id FROM analysis " .
            "WHERE gfacID = '$gfacID' ";
  $result = mysql_query( $query, $gLink );
  if ( ! $result )
  {
    write_log( "$self: Query failed $query - " . mysql_error( $gLink ) );
    return;
  }

  $row = mysql_fetch_array( $result );
  if ( ! $row )
  {
    // Without an analysis ID the INSERT below would be malformed SQL
    write_log( "$self: No analysis entry found for gfacID $gfacID" );
    return;
  }
  $analysisID = (int) $row[ 0 ];

  // Insert message into queue_message table
  $query  = "INSERT INTO queue_messages SET " .
            "message = '" . mysql_real_escape_string( $message, $gLink ) . "'," .
            "analysisID = $analysisID ";
  $result = mysql_query( $query, $gLink );
  if ( ! $result )
  {
    write_log( "$self: Query failed $query - " . mysql_error( $gLink ) );
    return;
  }
}
708
// Stores a message as lastMessage in the HPCAnalysisResult table of the
// us3 database (global $us3_db) for the current entry (global $gfacID).
//   $message - text to store
// Returns 0 when the database cannot be reached or selected; otherwise
// returns nothing.  Failures are logged.
function update_db( $message )
{
  global $self;
  global $gfacID;
  global $dbhost;
  global $user;
  global $passwd;
  global $us3_db;

  $us3_link = mysql_connect( $dbhost, $user, $passwd );

  if ( ! $us3_link )
  {
    // The password is deliberately kept out of logs and e-mail
    write_log( "$self: could not connect: $dbhost, $user" );
    mail_to_admin( "fail", "Could not connect to $dbhost" );
    return 0;
  }

  if ( ! mysql_select_db( $us3_db, $us3_link ) )
  {
    write_log( "$self: could not select DB $us3_db" );
    mail_to_admin( "fail", "Could not select DB $us3_db, $dbhost, $user" );
    mysql_close( $us3_link );   // don't leak the link on failure
    return 0;
  }

  // Note the space before WHERE: the pieces are concatenated directly
  $query = "UPDATE HPCAnalysisResult SET " .
           "lastMessage='" . mysql_real_escape_string( $message, $us3_link ) . "' " .
           "WHERE gfacID = '$gfacID' ";

  if ( ! mysql_query( $query, $us3_link ) )
    write_log( "$self: Query failed $query - " . mysql_error( $us3_link ) );

  mysql_close( $us3_link );
}
744
// Sends a notification e-mail to the address in the global $admin_email.
//   $type - category supplied by callers ("fail", "hang", "debug");
//           not used when building the mail
//   $msg  - text reported on the "Error Message" line
// The body also reports the globals $updateTime, $status and $cluster.
function mail_to_admin( $type, $msg )
{
  global $updateTime;
  global $status;
  global $cluster;
  global $org_name;
  global $admin_email;
  global $dbhost;
  global $requestID;

  $now = time();

  // A Message-ID must be a single <left@right> token, so the request ID
  // (when there is one) goes inside the angle brackets
  $id_left = $now . "gridctl";
  if ( (string) $requestID !== "" )
    $id_left .= "." . $requestID;

  $header_lines = array(
    "From: $org_name Admin<$admin_email>",
    "Cc: $org_name Admin<$admin_email>",
    "Bcc: Dan Zollars<dzollars@gmail.com>",          // make sure

    // Set the reply address
    "Reply-To: $org_name<$admin_email>",
    "Return-Path: $org_name<$admin_email>",

    // Try to avoid spam filters
    "Message-ID: <$id_left@$dbhost>",
    "X-Mailer: PHP v" . phpversion(),
    "MIME-Version: 1.0",
    "Content-Transfer-Encoding: 8bit"
  );
  $headers = implode( "\n", $header_lines ) . "\n";

  $subject = "US3 Error Notification";
  $message = "\n" .
             "UltraScan job error notification from gridctl.php:\n" .
             "\n" .
             "Update Time   : $updateTime\n" .
             "GFAC Status   : $status\n" .
             "Cluster       : $cluster\n" .
             "Error Message : $msg\n";

  mail( $admin_email, $subject, $message, $headers );
}
783?>
Note: See TracBrowser for help on using the repository browser.