source: trunk/gridctl.php@ 1

Last change on this file since 1 was 1, checked in by zollarsd, 12 years ago

Initial import

File size: 19.2 KB
RevLine 
[1]1<?php
2
3include_once "/home/us3/bin/listen-config.php";
4include "/home/us3/bin/cleanup.php";
5
// Global variables
// These are read and written by the functions below through `global`.
$gfac_message = "";
$updateTime   = 0;
$submittime   = 0;
$cluster      = '';

// Produce some output temporarily, so cron will send me message
$now = time();
//echo "Time started: " . date( 'Y-m-d H:i:s', $now ) . "\n";

// Get data from global GFAC DB
// NOTE(review): $self, $dbhost, $guser, $gpasswd and $gDB are not defined in
// this file - presumably they come from listen-config.php; confirm.
$gLink = mysql_connect( $dbhost, $guser, $gpasswd );

// Report an unreachable server as such instead of letting it surface below
// as a failed DB selection on an invalid link.
if ( ! $gLink )
{
  write_log( "$self: Could not connect to DB server $dbhost" );
  mail_to_admin( "fail", "Internal Error: Could not connect to DB server $dbhost" );
  exit();
}

if ( ! mysql_select_db( $gDB, $gLink ) )
{
  write_log( "$self: Could not connect to DB $gDB" );
  mail_to_admin( "fail", "Internal Error: Could not select DB $gDB" );
  exit();
}

$query  = "SELECT gfacID, us3_db, cluster, status, queue_msg, " .
          "UNIX_TIMESTAMP(time), time from analysis";
$result = mysql_query( $query, $gLink );

if ( ! $result )
{
  write_log( "$self: Query failed $query - " . mysql_error( $gLink ) );
  mail_to_admin( "fail", "Query failed $query\n" . mysql_error( $gLink ) );
  exit();
}

if ( mysql_num_rows( $result ) == 0 )
  exit();  // Nothing to do

// One pass per analysis row.  The row is unpacked into globals because the
// handler functions pick up $gfacID, $us3_db, $cluster and $status through
// `global`.  $time is UNIX_TIMESTAMP(time); $updateTime is the raw time column.
while ( list( $gfacID, $us3_db, $cluster, $status, $queue_msg, $time, $updateTime )
            = mysql_fetch_array( $result ) )
{
  // Checking we need to do for each entry

  // Sometimes during testing, the us3_db entry is not set
  // If $status == 'ERROR' then the condition has been processed before
  if ( strlen( $us3_db ) == 0 && $status != 'ERROR' )
  {
    write_log( "$self: GFAC DB is NULL - $gfacID" );
    mail_to_admin( "fail", "GFAC DB is NULL\n$gfacID" );

    $query2  = "UPDATE analysis SET status='ERROR' WHERE gfacID='$gfacID'";
    $result2 = mysql_query( $query2, $gLink );
    $status  = 'ERROR';

    if ( ! $result2 )
      write_log( "$self: Query failed $query2 - " . mysql_error( $gLink ) );
  }

  // Dispatch on the row's status; unrecognized statuses are ignored
  switch ( $status )
  {
    // Already been handled
    // Later update this condition to search for gfacID?
    case "ERROR":
      cleanup();
      break;

    case "SUBMITTED":
      submitted( $time );
      break;

    case "SUBMIT_TIMEOUT":
      submit_timeout( $time );
      break;

    case "RUNNING":
      running( $time );
      break;

    case "RUN_TIMEOUT":
      run_timeout( $time );
      break;

    case "DATA":
      wait_data( $time );
      break;

    case "DATA_TIMEOUT":
      data_timeout( $time );
      break;

    case "COMPLETE":
      complete();
      break;

    case "CANCELLED":
    case "CANCELED":
    case "FAILED":
      failed();
      break;

    default:
      break;
  }
}

exit();
109
// Handler for an analysis row in status SUBMITTED.
//
// $updatetime - UNIX timestamp the age checks are measured from (the main
//               loop passes UNIX_TIMESTAMP(time) of the analysis row).
//
// Under 10 minutes old: nothing happens.  Under 24 hours old: the job status
// is looked up (get_gfac_status(), falling back to get_local_status() when
// that returns false) and handed to update_job_status() unless it is still a
// waiting state.  Otherwise the row is set to SUBMIT_TIMEOUT, the admin is
// mailed and the message is recorded via update_queue_messages()/update_db().
function submitted( $updatetime )
{
  global $self;
  global $gLink;
  global $gfacID;

  $current = time();

  // < 10 minutes ago
  if ( $current < $updatetime + 600 )
    return;

  // Past the first 24 hours: flag the timeout and stop
  if ( $current >= $updatetime + 86400 )
  {
    $note = "Job listed submitted longer than 24 hours";
    write_log( "$self: $note - id: $gfacID" );
    mail_to_admin( "hang", "$note - id: $gfacID" );

    $sql = "UPDATE analysis SET status='SUBMIT_TIMEOUT' WHERE gfacID='$gfacID'";
    if ( ! mysql_query( $sql, $gLink ) )
      write_log( "$self: Query failed $sql - " . mysql_error( $gLink ) );

    update_queue_messages( $note );
    update_db( $note );
    return;
  }

  // Within the first 24 hours: ask what the job is doing now
  $reported = get_gfac_status( $gfacID );
  if ( $reported === false )
    $reported = get_local_status( $gfacID );

  if ( $reported == 'GFAC_STATUS_UNAVAILABLE' )
    return;

  $still_waiting = array( 'SUBMITTED', 'INITIALIZED', 'PENDING' );
  if ( in_array( $reported, $still_waiting ) )
    return;

  update_job_status( $reported, $gfacID );
}
146
// Handler for an analysis row in status SUBMIT_TIMEOUT.
//
// $updatetime - UNIX timestamp the age check is measured from.
//
// The job status is looked up first; anything other than a waiting state is
// handed to update_job_status().  A job that is still waiting and whose
// timestamp is at least 24 hours old is set to FAILED, with the admin mailed
// and the message recorded.
function submit_timeout( $updatetime )
{
  global $self;
  global $gLink;
  global $gfacID;

  $reported = get_gfac_status( $gfacID );
  if ( $reported === false )
    $reported = get_local_status( $gfacID );

  if ( $reported == 'GFAC_STATUS_UNAVAILABLE' )
    return;

  $still_waiting = array( 'SUBMITTED', 'INITIALIZED', 'PENDING' );
  if ( ! in_array( $reported, $still_waiting ) )
  {
    update_job_status( $reported, $gfacID );
    return;
  }

  // < 24 hours ago ( 48 total submitted )
  $current = time();
  if ( $current < $updatetime + 86400 )
    return;

  $note = "Job listed submitted longer than 48 hours";
  write_log( "$self: $note - id: $gfacID" );
  mail_to_admin( "hang", "$note - id: $gfacID" );

  $sql = "UPDATE analysis SET status='FAILED' WHERE gfacID='$gfacID'";
  if ( ! mysql_query( $sql, $gLink ) )
    write_log( "$self: Query failed $sql - " . mysql_error( $gLink ) );

  update_queue_messages( $note );
  update_db( $note );
}
181
// Handler for an analysis row in status RUNNING.
//
// $updatetime - UNIX timestamp the age checks are measured from.
//
// get_us3_data() is always called first; its return value is not used here
// (it also stores the result row's update time in the global $updateTime).
// Under 10 minutes old: nothing more happens.  Under 24 hours old: the job
// status is looked up and handed to update_job_status() unless it is ACTIVE.
// Otherwise the row is set to RUN_TIMEOUT and the admin is mailed.
function running( $updatetime )
{
  global $self;
  global $gLink;
  global $gfacID;

  $current = time();

  get_us3_data();

  // message received < 10 minutes ago
  if ( $current < $updatetime + 600 )
    return;

  // Past the first 24 hours: flag the timeout and stop
  if ( $current >= $updatetime + 86400 )
  {
    $note = "Job listed running longer than 24 hours";
    write_log( "$self: $note - id: $gfacID" );
    mail_to_admin( "hang", "$note - id: $gfacID" );

    $sql = "UPDATE analysis SET status='RUN_TIMEOUT' WHERE gfacID='$gfacID'";
    if ( ! mysql_query( $sql, $gLink ) )
      write_log( "$self: Query failed $sql - " . mysql_error( $gLink ) );

    update_queue_messages( $note );
    update_db( $note );
    return;
  }

  // Within the first 24 hours
  $reported = get_gfac_status( $gfacID );
  if ( $reported === false )
    $reported = get_local_status( $gfacID );

  if ( $reported == 'GFAC_STATUS_UNAVAILABLE' )
    return;

  if ( $reported == 'ACTIVE' )
    return;

  update_job_status( $reported, $gfacID );
}
220
// Handler for an analysis row in status RUN_TIMEOUT.
//
// $updatetime - UNIX timestamp the age check is measured from.
//
// The job status is looked up first; anything other than ACTIVE is handed to
// update_job_status().  For a job that is still ACTIVE, get_us3_data() is
// called (return value unused) and, once the timestamp is at least 172800
// seconds old, the row is set to FAILED and the admin is mailed.
function run_timeout( $updatetime )
{
  global $self;
  global $gLink;
  global $gfacID;

  $reported = get_gfac_status( $gfacID );
  if ( $reported === false )
    $reported = get_local_status( $gfacID );

  if ( $reported == 'GFAC_STATUS_UNAVAILABLE' )
    return;

  if ( $reported != 'ACTIVE' )
  {
    update_job_status( $reported, $gfacID );
    return;
  }

  // Take the clock reading before the DB round trip, as the cutoff uses it
  $current = time();

  get_us3_data();

  // < 48 hours ago
  if ( $current < $updatetime + 172800 )
    return;

  $note = "Job listed running longer than 48 hours";
  write_log( "$self: $note - id: $gfacID" );
  mail_to_admin( "hang", "$note - id: $gfacID" );

  $sql = "UPDATE analysis SET status='FAILED' WHERE gfacID='$gfacID'";
  if ( ! mysql_query( $sql, $gLink ) )
    write_log( "$self: Query failed $sql - " . mysql_error( $gLink ) );

  update_queue_messages( $note );
  update_db( $note );
}
257
// Handler for an analysis row in status DATA.
//
// $updatetime - UNIX timestamp the age check is measured from.
//
// Within the first hour: the job status is looked up; anything other than
// DATA is handed to update_job_status().  While it stays DATA, a resend of
// the outputs is requested via get_gfac_outputs(), but only when the current
// minute is a multiple of 5.  After the first hour the row is set to
// DATA_TIMEOUT and the admin is mailed.
function wait_data( $updatetime )
{
  global $self;
  global $gLink;
  global $gfacID;

  $current = time();

  // Past the first hour: flag the timeout and stop
  if ( $current >= $updatetime + 3600 )
  {
    $note = "Waiting for data longer than 1 hour";
    write_log( "$self: $note - id: $gfacID" );
    mail_to_admin( "hang", "$note - id: $gfacID" );

    $sql = "UPDATE analysis SET status='DATA_TIMEOUT' WHERE gfacID='$gfacID'";
    if ( ! mysql_query( $sql, $gLink ) )
      write_log( "$self: Query failed $sql - " . mysql_error( $gLink ) );

    update_queue_messages( $note );
    update_db( $note );
    return;
  }

  // Within the first hour
  $reported = get_gfac_status( $gfacID );
  if ( $reported === false )
    $reported = get_local_status( $gfacID );

  if ( $reported == 'GFAC_STATUS_UNAVAILABLE' )
    return;

  if ( $reported != 'DATA' )
  {
    update_job_status( $reported, $gfacID );
    return;
  }

  // Request to resend data, but only request every 5 minutes
  $minute_of_hour = (int) date( 'i' );
  if ( $minute_of_hour % 5 != 0 )
    return;

  $outputs = get_gfac_outputs( $gfacID );
  if ( $outputs !== false )
    mail_to_admin( "debug", "wait_data/$gfacID/$outputs" );
}
304
// Handler for an analysis row in status DATA_TIMEOUT.
//
// $updatetime - UNIX timestamp the age check is measured from.
//
// The job status is looked up first; anything other than DATA is handed to
// update_job_status().  While it stays DATA and the timestamp is under 24
// hours old, a resend of the outputs is requested via get_gfac_outputs(),
// but only when the current minute is a multiple of 15.  After 24 hours the
// row is set to FAILED and the admin is mailed.
function data_timeout( $updatetime )
{
  global $self;
  global $gLink;
  global $gfacID;

  $reported = get_gfac_status( $gfacID );
  if ( $reported === false )
    $reported = get_local_status( $gfacID );

  if ( $reported == 'GFAC_STATUS_UNAVAILABLE' )
    return;

  if ( $reported != 'DATA' )
  {
    update_job_status( $reported, $gfacID );
    return;
  }

  $current = time();

  // 24 hours or more: give up on the data
  if ( $current >= $updatetime + 86400 )
  {
    $note = "Waiting for data longer than 24 hours";
    write_log( "$self: $note - id: $gfacID" );
    mail_to_admin( "hang", "$note - id: $gfacID" );

    $sql = "UPDATE analysis SET status='FAILED' WHERE gfacID='$gfacID'";
    if ( ! mysql_query( $sql, $gLink ) )
      write_log( "$self: Query failed $sql - " . mysql_error( $gLink ) );

    update_queue_messages( $note );
    update_db( $note );
    return;
  }

  // < 24 hours ago
  // Request to resend data, but only request every 15 minutes
  $minute_of_hour = (int) date( 'i' );
  if ( $minute_of_hour % 15 != 0 )
    return;

  $outputs = get_gfac_outputs( $gfacID );
  if ( $outputs !== false )
    mail_to_admin( "debug", "data_timeout/$gfacID/$outputs" );
}
351
// Handler for an analysis row in status COMPLETE.
// Nothing status-specific is done; the row is passed straight to cleanup().
function complete()
{
  cleanup();
}
357
// Handler for analysis rows in status CANCELLED, CANCELED or FAILED.
// Nothing status-specific is done; the row is passed straight to cleanup().
function failed()
{
  cleanup();
}
363
// Clean up after the job identified by the global $gfacID.
//
// Confirms the gfacID still has a row in the analysis table, resolves the
// matching request ID in the us3 database through get_us3_data(), and hands
// both to gfac_cleanup() (not defined in this file - presumably provided by
// the included cleanup.php).
//
// Returns nothing.  It returns early when the us3 database name is empty,
// the count query fails, the gfacID has no analysis row, or no request ID is
// found.
function cleanup()
{
  global $self;
  global $gLink;
  global $gfacID;
  global $us3_db;

  // A row with no us3 database name cannot be resolved: get_us3_data() would
  // fail on the DB selection and log/mail that failure on every run for
  // a row that has already been reported and marked ERROR.  The outcome is
  // the same (nothing is cleaned up), minus the repeated notification.
  if ( strlen( $us3_db ) == 0 ) return;

  // Double check that the gfacID exists
  $query  = "SELECT count(*) FROM analysis WHERE gfacID='$gfacID'";
  $result = mysql_query( $query, $gLink );

  if ( ! $result )
  {
    write_log( "$self: Query failed $query - " . mysql_error( $gLink ) );
    mail_to_admin( "fail", "Query failed $query\n" . mysql_error( $gLink ) );
    return;
  }

  list( $count ) = mysql_fetch_array( $result );

  if ( $count == 0 ) return;

  // Now check the us3 instance
  $requestID = get_us3_data();
  if ( $requestID == 0 ) return;

  gfac_cleanup( $us3_db, $requestID, $gLink );
}
392
// Function to update status of job
//
// Maps a status string reported for the job onto the status stored in the
// analysis table, then records an explanatory message.
//
// $job_status - status string as returned by get_gfac_status() or
//               get_local_status()
// $gfacID     - job identifier used in the UPDATE statement
//
// 'UNKNOWN' and unrecognized statuses leave the analysis row untouched and
// only record the message.  The message is recorded through
// update_queue_messages() and update_db(), which read the *global* $gfacID
// rather than this function's parameter.
function update_job_status( $job_status, $gfacID )
{
  global $self;   // needed by the write_log() call below
  global $gLink;

  // Stays empty for statuses that must not change the analysis row
  $query = "";

  switch ( $job_status )
  {
    case 'SUBMITTED'   :
    case 'SUBMITED'    :
    case 'INITIALIZED' :
    case 'PENDING'     :
      $query   = "UPDATE analysis SET status='SUBMITTED' WHERE gfacID='$gfacID'";
      $message = "Job status request reports job is SUBMITTED";
      break;

    case 'ACTIVE'      :
      $query   = "UPDATE analysis SET status='RUNNING' WHERE gfacID='$gfacID'";
      $message = "Job status request reports job is RUNNING";
      break;

    case 'COMPLETED'   :
      $query   = "UPDATE analysis SET status='COMPLETE' WHERE gfacID='$gfacID'";
      $message = "Job status request reports job is COMPLETE";
      break;

    case 'DATA'        :
      $query   = "UPDATE analysis SET status='DATA' WHERE gfacID='$gfacID'";
      $message = "Job status request reports job is COMPLETE, waiting for data";
      break;

    case 'CANCELED'    :
    case 'CANCELLED'   :
      $query   = "UPDATE analysis SET status='CANCELED' WHERE gfacID='$gfacID'";
      $message = "Job status request reports job is CANCELED";
      break;

    case 'FAILED'      :
      $query   = "UPDATE analysis SET status='FAILED' WHERE gfacID='$gfacID'";
      $message = "Job status request reports job is FAILED";
      break;

    case 'UNKNOWN'     :
      // $query = "UPDATE analysis SET status='ERROR' WHERE gfacID='$gfacID'";
      $message = "Job status request reports job is not in the queue";
      break;

    default            :
      $message = "Job status was not recognized - $job_status";
      break;
  }

  // Only touch the DB when a status change was actually selected; running
  // an empty statement would just log a spurious "Query failed".
  if ( $query != "" )
  {
    $result = mysql_query( $query, $gLink );
    if ( ! $result )
      write_log( "$self: Query failed $query - " . mysql_error( $gLink ) );
  }

  update_queue_messages( $message );
  update_db( $message );
}
453
// Look up the current job (global $gfacID) in the HPCAnalysisResult table of
// the us3 database named by the global $us3_db.
//
// Returns the HPCAnalysisRequestID of the matching row, or 0 when the
// connection, DB selection or query fails, or when no row matches.
//
// Side effect: on success the global $updateTime is overwritten with
// UNIX_TIMESTAMP(updateTime) of the row; it is left untouched otherwise.
function get_us3_data()
{
  global $self;
  global $gfacID;
  global $dbhost;
  global $user;
  global $passwd;
  global $us3_db;
  global $updateTime;

  $us3_link = mysql_connect( $dbhost, $user, $passwd );

  if ( ! $us3_link )
  {
    // Credentials are deliberately kept out of the log
    write_log( "$self: could not connect: $dbhost, $user" );
    mail_to_admin( "fail", "Could not connect to $dbhost" );
    return 0;
  }

  $result = mysql_select_db( $us3_db, $us3_link );

  if ( ! $result )
  {
    write_log( "$self: could not select DB $us3_db" );
    // Credentials are deliberately kept out of the e-mail
    mail_to_admin( "fail", "Could not select DB $us3_db, $dbhost, $user" );
    mysql_close( $us3_link );
    return 0;
  }

  $query  = "SELECT HPCAnalysisRequestID, UNIX_TIMESTAMP(updateTime) " .
            "FROM HPCAnalysisResult WHERE gfacID='$gfacID'";
  $result = mysql_query( $query, $us3_link );

  if ( ! $result )
  {
    // Read the error text before the link is closed
    $error = mysql_error( $us3_link );
    write_log( "$self: Query failed $query - " . $error );
    mail_to_admin( "fail", "Query failed $query\n" . $error );
    mysql_close( $us3_link );
    return 0;
  }

  $row = mysql_fetch_array( $result );
  mysql_close( $us3_link );

  // No HPCAnalysisResult row for this gfacID
  if ( ! $row ) return 0;

  list( $requestID, $updateTime ) = $row;

  return $requestID;
}
499
// Function to determine if this is a gfac job or a local job
//
// Returns true when $gfacID either starts with "US3-Experiment" (matched
// case-insensitively) or is exactly "US3-" followed by hex digits grouped
// 8-4-4-4-12; returns false for anything else.
function is_gfac_job( $gfacID )
{
  if ( preg_match( "/^US3-Experiment/i", $gfacID ) )
    return true;

  // Same pattern as before, assembled piecewise so the {n} quantifiers
  // cannot be mistaken for variable interpolation
  $h    = "[0-9a-fA-F]";
  $uuid = '/^US3-' . $h . '{8}-' . $h . '{4}-' . $h . '{4}-' .
          $h . '{4}-' . $h . '{12}$/';

  // Anything else is not a GFAC job
  return preg_match( $uuid, $gfacID ) ? true : false;
}
513
// Function to get the current job status from GFAC
//
// $gfacID - job identifier
//
// Returns:
//   false                     - $gfacID is not a GFAC job (see is_gfac_job())
//   'GFAC_STATUS_UNAVAILABLE' - the HTTP request threw an HttpException
//   otherwise                 - the status string parse_response() extracted
//                               from the response body
//
// Requests "$serviceURL/jobstatus/$gfacID" with HttpRequest.
function get_gfac_status( $gfacID )
{
  global $self;        // needed by the write_log() call below
  global $serviceURL;

  if ( ! is_gfac_job( $gfacID ) )
    return false;

  $url = "$serviceURL/jobstatus/$gfacID";
  try
  {
    $post = new HttpRequest( $url, HttpRequest::METH_GET );
    $post->send();
    $xml  = $post->getResponseBody();
  }
  catch ( HttpException $e )
  {
    // Despite the wording, nothing is marked failed here; the sentinel
    // returned below is what callers check for.
    write_log( "$self: Status not available - marking failed - $gfacID" );
    return 'GFAC_STATUS_UNAVAILABLE';
  }

  // Parse the result
  $gfac_status = parse_response( $xml );

  return $gfac_status;
}
540
// Function to request data outputs from GFAC
//
// $gfacID - job identifier
//
// Returns false when the job is not a GFAC job, when its current status is
// not one of DONE / FAILED / COMPLETE, or when the HTTP request throws an
// HttpException; otherwise returns the status string parse_response()
// extracted from the response body.
//
// Requests "$serviceURL/registeroutput/$gfacID" with HttpRequest.
function get_gfac_outputs( $gfacID )
{
  global $self;        // needed by the write_log() call below
  global $serviceURL;

  // Make sure it's a GFAC job and status is appropriate for this call
  if ( ( $job_status = get_gfac_status( $gfacID ) ) === false )
  {
    // Then it's not a GFAC job
    return false;
  }

  if ( ! in_array( $job_status, array( 'DONE', 'FAILED', 'COMPLETE' ) ) )
  {
    // Then it's not appropriate to request data
    return false;
  }

  $url = "$serviceURL/registeroutput/$gfacID";
  try
  {
    $post = new HttpRequest( $url, HttpRequest::METH_GET );
    $post->send();
    $xml  = $post->getResponseBody();
  }
  catch ( HttpException $e )
  {
    write_log( "$self: Data not available - request failed - $gfacID" );
    return false;
  }

  mail_to_admin( "debug", "get_gfac_outputs/\n$xml/" ); // Temporary, to see what the xml looks like,
                                                        // if we ever get one

  // Parse the result
  $gfac_status = parse_response( $xml );

  return $gfac_status;
}
580
// Extract the job status from an XML response body.
//
// $xml - response body as a string
//
// Returns the text content of the <status> element, or "" when there is
// none (including when $xml is empty or cannot be opened by XMLReader).
//
// Side effect: the global $gfac_message is reset to "" and then set to the
// text content of the last non-status element that has text.
function parse_response( $xml )
{
  global $gfac_message;

  $status       = "";
  $gfac_message = "";

  // XMLReader cannot open an empty document (warning on older PHP, exception
  // on PHP 8); an empty body simply carries no status.
  if ( ! is_string( $xml ) || trim( $xml ) == "" )
    return $status;

  $parser = new XMLReader();
  if ( ! $parser->xml( $xml ) )
    return $status;

  // Name of the most recently opened element; text nodes are attributed to it
  $name = "";

  while ( $parser->read() )
  {
    $type = $parser->nodeType;

    if ( $type == XMLReader::ELEMENT )
      $name = $parser->name;

    else if ( $type == XMLReader::TEXT )
    {
      if ( $name == "status" )
        $status = $parser->value;
      else
        $gfac_message = $parser->value;
    }
  }

  $parser->close();
  return $status;
}
610
// Function to get status from local cluster
//
// $gfacID - job identifier passed to `qstat -a` on the cluster
//
// Runs qstat over ssh on "<cluster>.uthscsa.edu" (global $cluster with any
// "-local" removed) and maps the 10th whitespace-separated field of the last
// output line onto a status string.
//
// Returns 'ACTIVE', 'COMPLETED', 'SUBMITTED' or 'UNKNOWN'.  'UNKNOWN' covers
// empty output, a "qstat: Unknown" reply, a line with fewer than 10 fields
// and any state letter not listed below.
function get_local_status( $gfacID )
{
  global $self;     // needed by the write_log() call below
  global $cluster;

  $system = "$cluster.uthscsa.edu";
  $system = preg_replace( "/\-local/", "", $system );

  // The job id and host come from the database.  Quote the id once for the
  // remote shell that ssh hands the command to, and quote every argument for
  // the local shell that exec() uses.
  $remote = "qstat -a " . escapeshellarg( $gfacID );
  $cmd    = "/usr/bin/ssh -x " . escapeshellarg( "us3@$system" ) . " " .
            escapeshellarg( $remote ) . " 2>&1";

  // exec() returns the last line of the command's output
  $result = exec( $cmd );

  if ( $result == "" || preg_match( "/^qstat: Unknown/", $result ) )
  {
    write_log( "$self get_local_status: Local job $gfacID unknown" );
    return 'UNKNOWN';
  }

  $values = preg_split( "/\s+/", $result );
//  write_log( "$self: get_local_status: job status = /{$values[9]}/");

  // A short line (e.g. an ssh error message) has no state column
  $state = isset( $values[ 9 ] ) ? $values[ 9 ] : '';

  switch ( $state )
  {
    case "W" :                      // Waiting for execution time to be reached
    case "E" :                      // Job is exiting after having run
    case "R" :                      // Still running
      $status = 'ACTIVE';
      break;

    case "C" :                      // Job has completed
      $status = 'COMPLETED';
      break;

    case "T" :                      // Job is being moved
    case "H" :                      // Held
    case "Q" :                      // Queued
      $status = 'SUBMITTED';
      break;

    default :
      $status = 'UNKNOWN';          // This should not occur
      break;
  }

  return $status;
}
655
// Record a message for the current job (global $gfacID) in the
// queue_messages table of the GFAC database.
//
// $message - text to store; it is escaped before being put into the query
//
// Returns nothing.  Failures are written to the log; a gfacID that has no
// row in the analysis table is logged and skipped.
function update_queue_messages( $message )
{
  global $self;
  global $gLink;
  global $gfacID;

  // Get analysis table ID
  $query  = "SELECT id FROM analysis " .
            "WHERE gfacID = '" . mysql_real_escape_string( $gfacID, $gLink ) . "' ";
  $result = mysql_query( $query, $gLink );
  if ( ! $result )
  {
    write_log( "$self: Query failed $query - " . mysql_error( $gLink ) );
    return;
  }

  // Without a matching row there is no ID to attach the message to, and the
  // INSERT below would be malformed ("analysisID = ").
  $row = mysql_fetch_array( $result );
  if ( ! $row )
  {
    write_log( "$self: No analysis entry for $gfacID - queue message not stored" );
    return;
  }
  list( $analysisID ) = $row;

  // Insert message into queue_message table
  $query  = "INSERT INTO queue_messages SET " .
            "message = '" . mysql_real_escape_string( $message, $gLink ) . "'," .
            "analysisID = $analysisID ";
  $result = mysql_query( $query, $gLink );
  if ( ! $result )
  {
    write_log( "$self: Query failed $query - " . mysql_error( $gLink ) );
    return;
  }
}
684
// Store $message as lastMessage on the HPCAnalysisResult row of the current
// job (global $gfacID) in the us3 database named by the global $us3_db.
//
// $message - text to store; it is escaped before being put into the query
//
// Returns 0 when the connection or the DB selection fails, nothing
// otherwise.  A failed UPDATE is written to the log.
function update_db( $message )
{
  global $self;
  global $gfacID;
  global $dbhost;
  global $user;
  global $passwd;
  global $us3_db;

  $us3_link = mysql_connect( $dbhost, $user, $passwd );

  if ( ! $us3_link )
  {
    // Credentials are deliberately kept out of the log
    write_log( "$self: could not connect: $dbhost, $user" );
    mail_to_admin( "fail", "Could not connect to $dbhost" );
    return 0;
  }

  $result = mysql_select_db( $us3_db, $us3_link );

  if ( ! $result )
  {
    write_log( "$self: could not select DB $us3_db" );
    // Credentials are deliberately kept out of the e-mail
    mail_to_admin( "fail", "Could not select DB $us3_db, $dbhost, $user" );
    mysql_close( $us3_link );
    return 0;
  }

  $query = "UPDATE HPCAnalysisResult SET " .
           "lastMessage='" . mysql_real_escape_string( $message, $us3_link ) . "' " .
           "WHERE gfacID = '" . mysql_real_escape_string( $gfacID, $us3_link ) . "' ";

  if ( ! mysql_query( $query, $us3_link ) )
    write_log( "$self: Query failed $query - " . mysql_error( $us3_link ) );

  mysql_close( $us3_link );
}
720
// Send a notification e-mail to the administrator address.
//
// $type - category tag supplied by callers ("fail", "hang", "debug"); it is
//         not used when building the mail
// $msg  - text appended to the body as the error message
//
// The body also reports the globals $updateTime, $status and $cluster as
// they stand at the time of the call.  Sender and recipient are taken from
// the globals $org_name and $admin_email.
function mail_to_admin( $type, $msg )
{
  global $updateTime;
  global $status;
  global $cluster;
  global $org_name;
  global $admin_email;
  global $dbhost;
  global $requestID;

  $headers  = "From: $org_name Admin<$admin_email>"     . "\n";
  $headers .= "Cc:   $org_name Admin<$admin_email>"     . "\n";

  // Set the reply address
  $headers .= "Reply-To: $org_name<$admin_email>"       . "\n";
  $headers .= "Return-Path: $org_name<$admin_email>"    . "\n";

  // Try to avoid spam filters
  // The whole identifier has to sit inside the angle brackets, and it has to
  // differ between mails sent within the same second, so a per-call unique
  // token is added and the request ID (when set) is moved inside.
  $now      = time();
  $id_parts = array( $now, uniqid() );
  if ( $requestID !== null && $requestID !== '' )
    $id_parts[] = $requestID;

  $headers .= "Message-ID: <" . implode( '.', $id_parts ) . ".gridctl@$dbhost>\n";
  $headers .= "X-Mailer: PHP v" . phpversion()          . "\n";
  $headers .= "MIME-Version: 1.0"                       . "\n";
  $headers .= "Content-Transfer-Encoding: 8bit"         . "\n";

  $subject = "US3 Error Notification";
  $message = "
 UltraScan job error notification from gridctl.php:

 Update Time : $updateTime
 GFAC Status : $status
 Cluster : $cluster
 ";

  $message .= "Error Message : $msg\n";

  mail( $admin_email, $subject, $message, $headers );
}
758?>
Note: See TracBrowser for help on using the repository browser.