Changeset 31


Ignore:
Timestamp:
Oct 13, 2017, 9:37:16 PM (7 years ago)
Author:
gegorbet
Message:

global-fit and jetstream, stampede2 mods

Location:
trunk
Files:
3 edited

Legend:

Unmodified
Added
Removed
  • trunk/cleanup.php

    r28 r31  
    225225   if ( file_exists( $fn_stdout  ) ) $stdout   = file_get_contents( $fn_stdout  );
    226226   if ( file_exists( $fn_tarfile ) ) $tarfile  = file_get_contents( $fn_tarfile );
     227write_log( "$me(0):  length contents stderr,stdout,tarfile -- "
     228 . strlen($stderr) . "," . strlen($stdout) . "," . strlen($tarfile) );
    227229   // If stdout,stderr have no content, retry after delay
    228230   if ( strlen( $stdout ) == 0  ||  strlen( $stderr ) == 0 )
     
    870872   global $status;
    871873   $is_us3iab  = preg_match( "/us3iab/", $cluster );
     874   $is_jetstr  = preg_match( "/jetstream/", $cluster );
    872875
    873876   // Figure out remote directory
     
    881884   {
    882885      // For "-local", recompute remote work directory
    883       $cmd = "ssh us3@$cluster.uthscsa.edu 'ls -d ~us3/lims/work/local' 2/dev/null";
     886      $clushost = "$cluster.uthscsa.edu";
     887      $lworkdir = "~us3/lims/work/local";
     888      if ( $is_jetstr )
     889      {
     890         $clushost = "js-157-184.jetstream-cloud.org";
     891         $lworkdir = "/N/us3_cluster/work/local";
     892      }
     893      $cmd         = "ssh us3@$clushost 'ls -d $lworkdir' 2/dev/null";
    884894      exec( $cmd, $output, $stat );
    885895      $work_remote = $output[ 0 ];
     
    891901      $pwd = chdir( "$work/$gfacID" );
    892902
    893       $cmd = "scp us3@$cluster.uthscsa.edu:$remoteDir/output/analysis-results.tar . 2>&1";
     903      $cmd = "scp us3@$clushost:$remoteDir/output/analysis-results.tar . 2>&1";
    894904
    895905      exec( $cmd, $output, $stat );
     
    897907         write_log( "$me: Bad exec:\n$cmd\n" . implode( "\n", $output ) );
    898908
    899       $cmd = "scp us3@$cluster.uthscsa.edu:$remoteDir/stdout . 2>&1";
     909      $cmd = "scp us3@$clushost:$remoteDir/stdout . 2>&1";
    900910
    901911      exec( $cmd, $output, $stat );
     
    910920      }
    911921
    912       $cmd = "scp us3@$cluster.uthscsa.edu:$remoteDir/stderr . 2>&1";
     922      $cmd = "scp us3@$clushost:$remoteDir/stderr . 2>&1";
    913923
    914924      exec( $cmd, $output, $stat );
  • trunk/cluster_status.php

    r28 r31  
    183183   {
    184184      $clusters = array( "alamo", "lonestar5", "stampede", "comet",
    185                          "gordon", "jureca", "jacinto" );
     185                         "stampede2-b", "jetstream", "jureca", "jacinto-b" );
    186186   }
    187187
     
    251251            break;
    252252         }
     253         case 'stampede2':
     254         {
     255            $host   = "us3@stampede2.tacc.utexas.edu";
     256            $qstat  = `ssh $host '/usr/local/bin/showq 2>&1|grep "Total Jobs"'`;
     257            $sparts = preg_split( '/\s+/', $qstat );
     258            $tot    = $sparts[ 2 ];
     259            $run    = $sparts[ 5 ];
     260            $que    = $sparts[ 8 ];
     261            $sta    = "up";
     262            if ( $tot == ''  ||  $tot == '0' )
     263               $sta    = "down";
     264            break;
     265         }
    253266         case 'lonestar5':
    254267         {
     
    267280            {
    268281               $run    = $sparts[ 5 ];
    269                $que    = $sparts[ 8 ];
     282//               $que    = $sparts[ 8 ];
     283               $que    = $sparts[ 11 ];
    270284            }
    271285            break;
     
    309323            break;
    310324         }
     325         case 'jetstream-local':
     326         case 'jetstream':
     327         {
     328            $host   = "us3@js-157-184.jetstream-cloud.org";
     329            $qstat  = `ssh $host '/usr/bin/sinfo -s -p batch -o "%a %F" |tail -1'`;
     330            $sparts = preg_split( '/\s+/', $qstat );
     331            $sta    = $sparts[ 0 ];
     332            $knts   = $sparts[ 1 ];
     333            $sparts = preg_split( '/\//', $knts );
     334            $run    = $sparts[ 0 ];
     335            $que    = $sparts[ 1 ];
     336            if ( $sta == "" )
     337               $sta    = "down";
     338            break;
     339         }
    311340      }
    312341
     
    328357      $data[] = $a;
    329358
    330       if ( $clname == 'alamo'  ||  $clname == 'jacinto' )
     359      if ( $clname == 'alamo'  ||
     360           $clname == 'jacinto'  ||
     361           $clname == 'jetstream' )
    331362      {
    332363         $a[ 'cluster' ] = $clname . "-local";
  • trunk/gridctl.php

    r29 r31  
    7070   // Checking we need to do for each entry
    7171echo "us3db=$us3_db  gfid=$gfacID\n";
     72//write_log( " us3db=$us3_db  gfid=$gfacID" );
    7273   switch ( $us3_db )
    7374   {
     
    99100      $status     = aira_status( $gfacID, $status_in );
    100101if($status != $status_in )
    101 write_log( "$loghdr Set to $status from $status_in" );
     102 write_log( "$loghdr Set to $status from $status_in" );
     103//write_log( "$loghdr    aira status=$status" );
    102104   }
    103105   else if ( is_gfac_job( $gfacID ) )
     
    112114   else
    113115   {
     116//write_log( "$loghdr Local gfacID=$gfacID" );
    114117      $status_gw  = $status;
    115118      $status     = get_local_status( $gfacID );
     
    136139
    137140//echo "  st=$status\n";
     141//write_log( "$loghdr switch status=$status" );
    138142   switch ( $status )
    139143   {
     
    174178      case "COMPLETED":
    175179      case "COMPLETE":
    176 write_log( "$loghdr   COMPLETE gfacID=$gfacID" );
     180//write_log( "$loghdr   COMPLETE gfacID=$gfacID" );
    177181         complete();
    178182         break;
     
    833837   global $self;
    834838
    835    $cmd    = "/usr/bin/qstat -a $gfacID 2>&1|tail -n 1";
     839   $is_jetstr = preg_match( "/jetstream/", $cluster );
     840   if ( $is_jetstr )
     841      $cmd    = "squeue -a $gfacID 2>&1|tail -n 1";
     842   else
     843      $cmd    = "/usr/bin/qstat -a $gfacID 2>&1|tail -n 1";
    836844//write_log( "$self cmd: $cmd" );
    837845//write_log( "$self cluster: $cluster" );
    838846//write_log( "$self gfacID: $gfacID" );
     847
    839848   if ( ! preg_match( "/us3iab/", $cluster ) )
    840849   {
    841850      $system = "$cluster.uthscsa.edu";
     851      if ( $is_jetstr )
     852         $system = "$cluster";
    842853      $system = preg_replace( "/\-local/", "", $system );
    843 //write_log( "$self system: $system" );
    844854      $cmd    = "/usr/bin/ssh -x us3@$system " . $cmd;
    845855//write_log( "$self  cmd: $cmd" );
     
    849859//write_log( "$self  result: $result" );
    850860
    851    if ( $result == ""  ||  preg_match( "/^qstat: Unknown/", $result ) )
     861///////////////////////////////////////////////////////////////////
     862   $secwait    = 2;
     863   $num_try    = 0;
     864   // Sleep and retry up to 3 times if ssh has "ssh_exchange_identification" error
     865   while ( preg_match( "/ssh_exchange_id/", $result )  &&  $num_try < 3 )
     866   {
     867      sleep( $secwait );
     868      $num_try++;
     869      $secwait   *= 2;
     870write_log( "$me:   num_try=$num_try  secwait=$secwait" );
     871   }
     872///////////////////////////////////////////////////////////////////
     873   if ( $result == ""  ||
     874        preg_match( "/^qstat: Unknown/", $result )  ||
     875        preg_match( "/ssh_exchange_id/", $result ) )
    852876   {
    853877      write_log( "$self get_local_status: Local job $gfacID unknown" );
     
    857881
    858882   $values = preg_split( "/\s+/", $result );
    859 //write_log( "$self: get_local_status: job status = /{$values[9]}/");
    860    switch ( $values[ 9 ] )
     883   $jstat   = ( $is_jetstr == 0 ) ? $values[ 9 ] : $values[ 4 ];
     884//write_log( "$self: get_local_status: job status = /$jstat/");
     885   switch ( $jstat )
    861886   {
    862887      case "W" :                      // Waiting for execution time to be reached
     
    873898      case "H" :                      // Held
    874899      case "Q" :                      // Queued
     900      case "PD" :                     // Queued
    875901        $status = 'SUBMITTED';
    876902        break;
     
    10791105   if ( preg_match( "/US3-A/i", $gfacID )  &&  $devmatch )
    10801106   {
     1107//write_log( "$loghdr status_in=$status_in status=$status gfacID=$gfacID" );
    10811108      $status_ex = getExperimentStatus( $gfacID );
     1109//write_log( "$loghdr   status_ex=$status_ex" );
    10821110
    10831111      if ( $status_ex == 'COMPLETED' )
Note: See TracChangeset for help on using the changeset viewer.