#!/bin/bash

# Helper script to flag finished LoadLeveler jobs.
# The script is called periodically by A-REX.
#
# The function get_bunch_jobs_status retrieves the status and id of a whole
# batch of jobs in one shot. It looks for jobs which have a known status but
# are not completed (status != C) and saves the ids of these jobs in the
# string variable $outLocalIdsString.
#
# Its input argument is a string holding the list of localids to check.
#
# Example of usage:
# get_bunch_jobs_status "$inLocalIdsString"

# Filled by get_bunch_jobs_status with the ids of the jobs that LoadLeveler
# reports with a status other than "C"; each id is preceded by a space.
outLocalIdsString=""

# ARC1 passes the config file first.
if [ "$1" = "--config" ]; then
  shift
  ARC_CONFIG=$1
  shift
fi

# The next argument is the control directory; give up if it does not exist.
control_dir="$1"
test -d "$control_dir" || exit 1

joboption_lrms="ll"
lrms_options="ll_bin_path ll_consumable_resources ll_parallel_single_jobs"

# define paths and config parser
# $0 and $basedir are quoted so that an installation path containing
# whitespace still resolves to the right directory.
basedir=$(dirname "$0")
basedir=$(cd "$basedir" > /dev/null && pwd) || exit $?
. "${basedir}/lrms_common.sh"

# include common scan functions
# NOTE(review): pkgdatadir is not set in this file - presumably it comes from
# lrms_common.sh; confirm.
. "${pkgdatadir}/scan_common.sh" || exit $?

# run common init
#  * parse config
#  * load LRMS-specific env
#  * set common variables
common_init

# Assume that gm-kick is installed in the same directory
GMKICK=${pkglibexecdir}/gm-kick

# Log system performance
if [ -n "$perflogdir" ]; then
   perflog_common "$perflogdir" "$CONFIG_controldir"
fi

# Start the timer for the control directory traversal
if [ -n "$perflogdir" ]; then
   start_ts=$(date +%s.%N)
fi

# Numeric uid of the invoking user; the job handling loop only insists on
# owning the job file when this is not 0.
my_id=$(id -u)

# Query LoadLeveler once for a whole batch of jobs and remember the ones it
# still reports with a status other than "C".
#
# Globals:
#   LL_BIN_PATH        (read)    directory holding the llq binary
#   outLocalIdsString  (written) the id of every non-completed job is
#                                appended, each one preceded by a space
# Arguments:
#   $1 - whitespace-separated list of local job ids to query
# Returns:
#   always 0; when llq fails, outLocalIdsString is left untouched
get_bunch_jobs_status() {
  local llq_output entry
  local -a entries=()
  # One or two uppercase status letters, a "!" and a non-empty id.
  local -r status_id_re='^[A-Z]{1,2}!.+$'

  # $1 is intentionally unquoted: every id has to become its own argument.
  llq_output=$("$LL_BIN_PATH/llq" -r %st %id $1) || return 0

  # Split the output on whitespace without exposing the words to pathname
  # expansion. read reports failure at end of input, which is expected.
  read -r -d '' -a entries <<< "$llq_output"

  for entry in "${entries[@]}"; do
    # Ignore anything that is not a recognisable status!id pair.
    [[ $entry =~ $status_id_re ]] || continue
    # Completed jobs must not end up in the list.
    [[ $entry == 'C!'* ]] && continue
    # Keep the text after the last "!".
    outLocalIdsString="$outLocalIdsString ${entry##*!}"
  done
  return 0
}

# mergedlist: array where each element has the form key:value, where key is
# the ARC jobid and value is the localid
mergedlist=()
# inLocalIdsString: the localids retrieved from the ARC .local files,
# separated by spaces
inLocalIdsString=""

# Turn every processing/job.<id>.status path into the path of the
# job.<id>.local file located directly in the control directory.
findoutput=$(find "$control_dir/processing" -maxdepth 1 -type f -name 'job.*.status' | sed 's/processing\/job\.\([^\.]*\)\.status$/job.\1.local/')

# IFS= and -r keep each path exactly as find printed it.
while IFS= read -r i
do

  # Continue if find produced nothing or the .local file does not exist
  test -f "$i" || continue

  jobid=$(basename "$i" .local | sed 's/^job.//')
  donefile="${control_dir}/job.${jobid}.lrms_done"
  statusfile="${control_dir}/processing/job.${jobid}.status"

  # Continue if the job is already flagged as done
  test -f "$donefile" && continue

  # Only jobs whose status file reads INLRMS or CANCELING are of interest
  if [ ! -f "$statusfile" ] ; then continue ; fi
  gmstatus=$(cat "$statusfile")
  if [ "$gmstatus" != "INLRMS" ] && [ "$gmstatus" != "CANCELING" ] ; then continue ; fi

  # Get the local LRMS id of the job from the first localid= line.
  # The line is parsed rather than eval'ed: file content is never executed,
  # and a value containing a blank no longer leaves the raw "localid=..."
  # line behind in $localid.
  localid=$(grep '^localid=' "$i" | head -1)
  localid=${localid#localid=}
  # Drop one pair of surrounding quotes, as the former eval did.
  case $localid in
    \"*\") localid=${localid#\"}; localid=${localid%\"} ;;
    \'*\') localid=${localid#\'}; localid=${localid%\'} ;;
  esac

  # Did we get a local id?
  test "$localid" = "" && continue

  # HACK: save the localid to be queried into inLocalIdsString and
  # remember which jobid it belongs to
  inLocalIdsString=$inLocalIdsString" "$localid
  mergedlist+=("$jobid:$localid")

done <<< "$findoutput"

# Record how long the control directory traversal took
if [ -n "$perflogdir" ]; then
   stop_ts=$(date +%s.%N)
   t=$(awk -v begin="$start_ts" -v end="$stop_ts" 'BEGIN { printf "%.3f", end-begin }')
   echo "[$(date '+%Y-%m-%d %T')] scan-ll-job, ControldirTraversal: $t" >> "$perflogfile"
fi


# Start the timer for the LoadLeveler query
if [ -n "$perflogdir" ]; then
   start_ts=$(date +%s.%N)
fi

# Query LoadLeveler for the collected jobs and save the ones that are not
# completed into $outLocalIdsString.
# Call the function only if the list holds at least one letter or digit.
# The previous pattern was written with literal "/" delimiters, which left
# its lowercase-letter alternative as the only one able to match; a list of
# ids without a lowercase letter was therefore never queried, and all those
# jobs were later handled as if they had left the queue.
if [[ $inLocalIdsString =~ [[:alnum:]] ]]; then
  get_bunch_jobs_status "$inLocalIdsString"
fi

# Record how long the query took
if [ -n "$perflogdir" ]; then
   stop_ts=$(date +%s.%N)
   t=$(awk -v begin="$start_ts" -v end="$stop_ts" 'BEGIN { printf "%.3f", end-begin }')
   echo "[$(date '+%Y-%m-%d %T')] scan-ll-job, llq -r %st %id: $t" >> "$perflogfile"
fi

# Start the timer for the job handling phase
if [ -n "$perflogdir" ]; then
   start_ts=$(date +%s.%N)
fi
numelem=0
# Walk the jobid:localid pairs stored in mergedlist. The expansion is quoted
# so that every pair is taken verbatim (no word splitting, no globbing).
# NOTE(review): the variable names used below are kept as they are because
# sourced helpers such as job_read_diag / job_write_diag presumably read
# some of them as globals - confirm before renaming any.
for element in "${mergedlist[@]}"
do

  # Divide the jobid from the localid at the first ':'
  jobid=${element%%:*}
  localid=${element#*:}

  # Exclude the not completed jobs stored in $outLocalIdsString.
  # NOTE(review): this is a substring match, so an id contained in another
  # listed id is excluded as well - presumably intended, because llq may
  # print a longer step id than the stored localid; confirm.
  if [[ $outLocalIdsString == *"$localid"* ]]
  then
      continue
  fi
  # Counted before the checks below, so jobs skipped later are included.
  numelem=$((numelem+1))
  donefile="${control_dir}/job.${jobid}.lrms_done"
  statusfile="${control_dir}/processing/job.${jobid}.status"
  jobfile="${control_dir}/job.${jobid}.local"
  errorsfile="${control_dir}/job.${jobid}.errors"

  # Continue if the job is already flagged as done
  test -f "$donefile" && continue

  if [ ! -f "$statusfile" ] ; then continue ; fi
  gmstatus=$(cat "$statusfile")

  exitcode=''

  # get session directory of this job
  sessiondir=$(grep -h '^sessiondir=' "$jobfile" | sed 's/^sessiondir=\(.*\)/\1/')
  diagfile="${sessiondir}.diag"
  commentfile="${sessiondir}.comment"

  # Unless running as uid 0, only handle job files owned by this user
  if [ "$my_id" != '0' ] ; then
    if [ ! -O "$jobfile" ] ; then continue ; fi
  fi
  uid=$(get_owner_uid "$jobfile")
  [ -z "$uid" ] && { log "Failed to stat $jobfile"; continue; }

  # Without a session directory there is no diag file to read
  [ -n "$sessiondir" ] || continue

  # have chance to obtain exit code
  exitcode=$(do_as_uid "$uid" "grep '^exitcode=' '$diagfile'" | sed 's/^exitcode=//')

  if [ -n "$exitcode" ] ; then
    # Both 152 and 24 are reported as exit code 24, "exceeded time limit".
    if [[ "$exitcode" == "152" || "$exitcode" == "24" ]] ; then
      exitcode="24"
      save_commentfile "$uid" "$commentfile" "$errorsfile"
      echo "$exitcode Job exceeded time limit." > "$donefile"
      # If job exceeded time, then it will have been killed and no cputime/walltime has been written
      # One llq call provides all four values extracted below.
      llq_long_output=$("$LL_BIN_PATH/llq" -l "$localid")
      walltime=$(sed -n 's/^ *Wall Clk Hard Limit:.*(\([0-9]*\) seconds.*/\1/p' <<< "$llq_long_output")
      usertime=$(sed -n 's/^ *Step Cpu Hard Limit:.*(\([0-9]*\) seconds.*/\1/p' <<< "$llq_long_output")
      starttime=$(sed -n 's/^ *Dispatch Time: \(.*\)/\1/p' <<< "$llq_long_output")
      endtime=$(sed -n 's/^ *Completion Date: \(.*\)/\1/p' <<< "$llq_long_output")

      # Convert both timestamps with the helper pair; the helpers hand their
      # results back in return_date_seconds and return_mds_date.
      if [ -n "$starttime" ]; then
        date_to_utc_seconds "$starttime"
        seconds_to_mds_date "$return_date_seconds"
        starttime=$return_mds_date
      fi
      if [ -n "$endtime" ]; then
        date_to_utc_seconds "$endtime"
        seconds_to_mds_date "$return_date_seconds"
        endtime=$return_mds_date
      fi

      job_read_diag

      # Override only the values that llq actually delivered
      [ -n "$walltime" ] && WallTime=${walltime}
      [ -n "$usertime" ] && UserTime=${usertime}
      [ -n "$usertime" ] && KernelTime=0
      [ -n "$starttime" ] && LRMSStartTime=${starttime}
      [ -n "$endtime" ] && LRMSEndTime=${endtime}
      #This needs investigating, might be user program exit code
      [ -n "$exitcode" ] && LRMSExitcode=$exitcode

      job_write_diag

      "${GMKICK}" -j "${jobid}" "$jobfile"
      continue
    fi
    # job finished and exit code is known
    save_commentfile "$uid" "$commentfile" "$errorsfile"
    echo "$exitcode Executable finished with exit code $exitcode" >> "$donefile"
    "${GMKICK}" -j "${jobid}" "$jobfile"
    continue
  fi
  # No exit code in the diag file: flag the job as done with -1
  exitcode=-1
  save_commentfile "$uid" "$commentfile" "$errorsfile"
  echo "$exitcode Job finished with unknown exit code" >> "$donefile"
  "${GMKICK}" -j "${jobid}" "$jobfile"
done

# Record how long the job handling took
if [ -n "$perflogdir" ]; then
   stop_ts=$(date +%s.%N)
   t=$(awk -v begin="$start_ts" -v end="$stop_ts" 'BEGIN { printf "%.3f", end-begin }')
   echo "[$(date '+%Y-%m-%d %T')] scan-ll-job, JobHandling, Handled= $numelem: $t" >> "$perflogfile"
fi

# NOTE(review): the reason for this pause is not visible in this file -
# presumably it limits how often the caller can re-run the scan; confirm.
sleep 60
exit 0
