#!/bin/sh
# set -xv
#
#  Submits job to Sun Grid Engine (SGE).
#  Input: path to grami file (same as Globus).
#
# A temporary job script is created for the submission and then removed 
# at the end of this script. 
#

echo "----- starting submit_sge_job -----" 1>&2

# Backend identification and the names of the configuration options that
# belong to this backend (globally and per queue). These variables are not
# read in this file; presumably the sourced common code consumes them.
joboption_lrms="sge"
lrms_options="sge_root sge_cell sge_qmaster_port sge_execd_port sge_bin_path sge_jobopts"
queue_options="sge_jobopts"

# ARC1 passes first the config file.
# Usage: [--config <file>] <grami file>
if [ "$1" = "--config" ]; then shift; ARC_CONFIG=$1; shift; fi
GRAMI_FILE=$1

# define paths and config parser
# $0 and the derived directory are quoted so that an installation path
# containing whitespace is resolved correctly.
basedir=$(dirname "$0")
basedir=$(cd "$basedir" > /dev/null && pwd) || exit $?
. "${basedir}/lrms_common.sh"

# include common submit functions
. "${pkgdatadir}/submit_common.sh" || exit $?

# run common init
#  * parse grami
#  * parse config
#  * load LRMS-specific env
#  * set common variables
common_init

# Performance log: file that receives the submission timings
perflogfilesub="${perflogdir}/submission.perflog"

# Record the start time stamp only when perflogdir is set
if [ -n "$perflogdir" ]; then
   start_ts=$(date +%s.%N)
fi

##############################################################
# Zero stage of runtime environments
##############################################################

RTE_stage0

# SGE client commands. qsub is forced to use /bin/sh as the job shell; all
# other qsub options are written into the job script itself.
# SGE_QSUB holds a command plus arguments and is expanded unquoted later on.
SGE_QCONF=qconf
SGE_QSUB='qsub -S /bin/sh'
if [ -n "$SGE_BIN_PATH" ]; then
  # Prefix both commands with the configured binary directory
  SGE_QCONF="${SGE_BIN_PATH}/${SGE_QCONF}"
  SGE_QSUB="${SGE_BIN_PATH}/${SGE_QSUB}"
fi

# Helper from the sourced common code; the rest of this script relies on
# LRMS_JOB_SCRIPT, LRMS_JOB_OUT and LRMS_JOB_ERR being set after this call.
mktempscript

##############################################################
# Start job script
##############################################################
# The first redirection truncates the job script; everything later appends.
# Lines starting with "#$" are the qsub options embedded in the script.
{
  echo '#!/bin/sh'
  echo "# SGE batch job script built by arex"
  # Job not rerunable:
  echo "#$ -r n"
  # Don't send mail when job finishes:
  echo "#$ -m n"
  # Mix standard output and standard error:
  echo "#$ -j y"
  # Write output to comment file:
  echo "#$ -o ${joboption_directory}/.comment"
} > "$LRMS_JOB_SCRIPT"

##############################################################
# priority
##############################################################
# ARC priorities are 0-100 while SGE uses -1023 -> 1024. A user can only
# decrease priority, i.e. use -1023 -> 0 (info from gsciacca@lhep.unibe.ch),
# so the ARC value is scaled to 0..1023 and then shifted down by 1023.
# Same problem as SLURM: only grid jobs can be prioritized against each
# other; locally submitted jobs will get highest priority.
if [ -n "$joboption_priority" ]; then
  priority=$(( joboption_priority * 1023 / 100 - 1023 ))
  echo "#$ -p ${priority}" >> "$LRMS_JOB_SCRIPT"
fi

# Choose queue.
echo "#$ -q $joboption_queue" >> "$LRMS_JOB_SCRIPT"

# Job name for convenience. The requested name is sanitized in three steps:
#   1. prefix it with "N" when it does not start with a letter,
#   2. replace every non-alphanumeric character with "_",
#   3. cut it down to the first 15 characters.
if [ -n "${joboption_jobname}" ]; then
  jobname=$(echo "$joboption_jobname" \
            | sed -e 's/^\([^[:alpha:]]\)/N\1/' \
                  -e 's/[^[:alnum:]]/_/g' \
                  -e 's/\(...............\).*/\1/')
  echo "#$ -N \"$jobname\"" >> "$LRMS_JOB_SCRIPT"
fi
# Logged unconditionally; the value is empty when no name was requested
echo "SGE jobname: $jobname" 1>&2

##############################################################
# (non-)parallel jobs
##############################################################

set_count

##############################################################
# parallel jobs
##############################################################
# SGE needs a valid parallel environment (PE) in addition to the number of
# parallel tasks.
#
# The PE is chosen by a Runtime Environment setup script in the zero stage:
# the user requests a suitable RE besides setting "count" in the xrsl, and
# the RE script stores the desired PE name in joboption_nodeproperty_#
# (# counts up from zero; the RE should take the lowest number not yet in
# use). Below, these variables are walked in order and each one is compared
# with the PE list reported by SGE. The first name found in that list wins.
#
if [ -n "$joboption_nodeproperty_0" ]; then
    sge_parallel_environment_list=$($SGE_QCONF -spl)
    i=0
    # The walk stops at the first empty joboption_nodeproperty_<i>, which
    # leaves $jope empty and thereby signals "no match" after the loop.
    while eval "jope=\${joboption_nodeproperty_$i}" && [ -n "$jope" ]; do
        for ipe in $sge_parallel_environment_list; do
            # On a match leave both loops; $jope holds the requested PE
            [ "$jope" = "$ipe" ] && break 2
        done
        i=$(($i + 1))
    done
    if [ -z "$jope" ]; then
        echo 'ERROR: Setting parallel environment failed.' 1>&2
    else
        echo "#\$ -pe $jope $joboption_count" >> "$LRMS_JOB_SCRIPT"
    fi
fi

# Exclusive node execution: request the first complex whose "qconf -sc"
# line contains EXCL, if the Grid Engine has one configured.
if [ "$joboption_exclusivenode" = "true" ]; then
  sge_excl_complex=$($SGE_QCONF -sc | grep EXCL | head -n 1)
  if [ -z "$sge_excl_complex" ]; then
     echo "WARNING: Exclusive execution support is not configured by this Grid Engine" 1>&2
     echo "WARNING: Example configuration: https://wiki.nordugrid.org/index.php/LRMS_Backends/Testbeds" 1>&2
  else
     # The complex name is the first column of the matching line
     sge_excl_complex_name=$(echo $sge_excl_complex | awk '{print $1}')
     echo "#\$ -l ${sge_excl_complex_name}=true"  >> "$LRMS_JOB_SCRIPT"
  fi
fi


##############################################################
# Execution times (obtained in seconds)
##############################################################
# SGE knows soft and hard limits (soft = SIGUSR1, hard = SIGKILL sent to the
# job); the hard limit is the soft one scaled by time_hardlimit_ratio.
# $joboption_cputime and $joboption_walltime are given in seconds.
# parallel jobs, add initialization time, soft/hard limit configurable...
if [ -n "$joboption_cputime" ] && [ $joboption_cputime -gt 0 ]; then
  # SGE enforces a job-total cpu time limit but expects h_cpu and s_cpu as
  # per-slot values, which it multiplies by the number of requested slots
  # before enforcing them.
  # The variables are expanded with "$" on purpose: the ratio is inserted
  # into the expression as text before it is evaluated.
  cputime_perslot=$(( $joboption_cputime / $joboption_count ))
  cputime_hard_perslot=$(( $cputime_perslot * $time_hardlimit_ratio ))
  # A limit is only requested when the complex is listed by "qconf -sc"
  # with YES or FORCED in its fifth column.
  s_cpu_requestable=$($SGE_QCONF -sc | awk '$1 == "s_cpu" && ($5 == "YES" || $5 == "FORCED") { print $5 }')
  h_cpu_requestable=$($SGE_QCONF -sc | awk '$1 == "h_cpu" && ($5 == "YES" || $5 == "FORCED") { print $5 }')
  opt='#$'
  if [ -n "$s_cpu_requestable" ]; then
    opt="$opt -l s_cpu=::${cputime_perslot}"
  fi
  if [ -n "$h_cpu_requestable" ]; then
    opt="$opt -l h_cpu=::${cputime_hard_perslot}"
  fi
  echo "$opt" >> "$LRMS_JOB_SCRIPT"
fi

# Wall clock limits: s_rt is the requested wall time, h_rt is that value
# scaled by time_hardlimit_ratio. Each limit is only requested when
# "qconf -sc" lists the complex with YES or FORCED in its fifth column.
if [ -n "$joboption_walltime" ] ; then
  if [ $joboption_walltime -lt 0 ] ; then
    # Double quotes so that the offending value actually shows up in the log
    echo "WARNING: Less than 0 wall time requested: $joboption_walltime" 1>&2
    joboption_walltime=0
    echo 'WARNING: wall time set to 0' 1>&2
  fi
  # "$" expansion is deliberate: the ratio is inserted as text before the
  # arithmetic expression is evaluated.
  joboption_walltime_hard=$(( $joboption_walltime * $time_hardlimit_ratio ))
  s_rt_requestable=$($SGE_QCONF -sc|awk '($1=="s_rt" && ( $5=="YES" || $5=="FORCED" )){print $5}')
  h_rt_requestable=$($SGE_QCONF -sc|awk '($1=="h_rt" && ( $5=="YES" || $5=="FORCED" )){print $5}')
  opt="#$"
  if [ $s_rt_requestable ]; then opt="$opt -l s_rt=::${joboption_walltime}"; fi
  if [ $h_rt_requestable ]; then opt="$opt -l h_rt=::${joboption_walltime_hard}"; fi
  echo $opt >> $LRMS_JOB_SCRIPT
fi



##############################################################
# Requested memory (mb)
##############################################################

set_req_mem

# SGE has a soft and a hard limit for virtual memory consumption. The soft
# limit is the requested memory, the hard limit is that value scaled by
# memory_hardlimit_ratio; both are written with an "M" suffix.
if [ -n "$joboption_memory" ]; then
  # "$" expansion is deliberate: the ratio is inserted as text before the
  # arithmetic expression is evaluated.
  joboption_memory_hard=$(( $joboption_memory * $memory_hardlimit_ratio ))
  # Only request limits whose complex is marked YES or FORCED by "qconf -sc"
  h_vmem_requestable=$($SGE_QCONF -sc | awk '$1 == "h_vmem" && ($5 == "YES" || $5 == "FORCED") { print $5 }')
  s_vmem_requestable=$($SGE_QCONF -sc | awk '$1 == "s_vmem" && ($5 == "YES" || $5 == "FORCED") { print $5 }')
  opt='#$'
  if [ -n "$s_vmem_requestable" ]; then
    opt="$opt -l s_vmem=${joboption_memory}M"
  fi
  if [ -n "$h_vmem_requestable" ]; then
    opt="$opt -l h_vmem=${joboption_memory_hard}M"
  fi
  echo "$opt" >> "$LRMS_JOB_SCRIPT"
fi

##############################################################
# Extra job options. Written after all other "#$" lines so
# that they can overwrite previously set options.
##############################################################
if [ -n "$CONFIG_sge_jobopts" ]; then
  echo "#$ $CONFIG_sge_jobopts" >> "$LRMS_JOB_SCRIPT"
fi

##############################################################
# Override umask
##############################################################
# First non-option lines of the job script: a blank separator, a comment
# and the umask command itself.
{
  echo ""
  echo "# Overide umask of execution node (sometime values are really strange)"
  echo "umask 077"
} >> "$LRMS_JOB_SCRIPT"

##############################################################
# By default, use $TMPDIR from SGE to take advantage of its cleanup
# facilities. It can be overridden with scratchdir though.
# Don't do this if "shared_scratch" is defined in arc.conf.
# NOTE(review): the code below tests RUNTIME_FRONTEND_SEES_NODE; presumably
# that variable is what "shared_scratch" sets - confirm in the config parser.
##############################################################

# Only emitted when a local scratch dir is in use and
# RUNTIME_FRONTEND_SEES_NODE is empty.
if [ "$RUNTIME_LOCAL_SCRATCH_DIR" ] && [ ! "$RUNTIME_FRONTEND_SEES_NODE" ]; then
    # The line written to the job script is evaluated on the execution node.
    # With CONFIG_scratchdir set, its value is inserted here; otherwise the
    # escaped default leaves a literal $TMPDIR that is expanded when the job
    # runs. The override is guarded by a directory test inside the job script.
    echo "if [ -d \"${CONFIG_scratchdir:-\$TMPDIR}\" ]; then RUNTIME_LOCAL_SCRATCH_DIR=${CONFIG_scratchdir:-\$TMPDIR}; fi" >> $LRMS_JOB_SCRIPT
fi

# Helper defined outside this file
sourcewithargs_jobscript

##############################################################
# Init accounting
##############################################################
accounting_init

##############################################################
# Add environment variables
##############################################################
add_user_env

##############################################################
# Check for existence of executable,
##############################################################
if [ -z "${joboption_arg_0}" ] ; then
  echo 'Executable is not specified' 1>&2
  exit 1
fi

# An executable that starts with '$' (variable reference) or '/' (absolute
# path) is not checked here. Anything else is taken relative to the session
# directory and must exist there as an executable regular file.
program_start=$(echo ${joboption_arg_0} | cut -c 1 2>&1)
if [ "$program_start" != '$' ] && [ "$program_start" != '/' ] ; then
  # Paths are quoted so that names containing whitespace are tested as one
  # file instead of breaking the test command.
  if [ ! -f "$joboption_directory/${joboption_arg_0}" ] ; then
    echo 'Executable does not exist, or permission denied.' 1>&2
    echo "   Executable $joboption_directory/${joboption_arg_0}" 1>&2
    echo "   whoami: $(whoami)" 1>&2
    # Diagnostics belong on stderr like every other message of this script
    echo "   ls -l $joboption_directory/${joboption_arg_0}: $(ls -l "$joboption_directory/${joboption_arg_0}")" 1>&2
    exit 1
  fi
  if [ ! -x "$joboption_directory/${joboption_arg_0}" ] ; then
    echo 'Executable is not executable' 1>&2
    exit 1
  fi
fi


setup_runtime_env

# Override location of .diag file: put it under the working directory.
# The line is written literally; the variable is expanded when the job runs.
printf '%s\n' 'RUNTIME_JOB_DIAG=$RUNTIME_JOB_DIR/.diag' >> "$LRMS_JOB_SCRIPT"

##############################################################
# Add std... to job arguments
##############################################################
include_std_streams

##############################################################
#  Move files to local working directory (job is done on node only)
#  RUNTIME_JOB_DIR -> RUNTIME_LOCAL_SCRATCH_DIR/job_id
##############################################################
move_files_to_node

# Initialize the job's result code, framed by blank lines
{
  echo ""
  echo "RESULT=0"
  echo ""
} >> "$LRMS_JOB_SCRIPT"



##############################################################
#  Skip execution if something already failed
##############################################################
# Opens an "if" inside the job script; the matching "fi" is written at the
# end of this section, after the execution part.
cat >> "$LRMS_JOB_SCRIPT" <<'EOF'

if [ "$RESULT" = '0' ] ; then
EOF

##############################################################
#  Runtime configuration
##############################################################

RTE_stage1

# Have the job record its runtime environments in the .diag file
cat >> "$LRMS_JOB_SCRIPT" <<'EOF'
echo "runtimeenvironments=$runtimeenvironments" >> "$RUNTIME_JOB_DIAG"
EOF

# This backend requires RUNTIME_NODE_SEES_FRONTEND to be set; otherwise the
# temporary files are removed and the submission is aborted.
if [ -z "$RUNTIME_NODE_SEES_FRONTEND" ]; then
  echo "Nodes detached from gridarea are not supported when SGE is used. Aborting job submit" 1>&2
  rm -f "$LRMS_JOB_SCRIPT" "$LRMS_JOB_OUT" "$LRMS_JOB_ERR"
  exit 1
fi

##############################################################
# Accounting (WN OS Detection)
##############################################################
detect_wn_systemsoftware


##############################################################
#  Execution
##############################################################
cd_and_run
# Close the "if" opened at the top of this section
echo 'fi' >> "$LRMS_JOB_SCRIPT"

##############################################################
#  Runtime (post)configuration at computing node
##############################################################
RTE_stage2

##############################################################
#  Move files back to session directory (job is done on node only)
#  RUNTIME_JOB_DIR -> RUNTIME_LOCAL_SCRATCH_DIR/job_id
##############################################################
move_files_to_frontend

##############################################################
# Finish accounting and exit job
##############################################################
accounting_end

# Performance logging: report how long building the job script took, then
# take a fresh start time stamp for the following measurement.
if [ -n "$perflogdir" ]; then
   stop_ts=$(date +%s.%N)
   t=$(awk "BEGIN { printf \"%.3f\", ${stop_ts}-${start_ts} }")
   echo "[$(date '+%Y-%m-%d %T')] submit-sge-job, JobScriptCreation: $t" >> "$perflogfilesub"

   #start time stamp
   start_ts=$(date +%s.%N)
fi

#######################################
#  Submit the job
#######################################
# The submission runs in a subshell: the cd below then does not affect the
# rest of this script and all output is redirected to stderr in one place.
# Variables assigned inside a subshell are lost when it ends, so the outcome
# is handed back through the subshell's exit status and picked up right
# after the closing parenthesis.
(
  echo "SGE job script built"
  cd "$joboption_directory"
  echo "SGE script follows:"
  cat "$LRMS_JOB_SCRIPT"
  echo

  # Execute qsub command
  # SGE_QSUB is intentionally unquoted: it holds the command and its options
  ${SGE_QSUB} < "$LRMS_JOB_SCRIPT" 1>"$LRMS_JOB_OUT" 2>"$LRMS_JOB_ERR"

  # expected SGE output is like: 'Your job 77 ("perftest") has been
  # submitted', the line below uses only the job number as job id.
  job_id=$(cat "$LRMS_JOB_OUT" "$LRMS_JOB_ERR" \
           | awk '/^.our job .* has been submitted/ {split($0,field," ");print field[3]}')
  # anything else is a sign of problems, which should be logged
  warnings=$(cat "$LRMS_JOB_OUT" "$LRMS_JOB_ERR" \
             | grep -v '^.our job .* has been submitted' | grep -v '^Exit')
  if [ -n "$warnings" ]; then echo "WARNING: $warnings"; echo; fi

  # Quoted so that an unexpected multi-word value cannot break the test
  if [ -z "$job_id" ] ; then
    echo "job *NOT* submitted successfully!"
    exitcode=1
  else
    # Record the local job id in the grami file
    echo "joboption_jobid=$job_id" >> "$GRAMI_FILE"
    echo "local job id: $job_id"
    echo "job submitted successfully!"
    exitcode=0
  fi

  # Remove temporary job script file
  rm -f "$LRMS_JOB_SCRIPT" "$LRMS_JOB_OUT" "$LRMS_JOB_ERR"
  echo "----- exiting submit_sge_job -----"

  # Hand the result to the parent shell
  exit $exitcode
) 1>&2
exitcode=$?

# Performance logging: report how long the submission itself took
if [ ! -z "$perflogdir" ]; then
   stop_ts=`date +%s.%N`
   t=`awk "BEGIN { printf \"%.3f\", ${stop_ts}-${start_ts} }"`
   echo "[`date +%Y-%m-%d\ %T`] submit-sge-job, JobSubmission: $t" >> $perflogfilesub
fi

# 0 when a job id was obtained from qsub, 1 otherwise
exit $exitcode
