#!/bin/sh

# Print the usage summary and the option list to stdout, then terminate the
# script with exit status 1 (this is also what -h/--help ends up doing).
# The text lives in a quoted here-document so it is emitted verbatim.
show_help()
{
	cat <<'EOF'
Use: sge_submit_workers [options] <servername> <port> <num-workers>
         when auto mode is not enabled, or
     sge_submit_workers [options] <num-workers>
         when auto mode is enabled.
Options:
  -M,--master-name <name>  Name of the preferred master for worker. (auto mode enabled)
  -N,--name <name>         Same as -M (backwards compatibility). (auto mode enabled)
  -C,--catalog <catalog>   Set catalog server to <catalog>. <catalog> format: HOSTNAME:PORT.
  -t,--timeout <time>      Abort after this amount of idle time. (default=900s).
  -d,--debug <subsystem>   Enable debugging on worker for this subsystem (try -d all to start).
  -w,--tcp-window-size <size>  Set TCP window size.
  -i,--min-backoff <time>  Set initial value for backoff interval when worker fails to connect to a master. (default=1s)
  -b,--max-backoff <time>  Set maximum value for backoff interval when worker fails to connect to a master. (default=60s)
  -z,--disk-threshold <size>   Set available disk space threshold (in MB). When exceeded worker will clean up and reconnect. (default=100MB)
  -A,--arch <arch>         Set architecture string for the worker to report to master instead of the value in uname.
  -O,--os <os>             Set operating system string for the worker to report to master instead of the value in uname.
  -s,--workdir <path>      Set the location for creating the working directory of the worker.
  -P,--password <pwfile>   Password file to authenticate workers to master.
  --cores <num>            Set the number of cores each worker should use (0=auto). (default=1)
  -W,--workers <num>       Number of worker processes which will divide the cores evenly (Lee-Ping's addition)
  --memory <size>          Manually set the amount of memory (in MB) reported by this worker.
  --disk <size>            Manually set the amount of disk (in MB) reported by this worker.
  -f                       SSH port forwarding enabled (Lee-Ping's addition)
  -j                       Use job array to submit workers.
  -p <parameters>          SGE qsub parameters.
  -h,--help                Show this help message.
EOF
	exit 1
}

# An example of submitting three jobs, each with four slots and two workers per job, with SSH port forwarding,
# where the master host is leeping.stanford.edu at port 5813.  On leeping.stanford.edu, I have already run the command:
# ssh -o ServerAliveInterval=180 -N -R5813:leeping.stanford.edu:5813 -f leeping@fire-4-0-ext.slac.stanford.edu
#
# sge_submit_workers -f -W 2 -t 86400s -p "-l h_rt=168:00:00 -q cpuq -pe smp 4" fire-4-0-ext.slac.stanford.edu 5813 3
#

# State filled in by the option parser below.
arguments=""      # options forwarded verbatim to work_queue_worker
use_auto=0        # 1 once -a/-M/-N was seen: no <servername>/<port> expected
use_jobarray=0    # 1 when -j asks for a single SGE array job
parameters=""     # extra qsub parameters collected from -p
cores=""          # value of --cores; empty means "not given"
numworkers=1      # worker processes started by each job (-W)
port_forward=0    # 1 when -f asks for SSH port forwarding
pwfile=""         # password file given with -P

# Abort with a clear message when an option that takes a value is the last
# word on the command line (otherwise the value would silently be empty and
# the trailing "shift" of the loop would fail).
#   $1 - the option as typed by the user
#   $2 - number of command-line words left, including the option itself
require_value()
{
	if [ "$2" -lt 2 ]; then
		echo "$0: option '$1' requires an argument."
		echo "To view the help message, type: sge_submit_workers -h"
		exit 1
	fi
}

# Used options (as in the getopts format):  aM:N:C:t:d:w:i:b:z:A:O:s:P:W:fjp:h
# plus the long-only options --cores, --memory and --disk.

# Consume leading options; the first word that is not a known option ends
# the loop and is left in place as the first positional argument.
while [ $# -gt 0 ]
do
	case $1 in
		-a | --advertise)
			# Left here for backwards compatibility.
			arguments="$arguments -a"
			use_auto=1
			;;
		-M | --master-name | -N | --name)
			# -N is the legacy spelling of -M; both are forwarded as -M.
			require_value "$1" $#
			shift
			arguments="$arguments -M $1"
			use_auto=1
			;;
		-C | --catalog)
			require_value "$1" $#
			shift
			arguments="$arguments -C $1"
			;;
		-t | --timeout)
			require_value "$1" $#
			shift
			arguments="$arguments -t $1"
			;;
		-d | --debug)
			require_value "$1" $#
			shift
			arguments="$arguments -d $1"
			;;
		-w | --tcp-window-size)
			require_value "$1" $#
			shift
			arguments="$arguments -w $1"
			;;
		-i | --min-backoff)
			require_value "$1" $#
			shift
			arguments="$arguments -i $1"
			;;
		-b | --max-backoff)
			require_value "$1" $#
			shift
			arguments="$arguments -b $1"
			;;
		-z | --disk-threshold)
			require_value "$1" $#
			shift
			arguments="$arguments -z $1"
			;;
		-A | --arch)
			require_value "$1" $#
			shift
			arguments="$arguments -A $1"
			;;
		-O | --os)
			require_value "$1" $#
			shift
			arguments="$arguments -O $1"
			;;
		-s | --workdir)
			require_value "$1" $#
			shift
			arguments="$arguments -s $1"
			;;
		-P | --password)
			require_value "$1" $#
			shift
			pwfile=$1
			arguments="$arguments -P $pwfile"
			;;
		--cores)
			# Not forwarded here: the value is appended to $arguments later.
			require_value "$1" $#
			shift
			cores=$1
			;;
		-W | --workers)
			require_value "$1" $#
			shift
			numworkers=$1
			;;
		--memory)
			require_value "$1" $#
			shift
			arguments="$arguments --memory $1"
			;;
		--disk)
			require_value "$1" $#
			shift
			arguments="$arguments --disk $1"
			;;
		-f)
			port_forward=1
			;;
		-j)
			use_jobarray=1
			;;
		-p)
			# May be repeated; all values are concatenated.
			require_value "$1" $#
			shift
			parameters="$parameters $1"
			;;
		-h | --help)
			show_help
			;;
		*)
			break
			;;
	esac
	shift
done

# Collect the positional arguments.  Without auto mode the master's
# <servername> and <port> must precede <num-workers>; in auto mode only
# <num-workers> is expected and host/port stay empty.
if [ "$use_auto" = 0 ]; then
	if [ $# -ne 3 ]; then
		echo "3 arguments (<servername> <port> <num-workers>) are expected while $# is present: \"$*\"."
		echo "To view the help message, type: sge_submit_workers -h"
		exit 1
	fi
	host=$1
	port=$2
	count=$3
else
	if [ $# -ne 1 ]; then
		echo "1 argument (<num-workers>) is expected while $# is present: \"$*\"."
		echo "To view the help message, type: sge_submit_workers -h"
		exit 1
	fi
	host=
	port=
	count=$1
fi

# <num-workers> drives the submission loop; reject anything that is not a
# plain non-negative integer now instead of submitting nothing later.
case $count in
	'' | *[!0-9]*)
		echo "<num-workers> must be a non-negative integer, got: \"$count\"."
		echo "To view the help message, type: sge_submit_workers -h"
		exit 1
		;;
esac

# Locate the required executables.  "command -v" is the POSIX-specified way
# to resolve a command name and reports failure through its exit status,
# which the non-standard "which" does not do consistently on every system.

# $worker holds the resolved path of the worker binary that is copied later.
if ! worker=$(command -v work_queue_worker 2>/dev/null)
then
	echo "$0: please add 'work_queue_worker' to your PATH."
	exit 1
fi

# Only the presence of qsub is verified here; the resolved path is kept in $qsub.
if ! qsub=$(command -v qsub 2>/dev/null)
then
	echo "$0: please add 'qsub' to your PATH."
	exit 1
fi

# For SSH port forwarding, establish a connection from the head node (i.e. where this script is run) to the master.
# A tunnel is only opened when the process list shows no line that mentions
# this user, "ServerAlive" and the port, i.e. when none appears to exist yet.
if [ "$port_forward" = 1 ]; then
    # grep -c prints the number of remaining lines (0 when there are none).
    tunnel_count=$(ps aux | grep "$USER" | grep ServerAlive | grep "$port" | grep -v -c grep)
    if [ "$tunnel_count" -eq 0 ] ; then
        ssh -o ServerAliveInterval=180 -N -f -L"$port:$host:$port" "$host"
    fi
fi

#================================#
#| Lee-Ping's Modifications for |#
#| Best Performance on Clusters |#
#|  using Torque: vsp-compute,  |#
#|      Certainty, biox3        |#
#================================#
# Host-specific parameters to qsub.
# Note that these variables
# are evaluated on the head node!

# bash sets HOSTNAME by itself, a plain POSIX sh may not, hence the uname
# fallback.  The pattern is a substring match on the head node's name.
case "${HOSTNAME:-$(uname -n)}" in
    *hs-ln01*)
        # No --cores given: divide 16 cores between the workers of one job
        # (16 is presumably the per-node core count here -- TODO confirm).
        if [ -z "$cores" ] ; then cores=$(( 16 / numworkers )) ; fi
        # Extra "#$" directives spliced into the generated job script.
        QSUB_EXTRAS="#$ -l h_vmem=3.75G
#$ -l h_rt=672:00:00
#$ -pe shm $(( cores * numworkers ))"
        SELF_SSH="y"
        SCRATCH_DIR=/tmp/$USER/\$HOSTNAME
        # This variable will prevent leeping-workers
        # folders from being strewn everywhere.
        WORKER_DIR=/hsgs/nobackup/$USER
        HEADNODE="hs-ln01" # Using $HOSTNAME for ssh does not work.
        ;;
esac

# On any other host nothing above picked a core count.  Fall back to the
# documented default of one core per worker so that "--cores" is never
# emitted without a value (it would swallow the next worker argument).
if [ -z "$cores" ] ; then cores=1 ; fi
arguments="$arguments --cores $cores"

# Create a per-invocation directory (named after this script's PID) to hold
# the worker binary and the generated job scripts, then work from there.
# When no WORKER_DIR is configured the directory is placed under the current
# directory instead of under "/".  Any failure is fatal: continuing would
# write the job scripts into the wrong place.
worker_home=${WORKER_DIR:-$PWD}/${USER}-workers/$$
mkdir -p "$worker_home" || { echo "$0: cannot create directory '$worker_home'."; exit 1; }
cd "$worker_home" || { echo "$0: cannot change to directory '$worker_home'."; exit 1; }
cp "$worker" . || { echo "$0: cannot copy '$worker' to '$worker_home'."; exit 1; }

#====================================================#
#| Worker script has two layers.  This gives us     |#
#| the option of whether we should SSH back into    |#
#| the local node before running the worker, which  |#
#| allows us to bypass some buggy resource limits.  |#
#====================================================#
# First layer: the script handed to qsub.  The here-document delimiter is
# unquoted, so $QSUB_EXTRAS, $SELF_SSH and $PWD are expanded right now on the
# submitting host, while everything written as \$ is left for the job itself.
# Both branches pass this layer's PID to worker1.sh, which reads it as its
# first argument to watch whether the first layer is still alive.
# "=" (not "==") is used because the generated script runs under /bin/sh.

cat >worker.sh <<EOF
#!/bin/sh
#$ -V
#$ -N worker.sh
#$ -o \$JOB_NAME.o\$JOB_ID
#$ -e \$JOB_NAME.e\$JOB_ID
$QSUB_EXTRAS


# Execute the second layer with an optional self-SSH.
if [ "x$SELF_SSH" = "x" ] ; then
    $PWD/worker1.sh \$\$
else
    ssh \$HOSTNAME "$PWD/worker1.sh \$\$"
fi
EOF

#=======================================#
#|   Create the second layer script.   |#
#| This actually launches the workers. |#
#=======================================#
# The here-document delimiter is unquoted: plain $variables ($PWD, $cores,
# $USER, $port_forward, $HEADNODE, $port, $host, $numworkers, $arguments)
# are baked into the generated file now, while \$-escaped ones are evaluated
# when the job runs.  The generated code sticks to POSIX sh constructs
# ("case" and "=" rather than "[[ =~ ]]" and "==") because its interpreter
# line is #!/bin/sh.
cat >worker1.sh <<EOF
#!/bin/sh

# Limit core dump size.
ulimit -c 0

# This function makes the script kill itself if:
# 1) the process whose PID is given as argument stops running (i.e. job deleted by scheduler)
# 2) there are no more workers (i.e. idle timeout)
waitkill(){
    while sleep 1 ; do
        kill -0 \$1 2> /dev/null || break
        [ \$( ps xjf | grep work_queue_worker | grep -v grep | wc -l ) -gt 0 ] || break
    done
    kill -TERM -\$\$
}

# Go into the directory where the worker program is.
cd "$PWD"

# Set environment variables.
export OMP_NUM_THREADS=$cores
export MKL_NUM_THREADS=$cores
export CORES_PER_WORKER=$cores

case "\$HOSTNAME" in
    *cn43*)
        # Compute node specific fix
        export _CONDOR_SCRATCH_DIR=/state/partition1/tmp/$USER/\$HOSTNAME
        ;;
    *)
        export _CONDOR_SCRATCH_DIR=/tmp/$USER/\$HOSTNAME
        ;;
esac
mkdir -p \$_CONDOR_SCRATCH_DIR

# Optional SSH port forwarding
if [ "$port_forward" = 1 ]; then
    if [ \$(ps aux | grep $USER | grep $HEADNODE | grep ServerAlive | grep $port | grep -v grep | wc -l) -eq 0 ] ; then
        ssh -o ServerAliveInterval=180 -N -f -L$port:localhost:$port $HEADNODE
    fi
fi

waitkill \$1 &

# Actually execute the workers.
for i in \$(seq $numworkers); do
    export CUDA_DEVICE=\$(( i - 1 ))
    if [ "$port_forward" = 1 ]; then
        ./work_queue_worker -d all $arguments localhost $port &
    else
        ./work_queue_worker -d all $arguments $host $port &
    fi
done

wait
EOF

chmod 755 worker.sh worker1.sh

# Submit the jobs: either one array job with $count tasks (-j) or $count
# separate jobs.  The exit status is that of the most recent qsub call that
# failed, or 0 when every call succeeded, so an early failure is no longer
# hidden by a later success.
# $parameters is left unquoted on purpose: it carries several qsub options
# that must be split into separate words.
return_status=0
if [ "$use_jobarray" = 1 ]
then
	qsub -t "1-$count:1" -cwd $parameters worker.sh || return_status=$?
else
	n=1
	while [ "$n" -le "$count" ]
	do
		qsub -cwd $parameters worker.sh || return_status=$?
		n=$((n + 1))
	done
fi

exit $return_status
