#!/bin/bash -l
##################################################################################
#Andy Rampersaud, 02.23.16
#This script is called by setup_UCSC_BigWig.sh
##################################################################################
# Specify which shell to use
#$ -S /bin/bash
# Run on the current working directory
#$ -cwd

# Join standard output and error to a single file  
#$ -j n
# change to y if you want a single qlog file 
##################################################################################
#Initialize variables from UCSC_BigWig.sh
##################################################################################
#checking the command line arg
#-ne : "is not equal to"
if [ $# -ne 4 ] ; then
      echo "Need 4 arguments for the qsub command:"
      echo "qsub -N ${Job_Name}'_'${Sample_ID} -P waxmanlab -l h_rt=${TIME_LIMIT} Read_Strandness.qsub ${Sample_ID} ${Dataset_DIR} ${Sample_Labels_DIR} ${SCRIPT_DIR}"
      exit 0
fi

#http://www.ibm.com/developerworks/library/l-bash-parameters/
#Note: If you have more than 9 parameters, you cannot use $10 to refer to the tenth one. You must first either process or save the first parameter ($1), then use the shift command to drop parameter 1 and move all remaining parameters down 1, so that $10 becomes $9 and so on.

#http://unix.stackexchange.com/questions/104420/how-to-use-command-line-arguments-in-a-shell-script
#If you need access more than 9 command line arguments, you can use the shift command. Example: shift 2 renames $3 to $1, $4 to $2 etc.

#process the command line arguments
Sample_ID=$1
Dataset_DIR=$2
Sample_Labels_DIR=$3
SCRIPT_DIR=$4


#Print variables (make sure they appear correctly):
echo "-----------------------"
echo "Start of variable list:"
echo "-----------------------"
echo "Sample_ID:"
echo ${Sample_ID}
echo "Dataset_DIR:"
echo ${Dataset_DIR}
echo "Sample_Labels_DIR:"
echo ${Sample_Labels_DIR}
echo "SCRIPT_DIR:"
echo ${SCRIPT_DIR}
echo "-----------------------"
echo "End of variable list"
echo "-----------------------"

# Now let's keep track of some information just in case anything goes wrong
echo "=========================================================="
#Use to calculate job time:
#Start_Time in seconds
Start_Time=$(date +"%s")
echo "Starting on : $(date)"
echo "Running on node : $(hostname)"
echo "Current directory : $(pwd)"
echo "Current job ID : $JOB_ID"
echo "Current job name : $JOB_NAME"
echo "Task index number : $SGE_TASK_ID"
echo "Parameter for multiple cores : $NSLOTS"
echo "=========================================================="

# Go to local scratch directory
echo
echo 'Change dir to scratch directory'
echo
cd ${TMPDIR}
echo
echo 'Print scratch directory location:'
echo
echo $TMPDIR
#--------------------------------------
echo
echo 'Loading required modules...'
echo
echo
echo 'Loading required modules...'
echo
#Make sure the shebang line = #!/bin/bash -l
#Need the -l option to load modules
#Search for latest program installed:
#module avail -t 2>&1 | grep -i samtools
module load samtools/samtools-0.1.19_gnu446
#module avail -t 2>&1 | grep -i python
module load python2.7
#module avail -t 2>&1 | grep -i R-2.15.1
module load R/R-2.15.1_gnu-4.4.6   
#module avail -t 2>&1 | grep -i RSeQC                
module load rseqc/2.4
#--------------------------------------
#module help samtools/samtools-0.1.19_gnu446
#---------------------------------------------------------------------------------
#----------- Module Specific Help for 'samtools/samtools-0.1.19_gnu446' ---------------------------
#SAMtools 0.1.19_gnu446: Suite of programs for interacting with high-throughput sequencing data.

#SAM (Sequence Alignment/Map) format is a generic format for storing
#large nucleotide sequence alignments. SAM Tools provide various utilities
#for manipulating alignments in the SAM format, including sorting, merging,
#indexing and generating alignments in a per-position format.
#---------------------------------------------------------------------------------
#module help python2.7
#----------- Module Specific Help for 'python2.7/Python-2.7.3_gnu446' ---------------------------
#python2.7 Python-2.7.3_gnu446: A general purpose, interpretive programming language.

#For more information on python2.7, please see http://www.python.org
#---------------------------------------------------------------------------------
#module help R/R-2.15.1_gnu-4.4.6   
#----------- Module Specific Help for 'R/R-2.15.1_gnu-4.4.6' -------
#R 2.15.1  is a system (language) for statistical computation and graphics

#<<R consists of a language plus run-time enviroment with graphics, a debugger, access to certain system functions and the ability to run programs stored in script files>>

#For more information on R, visit http://cran.r-project.org/
#---------------------------------------------------------------------------------
#module help rseqc/2.4
#----------- Module Specific Help for 'rseqc/2.4' ------------------
#rseqc 2.4 An RNA-seq Quality Control Package
#RSeQC package provides a number of useful modules that can comprehensively evaluate high throughput sequence data especially RNA-seq data. Some basic modules quickly inspect sequence quality, nucleotide composition bias, PCR bias and GC bias, while RNA-seq specific modules evaluate sequencing saturation, mapped reads distribution, coverage uniformity, strand specificity, etc.
#For more information on rseqc, please see http://rseqc.sourceforge.net/
#---------------------------------------------------------------------------------
# copy user input data files to scratch
cp ${Dataset_DIR}/${Sample_ID}/fastq/tophat2/${Sample_ID}'_primary_unique.bam' .
#Initialize INPUT_BAM:
INPUT_BAM=${Sample_ID}'_primary_unique.bam'
#Copy Read_Strandness required files:
cp ${SCRIPT_DIR}/Input_Regions/mm9.refseq.bed.gz .

STORAGE_DIR=${Dataset_DIR}/${Sample_ID}/fastq/tophat2
#Create Read_Strandness output folder to store files:
OUTPUT_DIR='Read_Strandness'
###############################
if [ ! -d $OUTPUT_DIR ]; then
mkdir $OUTPUT_DIR
fi
###############################
#Create output file to store the result:
OUTPUT_FILE=$OUTPUT_DIR/${Sample_ID}'_'Read_Strandness.txt
######################
if [ -f $OUTPUT_FILE ]
then 
rm $OUTPUT_FILE
else
touch $OUTPUT_FILE
fi
######################

echo
echo 'Unzip files:'
echo

time gzip -d *.gz

echo
echo 'Finished unzipping'
echo

echo
echo 'List files in scratch directory:'
echo
ls -alh

echo
echo 'Starting to run my commands'
echo


echo
echo 'Starting infer_experiment.py command'
echo
#--------------------------------------------------------------------------------
#infer_experiment.py       
#Usage: infer_experiment.py [options]


#Options:
#  --version             show program's version number and exit
#  -h, --help            show this help message and exit
#  -i INPUT_FILE, --input-file=INPUT_FILE
#                        Input alignment file in SAM or BAM format
#  -r REFGENE_BED, --refgene=REFGENE_BED
#                        Reference gene model in bed fomat.
#  -s SAMPLE_SIZE, --sample-size=SAMPLE_SIZE
#                        Number of reads sampled from SAM/BAM file.
#                        default=200000
#  -q MAP_QUAL, --mapq=MAP_QUAL
#                        Minimum mapping quality (phred scaled) for an
#                        alignment to be called "uniquely mapped". default=30


#=================================================================================================
#Infer RNA-seq experiment design from SAM/BAM file. This module will determine if the RNA-seq
#experiment is:
#1) pair-end or single-end
#2) if experiment is strand-specific, how reads were stranded.
# * For pair-end RNA-seq, there are two different ways to strand reads:
#  i) 1++,1--,2+-,2-+
#     read1 mapped to '+' strand indicates parental gene on '+' strand
#     read1 mapped to '-' strand indicates parental gene on '-' strand
#     read2 mapped to '+' strand indicates parental gene on '-' strand
#     read2 mapped to '-' strand indicates parental gene on '+' strand
#  ii) 1+-,1-+,2++,2--
#     read1 mapped to '+' strand indicates parental gene on '-' strand
#     read1 mapped to '-' strand indicates parental gene on '+' strand
#     read2 mapped to '+' strand indicates parental gene on '+' strand
#     read2 mapped to '-' strand indicates parental gene on '-' strand
# * For single-end RNA-seq, there are two different ways to strand reads:
#  i) ++,--
#     read mapped to '+' strand indicates parental gene on '+' strand
#     read mapped to '-' strand indicates parental gene on '-' strand
#  ii) +-,-+
#     read mapped to '+' strand indicates parental gene on '-' strand
#     read mapped to '-' strand indicates parental gene on '+' strand		
#=================================================================================================
#--------------------------------------------------------------------------------
echo 'Printing command:'
echo "infer_experiment.py --input-file=${INPUT_BAM} --refgene=mm9.refseq.bed > $OUTPUT_FILE"
echo
#Run command:
infer_experiment.py --input-file=${INPUT_BAM} --refgene=mm9.refseq.bed > $OUTPUT_FILE
#--------------------------------------------------------------------------------

echo
echo 'Finished infer_experiment.py command'
echo

#Need to remove first 2 blank lines:
awk 'NR > 2 { print $0 }' $OUTPUT_FILE > temp1.txt
mv temp1.txt $OUTPUT_FILE

echo
echo 'Output file from infer_experiment.py:'
echo

head $OUTPUT_FILE

echo
echo 'Copy output to storage dir'
echo
cp -r $OUTPUT_DIR $STORAGE_DIR

echo
echo "List files in scratch"
echo
ls -alh

echo "=========================================================="
echo "Finished on : $(date)"
#Use to calculate job time:
#End_Time in seconds
End_Time=$(date +"%s")
diff=$(($End_Time-$Start_Time))
echo "$(($diff / 3600)) hours, $((($diff / 60) % 60)) minutes and $(($diff % 60)) seconds elapsed."
echo "=========================================================="