#!/bin/bash
##################################################################################
#Andy Rampersaud, 02.22.16
#This script would be used to run Extract_Counts.pbs in parallel for different FASTQ files
#Way to run script:
#Usage: ./Run_Jobs.sh
#Example: 
#./Run_Jobs.sh 
##################################################################################
#Remove *.o files from previous jobs
#Need the 2>/dev/null to supress the "No such file or directory"
count=`ls -1 *.o* 2>/dev/null  | wc -l`
if [ $count != 0 ]
then 
rm *.o*
rm *.e*
fi 
##################################################################################
#---------------------------------------------------------------------------------
#Source the setup file to initialize variables
source ../00_Setup_Pipeline/01_Pipeline_Setup.sh
#---------------------------------------------------------------------------------
#Check that each variable prints a value to the terminal:
echo "-----------------------"
echo "Start of variable list:"
echo "-----------------------"
echo "Dataset_DIR:"
echo ${Dataset_DIR}
echo "Sample_Labels_DIR:"
echo ${Sample_Labels_DIR}
echo "GTF_Files_DIR:"
echo ${GTF_Files_DIR}
echo "MODE:"
echo ${MODE}
echo "STRANDEDNESS_HTSeq:"
echo ${STRANDEDNESS_HTSeq}
echo "FEATURE_ID:"
echo ${FEATURE_ID}
echo "SCRIPT_DIR:"
echo ${SCRIPT_DIR}
echo "TIME_LIMIT:"
echo ${TIME_LIMIT}
echo "-----------------------"
echo "End of variable list"
echo "-----------------------"
#---------------------------------------------------------------------------------
#A text file (Sample_Labels.txt) is needed to run this script
cp $Sample_Labels_DIR/Sample_Labels.txt $SCRIPT_DIR
#---------------------------------------------------------------------------------
#This will guarantee that only tabs exists between columns:
awk '{print $1"\t"$2"\t"$3}' Sample_Labels.txt > Sample_Labels.temp1
mv Sample_Labels.temp1 Sample_Labels.txt
#---------------------------------------------------------------------------------
################################################
#The text file is formatted like the following:
#----------------------------------------------
#Sample_DIR	Sample_ID	Description
#Sample_Waxman-TP17	G83_M1	Male 8wk-pool 1
#Sample_Waxman-TP18	G83_M2	Male 8wk-pool 2
#Sample_Waxman-TP19	G83_M3	Female 8wk-pool 1
#Sample_Waxman-TP20	G83_M4	Female 8wk-pool 2	
#----------------------------------------------
#The 1st column: The Sample_DIR name
#The 2nd column: Waxman Lab Sample_ID 
#The 3rd column: Sample's description 
################################################
echo "-----------------------"
echo "Start of qsub commands:"
echo "-----------------------"
#Text file has a header line to ignore:
tail -n +2 Sample_Labels.txt > Sample_Labels.temp
#Use a while loop to run jobs
while IFS=$'\t' read -r -a myArray
do
#---------------------------
##Check that text file is read in properly:
#echo 'Sample_DIR:'
Sample_DIR=${myArray[0]}
#echo 'Sample_ID:'
Sample_ID=${myArray[1]}
#echo $Sample_ID
#echo 'Description:'
Description=${myArray[2]}
#echo $Description
#---------------------------
#Note:
#If you need more than 9 command line arguments, you can use the shift command. 
#See details in the Extract_Counts.qsub
#---------------------------
#Gene Annotation file to use
#---------------------------------------------------------------------------------
#For read mapping purposes we want the full gene body information (use the RefSeq_GeneBody.gtf)
#Presence of splice junctions should be based on the full gene structure for any isoform of a gene symbol
#For read counting purposes we can use either 
#1. RefSeq_GeneBody.gtf
#2. Intron_Only_Regions.gtf
#3. Exon_Only_Regions.gtf
#---------------------------------------------------------------------------------
#Start loop over GTF files:
GTF_List=$GTF_Files_DIR/*.gtf
for GTF_file in $GTF_List
do
ANNOTATION_FILE=`basename $GTF_file`
#echo ${ANNOTATION_FILE}
##################################################################################
##################################################################################
#Please: DO NOT EDIT CODE BELOW
#Additional variables will be populated from if statements
##################################################################################
##################################################################################
#Extracting counts:
#How reads are counted
#--------------------------
#if EXONS_ONLY is set to be true, then, only reads that overlap exons will be counted (recommended)
#if EXONS_ONLY is set to be false,then, all reads that overlap a gene will be counted
#Need if statements to determine FEATURE_TYPE depending on the ANNOTATION_FILE used:
#------------------------------------------
#Use the -o (logical "or")
if [ "${ANNOTATION_FILE}" == "genes.gtf" -o "${ANNOTATION_FILE}" == "RefSeq_GeneBody.gtf" ];
then
EXONS_ONLY="TRUE"
#If statement to determine FEATURE_TYPE:
if [ $EXONS_ONLY == "TRUE" ] 
then
	FEATURE_TYPE="exon"
else
	FEATURE_TYPE="CDS"
fi
fi
#------------------------------------------
if [ "${ANNOTATION_FILE}" == "Intron_Only_Regions.gtf" ];
then
#The 3rd column in the Intron_Only_Regions.gtf is "Intronic_Only"
FEATURE_TYPE="Intronic_Only"
fi
#------------------------------------------
if [ "${ANNOTATION_FILE}" == "Exon_Only_Regions.gtf" ];
then
#The 3rd column in the Exon_Only_Regions.gtf is "Exonic_Only"
FEATURE_TYPE="Exonic_Only"
fi


#Start of lncRNA gtf, ignore all of these gtf
#
#------------------------------------------
if [ "${ANNOTATION_FILE}" == "ncRNA_exon_for_counting.gtf" -o "${ANNOTATION_FILE}" == "ncRNA_genebodies_for_counting.gtf" -o "${ANNOTATION_FILE}" == "intronic_only_gene_models_ncRNA_for_counting.gtf" -o "${ANNOTATION_FILE}" == "exonic_only_gene_models_ncRNA_for_counting.gtf" ];
then
echo "lncRNA GTF file is ignored by HTSeq, see FeatureCount result output"
else
#non lncRNA GTFs
#---------------------------------------------------------------------------------
#--------------------------------------------------------------------------------
#Create a job name that's a function of the folder name:
#Extract the folder name:
DIR_name=`basename ${SCRIPT_DIR}`
#Split by "_" grab the 1st part (use cut command)
Step_Num=$(echo ${DIR_name} | cut -d'_' -f 1)
#Create the Job_Name:
Job_Name=$(echo 'Step_'${Step_Num})
#Use ${Job_Name} for qsub -N parameter
#--------------------------------------------------------------------------------
#Now use arguments in the PBS script call:
	qsub -N ${Job_Name}'_'${Sample_ID} -P waxmanlab -l h_rt=${TIME_LIMIT} Extract_Counts.qsub ${Sample_ID} ${Dataset_DIR} ${Sample_Labels_DIR} ${GTF_Files_DIR} ${MODE} ${STRANDEDNESS_HTSeq} ${FEATURE_ID} ${SCRIPT_DIR} ${ANNOTATION_FILE} ${FEATURE_TYPE} 
#Printing the qsub command which submits the job
	echo "qsub -N ${Job_Name}'_'${Sample_ID} -P waxmanlab -l h_rt=${TIME_LIMIT} Extract_Counts.qsub ${Sample_ID} ${Dataset_DIR} ${Sample_Labels_DIR} ${GTF_Files_DIR} ${MODE} ${STRANDEDNESS_HTSeq} ${FEATURE_ID} ${SCRIPT_DIR} ${ANNOTATION_FILE} ${FEATURE_TYPE}"
#---------------------------------------------------------------------------------
fi #End of if statement to ignore lncRNA gtfs
#End loop over GTF files:
done
#End loop over Sample_Labels:
done < Sample_Labels.temp
#Remove the temp file:
rm Sample_Labels.temp
#---------------------------------------------------------------------------------
echo "-----------------------"
echo "End of qsub commands"
echo "-----------------------"

##################################################################################