#!/bin/bash -l
##################################################################################
#Andy Rampersaud, 02.22.16
#This script is called by setup_Extract_Counts.sh
##################################################################################
# Specify which shell to use
#$ -S /bin/bash
# Run on the current working directory
#$ -cwd

# Join standard output and error to a single file  
#$ -j n
# change to y if you want a single qlog file 
##################################################################################
#Initialize variables from Extract_Counts.sh
##################################################################################
#checking the command line arg
#-ne : "is not equal to"
if [ $# -ne 10 ] ; then
      echo "Need 10 arguments for the qsub command:"
      echo "qsub -N ${Job_Name}'_'${Sample_ID} -P waxmanlab -l h_rt=${TIME_LIMIT} Extract_Counts.qsub ${Sample_ID} ${Dataset_DIR} ${Sample_Labels_DIR} ${GTF_Files_DIR} ${MODE} ${STRANDEDNESS_HTSeq} ${FEATURE_ID} ${SCRIPT_DIR} ${ANNOTATION_FILE} ${FEATURE_TYPE}"
      exit 0
fi

#http://www.ibm.com/developerworks/library/l-bash-parameters/
#Note: If you have more than 9 parameters, you cannot use $10 to refer to the tenth one. You must first either process or save the first parameter ($1), then use the shift command to drop parameter 1 and move all remaining parameters down 1, so that $10 becomes $9 and so on.

#http://unix.stackexchange.com/questions/104420/how-to-use-command-line-arguments-in-a-shell-script
#If you need access more than 9 command line arguments, you can use the shift command. Example: shift 2 renames $3 to $1, $4 to $2 etc.

#process the command line arguments
Sample_ID=$1
Dataset_DIR=$2
Sample_Labels_DIR=$3
GTF_Files_DIR=$4
MODE=$5
STRANDEDNESS_HTSeq=$6
FEATURE_ID=$7
SCRIPT_DIR=$8
ANNOTATION_FILE=$9
#FEATURE_TYPE=$10

echo "-----------------------"
echo "Need shift 1 command (have more than 9 arguments):"
#The "shift" command allows users to move through an argument list (if there are more than 9 arguments)
#It removes the arguments that occur at the beginning of the list and shifts arguments at the end of the list so that they are now within the list of 9 arguments
#Example: shift 1
#This will remove arg #1 then shift args #2 - #10 so that they are numbered $1 - $9
#Now the user can save the arg #10 as $9
#Shift_Number = (Total number of args) - (9)
#For this script:
#Shift_Number = (Total number of args) - (9) = (10) - (9) = 1
#Hence: Need shift 4 command
#Now I can save arg:
#10 as $9
echo "-----------------------"

shift 1

#process the command line arguments
FEATURE_TYPE=$9

#Print variables (make sure they appear correctly):
echo "-----------------------"
echo "Start of variable list:"
echo "-----------------------"
echo "Sample_ID:"
echo ${Sample_ID}
echo "Dataset_DIR:"
echo ${Dataset_DIR}
echo "Sample_Labels_DIR:"
echo ${Sample_Labels_DIR}
echo "GTF_Files_DIR:"
echo ${GTF_Files_DIR}
echo "MODE:"
echo ${MODE}
echo "STRANDEDNESS_HTSeq:"
echo ${STRANDEDNESS_HTSeq}
echo "FEATURE_ID:"
echo ${FEATURE_ID}
echo "SCRIPT_DIR:"
echo ${SCRIPT_DIR}
echo "ANNOTATION_FILE:"
echo ${ANNOTATION_FILE}
echo "FEATURE_TYPE:"
echo ${FEATURE_TYPE}
echo "-----------------------"
echo "End of variable list"
echo "-----------------------"

# Now let's keep track of some information just in case anything goes wrong
echo "=========================================================="
#Use to calculate job time:
#Start_Time in seconds
Start_Time=$(date +"%s")
echo "Starting on : $(date)"
echo "Running on node : $(hostname)"
echo "Current directory : $(pwd)"
echo "Current job ID : $JOB_ID"
echo "Current job name : $JOB_NAME"
echo "Task index number : $SGE_TASK_ID"
echo "Parameter for multiple cores : $NSLOTS"
echo "=========================================================="

# Go to local scratch directory
echo
echo 'Change dir to scratch directory'
echo
cd ${TMPDIR}
echo
echo 'Print scratch directory location:'
echo
echo $TMPDIR
#--------------------------------------
echo
echo 'Loading required modules...'
echo
#Make sure the shebang line = #!/bin/bash -l
#Need the -l option to load modules
#Search for latest program installed:
#module avail -t 2>&1 | grep -i python2.7
module load python2.7/Python-2.7.3_gnu446
#module avail -t 2>&1 | grep -i samtools
module load samtools/samtools-0.1.19_gnu446
#module avail -t 2>&1 | grep -i HTSeq
module load htseq/0.6.1p1
#module avail -t 2>&1 | grep -i bedtools
module load bedtools/2.24.0
#----------------------------------------------------------------------------------
#module help python2.7/Python-2.7.3_gnu446
#----------- Module Specific Help for 'python2.7/Python-2.7.3_gnu446' ---------------------------
#python2.7 Python-2.7.3_gnu446: A general purpose, interpretive programming language.
#For more information on python2.7, please see http://www.python.org
#----------------------------------------------------------------------------------
#module help samtools/samtools-0.1.19_gnu446
#----------- Module Specific Help for 'samtools/samtools-0.1.19_gnu446' ---------------------------
#sets the environment for samtools (0.1.19) built using GNU Compilers
#SAM (Sequence Alignment/Map) format is a generic format for storing
#large nucleotide sequence alignments. SAM Tools provide various utilities
#for manipulating alignments in the SAM format, including sorting, merging,
#indexing and generating alignments in a per-position format.
#http://samtools.sourceforge.net/
#----------------------------------------------------------------------------------
#module help htseq/0.6.1p1
#----------- Module Specific Help for 'htseq/0.6.1p1' --------------
#HTSeq 0.6.1p1: Analysing high-throughput sequencing data with Python.
#Python package that provides infrastructure to process data from high-throughput sequencing assays.
#For more information on htseq, please see http://www-huber.embl.de/users/anders/HTSeq
#----------------------------------------------------------------------------------
# module help bedtools/2.24.0
#----------- Module Specific Help for 'bedtools/2.24.0' ------------
#bedtools 2.24.0 A fast, flexible toolset for genome arithmetic.
#Collectively, the bedtools utilities are a swiss-army knife of tools for a wide-range of genomics analysis tasks. The most widely-used tools enable genome arithmetic: that is, set theory on the genome. For example, bedtools allows one to intersect, merge, count, complement, and shuffle genomic intervals from multiple files in widely-used genomic file formats such as BAM, BED, GFF/GTF, VCF. While each individual tool is designed to do a relatively simple task (e.g., intersect two interval files), quite sophisticated analyses can be conducted by combining multiple bedtools operations on the UNIX command line.
#For more information on bedtools, please see http://bedtools.readthedocs.org/en/latest/index.html
#----------------------------------------------------------------------------------
# copy user input data files to scratch
cp ${Dataset_DIR}/${Sample_ID}/fastq/tophat2/${Sample_ID}'_primary_unique.bam' .
#Copy gtf file (annotation file):
cp ${GTF_Files_DIR}/${ANNOTATION_FILE} .
#------------------------------------------
if [ "${ANNOTATION_FILE}" == "Exon_Only_Regions.gtf" ];
then
#Copy Intron_Only gtf file (annotation file):
cp ${GTF_Files_DIR}/Intron_Only_Regions.gtf .
fi
#------------------------------------------
#Initialize INPUT_BAM:
INPUT_BAM=${Sample_ID}'_primary_unique.bam'
#----------------------------------------------------------------------------------
#Make output dir:
STORAGE_DIR=${Dataset_DIR}/${Sample_ID}/fastq/tophat2/HTSeq
#Need if statements to name the OUTPUT_DIR depending on the ANNOTATION_FILE used:
#------------------------------------------
if [ "${ANNOTATION_FILE}" == "genes.gtf" ];
then
OUTPUT_DIR=$STORAGE_DIR/Illumina_GTF
fi
#------------------------------------------
if [ "${ANNOTATION_FILE}" == "RefSeq_GeneBody.gtf" ];
then
OUTPUT_DIR=$STORAGE_DIR/RefSeq_Exon_GTF
fi
#------------------------------------------
if [ "${ANNOTATION_FILE}" == "Intron_Only_Regions.gtf" ];
then
OUTPUT_DIR=$STORAGE_DIR/RefSeq_Intron_GTF
fi
#------------------------------------------
if [ "${ANNOTATION_FILE}" == "Exon_Only_Regions.gtf" ];
then
OUTPUT_DIR=$STORAGE_DIR/RefSeq_Exon_Only_GTF
fi
#------------------------------------------
##############################
if [[ ! -d $OUTPUT_DIR ]]; 
then
mkdir -p $OUTPUT_DIR
else
rm $OUTPUT_DIR/*
fi
##############################
OUTPUT_FILE=${Sample_ID}'_HTSeq.out'
######################
if [ -f $OUTPUT_FILE ]
then 
rm $OUTPUT_FILE
else
touch $OUTPUT_FILE
fi
######################
echo
echo 'List files in scratch directory:'
echo
ls -alh

echo
echo 'Starting to run my commands'
echo

# run my commands.
echo
echo 'Starting samtools and htseq-count'
echo
##################################################################################
#---------------------------------------------------------------------------------
#For the "Exonic_Only" counts I need to process the BAM file to filter out reads that overlap Intron_Only_Regions 
#Need an if statement to check ANNOTATION_FILE used:
if [ ${ANNOTATION_FILE} == "Exon_Only_Regions.gtf" ];
then
echo
echo 'Starting Intron_Only_Regions read overlap filter'
echo
#-v 	Only report those entries in A that have no overlap in B. Restricted by -f and -r.
#-abam 	BAM file A. Each BAM alignment in A is compared to B in search of overlaps. Use “stdin” if passing A with a UNIX pipe: For example: samtools view -b <BAM> | bedtools intersect -abam stdin -b genes.bed. Note: no longer necessary after version 2.19.0
#-b 	One or more BAM/BED/GFF/VCF file(s) “B”. Use “stdin” if passing B with a UNIX pipe.
intersectBed -v -abam ${INPUT_BAM} -b Intron_Only_Regions.gtf > ${Sample_ID}'_primary_no_Intron_Only.bam'
#Redefine the ${INPUT_BAM}:
INPUT_BAM=${Sample_ID}'_primary_no_Intron_Only.bam'
echo
echo 'Ending Intron_Only_Regions read overlap filter'
echo
fi
#End of if statement
#---------------------------------------------------------------------------------
##################################################################################
##################################################################################
#---------------------------------------------------------------------------------
#For the "Intronic_Only" counts I need to process the BAM file to filter out splice junction reads
#Need an if statement to check ANNOTATION_FILE used:
if [ ${ANNOTATION_FILE} == "Intron_Only_Regions.gtf" ];
then
echo
echo 'Starting splice read filter'
echo
#Need a regular expression to specifically find these spliced reads:
#Bunch of numbers then "M", bunch of numbers then "N", bunch of numbers then "M":
# print only lines which do NOT match regex (emulates "grep -v")
# NOT match regex:  awk '$7 !~ /^[a-f]/'
#---------------------------------------------------------------------------------
#Need the header (-h)
#Need to use a regular expression match to extract header lines:
#https://www.biostars.org/p/9247/
#Regular expression:
#/^@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/
#"At" symbol then two letters, a tab, then letters, then letter and/or number, then a colon, some characters after then the end of the string
samtools view -h ${INPUT_BAM} | awk '($0 ~ /^@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/)' > Header.txt
#Need everything after the header information (do NOT match regex):
samtools view -h ${INPUT_BAM} | awk '($0 !~ /^@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/)' > temp1.sam
#Run awk command to filter out spliced reads:
awk '($6 !~ /[0-9]+[M][0-9]+[N][0-9]+[M]/)' temp1.sam > temp2.sam
#Add header:
cat Header.txt temp2.sam > temp3.sam
#Convert SAM to BAM:
samtools view -bS temp3.sam > ${Sample_ID}'_primary_no_splice.bam'
#Redefine the ${INPUT_BAM}:
INPUT_BAM=${Sample_ID}'_primary_no_splice.bam'
echo
echo 'Ending splice read filter'
echo
fi
#End of if statement
#---------------------------------------------------------------------------------
##################################################################################
#Sort the input BAM file:
#Usage:   samtools sort [options] <in.bam> <out.prefix>
#Options: -n        sort by read name
samtools sort -n ${INPUT_BAM} ${Sample_ID}'_sorted'
#Since we sorted by read name we need the following htseq-count option (required for Paired-end sequencing data):
#--order=name
#---------------------------------------------------------------------------------
#Usage: htseq-count [options] alignment_file gff_file

#This script takes an alignment file in SAM/BAM format and a feature file in
#GFF format and calculates for each feature the number of reads mapping to it.
#See http://www-huber.embl.de/users/anders/HTSeq/doc/count.html for details.

#Options:
#  -h, --help            show this help message and exit
#  -f SAMTYPE, --format=SAMTYPE
#                        type of <alignment_file> data, either 'sam' or 'bam'
#                        (default: sam)
#  -r ORDER, --order=ORDER
#                        'pos' or 'name'. Sorting order of <alignment_file>
#                        (default: name). Paired-end sequencing data must be
#                        sorted either by position or by read name, and the
#                        sorting order must be specified. Ignored for single-
#                        end data.
#  -s STRANDED, --stranded=STRANDED
#                        whether the data is from a strand-specific assay.
#                        Specify 'yes', 'no', or 'reverse' (default: yes).
#                        'reverse' means 'yes' with reversed strand
#                        interpretation
#  -a MINAQUAL, --minaqual=MINAQUAL
#                        skip all reads with alignment quality lower than the
#                        given minimum value (default: 10)
#  -t FEATURETYPE, --type=FEATURETYPE
#                        feature type (3rd column in GFF file) to be used, all
#                        features of other type are ignored (default, suitable
#                        for Ensembl GTF files: exon)
#  -i IDATTR, --idattr=IDATTR
#                        GFF attribute to be used as feature ID (default,
#                        suitable for Ensembl GTF files: gene_id)
#  -m MODE, --mode=MODE  mode to handle reads overlapping more than one feature
#                        (choices: union, intersection-strict, intersection-
#                        nonempty; default: union)
#  -o SAMOUT, --samout=SAMOUT
#                        write out all SAM alignment records into an output SAM
#                        file called SAMOUT, annotating each line with its
#                        feature assignment (as an optional field with tag
#                        'XF')
#  -q, --quiet           suppress progress report

#Written by Simon Anders (sanders@fs.tum.de), European Molecular Biology
#Laboratory (EMBL). (c) 2010. Released under the terms of the GNU General
#Public License v3. Part of the 'HTSeq' framework, version 0.6.1p1.
#---------------------------------------------------------------------------------
#Print the command
echo "samtools view ${Sample_ID}'_sorted'.bam | htseq-count --stranded=${STRANDEDNESS_HTSeq} --type=${FEATURE_TYPE} --idattr=${FEATURE_ID} --order=name --mode=${MODE} - ${ANNOTATION_FILE}  > ${OUTPUT_FILE}"
#Run the command
samtools view ${Sample_ID}'_sorted'.bam | htseq-count --stranded=${STRANDEDNESS_HTSeq} --type=${FEATURE_TYPE} --idattr=${FEATURE_ID} --order=name --mode=${MODE} - ${ANNOTATION_FILE}  > ${OUTPUT_FILE}
##################################################################################
echo
echo 'Ending samtools and htseq-count'
echo

#---------------------------------------------------------------------------------
#For the "Intronic_Only" counts I need to process the output count file so that upload to SEGEX will work with the RefSeq platform
#Need an if statement to check ANNOTATION_FILE used:
if [ ${ANNOTATION_FILE} == "Intron_Only_Regions.gtf" ];
then
#Copy RefSeq_Genes_Zero_Count.txt:
cp ${SCRIPT_DIR}/RefSeq_Files/RefSeq_Genes_Zero_Count.txt . 
#Omit header from RefSeq_Genes_Zero_Count.txt
tail -n +2 RefSeq_Genes_Zero_Count.txt > RefSeq_Genes_Zero_Count.temp
#The HTSeq output file has 5 extra lines at the end (extract then add back)
tail -n 5 ${OUTPUT_FILE} > HTSeq_Extra_Lines.txt
#Print everything except last n lines (use negative number):
head -n -5 ${OUTPUT_FILE} > ${OUTPUT_FILE}.temp1
#Concatenate files:
cat ${OUTPUT_FILE}.temp1 RefSeq_Genes_Zero_Count.temp > ${OUTPUT_FILE}.temp2
#Sort file by gene symbol
sort -k1,1 ${OUTPUT_FILE}.temp2 > ${OUTPUT_FILE}.temp3
#Concatenate HTSeq_Extra_Lines.txt back to count file:
cat ${OUTPUT_FILE}.temp3 HTSeq_Extra_Lines.txt > ${OUTPUT_FILE}.temp4
#Rename the count file:
mv ${OUTPUT_FILE}.temp4 ${OUTPUT_FILE}
#Remove temp files:
rm *.temp*
fi
#---------------------------------------------------------------------------------

# copy the output files to users storage dir
cp ${OUTPUT_FILE} ${OUTPUT_DIR}

#-------------------------------------------
#Note that the output file will be summarized by gene symbol and the last 5 lines will have extra counts
#Example output file:
#wc -l G110_M1_HTSeq.out 
#24202 G110_M1_HTSeq.out
#I was expecting there would be 24197 unique gene symbols
#head G110_M1_HTSeq.out 
#0610005C13Rik	2655
#0610007P14Rik	310
#0610009B22Rik	151
#0610009L18Rik	7
#0610009O20Rik	602
#0610010B08Rik	9
#0610010F05Rik	348
#0610010K14Rik	194
#0610011F06Rik	664
#0610012G03Rik	163
#tail G110_M1_HTSeq.out 
#Zyx	793
#Zzef1	1019
#Zzz3	719
#a	0
#l7Rn6	106
#no_feature	4226540
#ambiguous	59841
#too_low_aQual	0
#not_aligned	0
#alignment_not_unique	1677774
#Excluding these 5 lines, I get the expected 24197 
#-------------------------------------------

echo
echo "List files in scratch"
echo
ls -alh

echo "=========================================================="
echo "Finished on : $(date)"
#Use to calculate job time:
#End_Time in seconds
End_Time=$(date +"%s")
diff=$(($End_Time-$Start_Time))
echo "$(($diff / 3600)) hours, $((($diff / 60) % 60)) minutes and $(($diff % 60)) seconds elapsed."
echo "=========================================================="