#!/bin/bash -l
##################################################################################
#Andy Rampersaud, 02.22.16
#This script is called by setup_Extract_Counts.sh
##################################################################################
# Specify which shell to use
#$ -S /bin/bash
# Run on the current working directory
#$ -cwd

# Join standard output and error to a single file  
#$ -j n
# change to y if you want a single qlog file 
##################################################################################
#Initialize variables from Extract_Counts.sh
##################################################################################
#checking the command line arg
#-ne : "is not equal to"
if [ $# -ne 9 ] ; then
      echo "Need 9 arguments for the qsub command:"
      echo "qsub -N ${Job_Name}'_'${Sample_ID} -P waxmanlab -l h_rt=${TIME_LIMIT} Extract_Counts.qsub ${Sample_ID} ${Dataset_DIR} ${Sample_Labels_DIR} ${GTF_Files_DIR} ${STRANDEDNESS_featureCount} ${FEATURE_ID} ${SCRIPT_DIR} ${ANNOTATION_FILE} ${FEATURE_TYPE}"
      exit 0
fi

#http://www.ibm.com/developerworks/library/l-bash-parameters/
#Note: If you have more than 9 parameters, you cannot use $10 to refer to the tenth one. You must first either process or save the first parameter ($1), then use the shift command to drop parameter 1 and move all remaining parameters down 1, so that $10 becomes $9 and so on.

#http://unix.stackexchange.com/questions/104420/how-to-use-command-line-arguments-in-a-shell-script
#If you need access more than 9 command line arguments, you can use the shift command. Example: shift 2 renames $3 to $1, $4 to $2 etc.

#process the command line arguments
Sample_ID=$1
Dataset_DIR=$2
Sample_Labels_DIR=$3
GTF_Files_DIR=$4
STRANDEDNESS_featureCount=$5
FEATURE_ID=$6
SCRIPT_DIR=$7
ANNOTATION_FILE=$8
#FEATURE_TYPE=$10

#echo "-----------------------"
#echo "Need shift 1 command (have more than 9 arguments):"
##The "shift" command allows users to move through an argument list (if there are more than 9 arguments)
##It removes the arguments that occur at the beginning of the list and shifts arguments at the end of the list so that they are now within the list of 9 arguments
##Example: shift 1
##This will remove arg #1 then shift args #2 - #10 so that they are numbered $1 - $9
##Now the user can save the arg #10 as $9
##Shift_Number = (Total number of args) - (9)
##For this script:
##Shift_Number = (Total number of args) - (9) = (10) - (9) = 1
##Hence: Need shift 4 command
##Now I can save arg:
##10 as $9
#echo "-----------------------"

#shift 1

#process the command line arguments
FEATURE_TYPE=$9

#Print variables (make sure they appear correctly):
echo "-----------------------"
echo "Start of variable list:"
echo "-----------------------"
echo "Sample_ID:"
echo ${Sample_ID}
echo "Dataset_DIR:"
echo ${Dataset_DIR}
echo "Sample_Labels_DIR:"
echo ${Sample_Labels_DIR}
echo "GTF_Files_DIR:"
echo ${GTF_Files_DIR}
echo "STRANDEDNESS_featureCount:"
echo ${STRANDEDNESS_featureCount}
echo "FEATURE_ID:"
echo ${FEATURE_ID}
echo "SCRIPT_DIR:"
echo ${SCRIPT_DIR}
echo "ANNOTATION_FILE:"
echo ${ANNOTATION_FILE}
echo "FEATURE_TYPE:"
echo ${FEATURE_TYPE}
echo "-----------------------"
echo "End of variable list"
echo "-----------------------"

#You can use option "-pe omp N", where N is the number of cores ( number of course on our system can be any integer between 1 and 16):
#$ -pe omp 16

# Now let's keep track of some information just in case anything goes wrong
echo "=========================================================="
#Use to calculate job time:
#Start_Time in seconds
Start_Time=$(date +"%s")
echo "Starting on : $(date)"
echo "Running on node : $(hostname)"
echo "Current directory : $(pwd)"
echo "Current job ID : $JOB_ID"
echo "Current job name : $JOB_NAME"
echo "Task index number : $SGE_TASK_ID"
echo "Parameter for multiple cores : $NSLOTS"
echo "=========================================================="

# Go to local scratch directory
echo
echo 'Change dir to scratch directory'
echo
cd ${TMPDIR}
echo
echo 'Print scratch directory location:'
echo
echo $TMPDIR
#--------------------------------------
echo
echo 'Loading required modules...'
echo
#Make sure the shebang line = #!/bin/bash -l
#Need the -l option to load modules
#Search for latest program installed:
#module avail -t 2>&1 | grep -i samtools
module load samtools/samtools-0.1.19_gnu446
#module avail -t 2>&1 | grep -i subread
module load subread/1.4.6_p5
#module avail -t 2>&1 | grep -i bedtools
module load bedtools/2.24.0
#----------------------------------------------------------------------------------
#module help samtools/samtools-0.1.19_gnu446
#----------- Module Specific Help for 'samtools/samtools-0.1.19_gnu446' ---------------------------
#sets the environment for samtools (0.1.19) built using GNU Compilers
#SAM (Sequence Alignment/Map) format is a generic format for storing
#large nucleotide sequence alignments. SAM Tools provide various utilities
#for manipulating alignments in the SAM format, including sorting, merging,
#indexing and generating alignments in a per-position format.
#http://samtools.sourceforge.net/
#----------------------------------------------------------------------------------
#module help subread/1.4.6_p5
#----------- Module Specific Help for 'subread/1.4.6_p5' -----------
#subread 1.4.6_p5 High-performance read alignment, quantification, and mutation discovery
#The Subread package comprises a suite of software programs for processing next-gen sequencing read data including:
# - Subread: an accurate and efficient aligner for mapping both genomic DNA-seq reads and RNA-seq reads (for the purpose of expression analysis).
# - Subjunc: an RNA-seq aligner suitable for all purposes of RNA-seq analyses.
# - featureCounts: a highly efficient and accurate read summarization program.
# - exactSNP: a SNP caller that discovers SNPs by testing signals against local background noises.
#These programs were also implemented in Bioconductor R package Rsubread.
#For more information on subread, please see http://subread.sourceforge.net/
#----------------------------------------------------------------------------------
# module help bedtools/2.24.0
#----------- Module Specific Help for 'bedtools/2.24.0' ------------
#bedtools 2.24.0 A fast, flexible toolset for genome arithmetic.
#Collectively, the bedtools utilities are a swiss-army knife of tools for a wide-range of genomics analysis tasks. The most widely-used tools enable genome arithmetic: that is, set theory on the genome. For example, bedtools allows one to intersect, merge, count, complement, and shuffle genomic intervals from multiple files in widely-used genomic file formats such as BAM, BED, GFF/GTF, VCF. While each individual tool is designed to do a relatively simple task (e.g., intersect two interval files), quite sophisticated analyses can be conducted by combining multiple bedtools operations on the UNIX command line.
#For more information on bedtools, please see http://bedtools.readthedocs.org/en/latest/index.html
#----------------------------------------------------------------------------------
# copy user input data files to scratch
cp ${Dataset_DIR}/${Sample_ID}/fastq/tophat2/${Sample_ID}'_primary_unique.bam' .

#Copy gtf file (annotation file):
cp ${GTF_Files_DIR}/${ANNOTATION_FILE} .
#------------------------------------------
if [ "${ANNOTATION_FILE}" == "Exon_Only_Regions.gtf" ];
then
#Copy Intron_Only gtf file (annotation file):
cp ${GTF_Files_DIR}/Intron_Only_Regions.gtf .
elif [ "${ANNOTATION_FILE}" == "exonic_only_gene_models_ncRNA_for_counting.gtf" ];
then
#Copy intronic_only_gene_models_ncRNA_for_counting gtf file (annotation file):
#rename the intronic_only gtf file as Intron_Only_Regions.gtf, such that I can reuse Andy's if statements
cp ${GTF_Files_DIR}/intronic_only_gene_models_ncRNA_for_counting.gtf Intron_Only_Regions.gtf
fi
#------------------------------------------
#Initialize INPUT_BAM:
INPUT_BAM=${Sample_ID}'_primary_unique.bam'
#----------------------------------------------------------------------------------
#Make output dir:
STORAGE_DIR=${Dataset_DIR}/${Sample_ID}/fastq/tophat2/featureCounts
#Need if statements to name the OUTPUT_DIR depending on the ANNOTATION_FILE used:
#------------------------------------------
if [ "${ANNOTATION_FILE}" == "genes.gtf" ];
then
OUTPUT_DIR=$STORAGE_DIR/Illumina_GTF
fi
#------------------------------------------
if [ "${ANNOTATION_FILE}" == "RefSeq_GeneBody.gtf" ];
then
OUTPUT_DIR=$STORAGE_DIR/RefSeq_Exon_GTF
fi
#------------------------------------------
if [ "${ANNOTATION_FILE}" == "Intron_Only_Regions.gtf" ];
then
OUTPUT_DIR=$STORAGE_DIR/RefSeq_Intron_GTF
fi
#------------------------------------------
if [ "${ANNOTATION_FILE}" == "Exon_Only_Regions.gtf" ];
then
OUTPUT_DIR=$STORAGE_DIR/RefSeq_Exon_Only_GTF
fi
#Start of lncRNA gtf
#------------------------------------------
if [ "${ANNOTATION_FILE}" == "ncRNA_exon_for_counting.gtf" ];
then
OUTPUT_DIR=$STORAGE_DIR/LncRNA_Exon_Collapsed_GTF
fi
#---------------------------------------------------------------------------------
if [ "${ANNOTATION_FILE}" == "ncRNA_genebodies_for_counting.gtf" ];
then
OUTPUT_DIR=$STORAGE_DIR/LncRNA_GeneBody_GTF
fi
#---------------------------------------------------------------------------------
if [ "${ANNOTATION_FILE}" == "intronic_only_gene_models_ncRNA_for_counting.gtf" ];
then
OUTPUT_DIR=$STORAGE_DIR/LncRNA_Intronic_Only_GTF
fi
#---------------------------------------------------------------------------------
if [ "${ANNOTATION_FILE}" == "exonic_only_gene_models_ncRNA_for_counting.gtf" ];
then
OUTPUT_DIR=$STORAGE_DIR/LncRNA_Exonic_Only_GTF
fi

#------------------------------------------
##############################
if [[ ! -d $OUTPUT_DIR ]]; 
then
mkdir -p $OUTPUT_DIR
else
rm $OUTPUT_DIR/*
fi
##############################
OUTPUT_FILE=${Sample_ID}'_featureCounts.out'
######################
if [ -f $OUTPUT_FILE ]
then 
rm $OUTPUT_FILE
else
touch $OUTPUT_FILE
fi
######################
echo
echo 'List files in scratch directory:'
echo
ls -alh

echo
echo 'Starting to run my commands'
echo

# run my commands.
echo
echo 'Starting featureCounts'
echo
##################################################################################
#---------------------------------------------------------------------------------
#For the "Exonic_Only" counts I need to process the BAM file to filter out reads that overlap Intron_Only_Regions 
#Need an if statement to check ANNOTATION_FILE used:
if [ ${ANNOTATION_FILE} == "Exon_Only_Regions.gtf" -o "${ANNOTATION_FILE}" == "exonic_only_gene_models_ncRNA_for_counting.gtf" ];
then
echo
echo 'Starting Intron_Only_Regions read overlap filter'
echo
#-v 	Only report those entries in A that have no overlap in B. Restricted by -f and -r.
#-abam 	BAM file A. Each BAM alignment in A is compared to B in search of overlaps. Use “stdin” if passing A with a UNIX pipe: For example: samtools view -b <BAM> | bedtools intersect -abam stdin -b genes.bed. Note: no longer necessary after version 2.19.0
#-b 	One or more BAM/BED/GFF/VCF file(s) “B”. Use “stdin” if passing B with a UNIX pipe.
intersectBed -v -abam ${INPUT_BAM} -b Intron_Only_Regions.gtf > ${Sample_ID}'_primary_no_Intron_Only.bam'
#Redefine the ${INPUT_BAM}:
INPUT_BAM=${Sample_ID}'_primary_no_Intron_Only.bam'
echo
echo 'Ending Intron_Only_Regions read overlap filter'
echo
fi
#End of if statement
#---------------------------------------------------------------------------------
##################################################################################
#---------------------------------------------------------------------------------
#For the "Intronic_Only" counts I need to process the BAM file to filter out splice junction reads
#Need an if statement to check ANNOTATION_FILE used:
if [ ${ANNOTATION_FILE} == "Intron_Only_Regions.gtf" -o "${ANNOTATION_FILE}" == "intronic_only_gene_models_ncRNA_for_counting.gtf"  ];
then
echo
echo 'Starting splice read filter'
echo
#Need a regular expression to specifically find these spliced reads:
#Bunch of numbers then "M", bunch of numbers then "N", bunch of numbers then "M":
# print only lines which do NOT match regex (emulates "grep -v")
# NOT match regex:  awk '$7 !~ /^[a-f]/'
#---------------------------------------------------------------------------------
#Need the header (-h)
#Need to use a regular expression match to extract header lines:
#https://www.biostars.org/p/9247/
#Regular expression:
#/^@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/
#"At" symbol then two letters, a tab, then letters, then letter and/or number, then a colon, some characters after then the end of the string
samtools view -h ${INPUT_BAM} | awk '($0 ~ /^@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/)' > Header.txt
#Need everything after the header information (do NOT match regex):
samtools view -h ${INPUT_BAM} | awk '($0 !~ /^@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/)' > temp1.sam
#Run awk command to filter out spliced reads:
awk '($6 !~ /[0-9]+[M][0-9]+[N][0-9]+[M]/)' temp1.sam > temp2.sam
#Add header:
cat Header.txt temp2.sam > temp3.sam
#Convert SAM to BAM:
samtools view -bS temp3.sam > ${Sample_ID}'_primary_no_splice.bam'
#Redefine the ${INPUT_BAM}:
INPUT_BAM=${Sample_ID}'_primary_no_splice.bam'
#Remove temp files:
rm temp*.sam
rm Header.txt 
echo
echo 'Ending splice read filter'
echo
fi
#End of if statement
#---------------------------------------------------------------------------------
##################################################################################
#Sort the input BAM file:
#Usage:   samtools sort [options] <in.bam> <out.prefix>
#Options: -n        sort by read name
samtools sort -n ${INPUT_BAM} ${Sample_ID}'_sorted'
#---------------------------------------------------------------------------------
#featureCounts
#Version 1.4.6-p5
#Usage: featureCounts [options] -a <annotation_file> -o <output_file> input_file1 [input_file2] ... 
#    Required parameters:
#    -a <input>	Give the name of the annotation file. The program assumes
#              	that the provided annotation file is in GTF format. Use -F
#              	option to specify other annotation formats.
#    
#    -o <input>	Give the name of the output file. The output file contains
#              	the number of reads assigned to each meta-feature (or each
#              	feature if -f is specified). A meta-feature is the aggregation
#              	of features, grouped by using gene identifiers. Please refer
#              	to the users guide for more details.
#    
#   input_files	Give the names of input read files that include the read
#              	mapping results. Format of input files is automatically
#              	determined (SAM or BAM). Paired-end reads will be
#              	automatically re-ordered if it is found that reads from the
#              	same pair are not adjacent to each other. Multiple files can
#              	be provided at the same time.
#    
#    Optional parameters:
#    
#    -A <input>	Specify the name of a file including aliases of chromosome
#              	names. The file should be a comma delimited text file that
#              	includes two columns. The first column gives the chromosome
#              	names used in the annotation and the second column gives the
#              	chromosome names used by reads. This file should not contain
#              	header lines. Names included in this file are case sensitive.
#    
#    -F <input>	Specify the format of the annotation file. Acceptable formats
#              	include `GTF' and `SAF'. `GTF' by default. Please refer to the
#              	users guide for SAF annotation format.
#    
#    -t <input>	Specify the feature type. Only rows which have the matched
#              	matched feature type in the provided GTF annotation file
#              	will be included for read counting. `exon' by default.
#    
#    -g <input>	Specify the attribute type used to group features (eg. exons)
#              	into meta-features (eg. genes), when GTF annotation is provided.
#              	`gene_id' by default. This attribute type is usually the gene
#              	identifier. This argument is useful for the meta-feature level
#              	summarization.
#    
#    -f        	If specified, read summarization will be performed at the 
#              	feature level (eg. exon level). Otherwise, it is performed at
#              	meta-feature level (eg. gene level).
#    
#    -O        	If specified, reads (or fragments if -p is specified) will
#              	be allowed to be assigned to more than one matched meta-
#              	feature (or feature if -f is specified). 
#    
#    -s <int>  	Indicate if strand-specific read counting should be performed.
#              	It has three possible values:  0 (unstranded), 1 (stranded) and
#              	2 (reversely stranded). 0 by default.
#    
#    -M        	If specified, multi-mapping reads/fragments will be counted (ie.
#              	a multi-mapping read will be counted up to N times if it has N
#              	reported mapping locations). The program uses the `NH' tag to
#              	find multi-mapping reads.
#    
#    -Q <int>  	The minimum mapping quality score a read must satisfy in order
#              	to be counted. For paired-end reads, at least one end should
#              	satisfy this criteria. 0 by default.
#    
#    -T <int>  	Number of the threads. 1 by default.
#    
#    -v        	Output version of the program.
#    
#    -R        	Output read counting result for each read/fragment. For each
#              	input read file, read counting results for reads/fragments will
#              	be saved to a tab-delimited file that contains four columns
#              	including read name, status(assigned or the reason if not
#              	assigned), name of target feature/meta-feature and number of
#              	hits if the read/fragment is counted multiple times. Name of
#              	the file is the same as name of the input read file except a
#              	suffix `.featureCounts' is added.
#    
#    --largestOverlap            If specified, reads (or fragments) will be
#              	assigned to the target that has the largest number of overlapping
#              	bases.
#    
#    --minOverlap <int>          Specify the minimum required number of
#              	overlapping bases between a read (or a fragment) and a feature.
#              	1 by default. If a negative value is provided, the read will be
#              	extended from both ends.
#    
#    --readExtension5 <int>      Reads are extended upstream by <int> bases from
#              	their 5' end.
#    
#    --readExtension3 <int>      Reads are extended upstream by <int> bases from
#              	their 3' end.
#    
#    --read2pos <5:3>            The read is reduced to its 5' most base or 3'
#              	most base. Read summarization is then performed based on the
#              	single base which the read is reduced to.
#    
#    --fraction	If specified, a fractional count 1/n will be generated for each
#              	multi-mapping read, where n is the number of alignments (indica-
#              	ted by 'NH' tag) reported for the read. This option must be used
#              	together with the '-M' option.
#    
#    --primary 	If specified, only primary alignments will be counted. Primary
#              	and secondary alignments are identified using bit 0x100 in the
#              	Flag field of SAM/BAM files. All primary alignments in a dataset
#              	will be counted no matter they are from multi-mapping reads or
#              	not ('-M' is ignored). 
#    
#    --ignoreDup                 If specified, reads that were marked as
#              	duplicates will be ignored. Bit Ox400 in FLAG field of SAM/BAM
#              	file is used for identifying duplicate reads. In paired end
#              	data, the entire read pair will be ignored if at least one end
#              	is found to be a duplicate read.
#    
#    --countSplitAlignmentsOnly  If specified, only split alignments (CIGAR
#              	strings containing letter `N') will be counted. All the other
#              	alignments will be ignored. An example of split alignments is
#              	the exon-spanning reads in RNA-seq data.
#    
#    Optional paired-end parameters:
#    
#    -p        	If specified, fragments (or templates) will be counted instead
#              	of reads. This option is only applicable for paired-end reads.
#              	The two reads from the same fragment must be adjacent to each
#              	other in the provided SAM/BAM file.
#    
#    -P        	If specified, paired-end distance will be checked when assigning
#              	fragments to meta-features or features. This option is only
#              	applicable when -p is specified. The distance thresholds should
#              	be specified using -d and -D options.
#    
#    -d <int>  	Minimum fragment/template length, 50 by default.
#    
#    -D <int>  	Maximum fragment/template length, 600 by default.
#    
#    -B        	If specified, only fragments that have both ends 
#              	successfully aligned will be considered for summarization.
#              	This option is only applicable for paired-end reads.
#    
#    -S <ff:fr:rf> Orientation of the two read from the same pair, 'fr' by
#              	by default.
#    
#    -C        	If specified, the chimeric fragments (those fragments that 
#              	have their two ends aligned to different chromosomes) will
#              	NOT be included for summarization. This option is only 
#              	applicable for paired-end read data.
#    
#    --donotsort   If specified, paired end reads will not be reordered even if
#              	reads from the same pair were found not to be next to each other
#              	in the input. 
#---------------------------------------------------------------------------------
#For paired-end data we want the (-p) option
#The (-B) option seems logical but the only effect is a slightly lowered read count
#Leave out the (-B) option (get overall counts closer to HTSeq counts)
#Need an if statement to check ANNOTATION_FILE used:
if [ ${ANNOTATION_FILE} == "genes.gtf" -o \
     ${ANNOTATION_FILE} == "ncRNA_exon_for_counting.gtf" -o \
     ${ANNOTATION_FILE} == "ncRNA_genebodies_for_counting.gtf" -o \
     ${ANNOTATION_FILE} == "intronic_only_gene_models_ncRNA_for_counting.gtf" -o \
     ${ANNOTATION_FILE} == "exonic_only_gene_models_ncRNA_for_counting.gtf" ];
then
#Print the command:
echo "featureCounts -p  -T $NSLOTS -s ${STRANDEDNESS_featureCount} -g ${FEATURE_ID} -t ${FEATURE_TYPE} -a ${ANNOTATION_FILE} -o ${OUTPUT_FILE} ${Sample_ID}'_sorted'.bam"
#Run the command:
featureCounts -p -T $NSLOTS -s ${STRANDEDNESS_featureCount} -g ${FEATURE_ID} -t ${FEATURE_TYPE} -a ${ANNOTATION_FILE} -o ${OUTPUT_FILE} ${Sample_ID}'_sorted'.bam
fi
#---------------------------------------------------------------------------------
#Dealing with overlapping gene symbols, need to run the Parse_GTF.R
#Parse_GTF.R will create 2 GTF files from the ${ANNOTATION_FILE}
#1st GTF file for modified counting 
#2nd GTF file for normal counting
#---------------------------------------------------------------------------------
#Need an if statement to check ANNOTATION_FILE used:
if [ ${ANNOTATION_FILE} != "genes.gtf" -a \
     ${ANNOTATION_FILE} != "ncRNA_exon_for_counting.gtf" -a \
     ${ANNOTATION_FILE} != "ncRNA_genebodies_for_counting.gtf" -a \
     ${ANNOTATION_FILE} != "intronic_only_gene_models_ncRNA_for_counting.gtf" -a \
     ${ANNOTATION_FILE} != "exonic_only_gene_models_ncRNA_for_counting.gtf" ];
then
#GTF file name determines the GeneSym_List.txt
#Extract extension
GTF_name=${ANNOTATION_FILE%\.gtf}
echo "GTF file name: "${GTF_name}
#Copy GeneSym_List.txt:
cp ${GTF_Files_DIR}/featureCount_Files/${GTF_name}/GeneSym_DoubleCounts/GeneSym_List.txt . 
#Copy Parse_GTF.R:
cp ${SCRIPT_DIR}/Scripts/Parse_GTF.R . 
#Print the command:
echo "Rscript Parse_GTF.R ${ANNOTATION_FILE} GeneSym_List.txt"
#Run the command:
Rscript Parse_GTF.R ${ANNOTATION_FILE} GeneSym_List.txt
echo "Running assign_all_features counting"
#    -O        	If specified, reads (or fragments if -p is specified) will
#              	be allowed to be assigned to more than one matched meta-
#              	feature (or feature if -f is specified). 
#Print the command:
echo "featureCounts -O -p -T $NSLOTS -s ${STRANDEDNESS_featureCount} -g ${FEATURE_ID} -t ${FEATURE_TYPE} -a GeneSym_assign_all_features.gtf -o ${Sample_ID}"_assign_all_features.out" ${Sample_ID}'_sorted'.bam"
#Run the command:
featureCounts -O -p -T $NSLOTS -s ${STRANDEDNESS_featureCount} -g ${FEATURE_ID} -t ${FEATURE_TYPE} -a GeneSym_assign_all_features.gtf -o ${Sample_ID}"_assign_all_features.out" ${Sample_ID}'_sorted'.bam
#---------------------------------------------------------------------------------
echo "Running assign_only1_feature counting"
#Print the command:
echo "featureCounts -p -T $NSLOTS -s ${STRANDEDNESS_featureCount} -g ${FEATURE_ID} -t ${FEATURE_TYPE} -a GeneSym_assign_only1_feature.gtf -o ${Sample_ID}"_assign_only1_feature.out" ${Sample_ID}'_sorted'.bam"
#Run the command:
featureCounts -p -T $NSLOTS -s ${STRANDEDNESS_featureCount} -g ${FEATURE_ID} -t ${FEATURE_TYPE} -a GeneSym_assign_only1_feature.gtf -o ${Sample_ID}"_assign_only1_feature.out" ${Sample_ID}'_sorted'.bam
#---------------------------------------------------------------------------------
#The featureCounts output file needs to be reformatted for use by DiffExp scripts
file_list=*_assign_*.out
for file in ${file_list}
do
echo "Processing "${file}
#Get rid of first 2 header lines:
tail -n +3 ${file} > ${file}.temp1
#Just want the first and last column 
#NF is always the last column
awk -F '\t' '{print $1"\t"$NF}' ${file}.temp1 > ${file}.temp2
#Rename the count file:
mv ${file}.temp2 ${file}
#Remove temp files:
rm *.temp*
echo "Done processing file."
done
#---------------------------------------------------------------------------------
#Need to concatenate summary files
#Need to add headers to indicate counting method
echo "${Sample_ID}"_assign_all_features.out >> ${OUTPUT_FILE}.summary
cat ${Sample_ID}"_assign_all_features.out.summary" >> ${OUTPUT_FILE}.summary
echo "${Sample_ID}"_assign_only1_feature.out >> ${OUTPUT_FILE}.summary
cat ${Sample_ID}"_assign_only1_feature.out.summary" >> ${OUTPUT_FILE}.summary
#---------------------------------------------------------------------------------
#Concatenate files then sort by 1st column
cat *_assign_*.out > ${OUTPUT_FILE}.temp1
sort -k1,1 ${OUTPUT_FILE}.temp1 > ${OUTPUT_FILE}.temp2
mv ${OUTPUT_FILE}.temp2 ${OUTPUT_FILE}
#Remove temp files:
rm ${OUTPUT_FILE}.temp*
#---------------------------------------------------------------------------------
fi
#End if statement to check ANNOTATION_FILE
#---------------------------------------------------------------------------------
##################################################################################
echo
echo 'Ending featureCounts'
echo

#---------------------------------------------------------------------------------
#Example featureCounts output file:
## Program:featureCounts v1.4.6-p5; Command:"featureCounts" "-T" "16" "-s" "0" "-g" "gene_id" "-t" "exon" "-a" "RefSeq_GeneBody.gtf" "-o" "G83_M1_featureCounts.out" "G83_M1_sorted.bam" 
#Geneid	Chr	Start	End	Strand	Length	G83_M1_sorted.bam
#Xkr4	chr1;chr1;chr1	3204563;3411783;3660633	3207049;3411982;3661579	-;-;-	3634	0
#Rp1	chr1;chr1;chr1;chr1;chr1;chr1	4280927;4333588;4341991;4342283;4350281;4399251	4283093;4340172;4342162;4342918;4350395;4399322	-;-;-;-;-;-	9747	0
#Sox17	chr1;chr1;chr1;chr1;chr1	4481009;4483181;4483853;4485217;4486372	4482749;4483571;4483944;4486023;4487435	-;-;-;-;-	4095	4
#---------------------------------------------------------------------------------

#---------------------------------------------------------------------------------
#Need an if statement to check ANNOTATION_FILE used:
if [ ${ANNOTATION_FILE} == "genes.gtf" -o \
     ${ANNOTATION_FILE} == "ncRNA_exon_for_counting.gtf" -o \
     ${ANNOTATION_FILE} == "ncRNA_genebodies_for_counting.gtf" -o \
     ${ANNOTATION_FILE} == "intronic_only_gene_models_ncRNA_for_counting.gtf" -o \
     ${ANNOTATION_FILE} == "exonic_only_gene_models_ncRNA_for_counting.gtf" ];
then
#The featureCounts output file needs to be reformatted for use by DiffExp scripts
#Get rid of first 2 header lines:
tail -n +3 ${OUTPUT_FILE} > ${OUTPUT_FILE}.temp1
#Just want the first and last column 
#NF is always the last column
awk -F '\t' '{print $1"\t"$NF}' ${OUTPUT_FILE}.temp1 > ${OUTPUT_FILE}.temp2
#Rename the count file:
mv ${OUTPUT_FILE}.temp2 ${OUTPUT_FILE}
#Remove temp files:
rm *.temp*
fi
#---------------------------------------------------------------------------------

#---------------------------------------------------------------------------------
#For the "Intronic_Only" counts I need to process the output count file so that upload to SEGEX will work with the RefSeq platform
#Need an if statement to check ANNOTATION_FILE used:
if [ ${ANNOTATION_FILE} == "Intron_Only_Regions.gtf" ];
then
#Copy RefSeq_Genes_Zero_Count.txt:
cp ${SCRIPT_DIR}/RefSeq_Files/RefSeq_Genes_Zero_Count.txt . 
#Omit header from RefSeq_Genes_Zero_Count.txt
tail -n +2 RefSeq_Genes_Zero_Count.txt > RefSeq_Genes_Zero_Count.temp
#The featureCounts output file has already been reformatted above
#Concatenate files:
cat ${OUTPUT_FILE} RefSeq_Genes_Zero_Count.temp > ${OUTPUT_FILE}.temp1
#Sort file by gene symbol
sort -k1,1 ${OUTPUT_FILE}.temp1 > ${OUTPUT_FILE}.temp2
#Rename the count file:
mv ${OUTPUT_FILE}.temp2 ${OUTPUT_FILE}
#Remove temp files:
rm *.temp*
fi
#---------------------------------------------------------------------------------

# copy the output files to users storage dir
cp ${OUTPUT_FILE} ${OUTPUT_DIR}
cp ${OUTPUT_FILE}.summary ${OUTPUT_DIR}
#---------------------------------------------------------------------------------
#Example job output files:
#---------------------------------------------------------------------------------
#*.out file:
#Xkr4	0
#Rp1	0
#Sox17	2
#Mrpl15	7
#Lypla1	33
#Tcea1	12
#Rgs20	0
#Atp6v1h	9
#Oprk1	0
#Npbwr1	0
#Rb1cc1	11
#Fam150a	0
#---------------------------------------------------------------------------------
#*.out.summary file:
#Status	G83_M1_sorted.bam
#Assigned	181926
#Unassigned_Ambiguity	4618
#Unassigned_MultiMapping	0
#Unassigned_NoFeatures	24533
#Unassigned_Unmapped	0
#Unassigned_MappingQuality	0
#Unassigned_FragmentLength	0
#Unassigned_Chimera	0
#Unassigned_Secondary	0
#Unassigned_Nonjunction	0
#Unassigned_Duplicate	0
#---------------------------------------------------------------------------------

echo
echo "List files in scratch"
echo
ls -alh

echo "=========================================================="
echo "Finished on : $(date)"
#Use to calculate job time:
#End_Time in seconds
End_Time=$(date +"%s")
diff=$(($End_Time-$Start_Time))
echo "$(($diff / 3600)) hours, $((($diff / 60) % 60)) minutes and $(($diff % 60)) seconds elapsed."
echo "=========================================================="