#!/bin/bash
##################################################################################
#Andy Rampersaud, 03.11.16
#This script would be used to summarize Extract_Counts statistics 
#Way to run script:
#Usage: 
#./Extract_Counts_Summary.sh
##################################################################################
#---------------------------------------------------------------------------------
#Remove *.o files from previous jobs
#Need the 2>/dev/null to supress the "No such file or directory"
count=`ls -1 *.pe* 2>/dev/null  | wc -l`
if [ $count != 0 ]
then 
echo 'Removing *.po* and *.pe* files...'
rm *.po*
rm *.pe*
echo 'Done'
fi 
#---------------------------------------------------------------------------------
#Source the setup file to initialize variables
source ../00_Setup_Pipeline/01_Pipeline_Setup.sh
#---------------------------------------------------------------------------------
#Check that each variable prints a value to the terminal:
echo "-----------------------"
echo "Start of variable list:"
echo "-----------------------"
echo "Dataset_DIR:"
echo ${Dataset_DIR}
echo "Sample_Labels_DIR:"
echo ${Sample_Labels_DIR}
echo "GTF_Files_DIR:"
echo ${GTF_Files_DIR}
echo "STRANDEDNESS_featureCount:"
echo ${STRANDEDNESS_featureCount}
echo "FEATURE_ID:"
echo ${FEATURE_ID}
echo "SCRIPT_DIR:"
echo ${SCRIPT_DIR}
echo "TIME_LIMIT:"
echo ${TIME_LIMIT}
echo "-----------------------"
echo "End of variable list"
echo "-----------------------"
#---------------------------------------------------------------------------------
##################################################################################
#---------------------------------------------------------------------------------
#Retrieve the job name for this step:
Current_DIR=$(pwd)
#Extract the folder name:
DIR_name=`basename ${Current_DIR}`
#---------------------------------------------------------------------------------
OUTPUT_DIR=${Dataset_DIR}/Scripts/${DIR_name}/Job_Summary
###############################
if [ ! -d ${OUTPUT_DIR} ]; then
mkdir -p ${OUTPUT_DIR}
else
#Remove dir:
rm -r ${OUTPUT_DIR}
#Make new dir:
mkdir -p ${OUTPUT_DIR}
fi
###############################
#---------------------------------------------------------------------------------
#Start loop over GTF files:
GTF_List=$GTF_Files_DIR/*.gtf
for GTF_file in $GTF_List
do
ANNOTATION_FILE=`basename $GTF_file`
echo '#------------------------------------------'
echo 'ANNOTATION_FILE:'
echo ${ANNOTATION_FILE}
echo '#------------------------------------------'
#Need if statements to determine OUTPUT_FILE depending on the ANNOTATION_FILE used:
#------------------------------------------
if [ "${ANNOTATION_FILE}" == "genes.gtf" ];
then
OUTPUT_FILE=$OUTPUT_DIR/featureCounts_Stats_Illumina_GTF.txt
fi
#------------------------------------------
if [ "${ANNOTATION_FILE}" == "RefSeq_GeneBody.gtf" ];
then
OUTPUT_FILE=$OUTPUT_DIR/featureCounts_Stats_RefSeq_Exon_GTF.txt
fi
#------------------------------------------
if [ "${ANNOTATION_FILE}" == "Intron_Only_Regions.gtf" ];
then
OUTPUT_FILE=$OUTPUT_DIR/featureCounts_Stats_RefSeq_Intron_GTF.txt
fi
#------------------------------------------
if [ "${ANNOTATION_FILE}" == "Exon_Only_Regions.gtf" ];
then
OUTPUT_FILE=$OUTPUT_DIR/featureCounts_Stats_RefSeq_Exon_Only_GTF.txt
fi
#Start of lncRNA gtf
#------------------------------------------
if [ "${ANNOTATION_FILE}" == "ncRNA_exon_for_counting.gtf" ];
then
OUTPUT_FILE=$OUTPUT_DIR/featureCounts_Stats_LncRNA_Exon_Collapsed_GTF.txt
fi
#---------------------------------------------------------------------------------
if [ "${ANNOTATION_FILE}" == "ncRNA_genebodies_for_counting.gtf" ];
then
OUTPUT_FILE=$OUTPUT_DIR/featureCounts_Stats_LncRNA_GeneBody_GTF.txt
fi
#---------------------------------------------------------------------------------
if [ "${ANNOTATION_FILE}" == "intronic_only_gene_models_ncRNA_for_counting.gtf" ];
then
OUTPUT_FILE=$OUTPUT_DIR/featureCounts_Stats_LncRNA_Exonic_Only_GTF.txt
fi
#---------------------------------------------------------------------------------
if [ "${ANNOTATION_FILE}" == "exonic_only_gene_models_ncRNA_for_counting.gtf" ];
then
OUTPUT_FILE=$OUTPUT_DIR/featureCounts_Stats_RefSeq_Intronic_Only_GTF.txt
fi
#------------------------------------------
######################
if [ -f $OUTPUT_FILE ]
then 
rm $OUTPUT_FILE
else
touch $OUTPUT_FILE
fi
######################
##################################################################################
cd ${Current_DIR}
################################################
#A text file (Sample_Labels.txt) is needed to run this script
SCRIPT_DIR=$(pwd)
cp $Sample_Labels_DIR/Sample_Labels.txt $SCRIPT_DIR
################################################
#The text file is formatted like the following:
#----------------------------------------------
#Sample_DIR	Sample_ID	Description
#Sample_Waxman-TP17	G83_M1	Male 8wk-pool 1
#Sample_Waxman-TP18	G83_M2	Male 8wk-pool 2
#Sample_Waxman-TP19	G83_M3	Female 8wk-pool 1
#Sample_Waxman-TP20	G83_M4	Female 8wk-pool 2	
#----------------------------------------------
#The 1st column: The Sample_DIR name
#The 2nd column: Waxman Lab Sample_ID 
#The 3rd column: Sample's description 
################################################
#Print header to output file:
#------------------------------------------
if [ "${ANNOTATION_FILE}" == "Intron_Only_Regions.gtf" -o "${ANNOTATION_FILE}" == "intronic_only_gene_models_ncRNA_for_counting.gtf" ];
then
echo 'SAMPLE_ID' $'\t''DESCRIPTION' $'\t''Aligned_Pairs' $'\t''Read_Pair_IN_INTRONS' $'\t''Read_Pair_IN_INTRONS_RATIO' >> $OUTPUT_FILE
fi
#------------------------------------------
if [ "${ANNOTATION_FILE}" == "genes.gtf" -o "${ANNOTATION_FILE}" == "RefSeq_GeneBody.gtf" -o "${ANNOTATION_FILE}" == "Exon_Only_Regions.gtf" -o "${ANNOTATION_FILE}" == "ncRNA_exon_for_counting.gtf" -o "${ANNOTATION_FILE}" == "ncRNA_genebodies_for_counting.gtf" -o  "${ANNOTATION_FILE}" == "exonic_only_gene_models_ncRNA_for_counting.gtf" ];
then
echo 'SAMPLE_ID' $'\t''DESCRIPTION' $'\t''Aligned_Pairs' $'\t''Read_Pair_IN_EXONS' $'\t''Read_Pair_IN_EXONS_RATIO' >> $OUTPUT_FILE
fi
#------------------------------------------
################################################
#Text file has a header line to ignore:
tail -n +2 Sample_Labels.txt > Sample_Labels.temp
#Use a while loop to run jobs
while IFS=$'\t' read -r -a myArray
do
#---------------------------
##Check that text file is read in properly:
#echo 'Sample_DIR:'
Sample_DIR=${myArray[0]}
#echo 'Sample_ID:'
Sample_ID=${myArray[1]}
#echo $Sample_ID
#echo 'Description:'
Description=${myArray[2]}
#echo $Description
#---------------------------
echo
echo $Sample_ID
#Need to cd to sample specific tophat2 folder
cd ${Dataset_DIR}/$Sample_ID/fastq/tophat2
#------------------------------------------
#Get counts from *_align_summary.txt file
# echo 'Getting the total number of mapped reads...'
# #(statistics_for_all_accepted_reads.txt)
# TOTAL_MAPPED_READS=$(grep 'total' statistics_for_all_accepted_reads.txt  | awk '{print $1}')
#------------------------------------------
#For paired-end data, we used the (-p) option:
#-p        	If specified, fragments (or templates) will be counted instead
#              	of reads. This option is only applicable for paired-end reads.
#              	The two reads from the same fragment must be adjacent to each
#              	other in the provided SAM/BAM file.
#------------------------------------------
#----------------------------------------------
echo 'Getting the Aligned_Pairs...'
#Need to get the number of Aligned pairs:
Aligned_Pairs=$(grep 'Aligned pairs:'  $Sample_ID'_align_summary.txt'  | awk '{print $3}')
#------------------------------------------
echo 'Getting READS_IN_EXONS...'
#Need to cd to sample specific featureCounts folder
#Need if statements to process the *_featureCounts.out file depending on the ANNOTATION_FILE used:
featureCounts_DIR=${Dataset_DIR}/$Sample_ID/fastq/tophat2/featureCounts
#------------------------------------------
if [ "${ANNOTATION_FILE}" == "genes.gtf" ];
then
cd ${featureCounts_DIR}/Illumina_GTF
fi
#------------------------------------------
if [ "${ANNOTATION_FILE}" == "RefSeq_GeneBody.gtf" ];
then
cd ${featureCounts_DIR}/RefSeq_Exon_GTF
fi
#------------------------------------------
if [ "${ANNOTATION_FILE}" == "Intron_Only_Regions.gtf" ];
then
cd ${featureCounts_DIR}/RefSeq_Intron_GTF
fi
#------------------------------------------
if [ "${ANNOTATION_FILE}" == "Exon_Only_Regions.gtf" ];
then
cd ${featureCounts_DIR}/RefSeq_Exon_Only_GTF
fi
#Start of lncRNA gtf
#------------------------------------------
if [ "${ANNOTATION_FILE}" == "ncRNA_exon_for_counting.gtf" ];
then
cd ${featureCounts_DIR}/LncRNA_Exon_Collapsed_GTF
fi
#---------------------------------------------------------------------------------
if [ "${ANNOTATION_FILE}" == "ncRNA_genebodies_for_counting.gtf" ];
then
cd ${featureCounts_DIR}/LncRNA_GeneBody_GTF
fi
#---------------------------------------------------------------------------------
if [ "${ANNOTATION_FILE}" == "intronic_only_gene_models_ncRNA_for_counting.gtf" ];
then
cd ${featureCounts_DIR}/LncRNA_Intronic_Only_GTF
fi
#---------------------------------------------------------------------------------
if [ "${ANNOTATION_FILE}" == "exonic_only_gene_models_ncRNA_for_counting.gtf" ];
then
cd ${featureCounts_DIR}/LncRNA_Exonic_Only_GTF
fi
#------------------------------------------
#Get the sum of the 2nd coulumn
READS_IN_EXONS=$(awk '{n+=$2;} ; END {print n;}' $Sample_ID'_featureCounts.out')
echo 'Getting READS_IN_EXONS_RATIO...'
#Calculate percentages
READS_IN_EXONS_RATIO=$(echo "scale=4;$READS_IN_EXONS/$Aligned_Pairs" | bc)
echo 'Printing to output file...'
echo $Sample_ID $'\t'$Description $'\t'$Aligned_Pairs $'\t'$READS_IN_EXONS $'\t'$READS_IN_EXONS_RATIO >> $OUTPUT_FILE
#End loop over Sample_Labels:
done < Sample_Labels.temp
cd ${Current_DIR}
#Remove the temp file:
rm Sample_Labels.temp
##################################################################################
#Need to summarize featureCounts special counters
##################################################################################
cd ${Current_DIR}
##################################################################################
#Need if statements to determine OUTPUT_FILE_SPECIAL depending on the ANNOTATION_FILE used:
#------------------------------------------
if [ "${ANNOTATION_FILE}" == "genes.gtf" ];
then
OUTPUT_FILE_SPECIAL=$OUTPUT_DIR/featureCounts_summary_Illumina_GTF.txt
fi
#------------------------------------------
if [ "${ANNOTATION_FILE}" == "RefSeq_GeneBody.gtf" ];
then
OUTPUT_FILE_SPECIAL=$OUTPUT_DIR/featureCounts_summary_RefSeq_Exon_GTF.txt
fi
#------------------------------------------
if [ "${ANNOTATION_FILE}" == "Intron_Only_Regions.gtf" ];
then
OUTPUT_FILE_SPECIAL=$OUTPUT_DIR/featureCounts_summary_RefSeq_Intron_GTF.txt
fi
#------------------------------------------
if [ "${ANNOTATION_FILE}" == "Exon_Only_Regions.gtf" ];
then
OUTPUT_FILE_SPECIAL=$OUTPUT_DIR/featureCounts_summary_RefSeq_Exon_Only_GTF.txt
fi
#Start of lncRNA gtf
#------------------------------------------
if [ "${ANNOTATION_FILE}" == "ncRNA_exon_for_counting.gtf" ];
then
OUTPUT_FILE_SPECIAL=$OUTPUT_DIR/featureCounts_summary_LncRNA_Exon_Collapsed_GTF.txt
fi
#---------------------------------------------------------------------------------
if [ "${ANNOTATION_FILE}" == "ncRNA_genebodies_for_counting.gtf" ];
then
OUTPUT_FILE_SPECIAL=$OUTPUT_DIR/featureCounts_summary_LncRNA_GeneBody_GTF.txt
fi
#---------------------------------------------------------------------------------
if [ "${ANNOTATION_FILE}" == "intronic_only_gene_models_ncRNA_for_counting.gtf" ];
then
OUTPUT_FILE_SPECIAL=$OUTPUT_DIR/featureCounts_summary_LncRNA_Intronic_Only_GTF.txt
fi
#---------------------------------------------------------------------------------
if [ "${ANNOTATION_FILE}" == "exonic_only_gene_models_ncRNA_for_counting.gtf" ];
then
OUTPUT_FILE_SPECIAL=$OUTPUT_DIR/featureCounts_summary_LncRNA_Exonic_Only_GTF.txt
fi
#------------------------------------------
######################
if [ -f $OUTPUT_FILE_SPECIAL ]
then 
rm $OUTPUT_FILE_SPECIAL
else
touch $OUTPUT_FILE_SPECIAL
fi
######################
##################################################################################
################################################
#A text file (Sample_Labels.txt) is needed to run this script
SCRIPT_DIR=$(pwd)
cp $Sample_Labels_DIR/Sample_Labels.txt $SCRIPT_DIR
################################################
#The text file is formatted like the following:
#----------------------------------------------
#Sample_DIR	Sample_ID	Description
#Sample_Waxman-TP17	G83_M1	Male 8wk-pool 1
#Sample_Waxman-TP18	G83_M2	Male 8wk-pool 2
#Sample_Waxman-TP19	G83_M3	Female 8wk-pool 1
#Sample_Waxman-TP20	G83_M4	Female 8wk-pool 2	
#----------------------------------------------
#The 1st column: The Sample_DIR name
#The 2nd column: Waxman Lab Sample_ID 
#The 3rd column: Sample's description 
################################################
#Text file has a header line to ignore:
tail -n +2 Sample_Labels.txt > Sample_Labels.temp
#Use a while loop to run jobs
while IFS=$'\t' read -r -a myArray
do
#---------------------------
##Check that text file is read in properly:
#echo 'Sample_DIR:'
Sample_DIR=${myArray[0]}
#echo 'Sample_ID:'
Sample_ID=${myArray[1]}
#echo $Sample_ID
#echo 'Description:'
Description=${myArray[2]}
#echo $Description
#---------------------------
echo
echo $Sample_ID
#Need to cd to sample specific tophat2 folder
cd ${Dataset_DIR}/$Sample_ID/fastq/tophat2
#------------------------------------------
#Get counts from *_align_summary.txt file
# echo 'Getting the total number of mapped reads...'
# #(statistics_for_all_accepted_reads.txt)
# TOTAL_MAPPED_READS=$(grep 'total' statistics_for_all_accepted_reads.txt  | awk '{print $1}')
#------------------------------------------
#For paired-end data, we used the (-p) option:
#-p        	If specified, fragments (or templates) will be counted instead
#              	of reads. This option is only applicable for paired-end reads.
#              	The two reads from the same fragment must be adjacent to each
#              	other in the provided SAM/BAM file.
#------------------------------------------
#----------------------------------------------
echo 'Getting the Aligned_Pairs...'
#Need to get the number of Aligned pairs:
Aligned_Pairs=$(grep 'Aligned pairs:'  $Sample_ID'_align_summary.txt'  | awk '{print $3}')
#------------------------------------------
#Need to cd to sample specific featureCounts folder
#Need if statements to process the *_featureCounts.out file depending on the ANNOTATION_FILE used:
featureCounts_DIR=${Dataset_DIR}/$Sample_ID/fastq/tophat2/featureCounts
#------------------------------------------
if [ "${ANNOTATION_FILE}" == "genes.gtf" ];
then
cd ${featureCounts_DIR}/Illumina_GTF
fi
#------------------------------------------
if [ "${ANNOTATION_FILE}" == "RefSeq_GeneBody.gtf" ];
then
cd ${featureCounts_DIR}/RefSeq_Exon_GTF
fi
#------------------------------------------
if [ "${ANNOTATION_FILE}" == "Intron_Only_Regions.gtf" ];
then
cd ${featureCounts_DIR}/RefSeq_Intron_GTF
fi
#------------------------------------------
if [ "${ANNOTATION_FILE}" == "Exon_Only_Regions.gtf" ];
then
cd ${featureCounts_DIR}/RefSeq_Exon_Only_GTF
fi
#Start of lncRNA gtf
#------------------------------------------
if [ "${ANNOTATION_FILE}" == "ncRNA_exon_for_counting.gtf" ];
then
cd ${featureCounts_DIR}/LncRNA_Exon_Collapsed_GTF
fi
#---------------------------------------------------------------------------------
if [ "${ANNOTATION_FILE}" == "ncRNA_genebodies_for_counting.gtf" ];
then
cd ${featureCounts_DIR}/LncRNA_GeneBody_GTF
fi
#---------------------------------------------------------------------------------
if [ "${ANNOTATION_FILE}" == "intronic_only_gene_models_ncRNA_for_counting.gtf" ];
then
cd ${featureCounts_DIR}/LncRNA_Intronic_Only_GTF
fi
#---------------------------------------------------------------------------------
if [ "${ANNOTATION_FILE}" == "exonic_only_gene_models_ncRNA_for_counting.gtf" ];
then
cd ${featureCounts_DIR}/LncRNA_Exonic_Only_GTF
fi
#------------------------------------------
#Omit header line:
#------------------------------------------
if [ "${ANNOTATION_FILE}" == "RefSeq_GeneBody.gtf" ];
then
cp $Sample_ID'_featureCounts.out.summary' temp1.txt
else
awk 'NR>1' $Sample_ID'_featureCounts.out.summary' > temp1.txt
fi
#------------------------------------------
#Divide 2nd column by total number of reads:
awk -v Aligned_Pairs=${Aligned_Pairs} '{ Ratio=($2)/(Aligned_Pairs) ; print $1"\t"$2"\t"Aligned_Pairs"\t"Ratio }' temp1.txt > temp2.txt
#Need a header:
echo 'Status' $'\t'$Sample_ID'_Read_Count' $'\t''Aligned_Pairs' $'\t''Ratio' >> Header.txt
#Combine files:
cat Header.txt temp2.txt > temp3.txt
#------------------------------------------
if [ "${ANNOTATION_FILE}" == "RefSeq_GeneBody.gtf" ];
then
sed 's/.out.*/:/' temp3.txt > temp4.txt
mv temp4.txt temp3.txt
fi
#------------------------------------------
echo 'Concatenating summary file...'
cat temp3.txt >> $OUTPUT_FILE_SPECIAL
#Remove temp files:
rm temp*.txt
rm Header.txt
##------------------------------------------
#End loop over Sample_Labels:
done < Sample_Labels.temp
cd ${Current_DIR}
#Remove the temp file:
rm Sample_Labels.temp
###################################################################################
echo '#--------------------------------------------------------------------------'
echo 'Check out '$OUTPUT_FILE
echo 'Check out '$OUTPUT_FILE_SPECIAL
echo '#--------------------------------------------------------------------------'
#End loop over GTF files:
done
echo '#--------------------------------------------------------------------------'
echo 'Check out: '${OUTPUT_DIR}
echo '#--------------------------------------------------------------------------'
##################################################################################