#!/bin/bash
##################################################################################
#Andy Rampersaud, 03.11.16
#This script would be used to summarize Extract_Counts statistics 
#Way to run script:
#Usage: 
#./Extract_Counts_Summary.sh
##################################################################################
#---------------------------------------------------------------------------------
#Remove *.o files from previous jobs
#Need the 2>/dev/null to supress the "No such file or directory"
count=`ls -1 *.pe* 2>/dev/null  | wc -l`
if [ $count != 0 ]
then 
echo 'Removing *.po* and *.pe* files...'
rm *.po*
rm *.pe*
echo 'Done'
fi 
#---------------------------------------------------------------------------------
#Source the setup file to initialize variables
source ../00_Setup_Pipeline/01_Pipeline_Setup.sh
#---------------------------------------------------------------------------------
#Check that each variable prints a value to the terminal:
echo "-----------------------"
echo "Start of variable list:"
echo "-----------------------"
echo "Dataset_DIR:"
echo ${Dataset_DIR}
echo "Sample_Labels_DIR:"
echo ${Sample_Labels_DIR}
echo "GTF_Files_DIR:"
echo ${GTF_Files_DIR}
echo "MODE:"
echo ${MODE}
echo "STRANDEDNESS_HTSeq:"
echo ${STRANDEDNESS_HTSeq}
echo "FEATURE_ID:"
echo ${FEATURE_ID}
echo "SCRIPT_DIR:"
echo ${SCRIPT_DIR}
echo "TIME_LIMIT:"
echo ${TIME_LIMIT}
echo "-----------------------"
echo "End of variable list"
echo "-----------------------"
#---------------------------------------------------------------------------------
##################################################################################
#---------------------------------------------------------------------------------
#Retrieve the job name for this step:
Current_DIR=$(pwd)
#Extract the folder name:
DIR_name=`basename ${Current_DIR}`
#---------------------------------------------------------------------------------
OUTPUT_DIR=${Dataset_DIR}/Scripts/${DIR_name}/Job_Summary
###############################
if [ ! -d ${OUTPUT_DIR} ]; then
mkdir -p ${OUTPUT_DIR}
else
#Remove dir:
rm -r ${OUTPUT_DIR}
#Make new dir:
mkdir -p ${OUTPUT_DIR}
fi
###############################
#---------------------------------------------------------------------------------
#Start loop over GTF files:
GTF_List=$GTF_Files_DIR/*.gtf
for GTF_file in $GTF_List
do
ANNOTATION_FILE=`basename $GTF_file`
echo '#------------------------------------------'
echo 'ANNOTATION_FILE:'
echo ${ANNOTATION_FILE}
echo '#------------------------------------------'


#Skip all of these steps if we are dealing with lncRNA gtf files
if [ "${ANNOTATION_FILE}" == "ncRNA_exon_for_counting.gtf" -o "${ANNOTATION_FILE}" == "ncRNA_genebodies_for_counting.gtf" -o "${ANNOTATION_FILE}" == "intronic_only_gene_models_ncRNA_for_counting.gtf" -o "${ANNOTATION_FILE}" == "exonic_only_gene_models_ncRNA_for_counting.gtf" ];
then
echo "lncRNA GTF file is ignored by HTSeq, see FeatureCount result output"
else
#Need if statements to determine OUTPUT_FILE depending on the ANNOTATION_FILE used:
#------------------------------------------
if [ "${ANNOTATION_FILE}" == "genes.gtf" ];
then
OUTPUT_FILE=$OUTPUT_DIR/HTSeq_Stats_Illumina_GTF.txt
fi
#------------------------------------------
if [ "${ANNOTATION_FILE}" == "RefSeq_GeneBody.gtf" ];
then
OUTPUT_FILE=$OUTPUT_DIR/HTSeq_Stats_RefSeq_Exon_GTF.txt
fi
#------------------------------------------
if [ "${ANNOTATION_FILE}" == "Intron_Only_Regions.gtf" ];
then
OUTPUT_FILE=$OUTPUT_DIR/HTSeq_Stats_RefSeq_Intron_GTF.txt
fi
#------------------------------------------
if [ "${ANNOTATION_FILE}" == "Exon_Only_Regions.gtf" ];
then
OUTPUT_FILE=$OUTPUT_DIR/HTSeq_Stats_RefSeq_Exon_Only_GTF.txt
fi
#------------------------------------------
######################
if [ -f $OUTPUT_FILE ]
then 
rm $OUTPUT_FILE
else
touch $OUTPUT_FILE
fi
######################
##################################################################################
cd ${Current_DIR}
################################################
#A text file (Sample_Labels.txt) is needed to run this script
SCRIPT_DIR=$(pwd)
cp $Sample_Labels_DIR/Sample_Labels.txt $SCRIPT_DIR
################################################
#The text file is formatted like the following:
#----------------------------------------------
#Sample_DIR	Sample_ID	Description
#Sample_Waxman-TP17	G83_M1	Male 8wk-pool 1
#Sample_Waxman-TP18	G83_M2	Male 8wk-pool 2
#Sample_Waxman-TP19	G83_M3	Female 8wk-pool 1
#Sample_Waxman-TP20	G83_M4	Female 8wk-pool 2	
#----------------------------------------------
#The 1st column: The Sample_DIR name
#The 2nd column: Waxman Lab Sample_ID 
#The 3rd column: Sample's description 
################################################
#Print header to output file:
#------------------------------------------
if [ "${ANNOTATION_FILE}" == "Intron_Only_Regions.gtf" ];
then
echo 'SAMPLE_ID' $'\t''DESCRIPTION' $'\t''Aligned_Pairs' $'\t''Read_Pair_IN_INTRONS' $'\t''Read_Pair_IN_INTRONS_RATIO' >> $OUTPUT_FILE
fi
#------------------------------------------
if [ "${ANNOTATION_FILE}" == "genes.gtf" ] || [ "${ANNOTATION_FILE}" == "RefSeq_GeneBody.gtf" ] || [ "${ANNOTATION_FILE}" == "Exon_Only_Regions.gtf" ];
then
echo 'SAMPLE_ID' $'\t''DESCRIPTION' $'\t''Aligned_Pairs' $'\t''Read_Pair_IN_EXONS' $'\t''Read_Pair_IN_EXONS_RATIO' >> $OUTPUT_FILE
fi
#------------------------------------------
################################################
#Text file has a header line to ignore:
tail -n +2 Sample_Labels.txt > Sample_Labels.temp
#Use a while loop to run jobs
while IFS=$'\t' read -r -a myArray
do
#---------------------------
##Check that text file is read in properly:
#echo 'Sample_DIR:'
Sample_DIR=${myArray[0]}
#echo 'Sample_ID:'
Sample_ID=${myArray[1]}
#echo $Sample_ID
#echo 'Description:'
Description=${myArray[2]}
#echo $Description
#---------------------------
echo
echo $Sample_ID
#Need to cd to sample specific tophat2 folder
cd ${Dataset_DIR}/$Sample_ID/fastq/tophat2
#------------------------------------------
#Get counts from *_align_summary.txt file
# echo 'Getting the total number of mapped reads...'
# #(statistics_for_all_accepted_reads.txt)
# TOTAL_MAPPED_READS=$(grep 'total' statistics_for_all_accepted_reads.txt  | awk '{print $1}')
#------------------------------------------
#http://www-huber.embl.de/users/anders/HTSeq/doc/count.html
# For paired-end data, does htseq-count count reads or read pairs?
    # Read pairs. The script is designed to count “units of evidence” for gene expression. If both mates map to the same gene, this still only shows that one cDNA fragment originated from that gene. Hence, it should be counted only once.
#------------------------------------------
#----------------------------------------------
echo 'Getting the Aligned_Pairs...'
#Need to get the number of Aligned pairs:
Aligned_Pairs=$(grep 'Aligned pairs:'  $Sample_ID'_align_summary.txt'  | awk '{print $3}')
#------------------------------------------
echo 'Getting READS_IN_EXONS...'
#Need to cd to sample specific HTSeq folder
#Need if statements to process the *_HTSeq.out file depending on the ANNOTATION_FILE used:
HTSeq_DIR=${Dataset_DIR}/$Sample_ID/fastq/tophat2/HTSeq
#------------------------------------------
if [ "${ANNOTATION_FILE}" == "genes.gtf" ];
then
cd ${HTSeq_DIR}/Illumina_GTF
fi
#------------------------------------------
if [ "${ANNOTATION_FILE}" == "RefSeq_GeneBody.gtf" ];
then
cd ${HTSeq_DIR}/RefSeq_Exon_GTF
fi
#------------------------------------------
if [ "${ANNOTATION_FILE}" == "Intron_Only_Regions.gtf" ];
then
cd ${HTSeq_DIR}/RefSeq_Intron_GTF
fi
#------------------------------------------
if [ "${ANNOTATION_FILE}" == "Exon_Only_Regions.gtf" ];
then
cd ${HTSeq_DIR}/RefSeq_Exon_Only_GTF
fi
#------------------------------------------
#Need to remove the last 5 lines
head -n -5 $Sample_ID'_HTSeq.out' > $Sample_ID'_HTSeq.temp1'
#Get the sum of the 2nd coulumn
READS_IN_EXONS=$(awk '{n+=$2;} ; END {print n;}' $Sample_ID'_HTSeq.temp1')
#Remove temp* files
rm $Sample_ID'_HTSeq.temp1'
echo 'Getting READS_IN_EXONS_RATIO...'
#Calculate percentages
READS_IN_EXONS_RATIO=$(echo "scale=4;$READS_IN_EXONS/$Aligned_Pairs" | bc)
echo 'Printing to output file...'
echo $Sample_ID $'\t'$Description $'\t'$Aligned_Pairs $'\t'$READS_IN_EXONS $'\t'$READS_IN_EXONS_RATIO >> $OUTPUT_FILE
#End loop over Sample_Labels:
done < Sample_Labels.temp
cd ${Current_DIR}
#Remove the temp file:
rm Sample_Labels.temp
##################################################################################
#Need to summarize HTSeq special counters
##################################################################################
cd ${Current_DIR}
################################################
#A text file (Sample_Labels.txt) is needed to run this script
SCRIPT_DIR=$(pwd)
cp $Sample_Labels_DIR/Sample_Labels.txt $SCRIPT_DIR
################################################
#The text file is formatted like the following:
#----------------------------------------------
#Sample_DIR	Sample_ID	Description
#Sample_Waxman-TP17	G83_M1	Male 8wk-pool 1
#Sample_Waxman-TP18	G83_M2	Male 8wk-pool 2
#Sample_Waxman-TP19	G83_M3	Female 8wk-pool 1
#Sample_Waxman-TP20	G83_M4	Female 8wk-pool 2	
#----------------------------------------------
#The 1st column: The Sample_DIR name
#The 2nd column: Waxman Lab Sample_ID 
#The 3rd column: Sample's description 
################################################
#Print header to output file:
#------------------------------------------
echo 'SAMPLE_ID' $'\t''DESCRIPTION' $'\t''Aligned_Pairs' $'\t''no_feature_count' $'\t''no_feature_RATIO' $'\t''ambiguous_count'$'\t''ambiguous_RATIO' $'\t''too_low_aQual_count'$'\t''too_low_aQual_RATIO' $'\t''not_aligned_count'$'\t''not_aligned_RATIO' $'\t''alignment_not_unique'$'\t''alignment_not_unique_RATIO' >> $OUTPUT_FILE
#------------------------------------------
################################################
#Text file has a header line to ignore:
tail -n +2 Sample_Labels.txt > Sample_Labels.temp
#Use a while loop to run jobs
while IFS=$'\t' read -r -a myArray
do
#---------------------------
##Check that text file is read in properly:
#echo 'Sample_DIR:'
Sample_DIR=${myArray[0]}
#echo 'Sample_ID:'
Sample_ID=${myArray[1]}
#echo $Sample_ID
#echo 'Description:'
Description=${myArray[2]}
#echo $Description
#---------------------------
echo
echo $Sample_ID
#Need to cd to sample specific tophat2 folder
cd ${Dataset_DIR}/$Sample_ID/fastq/tophat2
#------------------------------------------
#Get counts from *_align_summary.txt file
# echo 'Getting the total number of mapped reads...'
# #(statistics_for_all_accepted_reads.txt)
# TOTAL_MAPPED_READS=$(grep 'total' statistics_for_all_accepted_reads.txt  | awk '{print $1}')
#------------------------------------------
#http://www-huber.embl.de/users/anders/HTSeq/doc/count.html
# For paired-end data, does htseq-count count reads or read pairs?
    # Read pairs. The script is designed to count “units of evidence” for gene expression. If both mates map to the same gene, this still only shows that one cDNA fragment originated from that gene. Hence, it should be counted only once.
#------------------------------------------
#----------------------------------------------
echo 'Getting the Aligned_Pairs...'
#Need to get the number of Aligned pairs:
Aligned_Pairs=$(grep 'Aligned pairs:'  $Sample_ID'_align_summary.txt'  | awk '{print $3}')
#------------------------------------------
#Need to cd to sample specific HTSeq folder
#Need if statements to process the *_HTSeq.out file depending on the ANNOTATION_FILE used:
HTSeq_DIR=${Dataset_DIR}/$Sample_ID/fastq/tophat2/HTSeq
#------------------------------------------
if [ "${ANNOTATION_FILE}" == "genes.gtf" ];
then
cd ${HTSeq_DIR}/Illumina_GTF
fi
#------------------------------------------
if [ "${ANNOTATION_FILE}" == "RefSeq_GeneBody.gtf" ];
then
cd ${HTSeq_DIR}/RefSeq_Exon_GTF
fi
#------------------------------------------
if [ "${ANNOTATION_FILE}" == "Intron_Only_Regions.gtf" ];
then
cd ${HTSeq_DIR}/RefSeq_Intron_GTF
fi
#------------------------------------------
if [ "${ANNOTATION_FILE}" == "Exon_Only_Regions.gtf" ];
then
cd ${HTSeq_DIR}/RefSeq_Exon_Only_GTF
fi
#------------------------------------------
echo 'Getting no_feature count...'
no_feature=$(grep '__no_feature' $Sample_ID'_HTSeq.out'  | awk '{print $2}')
echo 'Calculate percentage...'
no_feature_RATIO=$(echo "scale=4;$no_feature/$Aligned_Pairs" | bc)
#------------------------------------------
echo 'Getting ambiguous count...'
ambiguous=$(grep '__ambiguous' $Sample_ID'_HTSeq.out'  | awk '{print $2}')
echo 'Calculate percentage...'
ambiguous_RATIO=$(echo "scale=4;$ambiguous/$Aligned_Pairs" | bc)
#------------------------------------------
echo 'Getting too_low_aQual count...'
too_low_aQual=$(grep '__too_low_aQual' $Sample_ID'_HTSeq.out'  | awk '{print $2}')
echo 'Calculate percentage...'
too_low_aQual_RATIO=$(echo "scale=4;$too_low_aQual/$Aligned_Pairs" | bc)
#------------------------------------------
echo 'Getting not_aligned count...'
not_aligned=$(grep '__not_aligned' $Sample_ID'_HTSeq.out'  | awk '{print $2}')
echo 'Calculate percentage...'
not_aligned_RATIO=$(echo "scale=4;$not_aligned/$Aligned_Pairs" | bc)
#------------------------------------------
echo 'Getting alignment_not_unique count...'
alignment_not_unique=$(grep '__alignment_not_unique' $Sample_ID'_HTSeq.out'  | awk '{print $2}')
echo 'Calculate percentage...'
alignment_not_unique_RATIO=$(echo "scale=4;$alignment_not_unique/$Aligned_Pairs" | bc)
#------------------------------------------
echo 'Printing to output file...'
#------------------------------------------
echo $Sample_ID $'\t'$Description $'\t'$Aligned_Pairs $'\t'$no_feature $'\t'$no_feature_RATIO $'\t'$ambiguous $'\t'$ambiguous_RATIO $'\t'$too_low_aQual $'\t'$too_low_aQual_RATIO $'\t'$not_aligned $'\t'$not_aligned_RATIO $'\t'$alignment_not_unique $'\t'$alignment_not_unique_RATIO >> $OUTPUT_FILE
#End loop over Sample_Labels:
done < Sample_Labels.temp
cd ${Current_DIR}
#Remove the temp file:
rm Sample_Labels.temp
##################################################################################
echo '#--------------------------------------------------------------------------'
echo 'Check out '$OUTPUT_FILE
echo '#--------------------------------------------------------------------------'
fi #End of if statement to ignore lncRNA gtf files
#End loop over GTF files:
done
echo '#--------------------------------------------------------------------------'
echo 'Check out: '${OUTPUT_DIR}
echo '#--------------------------------------------------------------------------'
##################################################################################