#!/bin/bash -l
##################################################################################
#Andy Rampersaud, 03.11.16
#This script is called by DiffExp.sh
#06.29.2015: Tisha edited to accomodate edgeR results
##################################################################################
# Specify which shell to use
#$ -S /bin/bash
# Run on the current working directory
#$ -cwd
# Join standard output and error to a single file
#$ -j n
# change to y if you want a single qlog file
##################################################################################
#Initialize variables from DiffExp.sh
##################################################################################
#checking the command line arg
#-ne : "is not equal to"
if [ $# -ne 14 ] ; then
echo "Need 14 arguments for the qsub command:"
echo "qsub -N ${Job_Name}'_'${DiffExp_Index} -P waxmanlab -l h_rt=${TIME_LIMIT} DiffExp.qsub ${SCRIPT_DIR} ${Dataset_DIR} ${Dataset_Label} ${GTF_Files_DIR} ${ANNOTATION_FILE} ${CONDITION_1_NAME} ${CONDITION_2_NAME} ${Lengths_DIR} ${GENE_LENGTHS_FILE} ${COUNT_DIR} ${OUTPUT_PREFIX} ${DiffExp_Index} ${COL_SUFFIX} ${COUNT_PROGRAM}"
exit 0
fi
#http://www.ibm.com/developerworks/library/l-bash-parameters/
#Note: If you have more than 9 parameters, you cannot use $10 to refer to the tenth one. You must first either process or save the first parameter ($1), then use the shift command to drop parameter 1 and move all remaining parameters down 1, so that $10 becomes $9 and so on.
#http://unix.stackexchange.com/questions/104420/how-to-use-command-line-arguments-in-a-shell-script
#If you need access more than 9 command line arguments, you can use the shift command. Example: shift 2 renames $3 to $1, $4 to $2 etc.
#process the command line arguments
SCRIPT_DIR=$1
Dataset_DIR=$2
Dataset_Label=$3
GTF_Files_DIR=$4
ANNOTATION_FILE=$5
CONDITION_1_NAME=$6
CONDITION_2_NAME=$7
Lengths_DIR=$8
GENE_LENGTHS_FILE=$9
#COUNT_DIR=$10
#OUTPUT_PREFIX=$11
#DiffExp_Index=$12
#COL_SUFFIX=$13
#COUNT_PROGRAM=$14
echo "-----------------------"
echo "Need shift 5 command (have more than 9 arguments):"
#The "shift" command allows users to move through an argument list (if there are more than 9 arguments)
#It removes the arguments that occur at the beginning of the list and shifts arguments at the end of the list so that they are now within the list of 9 arguments
#Example: shift 1
#This will remove arg #1 then shift args #2 - #10 so that they are numbered $1 - $9
#Now the user can save the arg #10 as $9
#Shift_Number = (Total number of args) - (9)
#For this script:
#Shift_Number = (Total number of args) - (9) = (14) - (9) = 5
#Hence: Need shift 5 command
#Now I can save arg:
#10 as $5
#11 as $6
#12 as $7
#13 as $8
#14 as $9
echo "-----------------------"
shift 5
#process the command line arguments
COUNT_DIR=$5
OUTPUT_PREFIX=$6
DiffExp_Index=$7
COL_SUFFIX=$8
COUNT_PROGRAM=$9
#Print variables (make sure they appear correctly):
echo "-----------------------"
echo "Start of variable list:"
echo "-----------------------"
echo "SCRIPT_DIR:"
echo ${SCRIPT_DIR}
echo "Dataset_DIR:"
echo ${Dataset_DIR}
echo "Dataset_Label:"
echo ${Dataset_Label}
echo "GTF_Files_DIR:"
echo ${GTF_Files_DIR}
echo "ANNOTATION_FILE:"
echo ${ANNOTATION_FILE}
echo "CONDITION_1_NAME:"
echo ${CONDITION_1_NAME}
echo "CONDITION_2_NAME:"
echo ${CONDITION_2_NAME}
echo "Lengths_DIR:"
echo ${Lengths_DIR}
echo "GENE_LENGTHS_FILE:"
echo ${GENE_LENGTHS_FILE}
echo "COUNT_DIR:"
echo ${COUNT_DIR}
echo "OUTPUT_PREFIX:"
echo ${OUTPUT_PREFIX}
echo "DiffExp_Index:"
echo ${DiffExp_Index}
echo "COL_SUFFIX:"
echo ${COL_SUFFIX}
echo "COUNT_PROGRAM:"
echo ${COUNT_PROGRAM}
echo "-----------------------"
echo "End of variable list"
echo "-----------------------"
# Now let's keep track of some information just in case anything goes wrong
echo "=========================================================="
#Use to calculate job time:
#Start_Time in seconds
Start_Time=$(date +"%s")
echo "Starting on : $(date)"
echo "Running on node : $(hostname)"
echo "Current directory : $(pwd)"
echo "Current job ID : $JOB_ID"
echo "Current job name : $JOB_NAME"
echo "Task index number : $SGE_TASK_ID"
echo "Parameter for multiple cores : $NSLOTS"
echo "=========================================================="
# Go to local scratch directory
echo
echo 'Change dir to scratch directory'
echo
cd ${TMPDIR}
echo
echo 'Print scratch directory location:'
echo
echo $TMPDIR
#--------------------------------------
echo
echo 'Loading required modules...'
echo
#Make sure the shebang line = #!/bin/bash -l
#Need the -l option to load modules
#Search for latest program installed:
#For captial "R" remove (-i Ignore case sensitivity.) option
#module avail -t 2>&1 | grep R
module load R/R-3.1.1
#--------------------------------------
#module help R/R-3.1.1
#----------- Module Specific Help for 'R/R-3.1.1' ------------------
#R R-3.1.1: R is a system/language for statistical computation and graphics.
#R 3.1.1 is a system (language) for statistical computation and graphics
#R consists of a language plus run-time enviroment with graphics, a debugger,
#access to certain system functions and the ability to run programs stored in
#script files
module load samtools
#----------------------------------------------------------------------------------
#Output dir:
OUTPUT_DIR=${SCRIPT_DIR}/'Output_'${DiffExp_Index}'_'${COUNT_PROGRAM}'_'${COL_SUFFIX}
#Using an if statement to make the output folder if it does not exists
if [ ! -d $OUTPUT_DIR ]; then
mkdir $OUTPUT_DIR;
fi
#-----------------------------------------------------
#Create folders to store input files:
mkdir -p Input/Condition_1
mkdir -p Input/Condition_2
#-----------------------------------------------------
#Copy gtf file (annotation file):
cp ${GTF_Files_DIR}/${ANNOTATION_FILE} .
#Copy over text files indicating Condition_1 and Condition_2 samples:
cp ${SCRIPT_DIR}/Condition_1.txt .
cp ${SCRIPT_DIR}/Condition_2.txt .
#Copy Rscripts:
cp ${SCRIPT_DIR}/Scripts/differentialAnalysis.R .
cp ${SCRIPT_DIR}/Scripts/formatForSegex_ver3.R .
cp ${SCRIPT_DIR}/Scripts/Diff_Genes.R .
cp ${SCRIPT_DIR}/Scripts/Venn_Diff_Genes.R .
#----------------------------------------------------------------------------------
#Use text files to indicate Condition_1 and Condition_2 samples
#Condition_1.txt
#Condition_2.txt
#----------------------------------------------------------------------------------
#Need to preprocess the Condition_*.txt files to make sure they will work:
#This will guarantee that only tabs exists between columns:
awk '{print $1"\t"$2"\t"$3}' Condition_1.txt > Condition_1.temp1
mv Condition_1.temp1 Condition_1.txt
awk '{print $1"\t"$2"\t"$3}' Condition_2.txt > Condition_2.temp1
mv Condition_2.temp1 Condition_2.txt
#----------------------------------------------------------------------------------
#-----------------------------------------------
#Copy Condition_1 sample count files to Condition_1 folder
################################################
#The text file is formatted like the following:
#----------------------------------------------
#Sample_DIR Sample_ID Description
#Sample_Waxman-TP17 G83_M1 Male 8wk-pool 1
#Sample_Waxman-TP18 G83_M2 Male 8wk-pool 2
#Sample_Waxman-TP19 G83_M3 Female 8wk-pool 1
#Sample_Waxman-TP20 G83_M4 Female 8wk-pool 2
#----------------------------------------------
#The 1st column: The Sample_DIR name
#The 2nd column: Waxman Lab Sample_ID
#The 3rd column: Sample's description
################################################
#Text file has a header line to ignore:
tail -n +2 Condition_1.txt > Condition_1.temp
echo "------------------------------------------"
#Use a while loop to run jobs
while IFS=$'\t' read -r -a myArray
do
#---------------------------
##Check that text file is read in properly:
echo 'Sample_DIR:'
Sample_DIR=${myArray[0]}
echo $Sample_DIR
echo 'Sample_ID:'
Sample_ID=${myArray[1]}
echo $Sample_ID
echo 'Description:'
Description=${myArray[2]}
echo $Description
#---------------------------
#Split by underscore and get the 2nd part (M number)
M_Num=$(echo $Sample_ID | cut -d'_' -f 2)
echo "M_Num:"
echo ${M_Num}
#Append variable string to itself:
M_Num_Cond1_List+=${M_Num}
#---------------------------
echo 'Copy Condition_1 sample count files to Condition_1 folder'
cp ${Dataset_DIR}/${Sample_ID}/fastq/tophat2/HTSeq/${COUNT_DIR}/${Sample_ID}'_HTSeq.out' ./Input/Condition_1
#---------------------------
echo 'calculate mapped reads: '${Sample_ID}
#---------------------------
#The following samtools command is too time-consuming
#samtools view -c ${Dataset_DIR}/${Sample_ID}/fastq/tophat2/${Sample_ID}'_primary_unique.bam' > ./Input/Condition_1/${Sample_ID}'_num_mapped_reads.txt'
#---------------------------
#The TopHat_Paired_End job already created a *_statistics_for_primary_unique_reads.txt
#Extract the number of uniquely_mapped_reads, then save it to a text file:
#Copy *_statistics_for_primary_unique_reads.txt to compute node:
cp ${Dataset_DIR}/${Sample_ID}/fastq/tophat2/$Sample_ID'_statistics_for_primary_unique_reads.txt' .
#Extract the number of uniquely_mapped_reads:
uniquely_mapped_reads=$(grep 'total' $Sample_ID'_statistics_for_primary_unique_reads.txt' | awk '{print $1}')
#save it to a text file:
echo $uniquely_mapped_reads >> ./Input/Condition_1/${Sample_ID}'_num_mapped_reads.txt'
#Remove the *_statistics_for_primary_unique_reads.txt
rm $Sample_ID'_statistics_for_primary_unique_reads.txt'
#---------------------------
done < Condition_1.temp
#---------------------------
echo "M_Num_Cond1_List:"
echo ${M_Num_Cond1_List}
#---------------------------
#Remove the temp file:
rm Condition_1.temp
echo "------------------------------------------"
###############################################
#----------------------------------------------------------------------------------
#Copy Condition_2 sample count files to Condition_2 folder
################################################
#The text file is formatted like the following:
#----------------------------------------------
#Sample_DIR Sample_ID Description
#Sample_Waxman-TP17 G83_M1 Male 8wk-pool 1
#Sample_Waxman-TP18 G83_M2 Male 8wk-pool 2
#Sample_Waxman-TP19 G83_M3 Female 8wk-pool 1
#Sample_Waxman-TP20 G83_M4 Female 8wk-pool 2
#----------------------------------------------
#The 1st column: The Sample_DIR name
#The 2nd column: Waxman Lab Sample_ID
#The 3rd column: Sample's description
################################################
#Text file has a header line to ignore:
tail -n +2 Condition_2.txt > Condition_2.temp
echo "------------------------------------------"
#Use a while loop to run jobs
while IFS=$'\t' read -r -a myArray
do
#---------------------------
##Check that text file is read in properly:
echo 'Sample_DIR:'
Sample_DIR=${myArray[0]}
echo $Sample_DIR
echo 'Sample_ID:'
Sample_ID=${myArray[1]}
echo $Sample_ID
echo 'Description:'
Description=${myArray[2]}
echo $Description
#---------------------------
#Split by underscore and get the 2nd part (M number)
M_Num=$(echo $Sample_ID | cut -d'_' -f 2)
echo "M_Num:"
echo ${M_Num}
#Append variable string to itself:
M_Num_Cond2_List+=${M_Num}
#---------------------------
echo 'Copy Condition_2 sample count files to Condition_2 folder'
cp ${Dataset_DIR}/${Sample_ID}/fastq/tophat2/HTSeq/${COUNT_DIR}/${Sample_ID}'_HTSeq.out' ./Input/Condition_2
#---------------------------
echo 'calculate mapped reads: '${Sample_ID}
#---------------------------
#The following samtools command is too time-consuming
#samtools view -c ${Dataset_DIR}/${Sample_ID}/fastq/tophat2/${Sample_ID}'_primary_unique.bam' > ./Input/Condition_2/${Sample_ID}'_num_mapped_reads.txt'
#---------------------------
#The TopHat_Paired_End job already created a *_statistics_for_primary_unique_reads.txt
#Extract the number of uniquely_mapped_reads, then save it to a text file:
#Copy *_statistics_for_primary_unique_reads.txt to compute node:
cp ${Dataset_DIR}/${Sample_ID}/fastq/tophat2/$Sample_ID'_statistics_for_primary_unique_reads.txt' .
#Extract the number of uniquely_mapped_reads:
uniquely_mapped_reads=$(grep 'total' $Sample_ID'_statistics_for_primary_unique_reads.txt' | awk '{print $1}')
#save it to a text file:
echo $uniquely_mapped_reads >> ./Input/Condition_2/${Sample_ID}'_num_mapped_reads.txt'
#Remove the *_statistics_for_primary_unique_reads.txt
rm $Sample_ID'_statistics_for_primary_unique_reads.txt'
#---------------------------
done < Condition_2.temp
#---------------------------
echo "M_Num_Cond2_List:"
echo ${M_Num_Cond2_List}
#---------------------------
#Remove the temp file:
rm Condition_2.temp
echo "------------------------------------------"
###############################################
echo "=========================================================="
echo
echo 'Number of replicates in each condition:'
echo
cd Input/Condition_1
NUM_REP_CONDITION1=$(ls -l | grep '.out' | wc -l)
echo 'NUM_REP_CONDITION1: '$NUM_REP_CONDITION1
cd ../../
cd Input/Condition_2
NUM_REP_CONDITION2=$(ls -l | grep '.out' | wc -l)
echo 'NUM_REP_CONDITION1: '$NUM_REP_CONDITION1
#----------------------------------------------------------------------------------
#Need to check if NUM_REP_CONDITION* equals 1
#If there's only 1 replicate (1 sample) per condition can't run current DE analysis
#------------------------------------------
if [ "${NUM_REP_CONDITION1}" == "1" ] || [ "${NUM_REP_CONDITION2}" == "1" ] ;
then
echo
echo "WARNING:"
echo
echo "Either your Condition_1 or Condition_2 has only 1 sample (no replicates)"
echo "The differential expression analysis is designed for cases where there are at least 2 replicates/biological condition."
echo "The resulting differential expression analysis will be converted to a stringent version due to absence of replicate samples."
echo "Samples from this comparison will be treated as \"replicates\" to estimate the gene level variance in edgeR and, separately in, DESeq2."
echo "Expect very few to zero DE genes from DESeq2, EdgeR should yield a DE gene count."
echo "Interpreting the results: use the edgeR fold change or DESeq ratio (not DESeq fold change)"
echo "Note regarding the SEGEX upload file(s):"
echo "Since DESeq2 fold changes are not the same as the ratio of the normalized counts between the conditions, so the output of the segex upload file will be different between \"DESeq ratio\" and \"DESeq foldChange\". The former is truly the ratio between the normalized counts between the two conditions (plus pseudocounts) whereas the latter is the fold change estimated by DESeq."
fi
echo "=========================================================="
#----------------------------------------------------------------------------------
cd ../../
#Rename folder names to match CONDITION_1_NAME and CONDITION_2_NAME
mv ./Input/Condition_1 ./Input/$CONDITION_1_NAME
mv ./Input/Condition_2 ./Input/$CONDITION_2_NAME
#----------------------------------------------------------------------------------
echo
echo 'Renaming input count files'
echo
#The differentialAnalysis.R is expecting file names within each folder to also be a function of the condition names
#Use an array to rename the files
cd ${TMPDIR}/Input/$CONDITION_1_NAME
arr=(*.out)
for ((i=0; i<${#arr[@]}; i++)); do
#do something to each element of array
#echo 'Original file name:'
filename=${arr[$i]}
#echo "${filename}"
#Extract extension
name=${filename%\.out}
#echo $name
#----------------------------------------------------------------------------------
#Need an if statement to check if HTSeq was used
#If HTSeq was used, then remove last 5 lines (remove special counters)
if [[ $name == *"HTSeq"* ]] ; then
echo "Counting program: HTSeq"
echo "Removing last 5 lines (remove special counters)"
head -n -5 ${name}.out > ${name}.temp1
mv ${name}.temp1 ${name}.out
else
echo "Counting program: Not HTSeq"
echo "No lines removed."
fi
#----------------------------------------------------------------------------------
#echo 'Modified file name:'
mod_name="${CONDITION_1_NAME}${i}.out"
#echo ${mod_name}
#Rename the file
mv ${filename} ${TMPDIR}/Input/${mod_name}
mv ${filename%\_HTSeq.out}'_num_mapped_reads.txt' ${TMPDIR}/Input/${CONDITION_1_NAME}${i}'_num_mapped_reads.txt'
done
#Use an array to rename the files
cd ${TMPDIR}/Input/$CONDITION_2_NAME
arr=(*.out)
for ((i=0; i<${#arr[@]}; i++)); do
#do something to each element of array
#echo 'Original file name:'
filename=${arr[$i]}
#echo "${filename}"
#Extract extension
name=${filename%\.out}
#echo $name
#----------------------------------------------------------------------------------
#Need an if statement to check if HTSeq was used
#If HTSeq was used, then remove last 5 lines (remove special counters)
if [[ $name == *"HTSeq"* ]] ; then
echo "Counting program: HTSeq"
echo "Removing last 5 lines (remove special counters)"
head -n -5 ${name}.out > ${name}.temp1
mv ${name}.temp1 ${name}.out
else
echo "Counting program: Not HTSeq"
echo "No lines removed."
fi
#----------------------------------------------------------------------------------
#echo 'Modified file name:'
mod_name="${CONDITION_2_NAME}${i}.out"
#echo ${mod_name}
#Rename the file
mv ${filename} ${TMPDIR}/Input/${mod_name}
mv ${filename%\_HTSeq.out}'_num_mapped_reads.txt' ${TMPDIR}/Input/${CONDITION_2_NAME}${i}'_num_mapped_reads.txt'
done
#----------------------------------------------------------------------------------
#copying lengths
cp ${Lengths_DIR}/${GENE_LENGTHS_FILE} ${TMPDIR}/Input/.
echo "number of mapped reads"
cat ${TMPDIR}/Input/*_num_mapped_reads.txt
echo "=========================================================="
echo
echo "List files in Input"
echo
ls -al ${TMPDIR}/Input/*
echo "=========================================================="
#Initialize variables:
OUTPUT_COUNT_FOLDER=${TMPDIR}/Input
#----------------------------------------------------------------------------------
cd ${TMPDIR}
echo
echo 'Starting to run my commands'
echo
# run my commands.
#----------------------------------------------------------------------------------
echo 'Printing Rscript command:'
echo "Rscript differentialAnalysis.R $CONDITION_1_NAME $CONDITION_2_NAME ${NUM_REP_CONDITION1} ${NUM_REP_CONDITION2} $ANNOTATION_FILE $OUTPUT_COUNT_FOLDER $OUTPUT_PREFIX $GENE_LENGTHS_FILE"
#Run the command:
Rscript differentialAnalysis.R $CONDITION_1_NAME $CONDITION_2_NAME ${NUM_REP_CONDITION1} ${NUM_REP_CONDITION2} $ANNOTATION_FILE $OUTPUT_COUNT_FOLDER $OUTPUT_PREFIX $GENE_LENGTHS_FILE
#----------------------------------------------------------------------------------
#Output file will be in "Input folder"
echo "=========================================================="
echo
echo "Create SEGEX formatted file:"
echo
#cd in to Input folder to run script:
cd ${TMPDIR}/Input
#Move formatForSegex_ver3.R to Input folder
mv ${TMPDIR}/formatForSegex_ver3.R ${TMPDIR}/Input/
#Call the Rscript
#Rscript formatForSegex_ver3.R