#!/bin/bash -l ################################################################################## #Andy Rampersaud, 02.22.16 #This script is called by Read_Length.sh ################################################################################## # Specify which shell to use #$ -S /bin/bash # Run on the current working directory #$ -cwd # Join standard output and error to a single file #$ -j n # change to y if you want a single qlog file ################################################################################## #Initialize variables from Read_Length.sh ################################################################################## #checking the command line arg #-ne : "is not equal to" if [ $# -ne 4 ] ; then echo "Need 4 arguments for the qsub command:" echo "qsub -N ${Job_Name}'_'${Sample_ID} -P waxmanlab -l h_rt=${TIME_LIMIT} Read_Length.qsub ${Sample_ID} ${Dataset_DIR} ${Sample_Labels_DIR} ${OUTPUT_DIR}" exit 0 fi #process the command line arguments Sample_ID=$1 Dataset_DIR=$2 Sample_Labels_DIR=$3 OUTPUT_DIR=$4 #Print variables (make sure they appear correctly): echo "-----------------------" echo "Start of variable list:" echo "-----------------------" echo "Sample_ID:" echo ${Sample_ID} echo "Dataset_DIR:" echo ${Dataset_DIR} echo "Sample_Labels_DIR:" echo ${Sample_Labels_DIR} echo "-----------------------" echo "End of variable list" echo "-----------------------" # Now let's keep track of some information just in case anything goes wrong echo "==========================================================" #Use to calculate job time: #Start_Time in seconds Start_Time=$(date +"%s") echo "Starting on : $(date)" echo "Running on node : $(hostname)" echo "Current directory : $(pwd)" echo "Current job ID : $JOB_ID" echo "Current job name : $JOB_NAME" echo "Task index number : $SGE_TASK_ID" echo "Parameter for multiple cores : $NSLOTS" echo "==========================================================" # Go to local scratch directory echo echo 'Change dir to scratch directory' echo cd ${TMPDIR} echo echo 'Print scratch directory location:' echo echo $TMPDIR #-------------------------------------- echo echo 'Loading required modules...' echo #Make sure the shebang line = #!/bin/bash -l #Need the -l option to load modules #No modules needed #-------------------------------------- #----------------------------------------------------- # copy user input data files to scratch #Copy over the *R1*.fastq.gz file(s): cp ${Dataset_DIR}/${Sample_ID}/fastq/*_1.fastq.gz . #Copy over the *R2*.fastq.gz file(s): cp ${Dataset_DIR}/${Sample_ID}/fastq/*_2.fastq.gz . #----------------------------------------------------- echo echo 'List files in scratch directory:' echo ls -alh echo echo 'Starting to run my commands' echo echo echo 'Unzip files:' echo time gzip -d *.gz echo echo 'Finished unzipping' echo # run my commands. echo echo 'Starting Read Length' echo OUTPUT_FILE=$OUTPUT_DIR/${Sample_ID}'_read_length.txt' ###################### if [ -f $OUTPUT_FILE ] then rm $OUTPUT_FILE else touch $OUTPUT_FILE fi ###################### #----------------------------------------------------------------------------------- #awk Command Explanation: #http://www.biostars.org/p/72433/ #For every 2nd line in group of 4 lines (NR%4 == 2): #Count the number of characters in the read sequence (length($0)) #Store that count in the array (lengths) #Record the frequency of distinct counts in array (lengths) ({lengths[length($0)]++}) #End the loop (END) #Print the output: #For each element (L) in array (lengths) (for (L in lengths)) #Print the element (L), tab, frequency of (L) ({print L "\t" lengths[L]}) #Input file: sample.fastq, Output file: Read_Length.txt #Alternate explanation: #It reads like this: every second line in every group of 4 lines (the sequence line), measure the length of the sequence and increment the array cell corresponding to that length. When all lines have been read, loop over the array to print its content. In awk, arrays and array cells are initialized when they are called for the first time, no need to initialize them before. #----------------------------------------------------------------------------------- #Run this awk command for each fastq file: FASTQ_LIST=*.fastq for FASTQ_FILE in ${FASTQ_LIST} do echo echo "Processing "${FASTQ_FILE} echo #Need a "-v" for every variable awk -v Sample_ID=${Sample_ID} -v FASTQ_FILE=${FASTQ_FILE} 'NR%4 == 2 {lengths[length($0)]++} END {for (L in lengths) {print Sample_ID "\t" FASTQ_FILE "\t" L "\t" lengths[L]}}' ${FASTQ_FILE} >> $OUTPUT_FILE done echo echo 'Ending Read Length' echo echo echo "List files in scratch" echo ls -alh echo "==========================================================" echo "Finished on : $(date)" #Use to calculate job time: #End_Time in seconds End_Time=$(date +"%s") diff=$(($End_Time-$Start_Time)) echo "$(($diff / 3600)) hours, $((($diff / 60) % 60)) minutes and $(($diff % 60)) seconds elapsed." echo "=========================================================="