#!/bin/bash -l ################################################################################## #Andy Rampersaud, 02.23.16 #This script is called by setup_UCSC_BigWig.sh ################################################################################## # Specify which shell to use #$ -S /bin/bash # Run on the current working directory #$ -cwd # Join standard output and error to a single file #$ -j n # change to y if you want a single qlog file ################################################################################## #Initialize variables from UCSC_BigWig.sh ################################################################################## #checking the command line arg #-ne : "is not equal to" if [ $# -ne 4 ] ; then echo "Need 4 arguments for the qsub command:" echo "qsub -N ${Job_Name}'_'${Sample_ID} -P waxmanlab -l h_rt=${TIME_LIMIT} Read_Strandness.qsub ${Sample_ID} ${Dataset_DIR} ${Sample_Labels_DIR} ${SCRIPT_DIR}" exit 0 fi #http://www.ibm.com/developerworks/library/l-bash-parameters/ #Note: If you have more than 9 parameters, you cannot use $10 to refer to the tenth one. You must first either process or save the first parameter ($1), then use the shift command to drop parameter 1 and move all remaining parameters down 1, so that $10 becomes $9 and so on. #http://unix.stackexchange.com/questions/104420/how-to-use-command-line-arguments-in-a-shell-script #If you need access more than 9 command line arguments, you can use the shift command. Example: shift 2 renames $3 to $1, $4 to $2 etc. #process the command line arguments Sample_ID=$1 Dataset_DIR=$2 Sample_Labels_DIR=$3 SCRIPT_DIR=$4 #Print variables (make sure they appear correctly): echo "-----------------------" echo "Start of variable list:" echo "-----------------------" echo "Sample_ID:" echo ${Sample_ID} echo "Dataset_DIR:" echo ${Dataset_DIR} echo "Sample_Labels_DIR:" echo ${Sample_Labels_DIR} echo "SCRIPT_DIR:" echo ${SCRIPT_DIR} echo "-----------------------" echo "End of variable list" echo "-----------------------" # Now let's keep track of some information just in case anything goes wrong echo "==========================================================" #Use to calculate job time: #Start_Time in seconds Start_Time=$(date +"%s") echo "Starting on : $(date)" echo "Running on node : $(hostname)" echo "Current directory : $(pwd)" echo "Current job ID : $JOB_ID" echo "Current job name : $JOB_NAME" echo "Task index number : $SGE_TASK_ID" echo "Parameter for multiple cores : $NSLOTS" echo "==========================================================" # Go to local scratch directory echo echo 'Change dir to scratch directory' echo cd ${TMPDIR} echo echo 'Print scratch directory location:' echo echo $TMPDIR #-------------------------------------- echo echo 'Loading required modules...' echo echo echo 'Loading required modules...' echo #Make sure the shebang line = #!/bin/bash -l #Need the -l option to load modules #Search for latest program installed: #module avail -t 2>&1 | grep -i samtools module load samtools/samtools-0.1.19_gnu446 #module avail -t 2>&1 | grep -i python module load python2.7 #module avail -t 2>&1 | grep -i R-2.15.1 module load R/R-2.15.1_gnu-4.4.6 #module avail -t 2>&1 | grep -i RSeQC module load rseqc/2.4 #-------------------------------------- #module help samtools/samtools-0.1.19_gnu446 #--------------------------------------------------------------------------------- #----------- Module Specific Help for 'samtools/samtools-0.1.19_gnu446' --------------------------- #SAMtools 0.1.19_gnu446: Suite of programs for interacting with high-throughput sequencing data. #SAM (Sequence Alignment/Map) format is a generic format for storing #large nucleotide sequence alignments. SAM Tools provide various utilities #for manipulating alignments in the SAM format, including sorting, merging, #indexing and generating alignments in a per-position format. #--------------------------------------------------------------------------------- #module help python2.7 #----------- Module Specific Help for 'python2.7/Python-2.7.3_gnu446' --------------------------- #python2.7 Python-2.7.3_gnu446: A general purpose, interpretive programming language. #For more information on python2.7, please see http://www.python.org #--------------------------------------------------------------------------------- #module help R/R-2.15.1_gnu-4.4.6 #----------- Module Specific Help for 'R/R-2.15.1_gnu-4.4.6' ------- #R 2.15.1 is a system (language) for statistical computation and graphics #<> #For more information on R, visit http://cran.r-project.org/ #--------------------------------------------------------------------------------- #module help rseqc/2.4 #----------- Module Specific Help for 'rseqc/2.4' ------------------ #rseqc 2.4 An RNA-seq Quality Control Package #RSeQC package provides a number of useful modules that can comprehensively evaluate high throughput sequence data especially RNA-seq data. Some basic modules quickly inspect sequence quality, nucleotide composition bias, PCR bias and GC bias, while RNA-seq specific modules evaluate sequencing saturation, mapped reads distribution, coverage uniformity, strand specificity, etc. #For more information on rseqc, please see http://rseqc.sourceforge.net/ #--------------------------------------------------------------------------------- # copy user input data files to scratch cp ${Dataset_DIR}/${Sample_ID}/fastq/tophat2/${Sample_ID}'_primary_unique.bam' . #Initialize INPUT_BAM: INPUT_BAM=${Sample_ID}'_primary_unique.bam' #Copy Read_Strandness required files: cp ${SCRIPT_DIR}/Input_Regions/mm9.refseq.bed.gz . STORAGE_DIR=${Dataset_DIR}/${Sample_ID}/fastq/tophat2 #Create Read_Strandness output folder to store files: OUTPUT_DIR='Read_Strandness' ############################### if [ ! -d $OUTPUT_DIR ]; then mkdir $OUTPUT_DIR fi ############################### #Create output file to store the result: OUTPUT_FILE=$OUTPUT_DIR/${Sample_ID}'_'Read_Strandness.txt ###################### if [ -f $OUTPUT_FILE ] then rm $OUTPUT_FILE else touch $OUTPUT_FILE fi ###################### echo echo 'Unzip files:' echo time gzip -d *.gz echo echo 'Finished unzipping' echo echo echo 'List files in scratch directory:' echo ls -alh echo echo 'Starting to run my commands' echo echo echo 'Starting infer_experiment.py command' echo #-------------------------------------------------------------------------------- #infer_experiment.py #Usage: infer_experiment.py [options] #Options: # --version show program's version number and exit # -h, --help show this help message and exit # -i INPUT_FILE, --input-file=INPUT_FILE # Input alignment file in SAM or BAM format # -r REFGENE_BED, --refgene=REFGENE_BED # Reference gene model in bed fomat. # -s SAMPLE_SIZE, --sample-size=SAMPLE_SIZE # Number of reads sampled from SAM/BAM file. # default=200000 # -q MAP_QUAL, --mapq=MAP_QUAL # Minimum mapping quality (phred scaled) for an # alignment to be called "uniquely mapped". default=30 #================================================================================================= #Infer RNA-seq experiment design from SAM/BAM file. This module will determine if the RNA-seq #experiment is: #1) pair-end or single-end #2) if experiment is strand-specific, how reads were stranded. # * For pair-end RNA-seq, there are two different ways to strand reads: # i) 1++,1--,2+-,2-+ # read1 mapped to '+' strand indicates parental gene on '+' strand # read1 mapped to '-' strand indicates parental gene on '-' strand # read2 mapped to '+' strand indicates parental gene on '-' strand # read2 mapped to '-' strand indicates parental gene on '+' strand # ii) 1+-,1-+,2++,2-- # read1 mapped to '+' strand indicates parental gene on '-' strand # read1 mapped to '-' strand indicates parental gene on '+' strand # read2 mapped to '+' strand indicates parental gene on '+' strand # read2 mapped to '-' strand indicates parental gene on '-' strand # * For single-end RNA-seq, there are two different ways to strand reads: # i) ++,-- # read mapped to '+' strand indicates parental gene on '+' strand # read mapped to '-' strand indicates parental gene on '-' strand # ii) +-,-+ # read mapped to '+' strand indicates parental gene on '-' strand # read mapped to '-' strand indicates parental gene on '+' strand #================================================================================================= #-------------------------------------------------------------------------------- echo 'Printing command:' echo "infer_experiment.py --input-file=${INPUT_BAM} --refgene=mm9.refseq.bed > $OUTPUT_FILE" echo #Run command: infer_experiment.py --input-file=${INPUT_BAM} --refgene=mm9.refseq.bed > $OUTPUT_FILE #-------------------------------------------------------------------------------- echo echo 'Finished infer_experiment.py command' echo #Need to remove first 2 blank lines: awk 'NR > 2 { print $0 }' $OUTPUT_FILE > temp1.txt mv temp1.txt $OUTPUT_FILE echo echo 'Output file from infer_experiment.py:' echo head $OUTPUT_FILE echo echo 'Copy output to storage dir' echo cp -r $OUTPUT_DIR $STORAGE_DIR echo echo "List files in scratch" echo ls -alh echo "==========================================================" echo "Finished on : $(date)" #Use to calculate job time: #End_Time in seconds End_Time=$(date +"%s") diff=$(($End_Time-$Start_Time)) echo "$(($diff / 3600)) hours, $((($diff / 60) % 60)) minutes and $(($diff % 60)) seconds elapsed." echo "=========================================================="