#!/bin/bash -l ################################################################################## #Andy Rampersaud, 02.22.16 #This script is called by setup_Extract_Counts.sh ################################################################################## # Specify which shell to use #$ -S /bin/bash # Run on the current working directory #$ -cwd # Join standard output and error to a single file #$ -j n # change to y if you want a single qlog file ################################################################################## #Initialize variables from Extract_Counts.sh ################################################################################## #checking the command line arg #-ne : "is not equal to" if [ $# -ne 10 ] ; then echo "Need 10 arguments for the qsub command:" echo "qsub -N ${Job_Name}'_'${Sample_ID} -P waxmanlab -l h_rt=${TIME_LIMIT} Extract_Counts.qsub ${Sample_ID} ${Dataset_DIR} ${Sample_Labels_DIR} ${GTF_Files_DIR} ${MODE} ${STRANDEDNESS_HTSeq} ${FEATURE_ID} ${SCRIPT_DIR} ${ANNOTATION_FILE} ${FEATURE_TYPE}" exit 0 fi #http://www.ibm.com/developerworks/library/l-bash-parameters/ #Note: If you have more than 9 parameters, you cannot use $10 to refer to the tenth one. You must first either process or save the first parameter ($1), then use the shift command to drop parameter 1 and move all remaining parameters down 1, so that $10 becomes $9 and so on. #http://unix.stackexchange.com/questions/104420/how-to-use-command-line-arguments-in-a-shell-script #If you need access more than 9 command line arguments, you can use the shift command. Example: shift 2 renames $3 to $1, $4 to $2 etc. #process the command line arguments Sample_ID=$1 Dataset_DIR=$2 Sample_Labels_DIR=$3 GTF_Files_DIR=$4 MODE=$5 STRANDEDNESS_HTSeq=$6 FEATURE_ID=$7 SCRIPT_DIR=$8 ANNOTATION_FILE=$9 #FEATURE_TYPE=$10 echo "-----------------------" echo "Need shift 1 command (have more than 9 arguments):" #The "shift" command allows users to move through an argument list (if there are more than 9 arguments) #It removes the arguments that occur at the beginning of the list and shifts arguments at the end of the list so that they are now within the list of 9 arguments #Example: shift 1 #This will remove arg #1 then shift args #2 - #10 so that they are numbered $1 - $9 #Now the user can save the arg #10 as $9 #Shift_Number = (Total number of args) - (9) #For this script: #Shift_Number = (Total number of args) - (9) = (10) - (9) = 1 #Hence: Need shift 4 command #Now I can save arg: #10 as $9 echo "-----------------------" shift 1 #process the command line arguments FEATURE_TYPE=$9 #Print variables (make sure they appear correctly): echo "-----------------------" echo "Start of variable list:" echo "-----------------------" echo "Sample_ID:" echo ${Sample_ID} echo "Dataset_DIR:" echo ${Dataset_DIR} echo "Sample_Labels_DIR:" echo ${Sample_Labels_DIR} echo "GTF_Files_DIR:" echo ${GTF_Files_DIR} echo "MODE:" echo ${MODE} echo "STRANDEDNESS_HTSeq:" echo ${STRANDEDNESS_HTSeq} echo "FEATURE_ID:" echo ${FEATURE_ID} echo "SCRIPT_DIR:" echo ${SCRIPT_DIR} echo "ANNOTATION_FILE:" echo ${ANNOTATION_FILE} echo "FEATURE_TYPE:" echo ${FEATURE_TYPE} echo "-----------------------" echo "End of variable list" echo "-----------------------" # Now let's keep track of some information just in case anything goes wrong echo "==========================================================" #Use to calculate job time: #Start_Time in seconds Start_Time=$(date +"%s") echo "Starting on : $(date)" echo "Running on node : $(hostname)" echo "Current directory : $(pwd)" echo "Current job ID : $JOB_ID" echo "Current job name : $JOB_NAME" echo "Task index number : $SGE_TASK_ID" echo "Parameter for multiple cores : $NSLOTS" echo "==========================================================" # Go to local scratch directory echo echo 'Change dir to scratch directory' echo cd ${TMPDIR} echo echo 'Print scratch directory location:' echo echo $TMPDIR #-------------------------------------- echo echo 'Loading required modules...' echo #Make sure the shebang line = #!/bin/bash -l #Need the -l option to load modules #Search for latest program installed: #module avail -t 2>&1 | grep -i python2.7 module load python2.7/Python-2.7.3_gnu446 #module avail -t 2>&1 | grep -i samtools module load samtools/samtools-0.1.19_gnu446 #module avail -t 2>&1 | grep -i HTSeq module load htseq/0.6.1p1 #module avail -t 2>&1 | grep -i bedtools module load bedtools/2.24.0 #---------------------------------------------------------------------------------- #module help python2.7/Python-2.7.3_gnu446 #----------- Module Specific Help for 'python2.7/Python-2.7.3_gnu446' --------------------------- #python2.7 Python-2.7.3_gnu446: A general purpose, interpretive programming language. #For more information on python2.7, please see http://www.python.org #---------------------------------------------------------------------------------- #module help samtools/samtools-0.1.19_gnu446 #----------- Module Specific Help for 'samtools/samtools-0.1.19_gnu446' --------------------------- #sets the environment for samtools (0.1.19) built using GNU Compilers #SAM (Sequence Alignment/Map) format is a generic format for storing #large nucleotide sequence alignments. SAM Tools provide various utilities #for manipulating alignments in the SAM format, including sorting, merging, #indexing and generating alignments in a per-position format. #http://samtools.sourceforge.net/ #---------------------------------------------------------------------------------- #module help htseq/0.6.1p1 #----------- Module Specific Help for 'htseq/0.6.1p1' -------------- #HTSeq 0.6.1p1: Analysing high-throughput sequencing data with Python. #Python package that provides infrastructure to process data from high-throughput sequencing assays. #For more information on htseq, please see http://www-huber.embl.de/users/anders/HTSeq #---------------------------------------------------------------------------------- # module help bedtools/2.24.0 #----------- Module Specific Help for 'bedtools/2.24.0' ------------ #bedtools 2.24.0 A fast, flexible toolset for genome arithmetic. #Collectively, the bedtools utilities are a swiss-army knife of tools for a wide-range of genomics analysis tasks. The most widely-used tools enable genome arithmetic: that is, set theory on the genome. For example, bedtools allows one to intersect, merge, count, complement, and shuffle genomic intervals from multiple files in widely-used genomic file formats such as BAM, BED, GFF/GTF, VCF. While each individual tool is designed to do a relatively simple task (e.g., intersect two interval files), quite sophisticated analyses can be conducted by combining multiple bedtools operations on the UNIX command line. #For more information on bedtools, please see http://bedtools.readthedocs.org/en/latest/index.html #---------------------------------------------------------------------------------- # copy user input data files to scratch cp ${Dataset_DIR}/${Sample_ID}/fastq/tophat2/${Sample_ID}'_primary_unique.bam' . #Copy gtf file (annotation file): cp ${GTF_Files_DIR}/${ANNOTATION_FILE} . #------------------------------------------ if [ "${ANNOTATION_FILE}" == "Exon_Only_Regions.gtf" ]; then #Copy Intron_Only gtf file (annotation file): cp ${GTF_Files_DIR}/Intron_Only_Regions.gtf . fi #------------------------------------------ #Initialize INPUT_BAM: INPUT_BAM=${Sample_ID}'_primary_unique.bam' #---------------------------------------------------------------------------------- #Make output dir: STORAGE_DIR=${Dataset_DIR}/${Sample_ID}/fastq/tophat2/HTSeq #Need if statements to name the OUTPUT_DIR depending on the ANNOTATION_FILE used: #------------------------------------------ if [ "${ANNOTATION_FILE}" == "genes.gtf" ]; then OUTPUT_DIR=$STORAGE_DIR/Illumina_GTF fi #------------------------------------------ if [ "${ANNOTATION_FILE}" == "RefSeq_GeneBody.gtf" ]; then OUTPUT_DIR=$STORAGE_DIR/RefSeq_Exon_GTF fi #------------------------------------------ if [ "${ANNOTATION_FILE}" == "Intron_Only_Regions.gtf" ]; then OUTPUT_DIR=$STORAGE_DIR/RefSeq_Intron_GTF fi #------------------------------------------ if [ "${ANNOTATION_FILE}" == "Exon_Only_Regions.gtf" ]; then OUTPUT_DIR=$STORAGE_DIR/RefSeq_Exon_Only_GTF fi #------------------------------------------ ############################## if [[ ! -d $OUTPUT_DIR ]]; then mkdir -p $OUTPUT_DIR else rm $OUTPUT_DIR/* fi ############################## OUTPUT_FILE=${Sample_ID}'_HTSeq.out' ###################### if [ -f $OUTPUT_FILE ] then rm $OUTPUT_FILE else touch $OUTPUT_FILE fi ###################### echo echo 'List files in scratch directory:' echo ls -alh echo echo 'Starting to run my commands' echo # run my commands. echo echo 'Starting samtools and htseq-count' echo ################################################################################## #--------------------------------------------------------------------------------- #For the "Exonic_Only" counts I need to process the BAM file to filter out reads that overlap Intron_Only_Regions #Need an if statement to check ANNOTATION_FILE used: if [ ${ANNOTATION_FILE} == "Exon_Only_Regions.gtf" ]; then echo echo 'Starting Intron_Only_Regions read overlap filter' echo #-v Only report those entries in A that have no overlap in B. Restricted by -f and -r. #-abam BAM file A. Each BAM alignment in A is compared to B in search of overlaps. Use “stdin” if passing A with a UNIX pipe: For example: samtools view -b | bedtools intersect -abam stdin -b genes.bed. Note: no longer necessary after version 2.19.0 #-b One or more BAM/BED/GFF/VCF file(s) “B”. Use “stdin” if passing B with a UNIX pipe. intersectBed -v -abam ${INPUT_BAM} -b Intron_Only_Regions.gtf > ${Sample_ID}'_primary_no_Intron_Only.bam' #Redefine the ${INPUT_BAM}: INPUT_BAM=${Sample_ID}'_primary_no_Intron_Only.bam' echo echo 'Ending Intron_Only_Regions read overlap filter' echo fi #End of if statement #--------------------------------------------------------------------------------- ################################################################################## ################################################################################## #--------------------------------------------------------------------------------- #For the "Intronic_Only" counts I need to process the BAM file to filter out splice junction reads #Need an if statement to check ANNOTATION_FILE used: if [ ${ANNOTATION_FILE} == "Intron_Only_Regions.gtf" ]; then echo echo 'Starting splice read filter' echo #Need a regular expression to specifically find these spliced reads: #Bunch of numbers then "M", bunch of numbers then "N", bunch of numbers then "M": # print only lines which do NOT match regex (emulates "grep -v") # NOT match regex: awk '$7 !~ /^[a-f]/' #--------------------------------------------------------------------------------- #Need the header (-h) #Need to use a regular expression match to extract header lines: #https://www.biostars.org/p/9247/ #Regular expression: #/^@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/ #"At" symbol then two letters, a tab, then letters, then letter and/or number, then a colon, some characters after then the end of the string samtools view -h ${INPUT_BAM} | awk '($0 ~ /^@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/)' > Header.txt #Need everything after the header information (do NOT match regex): samtools view -h ${INPUT_BAM} | awk '($0 !~ /^@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/)' > temp1.sam #Run awk command to filter out spliced reads: awk '($6 !~ /[0-9]+[M][0-9]+[N][0-9]+[M]/)' temp1.sam > temp2.sam #Add header: cat Header.txt temp2.sam > temp3.sam #Convert SAM to BAM: samtools view -bS temp3.sam > ${Sample_ID}'_primary_no_splice.bam' #Redefine the ${INPUT_BAM}: INPUT_BAM=${Sample_ID}'_primary_no_splice.bam' echo echo 'Ending splice read filter' echo fi #End of if statement #--------------------------------------------------------------------------------- ################################################################################## #Sort the input BAM file: #Usage: samtools sort [options] #Options: -n sort by read name samtools sort -n ${INPUT_BAM} ${Sample_ID}'_sorted' #Since we sorted by read name we need the following htseq-count option (required for Paired-end sequencing data): #--order=name #--------------------------------------------------------------------------------- #Usage: htseq-count [options] alignment_file gff_file #This script takes an alignment file in SAM/BAM format and a feature file in #GFF format and calculates for each feature the number of reads mapping to it. #See http://www-huber.embl.de/users/anders/HTSeq/doc/count.html for details. #Options: # -h, --help show this help message and exit # -f SAMTYPE, --format=SAMTYPE # type of data, either 'sam' or 'bam' # (default: sam) # -r ORDER, --order=ORDER # 'pos' or 'name'. Sorting order of # (default: name). Paired-end sequencing data must be # sorted either by position or by read name, and the # sorting order must be specified. Ignored for single- # end data. # -s STRANDED, --stranded=STRANDED # whether the data is from a strand-specific assay. # Specify 'yes', 'no', or 'reverse' (default: yes). # 'reverse' means 'yes' with reversed strand # interpretation # -a MINAQUAL, --minaqual=MINAQUAL # skip all reads with alignment quality lower than the # given minimum value (default: 10) # -t FEATURETYPE, --type=FEATURETYPE # feature type (3rd column in GFF file) to be used, all # features of other type are ignored (default, suitable # for Ensembl GTF files: exon) # -i IDATTR, --idattr=IDATTR # GFF attribute to be used as feature ID (default, # suitable for Ensembl GTF files: gene_id) # -m MODE, --mode=MODE mode to handle reads overlapping more than one feature # (choices: union, intersection-strict, intersection- # nonempty; default: union) # -o SAMOUT, --samout=SAMOUT # write out all SAM alignment records into an output SAM # file called SAMOUT, annotating each line with its # feature assignment (as an optional field with tag # 'XF') # -q, --quiet suppress progress report #Written by Simon Anders (sanders@fs.tum.de), European Molecular Biology #Laboratory (EMBL). (c) 2010. Released under the terms of the GNU General #Public License v3. Part of the 'HTSeq' framework, version 0.6.1p1. #--------------------------------------------------------------------------------- #Print the command echo "samtools view ${Sample_ID}'_sorted'.bam | htseq-count --stranded=${STRANDEDNESS_HTSeq} --type=${FEATURE_TYPE} --idattr=${FEATURE_ID} --order=name --mode=${MODE} - ${ANNOTATION_FILE} > ${OUTPUT_FILE}" #Run the command samtools view ${Sample_ID}'_sorted'.bam | htseq-count --stranded=${STRANDEDNESS_HTSeq} --type=${FEATURE_TYPE} --idattr=${FEATURE_ID} --order=name --mode=${MODE} - ${ANNOTATION_FILE} > ${OUTPUT_FILE} ################################################################################## echo echo 'Ending samtools and htseq-count' echo #--------------------------------------------------------------------------------- #For the "Intronic_Only" counts I need to process the output count file so that upload to SEGEX will work with the RefSeq platform #Need an if statement to check ANNOTATION_FILE used: if [ ${ANNOTATION_FILE} == "Intron_Only_Regions.gtf" ]; then #Copy RefSeq_Genes_Zero_Count.txt: cp ${SCRIPT_DIR}/RefSeq_Files/RefSeq_Genes_Zero_Count.txt . #Omit header from RefSeq_Genes_Zero_Count.txt tail -n +2 RefSeq_Genes_Zero_Count.txt > RefSeq_Genes_Zero_Count.temp #The HTSeq output file has 5 extra lines at the end (extract then add back) tail -n 5 ${OUTPUT_FILE} > HTSeq_Extra_Lines.txt #Print everything except last n lines (use negative number): head -n -5 ${OUTPUT_FILE} > ${OUTPUT_FILE}.temp1 #Concatenate files: cat ${OUTPUT_FILE}.temp1 RefSeq_Genes_Zero_Count.temp > ${OUTPUT_FILE}.temp2 #Sort file by gene symbol sort -k1,1 ${OUTPUT_FILE}.temp2 > ${OUTPUT_FILE}.temp3 #Concatenate HTSeq_Extra_Lines.txt back to count file: cat ${OUTPUT_FILE}.temp3 HTSeq_Extra_Lines.txt > ${OUTPUT_FILE}.temp4 #Rename the count file: mv ${OUTPUT_FILE}.temp4 ${OUTPUT_FILE} #Remove temp files: rm *.temp* fi #--------------------------------------------------------------------------------- # copy the output files to users storage dir cp ${OUTPUT_FILE} ${OUTPUT_DIR} #------------------------------------------- #Note that the output file will be summarized by gene symbol and the last 5 lines will have extra counts #Example output file: #wc -l G110_M1_HTSeq.out #24202 G110_M1_HTSeq.out #I was expecting there would be 24197 unique gene symbols #head G110_M1_HTSeq.out #0610005C13Rik 2655 #0610007P14Rik 310 #0610009B22Rik 151 #0610009L18Rik 7 #0610009O20Rik 602 #0610010B08Rik 9 #0610010F05Rik 348 #0610010K14Rik 194 #0610011F06Rik 664 #0610012G03Rik 163 #tail G110_M1_HTSeq.out #Zyx 793 #Zzef1 1019 #Zzz3 719 #a 0 #l7Rn6 106 #no_feature 4226540 #ambiguous 59841 #too_low_aQual 0 #not_aligned 0 #alignment_not_unique 1677774 #Excluding these 5 lines, I get the expected 24197 #------------------------------------------- echo echo "List files in scratch" echo ls -alh echo "==========================================================" echo "Finished on : $(date)" #Use to calculate job time: #End_Time in seconds End_Time=$(date +"%s") diff=$(($End_Time-$Start_Time)) echo "$(($diff / 3600)) hours, $((($diff / 60) % 60)) minutes and $(($diff % 60)) seconds elapsed." echo "=========================================================="