#!/bin/bash -l ################################################################################## #Andy Rampersaud, 02.22.16 #This script is called by setup_Extract_Counts.sh ################################################################################## # Specify which shell to use #$ -S /bin/bash # Run on the current working directory #$ -cwd # Join standard output and error to a single file #$ -j n # change to y if you want a single qlog file ################################################################################## #Initialize variables from Extract_Counts.sh ################################################################################## #checking the command line arg #-ne : "is not equal to" if [ $# -ne 9 ] ; then echo "Need 9 arguments for the qsub command:" echo "qsub -N ${Job_Name}'_'${Sample_ID} -P waxmanlab -l h_rt=${TIME_LIMIT} Extract_Counts.qsub ${Sample_ID} ${Dataset_DIR} ${Sample_Labels_DIR} ${GTF_Files_DIR} ${STRANDEDNESS_featureCount} ${FEATURE_ID} ${SCRIPT_DIR} ${ANNOTATION_FILE} ${FEATURE_TYPE}" exit 0 fi #http://www.ibm.com/developerworks/library/l-bash-parameters/ #Note: If you have more than 9 parameters, you cannot use $10 to refer to the tenth one. You must first either process or save the first parameter ($1), then use the shift command to drop parameter 1 and move all remaining parameters down 1, so that $10 becomes $9 and so on. #http://unix.stackexchange.com/questions/104420/how-to-use-command-line-arguments-in-a-shell-script #If you need access more than 9 command line arguments, you can use the shift command. Example: shift 2 renames $3 to $1, $4 to $2 etc. #process the command line arguments Sample_ID=$1 Dataset_DIR=$2 Sample_Labels_DIR=$3 GTF_Files_DIR=$4 STRANDEDNESS_featureCount=$5 FEATURE_ID=$6 SCRIPT_DIR=$7 ANNOTATION_FILE=$8 #FEATURE_TYPE=$10 #echo "-----------------------" #echo "Need shift 1 command (have more than 9 arguments):" ##The "shift" command allows users to move through an argument list (if there are more than 9 arguments) ##It removes the arguments that occur at the beginning of the list and shifts arguments at the end of the list so that they are now within the list of 9 arguments ##Example: shift 1 ##This will remove arg #1 then shift args #2 - #10 so that they are numbered $1 - $9 ##Now the user can save the arg #10 as $9 ##Shift_Number = (Total number of args) - (9) ##For this script: ##Shift_Number = (Total number of args) - (9) = (10) - (9) = 1 ##Hence: Need shift 4 command ##Now I can save arg: ##10 as $9 #echo "-----------------------" #shift 1 #process the command line arguments FEATURE_TYPE=$9 #Print variables (make sure they appear correctly): echo "-----------------------" echo "Start of variable list:" echo "-----------------------" echo "Sample_ID:" echo ${Sample_ID} echo "Dataset_DIR:" echo ${Dataset_DIR} echo "Sample_Labels_DIR:" echo ${Sample_Labels_DIR} echo "GTF_Files_DIR:" echo ${GTF_Files_DIR} echo "STRANDEDNESS_featureCount:" echo ${STRANDEDNESS_featureCount} echo "FEATURE_ID:" echo ${FEATURE_ID} echo "SCRIPT_DIR:" echo ${SCRIPT_DIR} echo "ANNOTATION_FILE:" echo ${ANNOTATION_FILE} echo "FEATURE_TYPE:" echo ${FEATURE_TYPE} echo "-----------------------" echo "End of variable list" echo "-----------------------" #You can use option "-pe omp N", where N is the number of cores ( number of course on our system can be any integer between 1 and 16): #$ -pe omp 16 # Now let's keep track of some information just in case anything goes wrong echo "==========================================================" #Use to calculate job time: #Start_Time in seconds Start_Time=$(date +"%s") echo "Starting on : $(date)" echo "Running on node : $(hostname)" echo "Current directory : $(pwd)" echo "Current job ID : $JOB_ID" echo "Current job name : $JOB_NAME" echo "Task index number : $SGE_TASK_ID" echo "Parameter for multiple cores : $NSLOTS" echo "==========================================================" # Go to local scratch directory echo echo 'Change dir to scratch directory' echo cd ${TMPDIR} echo echo 'Print scratch directory location:' echo echo $TMPDIR #-------------------------------------- echo echo 'Loading required modules...' echo #Make sure the shebang line = #!/bin/bash -l #Need the -l option to load modules #Search for latest program installed: #module avail -t 2>&1 | grep -i samtools module load samtools/samtools-0.1.19_gnu446 #module avail -t 2>&1 | grep -i subread module load subread/1.4.6_p5 #module avail -t 2>&1 | grep -i bedtools module load bedtools/2.24.0 #---------------------------------------------------------------------------------- #module help samtools/samtools-0.1.19_gnu446 #----------- Module Specific Help for 'samtools/samtools-0.1.19_gnu446' --------------------------- #sets the environment for samtools (0.1.19) built using GNU Compilers #SAM (Sequence Alignment/Map) format is a generic format for storing #large nucleotide sequence alignments. SAM Tools provide various utilities #for manipulating alignments in the SAM format, including sorting, merging, #indexing and generating alignments in a per-position format. #http://samtools.sourceforge.net/ #---------------------------------------------------------------------------------- #module help subread/1.4.6_p5 #----------- Module Specific Help for 'subread/1.4.6_p5' ----------- #subread 1.4.6_p5 High-performance read alignment, quantification, and mutation discovery #The Subread package comprises a suite of software programs for processing next-gen sequencing read data including: # - Subread: an accurate and efficient aligner for mapping both genomic DNA-seq reads and RNA-seq reads (for the purpose of expression analysis). # - Subjunc: an RNA-seq aligner suitable for all purposes of RNA-seq analyses. # - featureCounts: a highly efficient and accurate read summarization program. # - exactSNP: a SNP caller that discovers SNPs by testing signals against local background noises. #These programs were also implemented in Bioconductor R package Rsubread. #For more information on subread, please see http://subread.sourceforge.net/ #---------------------------------------------------------------------------------- # module help bedtools/2.24.0 #----------- Module Specific Help for 'bedtools/2.24.0' ------------ #bedtools 2.24.0 A fast, flexible toolset for genome arithmetic. #Collectively, the bedtools utilities are a swiss-army knife of tools for a wide-range of genomics analysis tasks. The most widely-used tools enable genome arithmetic: that is, set theory on the genome. For example, bedtools allows one to intersect, merge, count, complement, and shuffle genomic intervals from multiple files in widely-used genomic file formats such as BAM, BED, GFF/GTF, VCF. While each individual tool is designed to do a relatively simple task (e.g., intersect two interval files), quite sophisticated analyses can be conducted by combining multiple bedtools operations on the UNIX command line. #For more information on bedtools, please see http://bedtools.readthedocs.org/en/latest/index.html #---------------------------------------------------------------------------------- # copy user input data files to scratch cp ${Dataset_DIR}/${Sample_ID}/fastq/tophat2/${Sample_ID}'_primary_unique.bam' . #Copy gtf file (annotation file): cp ${GTF_Files_DIR}/${ANNOTATION_FILE} . #------------------------------------------ if [ "${ANNOTATION_FILE}" == "Exon_Only_Regions.gtf" ]; then #Copy Intron_Only gtf file (annotation file): cp ${GTF_Files_DIR}/Intron_Only_Regions.gtf . elif [ "${ANNOTATION_FILE}" == "exonic_only_gene_models_ncRNA_for_counting.gtf" ]; then #Copy intronic_only_gene_models_ncRNA_for_counting gtf file (annotation file): #rename the intronic_only gtf file as Intron_Only_Regions.gtf, such that I can reuse Andy's if statements cp ${GTF_Files_DIR}/intronic_only_gene_models_ncRNA_for_counting.gtf Intron_Only_Regions.gtf fi #------------------------------------------ #Initialize INPUT_BAM: INPUT_BAM=${Sample_ID}'_primary_unique.bam' #---------------------------------------------------------------------------------- #Make output dir: STORAGE_DIR=${Dataset_DIR}/${Sample_ID}/fastq/tophat2/featureCounts #Need if statements to name the OUTPUT_DIR depending on the ANNOTATION_FILE used: #------------------------------------------ if [ "${ANNOTATION_FILE}" == "genes.gtf" ]; then OUTPUT_DIR=$STORAGE_DIR/Illumina_GTF fi #------------------------------------------ if [ "${ANNOTATION_FILE}" == "RefSeq_GeneBody.gtf" ]; then OUTPUT_DIR=$STORAGE_DIR/RefSeq_Exon_GTF fi #------------------------------------------ if [ "${ANNOTATION_FILE}" == "Intron_Only_Regions.gtf" ]; then OUTPUT_DIR=$STORAGE_DIR/RefSeq_Intron_GTF fi #------------------------------------------ if [ "${ANNOTATION_FILE}" == "Exon_Only_Regions.gtf" ]; then OUTPUT_DIR=$STORAGE_DIR/RefSeq_Exon_Only_GTF fi #Start of lncRNA gtf #------------------------------------------ if [ "${ANNOTATION_FILE}" == "ncRNA_exon_for_counting.gtf" ]; then OUTPUT_DIR=$STORAGE_DIR/LncRNA_Exon_Collapsed_GTF fi #--------------------------------------------------------------------------------- if [ "${ANNOTATION_FILE}" == "ncRNA_genebodies_for_counting.gtf" ]; then OUTPUT_DIR=$STORAGE_DIR/LncRNA_GeneBody_GTF fi #--------------------------------------------------------------------------------- if [ "${ANNOTATION_FILE}" == "intronic_only_gene_models_ncRNA_for_counting.gtf" ]; then OUTPUT_DIR=$STORAGE_DIR/LncRNA_Intronic_Only_GTF fi #--------------------------------------------------------------------------------- if [ "${ANNOTATION_FILE}" == "exonic_only_gene_models_ncRNA_for_counting.gtf" ]; then OUTPUT_DIR=$STORAGE_DIR/LncRNA_Exonic_Only_GTF fi #------------------------------------------ ############################## if [[ ! -d $OUTPUT_DIR ]]; then mkdir -p $OUTPUT_DIR else rm $OUTPUT_DIR/* fi ############################## OUTPUT_FILE=${Sample_ID}'_featureCounts.out' ###################### if [ -f $OUTPUT_FILE ] then rm $OUTPUT_FILE else touch $OUTPUT_FILE fi ###################### echo echo 'List files in scratch directory:' echo ls -alh echo echo 'Starting to run my commands' echo # run my commands. echo echo 'Starting featureCounts' echo ################################################################################## #--------------------------------------------------------------------------------- #For the "Exonic_Only" counts I need to process the BAM file to filter out reads that overlap Intron_Only_Regions #Need an if statement to check ANNOTATION_FILE used: if [ ${ANNOTATION_FILE} == "Exon_Only_Regions.gtf" -o "${ANNOTATION_FILE}" == "exonic_only_gene_models_ncRNA_for_counting.gtf" ]; then echo echo 'Starting Intron_Only_Regions read overlap filter' echo #-v Only report those entries in A that have no overlap in B. Restricted by -f and -r. #-abam BAM file A. Each BAM alignment in A is compared to B in search of overlaps. Use “stdin” if passing A with a UNIX pipe: For example: samtools view -b | bedtools intersect -abam stdin -b genes.bed. Note: no longer necessary after version 2.19.0 #-b One or more BAM/BED/GFF/VCF file(s) “B”. Use “stdin” if passing B with a UNIX pipe. intersectBed -v -abam ${INPUT_BAM} -b Intron_Only_Regions.gtf > ${Sample_ID}'_primary_no_Intron_Only.bam' #Redefine the ${INPUT_BAM}: INPUT_BAM=${Sample_ID}'_primary_no_Intron_Only.bam' echo echo 'Ending Intron_Only_Regions read overlap filter' echo fi #End of if statement #--------------------------------------------------------------------------------- ################################################################################## #--------------------------------------------------------------------------------- #For the "Intronic_Only" counts I need to process the BAM file to filter out splice junction reads #Need an if statement to check ANNOTATION_FILE used: if [ ${ANNOTATION_FILE} == "Intron_Only_Regions.gtf" -o "${ANNOTATION_FILE}" == "intronic_only_gene_models_ncRNA_for_counting.gtf" ]; then echo echo 'Starting splice read filter' echo #Need a regular expression to specifically find these spliced reads: #Bunch of numbers then "M", bunch of numbers then "N", bunch of numbers then "M": # print only lines which do NOT match regex (emulates "grep -v") # NOT match regex: awk '$7 !~ /^[a-f]/' #--------------------------------------------------------------------------------- #Need the header (-h) #Need to use a regular expression match to extract header lines: #https://www.biostars.org/p/9247/ #Regular expression: #/^@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/ #"At" symbol then two letters, a tab, then letters, then letter and/or number, then a colon, some characters after then the end of the string samtools view -h ${INPUT_BAM} | awk '($0 ~ /^@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/)' > Header.txt #Need everything after the header information (do NOT match regex): samtools view -h ${INPUT_BAM} | awk '($0 !~ /^@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/)' > temp1.sam #Run awk command to filter out spliced reads: awk '($6 !~ /[0-9]+[M][0-9]+[N][0-9]+[M]/)' temp1.sam > temp2.sam #Add header: cat Header.txt temp2.sam > temp3.sam #Convert SAM to BAM: samtools view -bS temp3.sam > ${Sample_ID}'_primary_no_splice.bam' #Redefine the ${INPUT_BAM}: INPUT_BAM=${Sample_ID}'_primary_no_splice.bam' #Remove temp files: rm temp*.sam rm Header.txt echo echo 'Ending splice read filter' echo fi #End of if statement #--------------------------------------------------------------------------------- ################################################################################## #Sort the input BAM file: #Usage: samtools sort [options] #Options: -n sort by read name samtools sort -n ${INPUT_BAM} ${Sample_ID}'_sorted' #--------------------------------------------------------------------------------- #featureCounts #Version 1.4.6-p5 #Usage: featureCounts [options] -a -o input_file1 [input_file2] ... # Required parameters: # -a Give the name of the annotation file. The program assumes # that the provided annotation file is in GTF format. Use -F # option to specify other annotation formats. # # -o Give the name of the output file. The output file contains # the number of reads assigned to each meta-feature (or each # feature if -f is specified). A meta-feature is the aggregation # of features, grouped by using gene identifiers. Please refer # to the users guide for more details. # # input_files Give the names of input read files that include the read # mapping results. Format of input files is automatically # determined (SAM or BAM). Paired-end reads will be # automatically re-ordered if it is found that reads from the # same pair are not adjacent to each other. Multiple files can # be provided at the same time. # # Optional parameters: # # -A Specify the name of a file including aliases of chromosome # names. The file should be a comma delimited text file that # includes two columns. The first column gives the chromosome # names used in the annotation and the second column gives the # chromosome names used by reads. This file should not contain # header lines. Names included in this file are case sensitive. # # -F Specify the format of the annotation file. Acceptable formats # include `GTF' and `SAF'. `GTF' by default. Please refer to the # users guide for SAF annotation format. # # -t Specify the feature type. Only rows which have the matched # matched feature type in the provided GTF annotation file # will be included for read counting. `exon' by default. # # -g Specify the attribute type used to group features (eg. exons) # into meta-features (eg. genes), when GTF annotation is provided. # `gene_id' by default. This attribute type is usually the gene # identifier. This argument is useful for the meta-feature level # summarization. # # -f If specified, read summarization will be performed at the # feature level (eg. exon level). Otherwise, it is performed at # meta-feature level (eg. gene level). # # -O If specified, reads (or fragments if -p is specified) will # be allowed to be assigned to more than one matched meta- # feature (or feature if -f is specified). # # -s Indicate if strand-specific read counting should be performed. # It has three possible values: 0 (unstranded), 1 (stranded) and # 2 (reversely stranded). 0 by default. # # -M If specified, multi-mapping reads/fragments will be counted (ie. # a multi-mapping read will be counted up to N times if it has N # reported mapping locations). The program uses the `NH' tag to # find multi-mapping reads. # # -Q The minimum mapping quality score a read must satisfy in order # to be counted. For paired-end reads, at least one end should # satisfy this criteria. 0 by default. # # -T Number of the threads. 1 by default. # # -v Output version of the program. # # -R Output read counting result for each read/fragment. For each # input read file, read counting results for reads/fragments will # be saved to a tab-delimited file that contains four columns # including read name, status(assigned or the reason if not # assigned), name of target feature/meta-feature and number of # hits if the read/fragment is counted multiple times. Name of # the file is the same as name of the input read file except a # suffix `.featureCounts' is added. # # --largestOverlap If specified, reads (or fragments) will be # assigned to the target that has the largest number of overlapping # bases. # # --minOverlap Specify the minimum required number of # overlapping bases between a read (or a fragment) and a feature. # 1 by default. If a negative value is provided, the read will be # extended from both ends. # # --readExtension5 Reads are extended upstream by bases from # their 5' end. # # --readExtension3 Reads are extended upstream by bases from # their 3' end. # # --read2pos <5:3> The read is reduced to its 5' most base or 3' # most base. Read summarization is then performed based on the # single base which the read is reduced to. # # --fraction If specified, a fractional count 1/n will be generated for each # multi-mapping read, where n is the number of alignments (indica- # ted by 'NH' tag) reported for the read. This option must be used # together with the '-M' option. # # --primary If specified, only primary alignments will be counted. Primary # and secondary alignments are identified using bit 0x100 in the # Flag field of SAM/BAM files. All primary alignments in a dataset # will be counted no matter they are from multi-mapping reads or # not ('-M' is ignored). # # --ignoreDup If specified, reads that were marked as # duplicates will be ignored. Bit Ox400 in FLAG field of SAM/BAM # file is used for identifying duplicate reads. In paired end # data, the entire read pair will be ignored if at least one end # is found to be a duplicate read. # # --countSplitAlignmentsOnly If specified, only split alignments (CIGAR # strings containing letter `N') will be counted. All the other # alignments will be ignored. An example of split alignments is # the exon-spanning reads in RNA-seq data. # # Optional paired-end parameters: # # -p If specified, fragments (or templates) will be counted instead # of reads. This option is only applicable for paired-end reads. # The two reads from the same fragment must be adjacent to each # other in the provided SAM/BAM file. # # -P If specified, paired-end distance will be checked when assigning # fragments to meta-features or features. This option is only # applicable when -p is specified. The distance thresholds should # be specified using -d and -D options. # # -d Minimum fragment/template length, 50 by default. # # -D Maximum fragment/template length, 600 by default. # # -B If specified, only fragments that have both ends # successfully aligned will be considered for summarization. # This option is only applicable for paired-end reads. # # -S Orientation of the two read from the same pair, 'fr' by # by default. # # -C If specified, the chimeric fragments (those fragments that # have their two ends aligned to different chromosomes) will # NOT be included for summarization. This option is only # applicable for paired-end read data. # # --donotsort If specified, paired end reads will not be reordered even if # reads from the same pair were found not to be next to each other # in the input. #--------------------------------------------------------------------------------- #For paired-end data we want the (-p) option #The (-B) option seems logical but the only effect is a slightly lowered read count #Leave out the (-B) option (get overall counts closer to HTSeq counts) #Need an if statement to check ANNOTATION_FILE used: if [ ${ANNOTATION_FILE} == "genes.gtf" -o \ ${ANNOTATION_FILE} == "ncRNA_exon_for_counting.gtf" -o \ ${ANNOTATION_FILE} == "ncRNA_genebodies_for_counting.gtf" -o \ ${ANNOTATION_FILE} == "intronic_only_gene_models_ncRNA_for_counting.gtf" -o \ ${ANNOTATION_FILE} == "exonic_only_gene_models_ncRNA_for_counting.gtf" ]; then #Print the command: echo "featureCounts -p -T $NSLOTS -s ${STRANDEDNESS_featureCount} -g ${FEATURE_ID} -t ${FEATURE_TYPE} -a ${ANNOTATION_FILE} -o ${OUTPUT_FILE} ${Sample_ID}'_sorted'.bam" #Run the command: featureCounts -p -T $NSLOTS -s ${STRANDEDNESS_featureCount} -g ${FEATURE_ID} -t ${FEATURE_TYPE} -a ${ANNOTATION_FILE} -o ${OUTPUT_FILE} ${Sample_ID}'_sorted'.bam fi #--------------------------------------------------------------------------------- #Dealing with overlapping gene symbols, need to run the Parse_GTF.R #Parse_GTF.R will create 2 GTF files from the ${ANNOTATION_FILE} #1st GTF file for modified counting #2nd GTF file for normal counting #--------------------------------------------------------------------------------- #Need an if statement to check ANNOTATION_FILE used: if [ ${ANNOTATION_FILE} != "genes.gtf" -a \ ${ANNOTATION_FILE} != "ncRNA_exon_for_counting.gtf" -a \ ${ANNOTATION_FILE} != "ncRNA_genebodies_for_counting.gtf" -a \ ${ANNOTATION_FILE} != "intronic_only_gene_models_ncRNA_for_counting.gtf" -a \ ${ANNOTATION_FILE} != "exonic_only_gene_models_ncRNA_for_counting.gtf" ]; then #GTF file name determines the GeneSym_List.txt #Extract extension GTF_name=${ANNOTATION_FILE%\.gtf} echo "GTF file name: "${GTF_name} #Copy GeneSym_List.txt: cp ${GTF_Files_DIR}/featureCount_Files/${GTF_name}/GeneSym_DoubleCounts/GeneSym_List.txt . #Copy Parse_GTF.R: cp ${SCRIPT_DIR}/Scripts/Parse_GTF.R . #Print the command: echo "Rscript Parse_GTF.R ${ANNOTATION_FILE} GeneSym_List.txt" #Run the command: Rscript Parse_GTF.R ${ANNOTATION_FILE} GeneSym_List.txt echo "Running assign_all_features counting" # -O If specified, reads (or fragments if -p is specified) will # be allowed to be assigned to more than one matched meta- # feature (or feature if -f is specified). #Print the command: echo "featureCounts -O -p -T $NSLOTS -s ${STRANDEDNESS_featureCount} -g ${FEATURE_ID} -t ${FEATURE_TYPE} -a GeneSym_assign_all_features.gtf -o ${Sample_ID}"_assign_all_features.out" ${Sample_ID}'_sorted'.bam" #Run the command: featureCounts -O -p -T $NSLOTS -s ${STRANDEDNESS_featureCount} -g ${FEATURE_ID} -t ${FEATURE_TYPE} -a GeneSym_assign_all_features.gtf -o ${Sample_ID}"_assign_all_features.out" ${Sample_ID}'_sorted'.bam #--------------------------------------------------------------------------------- echo "Running assign_only1_feature counting" #Print the command: echo "featureCounts -p -T $NSLOTS -s ${STRANDEDNESS_featureCount} -g ${FEATURE_ID} -t ${FEATURE_TYPE} -a GeneSym_assign_only1_feature.gtf -o ${Sample_ID}"_assign_only1_feature.out" ${Sample_ID}'_sorted'.bam" #Run the command: featureCounts -p -T $NSLOTS -s ${STRANDEDNESS_featureCount} -g ${FEATURE_ID} -t ${FEATURE_TYPE} -a GeneSym_assign_only1_feature.gtf -o ${Sample_ID}"_assign_only1_feature.out" ${Sample_ID}'_sorted'.bam #--------------------------------------------------------------------------------- #The featureCounts output file needs to be reformatted for use by DiffExp scripts file_list=*_assign_*.out for file in ${file_list} do echo "Processing "${file} #Get rid of first 2 header lines: tail -n +3 ${file} > ${file}.temp1 #Just want the first and last column #NF is always the last column awk -F '\t' '{print $1"\t"$NF}' ${file}.temp1 > ${file}.temp2 #Rename the count file: mv ${file}.temp2 ${file} #Remove temp files: rm *.temp* echo "Done processing file." done #--------------------------------------------------------------------------------- #Need to concatenate summary files #Need to add headers to indicate counting method echo "${Sample_ID}"_assign_all_features.out >> ${OUTPUT_FILE}.summary cat ${Sample_ID}"_assign_all_features.out.summary" >> ${OUTPUT_FILE}.summary echo "${Sample_ID}"_assign_only1_feature.out >> ${OUTPUT_FILE}.summary cat ${Sample_ID}"_assign_only1_feature.out.summary" >> ${OUTPUT_FILE}.summary #--------------------------------------------------------------------------------- #Concatenate files then sort by 1st column cat *_assign_*.out > ${OUTPUT_FILE}.temp1 sort -k1,1 ${OUTPUT_FILE}.temp1 > ${OUTPUT_FILE}.temp2 mv ${OUTPUT_FILE}.temp2 ${OUTPUT_FILE} #Remove temp files: rm ${OUTPUT_FILE}.temp* #--------------------------------------------------------------------------------- fi #End if statement to check ANNOTATION_FILE #--------------------------------------------------------------------------------- ################################################################################## echo echo 'Ending featureCounts' echo #--------------------------------------------------------------------------------- #Example featureCounts output file: ## Program:featureCounts v1.4.6-p5; Command:"featureCounts" "-T" "16" "-s" "0" "-g" "gene_id" "-t" "exon" "-a" "RefSeq_GeneBody.gtf" "-o" "G83_M1_featureCounts.out" "G83_M1_sorted.bam" #Geneid Chr Start End Strand Length G83_M1_sorted.bam #Xkr4 chr1;chr1;chr1 3204563;3411783;3660633 3207049;3411982;3661579 -;-;- 3634 0 #Rp1 chr1;chr1;chr1;chr1;chr1;chr1 4280927;4333588;4341991;4342283;4350281;4399251 4283093;4340172;4342162;4342918;4350395;4399322 -;-;-;-;-;- 9747 0 #Sox17 chr1;chr1;chr1;chr1;chr1 4481009;4483181;4483853;4485217;4486372 4482749;4483571;4483944;4486023;4487435 -;-;-;-;- 4095 4 #--------------------------------------------------------------------------------- #--------------------------------------------------------------------------------- #Need an if statement to check ANNOTATION_FILE used: if [ ${ANNOTATION_FILE} == "genes.gtf" -o \ ${ANNOTATION_FILE} == "ncRNA_exon_for_counting.gtf" -o \ ${ANNOTATION_FILE} == "ncRNA_genebodies_for_counting.gtf" -o \ ${ANNOTATION_FILE} == "intronic_only_gene_models_ncRNA_for_counting.gtf" -o \ ${ANNOTATION_FILE} == "exonic_only_gene_models_ncRNA_for_counting.gtf" ]; then #The featureCounts output file needs to be reformatted for use by DiffExp scripts #Get rid of first 2 header lines: tail -n +3 ${OUTPUT_FILE} > ${OUTPUT_FILE}.temp1 #Just want the first and last column #NF is always the last column awk -F '\t' '{print $1"\t"$NF}' ${OUTPUT_FILE}.temp1 > ${OUTPUT_FILE}.temp2 #Rename the count file: mv ${OUTPUT_FILE}.temp2 ${OUTPUT_FILE} #Remove temp files: rm *.temp* fi #--------------------------------------------------------------------------------- #--------------------------------------------------------------------------------- #For the "Intronic_Only" counts I need to process the output count file so that upload to SEGEX will work with the RefSeq platform #Need an if statement to check ANNOTATION_FILE used: if [ ${ANNOTATION_FILE} == "Intron_Only_Regions.gtf" ]; then #Copy RefSeq_Genes_Zero_Count.txt: cp ${SCRIPT_DIR}/RefSeq_Files/RefSeq_Genes_Zero_Count.txt . #Omit header from RefSeq_Genes_Zero_Count.txt tail -n +2 RefSeq_Genes_Zero_Count.txt > RefSeq_Genes_Zero_Count.temp #The featureCounts output file has already been reformatted above #Concatenate files: cat ${OUTPUT_FILE} RefSeq_Genes_Zero_Count.temp > ${OUTPUT_FILE}.temp1 #Sort file by gene symbol sort -k1,1 ${OUTPUT_FILE}.temp1 > ${OUTPUT_FILE}.temp2 #Rename the count file: mv ${OUTPUT_FILE}.temp2 ${OUTPUT_FILE} #Remove temp files: rm *.temp* fi #--------------------------------------------------------------------------------- # copy the output files to users storage dir cp ${OUTPUT_FILE} ${OUTPUT_DIR} cp ${OUTPUT_FILE}.summary ${OUTPUT_DIR} #--------------------------------------------------------------------------------- #Example job output files: #--------------------------------------------------------------------------------- #*.out file: #Xkr4 0 #Rp1 0 #Sox17 2 #Mrpl15 7 #Lypla1 33 #Tcea1 12 #Rgs20 0 #Atp6v1h 9 #Oprk1 0 #Npbwr1 0 #Rb1cc1 11 #Fam150a 0 #--------------------------------------------------------------------------------- #*.out.summary file: #Status G83_M1_sorted.bam #Assigned 181926 #Unassigned_Ambiguity 4618 #Unassigned_MultiMapping 0 #Unassigned_NoFeatures 24533 #Unassigned_Unmapped 0 #Unassigned_MappingQuality 0 #Unassigned_FragmentLength 0 #Unassigned_Chimera 0 #Unassigned_Secondary 0 #Unassigned_Nonjunction 0 #Unassigned_Duplicate 0 #--------------------------------------------------------------------------------- echo echo "List files in scratch" echo ls -alh echo "==========================================================" echo "Finished on : $(date)" #Use to calculate job time: #End_Time in seconds End_Time=$(date +"%s") diff=$(($End_Time-$Start_Time)) echo "$(($diff / 3600)) hours, $((($diff / 60) % 60)) minutes and $(($diff % 60)) seconds elapsed." echo "=========================================================="