#!/bin/bash ################################################################################## #Andy Rampersaud, 01.27.16 #Adapted from tophat/single_end_mapping scripts by Tisha Melia ################################################################################## #Assumptions for this job to run correctly: #1. You have already run the TopHat_Single_End job #2. Your data is organized in the following way: #You have a data set dir such as: #/projectnb/wax-es/aramp10/G83_Samples #Within this dir you have sample specific folders such as: #G83_M1 #G83_M2 #G83_M3 #G83_M4 #Within each sample specific folder you have a "fastq" folder with: #Files: *_R1_*.fastq.gz and *_R2_*.fastq.gz such as: #Waxman-TP17_CGATGT_L007_R1_001.fastq.gz #Waxman-TP17_CGATGT_L007_R2_001.fastq.gz #Within each sample specific folder you have a "tophat2" folder containing output files ################################################################################## #Fill in the following information: ################################################################################## #Information about your data set #As mentioned above, you should have a data set dir containing your sample specific folders: #Dataset_DIR=/projectnb/wax-es/aramp10/G83_Samples ################################################################################## #Samples to process #To facilitate processing of samples in parallel we can use a text file that lists the samples to analyze #Note: this text file is still valid even if there is only one sample to process #You need to have a "Sample_Labels" dir within your Dataset_DIR #Within the Sample_Labels dir have a Sample_Labels.txt such that: ################################################ #The text file is formatted like the following: #---------------------------------------------- #Sample_DIR Sample_ID Description #Sample_Waxman-TP17 G83_M1 Male 8wk-pool 1 #Sample_Waxman-TP18 G83_M2 Male 8wk-pool 2 #Sample_Waxman-TP19 G83_M3 Female 8wk-pool 1 #Sample_Waxman-TP20 G83_M4 Female 8wk-pool 2 #---------------------------------------------- #The 1st column: The Sample_DIR name #The 2nd column: Waxman Lab Sample_ID #The 3rd column: Sample's description ################################################ #Sample_Labels_DIR=${Dataset_DIR}/Sample_Labels ################################################################################## #GTF_Files_DIR #Need this dir that contains the various GTF files #Feel free to use my dir but it's better practice to have a copy in your own Dataset_DIR #GTF_Files_DIR=/projectnb/wax-es/aramp10/GTF_Files ################################################################################## #HTSeq counting mode #http://www-huber.embl.de/users/anders/HTSeq/doc/count.html #-m , --mode= # Mode to handle reads overlapping more than one feature. Possible values for are union, intersection-strict and intersection-nonempty (default: union) #--------------------------------------------------------------------------------- #Based on lab discussions, we've decided that the "intersection-nonempty" mode is the desired counting mode for HTSeq counting. Specifically because of how it will assign reads that partially overlap a region covered by multiple genes. However, if the read completely overlaps a region covered by multiple genes, then that read is labeled ambiguous (effectively ignored). #Please see the link above for illustrations describing each option #--------------------------------------------------------------------------------- #Choose one: #MODE="union" #MODE="intersection-strict" #MODE="intersection-nonempty" #--------------------------------------------------------------------------------- ################################################################################## #Type of sequencing: #what is the strandedness of your rna-seq dataset. The acceptable values are: yes, no, or reverse (all low caps) #no: the dataset is unstranded #yes: the dataset is stranded. Also, reads are mapped to the same strand of the transcription #reverse: the dataset is stranded, Also, reads are mapped to the opposite direction of the transcription #--------------------------------------------------------------------------------- #Emailed note regarding the "NEBNext Ultra Directional kit": #The RNA molecules from our samples are complementary to the template strand of our gene. So first strand of DNA synthesis is actually re-creating the template strand of our gene. Sequencing this fragment, then mapping to the reference genome, the RNA-Seq reads will map to the template strand of the reference genome. But we actually want our RNA-Seq reads to map to the coding strand of the reference genome. So for this reason, the "reverse" option makes sense because the data is single-end reads mapping to the opposite strand as the feature (which satisfies the definition of "reverse" according to the HTSeq documentation) #--------------------------------------------------------------------------------- #Choose one: #STRANDEDNESS="no" #STRANDEDNESS="yes" #STRANDEDNESS="reverse" #--------------------------------------------------------------------------------- ################################################################################## #FEATURE_ID will be "gene_id" #FEATURE_ID="gene_id" ################################################################################## #Need to get the current dir #SCRIPT_DIR=$(pwd) ################################################################################## #Time hour limit #On SCC a 12-hour runtime limit is enforced on all jobs, unless specified explicitly. #A runtime limit can be specified in the format "hh:mm:ss" #Dont change the following time limit value unless you know that your job is going to go over 12 hrs #TIME_LIMIT="12:00:00" ##################################################################################