#!/bin/bash
##################################################################################
#Andy Rampersaud, 01.27.16
#Adapted from tophat/single_end_mapping scripts by Tisha Melia
##################################################################################
#Assumptions for this job to run correctly:
#1. You have already run the TopHat_Single_End job
#2. Your data is organized in the following way:
#You have a data set dir such as:
#/projectnb/wax-es/aramp10/G83_Samples
#Within this dir you have sample specific folders such as:
#G83_M1
#G83_M2
#G83_M3
#G83_M4
#Within each sample specific folder you have a "fastq" folder with:
#Files: *_R1_*.fastq.gz and *_R2_*.fastq.gz such as:
#Waxman-TP17_CGATGT_L007_R1_001.fastq.gz
#Waxman-TP17_CGATGT_L007_R2_001.fastq.gz
#Within each sample specific folder you have a "tophat2" folder containing output files
##################################################################################
#Fill in the following information:
##################################################################################
#Information about your data set
#As mentioned above, you should have a data set dir containing your sample specific folders:
#Dataset_DIR=/projectnb/wax-es/aramp10/G83_Samples
##################################################################################
#Samples to process
#To facilitate processing of samples in parallel we can use a text file that lists the samples to analyze
#Note: this text file is still valid even if there is only one sample to process
#You need to have a "Sample_Labels" dir within your Dataset_DIR
#Within the Sample_Labels dir have a Sample_Labels.txt such that:
################################################
#The text file is formatted like the following:
#----------------------------------------------
#Sample_DIR	Sample_ID	Description
#Sample_Waxman-TP17	G83_M1	Male 8wk-pool 1
#Sample_Waxman-TP18	G83_M2	Male 8wk-pool 2
#Sample_Waxman-TP19	G83_M3	Female 8wk-pool 1
#Sample_Waxman-TP20	G83_M4	Female 8wk-pool 2	
#----------------------------------------------
#The 1st column: The Sample_DIR name
#The 2nd column: Waxman Lab Sample_ID 
#The 3rd column: Sample's description 
################################################
#Sample_Labels_DIR=${Dataset_DIR}/Sample_Labels
##################################################################################
#GTF_Files_DIR
#Need this dir that contains the various GTF files
#Feel free to use my dir but it's better practice to have a copy in your own Dataset_DIR
#GTF_Files_DIR=/projectnb/wax-es/aramp10/GTF_Files
##################################################################################
#HTSeq counting mode
#http://www-huber.embl.de/users/anders/HTSeq/doc/count.html
#-m <mode>, --mode=<mode>
#    Mode to handle reads overlapping more than one feature. Possible values for <mode> are union, intersection-strict and intersection-nonempty (default: union)
#---------------------------------------------------------------------------------
#Based on lab discussions, we've decided that the "intersection-nonempty" mode is the desired counting mode for HTSeq counting.  Specifically because of how it will assign reads that partially overlap a region covered by multiple genes.  However, if the read completely overlaps a region covered by multiple genes, then that read is labeled ambiguous (effectively ignored).
#Please see the link above for illustrations describing each option
#---------------------------------------------------------------------------------
#Choose one:
#MODE="union"
#MODE="intersection-strict"
#MODE="intersection-nonempty"
#---------------------------------------------------------------------------------
##################################################################################
#Type of sequencing:
#what is the strandedness of your rna-seq dataset. The acceptable values are: yes, no, or reverse (all low caps)
#no: the dataset is unstranded
#yes: the dataset is stranded. Also, reads are mapped to the same strand of the transcription
#reverse: the dataset is stranded, Also, reads are mapped to the opposite direction of the transcription
#---------------------------------------------------------------------------------
#Emailed note regarding the "NEBNext Ultra Directional kit":
#The RNA molecules from our samples are complementary to the template strand of our gene.  So first strand of DNA synthesis is actually re-creating the template strand of our gene.  Sequencing this fragment, then mapping to the reference genome, the RNA-Seq reads will map to the template strand of the reference genome.  But we actually want our RNA-Seq reads to map to the coding strand of the reference genome.  So for this reason, the "reverse" option makes sense because the data is single-end reads mapping to the opposite strand as the feature (which satisfies the definition of "reverse" according to the HTSeq documentation)
#---------------------------------------------------------------------------------
#Choose one:
#STRANDEDNESS="no"
#STRANDEDNESS="yes"
#STRANDEDNESS="reverse"
#---------------------------------------------------------------------------------
##################################################################################
#FEATURE_ID will be "gene_id"
#FEATURE_ID="gene_id"
##################################################################################
#Need to get the current dir
#SCRIPT_DIR=$(pwd)
##################################################################################
#Time hour limit
#On SCC a 12-hour runtime limit is enforced on all jobs, unless specified explicitly. 
#A runtime limit can be specified in the format "hh:mm:ss"
#Dont change the following time limit value unless you know that your job is going to go over 12 hrs 
#TIME_LIMIT="12:00:00"
##################################################################################