######################################################################################
# Kritika Karri, 08.04.2017
# This script takes the differential expression file (.txt) for Feature Count method starting with the following prefix DiffExp_v2_GeneBody from Output_DiffExp_1a_HTSeq_GeneBody folder for all the DE comparisons.
# The result fo this script is Pearson correlation heatmaps and matrix (.csv) file for every sample replicate and also a merge sample file with correlation values for all samples and their replicates.

##################################################################################
#---------------------------------------------------------------------------------

 wd <- getwd()
 if(!is.null(wd))
  setwd(wd)

require(dplyr)
require(stringr)
require(reshape2)
require(ggplot2)
require(MASS)
library(tools)
require(data.table)

############################## FUNCTIONS ######################################
 
 delete.na <- function(DF, n=0) {
  DF[rowSums(is.na(DF)) <= n,]
}

# Get lower triangle of the correlation matrix
get_lower_tri<-function(cormat){
  cormat[upper.tri(cormat)] <- NA
  return(cormat)
}
# Get upper triangle of the correlation matrix
get_upper_tri <- function(cormat){
  cormat[lower.tri(cormat)]<- NA
  return(cormat)
}
 
 
 # Analyze_rpkm function takes DiffExp_v2_Genebody (.txt) file from Differential Expression and then calculate pearson correlation and outputs matrix and heatmap
 
 
 analyze_rpkm <- function(filename)
   {
    filename
    data= data.frame(read.table(filename, header = T ))
   
    data_grep <- data[, grepl( "rpkm_" , names( data ) ) ]
    data_filter <- data_grep[data_grep[,grepl( "rpkm_mean" , names(data_grep ) )]>1,]
    data_filter = as.matrix(delete.na(data_filter))
    final <- cor(data_filter)
    final1 <-as.matrix(final)
    file1 <- file_path_sans_ext(filename)
    file_path_sans_ext(file1)
   
   # Write the correlation matrix
    f1 <- paste0("Pearson_Filtered_RPKM_",file1,".csv")
    write.table(final,file=f1) # keeps the rownames
    read.table(f1,header=TRUE,row.names=1) # says first column are rownames
      
  # Create the pearson plots   
     m= round(min(final),3)
     dm = m -0.01
    M= round(max(final),3)
    melted_cormat <- melt(final)
    upper_tri <- get_upper_tri(final)
    melted_cormat <- melt(upper_tri, na.rm = TRUE)
    rpkm_plot <- ggplot(data = melted_cormat, aes(Var2, Var1, fill = value))+
    geom_tile(color = "white")+
    scale_fill_gradientn(colours = rainbow(4),limits=c(dm,M), space= "Lab",
    #scale_fill_gradient2(low = "blue", high = "red", mid = "white", 
                          name="Pearson\nCorrelation filtered at RPKM >1") +
     theme_minimal()+ 
     theme(axis.text.x = element_text(angle = 45, vjust = 1, 
                                      size = 12, hjust = 1))+
     coord_fixed()
   
     f <- paste0("Pearson_Filtered_RPKM",file1,".pdf")
     
  # Save the correlation plot
  ggsave(f,plot= rpkm_plot, device = "pdf",path= wd, width = 30, height = 30, units = "cm")
 }
 
# Analyze_all function calculates the pearson correlation for all genes from DE files 

 analyze_all <- function(filename)
 {
   
   data= data.frame(read.table(filename, header = T ))
   
   data_grep <- data[, grepl( "rpkm_" , names( data ) ) ]
   final <- cor(data_grep)
   final1 <-as.matrix(final)
   file1 <- file_path_sans_ext(filename)
   file_path_sans_ext(file1)
   
   # Write the correlation matrix
   f1 <- paste0("Pearson_All_",file1,".csv")
   write.table(final,file=f1) # keeps the rownames
   read.table(f1,header=TRUE,row.names=1) # says first column are rownames
   
   # Create the pearson plots   
   m= round(min(final),3)
   dm = m- 0.01
   M= round(max(final),3)
   melted_cormat <- melt(final)
   upper_tri <- get_upper_tri(final)
   melted_cormat <- melt(upper_tri, na.rm = TRUE)
   rpkm_plot <- ggplot(data = melted_cormat, aes(Var2, Var1, fill = value))+
     geom_tile(color = "white")+
     scale_fill_gradientn(colours = rainbow(4),limits=c(dm,M), space= "Lab",
                          
     #  scale_fill_gradient2(low = "blue", high = "red", mid = "white", 
    #                      midpoint = 0, limit = c(-1,1), space = "Lab", 
                          name="Pearson\nCorrelation-All genes") +
     theme_minimal()+ 
     theme(axis.text.x = element_text(angle = 45, vjust = 1, 
                                      size = 12, hjust = 1))+
     coord_fixed()
   
   f <- paste0("Pearson_All_",file1,".pdf")
   
   # Save the correlation plot
   ggsave(f,plot= rpkm_plot, device = "pdf",path= wd, width = 30, height = 30, units = "cm")
 }
 
 
####################################### FUNCTION ENDS ###################################
 
 
# list all txt files from the current directory
# use the pattern argument to define a common pattern  for import files with regex. Here: .txt
 

list.filenames.HT <- list()
list.filenames.HT<-list.files(pattern=".txt$")
list.filenames.HT
  
 
 
 if(!is.na(list.filenames.HT[1])){
 # print "hello" 
 
   dataset = cbind.data.frame(lapply(list.filenames.HT, fread, header=TRUE, sep="\t"))
   
   dataset_grep <- dataset[, grepl( "rpkm_" , names( dataset ) ) ]
   
   final_dataset <- round(cor(dataset_grep),3)
   duplicated.columns <- duplicated(t(final_dataset))
       
   duplicated.rows <- duplicated((final_dataset)) 
   new.matrix <- final_dataset[!duplicated.rows,!duplicated.columns] 
  
   write.table(new.matrix,file= "Pearson_All_Merge_File.csv") # keeps the rownames
   read.table("Pearson_All_Merge_File.csv",header=TRUE,row.names=1) # says first column are rownames
   
   # Create the pearson plots   
  m= round(min(new.matrix),3)
  dm= m-0.01
  M= round(max(new.matrix),3)
   melted_cormat <- melt(new.matrix)
   upper_tri <- get_upper_tri(new.matrix)
   melted_cormat <- melt(upper_tri, na.rm = TRUE)
   rpkm_plot <- ggplot(data = melted_cormat, aes(Var2, Var1, fill = value))+
     geom_tile(color = "white")+
    #scale_fill_continuous(low= "red",high="green",limits=c(-1,1), space="Lab",
     scale_fill_gradientn(colours = rainbow(4),limits=c(dm,M), space = "Lab",
                          name="Pearson\nCorrelation-All genes\n") +
     theme_minimal()+ 
     theme(axis.text.x = element_text(angle = 45, vjust = 1, 
                                      size = 12, hjust = 1))
     coord_fixed()
      rpkm_plot
  # f <- paste0("Pearson_All",file1,".pdf")
   
   # Save the correlation plot
   ggsave("Pearson_All_Merge_File.pdf",plot= rpkm_plot, device = "pdf",path= wd, width = 30, height = 30, units = "cm")
  
  ####################################### filtered analysis#########################333
  
  
  dataset_filter <- dataset_grep[dataset_grep[,grepl( "rpkm_mean" , names(dataset_grep ) )]>1,]
  dataset_filter = as.matrix(delete.na(dataset_filter))
    
  final_dataset_filter <- round(cor(dataset_filter),3)
  duplicated.columns.filter <- duplicated(t(final_dataset_filter))
  
  duplicated.rows.filter <- duplicated((final_dataset_filter)) 
  new.matrix.filter <- final_dataset_filter[!duplicated.rows.filter,!duplicated.columns.filter] 
  
  write.table(new.matrix.filter,file= "Pearson_Filtered_Merge_File.csv") # keeps the rownames
  read.table("Pearson_Filtered_Merge_File.csv",header=TRUE,row.names=1) # says first column are rownames
  
  # Create the pearson plots   
  m= round(min(new.matrix.filter),3)
  dm = m-0.01
  M= round(max(new.matrix.filter),3)
  melted_cormat_filter <- melt(new.matrix.filter)
  upper_tri_filter <- get_upper_tri(new.matrix.filter)
  melted_cormat_filter <- melt(upper_tri_filter, na.rm = TRUE)
  rpkm_plot_filter <- ggplot(data = melted_cormat_filter, aes(Var2, Var1, fill = value))+
    geom_tile(color = "white")+
    #scale_fill_continuous(low= "red",high="green",limits=c(-1,1), space="Lab",
    scale_fill_gradientn(colours = rainbow(4),limits=c(dm,M), space = "Lab",
                         name="Pearson\nCorrelation-RPKM >1 genes\n") +
    theme_minimal()+ 
    theme(axis.text.x = element_text(angle = 45, vjust = 1, 
                                     size = 12, hjust = 1))
  coord_fixed()
  rpkm_plot_filter
  
  # Save the correlation plot
  ggsave("Pearson_Filtered_Merge_File.pdf",plot= rpkm_plot_filter, device = "pdf",path= wd, width = 30, height = 30, units = "cm")
  
    
 
 for(i in 1:length(list.filenames.HT)){
 
   analyze_all(list.filenames.HT[i])
   analyze_rpkm(list.filenames.HT[i])
   
   
 }
 
 } else{
   print("No  files in the folder !!")
      
 }