###################################################################################### # Kritika Karri, 08.04.2017 # This script takes the differential expression file (.txt) for Feature Count method starting with the following prefix DiffExp_v2_GeneBody from Output_DiffExp_1a_HTSeq_GeneBody folder for all the DE comparisons. # The result fo this script is Pearson correlation heatmaps and matrix (.csv) file for every sample replicate and also a merge sample file with correlation values for all samples and their replicates. ################################################################################## #--------------------------------------------------------------------------------- wd <- getwd() if(!is.null(wd)) setwd(wd) require(dplyr) require(stringr) require(reshape2) require(ggplot2) require(MASS) library(tools) require(data.table) ############################## FUNCTIONS ###################################### delete.na <- function(DF, n=0) { DF[rowSums(is.na(DF)) <= n,] } # Get lower triangle of the correlation matrix get_lower_tri<-function(cormat){ cormat[upper.tri(cormat)] <- NA return(cormat) } # Get upper triangle of the correlation matrix get_upper_tri <- function(cormat){ cormat[lower.tri(cormat)]<- NA return(cormat) } # Analyze_rpkm function takes DiffExp_v2_Genebody (.txt) file from Differential Expression and then calculate pearson correlation and outputs matrix and heatmap analyze_rpkm <- function(filename) { filename data= data.frame(read.table(filename, header = T )) data_grep <- data[, grepl( "rpkm_" , names( data ) ) ] data_filter <- data_grep[data_grep[,grepl( "rpkm_mean" , names(data_grep ) )]>1,] data_filter = as.matrix(delete.na(data_filter)) final <- cor(data_filter) final1 <-as.matrix(final) file1 <- file_path_sans_ext(filename) file_path_sans_ext(file1) # Write the correlation matrix f1 <- paste0("Pearson_Filtered_RPKM_",file1,".csv") write.table(final,file=f1) # keeps the rownames read.table(f1,header=TRUE,row.names=1) # says first column are rownames # Create the pearson plots m= round(min(final),3) dm = m -0.01 M= round(max(final),3) melted_cormat <- melt(final) upper_tri <- get_upper_tri(final) melted_cormat <- melt(upper_tri, na.rm = TRUE) rpkm_plot <- ggplot(data = melted_cormat, aes(Var2, Var1, fill = value))+ geom_tile(color = "white")+ scale_fill_gradientn(colours = rainbow(4),limits=c(dm,M), space= "Lab", #scale_fill_gradient2(low = "blue", high = "red", mid = "white", name="Pearson\nCorrelation filtered at RPKM >1") + theme_minimal()+ theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1))+ coord_fixed() f <- paste0("Pearson_Filtered_RPKM",file1,".pdf") # Save the correlation plot ggsave(f,plot= rpkm_plot, device = "pdf",path= wd, width = 30, height = 30, units = "cm") } # Analyze_all function calculates the pearson correlation for all genes from DE files analyze_all <- function(filename) { data= data.frame(read.table(filename, header = T )) data_grep <- data[, grepl( "rpkm_" , names( data ) ) ] final <- cor(data_grep) final1 <-as.matrix(final) file1 <- file_path_sans_ext(filename) file_path_sans_ext(file1) # Write the correlation matrix f1 <- paste0("Pearson_All_",file1,".csv") write.table(final,file=f1) # keeps the rownames read.table(f1,header=TRUE,row.names=1) # says first column are rownames # Create the pearson plots m= round(min(final),3) dm = m- 0.01 M= round(max(final),3) melted_cormat <- melt(final) upper_tri <- get_upper_tri(final) melted_cormat <- melt(upper_tri, na.rm = TRUE) rpkm_plot <- ggplot(data = melted_cormat, aes(Var2, Var1, fill = value))+ geom_tile(color = "white")+ scale_fill_gradientn(colours = rainbow(4),limits=c(dm,M), space= "Lab", # scale_fill_gradient2(low = "blue", high = "red", mid = "white", # midpoint = 0, limit = c(-1,1), space = "Lab", name="Pearson\nCorrelation-All genes") + theme_minimal()+ theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1))+ coord_fixed() f <- paste0("Pearson_All_",file1,".pdf") # Save the correlation plot ggsave(f,plot= rpkm_plot, device = "pdf",path= wd, width = 30, height = 30, units = "cm") } ####################################### FUNCTION ENDS ################################### # list all txt files from the current directory # use the pattern argument to define a common pattern for import files with regex. Here: .txt list.filenames.HT <- list() list.filenames.HT<-list.files(pattern=".txt$") list.filenames.HT if(!is.na(list.filenames.HT[1])){ # print "hello" dataset = cbind.data.frame(lapply(list.filenames.HT, fread, header=TRUE, sep="\t")) dataset_grep <- dataset[, grepl( "rpkm_" , names( dataset ) ) ] final_dataset <- round(cor(dataset_grep),3) duplicated.columns <- duplicated(t(final_dataset)) duplicated.rows <- duplicated((final_dataset)) new.matrix <- final_dataset[!duplicated.rows,!duplicated.columns] write.table(new.matrix,file= "Pearson_All_Merge_File.csv") # keeps the rownames read.table("Pearson_All_Merge_File.csv",header=TRUE,row.names=1) # says first column are rownames # Create the pearson plots m= round(min(new.matrix),3) dm= m-0.01 M= round(max(new.matrix),3) melted_cormat <- melt(new.matrix) upper_tri <- get_upper_tri(new.matrix) melted_cormat <- melt(upper_tri, na.rm = TRUE) rpkm_plot <- ggplot(data = melted_cormat, aes(Var2, Var1, fill = value))+ geom_tile(color = "white")+ #scale_fill_continuous(low= "red",high="green",limits=c(-1,1), space="Lab", scale_fill_gradientn(colours = rainbow(4),limits=c(dm,M), space = "Lab", name="Pearson\nCorrelation-All genes\n") + theme_minimal()+ theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1)) coord_fixed() rpkm_plot # f <- paste0("Pearson_All",file1,".pdf") # Save the correlation plot ggsave("Pearson_All_Merge_File.pdf",plot= rpkm_plot, device = "pdf",path= wd, width = 30, height = 30, units = "cm") ####################################### filtered analysis#########################333 dataset_filter <- dataset_grep[dataset_grep[,grepl( "rpkm_mean" , names(dataset_grep ) )]>1,] dataset_filter = as.matrix(delete.na(dataset_filter)) final_dataset_filter <- round(cor(dataset_filter),3) duplicated.columns.filter <- duplicated(t(final_dataset_filter)) duplicated.rows.filter <- duplicated((final_dataset_filter)) new.matrix.filter <- final_dataset_filter[!duplicated.rows.filter,!duplicated.columns.filter] write.table(new.matrix.filter,file= "Pearson_Filtered_Merge_File.csv") # keeps the rownames read.table("Pearson_Filtered_Merge_File.csv",header=TRUE,row.names=1) # says first column are rownames # Create the pearson plots m= round(min(new.matrix.filter),3) dm = m-0.01 M= round(max(new.matrix.filter),3) melted_cormat_filter <- melt(new.matrix.filter) upper_tri_filter <- get_upper_tri(new.matrix.filter) melted_cormat_filter <- melt(upper_tri_filter, na.rm = TRUE) rpkm_plot_filter <- ggplot(data = melted_cormat_filter, aes(Var2, Var1, fill = value))+ geom_tile(color = "white")+ #scale_fill_continuous(low= "red",high="green",limits=c(-1,1), space="Lab", scale_fill_gradientn(colours = rainbow(4),limits=c(dm,M), space = "Lab", name="Pearson\nCorrelation-RPKM >1 genes\n") + theme_minimal()+ theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1)) coord_fixed() rpkm_plot_filter # Save the correlation plot ggsave("Pearson_Filtered_Merge_File.pdf",plot= rpkm_plot_filter, device = "pdf",path= wd, width = 30, height = 30, units = "cm") for(i in 1:length(list.filenames.HT)){ analyze_all(list.filenames.HT[i]) analyze_rpkm(list.filenames.HT[i]) } } else{ print("No files in the folder !!") }