# Introductory R walkthrough: matrices, lists, data frames and a small
# differential-expression style data frame manipulation.
# Paths are relative to the script location ("../data/...").

# loading mi table
mi <- read.csv(file = "../data/mi.csv")

# loading mice weight table
# (read.csv2: ";" as field separator and "," as decimal mark)
mice_weights <- read.csv2(file = "../data/male_and_female_weight.csv")

# ---------------------------

### MATRICES ----

# creating a matrix from a vector with default option (filled column by column)
matrix(data = 1:9, nrow = 3, ncol = 3)

# creating a matrix from a vector "by row"
matrix(data = 1:9, nrow = 3, ncol = 3, byrow = TRUE)

# creating a matrix : stacking vectors together
rbind(1:3, 4:6, 7:9)

# creating a matrix : putting vectors side by side
cbind(1:3, 4:6, 7:9)

#! All the elements of a matrix are of the same type. If trying to create a
# matrix from vectors containing elements of different nature, the elements
# are coerced to a common type!
cbind(
  1:3,
  c(TRUE, FALSE, FALSE),
  c("snow", "cloud", "vapour"))
# -> check out the structure of the object above.

# element recycling : What happens when trying to create a matrix with vectors
# of different length?
rbind(1:4, 1:2)
cbind(1:4, 1:2)
# lengths that are not multiples of each other are recycled with a warning
cbind(1:5, c(1, 2), c(3, 4, 5))

# growing matrices
# Create a 4 x 4 matrix of random numbers from a normal distribution
random_16 <- matrix(rnorm(16), nrow = 4, ncol = 4)

# adding a row (careful at the length of the vector you want to add to avoid
# recycling issues)
random_20 <- rbind(random_16, rnorm(4))

# adding a column (careful at the length of the vector you want to add to avoid
# recycling issues)
random_25 <- cbind(random_20, rnorm(5))

# transposing
my_matrix <- matrix(data = 1:9, nrow = 3, ncol = 3)
t(my_matrix)

# slicing a matrix
# extract the first row of my_matrix
my_matrix[1, ]

# extract the second col of my_matrix
my_matrix[, 2]

# accessing the dimension of a matrix
# general dimension of the matrix
dim(my_matrix)

# number of rows ?
nrow(my_matrix)
# length(my_matrix[,1]) other option

# number of col ?
ncol(my_matrix)

# extracting the element from the last row and the last column
my_matrix[nrow(my_matrix), ncol(my_matrix)]

# slicing the matrix from the 2nd col till the end
my_matrix[, 2:ncol(my_matrix)]

# extracting a row from a matrix and use it as a vector
# in multiple steps
row2 <- my_matrix[2, ]
str(row2)
row2[3]

# in 1 step
my_matrix[2, ][3]

# naming columns
colnames(my_matrix) <- paste("COL", 1:3, sep = "_")
# or
colnames(my_matrix) <- paste0("COL_", 1:3)

# using ncol() can be useful
# (seq_len() is the safe form of 1:n: it yields an empty sequence when n is 0)
colnames(my_matrix) <- paste("COL", seq_len(ncol(my_matrix)), sep = "_")

# colnames(my_matrix) <- c("COL_1", "COL_2", "COL_3")
# Probably the longest way to achieve what we want, impossible to do with very
# big tables

# naming rows
rownames(my_matrix) <- paste("ROW", seq_len(nrow(my_matrix)), sep = "_")

# using names to extract element of a matrix (here same logic as named vectors
# applies)
# extracting the element at the second row and first column
my_matrix["ROW_2", "COL_1"]
# or
my_matrix["ROW_2", ]["COL_1"]

# Extracting the entire second row
my_matrix["ROW_2", ]

# operations on matrices
my_matrix + 1

# using booleans to extract element of a matrix
my_matrix > 4
my_matrix[my_matrix > 4]

### LISTS ----

# creating a list
gene_info <- list("WBGene00004775", "sep-1", "I", 3433156, 3438543, FALSE)

# naming the list
names(gene_info) <- c("id", "name", "chrom", "start", "end", "fwd")

# adding list names while creating the list
gene_info_2 <- list(
  "id" = "WBGene00004775",
  "name" = "sep-1",
  "chrom" = "I",
  "start" = 3433156,
  "end" = 3438543,
  "fwd" = FALSE)

# lists are very versatile
funny_list <- list(
  a_list = list(2, c(FALSE, TRUE, FALSE)),
  a_string = "Hohoho!",
  the_mean_function = mean,
  some_letters = c("A", "B", "Z"),
  a_matrix = matrix(1:4, nrow = 2, ncol = 2))

# doing funny (?) things with a list
# 1.
ifelse(funny_list[[1]][[2]], "up", "down")

# 2.
funny_list$the_mean_function(1:5)

# 3.
funny_list$a_matrix
t(funny_list$a_matrix)

# adding a third column (containing the sums of the rows) in the matrix inside
# the funny_list
# create the modified matrix
a_matrix_mod <- cbind(funny_list$a_matrix, rowSums(funny_list$a_matrix))

# make a copy of funny list (in case we want the original list to stay
# unchanged)
funny_list_copy <- funny_list

# replacing the matrix we want to change by the modified matrix
funny_list_copy$a_matrix <- a_matrix_mod

# displaying the modified list
funny_list_copy

# one liner code (note : this will modify the original list)
funny_list$a_matrix <- cbind(funny_list$a_matrix, rowSums(funny_list$a_matrix))

# adding a new element to a list
# the list must exist before elements can be assigned into it, so start from a
# copy of funny_list
funny_list_extended <- funny_list

# first option
funny_list_extended[["a_matrix_mod"]] <- a_matrix_mod

# second option
funny_list_extended$a_matrix_mod2 <- a_matrix_mod

# third option (note : here, the added element will not be named)
funny_list_extended[[length(funny_list_extended) + 1]] <- a_matrix_mod

### DATA FRAMES ----

# Creating a data frame
data.frame(
  1:5,
  c(FALSE, FALSE, TRUE, FALSE, FALSE),
  as.factor(c("cow", "ewe", "cow", "goat", "soy")),
  c(4, 5, 3, 3, 1)
)
# Note : a data frame looks like a matrix, but has the advantage of being able
# to store columns of different types

# Create a data frame with named columns
# (Like the vectors or the lists, we can name the data frame columns while
# creating it)
cheese <- data.frame(
  "id" = 1:5,
  "cooked" = c(FALSE, FALSE, TRUE, FALSE, FALSE),
  "milk" = as.factor(c("cow", "ewe", "cow", "goat", "soy")),
  "strength" = c(4, 5, 3, 3, 1))

# adding row names (same as for the matrices)
rownames(cheese) <- c(
  "Camembert",
  "Roquefort",
  "Comté",
  "Pélardon",
  "Tofu")

# selecting a row using rownames (already seen with the matrices)
cheese["Tofu", ]

# selecting a column using colnames (already seen with the matrices)
cheese[, "milk"]

# difference between putting the comma or not
cheese[, "milk"] # This command returns a vector (same as cheese$milk)
str(cheese[, "milk"])
cheese["milk"] # This command returns a data frame
str(cheese["milk"])

mean(cheese[, "strength"]) # This will work
# This will not work: mean() expects a vector, so on a one-column data frame it
# returns NA with a warning
mean(cheese["strength"])

# extract columns
# The 3 following commands return the same output
cheese[, "milk"]
cheese$milk
cheese[["milk"]] # data frames share similarities with lists

# slicing a data frame using indices (same as for the matrices)
cheese[c(1, 3), 2:3]

# boolean subsetting to select rows
cheese[cheese$cooked, ]
# don't forget the comma! or you will subset the columns, not the rows
cheese[cheese$cooked]

# adding a column
new_col <- rnorm(nrow(cheese))
cbind(cheese, new_col)
cheese_mod <- cbind(cheese, saltness = rnorm(nrow(cheese)))
# to place the created column at the third position
cheese_mod <- cbind(
  cheese[, 1:2],
  saltness = rnorm(nrow(cheese)),
  cheese[3:4])

# contingency table with total column and total row
# make a data frame from the contingency table
# (as.data.frame.matrix() keeps the same structure returned by table())
contency_df <- as.data.frame.matrix(table(cheese$cooked, cheese$milk))
# adding the column with the row sums
# (contency_df[,"Total"] <- rowSums(contency_df), works as well)
contency_df$Total <- rowSums(contency_df)
# adding the row with the col sums
contency_df["Total", ] <- colSums(contency_df)

# fastest way: using the addmargins() function
contency_df <- as.data.frame.matrix(
  addmargins(table(cheese$cooked, cheese$milk)))

# Playing with factors
str(mi$Income)

# if not using factors in this column, the classes don't follow a natural order
barplot(table(mi$Income))

# We can change that by choosing the order of the levels in the factor()
# function
mi$Income <- factor(
  mi$Income,
  levels = c("[0-1000]", "(1000-2000]", "(2000-3000]", "(3000-inf]"))
barplot(table(mi$Income))

# Another advantage of factors : Applying a function to a subset of the table
tapply(cheese$strength, cheese$milk, mean)

### DATA FRAME MANIPULATION ----

# importing new data
counts_WT_1 <- read.table(
  "../data/WT_1.tsv",
  header = TRUE,
  sep = "\t",
  stringsAsFactors = FALSE)

counts_WT_2 <- read.table(
  "../data/WT_2.tsv",
  header = TRUE,
  sep = "\t",
  stringsAsFactors = FALSE)

counts_KO_1 <- read.table(
  "../data/KO_1.tsv",
  header = TRUE,
  sep = "\t",
  stringsAsFactors = FALSE)

counts_KO_2 <- read.table(
  "../data/KO_2.tsv",
  header = TRUE,
  sep = "\t",
  stringsAsFactors = FALSE)

# merging WT tables together
counts_WT <- merge(x = counts_WT_1, y = counts_WT_2, by = "gene_name")
head(counts_WT)
tail(counts_WT)

# merging KO tables together
counts_KO <- merge(x = counts_KO_1, y = counts_KO_2, by = "gene_name")
head(counts_KO, 3)

# merging WT & KO tables together
# all = FALSE means that we are only keeping items of column "gene_name" that
# are found in both data frames
counts_all <- merge(
  x = counts_WT,
  y = counts_KO,
  by = "gene_name",
  all = FALSE)

# computing the sequencing depth for the 4 samples
colSums(counts_all[, -1])

# norm_all <- cbind(counts_all[1], t(t(counts_all[-1])/colSums(counts_all[,-1]))) # double transposition!!
# head(norm_all)

norm_all <- counts_all # copying the original table of expression

# dividing each column by its total reads (only doable if not a lot of columns.
# If too many, use the line of code above with the double transposition)
norm_all$WT_1 <- norm_all$WT_1 / sum(norm_all$WT_1)
norm_all$WT_2 <- norm_all$WT_2 / sum(norm_all$WT_2)
norm_all$KO_1 <- norm_all$KO_1 / sum(norm_all$KO_1)
norm_all$KO_2 <- norm_all$KO_2 / sum(norm_all$KO_2)

# compute the normalized mean of the WT group : store it in a column called
# WT_mean
norm_all$WT_mean <- rowMeans(norm_all[, c("WT_1", "WT_2")])

# compute the normalized mean of the KO group : store it in a column called
# KO_mean
norm_all$KO_mean <- rowMeans(norm_all[, c("KO_1", "KO_2")])

# compute the fold change KO vs WT for each gene -> store it in a new column
# named FOLDCHANGE
norm_all$FOLDCHANGE <- norm_all$KO_mean / norm_all$WT_mean

# the genes most up-regulated in the mutant
head(norm_all$gene_name[order(norm_all$FOLDCHANGE, decreasing = TRUE)], 20)

# the genes most down-regulated in the mutant
head(norm_all$gene_name[order(norm_all$FOLDCHANGE, decreasing = FALSE)], 20)

sorted_norm_all <- norm_all[order(norm_all$FOLDCHANGE, decreasing = TRUE), ]
head(sorted_norm_all, 20)

# log transform the fold change
sorted_norm_all$logfc <- log2(sorted_norm_all$FOLDCHANGE)

# add a column with up, down and not_deg status : "status"
# all genes with logfc > 1  -> "up"
# all genes with logfc < -1 -> "down"
# all the remaining genes   -> "not_deg"
table(sorted_norm_all$logfc > 1)
sorted_norm_all$status <- ifelse(
  sorted_norm_all$logfc > 1,
  "up",
  ifelse(sorted_norm_all$logfc < -1, "down", "not_deg"))

# saving the table
write.csv2(
  sorted_norm_all,
  file = "../data/sorted_norm_all.csv",
  quote = FALSE,
  row.names = FALSE)

# loading a library
library(ggplot2)