# Import the "mi" table; read.csv() expects "," as field separator and "." as
# decimal mark.
mi <- read.csv("../data/mi.csv")
# Import the mice weight table; read.csv2() expects ";" as field separator and
# "," as decimal mark.
mice_weights <- read.csv2("../data/male_and_female_weight.csv")
# ---------------------------

### MATRICES
# By default a vector is laid out column by column when turned into a matrix.
matrix(1:9, nrow = 3, ncol = 3)
# With byrow = TRUE the same vector fills the matrix row by row instead.
matrix(1:9, nrow = 3, ncol = 3, byrow = TRUE)
# rbind() stacks vectors on top of each other: each vector becomes a row.
rbind(1:3, 4:6, 7:9)
# cbind() puts vectors side by side: each vector becomes a column.
cbind(1:3, 4:6, 7:9)

#! All the elements of a matrix are of the same type. When a matrix is built
# from vectors of different types, every element is coerced to one common type!
cbind(1:3, c(TRUE, FALSE, FALSE), c("snow", "cloud", "vapour"))

# -> check out the structure of the object above (e.g. with str()).

# Element recycling: what happens when the vectors have different lengths?
# Here the shorter vector (length 2) is repeated to reach length 4.
rbind(1:4, 1:2)
cbind(1:4, 1:2)

# Lengths 2 and 3 do not divide 5: the vectors are still recycled, but R emits
# a warning.
cbind(1:5, c(1, 2), c(3, 4, 5))

# growing matrices
# Start from a 4 x 4 matrix filled with draws from a normal distribution.
random_16 <- matrix(rnorm(4 * 4), nrow = 4, ncol = 4)
# Append one row: the new vector needs one value per existing column, otherwise
# it gets recycled.
random_20 <- rbind(random_16, rnorm(ncol(random_16)))
# Append one column: the new vector needs one value per existing row, otherwise
# it gets recycled.
random_25 <- cbind(random_20, rnorm(nrow(random_20)))

# transposing
# my_matrix is a 3 x 3 integer matrix filled column by column with 1..9
my_matrix <- matrix(data = 1:9, nrow = 3, ncol = 3)
t(my_matrix)

# slicing a matrix
# extract the first row of my_matrix (an empty index means "everything")
my_matrix[1, ]
# extract the second column of my_matrix
my_matrix[, 2]

# accessing the dimensions of a matrix
# general dimensions of the matrix: c(number of rows, number of columns)
dim(my_matrix)
# number of rows?
nrow(my_matrix)
# length(my_matrix[,1]) other option

# number of columns?
ncol(my_matrix)

# extracting the element from the last row and the last column
my_matrix[nrow(my_matrix), ncol(my_matrix)]

# slicing the matrix from the 2nd column till the end
my_matrix[, 2:ncol(my_matrix)]

# extracting a row from a matrix and using it as a vector
# in multiple steps
row2 <- my_matrix[2, ]
str(row2)
row2[3]
# in 1 step: the first bracket returns a vector, the second one indexes it
my_matrix[2, ][3]

# naming columns
colnames(my_matrix) <- paste("COL", 1:3, sep = "_") # or colnames(my_matrix) <- paste0("COL_", 1:3)
# using ncol() avoids hard-coding the number of columns; seq_len(n) is used
# rather than 1:n because 1:n counts backwards (1, 0) when n is 0
colnames(my_matrix) <- paste("COL", seq_len(ncol(my_matrix)), sep = "_")
# colnames(my_matrix) <- c("COL_1", "COL_2", "COL_3") # Probably the longest way to achieve what we want, impossible to do with very big tables

# naming rows
rownames(my_matrix) <- paste("ROW", seq_len(nrow(my_matrix)), sep = "_")

# using names to extract elements of a matrix (same logic as for named vectors)
# extracting the element at the second row and first column
my_matrix["ROW_2", "COL_1"] # or my_matrix["ROW_2",]["COL_1"]
# extracting the entire second row
my_matrix["ROW_2", ]

# operations on matrices are applied element by element
my_matrix + 1

# using booleans to extract elements of a matrix
# the comparison returns a logical matrix with the same dimensions...
my_matrix > 4
# ...which can be used as an index; the result is a plain vector
my_matrix[my_matrix > 4]


### LISTS
# A list can hold elements of different types (here: strings, numbers and a
# boolean).
gene_info <- list(
  "WBGene00004775",
  "sep-1",
  "I",
  3433156,
  3438543,
  FALSE
)
# Names can be attached afterwards, one name per element.
names(gene_info) <- c("id", "name", "chrom", "start", "end", "fwd")

# Names can also be given directly while building the list.
gene_info_2 <- list(
  id = "WBGene00004775",
  name = "sep-1",
  chrom = "I",
  start = 3433156,
  end = 3438543,
  fwd = FALSE
)

# lists are very versatile: an element can be another list, a string, a
# function, a vector, a matrix...
funny_list <- list(
  a_list = list(2, c(FALSE, TRUE, FALSE)),
  a_string = "Hohoho!",
  the_mean_function = mean,
  some_letters = c("A", "B", "Z"),
  a_matrix = matrix(1:4, nrow = 2, ncol = 2)
)

# doing funny (?) things with a list
# 1. second element of the nested list (a logical vector) mapped to strings
ifelse(funny_list[[1]][[2]], "up", "down")

# 2. calling the function stored in the list
funny_list$the_mean_function(1:5)

# 3. displaying and transposing the matrix stored in the list
funny_list$a_matrix
t(funny_list$a_matrix)

# adding a third column (containing the sums of the rows) to the matrix inside
# funny_list

# create the modified matrix
a_matrix_mod <- cbind(funny_list$a_matrix, rowSums(funny_list$a_matrix))
# make a copy of funny_list (in case we want the original list to stay unchanged)
funny_list_copy <- funny_list
# replace the matrix we want to change by the modified matrix
funny_list_copy$a_matrix <- a_matrix_mod
# display the modified list
funny_list_copy

# one-liner (note: this modifies the original list)
funny_list$a_matrix <- cbind(funny_list$a_matrix, rowSums(funny_list$a_matrix))

# adding a new element to a list
# The list has to exist before elements can be assigned into it: start from a
# copy of funny_list so that funny_list itself is left untouched.
funny_list_extended <- funny_list
# first option
funny_list_extended[["a_matrix_mod"]] <- a_matrix_mod
# second option
funny_list_extended$a_matrix_mod2 <- a_matrix_mod
# third option (note: here, the added element will not be named)
funny_list_extended[[length(funny_list_extended) + 1]] <- a_matrix_mod

### DATA FRAMES
# Building a data frame from four vectors of equal length; without explicit
# names, R derives the column names from the expressions themselves.
data.frame(
  1:5,
  c(FALSE, FALSE, TRUE, FALSE, FALSE),
  as.factor(c("cow", "ewe", "cow", "goat", "soy")),
  c(4, 5, 3, 3, 1)
)

# Note: a data frame looks like a matrix, but has the advantage of being able
# to store a different type of data in each column.

# Building a data frame with named columns
# (as for vectors and lists, the names can be given at creation time)
cheese <- data.frame(
  id = 1:5,
  cooked = c(FALSE, FALSE, TRUE, FALSE, FALSE),
  milk = factor(c("cow", "ewe", "cow", "goat", "soy")),
  strength = c(4, 5, 3, 3, 1)
)
# Row names are set the same way as for matrices.
row.names(cheese) <- c("Camembert", "Roquefort", "Comté", "Pélardon", "Tofu")


# Selecting a row by its row name (already seen with matrices)
cheese["Tofu", ]
# Selecting a column by its column name (already seen with matrices)
cheese[, "milk"]
# With or without the comma: not the same result!
cheese[, "milk"] # returns a vector (same as the command cheese$milk)
str(cheese[, "milk"])
cheese["milk"] # returns a one-column data frame
str(cheese["milk"])

mean(cheese[, "strength"]) # works: the argument is a numeric vector

# Does not work: the argument is a data frame, not a vector, so mean() returns
# NA with a warning.
mean(cheese["strength"])

# Extracting columns
# The 3 following commands return the same output
cheese[, "milk"]
cheese$milk
cheese[["milk"]] # data frames share similarities with lists

# Slicing a data frame with indices (same as for matrices)
cheese[c(1, 3), 2:3]

# Boolean subsetting to select rows
cheese[cheese$cooked, ]
cheese[cheese$cooked] # don't forget the comma! or you will subset the columns, not the rows

# Adding a column
new_col <- rnorm(nrow(cheese))
cbind(cheese, new_col)
cheese_mod <- cbind(cheese, saltness = rnorm(nrow(cheese)))
# Splitting the original columns around the new one places it in third position.
cheese_mod <- cbind(
  cheese[1:2],
  saltness = rnorm(nrow(cheese)),
  cheese[3:4]
)

# Contingency table with a total column and a total row
# as.data.frame.matrix() keeps the two-way layout returned by table()
contency_df <- as.data.frame.matrix(table(cheese$cooked, cheese$milk))
# Adding the column of row sums
# (contency_df[, "Total"] <- rowSums(contency_df) works as well)
contency_df[["Total"]] <- rowSums(contency_df)
# Adding the row of column sums
contency_df["Total", ] <- colSums(contency_df)

# Fastest way: addmargins() adds both margins at once (they are labelled "Sum")
contency_df <- as.data.frame.matrix(
  addmargins(table(cheese$cooked, cheese$milk))
)

# Playing with factors
# Inspect how the Income column of the `mi` table is currently stored.
# NOTE(review): `mi` comes from ../data/mi.csv; its column names and the exact
# Income labels are not visible here - confirm against the file.
str(mi$Income)
# If the column is not a factor with explicitly ordered levels, the classes
# don't follow a natural order in the plot
barplot(table(mi$Income))
# We can change that by choosing the order of the levels in the factor() function
# (careful: any value that is not listed in `levels` becomes NA)
mi$Income <- factor(mi$Income, levels = c("[0-1000]", "(1000-2000]", "(2000-3000]", "(3000-inf]"))
# Same plot again: the bars now follow the order given in `levels`
barplot(table(mi$Income))

# Another advantage of factors: applying a function to each group of a table
# Here: mean of cheese$strength computed separately for every level of cheese$milk
tapply(cheese$strength, cheese$milk, mean)

### DATA FRAME MANIPULATION
# importing new data
# The four count tables are read with the same settings: tab-separated, first
# line used as header, strings kept as character vectors (no factors).
read_counts <- function(path) {
  read.table(path, header = TRUE, sep = "\t", stringsAsFactors = FALSE)
}

counts_WT_1 <- read_counts("../data/WT_1.tsv")

counts_WT_2 <- read_counts("../data/WT_2.tsv")

counts_KO_1 <- read_counts("../data/KO_1.tsv")

counts_KO_2 <- read_counts("../data/KO_2.tsv")

# merging WT tables together (rows are matched on the "gene_name" column)
counts_WT <- merge(x = counts_WT_1, y = counts_WT_2, by = "gene_name")
head(counts_WT)
tail(counts_WT)

# merging KO tables together
counts_KO <- merge(x = counts_KO_1, y = counts_KO_2, by = "gene_name")
head(counts_KO, 3)

# merging WT & KO tables together
# all = FALSE: only the "gene_name" values found in both data frames are kept
counts_all <- merge(x = counts_WT, y = counts_KO, by = "gene_name", all = FALSE)

# computing the sequencing depth for the 4 samples
# (first column dropped: it holds the gene names, not counts)
colSums(counts_all[, -1])

# Alternative for tables with many columns (double transposition!!):
# norm_all <- cbind(counts_all[1], t(t(counts_all[-1])/colSums(counts_all[,-1])))
# head(norm_all)

norm_all <- counts_all # copying the original table of expression
# dividing each sample column by its total number of reads; the sample names
# are listed once so that every column goes through the same computation
sample_cols <- c("WT_1", "WT_2", "KO_1", "KO_2")
norm_all[sample_cols] <- lapply(
  norm_all[sample_cols],
  function(counts) counts / sum(counts)
)

# compute the normalized mean of the WT group: store it in a column called WT_mean
norm_all$WT_mean <- rowMeans(norm_all[, c("WT_1", "WT_2")])
# compute the normalized mean of the KO group: store it in a column called KO_mean
norm_all$KO_mean <- rowMeans(norm_all[, c("KO_1", "KO_2")])

# compute the fold change KO vs WT for each gene -> store it in a new column
# named FOLDCHANGE
# (a gene with WT_mean equal to 0 gets Inf, or NaN when KO_mean is 0 as well)
norm_all$FOLDCHANGE <- norm_all$KO_mean / norm_all$WT_mean

# the genes most up-regulated in the mutant
head(norm_all$gene_name[order(norm_all$FOLDCHANGE, decreasing = TRUE)], 20)

# the genes most down-regulated in the mutant
head(norm_all$gene_name[order(norm_all$FOLDCHANGE, decreasing = FALSE)], 20)

# whole table sorted from the highest to the lowest fold change
sorted_norm_all <- norm_all[order(norm_all$FOLDCHANGE, decreasing = TRUE), ]
head(sorted_norm_all, 20)

# log transform the fold change
sorted_norm_all$logfc <- log2(sorted_norm_all$FOLDCHANGE)
# add a column with up, down and not_deg status : "status"
# all genes with logfc> 1 -> "up"
# all genes with logfc< -1 -> "down"
# all the remaining genes -> "not_deg"
table(sorted_norm_all$logfc > 1)

# Genes without a defined logfc (e.g. 0 / 0 = NaN when a gene has no read in
# any sample) are tested first: comparing NaN/NA with a number gives NA, and
# ifelse() would then return NA instead of "not_deg" for these genes.
sorted_norm_all$status <- ifelse(
  is.na(sorted_norm_all$logfc), "not_deg",
  ifelse(sorted_norm_all$logfc > 1, "up",
         ifelse(sorted_norm_all$logfc < -1, "down", "not_deg")))

# saving the table (write.csv2: ";" as field separator, "," as decimal mark)
write.csv2(sorted_norm_all, file = "../data/sorted_norm_all.csv", quote = FALSE, row.names = FALSE)

# loading a library
library(ggplot2)