#Author Firstname Lastname
#Created DDMMYYYY, last updated DDMMYYYY (as a substitute to versioning)

# This script is for B3 course 2024, "using a Rscript" part, taught by VB. 
# Goal of this part is to:
# - get acquainted with RStudio
# - set your working environment
# - load a dataset & get acquainted with it (ncol, nrow, how is data organized) 
# - subset this dataset to only what you are interested in
# - plot the variables from this subsetted dataset using ggplot2
# - save what you did

#################################################
## Intro - why (R) scripts, why/how of Rstudio ##
################################################
# see https://hub-courses.pages.pasteur.fr/refresher_utilities_hts/R_script_practice.html

####################################
## 0- set your working environment #
####################################
## identify where you are
#you shall be in a directory called "Rpractical": this is going to be your working directory
getwd() #pwd equivalent

#in this directory, you shall have one file: the Rpractice R script we are looking at :)
list.files() #ls equivalent

#if you are not in Rpractical, go there :) either using setwd() or through the file navigator in the lower pane.
# think about sync the wdirs: in lower right Rstudio pane, Files > gear > synchronize working directories 

#your working directory is your actual work environment, it needs to be properly structured!
#minimal example
# |--input_data               #all input data (and metadata) used in the analysis.
#   |-- data.tsv <- meh naming...
#   |-- tpm_exp1_PapainStim_20210519.tsv <- great naming
# |--plots                    #figures generated by the analysis.
#   |-- test.png <- meh naming...
#   |-- fig1_scatterplot_MI_heart_temp.png
# |-- output_data             #all intermediate or output data produced during the analysis, eg tables. 
# |-- docs                    #all other non-input-data: manuscript, pptx file with schme of experimental design, ...
# |-- script.R                #if >1 script, number them and organize them into dedicated folder

#more on that ? https://www.britishecologicalsociety.org/wp-content/uploads/2017/12/guide-to-reproducible-code.pdf
# many solutions exist, none is perfect x much dependent on one's preferences - it is however key to be consistent 


#create the "input_data", "plots" and "output_data" folders
#create them at your will:
# - create these through the file navigator
# - use mkdir in terminal
# - do it the R way, as show below:
dir.create("./input_data")
dir.create("./output_data")
dir.create("./plots")

###########################################
## 1- load & explore your dataset        ##
###########################################

### you are going to work on MI.csv, to be found at https://hub-courses.pages.pasteur.fr/R_pasteur_phd/data/mi.csv 
## Q: where shall MI.csv be stored ? --> input_data

#several options:
#download the file, copy it to input_data using either unix terminal or file navigator
#R solution: 
#assign this URL to a new variable, "DatasetUrl"
DatasetUrl <- "https://hub-courses.pages.pasteur.fr/R_pasteur_phd/data/mi.csv"
download.file(url = DatasetUrl,
              destfile = "input_data/MI.csv")
#rq: not compulsory to assign to variable, but good practice (useful if many URLs!)

#check you indeed have MI.csv where it should be:
#list.files("input_data/")


#load the MI.csv dataset using read.table() function
MIdataset <- read.table(file="input_data/MI.csv", sep = ",", header = TRUE)
#read.table(): swiss-army knife of read delims functions (read.csv, read.csv2, read.delim,..)
#note: read.table can also work from URL 
#MIdataset <- read.table(file=DatasetUrl, sep = ",", header = TRUE)

#get (minimally) acquainted to MIdataset / sanity check
#how many rows vs columns does it contains ? check consistency with what you see in environment pane
dim(MIdataset)

#have an "interactive" view of it:
View(MIdataset)

#note its organisation: rows as individuals (observations), columns as variables (features)

#closer look at the variables
colnames(MIdataset)


##########################################
## 2- subset your dataset               ##
##########################################

## let's create a subset of the MIdataset:
# - limited to non-Cannabis users,
# - containing only the following variables: Sex, HeartRate, Temperature

#how does the dataset split between cannabis users vs non-users ?
table(MIdataset$UsesCannabis)

## check possible levels for the Cannabis variable / how it is encoded again:
unique(MIdataset$UsesCannabis)

#to do so, we will use the subset() function of base R (built-in)
#alternative approaches: bracket-based (base R), tidyverse
#for the record: bracket solution
#MIdataset[MIdataset$UsesCannabis == "No",c("Sex","HeartRate","Temperature")]

#how do we use subset, again ? --> check its vignette
?subset()

MIsubset <- subset(x = MIdataset,
                  subset = UsesCannabis == "No", #logical expression
                  select = c(Sex,HeartRate,Temperature)) #vector of columns of interest

#can we change the logical expression to get the exact same result ? 
#!(UsesCannabis == "Yes") means the exact same 

#More complex selections: 
#UsesCannabis == "No" & HadMeasles == "Yes"
#UsesCannabis == "No" & (HadMeasles == "Yes" | HadChickenPox == "Yes")


#sanity check - are the dims of MIsubset consistent with our expectations?
dim(MIsubset)

#save this subsetted dataset
#as a table:
write.table(MIsubset,file = "output_data/MIsubset.tsv", quote = FALSE, sep = "\t", col.names = TRUE)

###ADVANCED: RDa, RDS, and why you should prefer the latter - /!\ DO NOT RUN /!\ 
#what if this were not a table, but a huge R object (eg Seurat object, prob matrix, ..)? 
#two options: RDA and RDS
#look at the structures:
#Rda objects are reopened "as such" vs. Rds objects are assigned to var name
#/!\ high risk of overwriting var names with Rda /!\ (eg, if you had subsetted MI another way)

## RDS: 
#saveRDS(MIsubset,file = "output_data/MIsubset.rds") #save
#MIsubsetv3 <- readRDS(file = "output_data/MIsubset.rds") #plug it back to R

## RDA:
#save(MIsubset,file = "output_data/MIsubset.rda") #save
#load(file = "output_data/MyBigMess.rda")  #plug it back to R

#list.files("output_data/")


##########################################
## 3- ggplot the subsetted dataset       #
##########################################

#we are going to plot heart rate as a function of temperature using the ggplot2 package/library
#a few refs on ggplot2:
#https://rpubs.com/hadley/ggplot2-layers
#many adds-on to ggplot2: https://exts.ggplot2.tidyverse.org/gallery/
#cheat-sheets: https://rstudio.github.io/cheatsheets/html/data-visualization.html


#contrary to eg subset(), ggplot2() is not a base R utils, it needs to be
#1) installed
#2) loaded (when you need it)

#package installation can be done through several ways:
  # very common package, found on CRAN
    #install.packages("packagename") #if on the CRAN
  # something a bit exotic/still under dev, on the github/bitbucket/etc of the developer
    #devtools::install_github("johndoe/usefulpackagenotinCRAN/") #should the repo URL be https://github.com/johndoe/usefulpackagenotinCRAN/
  # for Bioconductor packages - Bioconductor has its own installation manager, BiocManager
    #BiocManager::install(c("GenomicFeatures", "AnnotationDbi"))

#here ggplot2 is already installed, let's simply load it:
library(ggplot2)

#### Understanding ggplot2 reasoning:

#ggplot works as successive layers: data, 
# "aesthetics" (aes): how variables are mapped to visual properties or aesthetics.
# "geometrics" (geoms): the type of plot to be created: line plot, scatter plot, ...
# "statistics" (stat): transforms the data (+/- summarizing them, eg, means). Some overlap w geoms.
# "coordinates" (coord): coordinate system (flipped or native, cartesian == zoomed)
# and more: labs, theme, ...
#cheatsheet is your best friend!

#let's go for:
#a scatter plot 
#heart rate vs temperature,
#highlighting means heart rate value per temperature level (doing that using stat_summary or geom_point)  
#with a line trend
#male vs female as distinct plots (facetting), by sex, with a title

ggplot(data = MIsubset) +
      aes(x = Temperature, y = HeartRate) + 
      geom_point(colour = "#454545", size = 1) +
      stat_summary(geom = "point", colour = "purple", fun = "mean", size = 3 , shape = "triangle") +
#      geom_point(stat = "summary", fun = "mean", colour = "yellow",size=2) + #an equivalent to what we did for stat_summary
      stat_smooth(method = "lm", se = FALSE, colour = "darkgrey") +
      facet_wrap(facets = "Sex") +
      #coord_cartesian(ylim = c(36, 36.5)) + #coord_flip()
      labs(title = "my title", 
            subtitle = "Body temp vs heart rate, split by sex", 
            caption = "(based on data from MI cohort)")

## happy ? assign then save

MyPlot <- ggplot(data = MIsubset) +
                aes(x = Temperature, y = HeartRate) + 
                geom_point(colour = "#454545", size = 1) +
                stat_summary(geom = "point", colour = "purple", fun = "mean", size = 3 , shape = "triangle") +
                stat_smooth(method = "lm", se = FALSE, colour = "darkgrey") +
                facet_wrap(facets = "Sex") +
                labs(title = "my title", 
                subtitle = "Body temp vs heart rate, by sex", 
                caption = "(based on data from MI cohort)")

## save the plot
ggsave("fig1_scatterplot_MI_heart_temp.png", MyPlot, device = "png", path = "plots/")


#And that's all!
#sessionInfo()