#Author Firstname Lastname #Created DDMMYYYY, last updated DDMMYYYY (as a substitute to versioning) # This script is for B3 course 2024, "using a Rscript" part, taught by VB. # Goal of this part is to: # - get acquainted with RStudio # - set your working environment # - load a dataset & get acquainted with it (ncol, nrow, how is data organized) # - subset this dataset to only what you are interested in # - plot the variables from this subsetted dataset using ggplot2 # - save what you did ################################################# ## Intro - why (R) scripts, why/how of Rstudio ## ################################################ # see https://hub-courses.pages.pasteur.fr/refresher_utilities_hts/R_script_practice.html #################################### ## 0- set your working environment # #################################### ## identify where you are #you shall be in a directory called "Rpractical": this is going to be your working directory getwd() #pwd equivalent #in this directory, you shall have one file: the Rpractice R script we are looking at :) list.files() #ls equivalent #if you are not in Rpractical, go there :) either using setwd() or through the file navigator in the lower pane. # think about sync the wdirs: in lower right Rstudio pane, Files > gear > synchronize working directories #your working directory is your actual work environment, it needs to be properly structured! #minimal example # |--input_data #all input data (and metadata) used in the analysis. # |-- data.tsv <- meh naming... # |-- tpm_exp1_PapainStim_20210519.tsv <- great naming # |--plots #figures generated by the analysis. # |-- test.png <- meh naming... # |-- fig1_scatterplot_MI_heart_temp.png # |-- output_data #all intermediate or output data produced during the analysis, eg tables. # |-- docs #all other non-input-data: manuscript, pptx file with schme of experimental design, ... # |-- script.R #if >1 script, number them and organize them into dedicated folder #more on that ? https://www.britishecologicalsociety.org/wp-content/uploads/2017/12/guide-to-reproducible-code.pdf # many solutions exist, none is perfect x much dependent on one's preferences - it is however key to be consistent #create the "input_data", "plots" and "output_data" folders #create them at your will: # - create these through the file navigator # - use mkdir in terminal # - do it the R way, as show below: dir.create("./input_data") dir.create("./output_data") dir.create("./plots") ########################################### ## 1- load & explore your dataset ## ########################################### ### you are going to work on MI.csv, to be found at https://hub-courses.pages.pasteur.fr/R_pasteur_phd/data/mi.csv ## Q: where shall MI.csv be stored ? --> input_data #several options: #download the file, copy it to input_data using either unix terminal or file navigator #R solution: #assign this URL to a new variable, "DatasetUrl" DatasetUrl <- "https://hub-courses.pages.pasteur.fr/R_pasteur_phd/data/mi.csv" download.file(url = DatasetUrl, destfile = "input_data/MI.csv") #rq: not compulsory to assign to variable, but good practice (useful if many URLs!) #check you indeed have MI.csv where it should be: #list.files("input_data/") #load the MI.csv dataset using read.table() function MIdataset <- read.table(file="input_data/MI.csv", sep = ",", header = TRUE) #read.table(): swiss-army knife of read delims functions (read.csv, read.csv2, read.delim,..) #note: read.table can also work from URL #MIdataset <- read.table(file=DatasetUrl, sep = ",", header = TRUE) #get (minimally) acquainted to MIdataset / sanity check #how many rows vs columns does it contains ? check consistency with what you see in environment pane dim(MIdataset) #have an "interactive" view of it: View(MIdataset) #note its organisation: rows as individuals (observations), columns as variables (features) #closer look at the variables colnames(MIdataset) ########################################## ## 2- subset your dataset ## ########################################## ## let's create a subset of the MIdataset: # - limited to non-Cannabis users, # - containing only the following variables: Sex, HeartRate, Temperature #how does the dataset split between cannabis users vs non-users ? table(MIdataset$UsesCannabis) ## check possible levels for the Cannabis variable / how it is encoded again: unique(MIdataset$UsesCannabis) #to do so, we will use the subset() function of base R (built-in) #alternative approaches: bracket-based (base R), tidyverse #for the record: bracket solution #MIdataset[MIdataset$UsesCannabis == "No",c("Sex","HeartRate","Temperature")] #how do we use subset, again ? --> check its vignette ?subset() MIsubset <- subset(x = MIdataset, subset = UsesCannabis == "No", #logical expression select = c(Sex,HeartRate,Temperature)) #vector of columns of interest #can we change the logical expression to get the exact same result ? #!(UsesCannabis == "Yes") means the exact same #More complex selections: #UsesCannabis == "No" & HadMeasles == "Yes" #UsesCannabis == "No" & (HadMeasles == "Yes" | HadChickenPox == "Yes") #sanity check - are the dims of MIsubset consistent with our expectations? dim(MIsubset) #save this subsetted dataset #as a table: write.table(MIsubset,file = "output_data/MIsubset.tsv", quote = FALSE, sep = "\t", col.names = TRUE) ###ADVANCED: RDa, RDS, and why you should prefer the latter - /!\ DO NOT RUN /!\ #what if this were not a table, but a huge R object (eg Seurat object, prob matrix, ..)? #two options: RDA and RDS #look at the structures: #Rda objects are reopened "as such" vs. Rds objects are assigned to var name #/!\ high risk of overwriting var names with Rda /!\ (eg, if you had subsetted MI another way) ## RDS: #saveRDS(MIsubset,file = "output_data/MIsubset.rds") #save #MIsubsetv3 <- readRDS(file = "output_data/MIsubset.rds") #plug it back to R ## RDA: #save(MIsubset,file = "output_data/MIsubset.rda") #save #load(file = "output_data/MyBigMess.rda") #plug it back to R #list.files("output_data/") ########################################## ## 3- ggplot the subsetted dataset # ########################################## #we are going to plot heart rate as a function of temperature using the ggplot2 package/library #a few refs on ggplot2: #https://rpubs.com/hadley/ggplot2-layers #many adds-on to ggplot2: https://exts.ggplot2.tidyverse.org/gallery/ #cheat-sheets: https://rstudio.github.io/cheatsheets/html/data-visualization.html #contrary to eg subset(), ggplot2() is not a base R utils, it needs to be #1) installed #2) loaded (when you need it) #package installation can be done through several ways: # very common package, found on CRAN #install.packages("packagename") #if on the CRAN # something a bit exotic/still under dev, on the github/bitbucket/etc of the developer #devtools::install_github("johndoe/usefulpackagenotinCRAN/") #should the repo URL be https://github.com/johndoe/usefulpackagenotinCRAN/ # for Bioconductor packages - Bioconductor has its own installation manager, BiocManager #BiocManager::install(c("GenomicFeatures", "AnnotationDbi")) #here ggplot2 is already installed, let's simply load it: library(ggplot2) #### Understanding ggplot2 reasoning: #ggplot works as successive layers: data, # "aesthetics" (aes): how variables are mapped to visual properties or aesthetics. # "geometrics" (geoms): the type of plot to be created: line plot, scatter plot, ... # "statistics" (stat): transforms the data (+/- summarizing them, eg, means). Some overlap w geoms. # "coordinates" (coord): coordinate system (flipped or native, cartesian == zoomed) # and more: labs, theme, ... #cheatsheet is your best friend! #let's go for: #a scatter plot #heart rate vs temperature, #highlighting means heart rate value per temperature level (doing that using stat_summary or geom_point) #with a line trend #male vs female as distinct plots (facetting), by sex, with a title ggplot(data = MIsubset) + aes(x = Temperature, y = HeartRate) + geom_point(colour = "#454545", size = 1) + stat_summary(geom = "point", colour = "purple", fun = "mean", size = 3 , shape = "triangle") + # geom_point(stat = "summary", fun = "mean", colour = "yellow",size=2) + #an equivalent to what we did for stat_summary stat_smooth(method = "lm", se = FALSE, colour = "darkgrey") + facet_wrap(facets = "Sex") + #coord_cartesian(ylim = c(36, 36.5)) + #coord_flip() labs(title = "my title", subtitle = "Body temp vs heart rate, split by sex", caption = "(based on data from MI cohort)") ## happy ? assign then save MyPlot <- ggplot(data = MIsubset) + aes(x = Temperature, y = HeartRate) + geom_point(colour = "#454545", size = 1) + stat_summary(geom = "point", colour = "purple", fun = "mean", size = 3 , shape = "triangle") + stat_smooth(method = "lm", se = FALSE, colour = "darkgrey") + facet_wrap(facets = "Sex") + labs(title = "my title", subtitle = "Body temp vs heart rate, by sex", caption = "(based on data from MI cohort)") ## save the plot ggsave("fig1_scatterplot_MI_heart_temp.png", MyPlot, device = "png", path = "plots/") #And that's all! #sessionInfo()