###### tags: `R` # R programming for image processing, file management, and data visualisation R code used to generate outcome demonstrated during knowledge hour presentation at GLC 21-04-2023 ## Set up directory ```r! # Input and output directories dir.C.drive <-"C:" dir.R.packages <- file.path(dir.C.drive,"R","R-4.1.3") dir.main <- file.path(dir.C.drive,"Google-Drive") dir.GLC <- file.path(dir.main,"Greenlight-clinical") dir.GLC.presentation <- file.path(dir.GLC,"Knowledge-hours-presentation") dir.GLC.data <- file.path(dir.GLC.presentation, "data-download") dir.GLC.P.drive <- file.path(dir.GLC,"P-drive") dir.eTMF <- file.path(dir.GLC.presentation,"eTMF") ``` ## Bring installed packages to memory ```r! library(jpeg, lib.loc = dir.R.packages) library(withr,lib.loc = dir.R.packages) library(dplyr, lib.loc = dir.R.packages) library(tidyr,lib.loc = dir.R.packages) library(purrr, lib.loc = dir.R.packages) library(foreach, lib.loc = dir.R.packages) library(iterators, lib.loc = dir.R.packages) library(doParallel, lib.loc = dir.R.packages) library(qrcode, lib.loc = dir.R.packages) library(backports, lib.loc = dir.R.packages) library(tzdb, lib.loc = dir.R.packages) library(ggplot2,lib.loc = dir.R.packages) library(crayon,lib.loc = dir.R.packages) library(tidyverse, lib.loc = dir.R.packages) library(farver, lib.loc = dir.R.packages) library(svglite,lib.loc = dir.R.packages) library(grid,lib.loc = dir.R.packages) library(labeling,lib.loc = dir.R.packages) library(magrittr,lib.loc = dir.R.packages) library(imager,lib.loc = dir.R.packages) library(patchwork,lib.loc = dir.R.packages) library(magick,lib.loc = dir.R.packages) library(forcats,lib.loc = dir.R.packages) library(cowplot,lib.loc = dir.R.packages) library(fs,lib.loc = dir.R.packages) library(data.tree,lib.loc = dir.R.packages) library(sketcher,lib.loc = dir.R.packages) library(rJava,lib.loc = dir.R.packages) library(xlsx,lib.loc = dir.R.packages) library(readxl,lib.loc = dir.R.packages) ``` ## List Excel sheets ```r! #----------------------- # List Excel sheets #----------------------- list.files(path = dir.GLC.data, pattern="*.xlsx", full.names = TRUE) # There are 115 sheets. All sheets matched source data file sheets.C200 <- readxl::excel_sheets(path=file.path(dir.GLC.data,"C-200-001_sdtm - Copy.xlsx")) #length(sheets.C200) #115 # Some sheet names not seen in the source data file sheets.invetus <- readxl::excel_sheets(path=file.path(dir.GLC.data,"AET P 20286 Collated Data - Copy.xlsx")) #length(sheets.invetus) 101 ``` ## Write sheets into Excel ```r! #------------------------ # Write sheets into Excel #------------------------ # Create a new Excel file with same sheets from "C-200-001_sdtm - Copy.xlsx" note <- data.frame(note="Client please use existing sheet names") new.excel.file.path <- file.path(dir.GLC.data,"C-300-001_sdtm.xlsx") # Write data into first sheet xlsx::write.xlsx(x=note ,file=new.excel.file.path ,sheetName=sheets.C200[1] ,append=FALSE) # Add second to last sheets to the same Excel lapply(X=2:length(sheets.C200) ,FUN=function(i) xlsx::write.xlsx(note ,file=new.excel.file.path ,sheetName=sheets.C200[[i]] ,append=TRUE)) ``` ## Create Gantt chart for employment history ```r! #------------------------------------------ # Create Gantt chart for employment history #------------------------------------------ # Input, output directory task.1.input.folder.path <- file.path(dir.GLC.presentation,"images-download") # dir.exists(task.1.input.folder.path) [1] TRUE task.1.output.folder.path <- file.path(dir.GLC.presentation,"gantt-charts") # dir.exists(task.1.output.folder.path) [1] TRUE # Set end date of current employment to today date.today <- as.character(Sys.Date()) # Create employment data # Reshape it to long format which start dates and end dates are replaced by two new variables- variable, value df <- as.data.frame(rbind( c("University of Queensland\n PhD student", '2016-04-05', '2020-05-11',"past") ,c("QIMR Berghofer Medical Research Institute\n R programmer", '2020-04-27', '2021-11-09',"past") ,c("Greenlight Clinical\n Statistical programmer", '2021-09-27', date.today,"current"))) %>% dplyr::select("task" = 1, "start" = 2, "end" = 3, "status"=4) %>% dplyr::mutate(task = forcats::fct_rev(factor(task, levels = (task))), start = as.Date(start), end = as.Date(end)) %>% dplyr::mutate(lab_pos = start) %>% tidyr::pivot_longer(2:3, names_to = "variable", values_to = "value") # dim(df) [1] 6 4 # Change size of plotting window to 10:5 width to height dev.new(width=10, height=5,noRStudioGD = TRUE) # Plot employment as a gantt chart with Y axis showing every year ## [Changing line colors with ggplot()](https://stackoverflow.com/questions/5171263/changing-line-colors-with-ggplot) gantt.chart <- ggplot2::ggplot(df, aes(x=value, y=task, color=status)) + ggplot2::geom_line(linewidth = 20, show.legend = FALSE) + # To modify color, use ggplot2::scale_color_manual(values=). values matched to color variable values current, past ggplot2::scale_color_manual(values=c("red","grey50"))+ ggplot2::scale_x_date( labels = function(x) lubridate::year(x) ,date_breaks = '1 year' ,date_minor_breaks = "1 year" ,position = "top") + ggplot2::labs(x = NULL,y = NULL) + ggplot2::theme( axis.text.x=element_text(size=10) ,axis.text.y=element_text(size=12)) # Save employment gantt chart as a png file ## plot size needs to match those in dev.new() ggplot2::ggsave( filename = file.path( task.1.output.folder.path,"employment-history.png") ,width = 10 ,height = 5 ,plot= print(gantt.chart) # ,device = "png") # Read logo image files logo.bash <- magick::image_read(file.path(task.1.input.folder.path, "BASH_full_colored_dark.jpg")) # class(logo.bash) [1] "magick-image" logo.R <- magick::image_read(file.path(task.1.input.folder.path,"R-logo-Cluster-Analysis-with-R-Edureka.png")) logo.SAS <- magick::image_read(file.path(task.1.input.folder.path,"SAS-logo.png")) # Set base x y positions of the plot where logo images are added to base.position.x.group.1 <- 0.475 base.position.y.group.1 <- 0.975 base.position.x.group.2 <- base.position.x.group.1+0.325 base.position.y.group.2 <- base.position.y.group.1-0.325 base.position.x.group.3 <- base.position.x.group.2+0.15 base.position.y.group.3 <- base.position.y.group.2-0.25 # Introduction to cowplot # https://cran.r-project.org/web/packages/cowplot/vignettes/introduction.html gantt.chart.final <- cowplot::ggdraw(gantt.chart) + # Add logo images to employment 1 bar cowplot::draw_image(logo.bash ,x = base.position.x.group.1 ,y = base.position.y.group.1 ,hjust = 1, vjust = 1, width = 0.1, height = 0.2) + cowplot::draw_image(logo.R ,x = base.position.x.group.1 + 0.1 ,y = base.position.y.group.1 - 0.04 ,hjust = 1, vjust = 1, width = 0.1, height = 0.1) + cowplot::draw_image(logo.SAS ,x = base.position.x.group.1+0.2 ,y = base.position.y.group.1 ,hjust = 1, vjust = 1, width = 0.1, height = 0.2)+ # Add logo images to employment 2 bar cowplot::draw_image(logo.R ,x = base.position.x.group.2 ,y = base.position.y.group.2 ,hjust = 1, vjust = 1, width = 0.1, height = 0.1)+ cowplot::draw_image(logo.bash ,x = base.position.x.group.2+ 0.1 ,y = base.position.y.group.2+0.05 ,hjust = 1, vjust = 1, width = 0.1, height = 0.2)+ # Add logo images to employment 3 bar cowplot::draw_image(logo.SAS ,x = base.position.x.group.3 ,y = base.position.y.group.3 ,hjust = 1, vjust = 1, width = 0.1, height = 0.2) # Save the plot as a png file # Sizes of exported image should be set to be similar to the plotting window, modified by dev.new(width=, height=) ggplot2::ggsave( filename = file.path(task.1.output.folder.path,"employment-history_programming-software-added.png") ,width = 10 ,height = 5 ,plot= print(gantt.chart.final) # ,device = "png") ``` ## Find and copy image file types ```r! #-------------------------------------------------- # Find and copy image file types #-------------------------------------------------- # Input, output directory task.2.input.folder.path <- file.path(dir.GLC.presentation,"images-download") task.2.output.folder.path <- file.path(task.2.input.folder.path,"images-only") # dir.exists(task.2.output.folder.path) #dir.create(task.2.output.folder.path) image.files.want.source <- list.files( path = task.2.input.folder.path ,pattern = "\\.png$|\\.jpg$" ,full.names = TRUE) # length(image.files.want.source) 10 image.files.want.destination <- file.path(task.2.output.folder.path ,basename(image.files.want.source)) # Copy files. Destination files will have the same date times as source files file.copy(from = image.files.want.source ,to = image.files.want.destination ,copy.date = TRUE) # Copy files. Destination files will have the date times when copy is performed ## Delete copied files before running this code file.copy(from = image.files.want.source ,to = image.files.want.destination ,copy.date = FALSE) ``` ## List folders as a tree ```r! #----------------------- # List folders as a tree #----------------------- # Input directories dir.tmf <- file.path(dir.GLC.P.drive,"BIOSTATISTICS","Gina","tmf","Numbered") dir.tmf.short <- file.path(dir.tmf, "Short_Names") # dir.exists(dir.tmf.short) dir.tmf.long.stat <- file.path(dir.tmf,"Long_Names","Statistics") # dir.exists(dir.tmf.long.stat) # List directories as a tree fs::dir_tree(path = dir.tmf.short, recurse = TRUE) fs::dir_tree(path = dir.tmf.long.stat, recurse = TRUE) ``` ## Create folders using existing folders ```r! #-------------------------------------- # Create folders using existing folders #-------------------------------------- # Get folder directories within dir.tmf.short ## If we set, recursive = TRUE, the directories are returned in hierarchical order, beginning with the main directory path and then followed by the sub-directories. However, if we specify recursive = FALSE, only the child directories are returned folder.paths.tmf.short.child <- list.dirs(path = dir.tmf.short ,full.names = TRUE ,recursive = FALSE) # length(folder.paths.tmf.short.child) 6 # Get child, grandchild directories folder.paths.tmf.short.grandchild <- list.dirs(path=folder.paths.tmf.short.child ,full.names = TRUE) # length(folder.paths.tmf.short.grandchild) 31 # Copy source trial master file folders to destination # Create a new parent directory dir.tmf.new.short <- file.path(dir.GLC.presentation,"trial-master-files","numbered","short-named") dir.create(path = dir.tmf.new.short, recursive = TRUE) # Get child and grandchild folder names folder.names.tmf.short.grandchild <- folder.paths.tmf.short.grandchild %>% # Remove parent directory gsub(pattern="C:/Google-Drive/Greenlight-clinical/P-drive/BIOSTATISTICS/Gina/tmf/Numbered/Short_Names", replacement="") # Create new destination folder paths dest.folder.paths.tm.short <- paste0(dir.tmf.new.short, folder.names.tmf.short.grandchild) # length(dest.folder.paths.tm.short) 31 unlist(lapply(X=dest.folder.paths.tm.short ,FUN=function(x) dir.create(x, recursive = TRUE)) ) ``` ## Create folders using eTMF reference model file ```r! #----------------------------------------------- # Create folders using eTMF reference model file #----------------------------------------------- eTMF <- readxl::read_excel(path = file.path(dir.GLC.data,"tmf-reference-model-version-3-0_16-june-20153.xlsx") ,sheet = "v3.0" ,skip = 2 ,col_names = TRUE) %>% # Select columns wanted by position dplyr::select(c(1:6)) # dim(eTMF) 249 6 colnames(eTMF) <- colnames(eTMF) %>% gsub(pattern=" #", replacement="_number") %>% gsub(pattern=" ", replacement="_") # Select Zone_Name="Statistics" eTMF.stat <- eTMF %>% dplyr::filter(Zone_Name=="Statistics") %>% # Remove dots in *_number columns dplyr::mutate(Section_number=gsub(pattern="\\.", replacement="",x=Section_number) ,Artifact_number=gsub(pattern="\\.", replacement="",x=Artifact_number) ,Zone_number_name=paste0(Zone_number,"_",Zone_Name) ,Section_number_name=paste0(Section_number,"_",Section_Name) ,Artifact_number_name=paste0(Artifact_number,"_",Artifact_name) ,folder.path=file.path(dir.eTMF, Zone_number_name, Section_number_name,Artifact_number_name)) # dim(eTMF.stat) 25 8 # Create new folders using the folder.path column unlist(lapply(X=eTMF.stat$folder.path ,FUN=function(x) dir.create(x, recursive = TRUE)) ) # List directories as a tree fs::dir_tree(path = file.path(dir.GLC.presentation,"eTMF","11_Statistics") ,recurse = TRUE) ``` ## Reduce file size of a single image ```r! #-------------------------------------------------- # Reduce file size of a single image #-------------------------------------------------- task.3.input.folder.path <- file.path(dir.main, "Receipts-meals-not-claimed-yet") # dir.exists(task.3.input.folder.path) task.3.output.folder.path <- file.path(dir.GLC.presentation,"reduced_images") # dir.exists(task.3.output.folder.path) file.paths.jpg <- list.files(path=task.3.input.folder.path ,pattern = "*.jpg" ,full.names = TRUE) # length(file.paths.jpg) 98 # Reduce 1 jpg file img <- jpeg::readJPEG(source=file.paths.jpg[2] , native = TRUE) # class(img) [1] "nativeRaster" # Export reduced object as a jpg file output.file.path <-paste0(task.3.output.folder.path,"/small_",basename(file.paths.jpg[2])) jpeg::writeJPEG(image=img ,target = output.file.path , quality = 0.5) ``` ## Reduce file size of multiple image image files ```r! #-------------------------------------------------- # Reduce file size of multiple image image files #-------------------------------------------------- # cl <- parallel::makeCluster(parallel::detectCores()-1 # ,outfile=file.path(task.3.output.folder.path # ,paste0("image-reduction_error-log.txt") # ) # ) # Use all detected cores minus 1 core (CPU) as a cluster, leaving 1 core for software or app running in the background in case desktop crashes (monitor turns black and nothing working) # doParallel::registerDoParallel(cl, cores = parallel::detectCores()-1) # Loop thru all jpg images to reduce file sizes # Check existence of input folder if(dir.exists(task.3.input.folder.path)!=TRUE){ cat("Input image folder could not be found") } else{ # Read input image file paths as a vector image.file.paths <- list.files(path = task.3.input.folder.path ,full.names = TRUE ,pattern = "\\.jpg$") # length(image.file.paths) 99 cat("There are",length(image.file.paths),"images in the folder") # Use image file name to look up data for the image in the cell segmentation data foreach::foreach(i=1:length(image.file.paths) ,.combine = 'c' ,.packages=c("foreach","dplyr","jpeg")) %dopar% { print(paste0("=========== Processing image ",i,"============")) # If data can be found, then subset. If data cannot be found, then copy the input image to the output image.file.path <-image.file.paths[i] image.file.name <-basename(image.file.path) # Name output file output.file.path <-paste0(task.3.output.folder.path,"/small_", image.file.name) # Read jpg file img <- jpeg::readJPEG(source=image.file.path, native = TRUE) # class(img) [1] "nativeRaster" # Reduce jpg file size and output jpg file jpeg::writeJPEG(image=img ,target = output.file.path , quality = 0.5) } } # Calculate % reduction total.file.size.original <- 339 total.file.size.reduced <- 59.2 (total.file.size.original-total.file.size.reduced)/total.file.size.original*100 #[1] 82.53687 ``` ## Google map link to a single QR code ```r! #--------------------------------------------- # Google map link to a single QR code #--------------------------------------------- url <- "http://www.google.com/maps/place/49.46800006494457,17.11514008755796" # A function to plot QR code in changeable color ggQRCode <- function(text, color="black", alpha=1) { x <- qr_code(text) x <- as.data.frame(x) y <- x y$id <- rownames(y) y <- gather(y, "key", "value", colnames(y)[-ncol(y)]) y$key = factor(y$key, levels=rev(colnames(x))) y$id = factor(y$id, levels=rev(rownames(x))) ggplot(y, aes_(x=~id, y=~key)) + geom_tile(aes_(fill=~value), alpha=alpha) + scale_fill_manual(values=c("white", color)) + # scale_fill_gradient(low="white", high=color) + theme_void() + theme(legend.position='none') } # QR code in black plot.new() QR.black <- ggQRCode(url,color="black") plot(QR.black) # QR code in a different color QR.blue <- ggQRCode(url,color="#56B4E9") plot(QR.blue) ``` ## Brisbane bike path latitude and longitude to QR codes ```r! #------------------------------------------- # Brisbane bike path latitude and longitude to QR codes #------------------------------------------- task.32.input.folder.path <- file.path(dir.GLC.presentation, "data-download") task.32.output.folder.path <- file.path(dir.GLC.presentation, "QR-codes") # Read csv files bikeway <- read.delim(file=file.path(task.32.input.folder.path,"open-data-bikeway-count-sites-2004-2021.csv") ,header = TRUE ,sep = ",")[,c(1:3,5,6)] # dim(bikeway) 183 5 # Rename columns colnames(bikeway) <- c("Site.ID","Parent.Site.ID","Site.Name","Latitude","Longitude") # Create a new column with Google maps URLs bikeway.2 <- bikeway %>% dplyr::mutate(Google.map.link=paste0("http://www.google.com/maps/place/",Latitude,",",Longitude)) # Create output file paths task.32.output.file.paths <- file.path(task.32.output.folder.path, bikeway.2$Site.ID,".svg") # length(task.32.output.file.paths) 183 # Loop thru all jpg images to reduce file sizes # Check existence of input folder if(dir.exists(task.32.output.folder.path)!=TRUE){ cat("Output folder could not be found") } else{ # Use image file name to look up data for the image in the cell segmentation data foreach::foreach(i=1:length(bikeway.2$Site.ID) ,.combine = 'c' ,.packages=c("foreach","dplyr","ggplot2","qrcode","svglite","grid")) %dopar% { print(paste0("=========== Creating QR code ",i,"============")) # Get Google map URL url <-bikeway.2$Google.map.link[i] # Name output file output.file.path <- paste0(task.32.output.folder.path,"/", bikeway.2$Site.ID[i],".svg") # Create QR code QR.code <- ggQRCode(url, color="#56B4E9") # Add label to plot QR.code.2 <- QR.code+ggplot2::annotation_custom(grid::textGrob(bikeway.2$Site.Name[i]) ,xmin = -Inf ,xmax = Inf ,ymin = -Inf ,ymax = Inf) # Save QR code as .svg file ggplot2::ggsave( filename = output.file.path ,plot= print(QR.code.2) ,device = "svg") } } ```