Try   HackMD
tags: R

R programming for image processing, file management, and data visualisation

R code used to generate outcome demonstrated during knowledge hour presentation at GLC 21-04-2023

Set up directory

# Input and output directories
dir.C.drive <-"C:"
dir.R.packages <- file.path(dir.C.drive,"R","R-4.1.3")
dir.main <- file.path(dir.C.drive,"Google-Drive")
dir.GLC <- file.path(dir.main,"Greenlight-clinical")
dir.GLC.presentation <- file.path(dir.GLC,"Knowledge-hours-presentation")
dir.GLC.data <- file.path(dir.GLC.presentation, "data-download")
dir.GLC.P.drive <- file.path(dir.GLC,"P-drive")
dir.eTMF <- file.path(dir.GLC.presentation,"eTMF")

Bring installed packages to memory

library(jpeg, lib.loc = dir.R.packages)
library(withr,lib.loc = dir.R.packages)
library(dplyr, lib.loc = dir.R.packages)
library(tidyr,lib.loc = dir.R.packages)
library(purrr, lib.loc = dir.R.packages)
library(foreach, lib.loc = dir.R.packages)
library(iterators, lib.loc = dir.R.packages)
library(doParallel, lib.loc = dir.R.packages)
library(qrcode, lib.loc = dir.R.packages)
library(backports, lib.loc = dir.R.packages)
library(tzdb, lib.loc = dir.R.packages)
library(ggplot2,lib.loc = dir.R.packages)
library(crayon,lib.loc = dir.R.packages)
library(tidyverse, lib.loc = dir.R.packages)
library(farver, lib.loc = dir.R.packages)
library(svglite,lib.loc = dir.R.packages)
library(grid,lib.loc = dir.R.packages)
library(labeling,lib.loc = dir.R.packages)
library(magrittr,lib.loc = dir.R.packages)
library(imager,lib.loc = dir.R.packages)
library(patchwork,lib.loc = dir.R.packages)
library(magick,lib.loc = dir.R.packages)
library(forcats,lib.loc = dir.R.packages)
library(cowplot,lib.loc = dir.R.packages)
library(fs,lib.loc = dir.R.packages)
library(data.tree,lib.loc = dir.R.packages)
library(sketcher,lib.loc = dir.R.packages)
library(rJava,lib.loc = dir.R.packages)
library(xlsx,lib.loc = dir.R.packages)
library(readxl,lib.loc = dir.R.packages)

List Excel sheets

#-----------------------
# List Excel sheets
#-----------------------
list.files(path = dir.GLC.data, pattern="*.xlsx", full.names = TRUE)

# There are 115 sheets. All sheets matched source data file
sheets.C200 <- readxl::excel_sheets(path=file.path(dir.GLC.data,"C-200-001_sdtm - Copy.xlsx"))
#length(sheets.C200) #115

# Some sheet names not seen in the source data file
sheets.invetus <- readxl::excel_sheets(path=file.path(dir.GLC.data,"AET P 20286 Collated Data - Copy.xlsx"))
#length(sheets.invetus) 101

Write sheets into Excel

#------------------------
# Write sheets into Excel
#------------------------
# Create a new Excel file with same sheets from "C-200-001_sdtm - Copy.xlsx"
note <- data.frame(note="Client please use existing sheet names")
new.excel.file.path <- file.path(dir.GLC.data,"C-300-001_sdtm.xlsx")

# Write data into first sheet
xlsx::write.xlsx(x=note
                ,file=new.excel.file.path
                ,sheetName=sheets.C200[1]
                ,append=FALSE)

# Add second to last sheets to the same Excel
lapply(X=2:length(sheets.C200)
       ,FUN=function(i) xlsx::write.xlsx(note
                                         ,file=new.excel.file.path
                                         ,sheetName=sheets.C200[[i]]
                                         ,append=TRUE))

Create Gantt chart for employment history

#------------------------------------------
# Create Gantt chart for employment history 
#------------------------------------------

# Input, output directory
task.1.input.folder.path <- file.path(dir.GLC.presentation,"images-download") # dir.exists(task.1.input.folder.path) [1] TRUE
task.1.output.folder.path <- file.path(dir.GLC.presentation,"gantt-charts") # dir.exists(task.1.output.folder.path) [1] TRUE

# Set end date of current employment to today
date.today <- as.character(Sys.Date()) 

# Create employment data
# Reshape it to long format which start dates and end dates are replaced by two new variables- variable, value
df <- as.data.frame(rbind(
  c("University of Queensland\n PhD student", '2016-04-05', '2020-05-11',"past")
  ,c("QIMR Berghofer Medical Research Institute\n R programmer", '2020-04-27', '2021-11-09',"past")
  ,c("Greenlight Clinical\n Statistical programmer", '2021-09-27', date.today,"current"))) %>%
  dplyr::select("task" = 1, "start" = 2, "end" = 3, "status"=4) %>%
  dplyr::mutate(task = forcats::fct_rev(factor(task, levels = (task))),
         start = as.Date(start), 
         end = as.Date(end)) %>%
  dplyr::mutate(lab_pos = start) %>%
  tidyr::pivot_longer(2:3, names_to = "variable", values_to = "value") # dim(df) [1] 6 4

# Change size of plotting window to 10:5 width to height
dev.new(width=10, height=5,noRStudioGD = TRUE)

# Plot employment as a gantt chart with Y axis showing every year
## [Changing line colors with ggplot()](https://stackoverflow.com/questions/5171263/changing-line-colors-with-ggplot)
gantt.chart <- ggplot2::ggplot(df, aes(x=value, y=task, color=status)) + 
  ggplot2::geom_line(linewidth = 20, show.legend = FALSE) +
  # To modify color, use ggplot2::scale_color_manual(values=). values matched to color variable values current, past
  ggplot2::scale_color_manual(values=c("red","grey50"))+
  ggplot2::scale_x_date( labels =  function(x) lubridate::year(x)
                         ,date_breaks = '1 year'
                         ,date_minor_breaks = "1 year"
                         ,position = "top") +
  ggplot2::labs(x = NULL,y = NULL) +
  ggplot2::theme( axis.text.x=element_text(size=10)
                  ,axis.text.y=element_text(size=12))

# Save employment gantt chart as a png file
## plot size needs to match those in dev.new()
ggplot2::ggsave( filename = file.path( task.1.output.folder.path,"employment-history.png")
                 ,width = 10
                 ,height = 5
                 ,plot= print(gantt.chart) # 
                 ,device = "png")

# Read logo image files
logo.bash <- magick::image_read(file.path(task.1.input.folder.path, "BASH_full_colored_dark.jpg")) # class(logo.bash) [1] "magick-image"
logo.R <- magick::image_read(file.path(task.1.input.folder.path,"R-logo-Cluster-Analysis-with-R-Edureka.png"))
logo.SAS <- magick::image_read(file.path(task.1.input.folder.path,"SAS-logo.png"))

# Set base x y positions of the plot where logo images are added to
base.position.x.group.1 <- 0.475
base.position.y.group.1 <- 0.975
base.position.x.group.2 <- base.position.x.group.1+0.325
base.position.y.group.2 <- base.position.y.group.1-0.325
base.position.x.group.3 <- base.position.x.group.2+0.15
base.position.y.group.3 <- base.position.y.group.2-0.25

# Introduction to cowplot
# https://cran.r-project.org/web/packages/cowplot/vignettes/introduction.html
gantt.chart.final <- cowplot::ggdraw(gantt.chart) +
  # Add logo images to employment 1 bar
  cowplot::draw_image(logo.bash
                      ,x = base.position.x.group.1
                      ,y = base.position.y.group.1
                      ,hjust = 1, vjust = 1, width = 0.1, height = 0.2) +
  cowplot::draw_image(logo.R
                      ,x = base.position.x.group.1 + 0.1
                      ,y = base.position.y.group.1 - 0.04
                      ,hjust = 1, vjust = 1, width = 0.1, height = 0.1) +
  cowplot::draw_image(logo.SAS
                      ,x = base.position.x.group.1+0.2
                      ,y = base.position.y.group.1
                      ,hjust = 1, vjust = 1, width = 0.1, height = 0.2)+
  # Add logo images to employment 2 bar
  cowplot::draw_image(logo.R
                      ,x = base.position.x.group.2
                      ,y = base.position.y.group.2
                      ,hjust = 1, vjust = 1, width = 0.1, height = 0.1)+
  cowplot::draw_image(logo.bash
                      ,x = base.position.x.group.2+ 0.1
                      ,y = base.position.y.group.2+0.05
                      ,hjust = 1, vjust = 1, width = 0.1, height = 0.2)+
  # Add logo images to employment 3 bar
  cowplot::draw_image(logo.SAS
                      ,x = base.position.x.group.3
                      ,y = base.position.y.group.3
                      ,hjust = 1, vjust = 1, width = 0.1, height = 0.2)

# Save the plot as a png file
# Sizes of exported image should be set to be similar to the plotting window, modified by dev.new(width=, height=)
ggplot2::ggsave( filename = file.path(task.1.output.folder.path,"employment-history_programming-software-added.png")
                 ,width = 10
                 ,height = 5
                 ,plot= print(gantt.chart.final) # 
                 ,device = "png")

Find and copy image file types

#--------------------------------------------------
# Find and copy image file types
#--------------------------------------------------
# Input, output directory
task.2.input.folder.path <- file.path(dir.GLC.presentation,"images-download")
task.2.output.folder.path <- file.path(task.2.input.folder.path,"images-only") # dir.exists(task.2.output.folder.path)
#dir.create(task.2.output.folder.path)

image.files.want.source <- list.files( path = task.2.input.folder.path
                                      ,pattern = "\\.png$|\\.jpg$"
                                      ,full.names = TRUE) # length(image.files.want.source) 10

image.files.want.destination <- file.path(task.2.output.folder.path
                                          ,basename(image.files.want.source))

# Copy files. Destination files will have the same date times as source files
file.copy(from = image.files.want.source
          ,to = image.files.want.destination
          ,copy.date = TRUE)

# Copy files. Destination files will have the date times when copy is performed
## Delete copied files before running this code
file.copy(from = image.files.want.source
          ,to = image.files.want.destination
          ,copy.date = FALSE)

List folders as a tree

#-----------------------
# List folders as a tree
#-----------------------

# Input directories
dir.tmf <- file.path(dir.GLC.P.drive,"BIOSTATISTICS","Gina","tmf","Numbered")
dir.tmf.short <- file.path(dir.tmf, "Short_Names") # dir.exists(dir.tmf.short)
dir.tmf.long.stat <- file.path(dir.tmf,"Long_Names","Statistics") # dir.exists(dir.tmf.long.stat)

# List directories as a tree
fs::dir_tree(path = dir.tmf.short, recurse = TRUE)
fs::dir_tree(path = dir.tmf.long.stat, recurse = TRUE)

Create folders using existing folders

#--------------------------------------
# Create folders using existing folders
#--------------------------------------

# Get folder directories within dir.tmf.short
## If we set, recursive = TRUE, the directories are returned in hierarchical order, beginning with the main directory path and then followed by the sub-directories. However, if we specify recursive = FALSE, only the child directories are returned

folder.paths.tmf.short.child <- list.dirs(path = dir.tmf.short
                                          ,full.names = TRUE
                                          ,recursive = FALSE) # length(folder.paths.tmf.short.child) 6

# Get child, grandchild directories
folder.paths.tmf.short.grandchild <- list.dirs(path=folder.paths.tmf.short.child
                                               ,full.names = TRUE) # length(folder.paths.tmf.short.grandchild) 31

# Copy source trial master file folders to destination
# Create a new parent directory
dir.tmf.new.short <- file.path(dir.GLC.presentation,"trial-master-files","numbered","short-named")
dir.create(path = dir.tmf.new.short, recursive = TRUE)

# Get child and grandchild folder names
folder.names.tmf.short.grandchild <- folder.paths.tmf.short.grandchild %>% 
  # Remove parent directory
  gsub(pattern="C:/Google-Drive/Greenlight-clinical/P-drive/BIOSTATISTICS/Gina/tmf/Numbered/Short_Names", replacement="")

# Create new destination folder paths
dest.folder.paths.tm.short <- paste0(dir.tmf.new.short, folder.names.tmf.short.grandchild) # length(dest.folder.paths.tm.short) 31

unlist(lapply(X=dest.folder.paths.tm.short
              ,FUN=function(x) dir.create(x, recursive = TRUE))
       )

Create folders using eTMF reference model file

#-----------------------------------------------
# Create folders using eTMF reference model file
#-----------------------------------------------
eTMF <- readxl::read_excel(path = file.path(dir.GLC.data,"tmf-reference-model-version-3-0_16-june-20153.xlsx")
                           ,sheet = "v3.0"
                           ,skip = 2
                           ,col_names = TRUE) %>%
  # Select columns wanted by position
  dplyr::select(c(1:6))  # dim(eTMF) 249 6

colnames(eTMF) <- colnames(eTMF) %>% 
  gsub(pattern=" #", replacement="_number") %>%
  gsub(pattern=" ", replacement="_")

# Select Zone_Name="Statistics"
eTMF.stat <- eTMF %>% 
  dplyr::filter(Zone_Name=="Statistics") %>%
  # Remove dots in *_number columns
  dplyr::mutate(Section_number=gsub(pattern="\\.", replacement="",x=Section_number)
                ,Artifact_number=gsub(pattern="\\.", replacement="",x=Artifact_number)
                ,Zone_number_name=paste0(Zone_number,"_",Zone_Name)
                ,Section_number_name=paste0(Section_number,"_",Section_Name)
                ,Artifact_number_name=paste0(Artifact_number,"_",Artifact_name)
                ,folder.path=file.path(dir.eTMF, Zone_number_name, Section_number_name,Artifact_number_name)) # dim(eTMF.stat) 25 8

# Create new folders using the folder.path column
unlist(lapply(X=eTMF.stat$folder.path
              ,FUN=function(x) dir.create(x, recursive = TRUE))
)

# List directories as a tree
fs::dir_tree(path = file.path(dir.GLC.presentation,"eTMF","11_Statistics")
             ,recurse = TRUE)

Reduce file size of a single image

#--------------------------------------------------
# Reduce file size of a single image
#--------------------------------------------------
task.3.input.folder.path <- file.path(dir.main, "Receipts-meals-not-claimed-yet") # dir.exists(task.3.input.folder.path)
task.3.output.folder.path <- file.path(dir.GLC.presentation,"reduced_images") # dir.exists(task.3.output.folder.path)

file.paths.jpg <- list.files(path=task.3.input.folder.path
                            ,pattern = "*.jpg"
                            ,full.names = TRUE) # length(file.paths.jpg) 98
# Reduce 1 jpg file
img <- jpeg::readJPEG(source=file.paths.jpg[2]
                      , native = TRUE) # class(img) [1] "nativeRaster"

# Export reduced object as a jpg file
output.file.path <-paste0(task.3.output.folder.path,"/small_",basename(file.paths.jpg[2]))

jpeg::writeJPEG(image=img
               ,target = output.file.path
               , quality = 0.5)

Reduce file size of multiple image image files

#--------------------------------------------------
# Reduce file size of multiple image image files
#--------------------------------------------------

# cl <- parallel::makeCluster(parallel::detectCores()-1
#                             ,outfile=file.path(task.3.output.folder.path
#                                                ,paste0("image-reduction_error-log.txt")
#                                                )
# ) # Use all detected cores minus 1 core (CPU) as a cluster, leaving 1 core for software or app running in the background in case desktop crashes (monitor turns black and nothing working)
# doParallel::registerDoParallel(cl, cores = parallel::detectCores()-1)

# Loop thru all jpg images to reduce file sizes
# Check existence of input folder
if(dir.exists(task.3.input.folder.path)!=TRUE){
  cat("Input image folder could not be found")
} else{
  # Read input image file paths as a vector
  image.file.paths <- list.files(path = task.3.input.folder.path
                                ,full.names = TRUE
                                ,pattern = "\\.jpg$") # length(image.file.paths) 99
  cat("There are",length(image.file.paths),"images in the folder")
  # Use image file name to look up data for the image in the cell segmentation data
  foreach::foreach(i=1:length(image.file.paths)
                   ,.combine = 'c'
                   ,.packages=c("foreach","dplyr","jpeg")) %dopar% {
                     print(paste0("=========== Processing image ",i,"============"))
                     
      # If data can be found, then subset. If data cannot be found, then copy the input image to the output
      image.file.path <-image.file.paths[i]
      image.file.name <-basename(image.file.path)
      
      # Name output file
      output.file.path <-paste0(task.3.output.folder.path,"/small_", image.file.name)
      
      # Read jpg file
      img <- jpeg::readJPEG(source=image.file.path, native = TRUE) # class(img) [1] "nativeRaster"
      
      # Reduce jpg file size and output jpg file
      jpeg::writeJPEG(image=img
                      ,target = output.file.path
                      , quality = 0.5) 
       }
  }  

# Calculate % reduction
total.file.size.original <- 339
total.file.size.reduced <- 59.2
(total.file.size.original-total.file.size.reduced)/total.file.size.original*100 #[1] 82.53687 
#---------------------------------------------
# Google map link to a single QR code
#---------------------------------------------
url <- "http://www.google.com/maps/place/49.46800006494457,17.11514008755796"

# A function to plot QR code in changeable color
ggQRCode <- function(text, color="black", alpha=1) {
  x <- qr_code(text)
  x <- as.data.frame(x)
  
  y <- x
  y$id <- rownames(y)
  y <- gather(y, "key", "value", colnames(y)[-ncol(y)])
  y$key = factor(y$key, levels=rev(colnames(x)))
  y$id = factor(y$id, levels=rev(rownames(x)))
  
  ggplot(y, aes_(x=~id, y=~key)) + geom_tile(aes_(fill=~value), alpha=alpha) +
    scale_fill_manual(values=c("white", color)) + 
    # scale_fill_gradient(low="white", high=color) +
    theme_void() + theme(legend.position='none')
}

# QR code in black
plot.new()
QR.black <- ggQRCode(url,color="black") 
plot(QR.black) 

# QR code in a different color
QR.blue <- ggQRCode(url,color="#56B4E9")
plot(QR.blue) 

Brisbane bike path latitude and longitude to QR codes

#-------------------------------------------
# Brisbane bike path latitude and longitude to QR codes 
#-------------------------------------------
task.32.input.folder.path <- file.path(dir.GLC.presentation, "data-download")
task.32.output.folder.path <- file.path(dir.GLC.presentation, "QR-codes")

# Read csv files
bikeway <- read.delim(file=file.path(task.32.input.folder.path,"open-data-bikeway-count-sites-2004-2021.csv")
                      ,header = TRUE
                      ,sep = ",")[,c(1:3,5,6)] # dim(bikeway) 183 5
# Rename columns
colnames(bikeway) <- c("Site.ID","Parent.Site.ID","Site.Name","Latitude","Longitude")

# Create a new column with Google maps URLs
bikeway.2 <- bikeway %>% 
  dplyr::mutate(Google.map.link=paste0("http://www.google.com/maps/place/",Latitude,",",Longitude))

# Create output file paths
task.32.output.file.paths <- file.path(task.32.output.folder.path, bikeway.2$Site.ID,".svg") # length(task.32.output.file.paths) 183

# Loop thru all jpg images to reduce file sizes
# Check existence of input folder
if(dir.exists(task.32.output.folder.path)!=TRUE){
  cat("Output folder could not be found")
} else{
  # Use image file name to look up data for the image in the cell segmentation data
  foreach::foreach(i=1:length(bikeway.2$Site.ID)
                   ,.combine = 'c'
                   ,.packages=c("foreach","dplyr","ggplot2","qrcode","svglite","grid")) %dopar% {
                     print(paste0("=========== Creating QR code ",i,"============"))
  # Get Google map URL
  url <-bikeway.2$Google.map.link[i]
  # Name output file
  output.file.path <- paste0(task.32.output.folder.path,"/", bikeway.2$Site.ID[i],".svg")
  # Create QR code
  QR.code <- ggQRCode(url, color="#56B4E9")
  
  # Add label to plot
  QR.code.2 <- QR.code+ggplot2::annotation_custom(grid::textGrob(bikeway.2$Site.Name[i])
                                                    ,xmin = -Inf
                                                    ,xmax = Inf
                                                    ,ymin = -Inf
                                                    ,ymax = Inf) 
  # Save QR code as .svg file
  ggplot2::ggsave( filename = output.file.path
                   ,plot= print(QR.code.2)  
                   ,device = "svg")

     }
}