tags: `R`

R programming for image processing, file management, and data visualisation

R code used to generate outcome demonstrated during knowledge hour presentation at GLC 21-04-2023

Set up directory

# Input and output directories
dir.C.drive <-"C:"
dir.R.packages <- file.path(dir.C.drive,"R","R-4.1.3")
dir.main <- file.path(dir.C.drive,"Google-Drive")
dir.GLC <- file.path(dir.main,"Greenlight-clinical")
dir.GLC.presentation <- file.path(dir.GLC,"Knowledge-hours-presentation")
dir.GLC.data <- file.path(dir.GLC.presentation, "data-download")
dir.GLC.P.drive <- file.path(dir.GLC,"P-drive")
dir.eTMF <- file.path(dir.GLC.presentation,"eTMF")

Bring installed packages to memory

library(jpeg, lib.loc = dir.R.packages)
library(withr,lib.loc = dir.R.packages)
library(dplyr, lib.loc = dir.R.packages)
library(tidyr,lib.loc = dir.R.packages)
library(purrr, lib.loc = dir.R.packages)
library(foreach, lib.loc = dir.R.packages)
library(iterators, lib.loc = dir.R.packages)
library(doParallel, lib.loc = dir.R.packages)
library(qrcode, lib.loc = dir.R.packages)
library(backports, lib.loc = dir.R.packages)
library(tzdb, lib.loc = dir.R.packages)
library(ggplot2,lib.loc = dir.R.packages)
library(crayon,lib.loc = dir.R.packages)
library(tidyverse, lib.loc = dir.R.packages)
library(farver, lib.loc = dir.R.packages)
library(svglite,lib.loc = dir.R.packages)
library(grid,lib.loc = dir.R.packages)
library(labeling,lib.loc = dir.R.packages)
library(magrittr,lib.loc = dir.R.packages)
library(imager,lib.loc = dir.R.packages)
library(patchwork,lib.loc = dir.R.packages)
library(magick,lib.loc = dir.R.packages)
library(forcats,lib.loc = dir.R.packages)
library(cowplot,lib.loc = dir.R.packages)
library(fs,lib.loc = dir.R.packages)
library(data.tree,lib.loc = dir.R.packages)
library(sketcher,lib.loc = dir.R.packages)
library(rJava,lib.loc = dir.R.packages)
library(xlsx,lib.loc = dir.R.packages)
library(readxl,lib.loc = dir.R.packages)

List Excel sheets

#-----------------------
# List Excel sheets
#-----------------------
list.files(path = dir.GLC.data, pattern="*.xlsx", full.names = TRUE)

# There are 115 sheets. All sheets matched source data file
sheets.C200 <- readxl::excel_sheets(path=file.path(dir.GLC.data,"C-200-001_sdtm - Copy.xlsx"))
#length(sheets.C200) #115

# Some sheet names not seen in the source data file
sheets.invetus <- readxl::excel_sheets(path=file.path(dir.GLC.data,"AET P 20286 Collated Data - Copy.xlsx"))
#length(sheets.invetus) 101

Write sheets into Excel

#------------------------
# Write sheets into Excel
#------------------------
# Create a new Excel file with same sheets from "C-200-001_sdtm - Copy.xlsx"
note <- data.frame(note="Client please use existing sheet names")
new.excel.file.path <- file.path(dir.GLC.data,"C-300-001_sdtm.xlsx")

# Write data into first sheet
xlsx::write.xlsx(x=note
                ,file=new.excel.file.path
                ,sheetName=sheets.C200[1]
                ,append=FALSE)

# Add second to last sheets to the same Excel
lapply(X=2:length(sheets.C200)
       ,FUN=function(i) xlsx::write.xlsx(note
                                         ,file=new.excel.file.path
                                         ,sheetName=sheets.C200[[i]]
                                         ,append=TRUE))

Create Gantt chart for employment history

#------------------------------------------
# Create Gantt chart for employment history 
#------------------------------------------

# Input, output directory
task.1.input.folder.path <- file.path(dir.GLC.presentation,"images-download") # dir.exists(task.1.input.folder.path) [1] TRUE
task.1.output.folder.path <- file.path(dir.GLC.presentation,"gantt-charts") # dir.exists(task.1.output.folder.path) [1] TRUE

# Set end date of current employment to today
date.today <- as.character(Sys.Date()) 

# Create employment data
# Reshape it to long format which start dates and end dates are replaced by two new variables- variable, value
df <- as.data.frame(rbind(
  c("University of Queensland\n PhD student", '2016-04-05', '2020-05-11',"past")
  ,c("QIMR Berghofer Medical Research Institute\n R programmer", '2020-04-27', '2021-11-09',"past")
  ,c("Greenlight Clinical\n Statistical programmer", '2021-09-27', date.today,"current"))) %>%
  dplyr::select("task" = 1, "start" = 2, "end" = 3, "status"=4) %>%
  dplyr::mutate(task = forcats::fct_rev(factor(task, levels = (task))),
         start = as.Date(start), 
         end = as.Date(end)) %>%
  dplyr::mutate(lab_pos = start) %>%
  tidyr::pivot_longer(2:3, names_to = "variable", values_to = "value") # dim(df) [1] 6 4

# Change size of plotting window to 10:5 width to height
dev.new(width=10, height=5,noRStudioGD = TRUE)

# Plot employment as a gantt chart with Y axis showing every year
## [Changing line colors with ggplot()](https://stackoverflow.com/questions/5171263/changing-line-colors-with-ggplot)
gantt.chart <- ggplot2::ggplot(df, aes(x=value, y=task, color=status)) + 
  ggplot2::geom_line(linewidth = 20, show.legend = FALSE) +
  # To modify color, use ggplot2::scale_color_manual(values=). values matched to color variable values current, past
  ggplot2::scale_color_manual(values=c("red","grey50"))+
  ggplot2::scale_x_date( labels =  function(x) lubridate::year(x)
                         ,date_breaks = '1 year'
                         ,date_minor_breaks = "1 year"
                         ,position = "top") +
  ggplot2::labs(x = NULL,y = NULL) +
  ggplot2::theme( axis.text.x=element_text(size=10)
                  ,axis.text.y=element_text(size=12))

# Save employment gantt chart as a png file
## plot size needs to match those in dev.new()
ggplot2::ggsave( filename = file.path( task.1.output.folder.path,"employment-history.png")
                 ,width = 10
                 ,height = 5
                 ,plot= print(gantt.chart) # 
                 ,device = "png")

# Read logo image files
logo.bash <- magick::image_read(file.path(task.1.input.folder.path, "BASH_full_colored_dark.jpg")) # class(logo.bash) [1] "magick-image"
logo.R <- magick::image_read(file.path(task.1.input.folder.path,"R-logo-Cluster-Analysis-with-R-Edureka.png"))
logo.SAS <- magick::image_read(file.path(task.1.input.folder.path,"SAS-logo.png"))

# Set base x y positions of the plot where logo images are added to
base.position.x.group.1 <- 0.475
base.position.y.group.1 <- 0.975
base.position.x.group.2 <- base.position.x.group.1+0.325
base.position.y.group.2 <- base.position.y.group.1-0.325
base.position.x.group.3 <- base.position.x.group.2+0.15
base.position.y.group.3 <- base.position.y.group.2-0.25

# Introduction to cowplot
# https://cran.r-project.org/web/packages/cowplot/vignettes/introduction.html
gantt.chart.final <- cowplot::ggdraw(gantt.chart) +
  # Add logo images to employment 1 bar
  cowplot::draw_image(logo.bash
                      ,x = base.position.x.group.1
                      ,y = base.position.y.group.1
                      ,hjust = 1, vjust = 1, width = 0.1, height = 0.2) +
  cowplot::draw_image(logo.R
                      ,x = base.position.x.group.1 + 0.1
                      ,y = base.position.y.group.1 - 0.04
                      ,hjust = 1, vjust = 1, width = 0.1, height = 0.1) +
  cowplot::draw_image(logo.SAS
                      ,x = base.position.x.group.1+0.2
                      ,y = base.position.y.group.1
                      ,hjust = 1, vjust = 1, width = 0.1, height = 0.2)+
  # Add logo images to employment 2 bar
  cowplot::draw_image(logo.R
                      ,x = base.position.x.group.2
                      ,y = base.position.y.group.2
                      ,hjust = 1, vjust = 1, width = 0.1, height = 0.1)+
  cowplot::draw_image(logo.bash
                      ,x = base.position.x.group.2+ 0.1
                      ,y = base.position.y.group.2+0.05
                      ,hjust = 1, vjust = 1, width = 0.1, height = 0.2)+
  # Add logo images to employment 3 bar
  cowplot::draw_image(logo.SAS
                      ,x = base.position.x.group.3
                      ,y = base.position.y.group.3
                      ,hjust = 1, vjust = 1, width = 0.1, height = 0.2)

# Save the plot as a png file
# Sizes of exported image should be set to be similar to the plotting window, modified by dev.new(width=, height=)
ggplot2::ggsave( filename = file.path(task.1.output.folder.path,"employment-history_programming-software-added.png")
                 ,width = 10
                 ,height = 5
                 ,plot= print(gantt.chart.final) # 
                 ,device = "png")

Find and copy image file types

#--------------------------------------------------
# Find and copy image file types
#--------------------------------------------------
# Input, output directory
task.2.input.folder.path <- file.path(dir.GLC.presentation,"images-download")
task.2.output.folder.path <- file.path(task.2.input.folder.path,"images-only") # dir.exists(task.2.output.folder.path)
#dir.create(task.2.output.folder.path)

image.files.want.source <- list.files( path = task.2.input.folder.path
                                      ,pattern = "\\.png$|\\.jpg$"
                                      ,full.names = TRUE) # length(image.files.want.source) 10

image.files.want.destination <- file.path(task.2.output.folder.path
                                          ,basename(image.files.want.source))

# Copy files. Destination files will have the same date times as source files
file.copy(from = image.files.want.source
          ,to = image.files.want.destination
          ,copy.date = TRUE)

# Copy files. Destination files will have the date times when copy is performed
## Delete copied files before running this code
file.copy(from = image.files.want.source
          ,to = image.files.want.destination
          ,copy.date = FALSE)

List folders as a tree

#-----------------------
# List folders as a tree
#-----------------------

# Input directories
dir.tmf <- file.path(dir.GLC.P.drive,"BIOSTATISTICS","Gina","tmf","Numbered")
dir.tmf.short <- file.path(dir.tmf, "Short_Names") # dir.exists(dir.tmf.short)
dir.tmf.long.stat <- file.path(dir.tmf,"Long_Names","Statistics") # dir.exists(dir.tmf.long.stat)

# List directories as a tree
fs::dir_tree(path = dir.tmf.short, recurse = TRUE)
fs::dir_tree(path = dir.tmf.long.stat, recurse = TRUE)

Create folders using existing folders

#--------------------------------------
# Create folders using existing folders
#--------------------------------------

# Get folder directories within dir.tmf.short
## If we set, recursive = TRUE, the directories are returned in hierarchical order, beginning with the main directory path and then followed by the sub-directories. However, if we specify recursive = FALSE, only the child directories are returned

folder.paths.tmf.short.child <- list.dirs(path = dir.tmf.short
                                          ,full.names = TRUE
                                          ,recursive = FALSE) # length(folder.paths.tmf.short.child) 6

# Get child, grandchild directories
folder.paths.tmf.short.grandchild <- list.dirs(path=folder.paths.tmf.short.child
                                               ,full.names = TRUE) # length(folder.paths.tmf.short.grandchild) 31

# Copy source trial master file folders to destination
# Create a new parent directory
dir.tmf.new.short <- file.path(dir.GLC.presentation,"trial-master-files","numbered","short-named")
dir.create(path = dir.tmf.new.short, recursive = TRUE)

# Get child and grandchild folder names
folder.names.tmf.short.grandchild <- folder.paths.tmf.short.grandchild %>% 
  # Remove parent directory
  gsub(pattern="C:/Google-Drive/Greenlight-clinical/P-drive/BIOSTATISTICS/Gina/tmf/Numbered/Short_Names", replacement="")

# Create new destination folder paths
dest.folder.paths.tm.short <- paste0(dir.tmf.new.short, folder.names.tmf.short.grandchild) # length(dest.folder.paths.tm.short) 31

unlist(lapply(X=dest.folder.paths.tm.short
              ,FUN=function(x) dir.create(x, recursive = TRUE))
       )

Create folders using eTMF reference model file

#-----------------------------------------------
# Create folders using eTMF reference model file
#-----------------------------------------------
eTMF <- readxl::read_excel(path = file.path(dir.GLC.data,"tmf-reference-model-version-3-0_16-june-20153.xlsx")
                           ,sheet = "v3.0"
                           ,skip = 2
                           ,col_names = TRUE) %>%
  # Select columns wanted by position
  dplyr::select(c(1:6))  # dim(eTMF) 249 6

colnames(eTMF) <- colnames(eTMF) %>% 
  gsub(pattern=" #", replacement="_number") %>%
  gsub(pattern=" ", replacement="_")

# Select Zone_Name="Statistics"
eTMF.stat <- eTMF %>% 
  dplyr::filter(Zone_Name=="Statistics") %>%
  # Remove dots in *_number columns
  dplyr::mutate(Section_number=gsub(pattern="\\.", replacement="",x=Section_number)
                ,Artifact_number=gsub(pattern="\\.", replacement="",x=Artifact_number)
                ,Zone_number_name=paste0(Zone_number,"_",Zone_Name)
                ,Section_number_name=paste0(Section_number,"_",Section_Name)
                ,Artifact_number_name=paste0(Artifact_number,"_",Artifact_name)
                ,folder.path=file.path(dir.eTMF, Zone_number_name, Section_number_name,Artifact_number_name)) # dim(eTMF.stat) 25 8

# Create new folders using the folder.path column
unlist(lapply(X=eTMF.stat$folder.path
              ,FUN=function(x) dir.create(x, recursive = TRUE))
)

# List directories as a tree
fs::dir_tree(path = file.path(dir.GLC.presentation,"eTMF","11_Statistics")
             ,recurse = TRUE)

Reduce file size of a single image

#--------------------------------------------------
# Reduce file size of a single image
#--------------------------------------------------
task.3.input.folder.path <- file.path(dir.main, "Receipts-meals-not-claimed-yet") # dir.exists(task.3.input.folder.path)
task.3.output.folder.path <- file.path(dir.GLC.presentation,"reduced_images") # dir.exists(task.3.output.folder.path)

file.paths.jpg <- list.files(path=task.3.input.folder.path
                            ,pattern = "*.jpg"
                            ,full.names = TRUE) # length(file.paths.jpg) 98
# Reduce 1 jpg file
img <- jpeg::readJPEG(source=file.paths.jpg[2]
                      , native = TRUE) # class(img) [1] "nativeRaster"

# Export reduced object as a jpg file
output.file.path <-paste0(task.3.output.folder.path,"/small_",basename(file.paths.jpg[2]))

jpeg::writeJPEG(image=img
               ,target = output.file.path
               , quality = 0.5)

Reduce file size of multiple image image files

#--------------------------------------------------
# Reduce file size of multiple image image files
#--------------------------------------------------

# cl <- parallel::makeCluster(parallel::detectCores()-1
#                             ,outfile=file.path(task.3.output.folder.path
#                                                ,paste0("image-reduction_error-log.txt")
#                                                )
# ) # Use all detected cores minus 1 core (CPU) as a cluster, leaving 1 core for software or app running in the background in case desktop crashes (monitor turns black and nothing working)
# doParallel::registerDoParallel(cl, cores = parallel::detectCores()-1)

# Loop thru all jpg images to reduce file sizes
# Check existence of input folder
if(dir.exists(task.3.input.folder.path)!=TRUE){
  cat("Input image folder could not be found")
} else{
  # Read input image file paths as a vector
  image.file.paths <- list.files(path = task.3.input.folder.path
                                ,full.names = TRUE
                                ,pattern = "\\.jpg$") # length(image.file.paths) 99
  cat("There are",length(image.file.paths),"images in the folder")
  # Use image file name to look up data for the image in the cell segmentation data
  foreach::foreach(i=1:length(image.file.paths)
                   ,.combine = 'c'
                   ,.packages=c("foreach","dplyr","jpeg")) %dopar% {
                     print(paste0("=========== Processing image ",i,"============"))
                     
      # If data can be found, then subset. If data cannot be found, then copy the input image to the output
      image.file.path <-image.file.paths[i]
      image.file.name <-basename(image.file.path)
      
      # Name output file
      output.file.path <-paste0(task.3.output.folder.path,"/small_", image.file.name)
      
      # Read jpg file
      img <- jpeg::readJPEG(source=image.file.path, native = TRUE) # class(img) [1] "nativeRaster"
      
      # Reduce jpg file size and output jpg file
      jpeg::writeJPEG(image=img
                      ,target = output.file.path
                      , quality = 0.5) 
       }
  }  

# Calculate % reduction
total.file.size.original <- 339
total.file.size.reduced <- 59.2
(total.file.size.original-total.file.size.reduced)/total.file.size.original*100 #[1] 82.53687

Google map link to a single QR code

#---------------------------------------------
# Google map link to a single QR code
#---------------------------------------------
url <- "http://www.google.com/maps/place/49.46800006494457,17.11514008755796"

# A function to plot QR code in changeable color
ggQRCode <- function(text, color="black", alpha=1) {
  x <- qr_code(text)
  x <- as.data.frame(x)
  
  y <- x
  y$id <- rownames(y)
  y <- gather(y, "key", "value", colnames(y)[-ncol(y)])
  y$key = factor(y$key, levels=rev(colnames(x)))
  y$id = factor(y$id, levels=rev(rownames(x)))
  
  ggplot(y, aes_(x=~id, y=~key)) + geom_tile(aes_(fill=~value), alpha=alpha) +
    scale_fill_manual(values=c("white", color)) + 
    # scale_fill_gradient(low="white", high=color) +
    theme_void() + theme(legend.position='none')
}

# QR code in black
plot.new()
QR.black <- ggQRCode(url,color="black") 
plot(QR.black) 

# QR code in a different color
QR.blue <- ggQRCode(url,color="#56B4E9")
plot(QR.blue)

Brisbane bike path latitude and longitude to QR codes

#-------------------------------------------
# Brisbane bike path latitude and longitude to QR codes 
#-------------------------------------------
task.32.input.folder.path <- file.path(dir.GLC.presentation, "data-download")
task.32.output.folder.path <- file.path(dir.GLC.presentation, "QR-codes")

# Read csv files
bikeway <- read.delim(file=file.path(task.32.input.folder.path,"open-data-bikeway-count-sites-2004-2021.csv")
                      ,header = TRUE
                      ,sep = ",")[,c(1:3,5,6)] # dim(bikeway) 183 5
# Rename columns
colnames(bikeway) <- c("Site.ID","Parent.Site.ID","Site.Name","Latitude","Longitude")

# Create a new column with Google maps URLs
bikeway.2 <- bikeway %>% 
  dplyr::mutate(Google.map.link=paste0("http://www.google.com/maps/place/",Latitude,",",Longitude))

# Create output file paths
task.32.output.file.paths <- file.path(task.32.output.folder.path, bikeway.2$Site.ID,".svg") # length(task.32.output.file.paths) 183

# Loop thru all jpg images to reduce file sizes
# Check existence of input folder
if(dir.exists(task.32.output.folder.path)!=TRUE){
  cat("Output folder could not be found")
} else{
  # Use image file name to look up data for the image in the cell segmentation data
  foreach::foreach(i=1:length(bikeway.2$Site.ID)
                   ,.combine = 'c'
                   ,.packages=c("foreach","dplyr","ggplot2","qrcode","svglite","grid")) %dopar% {
                     print(paste0("=========== Creating QR code ",i,"============"))
  # Get Google map URL
  url <-bikeway.2$Google.map.link[i]
  # Name output file
  output.file.path <- paste0(task.32.output.folder.path,"/", bikeway.2$Site.ID[i],".svg")
  # Create QR code
  QR.code <- ggQRCode(url, color="#56B4E9")
  
  # Add label to plot
  QR.code.2 <- QR.code+ggplot2::annotation_custom(grid::textGrob(bikeway.2$Site.Name[i])
                                                    ,xmin = -Inf
                                                    ,xmax = Inf
                                                    ,ymin = -Inf
                                                    ,ymax = Inf) 
  # Save QR code as .svg file
  ggplot2::ggsave( filename = output.file.path
                   ,plot= print(QR.code.2)  
                   ,device = "svg")

     }
}

tags: R

R programming for image processing, file management, and data visualisation

Set up directory

Bring installed packages to memory

List Excel sheets

Write sheets into Excel

Create Gantt chart for employment history

Find and copy image file types

List folders as a tree

Create folders using existing folders

Create folders using eTMF reference model file

Reduce file size of a single image

Reduce file size of multiple image image files

Google map link to a single QR code

Brisbane bike path latitude and longitude to QR codes

Read more

Plot start dates and end dates of events

Errors, warnings, issues in R programming

Debugging SAS programs

Data visualisation in R

tags: `R`