###### tags: `R`
# R programming for image processing, file management, and data visualisation
R code used to generate outcome demonstrated during knowledge hour presentation at GLC 21-04-2023
## Set up directory
```r!
# Input and output directories
dir.C.drive <-"C:"
dir.R.packages <- file.path(dir.C.drive,"R","R-4.1.3")
dir.main <- file.path(dir.C.drive,"Google-Drive")
dir.GLC <- file.path(dir.main,"Greenlight-clinical")
dir.GLC.presentation <- file.path(dir.GLC,"Knowledge-hours-presentation")
dir.GLC.data <- file.path(dir.GLC.presentation, "data-download")
dir.GLC.P.drive <- file.path(dir.GLC,"P-drive")
dir.eTMF <- file.path(dir.GLC.presentation,"eTMF")
```
## Bring installed packages to memory
```r!
library(jpeg, lib.loc = dir.R.packages)
library(withr,lib.loc = dir.R.packages)
library(dplyr, lib.loc = dir.R.packages)
library(tidyr,lib.loc = dir.R.packages)
library(purrr, lib.loc = dir.R.packages)
library(foreach, lib.loc = dir.R.packages)
library(iterators, lib.loc = dir.R.packages)
library(doParallel, lib.loc = dir.R.packages)
library(qrcode, lib.loc = dir.R.packages)
library(backports, lib.loc = dir.R.packages)
library(tzdb, lib.loc = dir.R.packages)
library(ggplot2,lib.loc = dir.R.packages)
library(crayon,lib.loc = dir.R.packages)
library(tidyverse, lib.loc = dir.R.packages)
library(farver, lib.loc = dir.R.packages)
library(svglite,lib.loc = dir.R.packages)
library(grid,lib.loc = dir.R.packages)
library(labeling,lib.loc = dir.R.packages)
library(magrittr,lib.loc = dir.R.packages)
library(imager,lib.loc = dir.R.packages)
library(patchwork,lib.loc = dir.R.packages)
library(magick,lib.loc = dir.R.packages)
library(forcats,lib.loc = dir.R.packages)
library(cowplot,lib.loc = dir.R.packages)
library(fs,lib.loc = dir.R.packages)
library(data.tree,lib.loc = dir.R.packages)
library(sketcher,lib.loc = dir.R.packages)
library(rJava,lib.loc = dir.R.packages)
library(xlsx,lib.loc = dir.R.packages)
library(readxl,lib.loc = dir.R.packages)
```
## List Excel sheets
```r!
#-----------------------
# List Excel sheets
#-----------------------
list.files(path = dir.GLC.data, pattern="*.xlsx", full.names = TRUE)
# There are 115 sheets. All sheets matched source data file
sheets.C200 <- readxl::excel_sheets(path=file.path(dir.GLC.data,"C-200-001_sdtm - Copy.xlsx"))
#length(sheets.C200) #115
# Some sheet names not seen in the source data file
sheets.invetus <- readxl::excel_sheets(path=file.path(dir.GLC.data,"AET P 20286 Collated Data - Copy.xlsx"))
#length(sheets.invetus) 101
```
## Write sheets into Excel
```r!
#------------------------
# Write sheets into Excel
#------------------------
# Create a new Excel file with same sheets from "C-200-001_sdtm - Copy.xlsx"
note <- data.frame(note="Client please use existing sheet names")
new.excel.file.path <- file.path(dir.GLC.data,"C-300-001_sdtm.xlsx")
# Write data into first sheet
xlsx::write.xlsx(x=note
,file=new.excel.file.path
,sheetName=sheets.C200[1]
,append=FALSE)
# Add second to last sheets to the same Excel
lapply(X=2:length(sheets.C200)
,FUN=function(i) xlsx::write.xlsx(note
,file=new.excel.file.path
,sheetName=sheets.C200[[i]]
,append=TRUE))
```
## Create Gantt chart for employment history
```r!
#------------------------------------------
# Create Gantt chart for employment history
#------------------------------------------
# Input, output directory
task.1.input.folder.path <- file.path(dir.GLC.presentation,"images-download") # dir.exists(task.1.input.folder.path) [1] TRUE
task.1.output.folder.path <- file.path(dir.GLC.presentation,"gantt-charts") # dir.exists(task.1.output.folder.path) [1] TRUE
# Set end date of current employment to today
date.today <- as.character(Sys.Date())
# Create employment data
# Reshape it to long format which start dates and end dates are replaced by two new variables- variable, value
df <- as.data.frame(rbind(
c("University of Queensland\n PhD student", '2016-04-05', '2020-05-11',"past")
,c("QIMR Berghofer Medical Research Institute\n R programmer", '2020-04-27', '2021-11-09',"past")
,c("Greenlight Clinical\n Statistical programmer", '2021-09-27', date.today,"current"))) %>%
dplyr::select("task" = 1, "start" = 2, "end" = 3, "status"=4) %>%
dplyr::mutate(task = forcats::fct_rev(factor(task, levels = (task))),
start = as.Date(start),
end = as.Date(end)) %>%
dplyr::mutate(lab_pos = start) %>%
tidyr::pivot_longer(2:3, names_to = "variable", values_to = "value") # dim(df) [1] 6 4
# Change size of plotting window to 10:5 width to height
dev.new(width=10, height=5,noRStudioGD = TRUE)
# Plot employment as a gantt chart with Y axis showing every year
## [Changing line colors with ggplot()](https://stackoverflow.com/questions/5171263/changing-line-colors-with-ggplot)
gantt.chart <- ggplot2::ggplot(df, aes(x=value, y=task, color=status)) +
ggplot2::geom_line(linewidth = 20, show.legend = FALSE) +
# To modify color, use ggplot2::scale_color_manual(values=). values matched to color variable values current, past
ggplot2::scale_color_manual(values=c("red","grey50"))+
ggplot2::scale_x_date( labels = function(x) lubridate::year(x)
,date_breaks = '1 year'
,date_minor_breaks = "1 year"
,position = "top") +
ggplot2::labs(x = NULL,y = NULL) +
ggplot2::theme( axis.text.x=element_text(size=10)
,axis.text.y=element_text(size=12))
# Save employment gantt chart as a png file
## plot size needs to match those in dev.new()
ggplot2::ggsave( filename = file.path( task.1.output.folder.path,"employment-history.png")
,width = 10
,height = 5
,plot= print(gantt.chart) #
,device = "png")
# Read logo image files
logo.bash <- magick::image_read(file.path(task.1.input.folder.path, "BASH_full_colored_dark.jpg")) # class(logo.bash) [1] "magick-image"
logo.R <- magick::image_read(file.path(task.1.input.folder.path,"R-logo-Cluster-Analysis-with-R-Edureka.png"))
logo.SAS <- magick::image_read(file.path(task.1.input.folder.path,"SAS-logo.png"))
# Set base x y positions of the plot where logo images are added to
base.position.x.group.1 <- 0.475
base.position.y.group.1 <- 0.975
base.position.x.group.2 <- base.position.x.group.1+0.325
base.position.y.group.2 <- base.position.y.group.1-0.325
base.position.x.group.3 <- base.position.x.group.2+0.15
base.position.y.group.3 <- base.position.y.group.2-0.25
# Introduction to cowplot
# https://cran.r-project.org/web/packages/cowplot/vignettes/introduction.html
gantt.chart.final <- cowplot::ggdraw(gantt.chart) +
# Add logo images to employment 1 bar
cowplot::draw_image(logo.bash
,x = base.position.x.group.1
,y = base.position.y.group.1
,hjust = 1, vjust = 1, width = 0.1, height = 0.2) +
cowplot::draw_image(logo.R
,x = base.position.x.group.1 + 0.1
,y = base.position.y.group.1 - 0.04
,hjust = 1, vjust = 1, width = 0.1, height = 0.1) +
cowplot::draw_image(logo.SAS
,x = base.position.x.group.1+0.2
,y = base.position.y.group.1
,hjust = 1, vjust = 1, width = 0.1, height = 0.2)+
# Add logo images to employment 2 bar
cowplot::draw_image(logo.R
,x = base.position.x.group.2
,y = base.position.y.group.2
,hjust = 1, vjust = 1, width = 0.1, height = 0.1)+
cowplot::draw_image(logo.bash
,x = base.position.x.group.2+ 0.1
,y = base.position.y.group.2+0.05
,hjust = 1, vjust = 1, width = 0.1, height = 0.2)+
# Add logo images to employment 3 bar
cowplot::draw_image(logo.SAS
,x = base.position.x.group.3
,y = base.position.y.group.3
,hjust = 1, vjust = 1, width = 0.1, height = 0.2)
# Save the plot as a png file
# Sizes of exported image should be set to be similar to the plotting window, modified by dev.new(width=, height=)
ggplot2::ggsave( filename = file.path(task.1.output.folder.path,"employment-history_programming-software-added.png")
,width = 10
,height = 5
,plot= print(gantt.chart.final) #
,device = "png")
```
## Find and copy image file types
```r!
#--------------------------------------------------
# Find and copy image file types
#--------------------------------------------------
# Input, output directory
task.2.input.folder.path <- file.path(dir.GLC.presentation,"images-download")
task.2.output.folder.path <- file.path(task.2.input.folder.path,"images-only") # dir.exists(task.2.output.folder.path)
#dir.create(task.2.output.folder.path)
image.files.want.source <- list.files( path = task.2.input.folder.path
,pattern = "\\.png$|\\.jpg$"
,full.names = TRUE) # length(image.files.want.source) 10
image.files.want.destination <- file.path(task.2.output.folder.path
,basename(image.files.want.source))
# Copy files. Destination files will have the same date times as source files
file.copy(from = image.files.want.source
,to = image.files.want.destination
,copy.date = TRUE)
# Copy files. Destination files will have the date times when copy is performed
## Delete copied files before running this code
file.copy(from = image.files.want.source
,to = image.files.want.destination
,copy.date = FALSE)
```
## List folders as a tree
```r!
#-----------------------
# List folders as a tree
#-----------------------
# Input directories
dir.tmf <- file.path(dir.GLC.P.drive,"BIOSTATISTICS","Gina","tmf","Numbered")
dir.tmf.short <- file.path(dir.tmf, "Short_Names") # dir.exists(dir.tmf.short)
dir.tmf.long.stat <- file.path(dir.tmf,"Long_Names","Statistics") # dir.exists(dir.tmf.long.stat)
# List directories as a tree
fs::dir_tree(path = dir.tmf.short, recurse = TRUE)
fs::dir_tree(path = dir.tmf.long.stat, recurse = TRUE)
```
## Create folders using existing folders
```r!
#--------------------------------------
# Create folders using existing folders
#--------------------------------------
# Get folder directories within dir.tmf.short
## If we set, recursive = TRUE, the directories are returned in hierarchical order, beginning with the main directory path and then followed by the sub-directories. However, if we specify recursive = FALSE, only the child directories are returned
folder.paths.tmf.short.child <- list.dirs(path = dir.tmf.short
,full.names = TRUE
,recursive = FALSE) # length(folder.paths.tmf.short.child) 6
# Get child, grandchild directories
folder.paths.tmf.short.grandchild <- list.dirs(path=folder.paths.tmf.short.child
,full.names = TRUE) # length(folder.paths.tmf.short.grandchild) 31
# Copy source trial master file folders to destination
# Create a new parent directory
dir.tmf.new.short <- file.path(dir.GLC.presentation,"trial-master-files","numbered","short-named")
dir.create(path = dir.tmf.new.short, recursive = TRUE)
# Get child and grandchild folder names
folder.names.tmf.short.grandchild <- folder.paths.tmf.short.grandchild %>%
# Remove parent directory
gsub(pattern="C:/Google-Drive/Greenlight-clinical/P-drive/BIOSTATISTICS/Gina/tmf/Numbered/Short_Names", replacement="")
# Create new destination folder paths
dest.folder.paths.tm.short <- paste0(dir.tmf.new.short, folder.names.tmf.short.grandchild) # length(dest.folder.paths.tm.short) 31
unlist(lapply(X=dest.folder.paths.tm.short
,FUN=function(x) dir.create(x, recursive = TRUE))
)
```
## Create folders using eTMF reference model file
```r!
#-----------------------------------------------
# Create folders using eTMF reference model file
#-----------------------------------------------
eTMF <- readxl::read_excel(path = file.path(dir.GLC.data,"tmf-reference-model-version-3-0_16-june-20153.xlsx")
,sheet = "v3.0"
,skip = 2
,col_names = TRUE) %>%
# Select columns wanted by position
dplyr::select(c(1:6)) # dim(eTMF) 249 6
colnames(eTMF) <- colnames(eTMF) %>%
gsub(pattern=" #", replacement="_number") %>%
gsub(pattern=" ", replacement="_")
# Select Zone_Name="Statistics"
eTMF.stat <- eTMF %>%
dplyr::filter(Zone_Name=="Statistics") %>%
# Remove dots in *_number columns
dplyr::mutate(Section_number=gsub(pattern="\\.", replacement="",x=Section_number)
,Artifact_number=gsub(pattern="\\.", replacement="",x=Artifact_number)
,Zone_number_name=paste0(Zone_number,"_",Zone_Name)
,Section_number_name=paste0(Section_number,"_",Section_Name)
,Artifact_number_name=paste0(Artifact_number,"_",Artifact_name)
,folder.path=file.path(dir.eTMF, Zone_number_name, Section_number_name,Artifact_number_name)) # dim(eTMF.stat) 25 8
# Create new folders using the folder.path column
unlist(lapply(X=eTMF.stat$folder.path
,FUN=function(x) dir.create(x, recursive = TRUE))
)
# List directories as a tree
fs::dir_tree(path = file.path(dir.GLC.presentation,"eTMF","11_Statistics")
,recurse = TRUE)
```
## Reduce file size of a single image
```r!
#--------------------------------------------------
# Reduce file size of a single image
#--------------------------------------------------
task.3.input.folder.path <- file.path(dir.main, "Receipts-meals-not-claimed-yet") # dir.exists(task.3.input.folder.path)
task.3.output.folder.path <- file.path(dir.GLC.presentation,"reduced_images") # dir.exists(task.3.output.folder.path)
file.paths.jpg <- list.files(path=task.3.input.folder.path
,pattern = "*.jpg"
,full.names = TRUE) # length(file.paths.jpg) 98
# Reduce 1 jpg file
img <- jpeg::readJPEG(source=file.paths.jpg[2]
, native = TRUE) # class(img) [1] "nativeRaster"
# Export reduced object as a jpg file
output.file.path <-paste0(task.3.output.folder.path,"/small_",basename(file.paths.jpg[2]))
jpeg::writeJPEG(image=img
,target = output.file.path
, quality = 0.5)
```
## Reduce file size of multiple image image files
```r!
#--------------------------------------------------
# Reduce file size of multiple image image files
#--------------------------------------------------
# cl <- parallel::makeCluster(parallel::detectCores()-1
# ,outfile=file.path(task.3.output.folder.path
# ,paste0("image-reduction_error-log.txt")
# )
# ) # Use all detected cores minus 1 core (CPU) as a cluster, leaving 1 core for software or app running in the background in case desktop crashes (monitor turns black and nothing working)
# doParallel::registerDoParallel(cl, cores = parallel::detectCores()-1)
# Loop thru all jpg images to reduce file sizes
# Check existence of input folder
if(dir.exists(task.3.input.folder.path)!=TRUE){
cat("Input image folder could not be found")
} else{
# Read input image file paths as a vector
image.file.paths <- list.files(path = task.3.input.folder.path
,full.names = TRUE
,pattern = "\\.jpg$") # length(image.file.paths) 99
cat("There are",length(image.file.paths),"images in the folder")
# Use image file name to look up data for the image in the cell segmentation data
foreach::foreach(i=1:length(image.file.paths)
,.combine = 'c'
,.packages=c("foreach","dplyr","jpeg")) %dopar% {
print(paste0("=========== Processing image ",i,"============"))
# If data can be found, then subset. If data cannot be found, then copy the input image to the output
image.file.path <-image.file.paths[i]
image.file.name <-basename(image.file.path)
# Name output file
output.file.path <-paste0(task.3.output.folder.path,"/small_", image.file.name)
# Read jpg file
img <- jpeg::readJPEG(source=image.file.path, native = TRUE) # class(img) [1] "nativeRaster"
# Reduce jpg file size and output jpg file
jpeg::writeJPEG(image=img
,target = output.file.path
, quality = 0.5)
}
}
# Calculate % reduction
total.file.size.original <- 339
total.file.size.reduced <- 59.2
(total.file.size.original-total.file.size.reduced)/total.file.size.original*100 #[1] 82.53687
```
## Google map link to a single QR code
```r!
#---------------------------------------------
# Google map link to a single QR code
#---------------------------------------------
url <- "http://www.google.com/maps/place/49.46800006494457,17.11514008755796"
# A function to plot QR code in changeable color
ggQRCode <- function(text, color="black", alpha=1) {
x <- qr_code(text)
x <- as.data.frame(x)
y <- x
y$id <- rownames(y)
y <- gather(y, "key", "value", colnames(y)[-ncol(y)])
y$key = factor(y$key, levels=rev(colnames(x)))
y$id = factor(y$id, levels=rev(rownames(x)))
ggplot(y, aes_(x=~id, y=~key)) + geom_tile(aes_(fill=~value), alpha=alpha) +
scale_fill_manual(values=c("white", color)) +
# scale_fill_gradient(low="white", high=color) +
theme_void() + theme(legend.position='none')
}
# QR code in black
plot.new()
QR.black <- ggQRCode(url,color="black")
plot(QR.black)
# QR code in a different color
QR.blue <- ggQRCode(url,color="#56B4E9")
plot(QR.blue)
```
## Brisbane bike path latitude and longitude to QR codes
```r!
#-------------------------------------------
# Brisbane bike path latitude and longitude to QR codes
#-------------------------------------------
task.32.input.folder.path <- file.path(dir.GLC.presentation, "data-download")
task.32.output.folder.path <- file.path(dir.GLC.presentation, "QR-codes")
# Read csv files
bikeway <- read.delim(file=file.path(task.32.input.folder.path,"open-data-bikeway-count-sites-2004-2021.csv")
,header = TRUE
,sep = ",")[,c(1:3,5,6)] # dim(bikeway) 183 5
# Rename columns
colnames(bikeway) <- c("Site.ID","Parent.Site.ID","Site.Name","Latitude","Longitude")
# Create a new column with Google maps URLs
bikeway.2 <- bikeway %>%
dplyr::mutate(Google.map.link=paste0("http://www.google.com/maps/place/",Latitude,",",Longitude))
# Create output file paths
task.32.output.file.paths <- file.path(task.32.output.folder.path, bikeway.2$Site.ID,".svg") # length(task.32.output.file.paths) 183
# Loop thru all jpg images to reduce file sizes
# Check existence of input folder
if(dir.exists(task.32.output.folder.path)!=TRUE){
cat("Output folder could not be found")
} else{
# Use image file name to look up data for the image in the cell segmentation data
foreach::foreach(i=1:length(bikeway.2$Site.ID)
,.combine = 'c'
,.packages=c("foreach","dplyr","ggplot2","qrcode","svglite","grid")) %dopar% {
print(paste0("=========== Creating QR code ",i,"============"))
# Get Google map URL
url <-bikeway.2$Google.map.link[i]
# Name output file
output.file.path <- paste0(task.32.output.folder.path,"/", bikeway.2$Site.ID[i],".svg")
# Create QR code
QR.code <- ggQRCode(url, color="#56B4E9")
# Add label to plot
QR.code.2 <- QR.code+ggplot2::annotation_custom(grid::textGrob(bikeway.2$Site.Name[i])
,xmin = -Inf
,xmax = Inf
,ymin = -Inf
,ymax = Inf)
# Save QR code as .svg file
ggplot2::ggsave( filename = output.file.path
,plot= print(QR.code.2)
,device = "svg")
}
}
```