Try   HackMD

Regular expression, expression, and string manipulation in R

Extract numbers of varying length from the end of strings

# Collect the bare names (no directory part) of every file in the assignment
# folder whose name ends in ".pdf" or ".docx"
file.names <- list.files(
  path = dir.assignment.2,
  pattern = ".*\\.pdf$|.*\\.docx$",
  full.names = FALSE
) # length(file.names) 21

file.names
# [1] "agarslawrence_204628_20026409_DMC_Assignment2_4614834-1.docx"   
# [2] "andrewstimothy_297293_19936198_DMC Assignment 2 0953.docx"      
# [3] "ashtonisabel_294366_19979255_DMC_2021S2_A2_6589.pdf"            
# [4] "beaumontjenna_294376_20033339_DMC_Assignment2_8752.pdf"         
# [5] "claytonphilipandrew_125548_20002099_DMC_Assignment2_7392-1.pdf" 
# [6] "eddydani_297300_19967943_DMC_Assignment2_4833.docx"             
# [7] "edwardsfelicity_281039_19732901_DMC_Assignment2_4127.pdf"       
# [8] "gannonsean_294373_19967863_DMC_Assignment2_6490.pdf"            
# [9] "gawjames_295684_19988820_DMC_Assignment2_7389.docx"             
# [10] "godwinsam_294371_20034982_DMC_Assignment2_unikey_6083_FINAL.pdf"
# [11] "guanqi_LATE_296467_20039218_ DMC_Assignment2_0313-1.pdf"        
# [12] "herbertrob_110692_19622280_DMC_Assignment2_2309.pdf"            
# [13] "hopkinsnathan_293664_20025212_DMC_Assignment2_8928.pdf"         
# [14] "huangannette_265602_20028154_DMC_Assignment 2_6822.pdf"         
# [15] "ilaovince_294377_20026222_DMC_Assignment2_6193.docx"            
# [16] "lammichael_293038_19961008_DMC_assignment2_0718.docx"           
# [17] "lamyanyin_294380_19991757_DMC_Assignment2_6098.docx"            
# [18] "leekenneth_281025_19958430_DMC_Assignment2_8800-2.docx"         
# [19] "luimatthew_296777_20032458_DMC_Assignment2_0384-1.pdf"          
# [20] "mcphailsandi_283784_20036309_5572_Assign2_DMC-1.pdf"            
# [21] "millerjake_278059_20036475_DMC_Assignment2_8847.docx"  

# Build a table pairing each cleaned file name (`have`) with the 4-digit
# assignment ID that ends it (`want`).
file.names.1 <- data.frame(file.name = file.names,
                           stringsAsFactors = FALSE) %>%
  dplyr::mutate(
    # Strip the extension plus any trailing submission suffix. Dots are escaped
    # (a bare "." matches any character) and the pattern is anchored with "$",
    # so only the end of a name is ever removed. The column `file.name` is used
    # rather than the outer vector so the step works on whatever data is piped in.
    have = stringr::str_remove(
      string = file.name,
      pattern = "(_Assign2_DMC-1|_FINAL|-1|-2)?\\.(docx|pdf)$"
    ),
    # Keep the last 4 digits of `have`. The leading ".*" is greedy, so a
    # "{4,7}" quantifier after it can only ever capture 4 digits; "{4}" states
    # that outright. Strings that do not end in 4 digits are returned unchanged.
    want = sub(pattern = "^.*(\\d{4})$", replacement = "\\1", x = have,
               perl = TRUE)
  ) %>%
  # Keep wanted columns
  dplyr::select(have, want)

# Output
#                                                        have want
# 1     agarslawrence_204628_20026409_DMC_Assignment2_4614834 4834
# 2      andrewstimothy_297293_19936198_DMC Assignment 2 0953 0953
# 3           ashtonisabel_294366_19979255_DMC_2021S2_A2_6589 6589
# 4        beaumontjenna_294376_20033339_DMC_Assignment2_8752 8752
# 5  claytonphilipandrew_125548_20002099_DMC_Assignment2_7392 7392
# 6             eddydani_297300_19967943_DMC_Assignment2_4833 4833
# 7      edwardsfelicity_281039_19732901_DMC_Assignment2_4127 4127
# 8           gannonsean_294373_19967863_DMC_Assignment2_6490 6490
# 9             gawjames_295684_19988820_DMC_Assignment2_7389 7389
# 10    godwinsam_294371_20034982_DMC_Assignment2_unikey_6083 6083
# 11        guanqi_LATE_296467_20039218_ DMC_Assignment2_0313 0313
# 12          herbertrob_110692_19622280_DMC_Assignment2_2309 2309
# 13       hopkinsnathan_293664_20025212_DMC_Assignment2_8928 8928
# 14       huangannette_265602_20028154_DMC_Assignment 2_6822 6822
# 15           ilaovince_294377_20026222_DMC_Assignment2_6193 6193
# 16          lammichael_293038_19961008_DMC_assignment2_0718 0718
# 17           lamyanyin_294380_19991757_DMC_Assignment2_6098 6098
# 18          leekenneth_281025_19958430_DMC_Assignment2_8800 8800
# 19          luimatthew_296777_20032458_DMC_Assignment2_0384 0384
# 20                        mcphailsandi_283784_20036309_5572 5572
# 21          millerjake_278059_20036475_DMC_Assignment2_8847 8847

Specify filtration conditions as expression objects and evaluate them in dplyr::filter()

# A test data.frame with four columns of uniform random numbers.
# NOTE: no seed is set in this snippet, so the row counts quoted in the
# trailing "dim(...)" comments are from one particular run and will differ
# from run to run.
d <- data.frame(  x1=runif(10)
                 ,x2=runif(10)
                 ,y1=runif(10)
                 ,y2=runif(10)
                 ,stringsAsFactors = F) # dim(d) 10 4

#------------------------------------
# Subset data without using expression
#------------------------------------
# Subset data using dplyr::filter on column positions.
# Inside the pipe, `.` stands for the data frame on the left-hand side,
# so .[,1] is its first column.
d.1 <- d %>% dplyr::filter(.[,1]> 0.3 & .[,3]> 0.2) # dim(d.1) 5 4
d.2 <- d %>% dplyr::filter(.[,2]> 0.4 & .[,4]> 0.6) # dim(d.2) 2 4
d.3 <- d %>% dplyr::filter(.[,1]> 0.3 & .[,3]> 0.2 & .[,2]> 0.4 & .[,4]> 0.6) # dim(d.3) 1 4

#------------------------------------
# Subset data using expression
#------------------------------------
# Store the filtration conditions above, unevaluated, in expression objects
expression.1 <- expression(.[,1] > 0.3 & .[,3] > 0.2 ) # class(expression.1) [1] "expression"
expression.2 <- expression(.[,2] > 0.4 & .[,4] > 0.6 ) # class(expression.2) [1] "expression"
expression.3 <- expression(.[,1] > 0.3 & .[,3] > 0.2 & .[,2] > 0.4 & .[,4] > 0.6 ) # class(expression.3) [1] "expression"

# Evaluate the expression and use the filtration conditions to filter data
d.1 <- d %>% dplyr::filter(eval(expression.1)) # dim(d.1) 5 4
d.2 <- d %>% dplyr::filter(eval(expression.2)) # dim(d.2) 2 4
d.3 <- d %>% dplyr::filter(eval(expression.3)) # dim(d.3) 1 4

#------------------------------------
# Subset data using expression
# Pass variables to expression
#------------------------------------
# Thresholds are ordinary variables; the expressions below refer to them by
# name, so they are looked up when the expression is evaluated, not when it
# is created.
threshold.x.1 <- 0.3
threshold.y.1 <- 0.2

threshold.x.2 <- 0.4
threshold.y.2 <- 0.6

expression.1 <- expression(.[,1] > threshold.x.1 & .[,3] > threshold.y.1 ) # class(expression.1) [1] "expression"
expression.2 <- expression(.[,2] > threshold.x.2 & .[,4] > threshold.y.2 ) # class(expression.2) [1] "expression"
expression.3 <- expression(.[,1] > threshold.x.1 &
                           .[,3] > threshold.y.1 & 
                           .[,2] > threshold.x.2 & 
                           .[,4] > threshold.y.2 )

d.1 <- d %>% dplyr::filter(eval(expression.1)) # dim(d.1) 5 4
d.2 <- d %>% dplyr::filter(eval(expression.2)) # dim(d.2) 2 4
d.3 <- d %>% dplyr::filter(eval(expression.3)) # dim(d.3) 1 4

Examples of subsetting numbers from image file names

# Pull the core label "[1,1,B]" out of x1
x1 <- "200416_PD1CD8 HNSCC TMA B_Core[1,1,B]_[50535,21284].im3"
sub(
  pattern = ".*(\\[[0-9],[0-9],[A-Z]\\]).*",
  replacement = "\\1", # keep only what the 1st pair of parentheses captured
  x = x1
) # [1] "[1,1,B]"

# Pull the core label together with the coordinates, "[1,1,B]_[50535,21284]"
sub(
  pattern = ".*(\\[[0-9],[0-9],[A-Z]\\]_\\[\\d+,\\d+\\]).*",
  replacement = "\\1", # 1st capture group
  x = x1
) # [1] "[1,1,B]_[50535,21284]"

# Pull "200330_[50706,13606]" out of x2. The first group captures the space in
# front of the date, which stops the greedy ".*" from swallowing its digits.
x2 <- "B16F10 KO 200330_[50706,13606]_CD8_path_view.jpg" 
sub(
  pattern = ".*( )(\\d+_\\[\\d+,\\d+\\]).*",
  replacement = "\\2", # 2nd capture group
  x = x2
) # [1] "200330_[50706,13606]" 

# Pull "200330_[50706,13606]" out of x3, where the date starts the string
x3 <- "200330_[50706,13606]_CD8_path_view.jpg"
sub(
  pattern = "(\\d+_\\[\\d+,\\d+\\]).*",
  replacement = "\\1", # 1st capture group
  x = x3
) # [1] "200330_[50706,13606]"

# Pitfall: with a leading greedy ".*" only the last digit of the date is left
# for the capture group
sub(
  pattern = ".*(\\d+_\\[\\d+,\\d+\\]).*",
  replacement = "\\1", # 1st capture group
  x = x3
) # [1] "0_[50706,13606]"

# Get "200330_[50706,13606]" from x3 and x2 using user xiangying's approach:
# locate the match first, then cut the string at the start/end positions found
location.x3 <- stringr::str_locate(string = x3, pattern = "\\d{1,7}_\\[\\d{1,7},\\d{1,7}\\]")
stringr::str_sub(string = x3, location.x3) # [1] "200330_[50706,13606]"

location.x2 <- stringr::str_locate(string = x2, pattern = "\\d{1,7}_\\[\\d{1,7},\\d{1,7}\\]")
stringr::str_sub(string = x2, location.x2) # [1] "200330_[50706,13606]"

Quantifiers

Regular Expressions in R
Quantifiers specify how many repetitions of the pattern.

*: matches at least 0 times.
+: matches at least 1 times.
?: matches at most 1 times.
{n}: matches exactly n times.
{n,}: matches at least n times.
{n,m}: matches between n and m times.


Operators

Regular Expressions in R

.: matches any single character, as shown in the first example.
[]: a character list, matches any one of the characters inside the square brackets. We can also use - inside the brackets to specify a range of characters.


Character classes

Regular Expressions in R
Character classes allows to – surprise! – specify entire classes of characters, such as numbers, letters, etc. There are two flavors of character classes, one uses [: and :] around a predefined name inside square brackets and the other uses \ and a special character. They are sometimes interchangeable.

\d: digits, equivalent to [0-9].
\D: non-digits, equivalent to [^0-9].
\w: word characters, equivalent to [[:alnum:]_] or [A-Za-z0-9_].
\W: not word, equivalent to [^A-Za-z0-9_].
\s: whitespace (space, tab, newline, etc.), equivalent to [[:space:]].
\S: not whitespace, equivalent to [^[:space:]].

[:digit:] or \d: digits, 0 1 2 3 4 5 6 7 8 9, equivalent to [0-9].
[:lower:]: lower-case letters, equivalent to [a-z].
[:upper:]: upper-case letters, equivalent to [A-Z].
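[:alpha:]: alphabetic characters, equivalent to [[:lower:][:upper:]] or [A-Za-z]. (Do not write [A-z]: that range also covers the punctuation characters that sit between Z and a in ASCII.)
[:alnum:]: alphanumeric characters, equivalent to [[:alpha:][:digit:]] or [A-Za-z0-9].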
[:xdigit:]: hexadecimal digits (base 16), 0 1 2 3 4 5 6 7 8 9 A B C D E F a b c d e f, equivalent to [0-9A-Fa-f].
[:blank:]: blank characters, i.e. space and tab.
[:space:]: space characters: tab, newline, vertical tab, form feed, carriage return, space.
[:punct:]: punctuation characters, e.g.,

! " # $ % & ' ( ) * + , - . / : ; < = > ? @ [ \ ] ^ _ ` { | } ~.

[:graph:]: graphical (human readable) characters: equivalent to [[:alnum:][:punct:]].
[:print:]: printable characters, equivalent to [[:alnum:][:punct:]\s].
[:cntrl:]: control characters, like \n or \r, [\x00-\x1F\x7F].
Note:

[:...:] has to be used inside square brackets, e.g. [[:digit:]].
\ itself is a special character that needs escape, e.g. \\d. Do not confuse these regular expressions with R escape sequences such as \t.


gsub, sub


Fix strings that do not follow the same pattern as the other strings

# Most elements are laid out as "<centre ID> <disease> <3-digit number>, <name>";
# the last element has the centre ID in the wrong place
string <- c("A lung 001, Tom", "B melanoma 001, John", "C HNSCC 010, Kim", "breast 001 D, Kate")

# Flag which elements follow the expected layout
follows.layout <- grepl(pattern = "^[A-C] \\w+ \\d+, \\w+", x = string)

# Strings that are in the pattern
strings.matched.patterns <- string[follows.layout]
strings.matched.patterns
# [1] "A lung 001, Tom"      "B melanoma 001, John" "C HNSCC 010, Kim"

# Strings that are not in the pattern
strings.not.matched.patterns <- string[!follows.layout]
strings.not.matched.patterns
# [1] "breast 001 D, Kate"

# Take the odd string apart one piece at a time. Each call wraps only the piece
# of interest in the grouping operator () and returns it through "\\1"
center <- sub(pattern = "\\w+ \\d+ ([A-Z]), \\w+", replacement = "\\1", x = strings.not.matched.patterns)
center
#[1] "D"

disease <- sub(pattern = "(\\w+) \\d+ [A-Z], \\w+", replacement = "\\1", x = strings.not.matched.patterns)
disease
#[1] "breast"

ID <- sub(pattern = "\\w+ (\\d+) [A-Z], \\w+", replacement = "\\1", x = strings.not.matched.patterns)
ID
#[1] "001"

# Two groups here; the name is the second one
name <- sub(pattern = "(\\w+) \\d+ [A-Z], (\\w+)", replacement = "\\2", x = strings.not.matched.patterns)
name
#[1] "Kate"

# Reassemble the pieces in the expected order
new.string <- paste0(center, " ", disease, " ", ID, ", ", name)
new.string
#[1] "D breast 001, Kate"

# The same repair in a single call: capture all four pieces and reorder them
# in the replacement
new.string <- sub(
  pattern = "(\\w+) (\\d+) ([A-Z]), (\\w+)",
  replacement = "\\3 \\1 \\2, \\4",
  x = strings.not.matched.patterns
)
new.string
#[1] "D breast 001, Kate"

# Well-formed strings followed by the repaired one
strings.cleaned <- c(strings.matched.patterns, new.string)
strings.cleaned
#[1] "A lung 001, Tom"      "B melanoma 001, John" "C HNSCC 010, Kim"     "D breast 001, Kate"

Replace single back slash with a dash

Replace single backslash in R

# A string containing one backslash. In R source a backslash has to be written
# as "\\", so "A\\B" is the three-character string A\B
x <- "A\\B"
# Replace \ with -. The regex needs an escaped backslash as well, hence the
# four backslashes in the pattern
gsub("\\\\", "-", x = x)
# "A-B"

# A string containing a forward slash, which needs no escaping
y <- "C/D"

# Replace / with -
gsub("/", "-", x = y)
#[1] "C-D"

Extract numbers from string

Extract only numbers from a string with punctuation and spaces in R? [duplicate]
R Only extract 3 digit numbers from a string

# Pattern walk-through:
#   (\\S+\\s+){2}  a run of non-white-space followed by white space, twice
#                  (group 1; a repeated group keeps its last repetition)
#   (\\d+)         one or more digits (group 2)
#   ,.*            a comma and everything up to the end of the string
# sub() replaces the whole match with the back-reference given as replacement
df1 <- data.frame(
  A = c("XY Z 123, 30009 Addr",
        "AB CBA 12, 900000 Addr",
        "FC AX 1234, 977777 Addr"),
  stringsAsFactors = FALSE
)

df1$A.part1 <- sub(pattern = "(\\S+\\s+){2}(\\d+),.*", replacement = "\\1", x = df1$A) # [1] "Z "   "CBA " "AX "
df1$A.part2 <- as.numeric(
  sub(pattern = "(\\S+\\s+){2}(\\d+),.*", replacement = "\\2", x = df1$A)
) #[1]  123   12 1234
# There is no third group, so nothing is substituted
df1$A.part3 <- sub(pattern = "(\\S+\\s+){2}(\\d+),.*", replacement = "\\3", x = df1$A) # [1] "" "" ""

Search a pattern that starts with "GSCAN" and ends with "S" and 1-digit number between 1 and 8

# A vector to search a pattern
> colnames
 [1] "FAMID"                    "ID"                       "FATHERID"                 "MOTHERID"                
 [5] "GENDER"                   "fid"                      "GSCAN_Q1"                 "GSCAN_Q2_recode"         
 [9] "GSCAN_Q3_recode"          "GSCAN_Q4"                 "GSCAN_Q5_Drinks_per_week" "GSCAN_Q6_recode"         
[29] "GSCAN.ai.S1"              "GSCAN.ai.S2"              "GSCAN.ai.S3"              "GSCAN.ai.S4"             
[33] "GSCAN.ai.S5"              "GSCAN.ai.S6"              "GSCAN.ai.S7"              "GSCAN.ai.S8"             

# Find names that start with "GSCAN.", end with ".S" plus one digit from 1 to 8,
# and have anything in between. Both dots are escaped: an unescaped "." matches
# any character, so it would not actually require the 2 literal dots.
grep(pattern = "^GSCAN\\..*\\.S[1-8]$", x = colnames, value = TRUE)
[1] "GSCAN.ai.S1"  "GSCAN.ai.S2"  "GSCAN.ai.S3"  "GSCAN.ai.S4"  "GSCAN.ai.S5"  "GSCAN.ai.S6"  "GSCAN.ai.S7"  "GSCAN.ai.S8"

Search a pattern that ends with ".x". Delete this colname suffix

*remove all delimiters at beginning and end of string

# The data to change colnames
names(OR_horiPleio_keep_small)
[1] "id.exposure" "id.outcome"  "outcome.x"   "exposure.x"  "method"      "nsnp"        "b"           "se.x"        "pval.x"     
[10] "lo_ci"       "up_ci"       "or"          "or_lci95"    "or_uci95"   

# Remove the suffix ".x". The dot is escaped so that only a literal ".x" at the
# end of a name is removed; an unescaped "." matches any character and would
# also strip the last two letters of a name such as "max".
sub(pattern = "\\.x$", replacement = "", x = names(OR_horiPleio_keep_small))
[1] "id.exposure" "id.outcome"  "outcome"     "exposure"    "method"      "nsnp"        "b"           "se"          "pval"       
[10] "lo_ci"       "up_ci"       "or"          "or_lci95"    "or_uci95"

R user group meeting # 9 slide content

#----------------------------------------------------------------
# Scenario 1: getting output from linear or logistic regression
#----------------------------------------------------------------
hsb2 <- read.csv("https://stats.idre.ucla.edu/stat/data/hsb2.csv") # dim(hsb2)

# Linear regression of `write` on a categorical predictor, then its summary
race.model <- lm(write ~ factor(race), data = hsb2)
linear.model.summary <- summary(race.model)

# Coefficient matrix: one row per term, row names carry the term labels
coefficients <- coef(linear.model.summary)

# Turn the matrix into a data.frame without losing its dimnames
## .margins = 1 hands adply() one row at a time
## .fun = c flattens each row into a vector; the row name is returned in
## a leading column called X1
coefficients.dataFrame <- plyr::adply(
  .data = coefficients,
  .margins = 1,
  .fun = c
) # class(coefficients.dataFrame) "data.frame"

coefficients.dataFrame$X1 <- as.character(coefficients.dataFrame$X1)

# Rename the label column from X1 to Predictor
new.column.names <- gsub(
  pattern = "X1",
  replacement = "Predictor",
  x = colnames(coefficients.dataFrame)
)

colnames(coefficients.dataFrame) <- new.column.names

# Strip "(", "factor(" and ")" from the term labels, first with base gsub()
patterns <- "\\(|factor\\(|\\)"
temp1 <- coefficients.dataFrame
temp1$Predictor <- gsub(
  pattern = patterns,
  replacement = "",
  x = temp1$Predictor
)

# ...and then the same clean-up with stringr::str_replace_all()
temp2 <- coefficients.dataFrame
temp2$Predictor <- stringr::str_replace_all(
  string = temp2$Predictor,
  pattern = patterns,
  replacement = ""
)

# Predictor  Estimate Std. Error    t value     Pr(>|t|)
# 1 Intercept 46.458333   1.842243 25.2183502 1.921811e-63
# 2     race2 11.541667   3.286129  3.5122376 5.515272e-04
# 3     race3  1.741667   2.732488  0.6373922 5.246133e-01
# 4     race4  7.596839   1.988870  3.8196768 1.792682e-04
#----------------------------------------------------------------
# Scenario 2: Subset file paths with a pattern
#----------------------------------------------------------------
source.files.path <- "/mnt/lustre/working/lab_nickm/lunC/MR_ICC_GSCAN_201806/two-sample-MR/input/harmonised-data"

all.files <- list.files(path = source.files.path) # length(all.files) 383

# Subset TSV files (positive filtering) with list.files()
## `pattern` is a regular expression: ".*" is "anything", "\\." a literal dot
patterns <- "harmonised-data.*\\.tsv$"
tsv.files <- list.files(path = source.files.path,
                        pattern = patterns,
                        full.names = TRUE) # length(tsv.files) 220

# Subset TSV files with Sys.glob()
## Sys.glob() does wildcard expansion like the Unix shell. This is wildcard
## syntax, not a regular expression: "*" alone means "anything" and "." is
## already literal, so no backslash is needed (a backslash is a path separator
## on Windows and would break the pattern there).
patterns <- "harmonised-data*.tsv"
tsv.files <- Sys.glob(file.path(source.files.path, patterns)) # length(tsv.files) 220

# Subset non tsv files (negative filtering)
patterns <- "harmonised-data.*\\.tsv$"
non.tsv.files <- grep(pattern = patterns,
                      x = all.files,
                      value = TRUE,
                      invert = TRUE) # length(non.tsv.files) 163
#----------------------------------------------------------------
# Scenario 3: Subset groups
#----------------------------------------------------------------
# Create subgroups 
group.1 <- c("NSW", "ACT", "VIC", "QLD", "SA", "WA", "TAS", "NT") # length(group.1) 8
group.2 <- paste0("age", c("4-20", "21-40", "41-60", "61+"))
group.3 <- c("males", "females", "bothSexes")

# Every combination of state, age band and sex, glued into one label each
## data.table::CJ() cross-joins the three vectors into a data.table; the
## expression after the comma pastes its columns together row by row
all.groups.subgroups <- data.table::CJ(
  group.1, group.2, group.3,
  sorted = FALSE
)[, paste(group.1, group.2, group.3, sep ="_")] # length(all.groups.subgroups) 96

# Labels for males only
males <- grep(pattern = "_males$", x = all.groups.subgroups, value = TRUE) # length(males) 32

# Target: females aged over 61 from eastern states (QLD, NSW, VIC, ACT, TAS)
# One pattern per criterion
pattern.1 <- "^NSW|^QLD|^VIC|^ACT|^TAS"
pattern.2 <- "_females$"
pattern.3 <- "61\\+"

# Step 1: eastern states
eastern.states <- grep(pattern = pattern.1, x = all.groups.subgroups, value = TRUE) # length(eastern.states) 60 

# Step 2: females within eastern states
eastern.states.females <- grep(pattern = pattern.2, x = eastern.states, value = TRUE) # length(eastern.states.females) 20

# Step 3: the 61+ age band within those
eastern.states.females.61plus <- grep(pattern = pattern.3, x = eastern.states.females, value = TRUE) # length(eastern.states.females.61plus) 5

# The same three steps chained with the pipe
eastern.states.females.61plus <- all.groups.subgroups %>%
  grep(pattern = pattern.1, x = ., value = TRUE) %>%
  grep(pattern = pattern.2, x = ., value = TRUE) %>%
  grep(pattern = pattern.3, x = ., value = TRUE) # length(eastern.states.females.61plus) 5

#----------------------------------------------------------------
# Summary
#----------------------------------------------------------------
gsub(pattern = )
str_replace_all(pattern = )
list.files(pattern=)
grep(pattern = )
Sys.glob()

list.files() sorts your data[1]

# Manually list the trait pairs as the order of the destination file paths. Be aware that data have been sorted
# Exposure              clumping p1 Outcome
#------------------------------------------------
# GSCAN drinks per week p1-1e-5     GSCAN CPD
# UKB-caffeine          p1-1e-5     UKB-CPD
# UKB-caffeine          p1-5e-8     ICC-CI
# GSCAN SI              p1-1e-5     UKB-caffeine
# GSCAN SI              p1-5e-8     UKB-caffeine
#------------------------------------------------

Get a list of file paths with a pattern[2]

#---------------------------------------------------
# Get a list of paths of harmonised data files
#---------------------------------------------------
# Method 1: use list.files() and glob2rx()
## glob2rx() takes a wildcard pattern (with * as the wildcard) and returns the
## equivalent regular expression for R; here it added ^, the dot before *, \\ and $
### Regex in R versus shell wildcards: (1) in regex a dot means "match any
### character", (2) in regex * means "the preceding item zero or more times",
### so regex .* plays the role of * in the shell
pattern.file.names <- glob2rx("harmonised-data_exposure*.tsv") # "^harmonised-data_exposure.*\\.tsv$"

file.paths.harmonised.data <- list.files(path = loc.twoSampleMR.harmonised,
                                         pattern = pattern.file.names,
                                         full.names = TRUE) # length(file.paths.harmonised.data) 220
# Method 2: use Sys.glob()
# Sys.glob() expands wildcards in file paths. Its argument is spelled `paths`,
# and the pattern is a wildcard, not a regex, so the dot is written as-is
# (a backslash would be treated as a path separator on Windows)
file.paths.harmonised.data <- Sys.glob(paths = paste0(loc.twoSampleMR.harmonised, "harmonised-data*.tsv")) # length(file.paths.harmonised.data) 220

Copy selective files from one directory to another directory[3]

# Regular expressions for the file names to pick out of the file paths. The
# paths do not begin with the file name, so the patterns are anchored at the
# end only ("$"). These are regexes, not shell wildcards: "any run of
# characters" is written ".*", and a leading "*" has nothing to repeat.
pattern.1 <- "dpw-noICC-LDWindow-kb-10000-R2-0\\.01-p1-1e-5-p2-1e-5-linear-BETA-added_outcome-GSCAN-CPD\\.tsv$"
pattern.2 <- "si-noICC-LDWindow-kb-10000-R2-0\\.01-p1-1e-5-p2-1e-5.*_outcome-UKB-caffeine\\.tsv$"
pattern.3 <- "UKB-caffeine-LDWindow-kb-10000-R2-0\\.01-p1-1e-5-p2-1e-5.*_outcome-UKB-CPD\\.tsv$"
pattern.4 <- "si-noICC-LDWindow-kb-10000-R2-0\\.01-p1-5e-8-p2-1e-6.*_outcome-UKB-caffeine\\.tsv$"

# Combine multiple patterns as one string; "|" means "or"
patterns <- paste(pattern.1, pattern.2, pattern.3, pattern.4, sep = "|")

# Get paths of the files whose names match any of the patterns
source.file.paths <- grep(pattern = patterns,
                          x = list.files(path = loc.harmonised.data,
                                         full.names = TRUE),
                          value = TRUE) # length(source.file.paths) 4

# Create destination file paths: same file names under the destination folder
destin.file.paths <- paste0(loc.MRPRESSO.input, basename(source.file.paths))

# Copy the selected files to the destination folder (the originals stay in
# place). file.copy() returns one TRUE/FALSE per file.
file.copy(from = source.file.paths,
          to = destin.file.paths)

Parse log files generated from Linux software and tabulate data as a single file[4].

Subset file paths with negative filtration using dplyr::filter(!grepl("pattern",variable))

# Wildcard pattern for the log file names, converted to a regular expression
pattern.file.name.h2.log <- glob2rx("SNP-heritability_*-*.log") # "^SNP-heritability_.*-.*\\.log$"

# Full paths of the matching log files, held in a one-column data.frame,
# with every path that contains "CCPD" filtered out (negative filtering)
h2.log <- data.frame(
  file.path = list.files(
    path = loc.LDSC.h2,
    pattern = pattern.file.name.h2.log,
    full.names = TRUE
  ),
  stringsAsFactors = FALSE
) %>% # 11 rows, 1 column before filtering
  dplyr::filter(!grepl("CCPD", file.path))

Remove special characters. (1) What are special characters? (2) How to search them and then remove them? (3) glob2rx() Change wildcard aka globbing patterns into the corresponding regular expressions (regexp).

# The target: text with no special characters left in it
clean.string <- "Clean string with words separated by white space"

# Two inputs of increasing difficulty; the second assignment replaces the first
string.with.special.characters <- "Clean string with words separated by (white space)"
string.with.special.characters <- "Clean string with words separated by (white space) ~!@#$%^&*(){}_+:\"<>?,./;'[]-="

# Each special character is escaped with \\ and the alternatives are joined
# with |; this removes ( ) ~ and ! and leaves everything else untouched
pattern.to.search <- "\\(|\\)|\\~|\\!"
gsub(pattern = pattern.to.search, replacement = "", x = string.with.special.characters)

Drop/delete columns in a data.frame

# Drop unwanted columns by name. drop = FALSE keeps the result a data.frame
# even when only one column is left; without it a single remaining column
# would be returned as a bare vector.
columns.to.drop <- c("clumping.p1", "clumping.p2")
exposure.GWAS <- exposure.GWAS[, !(names(exposure.GWAS) %in% columns.to.drop), drop = FALSE]

Regular expression

# A regex for scientific notation
-?[\d.]+(?:e-?\d+)?

# Explanation
-?      # an optional -
[\d.]+  # a series of digits or dots (see *1)
(?:     # start non capturing group
  e     # "e"
  -?    # an optional -
  \d+   # digits
)?      # end non-capturing group, make optional

Manipulating string in data.frame

Extract scientific notations from a column.

# A column contains scientific notation
head(exposure.GWAS$fileNameSpecial,2)
[1] "clumped-ai-noICC-LDWindow-kb-10000-R2-0.01-p1-1e-5-p2-1e-5"
[2] "clumped-ai-noICC-LDWindow-kb-10000-R2-0.01-p1-5e-8-p2-1e-6"

# Extract scientific notations using regex
## Only 2 values to extract for p1: 1e-5, 5e-8
## Only 2 values to extract for p2: 1e-5, 1e-6
## Inside [] a "|" is a literal character, not "or", so the classes are written
## [15], [58] and [56]. Each pattern is tied to its own "p1-" / "p2-" prefix
## with a look-behind; str_extract() returns the first match only, so without
## the prefix the p2 pattern could pick up the p1 value that precedes it.
library(dplyr)
exposure.GWAS <- exposure.GWAS %>%
  dplyr::mutate(
    clumping.p1.value = stringr::str_extract(fileNameSpecial, "(?<=p1-)[15]e-[58]"),
    clumping.p2.value = stringr::str_extract(fileNameSpecial, "(?<=p2-)1e-[56]")
  ) # dim(exposure.GWAS) 20 13

> head(exposure.GWAS$clumping.p1.value,2)
[1] "1e-5" "5e-8"
> head(exposure.GWAS$clumping.p2.value,2)
[1] "1e-5" "1e-6"

Delete/remove strings in a column using stringr::str_replace_all

# Vertically combine two data.frames with identical column structure, then
# delete 3 strings in the predictor column: (1) a leading "factor(",
# (2) a trailing ")1", (3) a trailing ")2".
# The pattern is already a regular expression, so it is passed on as-is;
# glob2rx() is only for converting wildcard patterns and must not be applied
# to a regex.
strings.to.search <- "^factor\\(|\\)[12]$"

tem1.tem2 <- dplyr::bind_rows(tem1.small, tem2.small) %>%
  dplyr::mutate(predictor = stringr::str_replace_all(predictor,
                                                     pattern = strings.to.search,
                                                     replacement = "")) # dim(tem1.tem2)66 9

Delete/remove part of a string in a column using gsub(data$column,)

# Two columns to delete "p1-" and "p2-"
head(exposure.GWAS$clumping.p1,2)
[1] "p1-1e-5" "p1-5e-8"
> head(exposure.GWAS$clumping.p2,2)
[1] "p2-1e-5" "p2-1e-6"

# Remove the "p1-" / "p2-" labels, leaving only the threshold values
exposure.GWAS$clumping.p1.value <- gsub(pattern = "p1-", replacement = "", x = exposure.GWAS$clumping.p1)
exposure.GWAS$clumping.p2.value <- gsub(pattern = "p2-", replacement = "", x = exposure.GWAS$clumping.p2)

head(exposure.GWAS$clumping.p1.value,2)
[1] "1e-5" "5e-8"
head(exposure.GWAS$clumping.p2.value,2)
[1] "1e-5" "1e-6"

Replace dots with dashes in column names

# Swap every dot in the column names for a dash. The dot is escaped so it is
# taken literally. Distinct names can become identical after the swap, so
# check the result.
names(data) <- gsub(pattern = "\\.", replacement = "-", x = names(data))

# Swap every dash for an underscore with str_replace_all(), flattening the
# result to a plain vector
unlist(str_replace_all(string = inputFileNamePart2, pattern = "-", replacement = "_"))

List files from one or more directories whose names match given patterns. Syntax form: list.files(path=c(folder1,folder2,..),pattern=glob2rx("pattern1*pattern2"),full.names=TRUE). Sys.glob() doesn't seem to take a complex pattern. glob2rx() converts a pattern that includes a wildcard into the equivalent regular expression; you then need to pass this regular expression on to one of R's pattern-matching tools. When 5 patterns are specified, as in list.files(path=,pattern=glob2rx("p1|p2|p3|p4|p5"),full.names=TRUE), the call returns the full paths of the files that match any of them.

# List files from 4 folders whose names match 1 pattern. `path` accepts a
# vector of folders; full.names = T returns paths rather than bare names.
# NOTE(review): the string given to glob2rx() contains ^ and $, which are
# regular-expression anchors rather than wildcard syntax. This presumably
# works because glob2rx() leaves them in place; print the generated pattern
# to confirm it is what you expect.
filePath.UKB <- list.files(path=c(locUKB.3456.QC4,locUKB.ESDPW.QC4,locUKB.CCPD.QC4,locUKB.20161.QC4)
                           ,pattern = glob2rx("^GWAS_from-clumped-SNPs_GWAS-UKB*_sample-size-added$")
                           ,full.names = T) # length(filePath.UKB) 8
# List files whose names match any of several patterns, joined with "|".
# NOTE(review): "|", "^" and "$" are regex syntax mixed into a wildcard string;
# verify the output of glob2rx(pattern.GSCAN.files) before relying on it.
pattern.GSCAN.files <- "^GWAS_from-clumped-SNPs_ai_noICC_LDWindow-kb-10000_R2-0.01_*|^GWAS_from-clumped-SNPs_cpd_noICC_LDWindow-kb-10000_R2-0.01*|^GWAS_from-clumped-SNPs_dpw_*linear-BETA-added$|GWAS_from-clumped-SNPs_sc_noICC_LDWindow-kb-10000_R2-0.01_*|GWAS_from-clumped-SNPs_si_noICC_LDWindow-kb-10000_R2-0.01_*"

filePath.clumped.GWAS.GSCAN <- list.files(path=locGSCAN
                                          ,pattern=glob2rx(pattern.GSCAN.files)
                                          ,full.names = TRUE) # length(filePath.clumped.GWAS.GSCAN) 10                     

Find file names that start with a pattern and end with another pattern

# List files that start with "revised_bolt_imputed", denoted by ^, and end with
# ".bgen.assoc" or ".glm.logistic", denoted by $. The two endings are grouped
# in parentheses so that the ^ prefix applies to both of them (without the
# group, "|" splits the whole pattern and the second ending would match any
# file). Dots are escaped so they match a literal dot only.
list.files(path = current_dir,
           pattern = "^revised_bolt_imputed.*\\.(bgen\\.assoc|glm\\.logistic)$")

How to use the strsplit function with a period? When using a regular expression in the split argument of strsplit(), you've got to escape the . with \\., or use a charclass [.]. Otherwise you use . as its special character meaning, "any single character".

To extract numbers from a string, replace non-numeric part with nothing

# Extract 10 from "LDBasedSNPclumping_chr10.clumped" in two steps
## Step 1: split the file name at the dots and keep the first piece.
##         "[.]" is a character class, so the dot is taken literally
## Step 2: delete the fixed prefix from that piece with
##         gsub(<text to look for>, <replacement>, <string to work on>)
##         and convert what is left to a number
fileNameFirstPart <- unlist(strsplit(fileName, "[.]"))[1]
CHR <- as.numeric(gsub("LDBasedSNPclumping_chr", "", fileNameFirstPart))

To move a directory in R like mv in UNIX, first copy the directory recursively to the destination folder, check if the dates of last modification are preserved, and then delete the folders that have been copied.

*Copy folders from one directory to another in R

#-------------------------------------------------------------------------------------#
#------- Archive old folders keeping date of last modification------------------------#
#-------------------------------------------------------------------------------------#
destination <- paste0(locGCTAInput, "archive")

source_folders <- c(paste0("phenoGroup2_everDrug1to10-CUD_GCTA--", c("covar", "qcovar", "pheno"))
                    ,paste0("phenoGroup3_alcoho-tobacc_GCTA--", c("covar", "qcovar", "pheno"))
                    ,paste0("phenoGroup5_diagMD-diagSU_GCTA--", c("covar", "qcovar", "pheno"))
                    )

# Make sure the archive folder exists before copying into it
if (!dir.exists(destination)) {
  dir.create(destination, recursive = TRUE)
}

# Copy 9 folders to the archive folder, recording for each one whether
# file.copy() reported success
copied <- setNames(logical(length(source_folders)), source_folders)
count <- 0
for (folderToMove in source_folders) {
  path_folderToMove <- paste0(locGCTAInput, folderToMove)
  count <- count + 1
  print(paste0("===================================== iteration", count," ===================="))
  print(paste0("path_folderToMove=",path_folderToMove))
  copied[[folderToMove]] <- isTRUE(
    file.copy(from = path_folderToMove
              ,to = destination
              ,recursive = TRUE # TRUE indicates the to= is a directory
              ,copy.date = TRUE # if TRUE, preserve date of last modification of "from"
    )
  )
}

# Go check if date of last modification are preserved in the archive folder

# Delete the source folders that have been copied to the archive folder.
# A folder whose copy did not succeed is left in place, so a failed copy can
# never turn into lost data.
count <- 0
for (folderToMove in source_folders) {
  path_folderToMove <- paste0(locGCTAInput, folderToMove)
  count <- count + 1
  print(paste0("===================================== iteration", count," ===================="))
  print(paste0("path_folderToMove=",path_folderToMove))
  if (!copied[[folderToMove]]) {
    warning("Copy did not succeed; source folder kept: ", path_folderToMove, call. = FALSE)
    next
  }
  # Delete the folders that have been copied with last modification dates
  unlink(path_folderToMove, recursive = TRUE)
}

Find files with matched patterns in their directories using Sys.glob() instead of list.files(). It uses shell globbing for the pattern instead of regular expressions. This function does the same thing as shell realpath

# Get full paths of same-named files summedRiskProfiles.S1-S8_newHeader under 80 different folders.
# Each * expands to any folder name that fits. The argument of Sys.glob() is
# spelled `paths`; writing `path=` only works through partial argument matching.
Sys.glob(paths = paste0(locASCOut, "/uniqSNPs_from_metaDataQCed-Release8*/innerJoinedSNPsByCHRBP_metaDataQCed-Release8*/dosageFam_Release8_*/summedRiskProfiles.S1-S8_newHeader"))

list files under a directory as absolute paths

# The folder to extract full paths of every file
ouputFolderPath <- paste0(locGCTAInput, "phenoGroup3_alcoho-tobacc_GCTA--pheno/")
# Similar to realpath in shell, the full.names=TRUE option gets list.files() to print the file path.
# `pattern` is a regular expression, not a shell wildcard: "pheno_*" would mean
# "pheno followed by zero or more underscores" anywhere in the name. "^pheno_"
# is the regex for "name starts with pheno_".
list.files(path = ouputFolderPath, pattern = "^pheno_", full.names = TRUE)

Extract file name or folder path from a file path

# A file path to manipulate
GWAS_to_use$directory[dir]
"/mnt/lustre/working/lab_nickm/lunC/PRS_UKB_201711/GWASSummaryStatistics/GWAS_GSCAN/noUKBiobank_results/cpd_noUKBiobank.txt"
# Take the file path apart: basename() gives the last component (the file
# name), dirname() gives everything in front of it (the folder path)
GWASFilePath <- GWAS_to_use$directory[dir]
GWASFileName <- basename(GWASFilePath)
GWASFileFolderPath <- dirname(GWASFilePath)

Read files that don't have two patterns in their file names from a folder

input1 <- "D:/Now/library_genetics_epidemiology/GWAS/GCTA/GREML1Var_201711/"

# Keep the entries of the folder whose names contain neither "nil" nor
# "scripts" (the latter excludes the 'scripts' folder). Argument names are
# written in full and TRUE is spelled out, instead of relying on partial
# matching (inv=) and the reassignable shorthand T.
filesWantList <- grep(pattern = "nil|scripts" # 2 patterns to look for
                      ,x = list.files(path = input1)
                      ,invert = TRUE # inverts the match
                      ,value = TRUE # returns the values of the matches (i.e. the filenames) rather than the indices of the matches
                      )

  1. R script file path: /mnt/backedup/home/lunC/scripts/MR_step06-05-01_prepare-input-files-for-MR-PRESSO.R ↩︎

  2. R script file path:
    /mnt/backedup/home/lunC/scripts/MR_ICC_GSCAN_201806/MR_step06-03-07_run_heterogeneity-test.R ↩︎

  3. R script file path: /mnt/backedup/home/lunC/scripts/MR_ICC_GSCAN_201806/MR_step06-05-01_prepare-input-files-for-MR-PRESSO.R ↩︎

  4. R script file path:
    /mnt/backedup/home/lunC/scripts/MR_ICC_GSCAN_201806/MR_step08-02-03_parse-tabulate_LDSC-SNP-heritability_LDSC-genetic-correlations.R ↩︎