###### tags: `R` `grep` `list.files` `basename` `dirname` `regex` `Sys.glob` `file.copy` `unlink` `gsub` `as.numeric` `strsplit` `glob2rx` `Sys.getenv` `dplyr::bind_rows` `dplyr::filter` `dplyr::mutate` `grepl` `names` `stringr::str_replace_all` `stringr::str_extract` `sub` `expression` `eval` # Regular expression, expression, and string manipulation in R ### Extract numbers of varying length from the end of strings ```r! # Get file names of pdf or docx files file.names <- list.files( path=dir.assignment.2 ,pattern=".*\\.pdf$|.*\\.docx$" ,full.names=FALSE) # length(file.names) 21 file.names # [1] "agarslawrence_204628_20026409_DMC_Assignment2_4614834-1.docx" # [2] "andrewstimothy_297293_19936198_DMC Assignment 2 0953.docx" # [3] "ashtonisabel_294366_19979255_DMC_2021S2_A2_6589.pdf" # [4] "beaumontjenna_294376_20033339_DMC_Assignment2_8752.pdf" # [5] "claytonphilipandrew_125548_20002099_DMC_Assignment2_7392-1.pdf" # [6] "eddydani_297300_19967943_DMC_Assignment2_4833.docx" # [7] "edwardsfelicity_281039_19732901_DMC_Assignment2_4127.pdf" # [8] "gannonsean_294373_19967863_DMC_Assignment2_6490.pdf" # [9] "gawjames_295684_19988820_DMC_Assignment2_7389.docx" # [10] "godwinsam_294371_20034982_DMC_Assignment2_unikey_6083_FINAL.pdf" # [11] "guanqi_LATE_296467_20039218_ DMC_Assignment2_0313-1.pdf" # [12] "herbertrob_110692_19622280_DMC_Assignment2_2309.pdf" # [13] "hopkinsnathan_293664_20025212_DMC_Assignment2_8928.pdf" # [14] "huangannette_265602_20028154_DMC_Assignment 2_6822.pdf" # [15] "ilaovince_294377_20026222_DMC_Assignment2_6193.docx" # [16] "lammichael_293038_19961008_DMC_assignment2_0718.docx" # [17] "lamyanyin_294380_19991757_DMC_Assignment2_6098.docx" # [18] "leekenneth_281025_19958430_DMC_Assignment2_8800-2.docx" # [19] "luimatthew_296777_20032458_DMC_Assignment2_0384-1.pdf" # [20] "mcphailsandi_283784_20036309_5572_Assign2_DMC-1.pdf" # [21] "millerjake_278059_20036475_DMC_Assignment2_8847.docx" file.names.1 <- data.frame(file.name=file.names ,stringsAsFactors = 
F) %>% # Remove unwanted strings dplyr::mutate(have= stringr::str_remove_all( string=file.names ,pattern = "_Assign2_DMC-1.pdf|-1.docx|-1.pdf|_FINAL.pdf|-2.docx|.docx|.pdf") # Extract assignment ID as 4 digit numbers at the end of .assignment.ID (\\d{4,7})$ ,want=gsub(x=have, pattern = "(.*)(\\d{4,7})$", replacement = "\\2", perl = TRUE)) %>% #Keep wanted columns dplyr::select(have, want) # Output # have want # 1 agarslawrence_204628_20026409_DMC_Assignment2_4614834 4834 # 2 andrewstimothy_297293_19936198_DMC Assignment 2 0953 0953 # 3 ashtonisabel_294366_19979255_DMC_2021S2_A2_6589 6589 # 4 beaumontjenna_294376_20033339_DMC_Assignment2_8752 8752 # 5 claytonphilipandrew_125548_20002099_DMC_Assignment2_7392 7392 # 6 eddydani_297300_19967943_DMC_Assignment2_4833 4833 # 7 edwardsfelicity_281039_19732901_DMC_Assignment2_4127 4127 # 8 gannonsean_294373_19967863_DMC_Assignment2_6490 6490 # 9 gawjames_295684_19988820_DMC_Assignment2_7389 7389 # 10 godwinsam_294371_20034982_DMC_Assignment2_unikey_6083 6083 # 11 guanqi_LATE_296467_20039218_ DMC_Assignment2_0313 0313 # 12 herbertrob_110692_19622280_DMC_Assignment2_2309 2309 # 13 hopkinsnathan_293664_20025212_DMC_Assignment2_8928 8928 # 14 huangannette_265602_20028154_DMC_Assignment 2_6822 6822 # 15 ilaovince_294377_20026222_DMC_Assignment2_6193 6193 # 16 lammichael_293038_19961008_DMC_assignment2_0718 0718 # 17 lamyanyin_294380_19991757_DMC_Assignment2_6098 6098 # 18 leekenneth_281025_19958430_DMC_Assignment2_8800 8800 # 19 luimatthew_296777_20032458_DMC_Assignment2_0384 0384 # 20 mcphailsandi_283784_20036309_5572 5572 # 21 millerjake_278059_20036475_DMC_Assignment2_8847 8847 ``` --- ### Specify filtration conditions as expression objects and evaluate them in dplyr::filter() ```r! 
# A test data.frame d <- data.frame( x1=runif(10) ,x2=runif(10) ,y1=runif(10) ,y2=runif(10) ,stringsAsFactors = F) # dim(d) 10 4 #------------------------------------ # Subset data without using expression #------------------------------------ # Subset data using dplyr::filter on column positions d.1 <- d %>% dplyr::filter(.[,1]> 0.3 & .[,3]> 0.2) # dim(d.1) 5 4 d.2 <- d %>% dplyr::filter(.[,2]> 0.4 & .[,4]> 0.6) # dim(d.2) 2 4 d.3 <- d %>% dplyr::filter(.[,1]> 0.3 & .[,3]> 0.2 & .[,2]> 0.4 & .[,4]> 0.6) # dim(d.3) 1 4 #------------------------------------ # Subset data using expression #------------------------------------ # Add the filtration conditions above to expression objects expression.1 <- expression(.[,1] > 0.3 & .[,3] > 0.2 ) # class(expression.1) [1] "expression" expression.2 <- expression(.[,2] > 0.4 & .[,4] > 0.6 ) # class(expression.2) [1] "expression" expression.3 <- expression(.[,1] > 0.3 & .[,3] > 0.2 & .[,2] > 0.4 & .[,4] > 0.6 ) # class(expression.3) [1] "expression" # Evaluate the expression and use the filtration conditions to filter data d.1 <- d %>% dplyr::filter(eval(expression.1)) # dim(d.1) 5 4 d.2 <- d %>% dplyr::filter(eval(expression.2)) # dim(d.2) 2 4 d.3 <- d %>% dplyr::filter(eval(expression.3)) # dim(d.3) 1 4 #------------------------------------ # Subset data using expression # Pass variables to expression #------------------------------------ threshold.x.1 <- 0.3 threshold.y.1 <- 0.2 threshold.x.2 <- 0.4 threshold.y.2 <- 0.6 expression.1 <- expression(.[,1] > threshold.x.1 & .[,3] > threshold.y.1 ) # class(expression.1) [1] "expression" expression.2 <- expression(.[,2] > threshold.x.2 & .[,4] > threshold.y.2 ) # class(expression.2) [1] "expression" expression.3 <- expression(.[,1] > threshold.x.1 & .[,3] > threshold.y.1 & .[,2] > threshold.x.2 & .[,4] > threshold.y.2 ) d.1 <- d %>% dplyr::filter(eval(expression.1)) # dim(d.1) 5 4 d.2 <- d %>% dplyr::filter(eval(expression.2)) # dim(d.2) 2 4 d.3 <- d %>% dplyr::filter(eval(expression.3)) # dim(d.3) 1 4 ``` --- ### Examples of 
subsetting numbers from image file names ```r! # Get "[1,1,B]" from the string x1 x1 <- "200416_PD1CD8 HNSCC TMA B_Core[1,1,B]_[50535,21284].im3" gsub( pattern = ".*(\\[[0-9],[0-9],[A-Z]\\]).*" ,replacement = "\\1" # get pattern within the 1st pair of brackets ,x= x1) # [1] "[1,1,B]" # Get "[1,1,B]_[50535,21284]" from the string x1 gsub(pattern = ".*(\\[[0-9],[0-9],[A-Z]\\]_\\[\\d+,\\d+\\]).*" ,replacement= "\\1" # get pattern within the 1st pair of brackets ,x=x1) # [1] "[1,1,B]_[50535,21284]" # Get "200330_[50706,13606]" from the string x2 x2 <- "B16F10 KO 200330_[50706,13606]_CD8_path_view.jpg" gsub(pattern = ".*( )(\\d+_\\[\\d+,\\d+\\]).*" ,replacement= "\\2" # get pattern within the 2nd pair of brackets ,x=x2) # [1] "200330_[50706,13606]" # Get "200330_[50706,13606]" from the string x3 x3 <- "200330_[50706,13606]_CD8_path_view.jpg" gsub(pattern = "(\\d+_\\[\\d+,\\d+\\]).*" ,replacement= "\\1" # get pattern within the 1st pair of brackets ,x=x3) # [1] "200330_[50706,13606]" gsub(pattern = ".*(\\d+_\\[\\d+,\\d+\\]).*" ,replacement= "\\1" # get pattern within the 1st pair of brackets ,x=x3) # [1] "0_[50706,13606]" # Get "200330_[50706,13606]" from the string x2, x3 using user xiangying's approach stringr::str_sub(string=x3 ,stringr::str_locate(string = x3, pattern = "\\d{1,7}_\\[\\d{1,7},\\d{1,7}\\]") ) # [1] "200330_[50706,13606]" stringr::str_sub(string=x2 ,stringr::str_locate(string = x2, pattern = "\\d{1,7}_\\[\\d{1,7},\\d{1,7}\\]") ) # [1] "200330_[50706,13606]" ``` --- ### Quantifiers [Regular Expressions in R](https://rstudio-pubs-static.s3.amazonaws.com/74603_76cd14d5983f47408fdf0b323550b846.html) Quantifiers specify how many repetitions of the pattern. `*`: matches at least 0 times. `+`: matches at least 1 times. `?`: matches at most 1 times. `{n}`: matches exactly n times. `{n,}`: matches at least n times. `{n,m}`: matches between n and m times. 
--- ### Operators [Regular Expressions in R](https://rstudio-pubs-static.s3.amazonaws.com/74603_76cd14d5983f47408fdf0b323550b846.html) .: matches any single character, as shown in the first example. [...]: a character list, matches any one of the characters inside the square brackets. We can also use - inside the brackets to specify a range of characters. [^...]: an inverted character list, similar to [...], but matches any character except those inside the square brackets. \: suppresses the special meaning of metacharacters in regular expressions, i.e. $ * + . ? [ ] ^ { } | ( ) \, similar to its usage in escape sequences. Since \ itself needs to be escaped in R, we need to escape these metacharacters with a double backslash like \\$. |: an “or” operator, matches patterns on either side of the |. (...): grouping in regular expressions. This allows you to retrieve the bits that matched various parts of your regular expression so you can alter them or use them for building up a new string. Each group can then be referred to using \\N, where N is the position of the (...) group counted from the left. This is called a backreference. --- ### Character classes [Regular Expressions in R](https://rstudio-pubs-static.s3.amazonaws.com/74603_76cd14d5983f47408fdf0b323550b846.html) Character classes allow you to – surprise! – specify entire classes of characters, such as numbers, letters, etc. There are two flavors of character classes, one uses [: and :] around a predefined name inside square brackets and the other uses \ and a special character. They are sometimes interchangeable. \d: digits, equivalent to [0-9]. \D: non-digits, equivalent to [^0-9]. \w: word characters, equivalent to `[[:alnum:]_]` or `[A-Za-z0-9_]` (note that the range `A-z` also contains the punctuation characters between `Z` and `a`, so it is not equivalent). \W: not word, equivalent to [^A-Za-z0-9_]. \s: whitespace characters (space, tab, newline, etc.), equivalent to [[:space:]]. \S: non-whitespace characters. [:digit:] or \d: digits, 0 1 2 3 4 5 6 7 8 9, equivalent to [0-9]. [:lower:]: lower-case letters, equivalent to [a-z]. [:upper:]: upper-case letters, equivalent to [A-Z]. 
[:alpha:]: alphabetic characters, equivalent to [[:lower:][:upper:]] or [A-Za-z]. [:alnum:]: alphanumeric characters, equivalent to [[:alpha:][:digit:]] or [A-Za-z0-9]. [:xdigit:]: hexadecimal digits (base 16), 0 1 2 3 4 5 6 7 8 9 A B C D E F a b c d e f, equivalent to [0-9A-Fa-f]. [:blank:]: blank characters, i.e. space and tab. [:space:]: space characters: tab, newline, vertical tab, form feed, carriage return, space. [:punct:]: punctuation characters, e.g., ```r! ! " # $ % & ' ( ) * + , - . / : ; < = > ? @ [ ] ^ _ ` { | } ~. ``` [:graph:]: graphical (human readable) characters: equivalent to [[:alnum:][:punct:]]. [:print:]: printable characters, equivalent to [[:alnum:][:punct:] ], i.e. [:graph:] plus the space character. [:cntrl:]: control characters, like \n or \r, [\x00-\x1F\x7F]. Note: `[:...:]` has to be used inside square brackets, e.g. `[[:digit:]]`. `\` itself is a special character that needs escape, e.g. `\\d`. Do not confuse these regular expressions with R escape sequences such as `\t`. --- ### `gsub`, `sub` --- #### Fix strings that do not follow the same pattern as the other strings ```r! # Create strings with most elements follow a pattern of centreID, disease, 3 digit number, and name string <- c("A lung 001, Tom", "B melanoma 001, John", "C HNSCC 010, Kim", "breast 001 D, Kate") # Find strings that are in the pattern strings.matched.patterns <- grep(pattern = "^[A-C] \\w+ \\d+, \\w+", x=string, value = TRUE) strings.matched.patterns # [1] "A lung 001, Tom" "B melanoma 001, John" "C HNSCC 010, Kim" # Find strings that are not in the pattern strings.not.matched.patterns <- grep(pattern = "^[A-C] \\w+ \\d+, \\w+", x=string, value = TRUE, invert = TRUE) strings.not.matched.patterns # [1] "breast 001 D, Kate" # Split string into parts, geting one part at a time. 
Here the grouping operator () is used only for the part of interest center <- gsub(pattern = "\\w+ \\d+ ([A-Z]), \\w+","\\1",x=strings.not.matched.patterns) center #[1] "D" disease <- gsub(pattern = "(\\w+) \\d+ [A-Z], \\w+","\\1",x=strings.not.matched.patterns) disease #[1] "breast" ID <- gsub(pattern = "\\w+ (\\d+) [A-Z], \\w+","\\1",x=strings.not.matched.patterns) ID #[1] "001" name <- gsub(pattern = "(\\w+) \\d+ [A-Z], (\\w+)","\\2",x=strings.not.matched.patterns) name #[1] "Kate" # Combine parts to a new string new.string <- paste0(center," ", disease," ", ID,", ", name ) new.string #[1] "D breast 001, Kate" # Repeat the steps above with 1 line new.string <- gsub(pattern = "(\\w+) (\\d+) ([A-Z]), (\\w+)", replacement = "\\3 \\1 \\2, \\4",x=strings.not.matched.patterns) new.string #[1] "D breast 001, Kate" # Put matched strings and cleaned string together strings.cleaned <- c(strings.matched.patterns,new.string) strings.cleaned #[1] "A lung 001, Tom" "B melanoma 001, John" "C HNSCC 010, Kim" "D breast 001, Kate" ``` --- #### Replace single back slash with a dash [Replace single backslash in R](https://stackoverflow.com/questions/25424382/replace-single-backslash-in-r) ```r! # A string with \. Note R automatically adds \ to escape \ x <- "A\\B" # Replace \ with - gsub(pattern = "\\\\", replacement="-", x) # "A-B" # A string with / y <- "C/D" # Replace / with - gsub(pattern = "/", replacement = "-", x=y) #[1] "C-D" ``` --- #### Extract numbers from string [Extract only numbers from a string with punctuation and spaces in R? [duplicate]](https://stackoverflow.com/questions/38545848/extract-only-numbers-from-a-string-with-punctuation-and-spaces-in-r) [R Only extract 3 digit numbers from a string](https://stackoverflow.com/questions/45512529/r-only-extract-3-digit-numbers-from-a-string) ```r! 
# (\\S+) matches one or more non-white space # (\\s+) matches one or more white space # that are repeated twice ({2}) followed by one or more numbers captured as a group ((\\d+)) followed by a , and other characters until the end of the string and replace it with the backreference of the second captured group (\\2) df1 <- data.frame(A=c( "XY Z 123, 30009 Addr" ,"AB CBA 12, 900000 Addr" ,"FC AX 1234, 977777 Addr") ,stringsAsFactors = F) df1$A.part1 <- sub("(\\S+\\s+){2}(\\d+),.*", "\\1", df1$A) # [1] "Z " "CBA " "AX " df1$A.part2 <- as.numeric(sub("(\\S+\\s+){2}(\\d+),.*", "\\2", df1$A)) #[1] 123 12 1234 df1$A.part3 <- sub("(\\S+\\s+){2}(\\d+),.*", "\\3", df1$A) # [1] "" "" "" ``` --- #### Search a pattern that starts with "GSCAN" and ends with "S" and 1-digit number between 1 and 8 ```r! # A vector to search a pattern > colnames [1] "FAMID" "ID" "FATHERID" "MOTHERID" [5] "GENDER" "fid" "GSCAN_Q1" "GSCAN_Q2_recode" [9] "GSCAN_Q3_recode" "GSCAN_Q4" "GSCAN_Q5_Drinks_per_week" "GSCAN_Q6_recode" [29] "GSCAN.ai.S1" "GSCAN.ai.S2" "GSCAN.ai.S3" "GSCAN.ai.S4" [33] "GSCAN.ai.S5" "GSCAN.ai.S6" "GSCAN.ai.S7" "GSCAN.ai.S8" # Search a pattern starts with GSCAN, ends with S and 1-8, with 2 dots and anything between grep("^GSCAN.*.S[1-8]$",colnames,value=TRUE) [1] "GSCAN.ai.S1" "GSCAN.ai.S2" "GSCAN.ai.S3" "GSCAN.ai.S4" "GSCAN.ai.S5" "GSCAN.ai.S6" "GSCAN.ai.S7" "GSCAN.ai.S8" ``` #### Search a pattern that ends with ".x". Delete this colname suffix *[remove all delimiters at beginning and end of string](https://stackoverflow.com/questions/40163328/remove-all-delimiters-at-beginning-and-end-of-string) ```r! 
# The data to change colnames names(OR_horiPleio_keep_small) [1] "id.exposure" "id.outcome" "outcome.x" "exposure.x" "method" "nsnp" "b" "se.x" "pval.x" [10] "lo_ci" "up_ci" "or" "or_lci95" "or_uci95" # Remove the suffix ".x". The dot must be escaped: an unescaped "." matches any character, so ".x$" would also strip e.g. the "ax" in "max" gsub(names(OR_horiPleio_keep_small),pattern = "\\.x$",replacement = "") [1] "id.exposure" "id.outcome" "outcome" "exposure" "method" "nsnp" "b" "se" "pval" [10] "lo_ci" "up_ci" "or" "or_lci95" "or_uci95" ``` --- ### R user group meeting # 9 slide content ```r! #---------------------------------------------------------------- # Scenario 1: getting output from linear or logistic regression #---------------------------------------------------------------- hsb2 <- read.csv("https://stats.idre.ucla.edu/stat/data/hsb2.csv") # dim(hsb2) # Run linear regression with categorical predictors linear.model.summary <- summary(lm(write ~ factor(race), data = hsb2)) # Getting coefficients for the predictors coefficients <- linear.model.summary[["coefficients"]] # Convert a matrix to a data.frame keeping its dimnames ## .margins = 1 splits .data by rows ## .fun=c applies c() to each piece coefficients.dataFrame <- plyr::adply( .data= coefficients ,.margins = 1 ,.fun=c) # class(coefficients.dataFrame) "data.frame" coefficients.dataFrame$X1 <- as.character(coefficients.dataFrame$X1) # Rename columns new.column.names <- gsub( x=colnames(coefficients.dataFrame) ,pattern = "X1" ,replacement = "Predictor") colnames(coefficients.dataFrame) <- new.column.names # Remove unwanted string (, factor, ) in a column with gsub() patterns <- "\\(|factor\\(|\\)" temp1 <- coefficients.dataFrame temp1$Predictor <- gsub( x=temp1$Predictor ,pattern=patterns ,replacement="") # Remove unwanted string (, factor, ) in a column with stringr::str_replace_all patterns <- "\\(|factor\\(|\\)" temp2 <- coefficients.dataFrame temp2$Predictor <- stringr::str_replace_all(string = temp2$Predictor ,pattern=patterns ,replacement="") # Predictor Estimate Std. 
Error t value Pr(>|t|) # 1 Intercept 46.458333 1.842243 25.2183502 1.921811e-63 # 2 race2 11.541667 3.286129 3.5122376 5.515272e-04 # 3 race3 1.741667 2.732488 0.6373922 5.246133e-01 # 4 race4 7.596839 1.988870 3.8196768 1.792682e-04 ``` ```r! #---------------------------------------------------------------- # Scenario 2: Subset file paths with a pattern #---------------------------------------------------------------- source.files.path <- "/mnt/lustre/working/lab_nickm/lunC/MR_ICC_GSCAN_201806/two-sample-MR/input/harmonised-data" all.files <- list.files(path=source.files.path) # length(all.files) 383 # Subset TSV files (positive filtering) with list.files() patterns <- "harmonised-data.*\\.tsv$" tsv.files <- list.files(path=source.files.path ,pattern = patterns ,full.names = TRUE) # length(tsv.files) 220 # Subset TSV files with Sys.glob() ## Sys.glob() does wildcard expansion like Unix Bash Shell patterns <- "harmonised-data*\\.tsv" tsv.files <- Sys.glob(file.path(paste0(source.files.path,"/",patterns))) # length(tsv.files) 220 # Subset non tsv files (negative filtering) patterns <- "harmonised-data.*\\.tsv$" non.tsv.files <- grep(x=all.files ,pattern = patterns ,value = TRUE ,invert = TRUE) # length(non.tsv.files) 163 ``` ```r! 
#---------------------------------------------------------------- # Scenario 3: Subset groups #---------------------------------------------------------------- # Create subgroups group.1 <- c("NSW","ACT","VIC","QLD","SA","WA","TAS","NT") # length(group.1) 8 group.2 <- paste0("age",c("4-20","21-40","41-60","61+")) group.3 <- c("males","females","bothSexes") # Create all combinations from the 3 vectors ## data.table::CJ creates a Join data table all.groups.subgroups <- data.table::CJ(group.1, group.2, group.3, sorted = FALSE)[, paste(group.1, group.2, group.3, sep ="_")] # length(all.groups.subgroups) 96 # Subset males males <- grep(x=all.groups.subgroups,pattern = "_males$", value = TRUE) # length(males) 32 # Subset females aged over 61 from eastern states (QLD, NSW, VIC, ACT, TAS) # Specify patterns pattern.1 <- "^NSW|^QLD|^VIC|^ACT|^TAS" pattern.2 <- "_females$" pattern.3 <- "61\\+" # Subset data from Eastern states eastern.states <- grep(x=all.groups.subgroups,pattern = pattern.1, value = TRUE) # length(eastern.states) 60 # Subset data from females in Eastern states eastern.states.females <- grep(x=eastern.states,pattern = pattern.2, value = TRUE) # length(eastern.states.females) 20 # Subset data from females 61+ in Eastern states eastern.states.females.61plus <- grep(x=eastern.states.females,pattern = pattern.3, value = TRUE) # length(eastern.states.females.61plus) 5 # Subset data from females 61+ in Eastern states eastern.states.females.61plus <- grep(x=all.groups.subgroups, pattern = pattern.1, value = TRUE) %>% grep(., pattern = pattern.2, value=T) %>% grep(. , pattern=pattern.3, value=T) # length(eastern.states.females.61plus) 5 ``` ```r! 
#---------------------------------------------------------------- # Summary #---------------------------------------------------------------- gsub(pattern = ) str_replace_all(pattern = ) list.files(pattern=) grep(pattern = ) Sys.glob() ``` #### list.files() sorts your data[^4] [^4]: **R script file path:** /mnt/backedup/home/lunC/scripts/MR_step06-05-01_prepare-input-files-for-MR-PRESSO.R ```r! # Manually list the trait pairs as the order of the destination file paths. Be aware that data have been sorted # Exposure clumping p1 Outcome #------------------------------------------------ # GSCAN drinks per week p1-1e-5 GSCAN CPD # UKB-caffeine p1-1e-5 UKB-CPD # UKB-caffeine p1-5e-8 ICC-CI # GSCAN SI p1-1e-5 UKB-caffeine # GSCAN SI p1-5e-8 UKB-caffeine #------------------------------------------------ ``` #### Get a list of file paths with a pattern[^3] [^3]: **R script file path:** /mnt/backedup/home/lunC/scripts/MR_ICC_GSCAN_201806/MR_step06-03-07_run_heterogeneity-test.R ```r! #--------------------------------------------------- # Get a list of paths of harmonised data files #--------------------------------------------------- # Method 1: use list.files() and glob2rx() ## glob2rx() uses * as a wildcard and expands the string into regular expression for R. Note regex in R is different from regex in Linux ## Here glob2rx() added ^, dot, \\ and $ ### regex in R: (1) dot usually means "match any character" (2) * means The preceding item will be matched zero or more times. 
.* is the same as * in linux pattern.file.names <- glob2rx("harmonised-data_exposure*.tsv") # "^harmonised-data_exposure.*\\.tsv$" file.paths.harmonised.data <- list.files(path=loc.twoSampleMR.harmonised ,pattern = pattern.file.names ,full.names = TRUE) # length(file.paths.harmonised.data) 220 # Method 2: use Sys.glob() # Sys.glob() Expands wildcard on file paths file.paths.harmonised.data <- Sys.glob(path=paste0(loc.twoSampleMR.harmonised,"harmonised-data*\\.tsv")) # length(file.paths.harmonised.data) 220 ``` #### Copy selective files from one directory to another directory[^2] [^2]: R script file path: /mnt/backedup/home/lunC/scripts/MR_ICC_GSCAN_201806/MR_step06-05-01_prepare-input-files-for-MR-PRESSO.R ```r! # Specify patterns in file names to search in file paths. Note file paths do not begin with file name. pattern.1 <- "*dpw-noICC-LDWindow-kb-10000-R2-0\\.01-p1-1e-5-p2-1e-5-linear-BETA-added_outcome-GSCAN-CPD\\.tsv$" pattern.2 <- "*si-noICC-LDWindow-kb-10000-R2-0\\.01-p1-1e-5-p2-1e-5*_outcome-UKB-caffeine\\.tsv$" pattern.3 <- "*UKB-caffeine-LDWindow-kb-10000-R2-0\\.01-p1-1e-5-p2-1e-5*_outcome-UKB-CPD\\.tsv$" pattern.4 <- "*si-noICC-LDWindow-kb-10000-R2-0\\.01-p1-5e-8-p2-1e-6*_outcome-UKB-caffeine\\.tsv$" # Combine multiple patterns as one string patterns <- paste0(pattern.1,"|",pattern.2,"|",pattern.3,"|",pattern.4) # Get paths of the files source.file.paths <- grep(patterns ,list.files(path= loc.harmonised.data ,full.names = T) ,value = TRUE) # length(source.file.paths) 4 # Create destination file paths destin.file.paths <- paste0(loc.MRPRESSO.input,basename(source.file.paths)) # Manually move existing files to the archive folder file.copy(from= source.file.paths ,to= destin.file.paths) ``` --- #### Parse log files generated from Linux software and tabulate data as a single file[^1]. 
[^1]: **R script file path:** /mnt/backedup/home/lunC/scripts/MR_ICC_GSCAN_201806/MR_step08-02-03_parse-tabulate_LDSC-SNP-heritability_LDSC-genetic-correlations.R #### Subset file paths with negative filtration using `dplyr::filter(!grepl("pattern",variable))` * [grepl: Search within a string that does not contain a pattern](https://stackoverflow.com/questions/8898501/grepl-search-within-a-string-that-does-not-contain-a-pattern) ```r! pattern.file.name.h2.log <- glob2rx("SNP-heritability_*-*.log") # "^SNP-heritability_.*-.*\\.log$" h2.log <- data.frame(file.path=list.files(path=loc.LDSC.h2 ,pattern = pattern.file.name.h2.log ,full.names = TRUE) ,stringsAsFactors = F) # dim(h2.log) 11 1 h2.log <- h2.log %>% # Exclude CCPD dplyr::filter(!grepl("CCPD",file.path)) ``` #### Remove special characters. (1) What are special characters? (2) How to search them and then remove them? (3) `glob2rx()` Change wildcard aka globbing patterns into the corresponding regular expressions (regexp). * [Remove all special characters from a string in R?](https://stackoverflow.com/questions/10294284/remove-all-special-characters-from-a-string-in-r) * ```r! clean.string <- "Clean string with words separated by white space" string.with.special.characters <- "Clean string with words separated by (white space)" string.with.special.characters <- "Clean string with words separated by (white space) ~!@#$%^&*(){}_+:\"<>?,./;'[]-=" # This will Work pattern.to.search <- "\\(|\\)|\\~|\\!" gsub(string.with.special.characters, pattern=pattern.to.search, replacement="") ``` #### Drop/delete columns in a data.frame * [drop data frame columns by name](https://stackoverflow.com/questions/4605206/drop-data-frame-columns-by-name) ```r! 
# Drop unwanted columns (drop = FALSE keeps a data.frame even if only one column is left) columns.to.drop <- c("clumping.p1","clumping.p2") exposure.GWAS <- exposure.GWAS[,!(names(exposure.GWAS) %in% columns.to.drop), drop = FALSE] ``` #### Regular expression * `?` means zero or one occurrence of the preceding item * `+` means one or more occurrences of the preceding item * `|` means "or" * [Regex for numbers on scientific notation?](https://stackoverflow.com/questions/4479408/regex-for-numbers-on-scientific-notation?lq=1) ```bash! # A regex for scientific notation -?[\d.]+(?:e-?\d+)? # Explanation -? # an optional - [\d.]+ # a series of digits or dots (see *1) (?: # start non capturing group e # "e" -? # an optional - \d+ # digits )? # end non-capturing group, make optional ``` --- ### Manipulating string in data.frame #### Extract scientific notations from a column. * [How to capture minus sign in scientific notation with regex?](https://stackoverflow.com/questions/30018497/how-to-capture-minus-sign-in-scientific-notation-with-regex) * [Question: Populate data frame column X with substring from column Y using R](https://www.biostars.org/p/268832/) * [Remove part of a string in dataframe column (R)](https://stackoverflow.com/questions/25277117/remove-part-of-a-string-in-dataframe-column-r) ```r! # A column contains scientific notation head(exposure.GWAS$fileNameSpecial,2) [1] "clumped-ai-noICC-LDWindow-kb-10000-R2-0.01-p1-1e-5-p2-1e-5" [2] "clumped-ai-noICC-LDWindow-kb-10000-R2-0.01-p1-5e-8-p2-1e-6" # Extract scientific notations using regex ## Only 2 values to extract for p1: 1e-5, 5e-8 ## Only 2 values to extract for p2: 1e-5, 1e-6 ## Inside [...] a | is a literal character, not "or", so the classes are written as [15], [58] and [56] ## The look-behinds (?<=p1-) and (?<=p2-) anchor each match to its own prefix; without them the p2 pattern could pick up the p1 value library(dplyr) exposure.GWAS <- exposure.GWAS %>% dplyr::mutate(clumping.p1.value= stringr::str_extract(fileNameSpecial,"(?<=p1-)[15]e-[58]") ,clumping.p2.value= stringr::str_extract(fileNameSpecial,"(?<=p2-)1e-[56]")) # dim(exposure.GWAS) 20 13 > head(exposure.GWAS$clumping.p1.value,2) [1] "1e-5" "5e-8" > head(exposure.GWAS$clumping.p2.value,2) [1] "1e-5" "1e-6" ``` #### Delete/remove strings in a column using `stringr::str_replace_all` ```r! 
# Vertically combine two data.frames with identifical column structure # Delete 3 strings in a column (1) "factor(" , (2) ")1", (3) ")2" strings.to.search <- glob2rx("^factor\\(|\\)1$|\\)2$") tem1.tem2 <- dplyr::bind_rows(tem1.small,tem2.small) %>% dplyr::mutate(predictor= stringr::str_replace_all(predictor ,pattern=strings.to.search ,replacement = "")) # dim(tem1.tem2)66 9 ``` #### Delete/remove part of a string in a column using `gsub(data$column,)` ```r! # Two columns to delete "p1-" and "p2-" head(exposure.GWAS$clumping.p1,2) [1] "p1-1e-5" "p1-5e-8" > head(exposure.GWAS$clumping.p2,2) [1] "p2-1e-5" "p2-1e-6" # Delete "p1-" and "p2-" exposure.GWAS$clumping.p1.value <- gsub(exposure.GWAS$clumping.p1, pattern = "p1-", replacement = "") exposure.GWAS$clumping.p2.value <- gsub(exposure.GWAS$clumping.p2, pattern = "p2-", replacement = "") head(exposure.GWAS$clumping.p1.value,2) [1] "1e-5" "5e-8" head(exposure.GWAS$clumping.p2.value,2) [1] "1e-5" "1e-6" ``` --- #### Replace dots with dashes in column names ```r! # A simple regular expression to replace dots with dashes. This might have unintended consequences, so be sure to check the results names(data) <- gsub(x = names(data),pattern = "\\.",replacement = "-") # replace dash with underscore str_replace_all unlist(str_replace_all(inputFileNamePart2,"-","_")) ``` --- #### List directory of multiple files with patterns. Syntax form: `list.files(path=c(folder1,folder2,..),pattern=glob2rx(^patter1*patter2$),full.names=TRUE)` Sys.glob() doesn't seem to take a complex pattern. glob2rx() converts a pattern including a wildcard into the equivalent regular expression. You then need to pass this regular expression onto one of R's pattern matching tools. 
The 5 patterns are specified with `list.files(path=,pattern=glob2rx(p1|p2|p3|p4|p5),full.names=TRUE)` will return full paths of matched files * [R list files with multiple conditions](https://stackoverflow.com/questions/18028225/r-list-files-with-multiple-conditions) * [Question: Is there any possible way to list files from different locations?](https://www.biostars.org/p/116637/) * [Pattern matching using a wildcard](https://stackoverflow.com/questions/5823503/pattern-matching-using-a-wildcard/5823670) ```r! # List directory of files with 1 pattern in their file names from 4 folders filePath.UKB <- list.files(path=c(locUKB.3456.QC4,locUKB.ESDPW.QC4,locUKB.CCPD.QC4,locUKB.20161.QC4) ,pattern = glob2rx("^GWAS_from-clumped-SNPs_GWAS-UKB*_sample-size-added$") ,full.names = T) # length(filePath.UKB) 8 # List directory of files with multiple patterns pattern.GSCAN.files <- "^GWAS_from-clumped-SNPs_ai_noICC_LDWindow-kb-10000_R2-0.01_*|^GWAS_from-clumped-SNPs_cpd_noICC_LDWindow-kb-10000_R2-0.01*|^GWAS_from-clumped-SNPs_dpw_*linear-BETA-added$|GWAS_from-clumped-SNPs_sc_noICC_LDWindow-kb-10000_R2-0.01_*|GWAS_from-clumped-SNPs_si_noICC_LDWindow-kb-10000_R2-0.01_*" filePath.clumped.GWAS.GSCAN <- list.files(path=locGSCAN ,pattern=glob2rx(pattern.GSCAN.files) ,full.names = TRUE) # length(filePath.clumped.GWAS.GSCAN) 10 ``` --- #### Find file names that start with a pattern and end with another pattern ```r! # List files that start with "revised_bolt_imputed", denoted by ^, and end with ".glm.logistic", denoted by $ list.files(path = current_dir,pattern = "^revised_bolt_imputed(.*).bgen.assoc$|(.*).glm.logistic$") ``` #### How to use the strsplit function with a period? When using a regular expression in the split argument of `strsplit()`, you've got to escape the `.` with `\\.`, or use a charclass `[.]`. Otherwise you use . as its special character meaning, "any single character". 
* [How to use the strsplit function with a period](https://stackoverflow.com/questions/26665100/how-to-use-the-strsplit-function-with-a-period) #### To extract numbers from a string, replace non-numeric part with nothing * [Extracting numbers from vectors of strings](https://stackoverflow.com/questions/14543627/extracting-numbers-from-vectors-of-strings) ```r! # Extract 10 from "LDBasedSNPclumping_chr10.clumped" using gsub(arg1,arg2,arg3) ## arg1: string to look for ## arg2: replace arg1 with arg2 string ## arg3: the string to work on CHR=as.numeric(gsub("LDBasedSNPclumping_chr","",unlist(strsplit(fileName,"[.]"))[1])) ``` #### To move a directory in R like `mv` in UNIX, first copy the directory recursively to the destination folder, check if the dates of last modification are preserved, and then delete the folders that have been copied. * [Copy folders from one directory to another in R](https://stackoverflow.com/questions/31655076/copy-folders-from-one-directory-to-another-in-r) ```r! #-------------------------------------------------------------------------------------# #------- Archive old folders keeping date of last modification------------------------# #-------------------------------------------------------------------------------------# destination=paste0(locGCTAInput,"archive") source_folders=c(paste0("phenoGroup2_everDrug1to10-CUD_GCTA--",c("covar","qcovar","pheno")) ,paste0("phenoGroup3_alcoho-tobacc_GCTA--",c("covar","qcovar","pheno")) ,paste0("phenoGroup5_diagMD-diagSU_GCTA--",c("covar","qcovar","pheno")) ) # Move 9 folders to the archive folder count=0 for (folderToMove in source_folders){ path_folderToMove=paste0(locGCTAInput,folderToMove) count=count+1 print(paste0("===================================== iteration", count," ====================")) print(paste0("path_folderToMove=",path_folderToMove)) file.copy(from= path_folderToMove ,to= destination ,recursive = TRUE # TRUE indicates the to= is a directory ,copy.date = TRUE # if TRUE, preserve date 
of last modification of "from" ) } # Go check if date of last modification are preserved in the archive folder # Delete source folder that have been copied to the archive folder count=0 for (folderToMove in source_folders){ path_folderToMove=paste0(locGCTAInput,folderToMove) count=count+1 print(paste0("===================================== iteration", count," ====================")) print(paste0("path_folderToMove=",path_folderToMove)) # Delete a source folder only if its archived copy exists, so that a failed copy never leads to data loss path_archivedFolder=file.path(destination,folderToMove) if (dir.exists(path_archivedFolder)) { unlink(path_folderToMove, recursive=TRUE) } else { warning("Not deleted because no archived copy was found: ", path_folderToMove) } } ``` #### Find files with matched patterns in their directories using Sys.glob() instead of list.files(). It uses shell globbing for the pattern instead of regular expressions. This function does the same thing as shell realpath * [Finding files matching full path regex](http://r.789695.n4.nabble.com/Finding-files-matching-full-path-regex-td4685923.html) ```r! # Get full paths of same-named files summedRiskProfiles.S1-S8_newHeader under 80 different folders Sys.glob(paths=paste0(locASCOut,"/uniqSNPs_from_metaDataQCed-Release8*/innerJoinedSNPsByCHRBP_metaDataQCed-Release8*/dosageFam_Release8_*/summedRiskProfiles.S1-S8_newHeader")) ``` #### list files under a directory as absolute paths * [Working with files and folders in R](https://www.masterdataanalysis.com/r/working-with-files-and-folders-in-r/) ```r! # The folder to extract full paths of every file ouputFolderPath=paste0(locGCTAInput,"phenoGroup3_alcoho-tobacc_GCTA--pheno/") # Similar to realpath in shell, the full.names=TRUE option gets list.files() to print the file path # pattern= is a regular expression, not a shell glob: "^pheno_" keeps file names that start with "pheno_" list.files(path=ouputFolderPath,pattern="^pheno_",full.names = TRUE) ``` #### Extract file name or folder path from a file path * [Manipulate File Paths](https://stat.ethz.ch/R-manual/R-devel/library/base/html/basename.html) ```r! 
# A file path to manipulate GWAS_to_use$directory[dir] "/mnt/lustre/working/lab_nickm/lunC/PRS_UKB_201711/GWASSummaryStatistics/GWAS_GSCAN/noUKBiobank_results/cpd_noUKBiobank.txt" # Extract file name from file path using basename() GWASFileName <- basename(GWAS_to_use$directory[dir]) # Extract folder path from file path using dirname() GWASFileFolderPath <- dirname(GWAS_to_use$directory[dir]) ``` #### List files in a folder whose names contain neither of two patterns ```r! input1 <- "D:/Now/library_genetics_epidemiology/GWAS/GCTA/GREML1Var_201711/" # List files that DON'T have 'nil' in their file names and exclude the 'scripts' folder filesWantList <- grep(list.files(path=input1) ,pattern='nil|scripts' # 2 patterns to look for ,invert=TRUE # inverts the match; spelled out in full rather than relying on partial argument matching (inv=) and the reassignable T ,value=TRUE # returns the values of the matches (i.e. the filenames) rather than the indices of the matches ) ```