Try   HackMD
tags: R list do.call rbind table ifelse is.na subset select merge.data.frame plyr::join_all grep grepl %>% merge group_by filter with which is.na match duplicated foreach lapply as.character as.numeric dplyr::summarise n sum by str_detect wildcards q reshape2::dcast order factor adply filter_at str_locate %in% stack glob2rx na.omit length unite melt spread match anti_join regex mutate format.pval format round function stringr::str_c stringr::str_pad stringr::str_replace_all expand.grid arrayInd which.max which.min as.data.frame.table rowSums colSums apply tidyr::separate as.Date read.csv read.table gmodels::CrossTable plyr::adply plyr::ddply seq_along seq_len rowr::cbind.fill data.table::CJ table(factor) dplyr::case_when dplyr::filter dplyr::left_join dplyr::anti_join dplyr::arrange dplyr::mutate_at dplyr::recode dplyr::rename_at dplyr::rename_ dplyr::select dplyr::select_ data.table::setnames assign setdiff %<>% substr as.data.frame.matrix plyr::join_all as.POSIXct lag tryCatch rank purrr::reduce() dplyr::slice tidyr::pivot_wider summary.default unclass tidyr::fill tidyr::fill_

Data manipulation in R


Replace missing values with non-missing values in groups

Filling missing value in group

have <- data.frame(
  row.number=c(1:6)
  ,ID=c("A","A","A","B","B","B")
  ,v1=c(NA,1,NA, NA, NA, 3)
  ,v2=c(NA,NA,"Yes",NA,"Yes",NA)
  ,stringsAsFactors = F)

# Use column names
want <- have %>% 
  dplyr::group_by(ID) %>%
  tidyr::fill(v1,v2) %>%
  tidyr::fill(v1,v2, .direction="updown")

# Use column position [R: fill down multiple columns](https://stackoverflow.com/questions/37648980/r-fill-down-multiple-columns) want <- have %>% 
  dplyr::group_by(ID) %>%
  tidyr::fill_(names(.)[3:4]) %>%
  tidyr::fill_(names(.)[3:4], .direction="updown")


Get data structure as data.frame

# Get data structure into data.frame
data.structure <- data.frame(
  unclass(summary.default(data))
  ,check.names = FALSE
  ,stringsAsFactors = F
) # class(data.structure) [1] "data.frame" # dim(data.structure) 58 3

Find minimum, maximum in a column

# Find rows with the minimum p.value
data %>% dplyr::slice(which.min(p.value))

# Find rows with the maximum estimate
data %>% dplyr::slice(which.max(p.value))

Find elements of a vector that are unmatched to elements in another vector

a <- c('a','b','c','d','e')
b <- c('a','b','d','e')

# Find elements of a that match elements of b
a[a %in% b]
[1] "a" "b" "d" "e"

# Find elements of a that don't match elements in b
a[!(a %in% b)]
[1] "c"

Handle errors with tryCatch{} This allows iteration that results in erros to be skipped. Without this error handling, the iteration discontinues.

  # Use tryCatch() here as some iterations won't run because of not enough SNPs but we want the loopig continues even there is an error
  tryCatch({
  # R code that could result in errors
  # End tryCatch() function  
  }, error=function(e){cat("ERROR :",conditionMessage(e), "\n")})

Read specific lines/rows of a file with separators \n

# A log file to extract content
LDSC.rG.log.file.path
[1] "/mnt/lustre/working/lab_nickm/lunC/MR_ICC_GSCAN_201806/LD-score-correlation_run-via-HPC-Utility/rG_between_GSCAN-AI_and_ICC-CI/ai_noICC.ambiguousSNPRemoved-Cannabis_ICC_UKB_small_MAF0.99-0.01_cojo.result.rg.log"

# Read all lines of the file into R
LDSC.rG.log.file <- readLines(LDSC.rG.log.file.path) # length(LDSC.rG.log.file) 67
  
# Find row number/line number of a pattern "Heritability of phenotype 1"
## The pattern "Heritability of phenotype 1" is at line 33
## The pattern "Heritability of phenotype 2/2" is at line 41
row.number.h2.trait1.title <- which(grepl("Heritability of phenotype 1",LDSC.rG.log.file)) # 33
row.number.h2.trait2.title <- which(grepl("Heritability of phenotype 2/2",LDSC.rG.log.file)) # 41
## The section to extract is at a length of 5 lines
length.h2.trait1or2 <- 5
  
## Extract the five lines for trait 1
h2.trait1 <- scan(LDSC.rG.log.file.path
                  , what=character()
                  , sep='\n'
                  , skip=row.number.h2.trait1.title+1 # line number to skip before beginning to read data values.
                  , nlines=length.h2.trait1or2 # how many lines to read from the starting line
                  )
# Check content of the extract
h2.trait1
[1] "Total Liability scale h2: 0.0325 (0.0044)"    "Lambda GC: 1.0679"                           
[3] "Mean Chi^2: 1.0748"                           "Intercept: 0.9984 (0.0069)"                  
[5] "Ratio < 0 (usually indicates GC correction)."                  

Read a file containing quotation marks/quotes. Quotes may be unclosed single or double quotes (e.g. I've signed up)

# The data contain quotes. Quotes are set to " and ' by default for read.table. Set the quote to empty character here (quote="") to treat them as ordinary data elements. Otherwise, importatino stops at the first line containing quotes.
reg <- read.table(paste0(dir.table.tennis.score,registrants)
                ,sep="\t"
                ,header = TRUE
                ,stringsAsFactors = F
                ,na.strings = ""
                ,quote = "")

Duplicate/repeat each row for N times

# A data.frame to duplicate rows
dim(reg5) 
[1] 23  2

# Duplicate each row for 3 times
reg6 <- reg5[rep(seq_len(nrow(reg5)),each=3),] # dim(reg6)
dim(reg6) 
69  2

Replace non missing values with a number

# Replace non missing values in the newly merged columns (exposure*) with numeric occurence of 1
## A data set to replace non missing (non NA) with a number of 1 
temp <- left.table

head(temp)
      SNP.IDs GSCAN.LDClumped.ai GSCAN.LDClumped.cpd GSCAN.LDClumped.sc GSCAN.LDClumped.si
1  rs62478582                 ai                <NA>               <NA>               <NA>
2 rs193122850               <NA>                 cpd               <NA>               <NA>
3   rs2118359               <NA>                 cpd               <NA>               <NA>
4    rs215600               <NA>                 cpd               <NA>               <NA>
5   rs2273500               <NA>                 cpd               <NA>               <NA>
6  rs56113850               <NA>                 cpd                 sc               <NA>

# Loop through column 2 to end, replace non missing character values with 1
for (i in 1:ncol(temp)){
  if (i > 1){
    temp[!is.na(temp[,i]),i] <- 1
  }
}

# Check results
head(temp)
      SNP.IDs GSCAN.LDClumped.ai GSCAN.LDClumped.cpd GSCAN.LDClumped.sc GSCAN.LDClumped.si
1  rs62478582                  1                <NA>               <NA>               <NA>
2 rs193122850               <NA>                   1               <NA>               <NA>
3   rs2118359               <NA>                   1               <NA>               <NA>
4    rs215600               <NA>                   1               <NA>               <NA>
5   rs2273500               <NA>                   1               <NA>               <NA>
6  rs56113850               <NA>                   1                  1               <NA>

Assign a name to an object and export it to the global environment

## Assign them to the global environment
  assign(data.name,data,envir = .GlobalEnv)

keep data in original order after subsetting with filter() using dply::arrnge(match(variable,vector-order))

# The data to subset
dim(fix.eff.PRS.manu3.allSexGroups) # [1] 1800   13

# The order of target phenotypes you want the phenotype column to be sorted 
manu3.targ.pheno.predicted.by.SI
[1] "GSCAN_Q2_recode"          "GSCAN_Q1"                 "GSCAN_Q3_recode"          "nicdep4"                 
[5] "ftnd_dep"                 "GSCAN_Q5_Drinks_per_week" "alcdep4"                  "dsmiv_conductdx"         
[9] "aspddx4" 

# Subset target phenotypes predicted 
## Reorder data to keep original ordering of the phenotype column using arrange
manu3.tem.SI <- fix.eff.PRS.manu3.allSexGroups %>% 
  dplyr::filter(phenotype %in% manu3.targ.pheno.predicted.by.SI & name_fixEffect_trait == "si") %>%
  dplyr::arrange(match(phenotype, manu3.targ.pheno.predicted.by.SI)) # dim(manu3.tem.SI) 216  13 (9 target phenotypes predicted by 1 discovery trait * 3 sex groups * 8 p value thresholds)

Multiply multiple columns with a specific columnm, creating new columns for the multiplication

# A dataset to work on
PRS_everDrug1to10_CUD <- read.table(paste0(locPheno,"pheno2drugUseCUD-IDremapped_standardised-IDremapped-PRS-GSCAN.txt")
                                 ,header = T
                                 , sep=" "
                                 , stringsAsFactors = F
                                 , na.strings=c('.','NA'))
# The data structure with wanted columns shown                                 
> str(PRS_everDrug1to10_CUD)
'data.frame':	2463 obs. of  76 variables:
 $ nSEX        : int  1 2 2 1 1 1 1 2 2 1 ...
 $ GSCAN.ai.S2 : num  0.325 -2.008 -0.627 0.366 -0.731 ...
 $ GSCAN.ai.S3 : num  0.99 -0.252 -2.104 -1.147 -1.202 ...
 $ GSCAN.ai.S4 : num  1.505 3.152 -0.597 -1.691 -0.385 ...
 $ GSCAN.ai.S5 : num  0.668 2.307 -2.531 -1.427 -0.707 ...
 $ GSCAN.ai.S6 : num  0.922 1.447 -1.837 -1.418 -1.357 ...
 $ GSCAN.ai.S7 : num  -0.249 0.978 -3.346 -2.074 -2.022 ...
 $ GSCAN.ai.S8 : num  -0.302 1.045 -3.165 -2.128 -1.888 ...
 $ GSCAN.cpd.S1: num  -0.9651 0.799 -0.1756 0.1013 -0.0447 ...
 $ GSCAN.cpd.S2: num  -0.6752 0.8222 0.5144 -0.0802 -0.379 ...
 $ GSCAN.cpd.S3: num  -0.676 0.218 -0.765 -0.141 -0.543 ...
 $ GSCAN.cpd.S4: num  -0.9386 3.2686 -0.577 0.0432 -0.2242 ...
 $ GSCAN.cpd.S5: num  -0.095 4.892 -0.495 0.124 -0.11 ...
 $ GSCAN.cpd.S6: num  0.276 5.415 -0.456 0.039 -0.173 ...
 $ GSCAN.cpd.S7: num  0.2396 4.6077 -0.336 0.3825 -0.0603 ...
 $ GSCAN.cpd.S8: num  0.138 4.398 -0.294 0.308 -0.247 ...
 $ GSCAN.dpw.S1: num  0.118 -0.474 -0.551 -2.615 0.184 ...
 $ GSCAN.dpw.S2: num  0.201 0.875 0.913 -1.487 0.314 ...
 $ GSCAN.dpw.S3: num  -1.583 -0.446 1.746 -0.259 0.44 ...
 $ GSCAN.dpw.S4: num  -0.162 0.621 1.23 -1.076 0.286 ...
 $ GSCAN.dpw.S5: num  1.334 1.184 -0.477 -1.833 0.188 ...
 $ GSCAN.dpw.S6: num  1.535 0.335 -1.644 -1.583 0.319 ...
 $ GSCAN.dpw.S7: num  1.69 0.174 -1.82 -1.467 0.221 ...
 $ GSCAN.dpw.S8: num  1.7488 -0.0555 -2 -1.3446 0.1773 ...
 $ GSCAN.sc.S1 : num  -0.7893 -1.171 -2.5624 -0.077 -0.0439 ...
 $ GSCAN.sc.S2 : num  -0.1717 -1.4473 -2.1254 0.304 -0.0508 ...
 $ GSCAN.sc.S3 : num  -0.858 -1.781 3.678 0.811 0.543 ...
 $ GSCAN.sc.S4 : num  -0.953 0.537 1.975 1.286 0.857 ...
 $ GSCAN.sc.S5 : num  -0.642 1.786 0.866 1.275 1.037 ...
 $ GSCAN.sc.S6 : num  -0.243 2.205 1.777 0.956 0.71 ...
 $ GSCAN.sc.S7 : num  0.166 2.894 1.907 0.728 0.263 ...
 $ GSCAN.sc.S8 : num  0.126 2.791 1.814 0.843 0.334 ...
 $ GSCAN.si.S1 : num  -1.273 -1.833 0.667 0.166 -0.717 ...
 $ GSCAN.si.S2 : num  -1.541 -2.335 0.444 0.276 -0.193 ...
 $ GSCAN.si.S3 : num  -2.082 -2.688 -1.063 -0.764 -0.981 ...
 $ GSCAN.si.S4 : num  -2.4386 -2.2505 -1.3685 -1.0822 -0.0356 ...
 $ GSCAN.si.S5 : num  -1.7702 -2.1189 -3.2785 -1.4399 0.0188 ...
 $ GSCAN.si.S6 : num  -1.622 -3.822 -2.089 -1.53 -0.833 ...
 $ GSCAN.si.S7 : num  -1.09 -3.89 -1.89 -2.13 -1.27 ...
 $ GSCAN.si.S8 : num  -1.11 -4.07 -1.71 -2.12 -1.24 ...
 
# Multiply every column that starts with GSCAN with nSEX, creating 40 new columns for the multiplication
library(dplyr)

# Multiply every column that starts with GSCAN (PRS columns) with nSEX and suffix newly created columns with nSEX
# To change the suffix "_nSEX" to prefix "sex_", use rename_at()
PRS_everDrug1to10_CUD2 <- PRS_everDrug1to10_CUD %>% mutate_at(vars(matches("^GSCAN.*.S[1-8]$")),.funs= funs(nSEX= .*nSEX)) %>%
                            rename_at(vars(contains("_nSEX")),funs(paste("sex", gsub("_nSEX", "", .), sep = "_")))

# Compare an old column with a new column
> head(PRS_everDrug1to10_CUD2$GSCAN.ai.S1)
[1] -0.5150501 -1.1410706 -1.3126117 -1.1851313 -1.1344985 -0.6225980
> head(PRS_everDrug1to10_CUD2$sex_GSCAN.ai.S1)
[1] -0.5150501 -2.2821412 -2.6252234 -1.1851313 -1.1344985 -0.6225980
>        

Deselect multiple columns by their names

# Method 1
dd[ ,!(colnames(dd) %in% c("A", "B"))]
# Method 2: use dplyr::select(-one_of("columns-to-deselect"))
plyr::join_all(data.pheno.manu2.allSexes.list,by=c("ID"),type ="full") %>% 
            dplyr::select(-one_of("ID"))

Rename multiple columns

Rename columns with dplyr::rename_at(vars=, list(~function1, ~function2,...)) or dplyr::rename(new.var.name=old.var.name)

data %>%
  # Rename count variables by changing variable prefix "APExp60.1" to "APExp60.1.A"
  dplyr::rename_at(.vars= vars(starts_with("APExp60.1"))
                   ,list(~ sub(pattern = "^APExp60.1", replacement = "APExp60.1.A", x=.))
  )

Rename columns with data.table::setnames()

## Rename ID and AGEINT column as iid and age for join_all() with the other 2 data sets
age.for.binary.pheno <- data.table::setnames(age.for.binary.pheno
                                             , old=c("ID","AGEINT")
                                             , new=c("iid", "age"))

Column-bind two data.frames that have different row numbers

# Suppose df1 and df2 have different row numbers
df3 <- rowr::cbind.fill(df1, df2, fill = NA)

String manipulation

Create all combinations of two vectors, ordering the result as vector 1 and then vector 2. This cross join is more efficient than apply(expand.grip(x,y)) and custom sorting

# Create all combinations of vector 1 (MZ.twins.DZ.twins) and vector 2 (trait1.trait2). Note here the desired order is MZTwin1*, MZTwin2*, DZTwin1* and then DZTwin2*
vector1 <- c("MZTwin1","MZTwin2","DZTwin1","DZTwin2") # column dimension
vector2 <- c("trait1_0_trait2_0","trait1_1_trait2_0","trait1_0_trait2_1","trait1_1_trait2_1") # row  dimension

all.combinations <- data.table::CJ(vector1, vector2, sorted = FALSE)[, paste(V1, V2, sep ="_")]
> all.combinations
 [1] "MZTwin1_trait1_0_trait2_0" "MZTwin1_trait1_1_trait2_0" "MZTwin1_trait1_0_trait2_1" "MZTwin1_trait1_1_trait2_1"
 [5] "MZTwin2_trait1_0_trait2_0" "MZTwin2_trait1_1_trait2_0" "MZTwin2_trait1_0_trait2_1" "MZTwin2_trait1_1_trait2_1"
 [9] "DZTwin1_trait1_0_trait2_0" "DZTwin1_trait1_1_trait2_0" "DZTwin1_trait1_0_trait2_1" "DZTwin1_trait1_1_trait2_1"
[13] "DZTwin2_trait1_0_trait2_0" "DZTwin2_trait1_1_trait2_0" "DZTwin2_trait1_0_trait2_1" "DZTwin2_trait1_1_trait2_1"

Replace multiple strings with multiple replacements

# A vector of strings to process
> short_colnames_RSquare
# [1] "SI.S1"  "SI.S2"  "SI.S3"  "SI.S4"  "SI.S5"  "SI.S6"  "SI.S7"  "SI.S8"  "AI.S1"  "AI.S2"  "AI.S3"  "AI.S4" 
#[13] "AI.S5"  "AI.S6"  "AI.S7"  "AI.S8"  "CPD.S1" "CPD.S2" "CPD.S3" "CPD.S4" "CPD.S5" "CPD.S6" "CPD.S7" "CPD.S8"
#[25] "SC.S1"  "SC.S2"  "SC.S3"  "SC.S4"  "SC.S5"  "SC.S6"  "SC.S7"  "SC.S8"  "DPW.S1" "DPW.S2" "DPW.S3" "DPW.S4"
#[37] "DPW.S5" "DPW.S6" "DPW.S7" "DPW.S8"

# Replace S1:S8 with p< 5e-08, p< 1e-05, p< 1e-03, p< 1e-02, p< 5e-02, p< 0.1, p< 0.5 and p< 1
short_colnames_RSquare %>% stringr::str_replace_all(c( "S1"="p< 5e-08"
                                                       ,"S2"="p< 1e-05"
                                                       ,"S3"="p< 1e-03"
                                                       ,"S4"="p< 1e-02"
                                                       ,"S5"="p< 5e-02"
                                                       ,"S6"="p< 0.1"
                                                       ,"S7"="p< 0.5"
                                                       ,"S8"="p< 1")
#[1] "SI.p< 5e-08"  "SI.p< 1e-05"  "SI.p< 1e-03"  "SI.p< 1e-02"  "SI.p< 5e-02"  "SI.p< 0.1"    "SI.p< 0.5"   
#[8] "SI.p< 1"      "AI.p< 5e-08"  "AI.p< 1e-05"  "AI.p< 1e-03"  "AI.p< 1e-02"  "AI.p< 5e-02"  "AI.p< 0.1"   
#[15] "AI.p< 0.5"    "AI.p< 1"      "CPD.p< 5e-08" "CPD.p< 1e-05" "CPD.p< 1e-03" "CPD.p< 1e-02" "CPD.p< 5e-02"
#[22] "CPD.p< 0.1"   "CPD.p< 0.5"   "CPD.p< 1"     "SC.p< 5e-08"  "SC.p< 1e-05"  "SC.p< 1e-03"  "SC.p< 1e-02" 
#[29] "SC.p< 5e-02"  "SC.p< 0.1"    "SC.p< 0.5"    "SC.p< 1"      "DPW.p< 5e-08" "DPW.p< 1e-05" "DPW.p< 1e-03"
#[36] "DPW.p< 1e-02" "DPW.p< 5e-02" "DPW.p< 0.1"   "DPW.p< 0.5"   "DPW.p< 1"                                                       

Join multiple strings into a single string. Collapse multiple strings to 1 string.

# Example 1: by paste(strings, collapse)
## vector elements to be joined with a plus sign between them
other.predictors
[1] "all_coffee_cpd"               "merged_pack_years_20161"      "X3436_recodeMean"            
[4] "X3456.0.0"                    "complete_alcohol_unitsweekly"
## Join strings and add + signs
paste0(other.predictors,collapse ="+")
[1] "all_coffee_cpd+merged_pack_years_20161+X3436_recodeMean+X3456.0.0+complete_alcohol_unitsweekly"
# Example 2
covariates <- stringr::str_c("age","sex","ageSq","sexAge","sexAgeSq",sep="+")
> covariates
[1] "age+sex+ageSq+sexAge+sexAgeSq"

Count occurrences of categories in ordinal/binary variables

Suppose a cross table is generated for multiple binary variables (categories=0,1) where some variables don't have a category, say 1. Convert your variable to a factor, and set the categories you wish to include in the result using levels. Values with a count of zero will then also appear in the result:

# Number of cases and controls of trait1 in DZ twin1 and DZ twin2
num.case.ctrl.trait1.DZ <- table(factor(data.pheno.here.DZ[,trait1.name.twin1],levels=0:1)
                                 ,factor(data.pheno.here.DZ[,trait1.name.twin2],levels=0:1))

Produces 2-way cross table

# Generate 2-way cross table of frequencies of 2 variables
gmodels::CrossTable(data.gp7$ID.suffix, data.gp7$ZYGOSITY)
Total Observations in Table:  3290 

                   | data.gp7$ZYGOSITY 
data.gp7$ID.suffix |         1 |         2 |         3 |         4 |         5 |         6 |         9 | Row Total | 
-------------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
                01 |       269 |       214 |       380 |       274 |       264 |       224 |         6 |      1631 | 
                   |     0.293 |     0.805 |     0.027 |     0.002 |     0.011 |     0.996 |     0.031 |           | 
                   |     0.165 |     0.131 |     0.233 |     0.168 |     0.162 |     0.137 |     0.004 |     0.496 | 
                   |     0.512 |     0.527 |     0.492 |     0.495 |     0.493 |     0.464 |     0.462 |           | 
                   |     0.082 |     0.065 |     0.116 |     0.083 |     0.080 |     0.068 |     0.002 |           | 
-------------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
                02 |       230 |       179 |       352 |       255 |       242 |       222 |         4 |      1484 | 
                   |     0.196 |     0.093 |     0.032 |     0.105 |     0.000 |     0.079 |     0.592 |           | 
                   |     0.155 |     0.121 |     0.237 |     0.172 |     0.163 |     0.150 |     0.003 |     0.451 | 
                   |     0.438 |     0.441 |     0.455 |     0.460 |     0.451 |     0.460 |     0.308 |           | 
                   |     0.070 |     0.054 |     0.107 |     0.078 |     0.074 |     0.067 |     0.001 |           | 
-------------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|

Date and time

# Calculate time difference between current Timestamp and previous Timestamp
temp2$time.difference <- with(temp2, Timestamp - lag(Timestamp))

# Display time difference as minutes
temp2$time.diff.minute <- as.numeric(temp2$time.difference, units = 'mins')

Convert datetime from character to datetime type

# Datetime from imported data
head(temp2$Timestamp,2)
[1] "3/13/2019 13:14:35" "3/13/2019 13:15:16"
# Convert character datetime to datetime
temp2$Timestamp <- as.POSIXct(temp$Timestamp, format="%m/%d/%Y %H:%M:%S", tz=Sys.timezone()) 
# Class changed
class(temp2$Timestamp)
[1] "POSIXct" "POSIXt"

To format a date column with inconsistent length in its month (e.g. 10/28/1948 4/10/1960), firstly read the date column as character, split it into 3 columns yyyy, mm, and dd, and combine them as a new date column, and then format the column using as.Date()

# Nicotine dependence and other diagnoses, processed by Gu Zhu, added sex, zygosity and DOB
## Don't specify DOB column as "Date" in colClasses= option. Convert character to date format later
## Examples of DOB: 10/28/1948 4/10/1960... Note the month length is inconsistent 
d6 <- read.csv(paste0(locPheno,"nic_use.csv")
               ,header = T
               ,na.strings = ""
               ,stringsAsFactors = F
               ,colClasses = c("character","character","integer","character","integer","integer","integer","integer","integer","integer","integer","integer","integer"))

## Split DOB column into mm, dd, and yyyy, pad leading zeros to mm and combine them together to make a new date
d6.1 <- d6 %>% tidyr::separate(DOB
                               ,c("DOB.mm","DOB.dd","DOB.yyyy")
                               ,sep= "/"
                               ,remove=FALSE)
## Pad leading zeros to DOB.mm
d6.1$DOB.mm <- stringr::str_pad(d6.1$DOB.mm, width=2,side="left",pad="0")

## Combine DOB.dd, DOB.mm, and DOB.yyyy to make a new date column
d6.1$DOB.yyyymmdd <- with(d6.1,paste(DOB.yyyy,DOB.mm,DOB.dd,sep="-"))

## Format the new date column as class date
d6.1$DOB.yyyymmdd <- as.Date(d6.1$DOB.yyyymmdd,format="%Y-%m-%d" )

## Check the date column
> str(d6.1$DOB.yyyymmdd)
 Date[1:9688], format: "1955-10-11" "1955-10-11" "1948-10-28" "1951-01-01" "1969-05-02" "1950-10-28" "1965-06-09" "1966-12-03" "1966-12-03" ...

search pattern "yyyy-mm-dd"

# A string containing yyyy-mm-dd 
> filePath
#[1] "/mnt/lustre/working/lab_nickm/lunC/MR_ICC_GSCAN_201806/reference/gwas-association-downloaded_2018-08-25-nicotine-dependence.tsv"
# Extract trait name by deleteing yyy-mm-dd, file prefix and file extension
yyyy_mm_dd="\\d{2,4}[.-]\\d{2}[.-]\\d{2,4}"
name1= gsub(basename(filePath),pattern =yyyy_mm_dd,replacement = "" )
name2=gsub(name1,pattern="gwas-association-downloaded_-",replacement = "")
name3=gsub(name2,pattern=".tsv",replacement = "")
> name3
[1] "nicotine-dependence"

Matrix

Reshape data from wide to long format

# Read a csv file (exported from a matrix) into a matrix
## The data.frame must have row.names similar to the rowname dimension of the imported matrix file
m1 <- read.table(paste0(loc.pheno.corr,"phenotypic-corr-matrix-between-GSCAN-PRS-and-illicit-SU-SUDs-QIMR19Up.txt")
                 ,header=TRUE
                 ,row.names=1) # says first column are rownames

m1.matrix <- as.matrix(m1) # dim(m1.matrix) 26 40

m2 <- as.data.frame.table(m1.matrix) # 1040 obs. of  3 variables:

# Split Var2 into 2 columns for reshaping data from long to wide: "discovery.trait" and "p.value.threshold"
m2$Var1 <- as.character(m2$Var1)
m2$Var2 <- as.character(m2$Var2)

library(tidyr)
m3 <- m2 %>% 
        tidyr::separate(Var2, c("discovery.trait","p.value.threshold"),"\\.")

# Reshape data from long to wide, resulting in 1 row per target phenotype, discovery trait and p value threshold
# Reshape long-form data to wide-form data using reshape()
# # v.names = a vector of measure variables
# # idvar = a vector of variables that will form the row dimension of the transposed table
# # timevar= variable whose values are to reshape to wide, forming the column dimension of the transposed table
# # The newly created columns will be "v.names.timevar" (4 measure variables * 5 unique values of name_fixEffect_trait)

measure_variables=c("Freq") # measure variables
m4 <- reshape(m3 
              ,v.names = measure_variables # measure variables
              ,idvar = c("Var1","p.value.threshold")
              ,timevar = "discovery.trait"
              ,direction = "wide"
              ,sep="_") # 208 obs. of  7 variables
# Custom sort data by target phenotype (order defined by the levels=) and then p.value.threshold
## Change Var1 back to factor
m4$Var1 <- factor(m4$Var1,levels=rownames(m1.matrix))

m5 <- m4[with(m4, order(Var1,p.value.threshold)),]

colnames(m5)[1] <- "target.phenotype"

Summarise a matrix

# Summarise the phenotypic correlation matrix
range(as.vector(m1.matrix)) # -0.1374839  0.2099488

# Get maximal value of the matrix
## First, get the index of maximal value. This return row.position and column.position of the value
## which.max(), which.min() determine the location, i.e., index of the (first) minimum or maximum of a numeric (or logical) vector.
m1.matrix.maximum <- arrayInd(which.max(m1.matrix), .dim=dim(m1.matrix))
## Retrieve rowname and column name by the matrix index
rownames(m1.matrix)[m1.matrix.maximum[,1]] # [1] "age at onset of cannabis abuse"
colnames(m1.matrix)[m1.matrix.maximum[,2]] # [1] "DPW.S4"

Export a matix and import it as a data.frame. It's important that the imported data.frame has a rowname that is from the matrix

# A matrix to work on
> str(phenotypic.corr.matrix.pheno.PRS.QIMR19Up)
 num [1:26, 1:40] 0.0416 0.04234 0.03419 0.071 0.00115 ...
 - attr(*, "dimnames")=List of 2
  ..$ : chr [1:26] "ever used cocaine" "ever used amphetamine" "ever used inhalants" "ever used sedatives" ...
  ..$ : chr [1:40] "SI.S1" "SI.S2" "SI.S3" "SI.S4" ...
# Export the matrix, keeping the rownames
write.table(phenotypic.corr.matrix.pheno.PRS.QIMR19Up
            ,file=paste0(loc.pheno.corr,"phenotypic-correlation-between-GSCAN-PRS-and-illicit-SU-SUDs-QIMR19Up.txt"))

# Import the file to see if the matrix structure is unchanged
phenotypic.corr.matrix.pheno.PRS.QIMR19Up2 <- read.table(paste0(loc.pheno.corr,"phenotypic-correlation-between-GSCAN-PRS-and-illicit-SU-SUDs-QIMR19Up.txt")
                                                         ,header=TRUE
                                                         ,row.names=1) # says first column are rownames
# Check the structure
> str(phenotypic.corr.matrix.pheno.PRS.QIMR19Up2)
'data.frame':	26 obs. of  40 variables:
 $ SI.S1 : num  0.0416 0.04234 0.03419 0.071 0.00115 ...
 $ SI.S2 : num  0.0327 0.0555 0.0261 0.0526 0.0202 ...

Pad leading zeros to numeric variables

# A data file to process
> str(d1)
'data.frame':	42583 obs. of  8 variables:
 $ ID                           : int  201 202 203 204 205 208 209 250 251 401 ...
# Fix ID to 7 character in width by padding with leading zeros (e.g. 201 becomes 0000201)
# Convert ID/iid column to character in both files d3 and d4
# Keep ID column in same name for merging purposes: iid
d1$ID <- as.character(d1$ID) 
d1$iid <- stringr::str_pad(d1$ID,width=7, side="left", pad="0")
# Check the newly created iid column
> str(d1)
'data.frame':	42583 obs. of  9 variables:
 $ ID                           : chr  "201" "202" "203" "204" ...
 $ iid                          : chr  "0000201" "0000202" "0000203" "0000204" ...

Set plotting parameters by par(mar(),mpg(),oma())

  • mar() controls the margins, where plot axis titles are printed
  • mpg() sets the axis label locations relative to the edge of the inner plot window. The first value represents the location the labels (i.e. xlab and ylab in plot), the second the tick-mark labels, and third the tick marks. The default is c(3, 1, 0). Increase the label location when your axis label overlaps with axis ticks
    par_margin_outerMargin

Mutating joins and filtering joins

inner_join(df1, df2) #inner
left_join(df1, df2) #left outer
right_join(df1, df2) #right outer
left_join(df2, df1) #alternate right outer
full_join(df1, df2) #full join
semi_join(df1, df2) #keep only observations in df1 that match in df2.
anti_join(df1, df2) #drops all observations in df1 that match in df2.
# Exclude non-white people (file_notWhite) from a joined file (joined)
joined_rm1 <- joined %>% anti_join(file_notWhite) # 438870 obs. of  9 variables

# Exclude related individuals based on ID1 or ID2 from joined_rm1
joined_rm2_ID1rm <- anti_join(joined_rm1,file_related,by=c("IID" = "ID1")) # 361387 obs. of  9 variables
joined_rm2_ID2rm <- anti_join(joined_rm1,file_related,by=c("IID" = "ID2")) # 361443 obs. of  9 variables

Read selective lines of a file into R

*How to extract the first line from a text file?

## Read first line of a file
con <-file(filePath,"r") # Establish a connection to the file
first_line <- readLines(con,n=1) # Read 1st line of the file

Grep the line that starts with keywords from a no column text file

# Example 1 with special characters () ## greb the line that starts with keywords. Specify this pattern as a string by fixed=TRUE. Note when this option is there, you cannot use regular expression. line_want <- grep("Number of indivs with no missing phenotype(s) to use",logFile,value = T,fixed=TRUE) > line_want [1] "Number of indivs with no missing phenotype(s) to use: 276533" ## greb the line that starts with keywords. Specify this pattern using regular expression. Ensure that special characters are escaped by double backslashes \\ line_want <- grep("^Number of indivs with no missing phenotype\\(s\\) to use",logFile,value = T) > line_want [1] "Number of indivs with no missing phenotype(s) to use: 276533" # Example 2 with no special characters ## Suppose line 53-57 a file is ### Genetic Correlation ### ------------------- ### Genetic Correlation: -0.3813 (0.0371) ### Z-score: -10.275 ### P: 9.1379e-25 ## To grep -0.3813 ## Import log files logFile=readLines(inputFilePath) ## Grep the line that starts with keywords "Genetic Correlation:" line_want=grep("^Genetic Correlation:",logFile,value = T) rG=as.numeric(unlist(strsplit(line_want," "))[3])

Count number of non-missing values per row for multiple columns

# Count number of twins per family and genotyped.zygosity by counting number of non-missing ID columns per row
## Subset columns containing values to count non-missing
df.twin.ID.wide.small <- df.twin.ID.wide[,c("ID.01","ID.02","ID.51","ID.04","ID.54","ID.53","ID.50","ID.56","ID.57","ID.52","ID.55")]

## Method1: Calculate number of non-missing per row
df.twin.ID.wide$numb.twins <- rowSums(!is.na(df.twin.ID.wide.small))

## Method2: Calculate number of non-missing per row
apply(X=df.twin.ID.wide.small
      ,MARGIN= 1 # Margin=1 indicates rows, 2 indicates columns
      ,FUN=function(x) length(x[!is.na(x)]))

Count number of non-missing values in a column

# Number of non-missing values in alcoholAgeFirst
length(na.omit(PRS_alcoho_tobacc[,"alcoholAgeFirst"]))

Combine multiple columns into one column

# Method 1: use tydyr::unite().Combine two columns phenotype and subSampleSize into one column named phenotype. This replaces original phenotype. Be aware that data attribute may be changed by tidyr package. 
GCTAOut_part3 <- tidyr::unite(GCTAOut_part3
                       ,phenotype # name of the newly created column
                       , c(phenotype, subSampleSize) # columns to combine
                       , sep = "_" # separator
                       , remove=TRUE # remove input columns
                       ) 
# Method 2: with(data,paste0(column1,"sep",column2,"sep2"))
GCTAOut_part3$phenotype <- with(GCTAOut_part3,paste0(phenotype,"_",subSampleSize))

Search or match one or multiple patterns

Use grep(),glob2rx() and grep(), or match(patterns,object)

glob2rx() changes wildcard or globbing pattern into regular expression
# 1 positive search pattern. Return values using value=TRUE
target_var_group2=grep("alcohol",unique(f5$phenotype), value = TRUE) # 6 variables in red group

# 1 positive search pattern with wildcards
patterns_to_search=glob2rx("SU_DSM*cannabis*")
target_var_group3=grep(pattern_to_search,unique(f5$phenotype), value = TRUE)

# 2 positive patterns: 1 with wildcard
patterns_to_search=glob2rx("GSCAN.*_r|ZYGOSITY")
variables_to_reshape=grep(names(df_twin),pattern=patterns_to_search,value = TRUE)

# 3 positive patterns. None of them have wildcards
three_patterns_to_search=c("A1", "A9", "A6")
match(three_patterns_to_search, myfile$Letter)

Stack multiple columns to a single column

*stacking columns into 1 column in R

# Stack PRS for 7 p value thresholds to a single column
GSCAN.ai <- stack(PRS_pheno_uniqueID[1:7])

Split a column of a data.frame into columns by delimiter.

# Method 1: use tydyr::separate
## The data to work on 
> genetic_corr
  trait_exp h2_liab h2_liab_se     rg    se      z          p
1  GSCAN-SI   0.094      0.003  0.509 0.026 19.429 4.4169e-84
2  GSCAN-AI   0.048      0.003 -0.073 0.044 -1.637 1.0160e-01
3 GSCAN-CPD   0.074      0.010 -0.084 0.040 -2.071 3.8300e-02
4  GSCAN-SC   0.052      0.004 -0.146 0.045 -3.256 1.1000e-03
5 GSCAN-DPW   0.050      0.002  0.379 0.034 11.162 6.2520e-29

## Split trait_exp by delimiter "-" into 2 columns : consortium and trait
genetic_corr2 <- genetic_corr %>% tidyr::separate(trait_exp,c("consortium","trait_exp"),"-")

> genetic_corr2
  consortium trait_exp h2_liab h2_liab_se     rg    se      z          p
1      GSCAN        SI   0.094      0.003  0.509 0.026 19.429 4.4169e-84
2      GSCAN        AI   0.048      0.003 -0.073 0.044 -1.637 1.0160e-01
3      GSCAN       CPD   0.074      0.010 -0.084 0.040 -2.071 3.8300e-02
4      GSCAN        SC   0.052      0.004 -0.146 0.045 -3.256 1.1000e-03
5      GSCAN       DPW   0.050      0.002  0.379 0.034 11.162 6.2520e-29
# Method 2: a complicated one
## A column "name_fix_eff" contains text.A.S1, text.B.S2,...
## Split the column values by dot and add part 2 and part 3 as separate columns
f.all$name_fixEffect_trait <- as.data.frame(do.call("rbind",strsplit(f.all$name_fix_eff,"\\.")),stringsAsFactors =FALSE )[,2]

f.all$name_fixEffect_pThre <- as.data.frame(do.call("rbind",strsplit(f.all$name_fix_eff,"\\.")),stringsAsFactors =FALSE)[,3]

Substring a column by position

# Extract last 2 characters of the ID column
library(magrittr)
tem.uniqueID %<>% mutate(ID.suffix=substr(ID,nchar(ID)-2+1,nchar(ID))) # dim(tem.uniqueID) 2463    5

Subset part of a column by delimiter by splitting the column into multiple new columns using tidyr::separate and deselecting unwated columns

## Give an order of columns in the new data sets. 
### Reorder columns so that "target_pheno_group" appears as 1st column
columns.order <- c("target_pheno_group","phenotype","name_fix_eff","name_fixEffect_trait","name_fixEffect_pThre","fix_eff_esti","SE","pvalue2sided","R2")

# Stack all the phenotypes per manu 2 for all sexes
## Extract 2nd, 3rd part of name_fix_eff columns as two new columns: name_fixEffect_trait, name_fixEffect_pThre
## Reorder columns as the order aforespecified
fix.eff.PRS.manu2.sexPRS.exclu.allSexes <- rbind(fix.eff.PRS.phenoGp2.sexPRS.exclu.allSexes
                                                 ,fix.eff.PRS.phenoGp5.sexPRS.exclu.allSexes) %>% 
  # Extract 2nd and last part of a column split by separator dot, extracting the middle and last part
  tidyr::separate(col=name_fix_eff
                  ,into=c("part1","name_fixEffect_trait","name_fixEffect_pThre")
                  ,remove=FALSE) %>%  
  dplyr::select(-part1) %>% # Deleting parts that are not needed 
  dplyr::select_(.dots=columns.order) # dim(fix.eff.PRS.manu2.sexPRS.exclu.allSexes) 880   9

Find position of first underscore in a string. This allows you to subset part of a string if the string has a delimiter

# Find position of the first _ in a string
str_locate("pheno_SU_cannabis_abuse_onset","_")
     start end
[1,]     6   6

custom sort a character vector, data.frame

Custom sort a character vector by part of the values

# Step1: Create a character vector for custom sorting. Here are all column names of PRS
## Create vectors for ordering characters
discovery.trait.order <- c("si","ai","cpd","sc","dpw")
p.value.thres.order <- paste0("S",c(1:7))

## Create all column names of PRS                             
PRS.columns <- apply(expand.grid(x=paste0("GSCAN.",c("si","ai","cpd","sc","dpw"))
                       ,y=paste0("S",c(1:7)))
                     ,1
                     ,paste
                     ,collapse=".")
> PRS.columns
 [1] "GSCAN.si.S1"  "GSCAN.ai.S1"  "GSCAN.cpd.S1" "GSCAN.sc.S1"  "GSCAN.dpw.S1" "GSCAN.si.S2"  "GSCAN.ai.S2"  "GSCAN.cpd.S2"
 [9] "GSCAN.sc.S2"  "GSCAN.dpw.S2" "GSCAN.si.S3"  "GSCAN.ai.S3"  "GSCAN.cpd.S3" "GSCAN.sc.S3"  "GSCAN.dpw.S3" "GSCAN.si.S4" 
[17] "GSCAN.ai.S4"  "GSCAN.cpd.S4" "GSCAN.sc.S4"  "GSCAN.dpw.S4" "GSCAN.si.S5"  "GSCAN.ai.S5"  "GSCAN.cpd.S5" "GSCAN.sc.S5" 
[25] "GSCAN.dpw.S5" "GSCAN.si.S6"  "GSCAN.ai.S6"  "GSCAN.cpd.S6" "GSCAN.sc.S6"  "GSCAN.dpw.S6" "GSCAN.si.S7"  "GSCAN.ai.S7" 
[33] "GSCAN.cpd.S7" "GSCAN.sc.S7"  "GSCAN.dpw.S7"
# Step2: Convert PRS column vector to data.frame
PRS.columns.df <- data.frame(PRS.column= PRS.columns, stringsAsFactors = F)

# Step3: Split PRS column into 3 multiple columns by delimiter "." 
PRS.columns.df2 <- PRS.columns.df %>% tidyr::separate(PRS.column,c("consortium","discovery.trait","p.value.thres")
                          ,sep="\\."
                          ,remove=F )

# Step4: Convert character columns that will be custom sorted to factor with desired order as their levels
PRS.columns.df2$discovery.trait <- factor(PRS.columns.df2$discovery.trait,levels=discovery.trait.order)

# Step5: Reorder/sort PRS column by two newly created columns
PRS.columns.df2.ordered <- PRS.columns.df2[
  with(PRS.columns.df2, order(discovery.trait,p.value.thres)),
]

# Step6: Use the ordered column as a vector
> PRS.columns.df2.ordered$PRS.column
 [1] "GSCAN.si.S1"  "GSCAN.si.S2"  "GSCAN.si.S3"  "GSCAN.si.S4"  "GSCAN.si.S5"  "GSCAN.si.S6"  "GSCAN.si.S7"  "GSCAN.ai.S1" 
 [9] "GSCAN.ai.S2"  "GSCAN.ai.S3"  "GSCAN.ai.S4"  "GSCAN.ai.S5"  "GSCAN.ai.S6"  "GSCAN.ai.S7"  "GSCAN.cpd.S1" "GSCAN.cpd.S2"
[17] "GSCAN.cpd.S3" "GSCAN.cpd.S4" "GSCAN.cpd.S5" "GSCAN.cpd.S6" "GSCAN.cpd.S7" "GSCAN.sc.S1"  "GSCAN.sc.S2"  "GSCAN.sc.S3" 
[25] "GSCAN.sc.S4"  "GSCAN.sc.S5"  "GSCAN.sc.S6"  "GSCAN.sc.S7"  "GSCAN.dpw.S1" "GSCAN.dpw.S2" "GSCAN.dpw.S3" "GSCAN.dpw.S4"
[33] "GSCAN.dpw.S5" "GSCAN.dpw.S6" "GSCAN.dpw.S7"

Sort a character vector in a user-defined order with order(match(have,want))

*How to sort a character vector according to a specific order?

# A character vector to sort
x <- c("white","white","blue","green","red","blue","red")
# Your preferred order
y <- c("red","white","blue","green")
# Sort the character vector in your preferred order
x[order(match(x, y))]
# [1] "red"   "red"   "white" "white" "blue"  "blue"  "green"

Round numeric values and format p values

# Example 1: round numeric to 3 decimal places
# The data to round
> tmp
  trait_exp h2_liab h2_liab_se     rg  se.x      z        p.x         bxy       se.y         p.y n_snps
1  UKB_CCPD   0.042      0.005  0.037 0.040  0.928 3.5320e-01 -0.16404600 0.03875430 2.30628e-05     36
2   UKB_CPD   0.007      0.002 -0.215 0.088 -2.451 1.4200e-02 -0.01694180 0.00766063 2.69981e-02      5
3 UKB_ESDPW   0.062      0.003  0.383 0.040  9.532 1.5417e-21 -0.00455798 0.00475730 3.38011e-01     41
4  UKB_PYOS   0.052      0.003  0.394 0.035 11.182 5.0067e-29  0.01481260 0.00694742 3.29983e-02     19

# Round numeric values to 3 decimal places. Don't round p values
round_numeric_3decimal <-function(x) round(x,digits = 3)
format_p_values <- function(x) format.pval(x,digits = 2,eps = 0.001,nsmall = 3)  
# Test your functions
round_numeric_3decimal(1.2345)
format_p_values(0.0071)
# Round numeric values and format p values. Here columns are processed in the order we want
tmp <-cbind(tmp[c("trait_exp","h2_liab","h2_liab_se","rg","se.x","z")]
            ,lapply(tmp[c("p.x")],format_p_values)
            ,tmp[c("n_snps")]
            ,lapply(tmp[c("bxy","se.y")],round_numeric_3decimal)
            ,lapply(tmp[c("p.y")],format_p_values)
            )

# Example 2: Format wanted numeric columns to three decimal places
tmp2$b_3decimal <- format(round(tmp2$b,3),nsmall=3)
tmp2$se_3decimal <- format(round(tmp2$se,3),nsmall=3)
tmp2$b_se_formatted <- with(tmp2,paste0(b_3decimal," (",se_3decimal,")"))

# digits : number of digits, but after the 0.0
# eps = the threshold value above wich the. Function will replace the pvalue by "<0.0xxx"
# nsmall = how much tails 0 to keep if digits of original value < to digits defined
tmp2$pval_3decimal <- format.pval(tmp2$pval
                                  ,digits = 2 
                                  ,eps = 0.001  
                                  ,nsmall = 3)

                            # "0.990" "0.200" "0.323" "<0.001" "0.002"

Sort a data.frames based on ascending and descending order of multiple columns

# Example 1
attach(tmp2)

tmp2 <- tmp2[order(exposure,SNP,-method),]

detach(tmp2)

# Example 2
# Order data by ascending outcome substance, outcome, exposure and descending type.analysis. This approach sorts rows by columns in a descending order, those character columns in ascending order are specified with -rank()
t.ordered <- t[order( -rank(t$outcome.substance)
                     ,-rank(t$outcome)
                     ,-rank(t$exposure)
                     ,rank(t$type.analysis)
                     ,decreasing = TRUE),]

Run part of a R file by a range of line number

# The source2 founction 
source2 <- function(file, start, end, ...) {
    file.lines <- scan(file, what=character(), skip=start-1, nlines=end-start+1, sep='\n')
    file.lines.collapsed <- paste(file.lines, collapse='\n')
    source(textConnection(file.lines.collapsed), ...)
}
# Run the function. start and end must cover a complete code chunk
filePathScript= paste0(locScripts,"PRS_UKB_201711_step18-01-01_adjust-significance-threshold-for-multiple-testing-in-a-single-data-set_pheno-group2-5.R")

source2(filePathScript,214,280)

Sort a data.frame by an user-defined order.

# The problem: the phenotype column is ordered as everDrug1, everDrug10, everDrug2... rather than everDrug1, everDrug2... everDrug10
# list phenotype values as you want in the levels= option. The values should match phenotype values in the data to sort
f2$phenotype <- factor(f2$phenotype, levels = paste0("everDrug",c(1:10)))
# Order data by the phenotype column
f2.sorted <- f2[order(f2$phenotype),]

Find the positions of NAs in the matrix using which(is.na(matrix_name),arr.ind=T)

Get positions for NAs only in the “middle” of a matrix column

# Show location of the NAs
which(is.na(f.all.RSquare.wide.matrix), arr.ind = T)
#                         row col
# DSM5 CUD ctrl vs cases  28  16

which(is.na(f.all.pValue.wide.matrix),arr.ind = T)
#                         row col
# DSM5 CUD ctrl vs cases  28  16

## Replace NA with 0 for R2, NA with 0.999 for p values
f.all.RSquare.wide.matrix[is.na(f.all.RSquare.wide.matrix) ] <- 0
f.all.pValue.wide.matrix[is.na(f.all.pValue.wide.matrix)] <- 0.999

Convert a matrix to a data.frame without losing its dimnames. This method writes the matrix's rownames into 1st column and uses its colnames as column names for the data.frame

library(plyr)
# Convert a matrix to a data.frame keeping its dimnames
f.all.percentageExplained.wide.df <- adply(f.all.percentageExplained.wide.matrix, 1, c)

Convert a data.frame to a matrix. The key is to use matrix(as.matrix(DF)). dimnames= specifies the rownames and colnames of the matrix.

matrix(as.matrix(f.all.pValue.wide[,c(2:41)])
                                  ,nrow = nrow(f.all.pValue.wide)
                                  ,ncol=ncol(f.all.pValue.wide[,c(2:41)])
                                  ,dimnames = list(label_phenotypes,short_colnames_pValue))

Reshape data from long to wide

Method 1: use reshape(). Limitation is only 1 timevar can be used. Use method 2 if multiple timevar

# Method 1: use reshape. Limitation is only 1 timevar can be used. Use method 2 if multiple timevar
## v.names = a vector of measure variables
## idvar = a vector of variables that will form the row dimension of the transposed table
## timevar= variable whose values are to reshape to wide, forming the column dimension of the transposed table
## The newly created columns will be "v.names.timevar" (4 measure variables * 5 unique values of name_fixEffect_trait)
measure_variables=c("fix_eff_estimate","fix_eff_SE","pvalue2sided","R2") # multiple measure variables
f_test <- reshape(f.everDrug_AU_CU # data to reshape
                    ,v.names = measure_variables # measure variables
                    ,idvar = c("target_pheno_group","phenotype","name_fixEffect_pThre")
                    ,timevar = "name_fixEffect_trait"
                    ,direction = "wide") 

Method 2: use reshape2::dcast(). Limitation: when there are duplicate variables, dcast() applies its default function length to your data (e.g. when your value.var takes a character column, but the reshaped values will become 1 or 0). To overcome this limitation, you will create another new column that uniquely identifies the combinations of the ~ variables. Function arguments:

  1. value.var = can take only 1 measure variable if packageVersion("reshape2") 1.4.3
  2. On the left of ~ : the transposed data has one row per unique value of "variable"
  3. On the right of ~ : the transposed columns are unique combination of wave and value
  4. drop= : should missing combinations dropped or kept? If variable contains NA, set drop= to TRUE; Otherwise set it to FALSE
# Reshape contingency table data to one row per variable with multiple timevar columns by dcast()
append1_wide <- reshape2::dcast(append1, variable ~ wave+ value , value.var = "Freq", drop = FALSE)

# Avoid dcast() applies fun.aggregate=length to reshaped values sepecifeid by value.var= when the values are character

## Make a data containing duplicates (i.e. variables id and cat, used by ~ formula, don't give unique combinations. See row 6 and 7 of the following dataset df2) 
df2 <- structure(list(id = c("A", "B", "C", "A", "B", "C", "C"), cat = c("SS", 
"SS", "SS", "SV", "SV", "SV", "SV"), val = c(220L, 222L, 223L, 
224L, 225L, 220L, 1L)), .Names = c("id", "cat", "val"), class = "data.frame", row.names = c(NA, 
-7L))
> tail(df2)
  id cat val
2  B  SS 222
3  C  SS 223
4  A  SV 224
5  B  SV 225
6  C  SV 220
7  C  SV   1

library(reshape2)
library(plyr)
# Add a variable for how many times the id*cat combination has occured
## seq_along(cat) is to incrementally counting the occurence of cat. When cat has duplicates, it counts them incrementally (see row 7 of tmp)
tmp <- ddply(df2, .(id, cat), transform, newid = paste(id, seq_along(cat)))
> tail(tmp)
  id cat val newid
2  A  SV 224   A 1
3  B  SS 222   B 1
4  B  SV 225   B 1
5  C  SS 223   C 1
6  C  SV 220   C 1
7  C  SV   1   C 2

## Aggregate using this newid and toss in the id so we don't lose it
out <- dcast(tmp, id + newid ~ cat, value.var = "val")
## Remove newid if we want
out <- out[,-which(colnames(out) == "newid")]
> out
#  id  SS  SV
#1  A 220 224
#2  B 222 225
#3  C 223 220
#4  C  NA   1

Method 3: use reshape2::melt() and tidyr::spread()

## step0: the table to reshape
str(f.everDrug_AU_CU) # 1040 obs. of  8 variables

## step1: Reshape measure variables from wide to long, reshaping them into 2 new columns 
### (1) "variable" contains the variable name of the measure variable and 
### (2) "value" contains the values of the measure variables
### melt(id.vars=non-measure-variables)
df<-f.everDrug_AU_CU %>% reshape2::melt(id.vars=c(1,2,3,4)) # 4160 obs. of  6 variables

## step2: Create a column for the column heading of the transposed table
### This heading column combines the variable to transpose into wide format and the names of the measure variables
df$column_heading<-paste0(df$name_fixEffect_trait,"_",df$variable)

## step3: Remove the two old columns
df$name_fixEffect_trait<-df$variable<-NULL

## step4: Transpose data into wide format
### spread(data, key, value)
#### key: The bare (unquoted) name of the column whose values will be used as column headings
#### value: The bare (unquoted) name of the column whose values will populate the cells
dd <- df %>%  spread(column_heading,value)

## step5: Check the tranposed table
str(dd) # Classes ‘data.table’ and 'data.frame':	208 obs. of  23 variables

Method 3: reshape data when identifier columns containing duplicates using tidyr::spread() The trick is to create a dummy column containing unique number representing rows of the reshaped data using gather() unite() group_by and mutate() and then reshape data using the dummy column as the key column, and eventually drop this dummy column

# A long-format data to reshape to one row per month
jj <- data.frame(month=rep(1:3,4),
             student=rep(c("Amy", "Bob"), each=6),
             A=c(9, 7, 6, 8, 6, 9, 3, 2, 1, 5, 6, 5),
             B=c(6, 7, 8, 5, 6, 7, 5, 4, 6, 3, 1, 5)) # dim(jj) 12  4
# Data structure of this data
## Non-measurement variables: month, student
## Measurement variables: A, B               
str(jj)
'data.frame':	12 obs. of  4 variables:
 $ month  : int  1 2 3 1 2 3 1 2 3 1 ...
 $ student: Factor w/ 2 levels "Amy","Bob": 1 1 1 1 1 1 2 2 2 2 ...
 $ A      : num  9 7 6 8 6 9 3 2 1 5 ...
 $ B      : num  6 7 8 5 6 7 5 4 6 3 ...
 
# Reshape data to wide-format
## gather() : Collapse measurement variables into two new columns variable and value. Simply add all the non-measurement variables within -(var1:varN). To use the range you must order columns so that columns are non-measurement then measurement
## unite(): Combine non-measurement variables that don't form the row dimension of the reshaped data to a temporary variable 
              
jj.wide <- jj %>% 
  gather(key="variable", value="value", -(month:student)) %>% 
  unite(temp, student, variable) %>% 
  group_by(temp) %>% 
  mutate(id=1:n()) %>% 
  spread(temp, value) %>% 
  select(-id) # dim(jj.wide) 6 5

# Data structure of the newly reshaped data
## The row structure is 1 row per month which contains duplicates  
str(jj.wide)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame':	6 obs. of  5 variables:
 $ month: int  1 1 2 2 3 3
 $ Amy_A: num  9 8 7 6 6 9
 $ Amy_B: num  6 5 7 6 8 7
 $ Bob_A: num  3 5 2 6 1 5
 $ Bob_B: num  5 3 4 1 6 5

Method 4: reshape data from long to wide using

# Spread the levels of Tissue.Category (1 column) to 1 column per level (3 columns), resulting in three sets of columns (Tumor:,Stroma:, total: )
summary.Exp26.wide <- tidyr::pivot_wider( data=summary.Exp26
                                         ,id_cols = c(1) # Columns that form the row dimension
                                         ,names_from = c(2) # Columns to pivot wider
                                         ,names_sep = "_"
                                         # Measurement variables, the data portion
                                         ,values_from = c(3:6))
# Spread Duplicate column, resulting in two sets of columns (:RBWH_A , :RBWH_B)
Exp08.survival.markers.wide <- tidyr::pivot_wider( data=Exp08.survival.markers
                                                   ,id_cols = c(shortID, coreID) 
                                                   ,names_from = Duplicate # Columns to collapse into column dimension
                                                   ,names_sep = "_"
                                                   ,values_from = c(6,7,9:29)) # Measurement variables (data portion)

Conditionally create a new column with mutate(new.var=case_when) Values of new variable are specified on the right of ~

# Example 1: Conditionally create a new column method.abbreviated with values "Egger", "WM" and "IVW". Note the use of `TRUE ~ as.character(method)`
append_MR_files_selective_methods <- append_MR_files %>% 
  dplyr::filter(method %in% c("MR Egger","Weighted median","Inverse variance weighted")) %>%
  dplyr::mutate(method.abbreviated= case_when(method=="MR Egger" ~ "Egger"
                                              ,method=="Weighted median" ~ "WM"
                                              ,method=="Inverse variance weighted" ~ "IVW"
                                              ,TRUE ~ as.character(method)))
# Example 1 that won't work
append_MR_files_selective_methods <- append_MR_files %>% 
  dplyr::filter(method %in% c("MR Egger","Weighted median","Inverse variance weighted")) %>%
  dplyr::mutate(method.abbreviated= case_when(method=="MR Egger" ~ "Egger"
                                              ,method=="Weighted median" ~ "WM"
                                              ,method=="Inverse variance weighted" ~ "IVW"))

# Example 2: Conditionally create V5 based on the values of V1, V2,V3 and V4. V5 values are 1,2 or 0
library(dplyr)
myfile %>% 
       mutate(V5 = case_when(V1 == 1 & V2 != 4 ~ 1, 
                             V2 == 4 & V3 != 1 ~ 2,
                             TRUE ~ 0))
# Example 3 working: recode values <=0 as numeric NA, else as original numeric values.
df <- df %>% mutate(new = case_when(old == 1 ~ 5,
                                    old == 2 ~ NA_real_,
                                    TRUE ~ as.numeric(old)))                             

Conditionally filter rows based on values of a column.

*onditional filter of grouped factors - dplyr

# Condiitonally subset rows based on the value of the column pval.y (p value for horizontal pleiotropy)
## if pval.y < 0.05, keep rows with method= "MR Egger" or "Weighted median" (i.e. delete rows with method !%in% c("MR Egger","Weighted median") when pval.y >= 0.05)
## Do nothing when pval.y >= 0.05
## The condition to check is by a new variable "check"
OR_horiPleio_keep <-  OR_horiPleio %>% 
                        group_by(id.exposure,id.outcome) %>% 
                          mutate(check= pval.y <0.05) %>% 
                            filter(!(check==TRUE & method %in% c("MR Egger","Weighted median"))) 

Negatively filter rows based on a pattern in a column

# Example 1
# Exclude 2 rows with SNPs rs2472297 and rs4410790. Note that ! is used in the beginning of the condition
library("dplyr",lib.loc="/software/R/R-3.4.1/lib64/R/library")
subset(harmonised.UKBCCPD.ICCCI.5e8, !SNP %in% c("rs2472297","rs4410790")) # dim(harmonised.UKBCCPD.ICCCI.5e8.rm2SNPs)
harmonised.UKBCCPD.ICCCI.5e8 %>% dplyr::filter(!SNP %in% c("rs2472297","rs4410790"))
# Example 2
library(dplyr)
library(tidyr)
# Excluding values in phenotype containing "onset"
f5 <- read.table(file = paste0(input_pheno5,input_file5),header = T, sep=",", stringsAsFactors = F) %>% 
        filter(!grepl("onset",phenotype))

Negatively filter columns with column names ending with a pattern

*How to drop columns by name pattern in R?

# Drop column names that end with "S8"
PRS_pheno_gp2=PRS_pheno_gp2[,-grep("S8", colnames(PRS_pheno_gp2))]

Reorder columns in a flexible way using setdiff()

# Vertically combine all 3 sex groups as 1 data set
temp <- rbind(get(paste0(prefix.out.file.name.manu2,".allSexes"))
                   ,get(paste0(prefix.out.file.name.manu2,".males"))
                   ,get(paste0(prefix.out.file.name.manu2,".females")))

# Recorder columns and sort rows for tables in SAS
## Put these columns to first in a flexible way by setdiff()
columns.to.first <- c("phenotype","phenotype.order.num","var.label","sex.group")
temp <- temp[,c(columns.to.first, setdiff(names(temp),columns.to.first))]

Select columns and order them using dplyr::select_().

library(dplyr)
library(tidyr)
# Example 1
# Reorder columns
columns_want_in_this_order=c("FID","IID","missing","missing","batch","kinship","exclude_kinship","excess_relative","age","sex","white.British","X3456.0.0","X3456.1.0","X3456.2.0","X3456.0.0_recode","X3456.1.0_recode","X3456.2.0_recode","X3456_mean")
pheno_ukb3456_NA20453_ordered <- pheno_ukb3456_NA20453 %>% select_(.dots=columns_want_in_this_order)

# Example 2
columns_to_select=c("phenotype","covarPheno","covariate","fix_eff","pvalue2sided","R2")
f2 <- read.table(file = paste0(input_pheno2,input_file2),header = T, sep=",", stringsAsFactors = F) %>% 
        filter(str_detect(covarPheno,"GSCAN")) %>%  # rows reduced from 15800 to 8000
        select_(.dots=columns_to_select)

Subset rows using dplyr::filter()

Use multiple patterns with filter(str_detect(variable,"A|B|C")). str_detect() accepts length-1 pattern
# Subset rows from phenotype is "tobacco" or tobaccoFreq and rowNames is GSCAN.si.*
f3 %>% 
    filter(str_detect(phenotype, "tobaccoEver|tobaccoFreq")) %>% 
    filter(str_detect(rowNames,"GSCAN.si"))
Searching a particular string, an equivalent to using wildcard
# load packages
library(dplyr)
library(stringr)
# Import CSV data files
## Subset rows from covarPheno==GSCAN.*
f2=read.table(file = paste0(input_pheno2,input_file2),header = T, sep=",", stringsAsFactors = F) %>% filter(str_detect(covarPheno,"GSCAN")) # rows reduced from 15800 to 8000 
Positive filtration with a pattern containing special characters that need to escape: dot
## Subset ID (column 2) and PRS columns (names with prefix GSCAN.)
PRS_pheno_gp4= subset(data_gp4, select = c(2, grep("GSCAN\\.", names(data_gp4)))) 
Positive filtration with a search pattern of multiple strings
library(dplyr)
target <- c("Tom", "Lynn")
filter(dat, name %in% target)  # equivalently, dat %>% filter(name %in% target)
Positive filtration with logical operators
# Method 1
by(file$PVALUE<5e-8,file$CHROM,sum )
# Method 2: %>% filter() %>% group_by() %>% summarise(newColumnName=n())
## Be aware of conflict in same-named function summarise() of package dplyr and plyr
z2 <- file %>% 
          filter(PVALUE<5e-8) %>% 
              group_by(CHROM) %>% # multiple group columns
                  dplyr::summarise(numSNPMetPThres= n()) 
# Example: Multiple functions in the summarise()
testData <- read.table(file = "clipboard",header = TRUE)
require(dplyr)
testData %>%
  group_by(Y) %>%
      summarise(total = sum(income),freq = n()) %>%
          filter(freq > 3)

Subset part of a string by deleting the part that we don't want

# Strings to subset "GSCAN_ai_S1_r_01" "GSCAN_ai_S1_r_02" # Delete the ending part to get the part we want string=c("GSCAN_ai_S1_r_01","GSCAN_ai_S1_r_02") gsub(string,pattern="_01|_02",replacement="") #[1] "GSCAN_ai_S1_r" "GSCAN_ai_S1_r"

Sum up a column with multiple group_by columns

# A data.frame has five columns
str(z_df)
'data.frame':	880 obs. of  5 variables:
 $ metaData     : chr  "HRCr1.1" "HRCr1.1" "HRCr1.1" "HRCr1.1" ...
 $ trait        : chr  "ai" "ai" "ai" "ai" ...
 $ CHR          : num  10 10 10 10 10 10 10 10 11 11 ...
 $ pThreshold   : num  5e-08 1e-05 1e-03 1e-02 5e-02 1e-01 5e-01 1e+00 5e-08 1e-05 ...
 $ num_SNP_met_p: num  0 4 109 684 2557 ...
# Sum up num_SNP_met_p across 22 autosomes
library(dplyr)
z2 <- z_df %>% 
        group_by(metaData,trait,pThreshold) %>% # multiple group columns
        summarise(sum= sum(num_SNP_met_p))  # multiple summary columns

Convert factor columns to character or other type lapply()

# Factor columns to numeric 
## Example 1
> str(GSMR)
'data.frame':	4 obs. of  5 variables:
 $ Exposure: Factor w/ 5 levels "UKB_CCPD","UKB_CPD",..: 2 1 3 4
 $ n_snps  : Factor w/ 5 levels "19","36","41",..: 4 2 3 1
 $ bxy     : Factor w/ 5 levels "-0.00455798",..: 2 3 1 4
 $ se      : Factor w/ 5 levels "0.0047573","0.00694742",..: 3 4 1 2
 $ p       : Factor w/ 5 levels "0.0269981","0.0329983",..: 1 4 3 2
## Convert column 2 to 5 to numeric 
GSMR[,c(2:5)] <- lapply(GSMR[,c(2:5)], function(x) as.numeric(levels(x))[x])
> str(GSMR)
'data.frame':	4 obs. of  5 variables:
 $ Exposure: Factor w/ 5 levels "UKB_CCPD","UKB_CPD",..: 2 1 3 4
 $ n_snps  : num  5 36 41 19
 $ bxy     : num  -0.01694 -0.16405 -0.00456 0.01481
 $ se      : num  0.00766 0.03875 0.00476 0.00695
 $ p       : num  2.70e-02 2.31e-05 3.38e-01 3.30e-02
 
## Example 2: (not always working) Convert column 3 to 5 from character to numeric
z_df[,3:5]<-lapply(z_df[,3:5],as.numeric)

#Convert column 3 to 5 from factor to character
data[,1:3]<-lapply(data[,1:3],as.character)

# Convert all columns to character
data[] <-lapply(data,as.character)

# Convert all factor columns to character
datHar_CI_UKBCCPD[] <- lapply(datHar_CI_UKBCCPD, function(x) if(is.factor(x)) as.character(x) else x)

Convert multiple POSIXct columns to Date

# Read an Excel file 
dim(.BRAF.20181218) 183 49

# Get column names of all date columns ignoring case
BRAF.20181218.date.colnames <- grep(pattern="date", x=colnames(.BRAF.20181218), ignore.case = TRUE, value = TRUE)

# Assign data types to columns
BRAF.20181218.data.types <- ifelse(colnames(.BRAF.20181218) %in% BRAF.20181218.date.colnames, "date", "text")

# Read the excel file using the data types
BRAF.20181218 <- readxl::read_excel( path = filepath.source.clinical.data.BRAF.20181218
                                     ,sheet = "PRE_clindata"
                                     ,na=c(" ")
                                     ,n_max = 184
                                     ,guess_max =184
                                     ,col_types = BRAF.20181218.data.types) # dim(.BRAF.20181218) 183 49

# Change POSIXct to Date
BRAF.20181218[,BRAF.20181218.date.colnames] <- lapply(BRAF.20181218[, BRAF.20181218.date.colnames]
                                                      , function(x) as.Date(x, origin="1899-12-30"))
       

An example of using nested foreach loops. foreach() loops are used for their return value, like lapply. In this way they are very different from for loops which are used for their side effects. By using the appropriate .combine functions, the inner foreach loop can return vectors which are combined row-wise into a matrix by the outer foreach loop. Notice that packages used in the inner foreach loop need to be called in the outer foreach loop

# The following matrix x has 10*5 rows
x <- foreach(i=1:10, .combine='rbind', .packages=c("foreach","dplyr")) %dopar% {
        foreach(j=1:5, .combine='rbind') %do% { }}

# The following matrix x has 10 rows. The result from the inner foreach loop is put together horizontally, probably not what you want
x <- foreach(i=1:10, .combine='rbind', .packages="foreach") %dopar% {
        foreach(j=1:5, .combine='c') %do% { }}        
Subsetting using subset()
# this subset 40 rows, which is correct.
significant <- subset(data, phenotype %in% c("everDrug1","everDrug3","everDrug6","everDrug7","everDrug8") & covarPheno=="GSCAN.si", select=c("phenotype","covariate","pvalue2sided","R2") ) 

# This subset just 8 rows. But where is wrong?
#significant <- data[data$phenotype==c("everDrug1","everDrug3","everDrug6","everDrug7","everDrug8") & data$covarPheno=="GSCAN.si",c("phenotype","covariate","pvalue2sided","R2")] 

Calculation in a data.frame

*Creating a new column to a data frame using a formula from another variable

# Perform calculation based on multiple columns
## syntax: data$newVar <- with(data, expression)
leftOuterJoin$twoPQBeta2 <- with(leftOuterJoin, 2 * ALT_FREQS * (1-ALT_FREQS) * beta * beta)

Remove duplicated rows based on specific columns

*removing duplicate units from data frame

# rbind PRS per group 2 and 5. 
PRS_pheno <- rbind(PRS_pheno_gp2,PRS_pheno_gp5) # 4790 obs, 41 variables

## Remove duplicate rows based on the first column "ID"
PRS_pheno_uniqueID <-PRS_pheno[!duplicated(PRS_pheno[,1]),] # 2463 obs

Remove rows with column ID that occurs more than once and then remove these rows using match()

# Make a data set
dat <- data.frame( ID = c("A","A","B","B","C","D"), val = 1:6 )
> dat
ID val
1  A   1
2  A   2
3  B   3
4  B   4
5  C   5
6  D   6
# Remove rows with column ID that occurs more than once
dat[which(is.na(match(dat$ID, dat$ID[duplicated(dat$ID)]))),]
ID val
5  C   5
6  D   6

Subset/Remove rows with column ID that occurs more than once and then remove these rows using filter()

# File  rows    Explantion
#-----------------------------
# left  594186  raw data
# left1 594046  left with non rs number ID removed
# left2 677     Number of rows with ID occurs more than once in left1
# left3 593369  remove left2 from left1
#-----------------------------

left=read.table(file=paste0(locGWAS_UKB20116,fileNameStart,".X20116_recodeFinal.glm.logistic")
                ,header = TRUE
                ,sep="\t"
                ,stringsAsFactors=F
                ,comment.char = "!") %>% select(c(X.CHROM,POS,ID,ALT,OR,P)) 

# Remove non-rs number variants
left1=left[grep("^rs",left$ID),]

#subset all rows that have a value in column ID occur more than once in the dataset.
library(dplyr)
left2 <- left1 %>% group_by(ID) %>% filter(n()>1)

# Remove all occurrences of duplicates
left3 <- left1 %>% group_by(ID) %>% filter(n()==1)

Left outer join in R, with merging key columns are same-named or different-named. Prefer dplyr::left_join() to merge() because it keeps the order of merging key of the joined file the same as the left table

library(dplyr)
library(magrittr)

# testing on UKB 20116, chr1
# Import data files
fileNameStart="X20116_recodeFinal-HRC.chr1.plink2.output"
# Note #CHROM causes 1st line to be considered as a comment

# File          rows    Explantion
#-----------------------------------------------------------------------------
# left          594186  raw data 1
# left1         594046  left with non rs number ID removed
# left2         677     Number of rows with ID occurs more than once in left1
# left3         593369  remove left2 from left1
# right         3074089 raw data 2
# right1        2845136 right with non rs number ID removed
# right2        21704   Number of rows with ID occurs more than once in right1
# right3        2823432 remove right2 from right1
# leftOuterJoin 593369  left outer join left3 and right3
#-----------------------------------------------------------------------------
left=read.table(file=paste0(locGWAS_UKB20116,fileNameStart,".X20116_recodeFinal.glm.logistic")
                ,header = TRUE
                ,sep="\t"
                ,stringsAsFactors=F
                ,comment.char = "!") %>% select(c(X.CHROM,POS,ID,ALT,OR,P)) 

# Remove non-rs number variants
left1=left[grep("^rs",left$ID),]

#subset all rows that have a value in column ID occur more than once in the dataset.
library(dplyr)
left2 <- left1 %>% group_by(ID) %>% filter(n()>1)

# Remove all occurrences of duplicates
left3 <- left1 %>% group_by(ID) %>% filter(n()==1)

# Note #CHROM causes 1st line to be considered as a comment
right=read.table(file=paste0(locGWAS_UKB20116,fileNameStart,".afreq")
                 ,header = TRUE
                 ,sep="\t"
                 ,stringsAsFactors=F
                 ,comment.char = "!") %>% select(c(ID,ALT,ALT_FREQS)) 

# Remove non-rs number variants
right1=right[grep("^rs",right$ID),]

#subset all rows that have a value in column ID occur more than once in the dataset.
right2= right1 %>% group_by(ID) %>% filter(n()>1)

# Remove all occurrences of duplicates
right3= right1 %>% group_by(ID) %>% filter(n()==1)

# Left join left3 (left table) and right3 (right table) with merging key=ID (rs number)
leftOuterJoin=merge(x= left3, y=right3, by= "ID", all.x=TRUE)
# LeftJoin plot data (allSuplotData, left table) and plot colors (subplotInfoSource, right table)
## keep columns of right table to those wanted
subplotInfoSource_small=subplotInfoSource[,c("PRSPheno","color","color_pale")]

leftJoin=merge(allSuplotData
               ,subplotInfoSource_small
               ,by.x = "covarPheno"
               ,by.y = "PRSPheno"
               ,all.x=TRUE)

Subset values that begin with rs in a data.frame

*Using grep to help subset a data frame in R

library(dplyr)
library(magrittr)

# Import data files
fileNameStart="X20116_recodeFinal-HRC.chr1.plink2.output"
# Note #CHROM causes 1st line to be considered as a comment
# N=594186
left=read.table(file=paste0(locGWAS_UKB20116,fileNameStart,".X20116_recodeFinal.glm.logistic")
                ,header = TRUE
                ,sep="\t"
                ,stringsAsFactors=F
                ,comment.char = "!") %>% select(c(X.CHROM,POS,ID,ALT,OR,P)) 

# Remove non-rs number variants (i.e. subset ID that start with 'rs')
## N= 594046
left1=left[grep("^rs",left$ID),]

sort a data.frame by multiple columns

*How to Sort a Data Frame by Multiple Columns in R

# Sort the data.frame base by column group_keyword1, group_keyword2, group_keyword3, and group_keyword4
## the with() generates the index order. This effectively creates a new environment using the passed in data in the first argument along with an expression for evaluating that data in the second argument.

base[
  with(base, order(group_keyword1,group_keyword2,group_keyword3,group_keyword4)),
  ]

Subset rows where one or multiple columns are not missing

# Suppose we are looping through multiple columns in the data set. The first column is everDrug1 
variable="everDrug1"
variables_to_select=c(variable,"age","nSEX")

# Based on 1 column, Method 1: use subset() function
data_age_sex= subset(data,(!is.na(data[,variable])),select=variables_to_select)

# Based on 1 column, Method 2: use dplyr::filter
data_age_sex <- data %>% 
                  filter_at(vars(variable),any_vars(!is.na(.)))
# Based on 2 columns
Q2 <- subset(PRS_GSCAN_pheno,(!is.na(PRS_GSCAN_pheno[,"GSCAN_Q2_recode"])) & (!is.na(PRS_GSCAN_pheno[,"age"]))) # 12441 obs. of  68 variables

Merge, join multiple data.frame simultaneously using list(df1,df2,df3...) %>% reduce(left_join, by = c("key1","key2"..))

# Left join data set across all 3 sex groups, each has 72 rows, 7 columns
library(tidyverse)
rP.GSCAN.PRS.targ.pheno.gp2.wide <- list(rP.GSCAN.PRS.targ.pheno.gp2.wide.allSexes
                                         ,rP.GSCAN.PRS.targ.pheno.gp2.wide.females
                                         ,rP.GSCAN.PRS.targ.pheno.gp2.wide.males) %>% reduce(left_join, by = c("Var1","p.value.threshold")) # dim(rP.GSCAN.PRS.targ.pheno.gp2.wide) 72 17

Merging a list of > 2 data.frames using join_all. To do this, firstly adds all the data.frames to join to a list. These df should have same-named merging key. Then join_all merges all the df as a single df. Note that merge() is limited to combining 2 files. The following code merge 5 data.frames for 16 groups, which are constructed by 4 for loops

*join_all: Recursively join a list of data frames

  • see complete code in PRS_UKB_201711_step11-01_inner-join_summedPRS-across-all-traits_PCs_impCov.R
  • alternatively use Reduce(merge()) to do same thing as join_all
library(plyr)
# Read in 5 files from each of the 16 groups (80 input files)
count=0
for (a in 1:length(unique(base$group_keyword1))){
  for (b in 1:length(unique(base$group_keyword2))){
    for (c in 1:length(unique(base$group_keyword3))){
      for (d in 1:length(unique(base$group_keyword4))){
        aa=unique(base$group_keyword1)[a]
        bb=unique(base$group_keyword2)[b]
        cc=unique(base$group_keyword3)[c]
        dd=unique(base$group_keyword4)[d]
        count=count+1;
        print(paste("aa=",aa,"bb=",bb,"cc=",cc,"dd=",dd,sep = " "))
        # Subset the paths of 5 input files from each of the 16 groups and the output folders where the joined files are exported
        per_group_info= base[base$group_keyword1==aa & base$group_keyword2== bb & base$group_keyword3== cc & base$group_keyword4== dd, c("inputFilePath","outputFolderPath")]
        # Read in the 5 files
        ## Drop column FID, PHENO from file2-file5 using %>% select(-)
        per_group_input <- as.character(per_group_info$inputFilePath)
        file1= read.table(per_group_input[1],header = T, stringsAsFactors = F) 
        file2= read.table(per_group_input[2],header = T, stringsAsFactors = F) %>% select(-c(FID,PHENO))
        file3= read.table(per_group_input[3],header = T, stringsAsFactors = F) %>% select(-c(FID,PHENO))
        file4= read.table(per_group_input[4],header = T, stringsAsFactors = F) %>% select(-c(FID,PHENO))
        file5= read.table(per_group_input[5],header = T, stringsAsFactors = F) %>% select(-c(FID,PHENO))
        # Recursively join a list of the 5 data.frames as a single data.frame using join_all
        ## join_all uses same-named column "IID" as merging key
        dfs <- list(file1,file2,file3,file4,file5)
        joined <- join_all(dfs,by=c("IID"),type = "inner") # 43 columns
        # Merge joined file with pc_file
        joined_pc= merge.data.frame(joined,pc_file,by.x = "IID",by.y = "INDID") # 53 columns
        # Merge joined_pc and impCov_file
        joined_pc_impCov= merge.data.frame(joined_pc,impCov_file,by.x = "IID", by.y = "V2") # 54 columns
        
        # Change colname from V6 to impCov
        colnames(joined_pc_impCov)[54] <- "impCov"
        # Change colname from IID to ID, as at step11-02 GWAS-ID-remapper expects ID as $1
        colnames(joined_pc_impCov)[1] <- "ID"
        
        # Remove old files. Commented out as it has been done
        #file.remove(paste0(outputFolderPath,"/","summed-PRS_S1-S8_all-pheno"))
        
        ## Export the joined_pc_impCov files
        outputFolderPath <- as.character(unique(per_group_info$outputFolderPath)) # output folder path
        outputFilePath <- paste0(outputFolderPath,"/","summed-PRS_S1-S8_all-pheno_10PCs_impCov")
        write.table(joined_pc_impCov # object name the file to export
                    ,col.names=T   # keep column names
                    ,row.names = F # remove row number
                    ,file=outputFilePath
                    ,dec="."
                    ,sep=" "
                    ,quote=FALSE
                    ,na = "NA" ) # mark missing values as NA
        print(paste0("================================= iteration", count, "========================="))
      }
    }
  }
}

Merging 2 data.frames with different-named merging key columns

# merging key is IID in file x, INDID in file y
merge.data.frame(joined,pc_file,by.x = "IID",by.y = "INDID")

Merging 2 data sets using merge(). By default the data frames are merged on the columns with names they both have, but separate specifications of the columns can be given by by.x and by.y. To prevent this default, skip by= if the merging key is same-named in both data sets.

# Merging key is ID in both PRS_dir_1_standardised and PRS_dir_5_standardised 
PRS_dir_1_5=merge(PRS_dir_1_standardised,PRS_dir_5_standardised)

Read a file containing dashes in its column names. For example, column "UKB-SS-S1" becomes "UKB.SS.S1" However, you can change the column names back to orginal ones using qsub

*How to replace the “.” in column names generated by read.csv() with a single space when exporting?

# Read PRS files into a temporary data.frame called data, one file at a time. Unfortunately, R changes dashes to dots in the colname
data=read.table(current_PRS_file_path,header = T,sep=" ")
# A simple regular expression to replace dots with dashes. This might have unintended consequences, so be sure to check the results. Need to escape . with \\
names(data) <- gsub(x = names(data),pattern = "\\.",replacement = "-")

Convert a data.frame to a vector.

*R-friendly way to convert R data.frame column to a vector?

# Read file paths of PRS files, which were created in previous step. The read file becomes a one-column data.frame with no colname
fileList_PRS=read.table(paste0(locASCOut,"filePath_summedPRS_10PCs_impCov_IDremapped_GSCAN_UKB")
                        ,header=FALSE
                        ,stringsAsFactors = FALSE)
# Convert the data.frame above to a vector
fileList_PRS_vector=fileList_PRS[,1]

Row-subsetting and column-subsetting in R. Note this subsetting is based on character columns. factor columns won't work

dirGWAS="/mnt/lustre/working/lab_nickm/lunC/PRS_UKB_201711/GWASSummaryStatistics";
dirGSCAN=paste0(dirGWAS,"/GWAS_GSCAN/noUKBiobank_results")

fileGWASInfo="GWAS_info.csv";

GWASInfo=read.csv(file = paste0(dirGWAS,"/",fileGWASInfo)
                  ,header = TRUE
                  ,stringsAsFactors = FALSE)

# Minimize the info file to GWAS that will be used in the project
GWAS_to_use <- GWASInfo[GWASInfo$useOrNot=='Y',]

# Row-subset (on left of comma) and column-subset (on right of comma) data
# Extract GWAS measure description
GWASInfo[GWASInfo$useOrNot=='Y' & GWASInfo$dataSource=='UKB',c("dataSource","measureAbb","measureDescription")]

Conditionally copy columns to a new column based on another column

# Assign color to "color" for significant p values, and "color_pale" to non-significant p values
## Set significance level to 0.05
signi_level=0.05
## Convert columns to copy from factor to character
leftJoin[,c("color","color_pale")] <- lapply(leftJoin[,c("color","color_pale")],as.character)
## Conditionally copy color or color_pale, based on pvalue2sided
leftJoin$color_signi <- ifelse(leftJoin$pvalue2sided < signi_level, leftJoin$color,leftJoin$color_pale)

if(condition A){actions} else if(condition B){actions} and else doesn't perform vectorization. Running condition A, not including its brackets must return a single TRUE or FALSE. If it returns one line of multiple TRUE and/or FLASE, then you will get an error the condition has length > 1 and only the first element will be used. Suppose the if else is constructed based on the value of a column "dataSource" of a data.frame "GWAS_to_use"

# wrong if else. Running "GWAS_to_use$dataSource=='GSCAN'" returns multiple TRUE/FALSE
if(GWAS_to_use$dataSource=='GSCAN'){ actions}
else actions
# it's wrong becasue the if condition 

# correct way to write if else. Running "dataSource=='GSCAN'" returns a single TRUE/FALSE
for (dir in 1:length(GWAS_to_use$directory)){
  dataSource= GWAS_to_use$dataSource[dir]
  # Extract GSCAN GWAS file path
  if(dataSource=='GSCAN'){

Conditionally execute code in R. Don't nest if within if

When there are 2 conditions to evaluate

# Syndax form
if (condition1) {
    action1
} else {action2}
# An example with only 2 conditions maximally 
if (i >= 401 & i <=480){
    categoricalCovars=c("wave","nSEX","ImpCov")
    } else {
    categoricalCovars=c("wave","wave","nSEX","ImpCov")
    }
When there are 3 conditions to evaluate
# Syndax form
if (condition1) {
    action1
} else if (condition2){
    action2
} else {
    action3
}

# An exmaple of 3 conditions to evaluate maximally
## Specify colname names that vary from input files. Note column names vary within and between consortia
## colname names that are the same in all input files are specified as fixed values in read_exposure_data().
  if(consortium=='GSCAN' & trait_name=="dpw"){
    colname_beta="BETA_linear"
    colname_effect_allele="ALT"
    colname_other_allele="REF"
    colname_sample_size="N"
  } else if (consortium=='GSCAN' & trait_name!="dpw") {
    colname_beta="BETA"
    colname_effect_allele="ALT"
    colname_other_allele="REF"
    colname_sample_size="N"
  } else {
    trait_name=gsub(unlist(strsplit(basename(filePath),"_"))[3],pattern = "-",replacement = "_")
    colname_beta="BETA"
    colname_effect_allele="ALLELE1"
    colname_other_allele="ALLELE0"
    colname_sample_size="sample_size"
  }     

Create a new variable based on the conditions of a variable using a single or nested ifelse()

  • | for "or", which is different from || used by if()

  • '&' for "and", which is different from && used by if()

  • == for comparing variables, which is similar to == used by if()

  • Nested ifelse statement

# Example 1: conditions evaluated against fixed values
# Set values to NA if old values are -3 or 0. 
# Set values to 1 if old values are 2. 
# Set values to 0 if old values are 1. 
pheno_ukb20116$X20116.0.0_recode <- ifelse(pheno_ukb20116$X20116.0.0 %in% c(-3,0), NA
                                           ,ifelse(pheno_ukb20116$X20116.0.0==1,0
                                                   ,1))
# Check if the recoding is done correctly
table(pheno_ukb20116$X20116.0.0_recode,pheno_ukb20116$X20116.0.0,useNA = "ifany")                                      -3      0      1      2   <NA>
0         0      0 168254      0      0
1         0      0      0  51208      0
<NA>   1986 265431      0      0    530
# Example 2: conditions evaluated against a pattern
temp <- dat.manu3.sexPRS.inclu

# Simplify values in the "name_fix_eff" column 
## values starting with GSCAN set to "PRS" 
## values starting with sex_GSCAN set  to "sexPRS"
## values sex remain unchanged
temp$name_fix_eff2 <-ifelse(grepl("^GSCAN.*.S[1-8]$",temp$name_fix_eff), "PRS"
                            ,ifelse(grepl("^sex_GSCAN.*.S[1-8]$",temp$name_fix_eff),"sexPRS"
                                    ,"sex"))

table(temp$name_fix_eff2)
# PRS se sexPRS 
# 600 600 600

When nesting ifelse(), make sure each ifelse() contains 3 arugmenets: (1) condition to evaluate,(2) value if the condition is evaluated as TRUE, (3) value if the condition is evaluated as FALSE. This example has 3 condictions and 2 ifelse(). The 3rd argument in the first ifelse() is a nested ifelse().

m$analysis.performed <- ifelse(m$cohorts=="UKB" & m$traits.abb=="CI","OA",
                               ifelse((m$cohorts=="GSCAN" & m$traits.abb=="DPW")|(m$cohorts=="UKB" & m$traits.abb=="ESDPW")|(m$cohorts=="UKB" & m$traits.abb=="CCPD"),"OA, GA",
                               "GA"))

Recode variables using dplyr::recode

  • syntax form: dplyr::recode(variable,)
# Example 1
## Recode value 1 as 5, 2 as 4, 4 as 2, 5 as 1 
dplyr::mutate(quali.edu.6138.recoded=dplyr::recode(qualification_edu_6138
                                                   ,`1`=5
                                                   ,`2`=4
                                                   ,`3`=3
                                                   ,`4`=2
                                                   ,`5`=1))
# Example 2
# If values are -3 or 0, set to NA; else keep the original values
pheno_ukb20116$X20116.0.0_recode <- ifelse(pheno_ukb20116$X20116.0.0 %in% c(-3,0), NA, pheno_ukb20116$X20116.0.0)

# Check old variable and the new variable
## row names are values of 1st argument, columns are values of 2nd argument 
table(pheno_ukb20116$X20116.0.0_recode,pheno_ukb20116$X20116.0.0,useNA = "ifany")
            -3      0      1      2   <NA>
  1         0      0 168254      0      0
  2         0      0      0  51208      0
  <NA>   1986 265431      0      0    530

Append output from every iteration to a base data.frame

*Add row to dataframe

## Step 1: initialize a data.frame for appending the result from every iteration
## This data.frame should have same column names as the per-iteration result data.frame
base=data.frame(iteration=NULL
                ,directory=NULL
                ,file=NULL)

## Step 2: extract directory and names of files
for (dir in 1:length(GWASInfo$directory)){
  current_dir=GWASInfo$directory[dir]
  GWASFileList=list.files(path = current_dir,pattern = "^revised_bolt_imputed(.*).bgen.assoc$|(.*).glm.logistic$")
  
  for (file in 1:length(GWASFileList)){
    current_file=GWASFileList[file]
    current_iteration=(dir-1)*length(GWASFileList)+file
    print(paste0("current_dir=",current_dir))
    print(paste0("current_file= ",current_file))
    print(paste0("current_iteration=",current_iteration))
    resultPerIteration=data.frame(iteration=current_iteration,directory=current_dir,file=current_file)
    
    # Step 3 : Append per-iteration data.frame to the base data.frame
    base = rbind(base, resultPerIteration)
    }
}  

Remove columns that contain only NAs

# method 1
df[, colSums(is.na(df)) != nrow(df)]
# method 2
df[colSums(!is.na(df)) > 0]

Read files with delimiters as varying number of white spaces

# sep=" " refers to one whitespace character. sep="" refers to any length whitespace as being the delimiter
c1=read.table(paste0(dirSubfolders[1],filesWant[1]),header = T,sep = "")

Combine multiple data frames by row using do.call(rbind,df_list)

# Create an empty list for holding a subset generated within a for loop
GCTAOutfile_DSM5MentalDisorderDiag_part1_list <- list()

# Add each subset to the list
for (i in 1:length(filesWantList)){
  # Add data.frame of same suffix into the list
  ## You cannot use data.frame name like GCTAOutfile_DSM5MentalDisorderDiag_i_part1
  GCTAOutfile_DSM5MentalDisorderDiag_part1_list[[i]] <- get(paste0("GCTAOutfile_DSM5MentalDisorderDiag_", i,"_part1")) 
} 

# Combine all subsets using do.call and rbind
## 2nd argument of do.call is a list. That is why all subsets were added to a list
GCTAOut_DSM5MentalDisorderDiag_part1_all <- do.call(rbind, GCTAOutfile_DSM5MentalDisorderDiag_part1_list)

Combine multiple data frames by column using (1) do.call(cbind,df_list),(2) join_all(data_list,by=mergingKey,type=)

# Search variables to reshape
patterns_to_search=glob2rx("ID_char|GSCAN.*_r|ZYGOSITY")
variables_to_reshape=grep(names(df_twin),pattern=patterns_to_search,value = TRUE) # 42 elements

# Combine all reshaped variables to a single data frame
## Create an empty list for holding data frames to combine
df_to_bind_by_column <- list()
## Reshape variables and add them to the list  
for (i in 1:length(variables_to_reshape)){
  var_to_reshape=variables_to_reshape[i]
  
  var_to_keep=c(paste(var_to_reshape,c("01","02","50"),sep="."))
  
  # Reshape variables, one at a time
  reshaped_var <-reshape(df_twin
                         ,idvar = "famID"
                         ,v.names = var_to_reshape
                         ,timevar =  "ID_suffix"
                         ,direction = "wide" ) %>% select_(.dots=var_to_keep)
  
  # Add the reshaped variable data frame to the list
  df_to_bind_by_column[[i]] <- reshaped_var
}

# Method 1: Combine all data frames in the list to a single data frame using do.call(cbind,df_list)
reshaped_variables <- do.call(cbind,df_to_bind_by_column) # 126 columns

# Method 2: Combine all data frames in the list to a single data frame using join_all(data_list,by=mergingKey,type=)
df_reshaped_variables <-plyr::join_all(df_to_bind_by_column,by=c("famID","ZYGOSITY"),type="inner") #