R
list
do.call
rbind
table
ifelse
is.na
subset
select
merge.data.frame
plyr::join_all
grep
grepl
%>%
merge
group_by
filter
with
which
is.na
match
duplicated
foreach
lapply
as.character
as.numeric
dplyr::summarise
n
sum
by
str_detect
wildcards
q
reshape2::dcast
order
factor
adply
filter_at
str_locate
%in%
stack
glob2rx
na.omit
length
unite
melt
spread
match
anti_join
regex
mutate
format.pval
format
round
function
stringr::str_c
stringr::str_pad
stringr::str_replace_all
expand.grid
arrayInd
which.max
which.min
as.data.frame.table
rowSums
colSums
apply
tidyr::separate
as.Date
read.csv
read.table
gmodels::CrossTable
plyr::adply
plyr::ddply
seq_along
seq_len
rowr::cbind.fill
data.table::CJ
table(factor)
dplyr::case_when
dplyr::filter
dplyr::left_join
dplyr::anti_join
dplyr::arrange
dplyr::mutate_at
dplyr::recode
dplyr::rename_at
dplyr::rename_
dplyr::select
dplyr::select_
data.table::setnames
assign
setdiff
%<>%
substr
as.data.frame.matrix
plyr::join_all
as.POSIXct
lag
tryCatch
rank
purrr::reduce()
dplyr::slice
tidyr::pivot_wider
summary.default
unclass
tidyr::fill
tidyr::fill_
Filling missing value in group
have <- data.frame(
row.number=c(1:6)
,ID=c("A","A","A","B","B","B")
,v1=c(NA,1,NA, NA, NA, 3)
,v2=c(NA,NA,"Yes",NA,"Yes",NA)
,stringsAsFactors = F)
# Use column names
want <- have %>%
dplyr::group_by(ID) %>%
tidyr::fill(v1,v2) %>%
tidyr::fill(v1,v2, .direction="updown")
# Use column position [R: fill down multiple columns](https://stackoverflow.com/questions/37648980/r-fill-down-multiple-columns) want <- have %>%
dplyr::group_by(ID) %>%
tidyr::fill_(names(.)[3:4]) %>%
tidyr::fill_(names(.)[3:4], .direction="updown")
# Get data structure into data.frame
data.structure <- data.frame(
unclass(summary.default(data))
,check.names = FALSE
,stringsAsFactors = F
) # class(data.structure) [1] "data.frame" # dim(data.structure) 58 3
# Find rows with the minimum p.value
data %>% dplyr::slice(which.min(p.value))
# Find rows with the maximum estimate
data %>% dplyr::slice(which.max(p.value))
a <- c('a','b','c','d','e')
b <- c('a','b','d','e')
# Find elements of a that match elements of b
a[a %in% b]
[1] "a" "b" "d" "e"
# Find elements of a that don't match elements in b
a[!(a %in% b)]
[1] "c"
tryCatch{}
This allows iteration that results in erros to be skipped. Without this error handling, the iteration discontinues. # Use tryCatch() here as some iterations won't run because of not enough SNPs but we want the loopig continues even there is an error
tryCatch({
# R code that could result in errors
# End tryCatch() function
}, error=function(e){cat("ERROR :",conditionMessage(e), "\n")})
\n
# A log file to extract content
LDSC.rG.log.file.path
[1] "/mnt/lustre/working/lab_nickm/lunC/MR_ICC_GSCAN_201806/LD-score-correlation_run-via-HPC-Utility/rG_between_GSCAN-AI_and_ICC-CI/ai_noICC.ambiguousSNPRemoved-Cannabis_ICC_UKB_small_MAF0.99-0.01_cojo.result.rg.log"
# Read all lines of the file into R
LDSC.rG.log.file <- readLines(LDSC.rG.log.file.path) # length(LDSC.rG.log.file) 67
# Find row number/line number of a pattern "Heritability of phenotype 1"
## The pattern "Heritability of phenotype 1" is at line 33
## The pattern "Heritability of phenotype 2/2" is at line 41
row.number.h2.trait1.title <- which(grepl("Heritability of phenotype 1",LDSC.rG.log.file)) # 33
row.number.h2.trait2.title <- which(grepl("Heritability of phenotype 2/2",LDSC.rG.log.file)) # 41
## The section to extract is at a length of 5 lines
length.h2.trait1or2 <- 5
## Extract the five lines for trait 1
h2.trait1 <- scan(LDSC.rG.log.file.path
, what=character()
, sep='\n'
, skip=row.number.h2.trait1.title+1 # line number to skip before beginning to read data values.
, nlines=length.h2.trait1or2 # how many lines to read from the starting line
)
# Check content of the extract
h2.trait1
[1] "Total Liability scale h2: 0.0325 (0.0044)" "Lambda GC: 1.0679"
[3] "Mean Chi^2: 1.0748" "Intercept: 0.9984 (0.0069)"
[5] "Ratio < 0 (usually indicates GC correction)."
# The data contain quotes. Quotes are set to " and ' by default for read.table. Set the quote to empty character here (quote="") to treat them as ordinary data elements. Otherwise, importatino stops at the first line containing quotes.
reg <- read.table(paste0(dir.table.tennis.score,registrants)
,sep="\t"
,header = TRUE
,stringsAsFactors = F
,na.strings = ""
,quote = "")
# A data.frame to duplicate rows
dim(reg5)
[1] 23 2
# Duplicate each row for 3 times
reg6 <- reg5[rep(seq_len(nrow(reg5)),each=3),] # dim(reg6)
dim(reg6)
69 2
# Replace non missing values in the newly merged columns (exposure*) with numeric occurence of 1
## A data set to replace non missing (non NA) with a number of 1
temp <- left.table
head(temp)
SNP.IDs GSCAN.LDClumped.ai GSCAN.LDClumped.cpd GSCAN.LDClumped.sc GSCAN.LDClumped.si
1 rs62478582 ai <NA> <NA> <NA>
2 rs193122850 <NA> cpd <NA> <NA>
3 rs2118359 <NA> cpd <NA> <NA>
4 rs215600 <NA> cpd <NA> <NA>
5 rs2273500 <NA> cpd <NA> <NA>
6 rs56113850 <NA> cpd sc <NA>
# Loop through column 2 to end, replace non missing character values with 1
for (i in 1:ncol(temp)){
if (i > 1){
temp[!is.na(temp[,i]),i] <- 1
}
}
# Check results
head(temp)
SNP.IDs GSCAN.LDClumped.ai GSCAN.LDClumped.cpd GSCAN.LDClumped.sc GSCAN.LDClumped.si
1 rs62478582 1 <NA> <NA> <NA>
2 rs193122850 <NA> 1 <NA> <NA>
3 rs2118359 <NA> 1 <NA> <NA>
4 rs215600 <NA> 1 <NA> <NA>
5 rs2273500 <NA> 1 <NA> <NA>
6 rs56113850 <NA> 1 1 <NA>
## Assign them to the global environment
assign(data.name,data,envir = .GlobalEnv)
dply::arrnge(match(variable,vector-order))
# The data to subset
dim(fix.eff.PRS.manu3.allSexGroups) # [1] 1800 13
# The order of target phenotypes you want the phenotype column to be sorted
manu3.targ.pheno.predicted.by.SI
[1] "GSCAN_Q2_recode" "GSCAN_Q1" "GSCAN_Q3_recode" "nicdep4"
[5] "ftnd_dep" "GSCAN_Q5_Drinks_per_week" "alcdep4" "dsmiv_conductdx"
[9] "aspddx4"
# Subset target phenotypes predicted
## Reorder data to keep original ordering of the phenotype column using arrange
manu3.tem.SI <- fix.eff.PRS.manu3.allSexGroups %>%
dplyr::filter(phenotype %in% manu3.targ.pheno.predicted.by.SI & name_fixEffect_trait == "si") %>%
dplyr::arrange(match(phenotype, manu3.targ.pheno.predicted.by.SI)) # dim(manu3.tem.SI) 216 13 (9 target phenotypes predicted by 1 discovery trait * 3 sex groups * 8 p value thresholds)
# A dataset to work on
PRS_everDrug1to10_CUD <- read.table(paste0(locPheno,"pheno2drugUseCUD-IDremapped_standardised-IDremapped-PRS-GSCAN.txt")
,header = T
, sep=" "
, stringsAsFactors = F
, na.strings=c('.','NA'))
# The data structure with wanted columns shown
> str(PRS_everDrug1to10_CUD)
'data.frame': 2463 obs. of 76 variables:
$ nSEX : int 1 2 2 1 1 1 1 2 2 1 ...
$ GSCAN.ai.S2 : num 0.325 -2.008 -0.627 0.366 -0.731 ...
$ GSCAN.ai.S3 : num 0.99 -0.252 -2.104 -1.147 -1.202 ...
$ GSCAN.ai.S4 : num 1.505 3.152 -0.597 -1.691 -0.385 ...
$ GSCAN.ai.S5 : num 0.668 2.307 -2.531 -1.427 -0.707 ...
$ GSCAN.ai.S6 : num 0.922 1.447 -1.837 -1.418 -1.357 ...
$ GSCAN.ai.S7 : num -0.249 0.978 -3.346 -2.074 -2.022 ...
$ GSCAN.ai.S8 : num -0.302 1.045 -3.165 -2.128 -1.888 ...
$ GSCAN.cpd.S1: num -0.9651 0.799 -0.1756 0.1013 -0.0447 ...
$ GSCAN.cpd.S2: num -0.6752 0.8222 0.5144 -0.0802 -0.379 ...
$ GSCAN.cpd.S3: num -0.676 0.218 -0.765 -0.141 -0.543 ...
$ GSCAN.cpd.S4: num -0.9386 3.2686 -0.577 0.0432 -0.2242 ...
$ GSCAN.cpd.S5: num -0.095 4.892 -0.495 0.124 -0.11 ...
$ GSCAN.cpd.S6: num 0.276 5.415 -0.456 0.039 -0.173 ...
$ GSCAN.cpd.S7: num 0.2396 4.6077 -0.336 0.3825 -0.0603 ...
$ GSCAN.cpd.S8: num 0.138 4.398 -0.294 0.308 -0.247 ...
$ GSCAN.dpw.S1: num 0.118 -0.474 -0.551 -2.615 0.184 ...
$ GSCAN.dpw.S2: num 0.201 0.875 0.913 -1.487 0.314 ...
$ GSCAN.dpw.S3: num -1.583 -0.446 1.746 -0.259 0.44 ...
$ GSCAN.dpw.S4: num -0.162 0.621 1.23 -1.076 0.286 ...
$ GSCAN.dpw.S5: num 1.334 1.184 -0.477 -1.833 0.188 ...
$ GSCAN.dpw.S6: num 1.535 0.335 -1.644 -1.583 0.319 ...
$ GSCAN.dpw.S7: num 1.69 0.174 -1.82 -1.467 0.221 ...
$ GSCAN.dpw.S8: num 1.7488 -0.0555 -2 -1.3446 0.1773 ...
$ GSCAN.sc.S1 : num -0.7893 -1.171 -2.5624 -0.077 -0.0439 ...
$ GSCAN.sc.S2 : num -0.1717 -1.4473 -2.1254 0.304 -0.0508 ...
$ GSCAN.sc.S3 : num -0.858 -1.781 3.678 0.811 0.543 ...
$ GSCAN.sc.S4 : num -0.953 0.537 1.975 1.286 0.857 ...
$ GSCAN.sc.S5 : num -0.642 1.786 0.866 1.275 1.037 ...
$ GSCAN.sc.S6 : num -0.243 2.205 1.777 0.956 0.71 ...
$ GSCAN.sc.S7 : num 0.166 2.894 1.907 0.728 0.263 ...
$ GSCAN.sc.S8 : num 0.126 2.791 1.814 0.843 0.334 ...
$ GSCAN.si.S1 : num -1.273 -1.833 0.667 0.166 -0.717 ...
$ GSCAN.si.S2 : num -1.541 -2.335 0.444 0.276 -0.193 ...
$ GSCAN.si.S3 : num -2.082 -2.688 -1.063 -0.764 -0.981 ...
$ GSCAN.si.S4 : num -2.4386 -2.2505 -1.3685 -1.0822 -0.0356 ...
$ GSCAN.si.S5 : num -1.7702 -2.1189 -3.2785 -1.4399 0.0188 ...
$ GSCAN.si.S6 : num -1.622 -3.822 -2.089 -1.53 -0.833 ...
$ GSCAN.si.S7 : num -1.09 -3.89 -1.89 -2.13 -1.27 ...
$ GSCAN.si.S8 : num -1.11 -4.07 -1.71 -2.12 -1.24 ...
# Multiply every column that starts with GSCAN with nSEX, creating 40 new columns for the multiplication
library(dplyr)
# Multiply every column that starts with GSCAN (PRS columns) with nSEX and suffix newly created columns with nSEX
# To change the suffix "_nSEX" to prefix "sex_", use rename_at()
PRS_everDrug1to10_CUD2 <- PRS_everDrug1to10_CUD %>% mutate_at(vars(matches("^GSCAN.*.S[1-8]$")),.funs= funs(nSEX= .*nSEX)) %>%
rename_at(vars(contains("_nSEX")),funs(paste("sex", gsub("_nSEX", "", .), sep = "_")))
# Compare an old column with a new column
> head(PRS_everDrug1to10_CUD2$GSCAN.ai.S1)
[1] -0.5150501 -1.1410706 -1.3126117 -1.1851313 -1.1344985 -0.6225980
> head(PRS_everDrug1to10_CUD2$sex_GSCAN.ai.S1)
[1] -0.5150501 -2.2821412 -2.6252234 -1.1851313 -1.1344985 -0.6225980
>
starts_with()
: starts with a prefixends_with()
: ends with a prefixcontains()
: contains a literal stringmatches()
: matches a regular expressionnum_range()
a numerical range like x01, x02, x03.one_of()
: variables in character vector.everything()
: all variables.# Method 1
dd[ ,!(colnames(dd) %in% c("A", "B"))]
# Method 2: use dplyr::select(-one_of("columns-to-deselect"))
plyr::join_all(data.pheno.manu2.allSexes.list,by=c("ID"),type ="full") %>%
dplyr::select(-one_of("ID"))
dplyr::rename_at(vars=, list(~function1, ~function2,...))
or dplyr::rename(new.var.name=old.var.name)
data %>%
# Rename count variables by changing variable prefix "APExp60.1" to "APExp60.1.A"
dplyr::rename_at(.vars= vars(starts_with("APExp60.1"))
,list(~ sub(pattern = "^APExp60.1", replacement = "APExp60.1.A", x=.))
)
data.table::setnames()
## Rename ID and AGEINT column as iid and age for join_all() with the other 2 data sets
age.for.binary.pheno <- data.table::setnames(age.for.binary.pheno
, old=c("ID","AGEINT")
, new=c("iid", "age"))
# Suppose df1 and df2 have different row numbers
df3 <- rowr::cbind.fill(df1, df2, fill = NA)
apply(expand.grip(x,y))
and custom sorting# Create all combinations of vector 1 (MZ.twins.DZ.twins) and vector 2 (trait1.trait2). Note here the desired order is MZTwin1*, MZTwin2*, DZTwin1* and then DZTwin2*
vector1 <- c("MZTwin1","MZTwin2","DZTwin1","DZTwin2") # column dimension
vector2 <- c("trait1_0_trait2_0","trait1_1_trait2_0","trait1_0_trait2_1","trait1_1_trait2_1") # row dimension
all.combinations <- data.table::CJ(vector1, vector2, sorted = FALSE)[, paste(V1, V2, sep ="_")]
> all.combinations
[1] "MZTwin1_trait1_0_trait2_0" "MZTwin1_trait1_1_trait2_0" "MZTwin1_trait1_0_trait2_1" "MZTwin1_trait1_1_trait2_1"
[5] "MZTwin2_trait1_0_trait2_0" "MZTwin2_trait1_1_trait2_0" "MZTwin2_trait1_0_trait2_1" "MZTwin2_trait1_1_trait2_1"
[9] "DZTwin1_trait1_0_trait2_0" "DZTwin1_trait1_1_trait2_0" "DZTwin1_trait1_0_trait2_1" "DZTwin1_trait1_1_trait2_1"
[13] "DZTwin2_trait1_0_trait2_0" "DZTwin2_trait1_1_trait2_0" "DZTwin2_trait1_0_trait2_1" "DZTwin2_trait1_1_trait2_1"
# A vector of strings to process
> short_colnames_RSquare
# [1] "SI.S1" "SI.S2" "SI.S3" "SI.S4" "SI.S5" "SI.S6" "SI.S7" "SI.S8" "AI.S1" "AI.S2" "AI.S3" "AI.S4"
#[13] "AI.S5" "AI.S6" "AI.S7" "AI.S8" "CPD.S1" "CPD.S2" "CPD.S3" "CPD.S4" "CPD.S5" "CPD.S6" "CPD.S7" "CPD.S8"
#[25] "SC.S1" "SC.S2" "SC.S3" "SC.S4" "SC.S5" "SC.S6" "SC.S7" "SC.S8" "DPW.S1" "DPW.S2" "DPW.S3" "DPW.S4"
#[37] "DPW.S5" "DPW.S6" "DPW.S7" "DPW.S8"
# Replace S1:S8 with p< 5e-08, p< 1e-05, p< 1e-03, p< 1e-02, p< 5e-02, p< 0.1, p< 0.5 and p< 1
short_colnames_RSquare %>% stringr::str_replace_all(c( "S1"="p< 5e-08"
,"S2"="p< 1e-05"
,"S3"="p< 1e-03"
,"S4"="p< 1e-02"
,"S5"="p< 5e-02"
,"S6"="p< 0.1"
,"S7"="p< 0.5"
,"S8"="p< 1")
#[1] "SI.p< 5e-08" "SI.p< 1e-05" "SI.p< 1e-03" "SI.p< 1e-02" "SI.p< 5e-02" "SI.p< 0.1" "SI.p< 0.5"
#[8] "SI.p< 1" "AI.p< 5e-08" "AI.p< 1e-05" "AI.p< 1e-03" "AI.p< 1e-02" "AI.p< 5e-02" "AI.p< 0.1"
#[15] "AI.p< 0.5" "AI.p< 1" "CPD.p< 5e-08" "CPD.p< 1e-05" "CPD.p< 1e-03" "CPD.p< 1e-02" "CPD.p< 5e-02"
#[22] "CPD.p< 0.1" "CPD.p< 0.5" "CPD.p< 1" "SC.p< 5e-08" "SC.p< 1e-05" "SC.p< 1e-03" "SC.p< 1e-02"
#[29] "SC.p< 5e-02" "SC.p< 0.1" "SC.p< 0.5" "SC.p< 1" "DPW.p< 5e-08" "DPW.p< 1e-05" "DPW.p< 1e-03"
#[36] "DPW.p< 1e-02" "DPW.p< 5e-02" "DPW.p< 0.1" "DPW.p< 0.5" "DPW.p< 1"
# Example 1: by paste(strings, collapse)
## vector elements to be joined with a plus sign between them
other.predictors
[1] "all_coffee_cpd" "merged_pack_years_20161" "X3436_recodeMean"
[4] "X3456.0.0" "complete_alcohol_unitsweekly"
## Join strings and add + signs
paste0(other.predictors,collapse ="+")
[1] "all_coffee_cpd+merged_pack_years_20161+X3436_recodeMean+X3456.0.0+complete_alcohol_unitsweekly"
# Example 2
covariates <- stringr::str_c("age","sex","ageSq","sexAge","sexAgeSq",sep="+")
> covariates
[1] "age+sex+ageSq+sexAge+sexAgeSq"
# Number of cases and controls of trait1 in DZ twin1 and DZ twin2
num.case.ctrl.trait1.DZ <- table(factor(data.pheno.here.DZ[,trait1.name.twin1],levels=0:1)
,factor(data.pheno.here.DZ[,trait1.name.twin2],levels=0:1))
# Generate 2-way cross table of frequencies of 2 variables
gmodels::CrossTable(data.gp7$ID.suffix, data.gp7$ZYGOSITY)
Total Observations in Table: 3290
| data.gp7$ZYGOSITY
data.gp7$ID.suffix | 1 | 2 | 3 | 4 | 5 | 6 | 9 | Row Total |
-------------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
01 | 269 | 214 | 380 | 274 | 264 | 224 | 6 | 1631 |
| 0.293 | 0.805 | 0.027 | 0.002 | 0.011 | 0.996 | 0.031 | |
| 0.165 | 0.131 | 0.233 | 0.168 | 0.162 | 0.137 | 0.004 | 0.496 |
| 0.512 | 0.527 | 0.492 | 0.495 | 0.493 | 0.464 | 0.462 | |
| 0.082 | 0.065 | 0.116 | 0.083 | 0.080 | 0.068 | 0.002 | |
-------------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
02 | 230 | 179 | 352 | 255 | 242 | 222 | 4 | 1484 |
| 0.196 | 0.093 | 0.032 | 0.105 | 0.000 | 0.079 | 0.592 | |
| 0.155 | 0.121 | 0.237 | 0.172 | 0.163 | 0.150 | 0.003 | 0.451 |
| 0.438 | 0.441 | 0.455 | 0.460 | 0.451 | 0.460 | 0.308 | |
| 0.070 | 0.054 | 0.107 | 0.078 | 0.074 | 0.067 | 0.001 | |
-------------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
# Calculate time difference between current Timestamp and previous Timestamp
temp2$time.difference <- with(temp2, Timestamp - lag(Timestamp))
# Display time difference as minutes
temp2$time.diff.minute <- as.numeric(temp2$time.difference, units = 'mins')
# Datetime from imported data
head(temp2$Timestamp,2)
[1] "3/13/2019 13:14:35" "3/13/2019 13:15:16"
# Convert character datetime to datetime
temp2$Timestamp <- as.POSIXct(temp$Timestamp, format="%m/%d/%Y %H:%M:%S", tz=Sys.timezone())
# Class changed
class(temp2$Timestamp)
[1] "POSIXct" "POSIXt"
as.Date()
# Nicotine dependence and other diagnoses, processed by Gu Zhu, added sex, zygosity and DOB
## Don't specify DOB column as "Date" in colClasses= option. Convert character to date format later
## Examples of DOB: 10/28/1948 4/10/1960... Note the month length is inconsistent
d6 <- read.csv(paste0(locPheno,"nic_use.csv")
,header = T
,na.strings = ""
,stringsAsFactors = F
,colClasses = c("character","character","integer","character","integer","integer","integer","integer","integer","integer","integer","integer","integer"))
## Split DOB column into mm, dd, and yyyy, pad leading zeros to mm and combine them together to make a new date
d6.1 <- d6 %>% tidyr::separate(DOB
,c("DOB.mm","DOB.dd","DOB.yyyy")
,sep= "/"
,remove=FALSE)
## Pad leading zeros to DOB.mm
d6.1$DOB.mm <- stringr::str_pad(d6.1$DOB.mm, width=2,side="left",pad="0")
## Combine DOB.dd, DOB.mm, and DOB.yyyy to make a new date column
d6.1$DOB.yyyymmdd <- with(d6.1,paste(DOB.yyyy,DOB.mm,DOB.dd,sep="-"))
## Format the new date column as class date
d6.1$DOB.yyyymmdd <- as.Date(d6.1$DOB.yyyymmdd,format="%Y-%m-%d" )
## Check the date column
> str(d6.1$DOB.yyyymmdd)
Date[1:9688], format: "1955-10-11" "1955-10-11" "1948-10-28" "1951-01-01" "1969-05-02" "1950-10-28" "1965-06-09" "1966-12-03" "1966-12-03" ...
# A string containing yyyy-mm-dd
> filePath
#[1] "/mnt/lustre/working/lab_nickm/lunC/MR_ICC_GSCAN_201806/reference/gwas-association-downloaded_2018-08-25-nicotine-dependence.tsv"
# Extract trait name by deleteing yyy-mm-dd, file prefix and file extension
yyyy_mm_dd="\\d{2,4}[.-]\\d{2}[.-]\\d{2,4}"
name1= gsub(basename(filePath),pattern =yyyy_mm_dd,replacement = "" )
name2=gsub(name1,pattern="gwas-association-downloaded_-",replacement = "")
name3=gsub(name2,pattern=".tsv",replacement = "")
> name3
[1] "nicotine-dependence"
# Read a csv file (exported from a matrix) into a matrix
## The data.frame must have row.names similar to the rowname dimension of the imported matrix file
m1 <- read.table(paste0(loc.pheno.corr,"phenotypic-corr-matrix-between-GSCAN-PRS-and-illicit-SU-SUDs-QIMR19Up.txt")
,header=TRUE
,row.names=1) # says first column are rownames
m1.matrix <- as.matrix(m1) # dim(m1.matrix) 26 40
m2 <- as.data.frame.table(m1.matrix) # 1040 obs. of 3 variables:
# Split Var2 into 2 columns for reshaping data from long to wide: "discovery.trait" and "p.value.threshold"
m2$Var1 <- as.character(m2$Var1)
m2$Var2 <- as.character(m2$Var2)
library(tidyr)
m3 <- m2 %>%
tidyr::separate(Var2, c("discovery.trait","p.value.threshold"),"\\.")
# Reshape data from long to wide, resulting in 1 row per target phenotype, discovery trait and p value threshold
# Reshape long-form data to wide-form data using reshape()
# # v.names = a vector of measure variables
# # idvar = a vector of variables that will form the row dimension of the transposed table
# # timevar= variable whose values are to reshape to wide, forming the column dimension of the transposed table
# # The newly created columns will be "v.names.timevar" (4 measure variables * 5 unique values of name_fixEffect_trait)
measure_variables=c("Freq") # measure variables
m4 <- reshape(m3
,v.names = measure_variables # measure variables
,idvar = c("Var1","p.value.threshold")
,timevar = "discovery.trait"
,direction = "wide"
,sep="_") # 208 obs. of 7 variables
# Custom sort data by target phenotype (order defined by the levels=) and then p.value.threshold
## Change Var1 back to factor
m4$Var1 <- factor(m4$Var1,levels=rownames(m1.matrix))
m5 <- m4[with(m4, order(Var1,p.value.threshold)),]
colnames(m5)[1] <- "target.phenotype"
# Summarise the phenotypic correlation matrix
range(as.vector(m1.matrix)) # -0.1374839 0.2099488
# Get maximal value of the matrix
## First, get the index of maximal value. This return row.position and column.position of the value
## which.max(), which.min() determine the location, i.e., index of the (first) minimum or maximum of a numeric (or logical) vector.
m1.matrix.maximum <- arrayInd(which.max(m1.matrix), .dim=dim(m1.matrix))
## Retrieve rowname and column name by the matrix index
rownames(m1.matrix)[m1.matrix.maximum[,1]] # [1] "age at onset of cannabis abuse"
colnames(m1.matrix)[m1.matrix.maximum[,2]] # [1] "DPW.S4"
# A matrix to work on
> str(phenotypic.corr.matrix.pheno.PRS.QIMR19Up)
num [1:26, 1:40] 0.0416 0.04234 0.03419 0.071 0.00115 ...
- attr(*, "dimnames")=List of 2
..$ : chr [1:26] "ever used cocaine" "ever used amphetamine" "ever used inhalants" "ever used sedatives" ...
..$ : chr [1:40] "SI.S1" "SI.S2" "SI.S3" "SI.S4" ...
# Export the matrix, keeping the rownames
write.table(phenotypic.corr.matrix.pheno.PRS.QIMR19Up
,file=paste0(loc.pheno.corr,"phenotypic-correlation-between-GSCAN-PRS-and-illicit-SU-SUDs-QIMR19Up.txt"))
# Import the file to see if the matrix structure is unchanged
phenotypic.corr.matrix.pheno.PRS.QIMR19Up2 <- read.table(paste0(loc.pheno.corr,"phenotypic-correlation-between-GSCAN-PRS-and-illicit-SU-SUDs-QIMR19Up.txt")
,header=TRUE
,row.names=1) # says first column are rownames
# Check the structure
> str(phenotypic.corr.matrix.pheno.PRS.QIMR19Up2)
'data.frame': 26 obs. of 40 variables:
$ SI.S1 : num 0.0416 0.04234 0.03419 0.071 0.00115 ...
$ SI.S2 : num 0.0327 0.0555 0.0261 0.0526 0.0202 ...
# A data file to process
> str(d1)
'data.frame': 42583 obs. of 8 variables:
$ ID : int 201 202 203 204 205 208 209 250 251 401 ...
# Fix ID to 7 character in width by padding with leading zeros (e.g. 201 becomes 0000201)
# Convert ID/iid column to character in both files d3 and d4
# Keep ID column in same name for merging purposes: iid
d1$ID <- as.character(d1$ID)
d1$iid <- stringr::str_pad(d1$ID,width=7, side="left", pad="0")
# Check the newly created iid column
> str(d1)
'data.frame': 42583 obs. of 9 variables:
$ ID : chr "201" "202" "203" "204" ...
$ iid : chr "0000201" "0000202" "0000203" "0000204" ...
par(mar(),mpg(),oma())
mar()
controls the margins, where plot axis titles are printedmpg()
sets the axis label locations relative to the edge of the inner plot window. The first value represents the location the labels (i.e. xlab and ylab in plot), the second the tick-mark labels, and third the tick marks. The default is c(3, 1, 0). Increase the label location when your axis label overlaps with axis ticksinner_join(df1, df2) #inner
left_join(df1, df2) #left outer
right_join(df1, df2) #right outer
left_join(df2, df1) #alternate right outer
full_join(df1, df2) #full join
semi_join(df1, df2) #keep only observations in df1 that match in df2.
anti_join(df1, df2) #drops all observations in df1 that match in df2.
# Exclude non-white people (file_notWhite) from a joined file (joined)
joined_rm1 <- joined %>% anti_join(file_notWhite) # 438870 obs. of 9 variables
# Exclude related individuals based on ID1 or ID2 from joined_rm1
joined_rm2_ID1rm <- anti_join(joined_rm1,file_related,by=c("IID" = "ID1")) # 361387 obs. of 9 variables
joined_rm2_ID2rm <- anti_join(joined_rm1,file_related,by=c("IID" = "ID2")) # 361443 obs. of 9 variables
*How to extract the first line from a text file?
## Read first line of a file
con <-file(filePath,"r") # Establish a connection to the file
first_line <- readLines(con,n=1) # Read 1st line of the file
# Example 1 with special characters ()
## greb the line that starts with keywords. Specify this pattern as a string by fixed=TRUE. Note when this option is there, you cannot use regular expression.
line_want <- grep("Number of indivs with no missing phenotype(s) to use",logFile,value = T,fixed=TRUE)
> line_want
[1] "Number of indivs with no missing phenotype(s) to use: 276533"
## greb the line that starts with keywords. Specify this pattern using regular expression. Ensure that special characters are escaped by double backslashes \\
line_want <- grep("^Number of indivs with no missing phenotype\\(s\\) to use",logFile,value = T)
> line_want
[1] "Number of indivs with no missing phenotype(s) to use: 276533"
# Example 2 with no special characters
## Suppose line 53-57 a file is
### Genetic Correlation
### -------------------
### Genetic Correlation: -0.3813 (0.0371)
### Z-score: -10.275
### P: 9.1379e-25
## To grep -0.3813
## Import log files
logFile=readLines(inputFilePath)
## Grep the line that starts with keywords "Genetic Correlation:"
line_want=grep("^Genetic Correlation:",logFile,value = T)
rG=as.numeric(unlist(strsplit(line_want," "))[3])
# Count number of twins per family and genotyped.zygosity by counting number of non-missing ID columns per row
## Subset columns containing values to count non-missing
df.twin.ID.wide.small <- df.twin.ID.wide[,c("ID.01","ID.02","ID.51","ID.04","ID.54","ID.53","ID.50","ID.56","ID.57","ID.52","ID.55")]
## Method1: Calculate number of non-missing per row
df.twin.ID.wide$numb.twins <- rowSums(!is.na(df.twin.ID.wide.small))
## Method2: Calculate number of non-missing per row
apply(X=df.twin.ID.wide.small
,MARGIN= 1 # Margin=1 indicates rows, 2 indicates columns
,FUN=function(x) length(x[!is.na(x)]))
# Number of non-missing values in alcoholAgeFirst
length(na.omit(PRS_alcoho_tobacc[,"alcoholAgeFirst"]))
# Method 1: use tydyr::unite().Combine two columns phenotype and subSampleSize into one column named phenotype. This replaces original phenotype. Be aware that data attribute may be changed by tidyr package.
GCTAOut_part3 <- tidyr::unite(GCTAOut_part3
,phenotype # name of the newly created column
, c(phenotype, subSampleSize) # columns to combine
, sep = "_" # separator
, remove=TRUE # remove input columns
)
# Method 2: with(data,paste0(column1,"sep",column2,"sep2"))
GCTAOut_part3$phenotype <- with(GCTAOut_part3,paste0(phenotype,"_",subSampleSize))
grep()
,glob2rx()
and grep()
, or match(patterns,object)
glob2rx()
changes wildcard or globbing pattern into regular expression# 1 positive search pattern. Return values using value=TRUE
target_var_group2=grep("alcohol",unique(f5$phenotype), value = TRUE) # 6 variables in red group
# 1 positive search pattern with wildcards
patterns_to_search=glob2rx("SU_DSM*cannabis*")
target_var_group3=grep(pattern_to_search,unique(f5$phenotype), value = TRUE)
# 2 positive patterns: 1 with wildcard
patterns_to_search=glob2rx("GSCAN.*_r|ZYGOSITY")
variables_to_reshape=grep(names(df_twin),pattern=patterns_to_search,value = TRUE)
# 3 positive patterns. None of them have wildcards
three_patterns_to_search=c("A1", "A9", "A6")
match(three_patterns_to_search, myfile$Letter)
*stacking columns into 1 column in R
# Stack PRS for 7 p value thresholds to a single column
GSCAN.ai <- stack(PRS_pheno_uniqueID[1:7])
# Method 1: use tydyr::separate
## The data to work on
> genetic_corr
trait_exp h2_liab h2_liab_se rg se z p
1 GSCAN-SI 0.094 0.003 0.509 0.026 19.429 4.4169e-84
2 GSCAN-AI 0.048 0.003 -0.073 0.044 -1.637 1.0160e-01
3 GSCAN-CPD 0.074 0.010 -0.084 0.040 -2.071 3.8300e-02
4 GSCAN-SC 0.052 0.004 -0.146 0.045 -3.256 1.1000e-03
5 GSCAN-DPW 0.050 0.002 0.379 0.034 11.162 6.2520e-29
## Split trait_exp by delimiter "-" into 2 columns : consortium and trait
genetic_corr2 <- genetic_corr %>% tidyr::separate(trait_exp,c("consortium","trait_exp"),"-")
> genetic_corr2
consortium trait_exp h2_liab h2_liab_se rg se z p
1 GSCAN SI 0.094 0.003 0.509 0.026 19.429 4.4169e-84
2 GSCAN AI 0.048 0.003 -0.073 0.044 -1.637 1.0160e-01
3 GSCAN CPD 0.074 0.010 -0.084 0.040 -2.071 3.8300e-02
4 GSCAN SC 0.052 0.004 -0.146 0.045 -3.256 1.1000e-03
5 GSCAN DPW 0.050 0.002 0.379 0.034 11.162 6.2520e-29
# Method 2: a complicated one
## A column "name_fix_eff" contains text.A.S1, text.B.S2,...
## Split the column values by dot and add part 2 and part 3 as separate columns
f.all$name_fixEffect_trait <- as.data.frame(do.call("rbind",strsplit(f.all$name_fix_eff,"\\.")),stringsAsFactors =FALSE )[,2]
f.all$name_fixEffect_pThre <- as.data.frame(do.call("rbind",strsplit(f.all$name_fix_eff,"\\.")),stringsAsFactors =FALSE)[,3]
# Extract last 2 characters of the ID column
library(magrittr)
tem.uniqueID %<>% mutate(ID.suffix=substr(ID,nchar(ID)-2+1,nchar(ID))) # dim(tem.uniqueID) 2463 5
tidyr::separate
and deselecting unwated columns## Give an order of columns in the new data sets.
### Reorder columns so that "target_pheno_group" appears as 1st column
columns.order <- c("target_pheno_group","phenotype","name_fix_eff","name_fixEffect_trait","name_fixEffect_pThre","fix_eff_esti","SE","pvalue2sided","R2")
# Stack all the phenotypes per manu 2 for all sexes
## Extract 2nd, 3rd part of name_fix_eff columns as two new columns: name_fixEffect_trait, name_fixEffect_pThre
## Reorder columns as the order aforespecified
fix.eff.PRS.manu2.sexPRS.exclu.allSexes <- rbind(fix.eff.PRS.phenoGp2.sexPRS.exclu.allSexes
,fix.eff.PRS.phenoGp5.sexPRS.exclu.allSexes) %>%
# Extract 2nd and last part of a column split by separator dot, extracting the middle and last part
tidyr::separate(col=name_fix_eff
,into=c("part1","name_fixEffect_trait","name_fixEffect_pThre")
,remove=FALSE) %>%
dplyr::select(-part1) %>% # Deleting parts that are not needed
dplyr::select_(.dots=columns.order) # dim(fix.eff.PRS.manu2.sexPRS.exclu.allSexes) 880 9
# Find position of the first _ in a string
str_locate("pheno_SU_cannabis_abuse_onset","_")
start end
[1,] 6 6
# Step1: Create a character vector for custom sorting. Here are all column names of PRS
## Create vectors for ordering characters
discovery.trait.order <- c("si","ai","cpd","sc","dpw")
p.value.thres.order <- paste0("S",c(1:7))
## Create all column names of PRS
PRS.columns <- apply(expand.grid(x=paste0("GSCAN.",c("si","ai","cpd","sc","dpw"))
,y=paste0("S",c(1:7)))
,1
,paste
,collapse=".")
> PRS.columns
[1] "GSCAN.si.S1" "GSCAN.ai.S1" "GSCAN.cpd.S1" "GSCAN.sc.S1" "GSCAN.dpw.S1" "GSCAN.si.S2" "GSCAN.ai.S2" "GSCAN.cpd.S2"
[9] "GSCAN.sc.S2" "GSCAN.dpw.S2" "GSCAN.si.S3" "GSCAN.ai.S3" "GSCAN.cpd.S3" "GSCAN.sc.S3" "GSCAN.dpw.S3" "GSCAN.si.S4"
[17] "GSCAN.ai.S4" "GSCAN.cpd.S4" "GSCAN.sc.S4" "GSCAN.dpw.S4" "GSCAN.si.S5" "GSCAN.ai.S5" "GSCAN.cpd.S5" "GSCAN.sc.S5"
[25] "GSCAN.dpw.S5" "GSCAN.si.S6" "GSCAN.ai.S6" "GSCAN.cpd.S6" "GSCAN.sc.S6" "GSCAN.dpw.S6" "GSCAN.si.S7" "GSCAN.ai.S7"
[33] "GSCAN.cpd.S7" "GSCAN.sc.S7" "GSCAN.dpw.S7"
# Step2: Convert PRS column vector to data.frame
PRS.columns.df <- data.frame(PRS.column= PRS.columns, stringsAsFactors = F)
# Step3: Split PRS column into 3 multiple columns by delimiter "."
PRS.columns.df2 <- PRS.columns.df %>% tidyr::separate(PRS.column,c("consortium","discovery.trait","p.value.thres")
,sep="\\."
,remove=F )
# Step4: Convert character columns that will be custom sorted to factor with desired order as their levels
PRS.columns.df2$discovery.trait <- factor(PRS.columns.df2$discovery.trait,levels=discovery.trait.order)
# Step5: Reorder/sort PRS column by two newly created columns
PRS.columns.df2.ordered <- PRS.columns.df2[
with(PRS.columns.df2, order(discovery.trait,p.value.thres)),
]
# Step6: Use the ordered column as a vector
> PRS.columns.df2.ordered$PRS.column
[1] "GSCAN.si.S1" "GSCAN.si.S2" "GSCAN.si.S3" "GSCAN.si.S4" "GSCAN.si.S5" "GSCAN.si.S6" "GSCAN.si.S7" "GSCAN.ai.S1"
[9] "GSCAN.ai.S2" "GSCAN.ai.S3" "GSCAN.ai.S4" "GSCAN.ai.S5" "GSCAN.ai.S6" "GSCAN.ai.S7" "GSCAN.cpd.S1" "GSCAN.cpd.S2"
[17] "GSCAN.cpd.S3" "GSCAN.cpd.S4" "GSCAN.cpd.S5" "GSCAN.cpd.S6" "GSCAN.cpd.S7" "GSCAN.sc.S1" "GSCAN.sc.S2" "GSCAN.sc.S3"
[25] "GSCAN.sc.S4" "GSCAN.sc.S5" "GSCAN.sc.S6" "GSCAN.sc.S7" "GSCAN.dpw.S1" "GSCAN.dpw.S2" "GSCAN.dpw.S3" "GSCAN.dpw.S4"
[33] "GSCAN.dpw.S5" "GSCAN.dpw.S6" "GSCAN.dpw.S7"
order(match(have,want))
*How to sort a character vector according to a specific order?
# A character vector to sort
x <- c("white","white","blue","green","red","blue","red")
# Your preferred order
y <- c("red","white","blue","green")
# Sort the character vector in your preferred order
x[order(match(x, y))]
# [1] "red" "red" "white" "white" "blue" "blue" "green"
# Example 1: round numeric to 3 decimal places
# The data to round
> tmp
trait_exp h2_liab h2_liab_se rg se.x z p.x bxy se.y p.y n_snps
1 UKB_CCPD 0.042 0.005 0.037 0.040 0.928 3.5320e-01 -0.16404600 0.03875430 2.30628e-05 36
2 UKB_CPD 0.007 0.002 -0.215 0.088 -2.451 1.4200e-02 -0.01694180 0.00766063 2.69981e-02 5
3 UKB_ESDPW 0.062 0.003 0.383 0.040 9.532 1.5417e-21 -0.00455798 0.00475730 3.38011e-01 41
4 UKB_PYOS 0.052 0.003 0.394 0.035 11.182 5.0067e-29 0.01481260 0.00694742 3.29983e-02 19
# Round numeric values to 3 decimal places. Don't round p values
round_numeric_3decimal <-function(x) round(x,digits = 3)
format_p_values <- function(x) format.pval(x,digits = 2,eps = 0.001,nsmall = 3)
# Test your functions
round_numeric_3decimal(1.2345)
format_p_values(0.0071)
# Round numeric values and format p values. Here columns are processed in the order we want
tmp <-cbind(tmp[c("trait_exp","h2_liab","h2_liab_se","rg","se.x","z")]
,lapply(tmp[c("p.x")],format_p_values)
,tmp[c("n_snps")]
,lapply(tmp[c("bxy","se.y")],round_numeric_3decimal)
,lapply(tmp[c("p.y")],format_p_values)
)
# Example 2: Format wanted numeric columns to three decimal places
tmp2$b_3decimal <- format(round(tmp2$b,3),nsmall=3)
tmp2$se_3decimal <- format(round(tmp2$se,3),nsmall=3)
tmp2$b_se_formatted <- with(tmp2,paste0(b_3decimal," (",se_3decimal,")"))
# digits : number of digits, but after the 0.0
# eps = the threshold value above wich the. Function will replace the pvalue by "<0.0xxx"
# nsmall = how much tails 0 to keep if digits of original value < to digits defined
tmp2$pval_3decimal <- format.pval(tmp2$pval
,digits = 2
,eps = 0.001
,nsmall = 3)
# "0.990" "0.200" "0.323" "<0.001" "0.002"
# Example 1
attach(tmp2)
tmp2 <- tmp2[order(exposure,SNP,-method),]
detach(tmp2)
# Example 2
# Order data by ascending outcome substance, outcome, exposure and descending type.analysis. This approach sorts rows by columns in a descending order, those character columns in ascending order are specified with -rank()
t.ordered <- t[order( -rank(t$outcome.substance)
,-rank(t$outcome)
,-rank(t$exposure)
,rank(t$type.analysis)
,decreasing = TRUE),]
# The source2 founction
source2 <- function(file, start, end, ...) {
file.lines <- scan(file, what=character(), skip=start-1, nlines=end-start+1, sep='\n')
file.lines.collapsed <- paste(file.lines, collapse='\n')
source(textConnection(file.lines.collapsed), ...)
}
# Run the function. start and end must cover a complete code chunk
filePathScript= paste0(locScripts,"PRS_UKB_201711_step18-01-01_adjust-significance-threshold-for-multiple-testing-in-a-single-data-set_pheno-group2-5.R")
source2(filePathScript,214,280)
# The problem: the phenotype column is ordered as everDrug1, everDrug10, everDrug2... rather than everDrug1, everDrug2... everDrug10
# list phenotype values as you want in the levels= option. The values should match phenotype values in the data to sort
f2$phenotype <- factor(f2$phenotype, levels = paste0("everDrug",c(1:10)))
# Order data by the phenotype column
f2.sorted <- f2[order(f2$phenotype),]
which(is.na(matrix_name),arr.ind=T)
Get positions for NAs only in the “middle” of a matrix column
# Show location of the NAs
which(is.na(f.all.RSquare.wide.matrix), arr.ind = T)
# row col
# DSM5 CUD ctrl vs cases 28 16
which(is.na(f.all.pValue.wide.matrix),arr.ind = T)
# row col
# DSM5 CUD ctrl vs cases 28 16
## Replace NA with 0 for R2, NA with 0.999 for p values
f.all.RSquare.wide.matrix[is.na(f.all.RSquare.wide.matrix) ] <- 0
f.all.pValue.wide.matrix[is.na(f.all.pValue.wide.matrix)] <- 0.999
library(plyr)
# Convert a matrix to a data.frame keeping its dimnames
f.all.percentageExplained.wide.df <- adply(f.all.percentageExplained.wide.matrix, 1, c)
matrix(as.matrix(DF))
. dimnames=
specifies the rownames and colnames of the matrix.matrix(as.matrix(f.all.pValue.wide[,c(2:41)])
,nrow = nrow(f.all.pValue.wide)
,ncol=ncol(f.all.pValue.wide[,c(2:41)])
,dimnames = list(label_phenotypes,short_colnames_pValue))
reshape()
. Limitation is only 1 timevar can be used. Use method 2 if multiple timevar# Method 1: use reshape. Limitation is only 1 timevar can be used. Use method 2 if multiple timevar
## v.names = a vector of measure variables
## idvar = a vector of variables that will form the row dimension of the transposed table
## timevar= variable whose values are to reshape to wide, forming the column dimension of the transposed table
## The newly created columns will be "v.names.timevar" (4 measure variables * 5 unique values of name_fixEffect_trait)
measure_variables=c("fix_eff_estimate","fix_eff_SE","pvalue2sided","R2") # multiple measure variables
f_test <- reshape(f.everDrug_AU_CU # data to reshape
,v.names = measure_variables # measure variables
,idvar = c("target_pheno_group","phenotype","name_fixEffect_pThre")
,timevar = "name_fixEffect_trait"
,direction = "wide")
reshape2::dcast()
. Limitation: when there are duplicate variables, dcast() applies its default function length to your data (e.g. when your value.var takes a character column, but the reshaped values will become 1 or 0). To overcome this limitation, you will create another new column that uniquely identifies the combinations of the ~ variables. Function arguments:# Reshape contingency table data to one row per variable with multiple timevar columns by dcast()
append1_wide <- reshape2::dcast(append1, variable ~ wave+ value , value.var = "Freq", drop = FALSE)
# Avoid dcast() applies fun.aggregate=length to reshaped values sepecifeid by value.var= when the values are character
## Make a data containing duplicates (i.e. variables id and cat, used by ~ formula, don't give unique combinations. See row 6 and 7 of the following dataset df2)
df2 <- structure(list(id = c("A", "B", "C", "A", "B", "C", "C"), cat = c("SS",
"SS", "SS", "SV", "SV", "SV", "SV"), val = c(220L, 222L, 223L,
224L, 225L, 220L, 1L)), .Names = c("id", "cat", "val"), class = "data.frame", row.names = c(NA,
-7L))
> tail(df2)
id cat val
2 B SS 222
3 C SS 223
4 A SV 224
5 B SV 225
6 C SV 220
7 C SV 1
library(reshape2)
library(plyr)
# Add a variable for how many times the id*cat combination has occured
## seq_along(cat) is to incrementally counting the occurence of cat. When cat has duplicates, it counts them incrementally (see row 7 of tmp)
tmp <- ddply(df2, .(id, cat), transform, newid = paste(id, seq_along(cat)))
> tail(tmp)
id cat val newid
2 A SV 224 A 1
3 B SS 222 B 1
4 B SV 225 B 1
5 C SS 223 C 1
6 C SV 220 C 1
7 C SV 1 C 2
## Aggregate using this newid and toss in the id so we don't lose it
out <- dcast(tmp, id + newid ~ cat, value.var = "val")
## Remove newid if we want
out <- out[,-which(colnames(out) == "newid")]
> out
# id SS SV
#1 A 220 224
#2 B 222 225
#3 C 223 220
#4 C NA 1
## step0: the table to reshape
str(f.everDrug_AU_CU) # 1040 obs. of 8 variables
## step1: Reshape measure variables from wide to long, reshaping them into 2 new columns
### (1) "variable" contains the variable name of the measure variable and
### (2) "value" contains the values of the measure variables
### melt(id.vars=non-measure-variables)
df<-f.everDrug_AU_CU %>% reshape2::melt(id.vars=c(1,2,3,4)) # 4160 obs. of 6 variables
## step2: Create a column for the column heading of the transposed table
### This heading column combines the variable to transpose into wide format and the names of the measure variables
df$column_heading<-paste0(df$name_fixEffect_trait,"_",df$variable)
## step3: Remove the two old columns
df$name_fixEffect_trait<-df$variable<-NULL
## step4: Transpose data into wide format
### spread(data, key, value)
#### key: The bare (unquoted) name of the column whose values will be used as column headings
#### value: The bare (unquoted) name of the column whose values will populate the cells
dd <- df %>% spread(column_heading,value)
## step5: Check the tranposed table
str(dd) # Classes ‘data.table’ and 'data.frame': 208 obs. of 23 variables
tidyr::spread()
The trick is to create a dummy column containing unique number representing rows of the reshaped data using gather()
unite()
group_by
and mutate()
and then reshape data using the dummy column as the key column, and eventually drop this dummy column# A long-format data to reshape to one row per month
jj <- data.frame(month=rep(1:3,4),
student=rep(c("Amy", "Bob"), each=6),
A=c(9, 7, 6, 8, 6, 9, 3, 2, 1, 5, 6, 5),
B=c(6, 7, 8, 5, 6, 7, 5, 4, 6, 3, 1, 5)) # dim(jj) 12 4
# Data structure of this data
## Non-measurement variables: month, student
## Measurement variables: A, B
str(jj)
'data.frame': 12 obs. of 4 variables:
$ month : int 1 2 3 1 2 3 1 2 3 1 ...
$ student: Factor w/ 2 levels "Amy","Bob": 1 1 1 1 1 1 2 2 2 2 ...
$ A : num 9 7 6 8 6 9 3 2 1 5 ...
$ B : num 6 7 8 5 6 7 5 4 6 3 ...
# Reshape data to wide-format
## gather() : Collapse measurement variables into two new columns variable and value. Simply add all the non-measurement variables within -(var1:varN). To use the range you must order columns so that columns are non-measurement then measurement
## unite(): Combine non-measurement variables that don't form the row dimension of the reshaped data to a temporary variable
jj.wide <- jj %>%
gather(key="variable", value="value", -(month:student)) %>%
unite(temp, student, variable) %>%
group_by(temp) %>%
mutate(id=1:n()) %>%
spread(temp, value) %>%
select(-id) # dim(jj.wide) 6 5
# Data structure of the newly reshaped data
## The row structure is 1 row per month which contains duplicates
str(jj.wide)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 6 obs. of 5 variables:
$ month: int 1 1 2 2 3 3
$ Amy_A: num 9 8 7 6 6 9
$ Amy_B: num 6 5 7 6 8 7
$ Bob_A: num 3 5 2 6 1 5
$ Bob_B: num 5 3 4 1 6 5
# Spread the levels of Tissue.Category (1 column) to 1 column per level (3 columns), resulting in three sets of columns (Tumor:,Stroma:, total: )
summary.Exp26.wide <- tidyr::pivot_wider( data=summary.Exp26
,id_cols = c(1) # Columns that form the row dimension
,names_from = c(2) # Columns to pivot wider
,names_sep = "_"
# Measurement variables, the data portion
,values_from = c(3:6))
# Spread Duplicate column, resulting in two sets of columns (:RBWH_A , :RBWH_B)
Exp08.survival.markers.wide <- tidyr::pivot_wider( data=Exp08.survival.markers
,id_cols = c(shortID, coreID)
,names_from = Duplicate # Columns to collapse into column dimension
,names_sep = "_"
,values_from = c(6,7,9:29)) # Measurement variables (data portion)
mutate(new.var=case_when)
Values of new variable are specified on the right of ~# Example 1: Conditionally create a new column method.abbreviated with values "Egger", "WM" and "IVW". Note the use of `TRUE ~ as.character(method)`
append_MR_files_selective_methods <- append_MR_files %>%
dplyr::filter(method %in% c("MR Egger","Weighted median","Inverse variance weighted")) %>%
dplyr::mutate(method.abbreviated= case_when(method=="MR Egger" ~ "Egger"
,method=="Weighted median" ~ "WM"
,method=="Inverse variance weighted" ~ "IVW"
,TRUE ~ as.character(method)))
# Example 1 that won't work
append_MR_files_selective_methods <- append_MR_files %>%
dplyr::filter(method %in% c("MR Egger","Weighted median","Inverse variance weighted")) %>%
dplyr::mutate(method.abbreviated= case_when(method=="MR Egger" ~ "Egger"
,method=="Weighted median" ~ "WM"
,method=="Inverse variance weighted" ~ "IVW"))
# Example 2: Conditionally create V5 based on the values of V1, V2,V3 and V4. V5 values are 1,2 or 0
library(dplyr)
myfile %>%
mutate(V5 = case_when(V1 == 1 & V2 != 4 ~ 1,
V2 == 4 & V3 != 1 ~ 2,
TRUE ~ 0))
# Example 3 working: recode values <=0 as numeric NA, else as original numeric values.
df <- df %>% mutate(new = case_when(old == 1 ~ 5,
old == 2 ~ NA_real_,
TRUE ~ as.numeric(old)))
*onditional filter of grouped factors - dplyr
# Condiitonally subset rows based on the value of the column pval.y (p value for horizontal pleiotropy)
## if pval.y < 0.05, keep rows with method= "MR Egger" or "Weighted median" (i.e. delete rows with method !%in% c("MR Egger","Weighted median") when pval.y >= 0.05)
## Do nothing when pval.y >= 0.05
## The condition to check is by a new variable "check"
OR_horiPleio_keep <- OR_horiPleio %>%
group_by(id.exposure,id.outcome) %>%
mutate(check= pval.y <0.05) %>%
filter(!(check==TRUE & method %in% c("MR Egger","Weighted median")))
# Example 1
# Exclude 2 rows with SNPs rs2472297 and rs4410790. Note that ! is used in the beginning of the condition
library("dplyr",lib.loc="/software/R/R-3.4.1/lib64/R/library")
subset(harmonised.UKBCCPD.ICCCI.5e8, !SNP %in% c("rs2472297","rs4410790")) # dim(harmonised.UKBCCPD.ICCCI.5e8.rm2SNPs)
harmonised.UKBCCPD.ICCCI.5e8 %>% dplyr::filter(!SNP %in% c("rs2472297","rs4410790"))
# Example 2
library(dplyr)
library(tidyr)
# Excluding values in phenotype containing "onset"
f5 <- read.table(file = paste0(input_pheno5,input_file5),header = T, sep=",", stringsAsFactors = F) %>%
filter(!grepl("onset",phenotype))
*How to drop columns by name pattern in R?
# Drop column names that end with "S8"
PRS_pheno_gp2=PRS_pheno_gp2[,-grep("S8", colnames(PRS_pheno_gp2))]
setdiff()
# Vertically combine all 3 sex groups as 1 data set
temp <- rbind(get(paste0(prefix.out.file.name.manu2,".allSexes"))
,get(paste0(prefix.out.file.name.manu2,".males"))
,get(paste0(prefix.out.file.name.manu2,".females")))
# Recorder columns and sort rows for tables in SAS
## Put these columns to first in a flexible way by setdiff()
columns.to.first <- c("phenotype","phenotype.order.num","var.label","sex.group")
temp <- temp[,c(columns.to.first, setdiff(names(temp),columns.to.first))]
dplyr::select_()
.library(dplyr)
library(tidyr)
# Example 1
# Reorder columns
columns_want_in_this_order=c("FID","IID","missing","missing","batch","kinship","exclude_kinship","excess_relative","age","sex","white.British","X3456.0.0","X3456.1.0","X3456.2.0","X3456.0.0_recode","X3456.1.0_recode","X3456.2.0_recode","X3456_mean")
pheno_ukb3456_NA20453_ordered <- pheno_ukb3456_NA20453 %>% select_(.dots=columns_want_in_this_order)
# Example 2
columns_to_select=c("phenotype","covarPheno","covariate","fix_eff","pvalue2sided","R2")
f2 <- read.table(file = paste0(input_pheno2,input_file2),header = T, sep=",", stringsAsFactors = F) %>%
filter(str_detect(covarPheno,"GSCAN")) %>% # rows reduced from 15800 to 8000
select_(.dots=columns_to_select)
filter(str_detect(variable,"A|B|C"))
. str_detect() accepts length-1 pattern# Subset rows from phenotype is "tobacco" or tobaccoFreq and rowNames is GSCAN.si.*
f3 %>%
filter(str_detect(phenotype, "tobaccoEver|tobaccoFreq")) %>%
filter(str_detect(rowNames,"GSCAN.si"))
# load packages
library(dplyr)
library(stringr)
# Import CSV data files
## Subset rows from covarPheno==GSCAN.*
f2=read.table(file = paste0(input_pheno2,input_file2),header = T, sep=",", stringsAsFactors = F) %>% filter(str_detect(covarPheno,"GSCAN")) # rows reduced from 15800 to 8000
## Subset ID (column 2) and PRS columns (names with prefix GSCAN.)
PRS_pheno_gp4= subset(data_gp4, select = c(2, grep("GSCAN\\.", names(data_gp4))))
library(dplyr)
target <- c("Tom", "Lynn")
filter(dat, name %in% target) # equivalently, dat %>% filter(name %in% target)
# Method 1
by(file$PVALUE<5e-8,file$CHROM,sum )
# Method 2: %>% filter() %>% group_by() %>% summarise(newColumnName=n())
## Be aware of conflict in same-named function summarise() of package dplyr and plyr
z2 <- file %>%
filter(PVALUE<5e-8) %>%
group_by(CHROM) %>% # multiple group columns
dplyr::summarise(numSNPMetPThres= n())
# Example: Multiple functions in the summarise()
testData <- read.table(file = "clipboard",header = TRUE)
require(dplyr)
testData %>%
group_by(Y) %>%
summarise(total = sum(income),freq = n()) %>%
filter(freq > 3)
# Strings to subset
"GSCAN_ai_S1_r_01" "GSCAN_ai_S1_r_02"
# Delete the ending part to get the part we want
string=c("GSCAN_ai_S1_r_01","GSCAN_ai_S1_r_02")
gsub(string,pattern="_01|_02",replacement="")
#[1] "GSCAN_ai_S1_r" "GSCAN_ai_S1_r"
# A data.frame has five columns
str(z_df)
'data.frame': 880 obs. of 5 variables:
$ metaData : chr "HRCr1.1" "HRCr1.1" "HRCr1.1" "HRCr1.1" ...
$ trait : chr "ai" "ai" "ai" "ai" ...
$ CHR : num 10 10 10 10 10 10 10 10 11 11 ...
$ pThreshold : num 5e-08 1e-05 1e-03 1e-02 5e-02 1e-01 5e-01 1e+00 5e-08 1e-05 ...
$ num_SNP_met_p: num 0 4 109 684 2557 ...
# Sum up num_SNP_met_p across 22 autosomes
library(dplyr)
z2 <- z_df %>%
group_by(metaData,trait,pThreshold) %>% # multiple group columns
summarise(sum= sum(num_SNP_met_p)) # multiple summary columns
lapply()
# Factor columns to numeric
## Example 1
> str(GSMR)
'data.frame': 4 obs. of 5 variables:
$ Exposure: Factor w/ 5 levels "UKB_CCPD","UKB_CPD",..: 2 1 3 4
$ n_snps : Factor w/ 5 levels "19","36","41",..: 4 2 3 1
$ bxy : Factor w/ 5 levels "-0.00455798",..: 2 3 1 4
$ se : Factor w/ 5 levels "0.0047573","0.00694742",..: 3 4 1 2
$ p : Factor w/ 5 levels "0.0269981","0.0329983",..: 1 4 3 2
## Convert column 2 to 5 to numeric
GSMR[,c(2:5)] <- lapply(GSMR[,c(2:5)], function(x) as.numeric(levels(x))[x])
> str(GSMR)
'data.frame': 4 obs. of 5 variables:
$ Exposure: Factor w/ 5 levels "UKB_CCPD","UKB_CPD",..: 2 1 3 4
$ n_snps : num 5 36 41 19
$ bxy : num -0.01694 -0.16405 -0.00456 0.01481
$ se : num 0.00766 0.03875 0.00476 0.00695
$ p : num 2.70e-02 2.31e-05 3.38e-01 3.30e-02
## Example 2: (not always working) Convert column 3 to 5 from character to numeric
z_df[,3:5]<-lapply(z_df[,3:5],as.numeric)
#Convert column 3 to 5 from factor to character
data[,1:3]<-lapply(data[,1:3],as.character)
# Convert all columns to character
data[] <-lapply(data,as.character)
# Convert all factor columns to character
datHar_CI_UKBCCPD[] <- lapply(datHar_CI_UKBCCPD, function(x) if(is.factor(x)) as.character(x) else x)
# Read an Excel file
dim(.BRAF.20181218) 183 49
# Get column names of all date columns ignoring case
BRAF.20181218.date.colnames <- grep(pattern="date", x=colnames(.BRAF.20181218), ignore.case = TRUE, value = TRUE)
# Assign data types to columns
BRAF.20181218.data.types <- ifelse(colnames(.BRAF.20181218) %in% BRAF.20181218.date.colnames, "date", "text")
# Read the excel file using the data types
BRAF.20181218 <- readxl::read_excel( path = filepath.source.clinical.data.BRAF.20181218
,sheet = "PRE_clindata"
,na=c(" ")
,n_max = 184
,guess_max =184
,col_types = BRAF.20181218.data.types) # dim(.BRAF.20181218) 183 49
# Change POSIXct to Date
BRAF.20181218[,BRAF.20181218.date.colnames] <- lapply(BRAF.20181218[, BRAF.20181218.date.colnames]
, function(x) as.Date(x, origin="1899-12-30"))
foreach()
loops are used for their return value, like lapply. In this way they are very different from for loops which are used for their side effects. By using the appropriate .combine functions, the inner foreach loop can return vectors which are combined row-wise into a matrix by the outer foreach loop. Notice that packages used in the inner foreach loop need to be called in the outer foreach loop# The following matrix x has 10*5 rows
x <- foreach(i=1:10, .combine='rbind', .packages=c("foreach","dplyr")) %dopar% {
foreach(j=1:5, .combine='rbind') %do% { }}
# The following matrix x has 10 rows. The result from the inner foreach loop is put together horizontally, probably not what you want
x <- foreach(i=1:10, .combine='rbind', .packages="foreach") %dopar% {
foreach(j=1:5, .combine='c') %do% { }}
# this subset 40 rows, which is correct.
significant <- subset(data, phenotype %in% c("everDrug1","everDrug3","everDrug6","everDrug7","everDrug8") & covarPheno=="GSCAN.si", select=c("phenotype","covariate","pvalue2sided","R2") )
# This subset just 8 rows. But where is wrong?
#significant <- data[data$phenotype==c("everDrug1","everDrug3","everDrug6","everDrug7","everDrug8") & data$covarPheno=="GSCAN.si",c("phenotype","covariate","pvalue2sided","R2")]
*Creating a new column to a data frame using a formula from another variable
# Perform calculation based on multiple columns
## syntax: data$newVar <- with(data, expression)
leftOuterJoin$twoPQBeta2 <- with(leftOuterJoin, 2 * ALT_FREQS * (1-ALT_FREQS) * beta * beta)
*removing duplicate units from data frame
# rbind PRS per group 2 and 5.
PRS_pheno <- rbind(PRS_pheno_gp2,PRS_pheno_gp5) # 4790 obs, 41 variables
## Remove duplicate rows based on the first column "ID"
PRS_pheno_uniqueID <-PRS_pheno[!duplicated(PRS_pheno[,1]),] # 2463 obs
# Make a data set
dat <- data.frame( ID = c("A","A","B","B","C","D"), val = 1:6 )
> dat
ID val
1 A 1
2 A 2
3 B 3
4 B 4
5 C 5
6 D 6
# Remove rows with column ID that occurs more than once
dat[which(is.na(match(dat$ID, dat$ID[duplicated(dat$ID)]))),]
ID val
5 C 5
6 D 6
# File rows Explantion
#-----------------------------
# left 594186 raw data
# left1 594046 left with non rs number ID removed
# left2 677 Number of rows with ID occurs more than once in left1
# left3 593369 remove left2 from left1
#-----------------------------
left=read.table(file=paste0(locGWAS_UKB20116,fileNameStart,".X20116_recodeFinal.glm.logistic")
,header = TRUE
,sep="\t"
,stringsAsFactors=F
,comment.char = "!") %>% select(c(X.CHROM,POS,ID,ALT,OR,P))
# Remove non-rs number variants
left1=left[grep("^rs",left$ID),]
#subset all rows that have a value in column ID occur more than once in the dataset.
library(dplyr)
left2 <- left1 %>% group_by(ID) %>% filter(n()>1)
# Remove all occurrences of duplicates
left3 <- left1 %>% group_by(ID) %>% filter(n()==1)
dplyr::left_join()
to merge()
because it keeps the order of merging key of the joined file the same as the left tablelibrary(dplyr)
library(magrittr)
# testing on UKB 20116, chr1
# Import data files
fileNameStart="X20116_recodeFinal-HRC.chr1.plink2.output"
# Note #CHROM causes 1st line to be considered as a comment
# File rows Explantion
#-----------------------------------------------------------------------------
# left 594186 raw data 1
# left1 594046 left with non rs number ID removed
# left2 677 Number of rows with ID occurs more than once in left1
# left3 593369 remove left2 from left1
# right 3074089 raw data 2
# right1 2845136 right with non rs number ID removed
# right2 21704 Number of rows with ID occurs more than once in right1
# right3 2823432 remove right2 from right1
# leftOuterJoin 593369 left outer join left3 and right3
#-----------------------------------------------------------------------------
left=read.table(file=paste0(locGWAS_UKB20116,fileNameStart,".X20116_recodeFinal.glm.logistic")
,header = TRUE
,sep="\t"
,stringsAsFactors=F
,comment.char = "!") %>% select(c(X.CHROM,POS,ID,ALT,OR,P))
# Remove non-rs number variants
left1=left[grep("^rs",left$ID),]
#subset all rows that have a value in column ID occur more than once in the dataset.
library(dplyr)
left2 <- left1 %>% group_by(ID) %>% filter(n()>1)
# Remove all occurrences of duplicates
left3 <- left1 %>% group_by(ID) %>% filter(n()==1)
# Note #CHROM causes 1st line to be considered as a comment
right=read.table(file=paste0(locGWAS_UKB20116,fileNameStart,".afreq")
,header = TRUE
,sep="\t"
,stringsAsFactors=F
,comment.char = "!") %>% select(c(ID,ALT,ALT_FREQS))
# Remove non-rs number variants
right1=right[grep("^rs",right$ID),]
#subset all rows that have a value in column ID occur more than once in the dataset.
right2= right1 %>% group_by(ID) %>% filter(n()>1)
# Remove all occurrences of duplicates
right3= right1 %>% group_by(ID) %>% filter(n()==1)
# Left join left3 (left table) and right3 (right table) with merging key=ID (rs number)
leftOuterJoin=merge(x= left3, y=right3, by= "ID", all.x=TRUE)
# LeftJoin plot data (allSuplotData, left table) and plot colors (subplotInfoSource, right table)
## keep columns of right table to those wanted
subplotInfoSource_small=subplotInfoSource[,c("PRSPheno","color","color_pale")]
leftJoin=merge(allSuplotData
,subplotInfoSource_small
,by.x = "covarPheno"
,by.y = "PRSPheno"
,all.x=TRUE)
rs
in a data.frame*Using grep to help subset a data frame in R
library(dplyr)
library(magrittr)
# Import data files
fileNameStart="X20116_recodeFinal-HRC.chr1.plink2.output"
# Note #CHROM causes 1st line to be considered as a comment
# N=594186
left=read.table(file=paste0(locGWAS_UKB20116,fileNameStart,".X20116_recodeFinal.glm.logistic")
,header = TRUE
,sep="\t"
,stringsAsFactors=F
,comment.char = "!") %>% select(c(X.CHROM,POS,ID,ALT,OR,P))
# Remove non-rs number variants (i.e. subset ID that start with 'rs')
## N= 594046
left1=left[grep("^rs",left$ID),]
*How to Sort a Data Frame by Multiple Columns in R
# Sort the data.frame base by column group_keyword1, group_keyword2, group_keyword3, and group_keyword4
## the with() generates the index order. This effectively creates a new environment using the passed in data in the first argument along with an expression for evaluating that data in the second argument.
base[
with(base, order(group_keyword1,group_keyword2,group_keyword3,group_keyword4)),
]
!is.na(data[,"columnA"])
# Suppose we are looping through multiple columns in the data set. The first column is everDrug1
variable="everDrug1"
variables_to_select=c(variable,"age","nSEX")
# Based on 1 column, Method 1: use subset() function
data_age_sex= subset(data,(!is.na(data[,variable])),select=variables_to_select)
# Based on 1 column, Method 2: use dplyr::filter
data_age_sex <- data %>%
filter_at(vars(variable),any_vars(!is.na(.)))
# Based on 2 columns
Q2 <- subset(PRS_GSCAN_pheno,(!is.na(PRS_GSCAN_pheno[,"GSCAN_Q2_recode"])) & (!is.na(PRS_GSCAN_pheno[,"age"]))) # 12441 obs. of 68 variables
list(df1,df2,df3...) %>% reduce(left_join, by = c("key1","key2"..))
# Left join data set across all 3 sex groups, each has 72 rows, 7 columns
library(tidyverse)
rP.GSCAN.PRS.targ.pheno.gp2.wide <- list(rP.GSCAN.PRS.targ.pheno.gp2.wide.allSexes
,rP.GSCAN.PRS.targ.pheno.gp2.wide.females
,rP.GSCAN.PRS.targ.pheno.gp2.wide.males) %>% reduce(left_join, by = c("Var1","p.value.threshold")) # dim(rP.GSCAN.PRS.targ.pheno.gp2.wide) 72 17
*join_all: Recursively join a list of data frames
PRS_UKB_201711_step11-01_inner-join_summedPRS-across-all-traits_PCs_impCov.R
library(plyr)
# Read in 5 files from each of the 16 groups (80 input files)
count=0
for (a in 1:length(unique(base$group_keyword1))){
for (b in 1:length(unique(base$group_keyword2))){
for (c in 1:length(unique(base$group_keyword3))){
for (d in 1:length(unique(base$group_keyword4))){
aa=unique(base$group_keyword1)[a]
bb=unique(base$group_keyword2)[b]
cc=unique(base$group_keyword3)[c]
dd=unique(base$group_keyword4)[d]
count=count+1;
print(paste("aa=",aa,"bb=",bb,"cc=",cc,"dd=",dd,sep = " "))
# Subset the paths of 5 input files from each of the 16 groups and the output folders where the joined files are exported
per_group_info= base[base$group_keyword1==aa & base$group_keyword2== bb & base$group_keyword3== cc & base$group_keyword4== dd, c("inputFilePath","outputFolderPath")]
# Read in the 5 files
## Drop column FID, PHENO from file2-file5 using %>% select(-)
per_group_input <- as.character(per_group_info$inputFilePath)
file1= read.table(per_group_input[1],header = T, stringsAsFactors = F)
file2= read.table(per_group_input[2],header = T, stringsAsFactors = F) %>% select(-c(FID,PHENO))
file3= read.table(per_group_input[3],header = T, stringsAsFactors = F) %>% select(-c(FID,PHENO))
file4= read.table(per_group_input[4],header = T, stringsAsFactors = F) %>% select(-c(FID,PHENO))
file5= read.table(per_group_input[5],header = T, stringsAsFactors = F) %>% select(-c(FID,PHENO))
# Recursively join a list of the 5 data.frames as a single data.frame using join_all
## join_all uses same-named column "IID" as merging key
dfs <- list(file1,file2,file3,file4,file5)
joined <- join_all(dfs,by=c("IID"),type = "inner") # 43 columns
# Merge joined file with pc_file
joined_pc= merge.data.frame(joined,pc_file,by.x = "IID",by.y = "INDID") # 53 columns
# Merge joined_pc and impCov_file
joined_pc_impCov= merge.data.frame(joined_pc,impCov_file,by.x = "IID", by.y = "V2") # 54 columns
# Change colname from V6 to impCov
colnames(joined_pc_impCov)[54] <- "impCov"
# Change colname from IID to ID, as at step11-02 GWAS-ID-remapper expects ID as $1
colnames(joined_pc_impCov)[1] <- "ID"
# Remove old files. Commented out as it has been done
#file.remove(paste0(outputFolderPath,"/","summed-PRS_S1-S8_all-pheno"))
## Export the joined_pc_impCov files
outputFolderPath <- as.character(unique(per_group_info$outputFolderPath)) # output folder path
outputFilePath <- paste0(outputFolderPath,"/","summed-PRS_S1-S8_all-pheno_10PCs_impCov")
write.table(joined_pc_impCov # object name the file to export
,col.names=T # keep column names
,row.names = F # remove row number
,file=outputFilePath
,dec="."
,sep=" "
,quote=FALSE
,na = "NA" ) # mark missing values as NA
print(paste0("================================= iteration", count, "========================="))
}
}
}
}
# merging key is IID in file x, INDID in file y
merge.data.frame(joined,pc_file,by.x = "IID",by.y = "INDID")
by.x
and by.y
. To prevent this default, skip by= if the merging key is same-named in both data sets.# Merging key is ID in both PRS_dir_1_standardised and PRS_dir_5_standardised
PRS_dir_1_5=merge(PRS_dir_1_standardised,PRS_dir_5_standardised)
*How to replace the “.” in column names generated by read.csv() with a single space when exporting?
# Read PRS files into a temporary data.frame called data, one file at a time. Unfortunately, R changes dashes to dots in the colname
data=read.table(current_PRS_file_path,header = T,sep=" ")
# A simple regular expression to replace dots with dashes. This might have unintended consequences, so be sure to check the results. Need to escape . with \\
names(data) <- gsub(x = names(data),pattern = "\\.",replacement = "-")
*R-friendly way to convert R data.frame column to a vector?
# Read file paths of PRS files, which were created in previous step. The read file becomes a one-column data.frame with no colname
fileList_PRS=read.table(paste0(locASCOut,"filePath_summedPRS_10PCs_impCov_IDremapped_GSCAN_UKB")
,header=FALSE
,stringsAsFactors = FALSE)
# Convert the data.frame above to a vector
fileList_PRS_vector=fileList_PRS[,1]
dirGWAS="/mnt/lustre/working/lab_nickm/lunC/PRS_UKB_201711/GWASSummaryStatistics";
dirGSCAN=paste0(dirGWAS,"/GWAS_GSCAN/noUKBiobank_results")
fileGWASInfo="GWAS_info.csv";
GWASInfo=read.csv(file = paste0(dirGWAS,"/",fileGWASInfo)
,header = TRUE
,stringsAsFactors = FALSE)
# Minimize the info file to GWAS that will be used in the project
GWAS_to_use <- GWASInfo[GWASInfo$useOrNot=='Y',]
# Row-subset (on left of comma) and column-subset (on right of comma) data
# Extract GWAS measure description
GWASInfo[GWASInfo$useOrNot=='Y' & GWASInfo$dataSource=='UKB',c("dataSource","measureAbb","measureDescription")]
# Assign color to "color" for significant p values, and "color_pale" to non-significant p values
## Set significance level to 0.05
signi_level=0.05
## Convert columns to copy from factor to character
leftJoin[,c("color","color_pale")] <- lapply(leftJoin[,c("color","color_pale")],as.character)
## Conditionally copy color or color_pale, based on pvalue2sided
leftJoin$color_signi <- ifelse(leftJoin$pvalue2sided < signi_level, leftJoin$color,leftJoin$color_pale)
if(condition A){actions} else if(condition B){actions} and else
doesn't perform vectorization. Running condition A, not including its brackets must return a single TRUE or FALSE. If it returns one line of multiple TRUE and/or FLASE, then you will get an error the condition has length > 1 and only the first element will be used. Suppose the if else is constructed based on the value of a column "dataSource" of a data.frame "GWAS_to_use"# wrong if else. Running "GWAS_to_use$dataSource=='GSCAN'" returns multiple TRUE/FALSE
if(GWAS_to_use$dataSource=='GSCAN'){ actions}
else actions
# it's wrong becasue the if condition
# correct way to write if else. Running "dataSource=='GSCAN'" returns a single TRUE/FALSE
for (dir in 1:length(GWAS_to_use$directory)){
dataSource= GWAS_to_use$dataSource[dir]
# Extract GSCAN GWAS file path
if(dataSource=='GSCAN'){
||
for "or"&&
for "and"A==B
for comparing variable A with variable B# Syndax form
if (condition1) {
action1
} else {action2}
# An example with only 2 conditions maximally
if (i >= 401 & i <=480){
categoricalCovars=c("wave","nSEX","ImpCov")
} else {
categoricalCovars=c("wave","wave","nSEX","ImpCov")
}
# Syndax form
if (condition1) {
action1
} else if (condition2){
action2
} else {
action3
}
# An exmaple of 3 conditions to evaluate maximally
## Specify colname names that vary from input files. Note column names vary within and between consortia
## colname names that are the same in all input files are specified as fixed values in read_exposure_data().
if(consortium=='GSCAN' & trait_name=="dpw"){
colname_beta="BETA_linear"
colname_effect_allele="ALT"
colname_other_allele="REF"
colname_sample_size="N"
} else if (consortium=='GSCAN' & trait_name!="dpw") {
colname_beta="BETA"
colname_effect_allele="ALT"
colname_other_allele="REF"
colname_sample_size="N"
} else {
trait_name=gsub(unlist(strsplit(basename(filePath),"_"))[3],pattern = "-",replacement = "_")
colname_beta="BETA"
colname_effect_allele="ALLELE1"
colname_other_allele="ALLELE0"
colname_sample_size="sample_size"
}
|
for "or", which is different from ||
used by if()
'&' for "and", which is different from &&
used by if()
==
for comparing variables, which is similar to ==
used by if()
# Example 1: conditions evaluated against fixed values
# Set values to NA if old values are -3 or 0.
# Set values to 1 if old values are 2.
# Set values to 0 if old values are 1.
pheno_ukb20116$X20116.0.0_recode <- ifelse(pheno_ukb20116$X20116.0.0 %in% c(-3,0), NA
,ifelse(pheno_ukb20116$X20116.0.0==1,0
,1))
# Check if the recoding is done correctly
table(pheno_ukb20116$X20116.0.0_recode,pheno_ukb20116$X20116.0.0,useNA = "ifany") -3 0 1 2 <NA>
0 0 0 168254 0 0
1 0 0 0 51208 0
<NA> 1986 265431 0 0 530
# Example 2: conditions evaluated against a pattern
temp <- dat.manu3.sexPRS.inclu
# Simplify values in the "name_fix_eff" column
## values starting with GSCAN set to "PRS"
## values starting with sex_GSCAN set to "sexPRS"
## values sex remain unchanged
temp$name_fix_eff2 <-ifelse(grepl("^GSCAN.*.S[1-8]$",temp$name_fix_eff), "PRS"
,ifelse(grepl("^sex_GSCAN.*.S[1-8]$",temp$name_fix_eff),"sexPRS"
,"sex"))
table(temp$name_fix_eff2)
# PRS se sexPRS
# 600 600 600
ifelse()
, make sure each ifelse()
contains 3 arugmenets: (1) condition to evaluate,(2) value if the condition is evaluated as TRUE, (3) value if the condition is evaluated as FALSE. This example has 3 condictions and 2 ifelse()
. The 3rd argument in the first ifelse() is a nested ifelse().m$analysis.performed <- ifelse(m$cohorts=="UKB" & m$traits.abb=="CI","OA",
ifelse((m$cohorts=="GSCAN" & m$traits.abb=="DPW")|(m$cohorts=="UKB" & m$traits.abb=="ESDPW")|(m$cohorts=="UKB" & m$traits.abb=="CCPD"),"OA, GA",
"GA"))
dplyr::recode
dplyr::recode(variable,)
# Example 1
## Recode value 1 as 5, 2 as 4, 4 as 2, 5 as 1
dplyr::mutate(quali.edu.6138.recoded=dplyr::recode(qualification_edu_6138
,`1`=5
,`2`=4
,`3`=3
,`4`=2
,`5`=1))
# Example 2
# If values are -3 or 0, set to NA; else keep the original values
pheno_ukb20116$X20116.0.0_recode <- ifelse(pheno_ukb20116$X20116.0.0 %in% c(-3,0), NA, pheno_ukb20116$X20116.0.0)
# Check old variable and the new variable
## row names are values of 1st argument, columns are values of 2nd argument
table(pheno_ukb20116$X20116.0.0_recode,pheno_ukb20116$X20116.0.0,useNA = "ifany")
-3 0 1 2 <NA>
1 0 0 168254 0 0
2 0 0 0 51208 0
<NA> 1986 265431 0 0 530
## Step 1: initialize a data.frame for appending the result from every iteration
## This data.frame should have same column names as the per-iteration result data.frame
base=data.frame(iteration=NULL
,directory=NULL
,file=NULL)
## Step 2: extract directory and names of files
for (dir in 1:length(GWASInfo$directory)){
current_dir=GWASInfo$directory[dir]
GWASFileList=list.files(path = current_dir,pattern = "^revised_bolt_imputed(.*).bgen.assoc$|(.*).glm.logistic$")
for (file in 1:length(GWASFileList)){
current_file=GWASFileList[file]
current_iteration=(dir-1)*length(GWASFileList)+file
print(paste0("current_dir=",current_dir))
print(paste0("current_file= ",current_file))
print(paste0("current_iteration=",current_iteration))
resultPerIteration=data.frame(iteration=current_iteration,directory=current_dir,file=current_file)
# Step 3 : Append per-iteration data.frame to the base data.frame
base = rbind(base, resultPerIteration)
}
}
# method 1
df[, colSums(is.na(df)) != nrow(df)]
# method 2
df[colSums(!is.na(df)) > 0]
# sep=" " refers to one whitespace character. sep="" refers to any length whitespace as being the delimiter
c1=read.table(paste0(dirSubfolders[1],filesWant[1]),header = T,sep = "")
do.call(rbind,df_list)
# Create an empty list for holding a subset generated within a for loop
GCTAOutfile_DSM5MentalDisorderDiag_part1_list <- list()
# Add each subset to the list
for (i in 1:length(filesWantList)){
# Add data.frame of same suffix into the list
## You cannot use data.frame name like GCTAOutfile_DSM5MentalDisorderDiag_i_part1
GCTAOutfile_DSM5MentalDisorderDiag_part1_list[[i]] <- get(paste0("GCTAOutfile_DSM5MentalDisorderDiag_", i,"_part1"))
}
# Combine all subsets using do.call and rbind
## 2nd argument of do.call is a list. That is why all subsets were added to a list
GCTAOut_DSM5MentalDisorderDiag_part1_all <- do.call(rbind, GCTAOutfile_DSM5MentalDisorderDiag_part1_list)
do.call(cbind,df_list)
,(2) join_all(data_list,by=mergingKey,type=)
# Search variables to reshape
patterns_to_search=glob2rx("ID_char|GSCAN.*_r|ZYGOSITY")
variables_to_reshape=grep(names(df_twin),pattern=patterns_to_search,value = TRUE) # 42 elements
# Combine all reshaped variables to a single data frame
## Create an empty list for holding data frames to combine
df_to_bind_by_column <- list()
## Reshape variables and add them to the list
for (i in 1:length(variables_to_reshape)){
var_to_reshape=variables_to_reshape[i]
var_to_keep=c(paste(var_to_reshape,c("01","02","50"),sep="."))
# Reshape variables, one at a time
reshaped_var <-reshape(df_twin
,idvar = "famID"
,v.names = var_to_reshape
,timevar = "ID_suffix"
,direction = "wide" ) %>% select_(.dots=var_to_keep)
# Add the reshaped variable data frame to the list
df_to_bind_by_column[[i]] <- reshaped_var
}
# Method 1: Combine all data frames in the list to a single data frame using do.call(cbind,df_list)
reshaped_variables <- do.call(cbind,df_to_bind_by_column) # 126 columns
# Method 2: Combine all data frames in the list to a single data frame using join_all(data_list,by=mergingKey,type=)
df_reshaped_variables <-plyr::join_all(df_to_bind_by_column,by=c("famID","ZYGOSITY"),type="inner") #