# Rare germline variants associated with breast cancer *BASED ON THE ARTICAL PUBLISHED ON NATURE COMMUNICATION:* **The impact of rare germline variants on human somatic mutation processes https://doi.org/10.1038/s41467-022-31483-1 https://github.com/lehner-lab/RDGVassociation https://mischanvp.shinyapps.io/rare_association_shiny/** ## 1. Case-control study case-control study design: https://www.drcath.net/toolkit/case-control case and control match: matchit (R package): https://datascienceplus.com/how-to-use-r-for-matching-samples-propensity-score/ ``` #R package MatchIt pacman::p_load(knitr, wakefield, MatchIt, tableone, captioner) data1=read.csv("/data4/una/breast_cancer/RGC_caco.csv") table1 <- CreateTableOne(vars = c('age'),data=data1,strata='caco') table1 <- print(table1,printToggle = FALSE,noSpaces = TRUE) kable(table1[,1:3], align = 'c',caption = 'Table 1: Comparison of unmatched samples') set.seed(1234) match.it <- matchit(caco ~ age, data = data1, method="nearest", ratio=4) a <- summary(match.it) kable(a$nn, digits = 2, align = 'c', caption = 'Table 2: Sample sizes') kable(a$sum.matched[c(1,2,4)], digits = 2, align = 'c', caption = 'Table 3: Summary of balance for matched data') plot(match.it, type = 'jitter', interactive = FALSE) df.match <- match.data(match.it)[1:ncol(data1)] table4 <- CreateTableOne(vars = c('age'),data = df.match,strata='caco') table4 <- print(table4,printToggle = FALSE,noSpaces = TRUE) kable(table4[,1:3],align = 'c',caption = 'Table 4: Comparison of matched samples') match.matrix<-match.it[["match.matrix"]] g.matches1 <- get_matches(match.it, data = data1,distance = "prop.score") head(g.matches1, 10) write.csv(g.matches1 , "/data4/una/breast_cancer/caco_match.csv", row.names=FALSE) ``` ## 2. Rare germline varient analysis SpliceAI: https://github.com/Illumina/SpliceAI MTR: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5630035/ https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6602522/#!po=62.5000 CCR: http://quinlanlab.org/blog/2018/12/20/constrained-coding-regions.html ## 3. SKAT-O analysis: https://cdn1.sph.harvard.edu/wp-content/uploads/sites/149/2012/10/skat.pdf ## Code note ### Age calculator: ``` breast_pt_age['age']=(((pd.to_datetime(breast_pt_age['diagdate'], format="%Y/%m/%d",errors="coerce")-pd.to_datetime(breast_pt_age['birthday'], format="%Y/%m/%d",errors="coerce")).dt.days)//365) breast_pt_age['age']=breast_pt_age['age'].fillna(0) for i in range(len(breast_pt_age['age'])): if breast_pt_age['age'][i]==0: a = datetime.strptime(breast_pt_age['birthday'][i], "%Y/%m/%d") b = datetime.strptime(breast_pt_age['CR_0205'][i], "%Y/%m/%d") delta = b - a year=delta.days//365 print(i) breast_pt_age['age'][i]=year ``` ### Split vcf file by shell: ``` split -l 100000 "/data4/una/breast_cancer/control/RGC_breast_cancer_new_control.vcf" /data4/una/breast_cancer/control ``` ### Value `p.value` It will be the p-value based on robust methods. `p.value_singlevariant` p-value for each single variant in this region-based test. `mac` total minor allele count (MAC). `param$n.marker` a number of SNPs in the genotype matrix. `param$n.marker.test` a number of SNPs used for the test. It can be different from `param$n.marker` when some markers are monomorphic or have higher missing rates than the missing_cutoff. `param$rho` the ρ parameter for all variants. `test.snp.mac` a vector of minor allele count (MAC) of the snps tested. The name is SNP-ID. ### Variant selection ``` import pandas as pd mis=pd.read_csv('/data4/una/breast_cancer_project/missense/missense_maf001_afx0.csv',index_col=0) print("Start: case") ptv_pat=[] pata=pd.read_csv("/data4/una/breast_vcf/a_patient_genotype.csv",index_col=0,dtype="category") pata=pata.reset_index() patb=pd.read_csv("/data4/una/breast_vcf/b_patient_genotype.csv",index_col=0,dtype="category") patb=patb.reset_index() ptva=pd.merge(ptv,pata,on='ID',how='inner') ptvb=pd.merge(ptv,patb,on='ID',how='inner') if ptva.empty==False: ptv_pat.append(ptva) if ptvb.empty==False: ptv_pat.append(ptvb) result_ptv=pd.concat(ptv_pat) result_ptv.to_csv('/data4/una/breast_cancer_project/missense/missense_patient.csv') print('case: done') print("Start: control") var_ctrl=[] ctrla=pd.read_csv('/data4/una/breast_cancer/control/a/geno_control_a.csv',index_col=0,dtype="category") ctrla=ctrla.reset_index() ctrlb=pd.read_csv('/data4/una/breast_cancer/control/a/geno_control_b.csv',index_col=0,dtype="category") ctrlb=ctrlb.reset_index() ctrla=pd.merge(var,ctrla,on='ID',how='inner') ctrlb=pd.merge(var,ctrlb,on='ID',how='inner') if ctrla.empty==False: var_ctrl.append(ctrla) if ctrlb.empty==False: var_ctrl.append(ctrlb) result_var=pd.concat(var_ctrl) result_var.to_csv('/data4/una/breast_cancer_project/missense/missense_control.csv') print('Done: control') ``` ### Gene aggregation ``` import pandas as pd #generate genotype table of 7720 patients after filtering MAF>1% variants for skato test ''' df=pd.read_csv('/data4/una/skatout/bigtable_ptv_num_maf.csv',index_col=0) data2=df[df['MAF']<0.01] geneid=data2[['ID']] data2=data2.drop(['MAF','pat_AF'],axis=1) data2=data2.set_index('ID') data2.to_csv('/data4/una/breast_cancer_project/data/ptv_maf001_7720.csv') geneid.to_csv('/data4/una/breast_cancer_project/data/ptv_maf001_7720_geneid.csv') ''' #aggregate variants by gene name to narrow down test based on gene level df=pd.read_csv("/data4/una/breast_cancer_project/data/ptv_maf001_7720.csv",index_col=0) genename=pd.read_csv("/data4/una/breast_cancer_project/data/ptv_maf001_7720_genename.csv",index_col=0) df=pd.merge(df,genename,on='ID',how='inner') allgene=genename[['SYMBOL']].drop_duplicates() for i in allgene.SYMBOL: data=df[df['SYMBOL']==i] data.to_csv("/data4/una/breast_cancer_project/data/gene/"+i+".csv") ``` ### Pathway analysis Tools: https://www.bioinformatics.babraham.ac.uk/training/Extracting_Biological_Information_Course/1.Gene%20Set%20Analysis.pdf ``` ``` ### SKATO ``` library(Matrix) library(SPAtest) library(RSpectra) x=read.csv("/data4/una/allptv7720.csv", header=TRUE, row.names=1) y=read.csv("/data4/una/allptv_caco.csv", header=TRUE, row.names=1) #y=as.matrix(read.table("/data4/una/allptv_caco.csv", sep=",",header=TRUE,row.name=1)) x= as.data.frame(x = t(x), stringsAsFactors = FALSE) x=as.matrix(x) y=as.matrix(y) lar=list(Y=y,X=x) obj<-SKAT_Null_Model(Y~1, out_type="D", data=lar) SKAT(x, obj, method = "SKATO")$p.value [1] 0.009201945 Warning messages: 1: 58194 SNPs with either high missing rates or no-variation are excluded! 2: The missing genotype rate is 0.001225. Imputation is applied. > out=SKAT(x, obj, method = "SKATO") TIPS:using 'gc()' at first while working with very large variant set, in these cases manually triggering the garbage collector with gc() can help to free up memory and improve performance. '''gene_levelfor_loop''' library(SKAT) library(Matrix) library(SPAtest) library(RSpectra) setwd("/data4/una/breast_cancer_project/missense/CADD/new_CADD30/") y=read.csv('/data4/una/breast_cancer_project/missense/7720_id_mis_rtu.csv',row.names=1) y=as.matrix(y) allresult = data.frame() df_genename=data.frame() listcsv <- dir(pattern = "*.csv") for (k in 1:length(listcsv)){ ldf[[k]] <- read.csv(listcsv[k],row.names=1) x=ldf[[k]] rownames(x)=x[,1] #x=x[grepl('TAICHUNG', colnames(x))] x= as.data.frame(x = t(x), stringsAsFactors = FALSE) x=as.matrix(x) x=apply(x,2,as.numeric) lar=list(Y=y,X=x) obj<-SKAT_Null_Model(Y~1, out_type="D", data=lar) pval=tryCatch({SKAT(x, obj, method = "SKATO")$p.value},error = function(e){listcsv[[k]]}) pval = data.frame(gene_id=names(pval), p_value=pval, row.names=NULL) genename=str_replace(listcsv[k], ".csv", "") df_genename=rbind(df_genename,genename) str(k) } allresult$gene_id=df_genename[,1] write.csv(allresult,'/data4/una/breast_cancer_project/skato_result/CADD30_gene_level_result.csv') ``` ### Firth's Logistic Regression ``` library(logistf) library(stringr) setwd("/data4/una/breast_cancer_project/sumgene/") listcsv <- dir(pattern = "*.csv") ldf=list() allresult = data.frame() df_genename=data.frame() df_error=data.frame() for (k in 1:length(listcsv)){ ldf[[k]] <- read.csv(listcsv[k],row.names=1) x=ldf[[k]] x2=x[,1] y=x[,2] if (sum(x2)!=0){ lar=list(Y=y,X=x2) fit=tryCatch({logistf(Y ~ X,data=lar)},error = function(e){print('error')}) odds_ratio = tryCatch({exp(fit$coefficients[2])},error = function(e){odds_ratio=NULL}) pval = tryCatch({fit$prob[2]},error = function(e){pval=NULL}) ciup = tryCatch({exp(fit$ci.upper[2])},error = function(e){ciup=NULL}) cilow = tryCatch({exp(fit$ci.lower[2])},error = function(e){cilow=NULL}) odds_ratio = data.frame(gene_id=names(odds_ratio), odds_ratio=odds_ratio, row.names=NULL) ciup = data.frame(gene_id=names(ciup), upper_95%CI=ciup, row.names=NULL) cilow = data.frame(gene_id=names(cilow), lower_95%CI=cilow, row.names=NULL) pval = data.frame(gene_id=names(pval), p_value=pval, row.names=NULL) result = Reduce(function(x, y) merge(x, y, all=TRUE), list(odds_ratio, cilow, ciup, pval)) result=result[result$gene_id!="(Intercept)",] allresult=rbind(allresult,result) genename=str_replace(listcsv[k], "_sum.csv", "") df_genename=rbind(df_genename,genename) str(k) }else{ error=str_replace(listcsv[k], "_sum.csv", "") df_error==rbind(df_error,error) } allresult$gene_id=df_genename[,1] write.csv(allresult,'/data4/una/breast_cancer_project/ptv_logistf_result.csv',row.names=FALSE) write.csv(df_error,'/data4/una/breast_cancer_project/ptv_logistf_error.csv',row.names=FALSE) ``` ### Odds Ratio calculation ``` #generate odds ratio table #put all candidate genes in a file and read them into dataframe import os import glob csv_files = glob.glob('./*{}'.format('_sum.csv')) df_concat = pd.concat([pd.read_csv(f) for f in csv_files ],axis=1) df_concat = df_concat.loc[:,~df_concat.columns.duplicated()].copy() #select genes df_concat = df_concat[['BRCA2', 'STARD9', 'BNIP1', 'BRCA1', 'PALB2','DNAH3','CEP112','TTN','PPIP5K2','RAD54B','caco']] df_concat=df_concat.astype(int) #do logit import numpy as np import pandas as pd from patsy import dmatrices import statsmodels.api as sm df = pd.DataFrame() X = df_concat.iloc[:, :-1] y = df_concat.iloc[:, -1] for i in top10.SYMBOL: y, X = dmatrices('caco ~ '+str(i), data=df_concat, return_type='dataframe') mod = sm.Logit(y, X) res = mod.fit() print(res.summary()) ci=res.conf_int() ci.columns=['lower_CI','upper_CI'] ci=np.exp(ci) pval=res.pvalues val=res.params.values ci['odds ratio']=math.exp(val[1]) ci['p_val']=pval[1] df=pd.concat([df,ci],axis=0) #select even rows of dataframe df2=df.iloc[1::2] df2.columns=['Lower 95% CI','Upper 95% CI','OR','p value'] df2=df2.sort_values('OR',ascending=False) #plot odds ratio import matplotlib.pyplot as plt plt.figure(figsize=(6, 4), dpi=150) CI=[df2.iloc[::-1]['OR'] - df2.iloc[::-1]['Lower 95% CI'].values, df2.iloc[::-1]['Upper 95% CI'].values - df2.iloc[::-1]['OR']] plt.errorbar(x=df2.iloc[::-1]['OR'], y=df2.iloc[::-1].index.values, xerr=CI, color='black', capsize=3, linestyle='None', linewidth=1, marker="o", markersize=5, mfc="black", mec="black") plt.axvline(x=1, linewidth=0.8, linestyle='--', color='black') plt.tick_params(axis='both', which='major', labelsize=8) plt.xlabel('Odds Ratio and 95% Confidence Interval', fontsize=8) plt.tight_layout() plt.savefig('/data4/una/breast_cancer_project/top10gene/renew_top10OR.png') plt.show() display(df2) ``` ## 4. File Location ## CONTROL FILE ### VCF file ``` filenames_a=[] for character in range_char("a", "z"): file_path = '/data4/una/breast_cancer/control/a/geno_control_'+character+'.csv' filenames_a.append(file_path) print("Start: A") a_control=[] for i in filenames_a: data=pd.read_csv(i,index_col=0) print(os.path.basename(i),': start') data=pd.merge(var,data,on='ID',how='inner') df = data.replace('0/0',0) df = df.replace('0|0',0) df = df.replace('1/0',1) df = df.replace('1|0',1) df = df.replace('0/1',1) df = df.replace('0|1',1) df = df.replace('1/1',2) df = df.replace('1|1',2) df = df.replace('./.',9) df = df.replace('.|.',9) df=df.fillna(9) if df.empty==False: a_control.append(df) print(os.path.basename(i),': done') result_a=pd.concat(a_control) result_a.to_csv('/data4/una/breast_cancer_project/data/geno_control_a_[types of variants].csv') filenames_b=[] for character in range_char("a", "w"): file_path = '/data4/una/breast_cancer/control/b/geno_control_'+character+'.csv' filenames_b.append(file_path) print("Start: B") b_control=[] for i in filenames_b: data=pd.read_csv(i,index_col=0) print(os.path.basename(i),': start') data=pd.merge(var,data,on='ID',how='inner') df = data.replace('0/0',0) df = df.replace('0|0',0) df = df.replace('1/0',1) df = df.replace('1|0',1) df = df.replace('0/1',1) df = df.replace('0|1',1) df = df.replace('1/1',2) df = df.replace('1|1',2) df = df.replace('./.',9) df = df.replace('.|.',9) df=df.fillna(9) if df.empty==False: b_control.append(df) print(os.path.basename(i),': done') result_b=pd.concat(b_control) result_b.to_csv('/data4/una/breast_cancer_project/data/geno_control_b_[types of variants].csv') ``` ### All genotypes' counts and MAF value of variants lower than 0.01 ``` '/data4/una/breast_cancer/control/all_control_geno_maf001.csv' ``` ### CADD: genotypes of missense variants w CADD>15 ``` con=pd.read_csv('/data4/una/breast_cancer_project/data/geno_control_a_cadd15.csv',index_col=0) con_1=pd.read_csv('/data4/una/breast_cancer_project/data/geno_control_b_cadd15_2.csv',index_col=0) ``` ## PATIENT FILE ### VCF file ``` pata=pd.read_csv("/data4/una/breast_vcf/a_patient_genotype.csv",index_col=0,dtype="category") pata=pata.reset_index() patb=pd.read_csv("/data4/una/breast_vcf/b_patient_genotype.csv",index_col=0,dtype="category") patb=patb.reset_index() ``` ## VARIANT FILE ### All variants ``` /data4/una/breastcancer_exon_var.csv ``` ### PTV annotation ``` ``` ### Missense annotation ``` /data4/una/breast_cancer_project/missense/missense_maf001_var_anno.csv ``` ## 5. Reference https://www.scirp.org/journal/paperinformation.aspx?paperid=114054#:~:text=additive%2C%20dominant%20and%20recessive%20in,the%20scope%20of%20the%20study. ## ``` input_path="/data4/una/CNest/nf/testdata/cram/" output_path="/data4/una/" ref_path= docker run -v "${input_path}:/input" -v "${output_path}:/output" -v "${ref_path}:/ref" -w "/output" -it --rm \ tomas81/cnest:dev step2 \ --project test_proj \ --sample 'test_cram' \ --input '/input/NA18543.test.cram' ```