# Python Commands
###### tags: `Note`
## Print tool versions
```python!
import tensorflow as tf
import numpy as np

print("TensorFlow v" + tf.__version__)
print("NumPy v" + np.__version__)
```
* Print the versions of all installed packages
> /Users/ching/anaconda3/envs/IMG_course/lib/python3.10/site-packages/_distutils_hack/__init__.py:33: UserWarning: Setuptools is replacing distutils.
> warnings.warn("Setuptools is replacing distutils.")
> /var/folders/5n/ms4ll6f50sv31qy8rqt3l4l00000gn/T/ipykernel_69972/3724946357.py:14: DeprecationWarning: The '__version__' attribute is deprecated and will be removed in Werkzeug 3.1. Use feature detection or 'importlib.metadata.version("werkzeug")' instead.
> version = getattr(module, '__version__', 'unknown')
> These warnings are usually caused by package updates or compatibility changes. They normally do not affect how the code runs, but a few adjustments can be made for future compatibility.
>
> The first warning is about Setuptools replacing Distutils. Setuptools has become the primary build tool in Python and supersedes Distutils, so this warning can usually be ignored; Setuptools generally provides better functionality and compatibility.
>
> The second warning concerns the Werkzeug version. It recommends using importlib.metadata.version("werkzeug") to obtain the version instead of reading the __version__ attribute directly, because __version__ is deprecated and will be removed in Werkzeug 3.1.
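As the second warning suggests, a single package's version can be read through importlib.metadata; a minimal example:
```python!
import importlib.metadata

# preferred replacement for the deprecated __version__ attribute
print(importlib.metadata.version("werkzeug"))
```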
```python!
import pkg_resources
import importlib.metadata

# Packages installed in the current environment (pkg_resources is deprecated
# but still works; importlib.metadata.distributions() is the modern equivalent)
modules = list(pkg_resources.working_set)
module_names = [module.project_name for module in modules]

# Print each package's name and version
for name in module_names:
    try:
        version = importlib.metadata.version(name)
        print(f"{name}: {version}")
    except importlib.metadata.PackageNotFoundError:
        print(f"{name}: version information not available")
```
## Print files
```python!
import os

for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
```
```python!
import glob as gb
import os
import pandas as pd

path = f'/home/amy/TPMI/Ori_Apt_Adv/{FOLD}/Matched_codebook/'
filenames = gb.glob(path + 'Matched_case_control_codebook_*.txt')
alldata = pd.DataFrame()
for filename in filenames:
    print("Running", filenames.index(filename) + 1, "of", len(filenames))
    df = pd.read_csv(filename, sep='\t')
    # Alternative: whitespace-separated, no header, custom column names
    # df = pd.read_csv(filename, sep="\s+", header=None, names=['mode', 'affinity', 'rmsd l.b.', 'rmsd u.b.'])
    df.insert(0, "filename", os.path.basename(filename))
```
## Read file
* Check encoding type
```python!
import chardet

# Read the raw bytes and let chardet guess the encoding
with open('/Users/ching/Documents/work/03_CMUH/4_Manuscript/PRS/WebData_update_231220.txt', 'rb') as f:
    text = f.read()
en = chardet.detect(text)
print(en)
```
* Read vcf
```python!
import io
import pandas as pd

def read_vcf(path):
    # skip the '##' meta lines, keep the '#CHROM' header row
    with open(path, 'r') as f:
        lines = [l for l in f if not l.startswith('##')]
    return pd.read_csv(
        io.StringIO(''.join(lines)),
        dtype={'#CHROM': str, 'POS': int, 'ID': str, 'REF': str, 'ALT': str,
               'QUAL': str, 'FILTER': str, 'INFO': str},
        sep='\t'
    ).rename(columns={'#CHROM': 'CHROM'})

vcf_TPMI_b1tob22_ori = read_vcf('TPMI_b1tob22_original_ok_breast.vcf')
# keep ID plus the genotype columns, then transpose so samples become rows
TPMI_b1tob22_ori_df = pd.concat([vcf_TPMI_b1tob22_ori.iloc[:,2], vcf_TPMI_b1tob22_ori.iloc[:,9:]], axis=1).T
new_header = TPMI_b1tob22_ori_df.iloc[0]
TPMI_b1tob22_ori_df = TPMI_b1tob22_ori_df[1:]
TPMI_b1tob22_ori_df.columns = new_header
TPMI_b1tob22_ori_df.index.name = 'ID'
TPMI_b1tob22_ori_df.reset_index(inplace=True)
```
Combine the APT and ADV genotype tables, masking discordant calls as `./.`:
```python!
APT=read_vcf(f'/home/amy/TPMI/pheWAS/TPMI_b1tob22_APT_OK_{PROJ}.vcf')
ADV=read_vcf(f'/home/amy/TPMI/pheWAS/TPMI_b1tob22_advnorm_ok_{PROJ}.vcf')
APT_df = pd.concat([APT.iloc[:,2],APT.iloc[:,9:]], axis=1).T
new_header = APT_df.iloc[0]
APT_df = APT_df[1:]
APT_df.columns = new_header
APT_df.index.name = 'ID'
APT_df.reset_index(inplace=True)
# APT_df2=APT_df.T.sort_index().T
# display(APT_df)
ADV_df = pd.concat([ADV.iloc[:,2],ADV.iloc[:,9:]], axis=1).T
new_header = ADV_df.iloc[0]
ADV_df = ADV_df[1:]
ADV_df.columns = new_header
ADV_df.index.name = 'ID'
ADV_df.reset_index(inplace=True)
# ADV_df2=ADV_df.T.sort_index().T
# display(ADV_df)
col_var_list = ADV_df.columns.tolist()
col_var_list.remove('ID')
COM = pd.DataFrame({'ID': ADV_df['ID']})  # fresh frame holding just the sample IDs
for var in col_var_list:
APT_df[var]=APT_df[var].astype(str)
ADV_df[var]=ADV_df[var].astype(str)
COM[var] = np.where(APT_df[var]==ADV_df[var],ADV_df[var],'./.')
COM=COM.set_index('ID')
COM.reset_index(inplace=True)
COM['ID'] = COM['ID'].str.split("_", expand=True)[0]
display(COM)
```
Count the genotype calls (`./.`, `0/1`, `1/1`, `0/0`) per SNP:
```python!
SNP_count =COM
gtype = []
for col in SNP_count.columns[1:]:
tmp_series = SNP_count[col].value_counts()
tmp_series.name = col
gtype.append(tmp_series)
SNP_count = pd.concat(gtype, axis=1,sort=True).T
SNP_count = SNP_count.fillna(0)
SNP_count.reset_index(level=0, inplace=True)
SNP_count.rename(columns={'index':'SNP'}, inplace=True)
SNP_count
```
## Dataframe manipulation
* Fuzzy text search (substring match)
```python!
data[data['column'].str.contains('str')]
```
* Exact text match
```python!
data.loc[data['column']=='str']
```
* Fuzzy search for several strings
    * When case differences are not a concern
```python!
searchfor = ['A','B']
extract = data[data["column"].str.contains('|'.join(searchfor))]
```
    * When case differences are a concern
```python!
searchfor = ['A','B']
# lower-case both sides, otherwise upper-case patterns can never match
extract = data[data["column"].str.lower().str.contains('|'.join(searchfor).lower())]
```
* Match whole words only, e.g. GLA should not match CGLA or GLAO
```python=
# notnull() guards the NaN rows; case=False makes the match case-insensitive
ANNOVAR_unimputed[
    (ANNOVAR_unimputed["Gene.refGene"].notnull())&
    (ANNOVAR_unimputed["Gene.refGene"].str.contains(fr"\b(?:{'|'.join(searchfor)})\b", case=False))
]
```
* Search across the whole dataframe
```python!
searchfor = ['A','B']
extract = data[data.apply(lambda r: r.str.contains('|'.join(searchfor), case=False).any(), axis=1)]
```
* List, for each row, the columns that contain any string in searchfor
```python!
extract2 = extract.isin(searchfor).apply(lambda r: ','.join(r.index[r]), axis=1).reset_index()
extract_new = pd.concat([extract2['index'], extract2[0].str.split(',', expand=True)], axis=1)
New_columns=['ID']
Probe_num=range(1,(len(extract2.columns)))
for i in Probe_num:
New_columns.append(f'ID{i}')
extract_new.columns = New_columns
```
* Search for a perfect word match, e.g. GLA must not match DSGLA
```python!
ANNOVAR_unimputed[ANNOVAR_unimputed["Gene.refGene"].str.contains(fr"\b(?:{'|'.join(searchfor)})\b",case=False)]
```
* Exact match against several strings
```python!
searchfor=['Likely pathogenic','pathogenic']
extract=data[data['column'].isin(searchfor)]
```
* Calculations: take the log
    * log10
```python!
data['log']=abs(np.log10(data['column']))
```
    * log2 fold change
```python!
data['log2FC']=np.log2(data['b']/data['a'])
```
* Keep only rows whose value is numeric (drops rows with non-numeric characters)
```python!
data[pd.to_numeric(data['column'], errors='coerce').notnull()]
```
* Change a column's dtype
```python!
data.column = data.column.astype(str)
```
* Convert the element type of a list
```python!
pID_list_new = [float(x) for x in pID_list_new]
```
* Fill missing values with a number or string
```python!
data['column']=data['column'].fillna(0)
data['column']=data['column'].fillna('str')
```
* Bin the data into intervals
```python!
bins=[-1.0,0.0001, 0.0005, 0.001, 0.005, 0.01]
data['Categories']=pd.cut(data['column'],bins)
```
* List all unique values in the whole dataframe
```python!
print(list(pd.unique(data.values.ravel('K'))))
```
* List all unique values in a column
```python!
data.column.unique()
data['column'].unique()
```
* Replace text
    * Several at once
```python!
data['column'] = data['column'].replace({"A":"a","B":"b","C":"c"})
```
    * One at a time
```python!
data=data.replace('A','B')
data.column=data.column.replace('A','B')
data['column']=data['column'].replace('A','B')
data.loc[data['columnA']=='A','columnA']='B'
```
* Rename headers
```python!
data=data.rename(columns={'A':'a','B':'b','C':'c'})
```
* Pivot-table
```python!
data.pivot_table(values=['SNP'],
                 columns=['ClinicalSignificance'],
                 index=['Categories'],
                 # keep exactly ONE aggfunc; the commented lines are common alternatives
                 aggfunc=lambda x: len(x.unique()),
                 # aggfunc=lambda x: ', '.join(x),
                 # aggfunc=[np.mean, np.median, np.std],
                 # aggfunc=lambda x: ', '.join(x.unique()),
                 # aggfunc=lambda x: ', '.join(str(val) for val in x),
                 dropna=False,
                 margins=True,
                 margins_name='All',
                 fill_value=0)
```
* Combine multi-headers with "_"
```python!
APT_val_table.columns = ['_'.join((col)) for col in APT_val_table.columns]
```
* If condition → true or false
    * Single condition
```python!
data['TF'] = np.where(data['A']==data['B'], 'True', 'False')
```
    * Multiple conditions
```python!
data.loc[(data['A']== 'a') & (data['B'] == 'b'), 'TF'] = 'True'
```
* Conditionally transform values
```python!
Target_list['RecurrentDate_ad'] = Target_list.apply(lambda x: x['RecurrentDate'] + 19110000 if x['RecurrentDate']>0 else x['RecurrentDate'],axis=1)
```
* Split a string and keep one part
```python!
data['New_column']=data['column'].str.split("_", expand=True)[0]
```
* Reshape a table (melt)
```python!
data2=pd.melt(data, id_vars =['ID'], value_vars =columns,var_name ='catagories', value_name ='Value')
```
* Concatenate tables
```python!
pd.concat([table1,table2],axis=1,ignore_index=True)  # same index: add columns side by side
pd.concat([table1,table2],axis=0,ignore_index=True)  # same columns: stack rows
alldata = pd.concat([alldata, data])  # DataFrame.append was removed in pandas 2.0
```
* Extract the substring that matches any pattern in a list, to ease merging
```python!
pat = '|'.join(r"{}".format(x) for x in data['target'])
data['result'] = data.target.str.extract('('+ pat + ')', expand=False)
```
* Convert date formats
    * Date only, separated by '/'
```python!
data['DATE'] = data['DATE'].apply(pd.to_datetime, format='%Y/%m/%d', errors='coerce')
```
    * With time, separated by '-'
```python!
data['DATE'] = data['DATE'].apply(pd.to_datetime, format='%Y-%m-%d %H:%M', errors='coerce')
```
* Compute age
```python!
data['Age'] = data["Sampling"].dt.year-data["birth"].dt.year
```
* Sort (ascending)
```python!
data=data.sort_values(by=['Age'],ascending=True)
```
* Strip whitespace from text
```python!
data.column = data.column.str.strip()
```
* Split a text file every 3,000 lines
```python!
def Main():
    source_dir = f"/home/amy/TPMI/Ori_Apt_Adv/SNP_{PROJ}_plink.txt"
    target_dir = '/home/amy/TPMI/Ori_Apt_Adv/TXT_file/'
    flag = 0
    name = 1
    dataList = []
    print("start")
    with open(source_dir, 'r') as f_source:
        for line in f_source:
            flag += 1
            dataList.append(line)
            if flag == 3000:
                with open(target_dir + "list_" + str(name) + ".txt", 'w+') as f_target:
                    for data in dataList:
                        f_target.write(data)
                name += 1
                flag = 0
                dataList = []
    # write the remaining lines (skipped if the count was an exact multiple of 3000)
    if dataList:
        with open(target_dir + "list_" + str(name) + ".txt", 'w+') as f_target:
            for data in dataList:
                f_target.write(data)
    print("Done")

if __name__ == "__main__":
    Main()
```
* Groupby + agg
```python!
count=data.groupby(['SNP','RefSNP ID from dbSNP','A1A2']).agg({'Affy SNP ID':[np.size],'Probe Set ID':[np.unique],'MAF':[np.unique]})
```
* Join a multi-level header with '_'
```python!
count.columns = ['_'.join((col)) for col in count.columns]
```
* Split one column by a delimiter
```python!
anno=pd.DataFrame(select['MAF_unique'].str.split(" ",).tolist()).rename(lambda x: 'x-{}'.format(x + 1), axis=1)
```
* applymap applies a computation to every cell of a dataframe
  e.g. format every cell as a percentage
```python!
case_bim_bin_table_sep2=case_bim_bin_table_sep2.applymap(lambda x: '%.2f%%' % (x*100))
```
* read file in a folder & add a column with filename
```python!
import glob as gb
import os
import pandas as pd

path = f'/home/amy/TPMI/Ori_Apt_Adv/{FOLD}/Matched_codebook/'
filenames = gb.glob(path + 'Matched_case_control_codebook_*.txt')
alldata = pd.DataFrame()
for filename in filenames:
    print("Running", filenames.index(filename) + 1, "of", len(filenames))
    df = pd.read_csv(filename, sep='\t')
    # Alternative: whitespace-separated, no header, custom column names
    # df = pd.read_csv(filename, sep="\s+", header=None, names=['mode', 'affinity', 'rmsd l.b.', 'rmsd u.b.'])
    df.insert(0, "filename", os.path.basename(filename))
    df['filename'] = df['filename'].str.split("sample_|.txt", expand=True)[1]
    df.insert(1, "datatype", df['filename'].str.split("_").str[-1])
    alldata = pd.concat([alldata, df], ignore_index=True)  # DataFrame.append was removed in pandas 2.0
display(alldata.head())
```
* read file with multiple sheets
```python!
import os
import pandas as pd

DIS = 'FH'
FILE = 'FH_result_名單'
excel_name = '/home/amy/other_project/GWAS_validation result/FH result 名單.xlsx'
# xlrd >= 2.0 no longer reads .xlsx; let pandas use openpyxl instead
xls = pd.ExcelFile(excel_name)
print(xls.sheet_names)
alldata = pd.DataFrame()
for sheet in xls.sheet_names:
    # read_excel no longer accepts the old index/encoding arguments
    df = pd.read_excel(excel_name, sheet_name=sheet)
    df['filename'] = os.path.basename(excel_name)
    alldata = pd.concat([alldata, df], ignore_index=True)
display(alldata)
```
* set index
```python!
alldata=alldata.loc[alldata['Categories']=='All'].set_index(['filename','DIS','CATA']).reset_index()
```
* Extract rows whose genotype columns contain 0/1 or 1/1
```python!
mutation_select3 = APT_df2[APT_df2.apply(lambda r: r.str.contains('|'.join(mutation), case=False).any(), axis=1)]
```
* List duplicated rows
```python!
Target_list[Target_list.duplicated(['PatientID'],keep=False)]
```
* Strip leading/trailing whitespace in a column
```python!
ICD_diagnosis_diagage.DiagCode = ICD_diagnosis_diagage.DiagCode.str.strip()
```
* get unique element in a list
```python!
from functools import reduce

mylist = [u'nowplaying', u'PBS', u'PBS', u'nowplaying', u'job', u'debate', u'thenandnow']
unique = reduce(lambda l, x: l.append(x) or l if x not in l else l, mylist, [])
```
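If order-preserving de-duplication is all that is needed, a dict.fromkeys one-liner avoids the reduce trick:
```python!
mylist = [u'nowplaying', u'PBS', u'PBS', u'nowplaying', u'job', u'debate', u'thenandnow']
# dict keys are unique and keep insertion order (Python 3.7+)
unique = list(dict.fromkeys(mylist))
```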
* Reorder the index by a custom order
```python!
order=['other','not_provided','Uncertain_significance','protective','Benign','Benign/Likely_benign','Likely_benign','Conflicting_interpretations_of_pathogenicity','risk_factor','Likely_pathogenic','Pathogenic/Likely_pathogenic','Pathogenic']
CLNSIG_count = CLNSIG_count.reindex(order)
```
* Build zero-padded sequence numbers within each group (001, 002, 003)
```python!
ICDmap['group_count'] = ICDmap.groupby(['category']).cumcount()+1
ICDmap['group_count'] = ICDmap['group_count'].apply(lambda x: '{0:0>3}'.format(x))
```
* Cut percentile
```python!
PGS_score['percentile'] = pd.qcut(PGS_score['PRS'].rank(method='first'), q=10, labels=False,duplicates='raise')+1
```
## Read json files
```python=
import glob as gb
import json
import os
import pandas as pd

path=f'/media/volume2/data_base/aic_testing/14_LCMS/'
filenames=gb.glob(path+'7E2Y_4D_peptide_*/model_*_multimer_v2_pred_*.log')
alldata = pd.DataFrame()
for filename in filenames:
with open(filename, 'r') as file:
data = json.load(file)
# Create a dictionary to store the reorganized data
reorganized_data = {}
# Iterate over the rows in data
for row in data:
if isinstance(row, dict):
# If row is a dictionary, assume the keys are column names and values are corresponding values
column_names = list(row.keys())[:4]
values = list(row.values())[0:]
else:
# Handle unsupported row format
continue
# Assign the values to the corresponding columns
for i, column_name in enumerate(column_names):
reorganized_data.setdefault(column_name, []).append(values[i] if i < len(values) else None)
# Create the DataFrame from the reorganized data
df = pd.DataFrame(reorganized_data)
df.insert(0, "filepath", os.path.abspath(filename))
df.insert(1, "group", df['filepath'].str.split("/").str[6])
df.insert(2, "name", df['filepath'].str.split("/|.log").str[7])
df.insert(3, "model", df['name'].str.split("_").str[1])
df.insert(4, "cycle", df['name'].str.split("_|.log").str[5])
    alldata = pd.concat([alldata, df], ignore_index=True)  # DataFrame.append was removed in pandas 2.0
LOGfile=alldata.copy().reset_index(drop=True)
LOGfile
```
```python=
path=f'/media/volume2/data_base/aic_testing/14_LCMS/'
filenames=gb.glob(path+'7E2Y_4D_peptide_*/ranking_debug.json')
alldata = pd.DataFrame()
for filename in filenames:
    # the original hard-coded one file here; use the loop variable instead
    with open(filename) as file:
        data = json.load(file)
# Flatten data
df_nested_list = pd.json_normalize(data).T
df_nested_list = df_nested_list.drop('order').reset_index()
df_nested_list.columns = ['name','0.2 iptm + 0.8 ptm']
df_nested_list['name']=df_nested_list['name'].str.split(".", expand=True)[1]
df_nested_list['Rank'] = df_nested_list['0.2 iptm + 0.8 ptm'].rank(ascending=False).astype(int)
df_nested_list.insert(0, "filepath", os.path.abspath(filename))
df_nested_list.insert(1, "group", df_nested_list['filepath'].str.split("/").str[6])
# df_nested_list.insert(3, "model", df_nested_list['name'].str.split("_").str[1])
# df_nested_list.insert(4, "cycle", df_nested_list['name'].str.split("_|.log").str[5])
    alldata = pd.concat([alldata, df_nested_list], ignore_index=True)  # DataFrame.append was removed in pandas 2.0
RANKfile=alldata.copy().reset_index(drop=True)
RANKfile
```
## Figure adjustment
* Add a horizontal line
```python!
ax.hlines(19.5, -0.5, 4.5, linestyle='--', linewidth=1)
```
https://www.cntofu.com/book/172/docs/13.md
### ECDF plot
```python!
from matplotlib.ticker import FuncFormatter
fig, ax = plt.subplots(figsize=(5, 4))
ecdf_plot = sns.ecdfplot(
data=PGS_for_plot,
x="Number of Individuals",
hue="Used to build model",
palette=["blue", "green","orange"])
# ecdf_plot.set_size_inches(15,15)
# Define a function for formatting x-axis labels
def format_x_ticks(value, pos):
return '{:,.1f}M'.format(value * 1e-6)
# Set the x-axis label formatter
ax.xaxis.set_major_formatter(FuncFormatter(format_x_ticks))
lines = ecdf_plot.get_lines()
x_values = [line.get_xdata() for line in lines]
y_values = [line.get_ydata() for line in lines]
# Find the x-value at a given proportion for each hue category
for target_proportion in (0.3, 0.6, 0.5):
    x_values_at_proportion = [x[np.argmax(y >= target_proportion)] for x, y in zip(x_values, y_values)]
    print(target_proportion, x_values_at_proportion)
# x_values_at_proportion now holds the values for the last proportion (0.5), used below
plt.axhline(0.5, linestyle='--', linewidth=1, color='r')
plt.annotate('{:,.0f}'.format(x_values_at_proportion[0]),
xy = (x_values_at_proportion[0], 0.5),
xytext = (x_values_at_proportion[0]+400000, 0.35),
arrowprops = dict(facecolor = 'black', width = 0.2, headwidth = 10),
horizontalalignment = 'center')
plt.annotate('{:,.0f}'.format(x_values_at_proportion[2]),
xy = (x_values_at_proportion[2], 0.5),
xytext = (x_values_at_proportion[2]+350000, 0.28),
arrowprops = dict(facecolor = 'black', width = 0.2, headwidth = 10),
horizontalalignment = 'center')
ax.set_xlabel("\nNumber of Individuals",fontsize=12)
ax.set_ylabel("Proportion",fontsize=12)
sns.move_legend(ax, loc='upper left', bbox_to_anchor=(1, 1), title=None)
plt.savefig(
f'/Users/ching/Documents/work/03_CMUH/4_Manuscript/PRS/outfiles/240119_suppfig.png',
bbox_inches="tight",
dpi=150)
plt.show()
```

### Heatmap
https://zhuanlan.zhihu.com/p/35494575
example: https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-019-2744-2/figures/5
* normalization
with z-score: https://www.programsbuzz.com/article/python-seaborn-matrix-plots-cluster-map
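A minimal sketch of a z-score-normalized heatmap with seaborn's clustermap, on a built-in dataset:
```python=
import seaborn as sns
import matplotlib.pyplot as plt

iris = sns.load_dataset('iris').drop(columns='species')
# z_score=1 standardizes each column before clustering
sns.clustermap(iris, z_score=1, cmap='vlag', figsize=(7, 7))
plt.show()
```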
### Hierarchically-clustered Heatmap
https://www.geeksforgeeks.org/hierarchically-clustered-heatmap-in-python-with-seaborn-clustermap/
https://www.python-graph-gallery.com/404-dendrogram-with-heat-map/
https://cmdlinetips.com/2020/01/heatmaps-with-seaborns-clustermap/
example: https://stackoverflow.com/questions/60177861/plotting-annotated-heatmaps-clustermaps-with-multiple-legends-in-seaborn
example: https://www.chrisremmel.com/post/seaborn-color-labels/
workbook: https://www.cntofu.com/book/172/docs/31.md
* methods
https://medium.com/qiubingcheng/%E6%AD%90%E6%B0%8F%E8%B7%9D%E9%9B%A2%E8%88%87%E9%A4%98%E5%BC%A6%E7%9B%B8%E4%BC%BC%E5%BA%A6%E7%9A%84%E6%AF%94%E8%BC%83-c78163ad51b
https://www.twblogs.net/a/5cfe993fbd9eee14644ee0ce
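The links above compare Euclidean distance with cosine similarity; clustermap accepts these through its metric parameter, which maps onto scipy distances. A small sketch with toy vectors:
```python=
from scipy.spatial.distance import pdist

a = [[1, 2, 3], [2, 4, 6], [1, 0, 0]]
print(pdist(a, metric='euclidean'))
print(pdist(a, metric='cosine'))  # 0 for parallel vectors, e.g. rows 1 and 2
```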

* set color bar position
https://stackoverflow.com/questions/62882084/seaborn-clustermap-legend-overlap-with-figure
* add grouped color
https://datavizpyr.com/hierarchically-clustered-heatmap-with-seaborn-in-python/
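A minimal sketch of the grouped-color idea from the links: map each class to a color and pass it as row_colors so the clustered rows get a color strip:
```python=
import seaborn as sns

iris = sns.load_dataset('iris')
species = iris.pop('species')
# one color per class; passed alongside the data as a row annotation
lut = dict(zip(species.unique(), ['#66c2a5', '#fc8d62', '#8da0cb']))
sns.clustermap(iris, row_colors=species.map(lut), z_score=1, cmap='vlag')
```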
### Firth logistic regression
Paper: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4063169/
Paper: https://pubmed.ncbi.nlm.nih.gov/28295456/
Paper: https://www.sas.com/content/dam/SAS/support/en/sas-global-forum-proceedings/2020/4654-2020.pdf
knowledge: https://medium.datadriveninvestor.com/firths-logistic-regression-classification-with-datasets-that-are-small-imbalanced-or-separated-49d7782a13f1
R workbook: https://cran.r-project.org/web/packages/logistf/logistf.pdf
workbook: https://www.rdocumentation.org/packages/logistf/versions/1.24/topics/logistf
YouTube: https://www.youtube.com/watch?v=fVbrUz6V_uk
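The references above use R's logistf; for Python, the third-party firthlogist package offers a scikit-learn-style interface. A minimal sketch on toy data (the package name and API here are an assumption based on its documentation, not something used in these notes):
```python=
# pip install firthlogist  -- third-party package; API assumed from its README
import numpy as np
from firthlogist import FirthLogisticRegression

rng = np.random.default_rng(0)
X = rng.normal(size=(50, 3))     # toy design matrix
y = rng.integers(0, 2, size=50)  # toy binary outcome
fl = FirthLogisticRegression()
fl.fit(X, y)
print(fl.coef_, fl.intercept_)
```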
### Schedule plot
https://wurmen.github.io/Genetic-Algorithm-for-Job-Shop-Scheduling-and-NSGA-II/implementation%20with%20python/GA-jobshop/Example1.html
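The link draws a Gantt-style schedule chart for job-shop scheduling; a minimal matplotlib sketch of the same idea with made-up jobs:
```python=
import matplotlib.pyplot as plt

# (start, duration) bars per machine; toy data
schedule = {'Machine 1': [(0, 3), (4, 2)], 'Machine 2': [(1, 2), (3, 4)]}
fig, ax = plt.subplots(figsize=(6, 2.5))
for i, (machine, bars) in enumerate(schedule.items()):
    ax.broken_barh(bars, (i * 10, 8), facecolors=f'C{i}')
ax.set_yticks([i * 10 + 4 for i in range(len(schedule))])
ax.set_yticklabels(list(schedule))
ax.set_xlabel('Time')
plt.show()
```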
### swarmplot

https://seaborn.pydata.org/generated/seaborn.swarmplot.html
https://stackoverflow.com/questions/51826447/how-to-resize-a-seaborn-catplot
```python!
sns.catplot(data=PGS_sum_df[PGS_sum_df['AUROC'].notnull()], x="Broad Ancestry Category", y="AUROC", hue="contain_covar",kind="swarm",aspect=3)
```

### BOX plot
```python=
fig, ax = plt.subplots(figsize=(7,6))
ax = sns.boxplot(data=case_bim_bin_fig, x=x, y=y,order=order,\
width=0.3,showfliers=False,color='w',\
boxprops=dict(linewidth=1.2,edgecolor='black'),whiskerprops=dict(linewidth=1.2,color='black'),\
medianprops=dict(linewidth=1.7,color='orange'),capprops=dict(linewidth=1.2,color='black'))
add_stat_annotation(ax, data=case_bim_bin_fig, x=x, y=y,order=order,
box_pairs=[("(0.01%, 0.05%]", "(0.05%, 0.1%]"), \
("(0.01%, 0.05%]", "(0.1%, 0.5%]"), \
("(0.01%, 0.05%]", "(0.5%, 1%]")],
test='Mann-Whitney', text_format='star', loc='outside', verbose=2)
ax.yaxis.set_major_formatter(FuncFormatter(lambda y, _: '{:.2%}'.format(y)))
ax.grid(color = 'silver', linestyle = '--', linewidth = 1)
# ax.set(xlabel='Minor allele frequency in WGS (n=2170)', ylabel=f'{TAR}', title='')
plt.suptitle('')
plt.title('')
# plt.xlabel('Minor allele frequency in WGS (n=2,170)', fontsize=14,labelpad=15)
plt.ylabel(f'{TAR}', fontsize=14,labelpad=15)
ax.spines['bottom'].set_color('black')
ax.spines['top'].set_color('black')
ax.spines['right'].set_color('black')
ax.spines['left'].set_color('black')
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.tick_params(direction='out',bottom = 'on',left = 'on')
plt.rcParams['savefig.dpi'] = 300
```
### Density plot
```python=
comma = lambda y, _: format(int(y), ',')  # assumed helper: thousands-separated tick labels

def density_plot(a,c):
case_bim_bin_fig=case_bim_bin2[['SNP',a]]
# case_bim_bin_fig=case_bim_bin_fig.loc[case_bim_bin_fig['MAF_WGS']>0]
case_bim_bin_fig=pd.merge(case_bim_bin_fig,c,left_on='SNP',right_on='0',how='left')
case_bim_bin_fig['TF'] = np.where(case_bim_bin_fig['SNP']==case_bim_bin_fig['0'], 'True', 'False')
# case_bim_bin_fig=case_bim_bin_fig.loc[case_bim_bin_fig['TF']=='True']
x1=list(case_bim_bin_fig[case_bim_bin_fig['TF'] == 'True'][a])
x2=list(case_bim_bin_fig[case_bim_bin_fig['TF'] == 'False'][a])
fig, ax = plt.subplots(figsize=(12, 6))
# ax=case_bim_bin_fig[b].plot(kind='density',linewidth=3)
# sns.kdeplot(data=case_bim_bin_fig, x=a, hue="TF",multiple="stack")
# sns.kdeplot(case_bim_bin_fig[a], shade=True, color="r")
# colors = ['blue', 'red']
counts, bins, patches=ax.hist(x1, bins = int(10),
# color = colors,
edgecolor = 'black')
# sample = np.hstack((case_bim_bin_fig[b], case_bim_bin_fig[a]))
# ax.plot(sample, [0.01]*len(sample), '|', color='k', markersize=12)
# ax.plot(case_bim_bin_fig['MAF_WGS'], density(case_bim_bin_fig['MAF_WGS']))
plt.grid(linestyle = '--')
# plt.xlim(-0.004, 0.012)
# plt.legend(bbox_to_anchor=(1.05, 1.0), loc='upper left')
ax.spines['bottom'].set_color('black')
ax.spines['top'].set_color('black')
ax.spines['right'].set_color('black')
ax.spines['left'].set_color('black')
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.title(a, fontsize=16)
# plt.xlabel('Minor allele frequency in WGS (n=2,170)', fontsize=14,labelpad=15)
plt.ylabel('Count', fontsize=14,labelpad=15)
ax.xaxis.set_major_formatter(FuncFormatter(lambda x, _: '{:.2%}'.format(x)))
ax.yaxis.set_major_formatter(FuncFormatter(comma))
plt.tick_params(direction='out',bottom = 'on',left = 'on')
plt.rcParams['savefig.dpi'] = 300
bin_x_centers = 0.5 * np.diff(bins) + bins[:-1]
bin_y_centers = ax.get_yticks()[1] * 0.25
for i in range(len(bins)-1):
bin_label = "{0:,}".format(counts[i]) + " ({0:,.2f}%)".format((counts[i]/counts.sum())*100)
plt.text(bin_x_centers[i], bin_y_centers, bin_label, rotation=90,rotation_mode='anchor')
plt.tight_layout()
plt.show()
density_plot('MAF_WGS',APT_list)
density_plot('MAF_WGS',ADV_list)
```
### Distribution plot
```python=
palette = {1: "#30A9DE", 2:'#ef5285'}
ax1=sns.displot(data=best_practice, x="PRS", hue="caco", kind="kde",
palette=palette,common_norm=False, common_grid=True, height=3.5, aspect=1.25,
fill=True, alpha=.1)
ax1.map(plt.axvline,x=0,ls='--', c='gray')
plt.show()
sns.displot(data=best_practice, x="PRS", hue="caco", kind="ecdf",
palette=palette, height=3.5, aspect=1.25)
plt.show()
```
* Case-control comparison
```python=
sns.displot(merge_info_all, x="merge_age", hue="Sex", bins=30, fill=True,palette=["#0080FF", "#FF3399"],col='pheno',facet_kws={'sharey': False})
```

```python=
tIDcode = tIDcode[tIDcode['Age_bin']!='nan']
sns.displot(tIDcode, x="Age", hue="Sex", kind="kde", fill=True,palette=["#FF3399", "#0080FF"])
plt.savefig('/media/volume1/amy/TPMI/TMPI_29W_aged.pdf',bbox_inches="tight", dpi=300)
plt.savefig('/media/volume1/amy/TPMI/TMPI_29W_aged.png',bbox_inches="tight", dpi=300)
plt.show()
```

```python=
# Define the color palette
palette = {"Male": "#0080FF", "Female": "#FF3399"}
# Create the displot
g = sns.displot(df, x="DiagAge", hue="sex", bins=30, fill=True, palette=palette, facet_kws={'sharey': True}, col='sex')
g.set_titles("{col_name}", size=10)
# Get the axes of the subplots
axes = g.axes.flat
# Iterate over the subplots
for ax in axes:
# Get the gender of the subplot
gender = ax.get_title()
# Filter the DataFrame based on the gender
filtered_df = df[df['sex'] == gender]
# Calculate the count of each gender
count = filtered_df.shape[0]
# Add the count as text annotation on the plot
props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)
ax.text(0.8, 0.9, f"Count: {count}", transform=ax.transAxes, ha='center', va='top', fontsize=10, bbox=props)
# Show the plot
plt.show()
```

### Pyramid Chart
```python=
bins = [0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100, 180]
agelable = ["0-4", "5-9", "10-14", "15-19", "20-24", "25-29", "30-34", "35-39", "40-44", "45-49", "50-54", "55-59", "60-64", "65-69", "70-74", "75-79", "80-84", "85-89", "90-94", "95-99", "100+"]
df['group'] = pd.cut(df["DiagAge"], bins=bins, labels=agelable)
population_df = df.pivot_table(
values='patientID',
index='group',
columns='sex',
aggfunc=np.size,
margins=False,
fill_value=0
).reset_index()
population_df["Female_Left"] = 0
population_df["Female_Width"] = population_df["Female"]
population_df["Male_Left"] = -population_df["Male"]
population_df["Male_Width"] = population_df["Male"]
female_color = "#ee7a87"
male_color = "#4682b4"
fig, ax = plt.subplots(figsize=(9, 6))
plt.barh(y=population_df["group"], width=population_df["Female_Width"],color="#ee7a87", label="Female")
plt.barh(y=population_df["group"], width=population_df["Male_Width"],
left=population_df["Male_Left"],
color="#4682b4", label="Male")
for idx in range(len(population_df)):
plt.text(x=population_df["Male_Left"][idx]-0.1, y=idx, s=" {} ".format(population_df["Male"][idx]),ha="right", va="center",fontsize=10, color="#4682b4")
plt.text(x=population_df["Female_Width"][idx]+0.1, y=idx, s=" {} ".format(population_df["Female"][idx]),ha="left", va="center",fontsize=10, color="#ee7a87")
plt.xlim(-100,50)
ticks = ax.get_xticks()
ax.set_xticklabels([int(abs(tick)) for tick in ticks])
plt.xlabel("Count", fontsize=14, fontweight="bold")
plt.ylabel("Age Range", fontsize=14, fontweight="bold")
plt.legend(bbox_to_anchor=(1.05, 1.0), loc='upper left')
plt.title("Stroke Population Pyramid Chart", loc="center", pad=10, fontsize=14,fontweight="bold")
```

```python=
age = np.array(
["0 - 4", "5 - 9", "10 - 14", "15 - 19", "20 - 24", "25 - 29", "30 - 34", "35 - 39",
"40 - 44", "45 - 49", "50 - 54", "55 - 59", "60 - 64", "65 - 69", "70 - 74",
"75 - 79", "80 - 84", "85 - 89", "90 - 94", "95 - 99", "> 100"])
m = np.array(
[1915887, 2100931, 2494484, 2464805, 2361297, 2529633, 2669927, 2754129, 2753282,
2552531, 2211649, 1697221, 1311024, 902675, 722246, 482686, 273915, 108639, 35867,
10965, 1238])
f = np.array(
[1823981, 1980712, 2369795, 2435784, 2330223, 2562964, 2724990, 2860720, 2918730,
2713534, 2376384, 1869867, 1454373, 1030677, 885393, 640698, 388748, 172302, 64170,
19868, 2711])
x = np.arange(age.size)
# Modified before the function because you are shifting the bars left/right
tick_val = np.array([-3000000, -2000000, -1000000, 1000000, 2000000, 3000000])
shift = 300000
tick_val[0:3] -= shift
tick_val[3:] += shift
# Modified inside the function
plt.barh(x, -m, alpha=.75, height=.75, left=-shift, align='center' , color="deepskyblue")
plt.barh(x, f, alpha=.75, height=.75, left = shift, align='center', color="pink")
for i, j in enumerate(age):
if i==0 or i==1:
plt.text(-150000, x[i]-0.2, j, fontsize=10)
else:
plt.text(-230000, x[i]-0.2, j, fontsize=10)
```

### Hist plot
```python=
bins = [0, 0.0001, 0.0005, 0.001, 0.005, 0.01]
# data =   # (left unfinished in the original notes)
fig, ax = plt.subplots(figsize=(12, 6))
# # ax=case_bim_bin_fig[b].plot(kind='density',linewidth=3)
# # sns.kdeplot(data=case_bim_bin_fig, x=a, hue="TF",multiple="stack")
# # sns.kdeplot(case_bim_bin_fig[a], shade=True, color="r")
colors = ['blue', 'red','green']
ax.hist([list(case_bim_bin_fig['MAF_WGS']),
list(case_bim_bin_fig[case_bim_bin_fig['TF_APT'] == 'True']['MAF_WGS']),
list(case_bim_bin_fig[case_bim_bin_fig['TF_ADV'] == 'True']['MAF_WGS'])], bins = bins,
color = colors, edgecolor = 'black')
# # sample = np.hstack((case_bim_bin_fig[b], case_bim_bin_fig[a]))
# # ax.plot(sample, [0.01]*len(sample), '|', color='k', markersize=12)
# # ax.plot(case_bim_bin_fig['MAF_WGS'], density(case_bim_bin_fig['MAF_WGS']))
plt.grid(linestyle = '--')
plt.xscale('log')
# # plt.xlim(-0.004, 0.012)
# # plt.legend(bbox_to_anchor=(1.05, 1.0), loc='upper left')
ax.spines['bottom'].set_color('black')
ax.spines['top'].set_color('black')
ax.spines['right'].set_color('black')
ax.spines['left'].set_color('black')
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
# plt.title(a, fontsize=16)
# # plt.xlabel('Minor allele frequency in WGS (n=2,170)', fontsize=14,labelpad=15)
plt.ylabel('Count', fontsize=14,labelpad=15)
ax.xaxis.set_major_formatter(FuncFormatter(lambda x, _: '{:.2%}'.format(x)))
ax.yaxis.set_major_formatter(FuncFormatter(comma))
plt.tick_params(direction='out',bottom = 'on',left = 'on')
plt.rcParams['savefig.dpi'] = 300
plt.tight_layout()
plt.show()
```
```python=
fig, ax = plt.subplots(figsize=(12, 6))
# ax=case_bim_bin_fig.MAF_WGS.plot(kind='density',linewidth=3)
# plt.xlim(-0.1, 1.0)
sample = np.hstack((case_bim_bin_fig['MAF_WGS'], case_bim_bin_fig[f'{TAR}']))
ax.plot(sample, [0.1]*len(sample), '|', color='k', markersize=12)
ax.hist(case_bim_bin_fig['MAF_WGS'],1000, ec='red', fc='none', lw=1.5, histtype='step', label='Dist A')
# ax.plot(case_bim_bin_fig['MAF_WGS'], density(case_bim_bin_fig['MAF_WGS']))
plt.grid(linestyle = '--')
# plt.xlim(-0.1, 1.0)
# plt.legend(bbox_to_anchor=(1.05, 1.0), loc='upper left')
ax.spines['bottom'].set_color('black')
ax.spines['top'].set_color('black')
ax.spines['right'].set_color('black')
ax.spines['left'].set_color('black')
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.title(f'{TAR}', fontsize=16)
plt.xlabel('Minor allele frequency in WGS (n=2,170)', fontsize=14,labelpad=15)
plt.ylabel('Density', fontsize=14,labelpad=15)
# ax.xaxis.set_major_formatter(FuncFormatter(lambda x, _: '{:.2%}'.format(x)))
plt.tick_params(direction='out',bottom = 'on',left = 'on')
plt.rcParams['savefig.dpi'] = 300
plt.tight_layout()
plt.show()
```
```python=
# p = sns.displot(data=PGS_sum_df[PGS_sum_df['AUROC'].notnull()], x='AUROC', stat='percent', col='contain_covar', height=3)
fg = sns.displot(data=PGS_sum_df[(PGS_sum_df['AUROC'].notnull())], x='AUROC', stat='percent', height=3.5, aspect=1.25, bins=[0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0])
for ax in fg.axes.ravel():
# add annotations
for c in ax.containers:
# custom label calculates percent and add an empty string so 0 value bars don't have a number
labels = [f'{w:0.2f}%' if (w := v.get_height()) > 0 else '' for v in c]
print(labels)
percentile_list = pd.DataFrame({'AUC': ['(0.0, 0.1]', '(0.1, 0.2]', '(0.2, 0.3]', '(0.3, 0.4]','(0.4, 0.5]','(0.5, 0.6]','(0.6, 0.7]', '(0.7, 0.8]', '(0.8, 0.9]', '(0.9, 1.0]'],'percentage': labels})
ax.bar_label(c, labels=labels, label_type='edge', fontsize=8, rotation=90, padding=2)
ax.margins(y=0.2)
plt.show()
```

```python=
# p = sns.displot(data=PGS_sum_df[PGS_sum_df['AUROC'].notnull()], x='AUROC', stat='percent', col='contain_covar', height=3)
fg = sns.displot(data=PGS_sum_df[(PGS_sum_df['AUROC'].notnull())], x='Broad Ancestry Category', stat='percent', height=3.5, aspect=1.25, bins=[0,3,5,10,18])
fg.set_xticklabels(rotation=90)
for ax in fg.axes.ravel():
# add annotations
for c in ax.containers:
# custom label calculates percent and add an empty string so 0 value bars don't have a number
labels = [f'{w:0.3f}%' if (w := v.get_height()) > 0 else '' for v in c]
print(labels)
# percentile_list = pd.DataFrame({'AUC': ['(0.0, 0.1]', '(0.1, 0.2]', '(0.2, 0.3]', '(0.3, 0.4]','(0.4, 0.5]','(0.5, 0.6]','(0.6, 0.7]', '(0.7, 0.8]', '(0.8, 0.9]', '(0.9, 1.0]'],'percentage': labels})
ax.bar_label(c, labels=labels, label_type='edge', fontsize=8, rotation=90, padding=2)
ax.margins(y=0.2)
plt.show()
```

### Bar chart
```python=
bins = [0,0.0001, 0.0005, 0.001, 0.005, 0.01]
data1=list(case_bim_bin_fig[case_bim_bin_fig['TF_APT'] == 'True']['MAF_WGS'])
data2=list(case_bim_bin_fig[case_bim_bin_fig['TF_APT'] == 'False']['MAF_WGS'])
# print(data1)
hist1, bin_edges1 = np.histogram(data1,bins)
hist2, bin_edges2 = np.histogram(data2,bins)
# Setting the positions and width for the bars
pos = list(range(len(hist1)))
width = 0.3
print(range(len(hist1)))
print(range(len(hist2)))
fig, ax = plt.subplots(figsize=(12, 6))
ax.bar([p - width*0.5 for p in pos], hist1, width, edgecolor='black',linewidth=1.2,
color='black',error_kw=dict(ecolor='black',elinewidth=0.5,lolims=True,marker='o'))
ax.bar([p + width*0.5 for p in pos], hist2, width, edgecolor='black',linewidth=1.2,
color='white',hatch='\\\\',error_kw=dict(ecolor='black',elinewidth=0.5,lolims=True,marker='o'))
# df2=plt.bar([p - width*1 for p in pos], hist2, width, edgecolor='black',linewidth=1.2,
# label=case_bim_bin_fig['Categories'][1],color='black',error_kw=dict(ecolor='black',elinewidth=0.5,lolims=True,marker='o'))
# Set the xticklabels to a string that tells us what the bin edges were
ax.set_xticklabels(['{} - {}'.format(bins[i],bins[i+1]) for i,j in enumerate(hist1)])
plt.tight_layout()
```
```python=
Disease_dis = ICD_diag_CRC_all.pivot_table(
values=['PatientNo'],
index=['DiagName'],
aggfunc=lambda x: len(x.unique())
).reset_index()
Disease_dis.columns =['Disease', 'Count']
Disease_dis = Disease_dis.sort_values(by=['Count'],ascending=False)
fig, ax = plt.subplots(figsize=(4, 6))
sns.barplot(x="Count", y="Disease", data=Disease_dis.head(20),
label="Count", color="black")
```

```python=
plt.figure(figsize=(25,5))
sns.barplot(data = pheno_count_map[pheno_count_map['case']>5000].sort_values(by=['case'],ascending=False), x="phenotype", y='case')
plt.xticks(rotation=90,ha='center')
plt.savefig('/media/volume1/amy/TPMI/Phecode_samplecount.png', bbox_inches = 'tight', dpi = 300)
plt.savefig('/media/volume1/amy/TPMI/Phecode_samplecount.pdf', bbox_inches = 'tight', dpi = 300)
plt.show()
```

```python=
category = 'mental disorders'
fig, ax = plt.subplots(figsize=(25,5))
df_draw = pheno_count_map[pheno_count_map['category']==category].sort_values(by=['case'],ascending=False)
clrs = ['grey' if (x < 1000) else 'red' for x in df_draw['case'] ]
props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)
sns.barplot(
data = df_draw,
x="phenotype",
y='case',
palette=clrs)
plt.xticks(rotation=90,ha='center')
ax.text(.8, .85, "category = {}".format(category),transform=ax.transAxes,size=16,bbox=props)
plt.savefig(f'/media/volume1/amy/TPMI/phecode_result/Phecode_samplecount_{category}.png', bbox_inches = 'tight', dpi = 300)
plt.savefig(f'/media/volume1/amy/TPMI/phecode_result/Phecode_samplecount_{category}.pdf', bbox_inches = 'tight', dpi = 300)
plt.show()
```

```python=
PGS_sum_df['AUROC_TF']=PGS_sum_df['AUROC'].notnull()
PGS_sum_df['HR_TF']=PGS_sum_df['Hazard Ratio (HR)'].notnull()
PGS_sum_df['OR_TF']=PGS_sum_df['Odds Ratio (OR)'].notnull()
table = PGS_sum_df.pivot_table(
values=['PGS Performance Metric (PPM) ID'],
columns=['AUROC_TF','HR_TF','OR_TF'],
aggfunc=lambda x: len(x.unique()),
dropna=False
)
splot = sns.barplot(data=table,palette=['darkorange','darkorange','darkorange','darkorange','cornflowerblue','cornflowerblue','cornflowerblue','cornflowerblue'])
splot.set_ylim(0, 11000)
# plt.xticks(rotation=90, ha='center')
plt.tick_params(
axis='x', # changes apply to the x-axis
which='both', # both major and minor ticks are affected
bottom=True, # ticks along the bottom edge are off
top=False, # ticks along the top edge are off
labelbottom=False) # labels along the bottom edge are off
splot.set_ylabel("Number of performance records",fontsize=12)
for p in splot.patches:
splot.annotate(format(p.get_height(), '.0f'), (p.get_x() + p.get_width() / 2., p.get_height()), ha = 'center', va = 'center', xytext = (0, 10), textcoords = 'offset points')
plt.savefig(
f'Output/Barplot_of_PGS_catalog.png',
bbox_inches="tight", dpi=300)
```

```python=
df_stack = (PGS_score
.groupby("percentile")["Pheno"]
.value_counts(normalize=True)
.mul(100)
.round(2)
.unstack())
df_stack
fig, ax = plt.subplots(figsize = (9,9))
ax.bar(df_stack.index, df_stack["Case"], label = "Case", width = 0.9, color = "#FF5858")
ax.bar(df_stack.index, df_stack["Control"], bottom = df_stack.Case, label = "Control", width = 0.9, color = "#00ABB3")
# Add labels
for c in ax.containers:
labels = [str(round(v.get_height(), 2)) + "%" if v.get_height() > 0 else '' for v in c]
ax.bar_label(c,
label_type='center',
labels = labels,
size = 10) # add a container object "c" as first argument
# Remove spines
for s in ["top", "right"]:
ax.spines[s].set_visible(False)
# Add labels
ax.tick_params(labelsize = 12, labelrotation = 0)
ax.set_xticks(df_stack.index)
ax.set_ylabel("Percentage (%)", size = 12)
ax.set_xlabel("Percentile", size = 12)
plt.legend(bbox_to_anchor=(1.05, 1.0), loc='upper left',title="Pheno")
ax.spines['top'].set_visible(True)
ax.spines['right'].set_visible(True)
```

```python=
from scipy.stats import norm
from sklearn.datasets import make_blobs
import colorcet as cc
blobs, labels = make_blobs(n_samples=1000, centers=2, center_box=(-100, 100))
palette = sns.color_palette(cc.glasbey_hv, n_colors=10).as_hex()
PRS_name=plot_file.split("/")[8]
g = sns.displot(
np.array(PGS_score[PGS_score['Pheno']=='Case']['SCORE']),
bins = 40,
kde=False,
stat='percent',
height=3.5, aspect=1,
# hist_kws=dict(edgecolor="black", linewidth=1,alpha=0.9)
)
count_patient=PGS_score[PGS_score['Pheno']=='Case']['IID'].nunique()
print(count_patient)
# plt.text(4.8, 40,'n = {:.0f}'.format(count_patient),fontsize=10)
plt.text(2, 6.3,'n = {:.0f}'.format(count_patient),fontsize=10)
# for i in bins_SCORE:
# plt.axvline(x=i, linestyle='--')
for ax in g.axes.flat:
for rectangle in ax.patches:
if (rectangle.get_x() >= bins_SCORE[0])&(rectangle.get_x() < bins_SCORE[1]):
rectangle.set_facecolor(palette[0])
elif (rectangle.get_x() >= bins_SCORE[1])&(rectangle.get_x() < bins_SCORE[2]):
rectangle.set_facecolor(palette[1])
elif (rectangle.get_x() >= bins_SCORE[2])&(rectangle.get_x() < bins_SCORE[3]):
rectangle.set_facecolor(palette[2])
elif (rectangle.get_x() >= bins_SCORE[3])&(rectangle.get_x() < bins_SCORE[4]):
rectangle.set_facecolor(palette[3])
elif (rectangle.get_x() >= bins_SCORE[4])&(rectangle.get_x() < bins_SCORE[5]):
rectangle.set_facecolor(palette[4])
elif (rectangle.get_x() >= bins_SCORE[5])&(rectangle.get_x() < bins_SCORE[6]):
rectangle.set_facecolor(palette[5])
elif (rectangle.get_x() >= bins_SCORE[6])&(rectangle.get_x() < bins_SCORE[7]):
rectangle.set_facecolor(palette[6])
elif (rectangle.get_x() >= bins_SCORE[7])&(rectangle.get_x() < bins_SCORE[8]):
rectangle.set_facecolor(palette[7])
elif (rectangle.get_x() >= bins_SCORE[8])&(rectangle.get_x() < bins_SCORE[9]):
rectangle.set_facecolor(palette[8])
elif (rectangle.get_x() >= bins_SCORE[9]):
rectangle.set_facecolor(palette[9])
g.set_axis_labels("Polygenic risk score", "Density of disease cases (%)\n",fontsize=12)
plt.savefig(
f'Output/Displot_of_cases_{PRS_name}.png',
bbox_inches="tight", dpi=300)
plt.show()
```

### Violin plot
```python=
# Compare age distributions
plt.figure(figsize=(3,4))
ax = sns.violinplot(data=Target_codebook_inTPMI, x='Disease',y="Age_now", hue="SexName",
hue_order=['Female','Male'],palette=['#FF7777','#3DB2FF'], width=0.5)
add_stat_annotation(ax, data=Target_codebook_inTPMI,x='Disease', y="Age_now", hue="SexName",
hue_order=['Female','Male'],
box_pairs=[(('CRC','Female'),('CRC','Male'))
],
test='t-test_ind', text_format='simple', loc='inside', verbose=2)
plt.legend(loc='upper left', bbox_to_anchor=(1.03, 1))
# # ax.grid(color = 'silver', linestyle = '--', linewidth = 1)
# plt.xticks(rotation=30,ha='right' )
# plt.savefig('./Dementia_project/Output/TPMI/Diagnosis_age.pdf',bbox_inches="tight", dpi=300)
# plt.savefig('./Dementia_project/Output/TPMI/Diagnosis_age.png',bbox_inches="tight", dpi=300)
plt.show()
```
```python=
from statannot import add_stat_annotation
from statannotations.Annotator import Annotator
fig, ax = plt.subplots(figsize=(4,4))
ax = sns.violinplot(
data=PGS_for_plot,
y='Used to build model',
x='Number of Individuals',
width=0.9,showfliers=True,
palette=['blue','green','orange'],
# boxprops=dict(linewidth=1.2,edgecolor='black'),
# whiskerprops=dict(linewidth=1.2,color='black'),
# medianprops=dict(linewidth=1.7,color='orange'),
# capprops=dict(linewidth=1.2,color='black')
)
# Define a function for formatting x-axis labels
def format_x_ticks(value, pos):
return '{:,.1f}M'.format(value * 1e-6)
# Set the x-axis label formatter
ax.xaxis.set_major_formatter(FuncFormatter(format_x_ticks))
# add_stat_annotation(
# ax,
# data=PGS_for_plot,
# y='Used to build model',
# x='Number of Individuals',
# box_pairs=[('Original','Passed to the next step'),
# ('Original','Selected for optimized model'),
# ('Passed to the next step','Selected for optimized model')],
# test='Mann-Whitney',
# text_format='simple',
# loc='outside',
# verbose=0.5
# )
ax.set_ylabel("Used to build model",fontsize=12)
ax.set_xlabel("\nNumber of Individuals",fontsize=12)
# plt.yticks(rotation=30,ha='right')
plt.savefig(
f'/Users/ching/Documents/work/03_CMUH/4_Manuscript/PRS/outfiles/240118_suppfig2-2.png',
bbox_inches="tight",
dpi=150)
plt.show()
```

```python=
x = "Subtype"
y = "DiagAge"
hue = "SexName"
plt.figure(figsize=(7,4))
ax = sns.boxplot(data=test, x=x, y=y, hue=hue,palette=['#FF7777','#3DB2FF'])
add_stat_annotation(ax, data=test, x=x, y=y, hue=hue,
box_pairs=[(("Alzheimer\'s disease", "Female"), ("Alzheimer\'s disease", "Male")),
(("Lewy Body", "Female"), ("Lewy Body", "Male")),
(("Frontotemporal dementia", "Female"), ("Frontotemporal dementia", "Male")),
(("Vascular dementia", "Female"), ("Vascular dementia", "Male")),
(("Other cerebral degeneration", "Female"), ("Other cerebral degeneration", "Male")),
(("Non-specific dementias", "Female"), ("Non-specific dementias", "Male")),
(("Secondary Parkinsonism", "Female"), ("Secondary Parkinsonism", "Male")),
(("Not included in study definition of dementia", "Female"), ("Not included in study definition of dementia", "Male")),
(("other comorbidity", "Female"), ("other comorbidity", "Male"))
],
test='t-test_ind', text_format='star', loc='inside', verbose=2)
plt.legend(loc='upper left', bbox_to_anchor=(1.03, 1))
# ax.grid(color = 'silver', linestyle = '--', linewidth = 1)
plt.xticks(rotation=30,ha='right' )
plt.savefig('./Dementia_project/Output/TPMI/Diagnosis_age.pdf',bbox_inches="tight", dpi=300)
plt.savefig('./Dementia_project/Output/TPMI/Diagnosis_age.png',bbox_inches="tight", dpi=300)
plt.show()
```

```python=
# define custom plotting function
def custom_boxplot(*args, **kwargs):
ax = sns.violinplot(*args, **kwargs,palette=['#2EC4B6','#EE7785'])
statannot.add_stat_annotation(
ax, plot='boxplot',
data=kwargs['data'], x=kwargs['x'], y=kwargs['y'],
box_pairs=[('Control', 'LAA')],loc='outside',
test='Mann-Whitney', text_format='simple'
)
# create plot
g = sns.FacetGrid(
df_m1, col="item",sharey=False, sharex=False,
col_wrap=6, height=3, aspect=1,margin_titles=True)
g.map_dataframe(custom_boxplot, x='Group', y='value')
g.set_titles("{col_name}\n\n")
g.fig.subplots_adjust(hspace =0.5)
g.add_legend()
plt.savefig(
'LAA_project/subfigure_1.png',
bbox_inches="tight", dpi=300)
plt.show()
```

```python=
# single
for i in item_order:
x = "Age_bin"
y = "Age"
hue = "Sex"
plt.figure(figsize=(4,4))
ax = sns.violinplot(data=tIDcode[tIDcode['Age_bin']==i], x=x, y=y, hue=hue,palette=['#FF7777','#3DB2FF'],order=item_order)
add_stat_annotation(ax, data=tIDcode[tIDcode['Age_bin']!='nan'], x=x, y=y, hue=hue,order=item_order,comparisons_correction='bonferroni',
box_pairs=[((i, "Female"), (i, "Male")),
],
test='t-test_ind', text_format='star', loc='inside', verbose=2,line_offset_to_box=0.3)
plt.legend(loc='upper left', bbox_to_anchor=(1.03, 1))
# ax.grid(color = 'silver', linestyle = '--', linewidth = 1)
# plt.xticks(rotation=30,ha='right' )
# plt.savefig('/media/volume1/amy/TPMI/TMPI_29W_age.pdf',bbox_inches="tight", dpi=300)
# plt.savefig('/media/volume1/amy/TPMI/TMPI_29W_age.png',bbox_inches="tight", dpi=300)
plt.show()
```

### Venn plot
```python!
from matplotlib_venn import venn2, venn2_circles, venn3, venn3_circles

ICD_diag_cancer = ICD_diagnosis_diagage[(ICD_diagnosis_diagage['DiagName'].notnull())&
                                        (ICD_diagnosis_diagage['DiagName'].str.contains('惡性腫瘤'))]
set_CRC=set(ICD_diag_CRC.PatientNo.unique().tolist())
set_CANCER=set(ICD_diag_cancer.PatientNo.unique().tolist())
difference = list(set(set_CRC) - set(set_CANCER))
same = list(set_CRC & set_CANCER)  # overlap between the two patient sets
join = list(set(set_CRC) | set(set_CANCER))
set_join = set(join)
print('CRC:',len(set_CRC),'CANCER:',len(set_CANCER),'joined:',len(same),'not in ICD:',len(difference))
v=venn2([set_CRC, set_CANCER], ('CRC', 'Cancer'))
c=venn2_circles([set_CRC, set_CANCER], linestyle='dashed', linewidth=1, color="grey")
plt.show()
v=venn3([set_CRC, set_CANCER, set_join], ('CRC', 'Cancer','TPMI'))
c=venn3_circles([set_CRC, set_CANCER, set_join], linestyle='dashed', linewidth=1, color="grey")
plt.show()
```
```python!
set_unimputed = set(unimputed)
set_imputed = set(imputed)
difference = list(set(set_unimputed) - set(set_imputed))
same = list(set(set_unimputed) & set(set_imputed))
# join = list(set(set_CRC) | set(set_CANCER))
# # set_join = set(join)
print('unimputed:',len(unimputed),'imputed:',len(imputed))
print('joined:',len(same),'not in unimputed data:',len(difference))
v=venn2([set_unimputed, set_imputed], ('unimputed', 'imputed'))
c=venn2_circles([set_unimputed, set_imputed], linestyle='dashed', linewidth=1, color="grey")
plt.show()
```

```python=
v=venn3([set_17W_sig, set_29W_sig_un, set_29W_sig], ('17W sig', '29W sig + uncertain','29W sig'))
c=venn3_circles([set_17W_sig, set_29W_sig_un, set_29W_sig], linestyle='dashed', linewidth=1, color="grey")
plt.annotate(',\n'.join(set_17W_sig-set_29W_sig_un-set_29W_sig), xy=v.get_label_by_id('100').get_position() +
np.array([-0.04, 0]), xytext=(-130,-120), ha='right',
textcoords='offset points',
bbox=dict(boxstyle='round,pad=0.5', fc='lightgray', alpha=0.9),
arrowprops=dict(arrowstyle='->',
connectionstyle='arc',color='black'))
plt.annotate(',\n'.join(set_17W_sig&set_29W_sig_un-set_29W_sig), xy=v.get_label_by_id('110').get_position() +
np.array([-0.02, 0.02]), xytext=(-140,10), ha='right',
textcoords='offset points',
bbox=dict(boxstyle='round,pad=0.5', fc='lightgray', alpha=0.9),
arrowprops=dict(arrowstyle='->',
connectionstyle='arc',color='black'))
plt.annotate(',\n'.join(set_17W_sig&set_29W_sig_un&set_29W_sig), xy=v.get_label_by_id('111').get_position() +
np.array([0.02, 0]), xytext=(170,20), ha='left',
textcoords='offset points',
bbox=dict(boxstyle='round,pad=0.5', fc='lightgray', alpha=0.9),
arrowprops=dict(arrowstyle='->',
connectionstyle='arc',color='black'))
plt.annotate(',\n'.join(set_29W_sig_un&set_29W_sig-set_17W_sig), xy=v.get_label_by_id('011').get_position() +
np.array([0.02, 0]), xytext=(120,-30), ha='left',
textcoords='offset points',
bbox=dict(boxstyle='round,pad=0.5', fc='lightgray', alpha=0.9),
arrowprops=dict(arrowstyle='->',
connectionstyle='arc',color='black'))
plt.annotate(',\n'.join(set_29W_sig_un-set_17W_sig-set_29W_sig), xy=v.get_label_by_id('010').get_position() +
np.array([0.02, 0]), xytext=(160,-90), ha='left',
textcoords='offset points',
bbox=dict(boxstyle='round,pad=0.5', fc='lightgray', alpha=0.9),
arrowprops=dict(arrowstyle='->',
connectionstyle='arc',color='black'))
plt.savefig('/media/volume1/amy/TPMI/29W_fGWAS_result/Alzheimer_EEG_SNPcompare.png',dpi=300,bbox_inches="tight")
plt.savefig('/media/volume1/amy/TPMI/29W_fGWAS_result/Alzheimer_EEG_SNPcompare.pdf',dpi=300,bbox_inches="tight")
plt.show()
```

```python=
# four groups
import venn
set_AD=set(dementia_df[dementia_df['Alzheimer\'s disease']==2.0]['id'].unique().tolist())
set_senile=set(dementia_df[dementia_df['Senile dementia']==2.0]['id'].unique().tolist())
set_vascular=set(dementia_df[dementia_df['Vascular dementia']==2.0]['id'].unique().tolist())
set_cerebral=set(dementia_df[dementia_df['Dementia with cerebral degenerations']==2.0]['id'].unique().tolist())
# v=venn3([set_AD, set_senile, set_vascular], ('Alzheimer\'s disease', 'Senile dementia','Vascular dementia'))
# c=venn3_circles([set_AD, set_senile, set_vascular], linestyle='dashed', linewidth=1, color="grey")
# plt.show()
labels = venn.get_labels([set_AD, set_cerebral, set_senile, set_vascular], fill=['number'])
fig, ax = venn.venn4(labels, names=['Alzheimer\'s disease','Dementia with cerebral degenerations','Senile dementia','Vascular dementia'])
fig.show()
```
### Bar chart with a broken (wavy) axis
```python=
from matplotlib.path import Path
import matplotlib.patches as mpatches

CLNSIG_count=ANNOVAR_g[ANNOVAR_g['CLNALLELEID'].notnull()].pivot_table(
values=['avsnp150'],
columns=['filename'],
index=['Class'],
aggfunc=lambda x: len(x.unique()),
)
CLNSIG_count=CLNSIG_count.avsnp150
order=['other','not_provided','Uncertain_significance','protective','Benign','Benign/Likely_benign','Likely_benign','Conflicting_interpretations_of_pathogenicity','risk_factor','Likely_pathogenic','Pathogenic/Likely_pathogenic','Pathogenic']
CLNSIG_count = CLNSIG_count.reindex(order)
display(CLNSIG_count)
# Create the canvas: three x-axis segments sharing one y-axis
fig, ax = plt.subplots(ncols=3, figsize=(5,3), sharey=True,
gridspec_kw={'width_ratios': (2,1,1)} )
fig.subplots_adjust(wspace=0.0)
CLNSIG_count.plot.barh(ax = ax[0],color={"imputed": "#3399FF", "unimputed": "#FFB266"})
CLNSIG_count.plot.barh(ax = ax[1],color={"imputed": "#3399FF", "unimputed": "#FFB266"})
CLNSIG_count.plot.barh(ax = ax[2],color={"imputed": "#3399FF", "unimputed": "#FFB266"})
# Set the visible x-range and tick spacing of each segment
ax[0].set_xlim(0,450) # this segment shows 0-450
ax[0].set_xticks(np.arange(0,400+10,100)) # tick positions
ax[1].set_xlim(1000,7500)
ax[1].set_xticks(np.arange(3000,6000+1000,3000))
ax[2].set_xlim(22500,25000)
ax[2].set_xticks(np.arange(23300,24500+100,1200))
# Hide the borders between adjacent segments
ax[0].spines['right'].set_visible(False)
ax[1].spines['left'].set_visible(False)
ax[1].spines['right'].set_visible(False)
ax[2].spines['left'].set_visible(False)
# Hide the y tick marks/labels on the inner segments
ax[1].tick_params(axis='y', which='both', left=False, labelleft=False)
ax[2].tick_params(axis='y', which='both', left=False, labelleft=False)
# Adjust the size of the axis/category labels
ax[0].yaxis.set_tick_params(labelsize=10)
# Draw the wavy break marks
d1 = 0.04
d2 = 0.025 # wave amplitude
wn = 59 # number of wave points; must be odd
pp = (0,d2,0,-d2)
px = np.linspace(-d1,1+d1,wn)
py = np.array([1+pp[i%4] for i in range(0,wn)])
p = Path(list(zip(py,px)), [Path.MOVETO]+[Path.CURVE3]*(wn-1))
line1 = mpatches.PathPatch(p, lw=4, edgecolor='black',
facecolor='None', clip_on=False,
transform=ax[0].transAxes, zorder=10)
line2 = mpatches.PathPatch(p,lw=3, edgecolor='white',
facecolor='None', clip_on=False,
transform=ax[0].transAxes, zorder=10,
capstyle='round')
a = ax[1].add_patch(line1)
a = ax[1].add_patch(line2)
d1 = 0.04
d2 = 0.06
wn = 59
pp = (0,d2,0,-d2)
px = np.linspace(-d1,1+d1,wn)
py = np.array([1+pp[i%4] for i in range(0,wn)])
p = Path(list(zip(py,px)), [Path.MOVETO]+[Path.CURVE3]*(wn-1))
line1 = mpatches.PathPatch(p, lw=4, edgecolor='black',
facecolor='None', clip_on=False,
transform=ax[1].transAxes, zorder=10)
line2 = mpatches.PathPatch(p,lw=3, edgecolor='white',
facecolor='None', clip_on=False,
transform=ax[1].transAxes, zorder=10,
capstyle='round')
a = ax[2].add_patch(line1)
a = ax[2].add_patch(line2)
# Keep one legend and place it outside the axes
ax[0].get_legend().remove()
ax[1].get_legend().remove()
ax[2].legend(bbox_to_anchor=(1.05, 1.0), loc='upper left')
# Export the figure
plt.tight_layout()
plt.savefig('/media/volume1/amy/TPMI/TPMI_29W_ClinVar.png',dpi=300,bbox_inches="tight")
plt.savefig('/media/volume1/amy/TPMI/TPMI_29W_ClinVar.pdf',dpi=300,bbox_inches="tight")
plt.show()
```

### Pointplot
<https://stackoverflow.com/questions/45849028/seaborn-facetgrid-pointplot-label-data-points>
<https://stackoverflow.com/questions/61234555/how-to-remove-repeated-axes-labels-in-a-seaborn-facetgrid>
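No snippet was saved for this one; a minimal seaborn pointplot sketch on a built-in dataset (the two links above cover labeling points and de-duplicating FacetGrid axis labels):
```python=
import seaborn as sns
import matplotlib.pyplot as plt

tips = sns.load_dataset('tips')
ax = sns.pointplot(data=tips, x='day', y='total_bill', hue='sex', dodge=True)
ax.set_ylabel('Mean total bill')
plt.show()
```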
### Scatter plot
```python=
from sklearn.datasets import make_blobs
import colorcet as cc
blobs, labels = make_blobs(n_samples=1000, centers=2, center_box=(-100, 100))
palette = sns.color_palette(cc.glasbey_bw, n_colors=18)
fig, ax = plt.subplots(figsize=(20, 6))
sns.scatterplot(
data=p_lowest_metadata.sort_values('prevalence rate (per 100,000)', ascending=False),
x='logp.value_U',
y='prevalence rate (per 100,000)',
hue='category',
# style='sex',
palette=palette,
edgecolor=None,
alpha=0.8,
s=60,
# cmap="viridis"
)
# plt.title('Exploring Physical Attributes of Different Penguins')
# plt.xlabel('Bill Length (mm)')
# plt.ylabel('Bill Depth (mm)')
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0)
plt.tight_layout()
# save before plt.show(); once the figure is shown and closed, savefig writes a blank image
plt.savefig(
    f'Output/Disease_precalence_logp_all_huebyDisease.png',
    bbox_inches="tight", dpi=300)
plt.show()
```

```python=
from sklearn.datasets import make_blobs
import colorcet as cc
# blobs, labels = make_blobs(n_samples=1000, centers=2, center_box=(-100, 100))
# palette = sns.color_palette(cc.glasbey, n_colors=17)
g = sns.FacetGrid(
p_lowest_metadata,
col="category",
hue="category",
palette=palette,
height=2, aspect=1.5,
col_wrap = 6)
g = g.map(plt.scatter, "logp.value_U", "prevalence rate (per 100,000)", edgecolor=None, alpha=0.8, s=60)
grouped = p_lowest_metadata.groupby(["category"], sort=False)
for ax, (name, df) in zip(g.axes.flat, grouped):
SNP_raw_count = np.size(df['phenotype_codebook'])
sig_SNP_raw_count = np.size(df[df['logp.value_U']>-np.log10(0.0000025)]['phenotype_codebook'])
print(df['category'].unique(),SNP_raw_count,sig_SNP_raw_count)
ax.text(.7, .9, "n = "+'{:.0f} / {:.0f}'.format(sig_SNP_raw_count,SNP_raw_count),transform=ax.transAxes,size=10)
# ax.text(.8, .7, " n = "+'{:.0f}'.format(sig_SNP_raw_count),transform=ax.transAxes,size=8)
g.set_titles("{col_name}",size = 10,)
g.set_axis_labels("$-log_{10}{(P)}$", "Prevalence rate\n(per 100,000 people)")
plt.savefig(
f'Output/Disease_precalence_logp_all_byDisease.png',
bbox_inches="tight", dpi=300)
```

```python=
fig, ax = plt.subplots(figsize=(7,4))
PRS_name=plot_file.split("/")[8]
ax = sns.scatterplot(
x="percentile_100",
y="IID",
data=PGS_score_count[PGS_score_count['Pheno']=='Case'],
s=60,
alpha=0.9,
edgecolor=None,
hue='percentile',
palette=palette
)
ax.spines[['right', 'top']].set_visible(False)
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0)
ax.set_xlabel("Percentile of polygenic score",fontsize=12)
ax.set_ylabel("Prevalence of disease (%)",fontsize=12)
plt.savefig(
f'Output/Scatterplot_of_cases_{PRS_name}.png',
bbox_inches="tight", dpi=300)
plt.show()
```

```python=
import matplotlib.pyplot as plt
import numpy as np; np.random.seed(0)
import matplotlib.transforms as transforms
T_PGS_sum_df=PGS_sum_df[(PGS_sum_df['AUROC'].notnull())&(PGS_sum_df['contain_covar']==True)]
T_PGS_sum_df['Broad Ancestry Category'] = pd.Categorical(T_PGS_sum_df['Broad Ancestry Category'], ['European', 'European, African unspecified', 'European, Not reported', 'East Asian', 'South Asian', 'South East Asian', 'Asian unspecified', 'African American or Afro-Caribbean', 'African unspecified', 'Sub-Saharan African', 'Greater Middle Eastern', 'Native American', 'Hispanic or Latin American', 'Oceanian', 'Other admixed ancestry', 'Multi-ancestry', 'Not reported'])
F_PGS_sum_df=PGS_sum_df[(PGS_sum_df['AUROC'].notnull())&(PGS_sum_df['contain_covar']==False)]
F_PGS_sum_df['Broad Ancestry Category'] = pd.Categorical(F_PGS_sum_df['Broad Ancestry Category'], ['European', 'European, African unspecified', 'European, Not reported', 'East Asian', 'South Asian', 'South East Asian', 'Asian unspecified', 'African American or Afro-Caribbean','African unspecified', 'Sub-Saharan African', 'Greater Middle Eastern', 'Native American', 'Hispanic or Latin American', 'Oceanian', 'Other admixed ancestry', 'Multi-ancestry', 'Not reported'])
# year = T_PGS_sum_df['Broad Ancestry Category']
# values = T_PGS_sum_df['AUROC']
fig, ax = plt.subplots(figsize=(10,6))
offset = lambda p: transforms.ScaledTranslation(p/72.,0, plt.gcf().dpi_scale_trans)
trans = plt.gca().transData
# blue circles = with covariates (shifted 3 pt left); red = without covariates (3 pt right)
sc1 = plt.scatter(T_PGS_sum_df['Broad Ancestry Category'], T_PGS_sum_df['AUROC'], facecolors='none', edgecolors='blue', s=25, transform=trans+offset(-3), alpha=0.8)
plt.scatter(T_PGS_sum_df['Broad Ancestry Category'], T_PGS_sum_df['AUROC'], facecolors='none', edgecolors='none', s=25, alpha=1)   # invisible points at the unshifted positions (likely kept to anchor autoscaling)
plt.scatter(F_PGS_sum_df['Broad Ancestry Category'], F_PGS_sum_df['AUROC'], facecolors='none', edgecolors='red', s=25, alpha=0.8, transform=trans+offset(3))
plt.xticks(rotation=90, ha='center')
ax.xaxis.grid(ls='--', c='gray',linewidth=1,alpha=0.3)
plt.show()
```

### lmplot
```javascript=
from scipy import stats
# fig, ax = plt.subplots(figsize=(20, 6))
g = sns.lmplot(
data=p_lowest_metadata,
x='No.SNP_PGScatalog',
y='No.SNP_CMUH_caculate',
    fit_reg=True,
    lowess=True,      # LOWESS smoother; note that ci is ignored when lowess=True
    height=5, aspect=1,
    line_kws={'color':'b', "lw":2},
    scatter_kws={"color": "black", "alpha":0.3, "s":60},
# logx=True,
# color="g"
)
g.set_axis_labels("Number of SNPs list in PGS catalog", "Number of SNPs used for calculation",fontsize=12)
slope = stats.linregress(p_lowest_metadata['No.SNP_PGScatalog'], p_lowest_metadata['No.SNP_CMUH_caculate'])[0]   # stats.stats is deprecated
r, p = stats.pearsonr(p_lowest_metadata['No.SNP_PGScatalog'], p_lowest_metadata['No.SNP_CMUH_caculate'])
print('p =', p)
# plt.text(0, 4.5e6, 'slope = {:.2f}'.format(slope), fontsize=10)
plt.text(0, 4.0e6, 'r = {:.2f}'.format(r), fontsize=10)
plt.text(0, 3.5e6, 'p = {:.2e}'.format(p), fontsize=10)   # scientific notation; .2f would round a small p to 0.00
for ax in g.axes.flat:
    labels = ax.get_xticks()   # current x tick positions
    new_labels = ['{:,.0f}'.format(label) + 'M' for label in labels/1000000]
    ax.set_xticks(labels)      # fix the locator before relabeling (avoids a FixedFormatter warning)
    ax.set_xticklabels(new_labels, rotation=0)
    labels = ax.get_yticks()   # current y tick positions
    new_labels = ['{:,.0f}'.format(label) + 'M' for label in labels/1000000]
    ax.set_yticks(labels)
    ax.set_yticklabels(new_labels, rotation=0)
plt.savefig(
f'Output/p_lowest_metadata_SNP_count.png',
bbox_inches="tight", dpi=300)
```

### Jointplot
```javascript=
from scipy import stats
from sklearn.preprocessing import StandardScaler
scale = StandardScaler()   # z-score scaler object
sub_set = p_lowest_metadata[['logp.value_U','prevalence rate (per 100,000)']]
sub_set = sub_set[~sub_set.isin([np.nan, np.inf, -np.inf]).any(axis=1)]
train_set_scaled = pd.DataFrame(scale.fit_transform(sub_set),
                                columns=sub_set.keys())   # z-scored copy (not used by the plot below)
j = sns.jointplot(
    x=sub_set['logp.value_U'], y=sub_set['prevalence rate (per 100,000)'],   # keyword args; positional x/y were removed in seaborn 0.12
kind="reg",
marker='x',
color='b',
scatter_kws={'alpha':0.5},
marginal_kws=dict(bins=20,kde=True,color='b'),
)
r, p = stats.pearsonr(sub_set['logp.value_U'],sub_set['prevalence rate (per 100,000)'])
slope = stats.linregress(sub_set['logp.value_U'], sub_set['prevalence rate (per 100,000)'])[0]   # stats.stats is deprecated
# j.ax_joint.annotate('slope = {:.2f}'.format(slope), xy=(.83, .85), xycoords='axes fraction', fontsize=10)
j.ax_joint.annotate('r = {:.2f} '.format(r), xy=(.73, .85), xycoords='axes fraction', fontsize=10)
print('p =', p)
# the P annotations are hardcoded: the observed p printed above and the 2.5e-6 significance threshold
j.ax_joint.annotate(r'$P = 6.44 \times 10^{%d}$' % (-3), xy=(.73, .78), xycoords='axes fraction', fontsize=10)
j.ax_joint.annotate(r'$P = 2.5 \times 10^{%d}$' % (-6), xy=(.10, .8), xycoords='axes fraction', fontsize=10, color='crimson')
for ax in (j.ax_joint, j.ax_marg_x):
ax.axvline(-np.log10(0.0000025), color='crimson', ls='--', lw=1.5)
j.set_axis_labels("\n$-log_{10}{(P)}$", "Prevalence rate\n(per 100,000 people)\n",fontsize=12)
j.fig.set_size_inches(5,5)
plt.savefig(
f'Output/Disease_precalence_logp_all.png',
bbox_inches="tight", dpi=300)
plt.show()
```

```javascript=
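# dodge overlapping points: shift x by +0.2 when covariates are included, -0.2 otherwise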
PGS_sum_df['Category_num2']=np.where(PGS_sum_df['contain_covar']==True, PGS_sum_df['Category_num']+0.2, PGS_sum_df['Category_num']-0.2)
g=sns.jointplot(data=PGS_sum_df[PGS_sum_df['AUROC'].notnull()], x="Category_num2", y="AUROC",
hue="contain_covar",palette={True:'cornflowerblue',False:'darkorange'},marker='o', s = 50,alpha=0.4)
g.fig.set_size_inches((10, 4))
g.ax_marg_x.set_xlim(0, 18)
g.ax_joint.set_xticks([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18])
# mapping kept for reference only; the literal label list below is what is actually applied
value_label_map = {'1':'European',
'2':'European, African unspecified',
'3':'European, Not reported',
'4':'East Asian',
'5':'South Asian',
'6':'South East Asian',
'7':'Asian unspecified',
'8':'African American or Afro-Caribbean',
'9':'African unspecified',
'10':'Sub-Saharan African',
'11':'Greater Middle Eastern',
'12':'Native American',
'13':'Hispanic or Latin American',
'14':'Oceanian',
'15':'Other admixed ancestry',
'16':'Multi-ancestry',
'17':'Not reported'}
ticks = g.ax_joint.get_xticks()
print(ticks)
new_tick_labels = ['','European', 'European, African unspecified', 'European, Not reported', 'East Asian', 'South Asian', 'South East Asian', 'Asian unspecified', 'African American or Afro-Caribbean', 'African unspecified', 'Sub-Saharan African', 'Greater Middle Eastern', 'Native American', 'Hispanic or Latin American', 'Oceanian', 'Other admixed ancestry', 'Multi-ancestry', 'Not reported','']
print(new_tick_labels)
g.ax_joint.set_xticklabels(new_tick_labels,rotation=90)
g.ax_joint.legend(labels =['Covariates included','Without covariates'],bbox_to_anchor=(1,0.2), borderaxespad=0)
g.set_axis_labels("\nBroad Ancestry Category", "Area Under Curve (AUC)",fontsize=12)
plt.savefig(
f'Output/Scatterplot_of_PGS_catalog.png',
bbox_inches="tight", dpi=300)
```

### Heatmap
```javascript=
# AUC_category = p_lowest_metadata[p_lowest_metadata['logp.value_U']>-np.log10(0.0000025)]
AUC_category = p_lowest_metadata.copy()
bins = [0, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
# print(pd.cut(AUC_category['AUC'], bins))
AUC_category['categories_AUC'] = pd.cut(AUC_category['AUC'], bins)   # bin AUC values into intervals
AUC_category_count = AUC_category.pivot_table(
    index='categories_AUC',
    columns='category',
    values='phenotype_codebook',
    aggfunc=lambda x: len(x.unique()),   # distinct phenotypes per cell
    fill_value=0
)
# AUC_category_count = AUC_category_count[['endocrine/metabolic','circulatory system','musculoskeletal','neoplasms','respiratory','dermatologic','genitourinary','digestive','mental disorders','injuries & poisonings','sense organs','infectious diseases','neurological','symptoms','others','hematopoietic','pregnancy complications','congenital anomalies']]
ax = sns.heatmap(AUC_category_count, cmap='plasma', annot=True, fmt='d', square=False)
# ax.set(xlabel="category", ylabel="Area Under Curve (AUC)")
ax.set_xlabel("Disease category", fontsize=12)
ax.set_ylabel("Area Under Curve (AUC)\n", fontsize=12)
# plt.savefig(
# f'Output/Disease_precalence_logp_all.png',
# bbox_inches="tight", dpi=300)
plt.show()
```

### Pie chart
```javascript=
fig, ax = plt.subplots(figsize=(10, 4), subplot_kw=dict(aspect="equal"))
x = count['patientID']   # uncommented so the block runs; x holds the slice values
def func(s, d):
    t = int(round(s/100.*sum(d)))   # back-calculate the raw count from the percentage
    return f'{s:.1f}%\n( {t} )'     # formatted label: percentage plus count
wedges, texts, autotexts = ax.pie(x,
    radius=1.5,
    textprops={'color':'black', 'weight':'bold', 'size':12},   # label text style
    # explode=(0, 0, 0, 0, 0),
    pctdistance=0.7,
    autopct=lambda i: func(i, x),
    # labels=count['TOAST'],
    # wedgeprops={'linewidth':3,'edgecolor':'white'},   # outline around each wedge
)
ax.legend(wedges, count['TOAST'],
title="TOAST",
title_fontsize = 14,
fontsize="12",
loc="center left",
# bbox_to_anchor=(1.05, 1.0)
bbox_to_anchor=(1.2, 0, 0.5, 1)
)
plt.show()
```

### RFECV
```javascript=
RFECV = pd.read_csv('LAA_project/g_scores_LR.txt',sep='\t')
RFECV_arr = RFECV.to_numpy()
RFECV = RFECV.reset_index()
RFECV.columns = ['Number of features','cross1','cross2','cross3','cross4','cross5','cross6','cross7','cross8','cross9','cross10']
RFECV['Number of features'] = RFECV['Number of features'] +1
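# reshape wide (one column per CV fold) to long format for seaborn's lineplot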
RFECV_m1 = pd.melt(
RFECV, id_vars = ['Number of features'],
value_vars = ['cross1','cross2','cross3','cross4','cross5','cross6','cross7','cross8','cross9','cross10'],
var_name = 'cross', value_name ='AUC')
RFECV_avg = np.average(RFECV_arr, axis=1)
xmax = np.arange(0, len(RFECV_avg))[np.argmax(RFECV_avg)]
ymax = RFECV_avg.max()
fig, ax = plt.subplots(figsize=(5,3))
ax.title.set_text('RFECV for Logistic Regression')
g=sns.lineplot(
data=RFECV_m1, x="Number of features", y="AUC",
markers=True, dashes=False)
g.axvline(x=xmax, color='r', label='test lines',linestyle='--')
text= "AUC = {:.3f} \n Num. of features = {:.0f} ".format(ymax,xmax+1)
kw = dict(xycoords='data',textcoords="axes fraction",
ha="right", va="center")
g.annotate(text, xy=(xmax, ymax), xytext=(0.95, 0.1), **kw,fontsize=10)
plt.yticks(np.arange(0.5, 0.95, step=0.05))   # set ticks before saving so the file matches the display
plt.savefig(
    'LAA_project/figure_RFECV1.png',
    bbox_inches="tight", dpi=300)
plt.show()
```

### Manhattan plot
* Automatically annotate the top 15 genes
```javascript=
path=f'/media/volume1/amy/TPMI/2_29W_GWAS_result/new_script_result/'
imputedLOG=gb.glob(path+f'*_{PROJ}_imputed/Marge_{PROJ}_TPMI_imputed_adjGWAS.*.logistic')
imputedANNOT=gb.glob(path+f'*_{PROJ}_imputed/Marge_{PROJ}_TPMI_imputed_adjGWAS.*.annot')
rawLOG=gb.glob(path+f'*_{PROJ}_raw/Marge_{PROJ}_TPMI_raw_adjGWAS.*.logistic')
rawANNOT=gb.glob(path+f'*_{PROJ}_raw/Marge_{PROJ}_TPMI_raw_adjGWAS.*.annot')
statistic_df = pd.read_csv(imputedLOG[0],sep='\t')
statistic_df = statistic_df.sort_values('#CHROM', ascending=True).reset_index(drop=True)
# display(statistic_df.head(5))
annotate_df = pd.read_csv(imputedANNOT[0],sep='\t')
annotate_df['ANNOT'] = annotate_df['P ANNOT'].str.split(' ',expand=True)[1]
# display(annotate_df.head(5))
df1 = pd.merge(statistic_df,annotate_df[['SNP','ANNOT']].drop_duplicates(),left_on='ID',right_on='SNP',how='left')
display(df1.head(5))
statistic_df = pd.read_csv(rawLOG[0],sep='\t')
statistic_df = statistic_df.sort_values('#CHROM', ascending=True).reset_index(drop=True)
# display(statistic_df.head(5))
annotate_df = pd.read_csv(rawANNOT[0],sep='\t')
annotate_df['ANNOT'] = annotate_df['P ANNOT'].str.split(' ',expand=True)[1]
# display(annotate_df.head(5))
df2 = pd.merge(statistic_df,annotate_df[['SNP','ANNOT']].drop_duplicates(),left_on='ID',right_on='SNP',how='left')
display(df2.head(5))
df1['-logp'] = -np.log10(df1['P'])
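# genome-wide x coordinate: offset each chromosome's POS by the running sum of
# the preceding chromosomes' maximum positions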
running_pos = 0
cumulative_pos = []
for chrom, group_df in df1.groupby('#CHROM'):
cumulative_pos.append(group_df['POS']+running_pos)
# print(chrom,group_df['POS']+running_pos)
running_pos+=group_df['POS'].max()
df1['cumulative_pos'] = pd.concat(cumulative_pos)
df1['SNP number'] = df1.index
df2['-logp'] = -np.log10(df2['P'])
running_pos = 0
cumulative_pos = []
for chrom, group_df in df2.groupby('#CHROM'):
cumulative_pos.append(group_df['POS']+running_pos)
# print(chrom,group_df['POS']+running_pos)
running_pos+=group_df['POS'].max()
df2['cumulative_pos'] = pd.concat(cumulative_pos)
df2['SNP number'] = df2.index
df2['-logp']=-1*df2['-logp']
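# the raw-data track is negated so it hangs below zero, giving a mirrored
# (back-to-back) Manhattan plot against the imputed track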
fig, ax = plt.subplots(figsize=(12, 6))
sns.scatterplot(
data = df1,
x = 'cumulative_pos',
y = '-logp',
# aspect = 2,
hue = '#CHROM',
palette = ["#F0997D","#D3756B"]*11,
# palette = 'Set1',
# color="steelblue",
linewidth=0,
s=15,
legend=None,
ax = ax
)
sns.scatterplot(
data = df2,
x = 'cumulative_pos',
y = '-logp',
# aspect = 2,
hue = '#CHROM',
palette = ["#205295", "#5DA7DB"]*11,
# palette = 'Set1',
# palette=["red"],
linewidth=0,
s=15,
legend=None,
ax = ax
)
ax.set_xlabel('Chromosome')
ax.set_ylabel('$-log_{10}{(P)}$')
ax.set_xticks(df1.groupby('#CHROM')['cumulative_pos'].median())
ax.set_xticklabels(df1['#CHROM'].unique())
plt.tight_layout()
# g.fig.suptitle('GWAS plot showing association between SNPs on autosomes and speeding\n')
df3 = df1.sort_values(by='-logp', ascending=False)
annotations1 = df3[df3['ANNOT'].notnull()].drop_duplicates(subset=['ANNOT'], keep='first')
annotations1 = annotations1.head(15)
annotations1 = annotations1.apply(lambda p : ax.annotate(p['ANNOT'], (p['cumulative_pos'], p['-logp'])), axis=1).to_list()
df4 = df2.sort_values(by='-logp', ascending=True)
annotations2 = df4[df4['ANNOT'].notnull()].drop_duplicates(subset=['ANNOT'], keep='first')
annotations2 = annotations2.head(15)
annotations2 = annotations2.apply(lambda p : ax.annotate(p['ANNOT'], (p['cumulative_pos'], p['-logp'])), axis=1).to_list()
ax.axhline(0, linestyle='-', linewidth=1.5, color='black')                 # baseline between the two tracks
ax.axhline(-np.log10(0.00000005), linestyle='-', linewidth=1, color='r')   # genome-wide significance, P = 5e-8
ax.axhline(np.log10(0.00000005), linestyle='-', linewidth=1, color='r')    # mirrored threshold for the lower track
ax.axhline(5, linestyle='--', linewidth=1, color='b')                      # suggestive threshold, P = 1e-5
ax.axhline(-5, linestyle='--', linewidth=1, color='b')
df1_name='imputed data'
df2_name='raw data'
props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)
ax.text(.03, .92, "{}".format(df1_name),transform=ax.transAxes,size=12,bbox=props)
ax.text(.03, .05, "{}".format(df2_name),transform=ax.transAxes,size=12,bbox=props)
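# adjust_text (presumably from the adjustText package: from adjustText import adjust_text)
# iteratively nudges the gene labels to reduce overlap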
adjust_text(annotations1, arrowprops={'arrowstyle': '-', 'color': 'green'})   # one call per label set; calling twice re-adjusts already-placed labels
adjust_text(annotations2, arrowprops={'arrowstyle': '-', 'color': 'green'})
ticks = ax.get_yticks()
plt.yticks(np.arange(min(ticks), max(ticks)+3, 3.0))
ticks = ax.get_yticks()
ax.set_yticklabels([int(abs(tick)) for tick in ticks])
# plt.savefig(
# f'/media/volume1/amy/TPMI/2_29W_GWAS_result/{TIME}_Subgroups_{PROJ}_pyManhatton.png',
# bbox_inches="tight", dpi=100)
# plt.savefig(
# f'/media/volume1/amy/TPMI/2_29W_GWAS_result/{TIME}_Subgroups_{PROJ}_pyManhatton.pdf',
# bbox_inches="tight", dpi=100)
plt.show()
```

* Manually set the significance threshold for the figure annotations
```javascript=
# imputed and raw
statistic_df = pd.read_csv('/media/volume1/bioinfo/Terry-Packages/Result/20221216_PD_match/GWAS/20221216_PD_match.Pheno.glm.logistic.hybrid',sep='\t')
display(statistic_df.head(5))
annotate_df = pd.read_csv('/media/volume1/bioinfo/Terry-Packages/Result/20221216_PD_match/GWAS/20221216_PD_match.txt.annot',sep='\t')
annotate_df['ANNOT'] = annotate_df['ERRCODE ANNOT'].str.split('.',expand=True)[1]
display(annotate_df.head(5))
df1 = pd.merge(statistic_df,annotate_df[['SNP','ANNOT']].drop_duplicates(),left_on='ID',right_on='SNP',how='left')
display(df1.head(5))
statistic_df = pd.read_csv('/media/volume1/bioinfo/Terry-Packages/Result/20221220_PD_match_raw/GWAS/20221220_PD_match_raw.Pheno.glm.logistic.hybrid',sep='\t')
display(statistic_df.head(5))
annotate_df = pd.read_csv('/media/volume1/bioinfo/Terry-Packages/Result/20221220_PD_match_raw/GWAS/20221220_PD_match_raw.txt.annot',sep='\t')
annotate_df['ANNOT'] = annotate_df['ERRCODE ANNOT'].str.split('.',expand=True)[1]
display(annotate_df.head(5))
df2 = pd.merge(statistic_df,annotate_df[['SNP','ANNOT']].drop_duplicates(),left_on='ID',right_on='SNP',how='left')
display(df2.head(5))
df1['-logp'] = -np.log10(df1['P'])
running_pos = 0
cumulative_pos = []
for chrom, group_df in df1.groupby('#CHROM'):
cumulative_pos.append(group_df['POS']+running_pos)
# print(chrom,group_df['POS']+running_pos)
running_pos+=group_df['POS'].max()
df1['cumulative_pos'] = pd.concat(cumulative_pos)
df1['SNP number'] = df1.index
df2['-logp'] = -np.log10(df2['P'])
running_pos = 0
cumulative_pos = []
for chrom, group_df in df2.groupby('#CHROM'):
cumulative_pos.append(group_df['POS']+running_pos)
# print(chrom,group_df['POS']+running_pos)
running_pos+=group_df['POS'].max()
df2['cumulative_pos'] = pd.concat(cumulative_pos)
df2['SNP number'] = df2.index
df2['-logp']=-1*df2['-logp']
sig_set_1=5.5
sig_set_2=4.5
print('imputed')
print(df1[(df1['-logp'] > sig_set_1)&(df1['ANNOT'].notnull())].drop_duplicates(subset=['ANNOT'], keep='first').ANNOT.nunique())
print(df1[(df1['-logp'] > sig_set_1)&(df1['ANNOT'].notnull())].drop_duplicates(subset=['ANNOT'], keep='first').ANNOT.unique())
print("{:-^50s}".format("Split Line"))
print('unimputed')
print(df2[(df2['-logp'] < sig_set_2*-1)&(df2['ANNOT'].notnull())].drop_duplicates(subset=['ANNOT'], keep='first').ANNOT.nunique())
print(df2[(df2['-logp'] < sig_set_2*-1)&(df2['ANNOT'].notnull())].drop_duplicates(subset=['ANNOT'], keep='first').ANNOT.unique())
fig, ax = plt.subplots(figsize=(12, 6))
sns.scatterplot(
data = df1,
x = 'cumulative_pos',
y = '-logp',
# aspect = 2,
hue = '#CHROM',
palette = ["peachpuff", "sandybrown"]*11,
# palette = ["#F0997D","#D3756B"]*11,
# palette = 'Set1',
# color="steelblue",
linewidth=0,
s=10,
legend=None,
ax = ax
)
sns.scatterplot(
data = df2,
x = 'cumulative_pos',
y = '-logp',
# aspect = 2,
hue = '#CHROM',
palette = ["steelblue", "lightsteelblue"]*11,
# palette = ["#205295", "#5DA7DB"]*11,
# palette = 'Set1',
# palette=["red"],
linewidth=0,
s=10,
legend=None,
ax = ax
)
ax.set_xlabel('Chromosome')
ax.set_ylabel('$-log_{10}{(P)}$')
ax.set_xticks(df1.groupby('#CHROM')['cumulative_pos'].median())
ax.set_xticklabels(df1['#CHROM'].unique())
plt.tight_layout()
# g.fig.suptitle('GWAS plot showing association between SNPs on autosomes and speeding\n')
df3 = df1.sort_values(by='-logp', ascending=False)
annotations1 = df3[(df3['-logp'] > sig_set_1)&(df3['ANNOT'].notnull())].drop_duplicates(subset=['ANNOT'], keep='first')
annotations1 = annotations1.apply(lambda p : ax.annotate(p['ANNOT'], (p['cumulative_pos'], p['-logp'])), axis=1).to_list()
df4 = df2.sort_values(by='-logp', ascending=False)
annotations2 = df4[(df4['-logp'] < sig_set_2*-1)&(df4['ANNOT'].notnull())].drop_duplicates(subset=['ANNOT'], keep='first')
annotations2 = annotations2.apply(lambda p : ax.annotate(p['ANNOT'], (p['cumulative_pos'], p['-logp'])), axis=1).to_list()
ax.axhline(0, linestyle='-', linewidth=1.5, color='black')
ax.axhline(-np.log10(0.00000005), linestyle='-', linewidth=1, color='r')
ax.axhline(np.log10(0.00000005), linestyle='-', linewidth=1, color='r')
ax.axhline(5, linestyle='--', linewidth=1, color='b')
ax.axhline(-5, linestyle='--', linewidth=1, color='b')
adjust_text(annotations1)
# adjust_text(annotations1, arrowprops = {'arrowstyle' : '-', 'color' : 'blue'})
adjust_text(annotations2)
# adjust_text(annotations2, arrowprops = {'arrowstyle' : '-', 'color' : 'blue'})
ticks = ax.get_yticks()
plt.yticks(np.arange(min(ticks), max(ticks)+1, 3.0))
ticks = ax.get_yticks()
ax.set_yticklabels([int(abs(tick)) for tick in ticks])
plt.savefig(
f'/media/volume1/amy/TPMI/2_29W_GWAS_result/Subgroups_{PROJ}_pyManhatton.png',
bbox_inches="tight", dpi=100)
plt.savefig(
f'/media/volume1/amy/TPMI/2_29W_GWAS_result/Subgroups_{PROJ}_pyManhatton.pdf',
bbox_inches="tight", dpi=100)
plt.show()
```

```javascript=
# Single
statistic_df = pd.read_csv('/media/volume1/bioinfo/Terry-Packages/Result/20221215_terry_PD_GWAS/GWAS/20221215_PD.Pheno.glm.logistic.hybrid',sep='\t')
display(statistic_df.head(5))
annotate_df = pd.read_csv('/media/volume1/bioinfo/Terry-Packages/Result/20221215_terry_PD_GWAS/GWAS/20221215_PD.txt.annot',sep='\t')
annotate_df['ANNOT'] = annotate_df['ERRCODE ANNOT'].str.split('.',expand=True)[1]
display(annotate_df.head(5))
df = pd.merge(statistic_df,annotate_df[['SNP','ANNOT']].drop_duplicates(),left_on='ID',right_on='SNP',how='left')
display(df.head(5))
df['-logp'] = -np.log10(df['P'])
running_pos = 0
cumulative_pos = []
for chrom, group_df in df.groupby('#CHROM'):
cumulative_pos.append(group_df['POS']+running_pos)
# print(chrom,group_df['POS']+running_pos)
running_pos+=group_df['POS'].max()
df['cumulative_pos'] = pd.concat(cumulative_pos)
df['SNP number'] = df.index
df
sig_set=6.5
print(df[(df['-logp'] > sig_set)&(df['ANNOT'].notnull())].drop_duplicates(subset=['ANNOT'], keep='first').ANNOT.nunique())
print(df[(df['-logp'] > sig_set)&(df['ANNOT'].notnull())].drop_duplicates(subset=['ANNOT'], keep='first').ANNOT.unique())
g = sns.relplot(
data = df,
x = 'cumulative_pos',
y = '-logp',
aspect = 2,
hue = '#CHROM',
palette = ["steelblue", "lightsteelblue"] * 11,
# palette = 'Set1',
linewidth=0,
s=10,
legend=None
)
g.ax.set_xlabel('Chromosome')
g.ax.set_ylabel('$-log_{10}{(P)}$')
g.ax.set_xticks(df.groupby('#CHROM')['cumulative_pos'].median())
g.ax.set_xticklabels(df['#CHROM'].unique())
# g.fig.suptitle('GWAS plot showing association between SNPs on autosomes and speeding\n')
df2 = df.sort_values(by='-logp', ascending=False)
annotations = df2[(df2['-logp'] > sig_set)&(df2['ANNOT'].notnull())].drop_duplicates(subset=['ANNOT'], keep='first')   # mask built from df2 itself, not df
# annotations = df2[df2['-logp'] > -np.log10(0.00000005)].drop_duplicates(subset=['ANNOT'], keep='first')
annotations = annotations.apply(lambda p : g.ax.annotate(p['ANNOT'], (p['cumulative_pos'], p['-logp'])), axis=1).to_list()
for ax in g.axes.flat:
ax.axhline(-np.log10(0.00000005), linestyle='-', linewidth=1, color='r')
ax.axhline(5, linestyle='--', linewidth=1, color='b')
# adjust_text(annotations)
adjust_text(annotations, arrowprops = {'arrowstyle' : '-', 'color' : 'blue'})
plt.savefig(
f'/media/volume1/amy/TPMI/2_29W_GWAS_result/{PROJ}_pyManhatton.png',
bbox_inches="tight", dpi=300)
plt.savefig(
f'/media/volume1/amy/TPMI/2_29W_GWAS_result/{PROJ}_pyManhatton.pdf',
bbox_inches="tight", dpi=300)
plt.show()
```

### QQ plot
```javascript=
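# assumption: qqplot here matches the qmplot package's API (from qmplot import qqplot)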
kwargs = {
"marker": "o",
"s":20,
}
if __name__ == "__main__":
f, ax = plt.subplots(figsize=(4, 4), facecolor="w", edgecolor="k")
ax = qqplot(data=df1[df1['P'].notnull()]['P'],
xlabel=r"Expected $-log_{10}{(P)}$",
ylabel=r"Observed $-log_{10}{(P)}$",
alpha=0.5,
color="blue",
# title="Test",
ax=ax,
**kwargs
)
plt.savefig(
f'/media/volume1/amy/TPMI/2_29W_GWAS_result/Subgroups_{PROJ}_imputed_pyQQplot.png',
bbox_inches="tight", dpi=100)
plt.savefig(
f'/media/volume1/amy/TPMI/2_29W_GWAS_result/Subgroups_{PROJ}_imputed_pyQQplot.pdf',
bbox_inches="tight", dpi=100)
plt.show()
kwargs = {
"marker": "o",
"s":20,
}
if __name__ == "__main__":
f, ax = plt.subplots(figsize=(4, 4), facecolor="w", edgecolor="k")
ax = qqplot(data=df2[df2['P'].notnull()]['P'],
xlabel=r"Expected $-log_{10}{(P)}$",
ylabel=r"Observed $-log_{10}{(P)}$",
alpha=0.5,
color="blue",
# title="Test",
ax=ax,
**kwargs
)
plt.savefig(
f'/media/volume1/amy/TPMI/2_29W_GWAS_result/Subgroups_{PROJ}_raw_pyQQplot.png',
bbox_inches="tight", dpi=100)
plt.savefig(
f'/media/volume1/amy/TPMI/2_29W_GWAS_result/Subgroups_{PROJ}_raw_pyQQplot.pdf',
bbox_inches="tight", dpi=100)
plt.show()
```

### PGS plot
```javascript=
def displot_annotated(data_all,col_wrap, save=None):
palette = {"Control": "#30A9DE", "Case":'#ef5285'}
grid = sns.FacetGrid(
data_all, col='PGS_catalog',col_wrap=col_wrap, hue='Pheno',
height=2, aspect=2,sharex=False, sharey=False,palette=palette)
    grid.map(sns.kdeplot, 'SCORE', fill=True)   # shade= is deprecated in newer seaborn; fill= is the replacement
# grid.map(plt.axvline,x=0,ls='--', c='gray')
grouped = data_all.groupby(["PGS_catalog"])
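    # draws a vertical line at each hue group's mean score (mapped over the facets below)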
def vertical_mean_line_survived(x, **kwargs):
ls = {"Control":"-","Case":"-"}
plt.axvline(x.mean(), linestyle =ls[kwargs.get("label","0")],
color = kwargs.get("color", "g"))
# if kwargs.get("label","0") == "Control":
# txkw = dict(size=12, color = kwargs.get("color", "g"), rotation=90)
# tx = "$\mu$ = {:.2f}, $\sigma$ = {:.2f}".format(x.mean(),x.std())
# plt.text(x.mean()-0.65, 0.052, tx, **txkw,fontsize=8)
# else:
# txkw = dict(size=12, color = kwargs.get("color", "g"), rotation=90)
# tx = "$\mu$ = {:.2f}, $\sigma$ = {:.2f}".format(x.mean(),x.std())
# plt.text(x.mean()+0.45, 0.052, tx, **txkw,fontsize=8)
for ax, (name, df) in zip(grid.axes.flat, grouped):
U1, p = stats.mannwhitneyu(
df[(df['Pheno']=='Control')]['SCORE'],
df[(df['Pheno']=='Case')]['SCORE'], method="auto")
        if p < 0.000025:
            ax.text(.7, .75, r"$\it{p}$" + ' = {:.2e}'.format(p), transform=ax.transAxes, size=8, color='red', fontweight='bold')
        elif (0.000025 <= p) & (p < 0.001):
            ax.text(.7, .75, r"$\it{p}$" + ' = {:.2e}'.format(p), transform=ax.transAxes, size=8, color='blue', fontweight='bold')
        else:
            ax.text(.7, .75, r"$\it{p}$" + ' = {:.2e}'.format(p), transform=ax.transAxes, size=8, color='black')   # raw strings avoid invalid-escape warnings for \it
# ax.text(.7, .75, "$\it{p}$"+' = {:.2g}'.format(p),transform=ax.transAxes,size=8)
SNP_raw_count = np.mean(df['No.SNP_Raw'])
SNP_pgs_count = np.mean(df['No.SNP_caculate'])
ax.text(.7, .9, "SNPs = "+'{:.0f} / {:.0f}'.format(SNP_pgs_count, SNP_raw_count),transform=ax.transAxes,size=8)
df2 = df.groupby("Pheno") # group by each thing that has its own color and run stats on it
for i, (group, data) in enumerate(df2):
x = data["SCORE"]
# calculate stats and create label
n = len(x)
mean = np.mean(x)
std = np.std(x)
# label = r"%s: $\mu$=%.2f $\sigma$=%.2f" %(group, mean, std)
label = r"$\mu$ $_{%s}$ = %.2f " %(group.lower(), mean)
# label = r"$\mu$$_{m}$={:.2f}" .format("m":group, mean)
ax.annotate(label, xy=(0.7,0.5-(i*0.15)), xycoords='axes fraction', ha='left', size=8)
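        # Wasserstein (earth mover's) distance between control and case score distributions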
wd = stats.wasserstein_distance(
df[(df['Pheno']=='Control')]['SCORE'],
df[(df['Pheno']=='Case')]['SCORE'])
if wd>0.15:
ax.text(.7, .15, "$D_{w}$"+' = {:.2f}'.format(wd), transform=ax.transAxes, size=8, color='red', fontweight='bold')
elif (wd<=0.15)&(wd>0.1):
ax.text(.7, .15, "$D_{w}$"+' = {:.2f}'.format(wd), transform=ax.transAxes, size=8, color='blue', fontweight='bold')
else:
ax.text(.7, .15, "$D_{w}$"+' = {:.2f}'.format(wd), transform=ax.transAxes, size=8, color='black')
# wd_lable = r"D$\it{w}$ = %.2f " %(group.lower(), wd)
grid.map(vertical_mean_line_survived, 'SCORE')
grid.set_titles("{col_name}",size = 10)
grid.set_axis_labels("", "")
grid.fig.subplots_adjust(hspace =0.7)
grid.add_legend()
    if save is not None:
        plt.savefig(
            f'{save}.png',
            bbox_inches="tight", dpi=300)
        plt.savefig(
            f'{save}.pdf',
            bbox_inches="tight", dpi=300)
```

```javascript=
# stacked horizontal bar chart of SNP counts per gene, split by function
# fig, ax = plt.subplots(figsize=(9,6))
count.plot(kind='barh', stacked=True,figsize=(5, 15))
# labels for x & y axis
plt.xlabel('SNP count')
# plt.xticks(rotation=45,ha='right' )
plt.ylabel('Gene')
# title of plot
plt.title('SNP count in gene and function')
plt.savefig(
f'/media/volume1/amy/TPMI/{PROJ}/SNP_distribution.png',
bbox_inches="tight", dpi=300)
plt.show()
```

```javascript=
PGS_pheno = pd.read_csv('/media/volume1/amy/TEST/230118-AD/EEG-old/EEG.PRS.pheno.txt',sep='\t')
PGS_score = pd.read_csv('/media/volume1/amy/TEST/230118-AD/PRS-Raw-EEG/raw/PRS-Raw-EEG.best',sep=' ')
PGS_summary = pd.read_csv('/media/volume1/amy/TEST/230118-AD/PRS-Raw-EEG/raw/PRS-Raw-EEG.summary',sep='\t')
PGS_score = pd.merge(PGS_pheno, PGS_score, on=['FID','IID'],how='inner').drop_duplicates()
PGS_score['PGS_catalog'] = 'AIC'
PGS_score['Pheno'] = PGS_score['Pheno'].replace({2:"Case",1:"Control"})
display(PGS_score)
display(PGS_summary)
PGS_summary['Num_SNP'][0]   # notebook display of the SNP count used below
fig, ax = plt.subplots(figsize=(9,9))
palette = {"Control": "#30A9DE", "Case":'#ef5285'}
ax = sns.kdeplot(data=PGS_score, x='PRS', fill=True, hue='Pheno', palette=palette)   # fill= replaces the deprecated shade=
plt.axvline(PGS_score[PGS_score['Pheno']=='Case']['PRS'].mean(), linestyle ='-',
color = "#ef5285")
plt.axvline(PGS_score[PGS_score['Pheno']=='Control']['PRS'].mean(), linestyle ='-',
color = "#30A9DE")
# for ax, (name, df) in zip(grid.axes.flat, grouped):
U1, p = stats.mannwhitneyu(
PGS_score[(PGS_score['Pheno']=='Control')]['PRS'],
PGS_score[(PGS_score['Pheno']=='Case')]['PRS'], method="auto")
if p < 0.000025:
    ax.text(.7, .57, r"$\it{p}$" + ' = {:.2e}'.format(p), transform=ax.transAxes, size=12, color='red', fontweight='bold')
elif (0.000025 <= p) & (p < 0.001):
    ax.text(.7, .57, r"$\it{p}$" + ' = {:.2e}'.format(p), transform=ax.transAxes, size=12, color='blue', fontweight='bold')
else:
    ax.text(.7, .57, r"$\it{p}$" + ' = {:.2e}'.format(p), transform=ax.transAxes, size=12, color='black')
# ax.text(.7, .75, "$\it{p}$"+' = {:.2g}'.format(p),transform=ax.transAxes,size=8)
SNP_pgs_count = PGS_summary['Num_SNP'][0]
ax.text(.7, .65, "SNPs = "+'{:.0f}'.format(SNP_pgs_count),transform=ax.transAxes,size=12)
df2 = PGS_score.groupby("Pheno") # group by each thing that has its own color and run stats on it
for i, (group, data) in enumerate(df2):
x = data["PRS"]
# calculate stats and create label
n = len(x)
mean = np.mean(x)
std = np.std(x)
label = r"$\mu$ $_{%s}$ = %.2f " %(group.lower(), mean)
ax.annotate(label, xy=(0.7,0.5-(i*0.05)), xycoords='axes fraction', ha='left', size=12)
wd = stats.wasserstein_distance(
PGS_score[(PGS_score['Pheno']=='Control')]['PRS'],
PGS_score[(PGS_score['Pheno']=='Case')]['PRS'])
if wd>0.15:
ax.text(.7, .35, "$D_{w}$"+' = {:.2f}'.format(wd), transform=ax.transAxes, size=12, color='red', fontweight='bold')
elif (wd<=0.15)&(wd>0.1):
ax.text(.7, .35, "$D_{w}$"+' = {:.2f}'.format(wd), transform=ax.transAxes, size=12, color='blue', fontweight='bold')
else:
ax.text(.7, .35, "$D_{w}$"+' = {:.2f}'.format(wd), transform=ax.transAxes, size=12, color='black')
```

## Additional process
* Date
```javascript!
TIME = pd.Timestamp.now().strftime("%Y%m%d")
ICD9['DATE'] = pd.to_datetime(ICD9['DATE'], format='%Y-%m-%d', errors='coerce')   # vectorized; faster than .apply per element
SocialHum['now'] = pd.Timestamp.now().normalize()   # today at midnight; to_datetime('now', format=...) fails to parse the literal 'now'
```
* Sort
```javascript!
merge_cancer=merge_cancer.sort_values(by=['Age_merged'],ascending=True)
```
* TableOne
```javascript!
cancer_select=merge_cancer.drop_duplicates(subset=['cancer_registry_database','索引號'], keep='first')
columns = ['Entry_age','Diagnosis_age','性別','Age_bin','cancer_registry_database']
categorical = ['性別','Age_bin']
groupby = 'cancer_registry_database'
mytable = TableOne(cancer_select, columns, categorical, groupby, pval = True)
mytable.tableone.to_excel(f'/home/amy/TPMI/Ori_Apt_Adv/{FOLD}/Demographic characteristics_{TIME}.xlsx')
mytable.tableone
```
* Add color in Jupyter Markdown
```javascript!
Statistic of <span style="color:yellow">Title</span> in genes
```
* Split line
```javascript!
print("{:-^50s}".format("Split Line"))
```
* chunks
```javascript!
def chunks(lst, n):
"""Yield successive n-sized chunks from lst."""
for i in range(0, len(lst), n):
yield lst[i:i + n]
pID_chunks = list(chunks(pID_list, 1000))
```
* chunk list to file
```javascript=
path=f'/media/volume1/amy/TPMI/Dementia_re'
outpath=f'/media/volume1/amy/TPMI/Dementia_re/chunk'
num_size=5
def chunks(lst, n):
"""Yield successive n-sized chunks from lst."""
for i in range(0, len(lst), n):
yield lst[i:i + n]
filenames=gb.glob(path+"/"+'rs*_list.csv')
file_chunks = list(chunks(filenames, num_size))
for i in range(len(file_chunks)):
    print(i)
    with open(f"{outpath}/chunksample_{i}.txt", 'w') as f:   # context manager closes the file automatically
        for element in file_chunks[i]:
            f.write("%s\n" % element)
```
* SQL connection and data retrieval (in batches of 1,000 IDs)
```javascript=
pID_list = Target_list['PatientID'].unique().tolist()
pID_chunks = list(chunks(pID_list, 1000))
f_out=pd.DataFrame()
conn = pymssql.connect(server = '10.60.212.206' , user = 'ADCMUH\\T36536' , password = 'LLL123849578' , database = 'DataWareHouse')
for i in range(len(pID_chunks)):
    text = "select a.PatientNo, a.ChartNo, a.SexName, a.Birthday \
            from PatientFactDim as a \
            where a.PatientNo in ({})".format(str(pID_chunks[i])[1:-1])   # the original had an unbalanced '(' before a.PatientNo
    MedCode_Review = pd.read_sql(text, conn)
    # display(MedCode_Review.head(2))
    f_out = pd.concat([f_out, MedCode_Review])   # DataFrame.append was removed in pandas 2.0
# print('N in data warehouse:', f_out.PatientNo.nunique())
SocialHum = f_out
```
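Building the `IN (...)` list by string formatting works for trusted numeric IDs, but a parameterized variant is safer. A minimal sketch, assuming pymssql's `%s` placeholder style and the same `pID_chunks`/`conn` objects from the block above:
```javascript!
# sketch: the same chunked query with parameter placeholders instead of string formatting
chunk = pID_chunks[0]
placeholders = ','.join(['%s'] * len(chunk))   # one %s per ID
text = ("select a.PatientNo, a.ChartNo, a.SexName, a.Birthday "
        "from PatientFactDim as a "
        "where a.PatientNo in ({})".format(placeholders))
MedCode_Review = pd.read_sql(text, conn, params=tuple(chunk))
```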
* SQL connection and data retrieval (all IDs at once)
```javascript!
case_list = TPMI_fam_codebook_case['PatientNo'].unique().tolist()
t = tuple(case_list)
f_out=pd.DataFrame()
conn = pymssql.connect(server = '10.17.11.66' , user = 'ADCMUH\\T36536' , password = 'LL123849578' , database = 'DataWareHouse')
text="select b.PatientNo, a.VisitNo, b.VisitType, b.DrNo, f.EmpName, b.DivNo, e.ChineseName, e.EnglishName, a.DiagType, d.IcdType, a.DiagCode, d.DiagName, d.DiagEngName, a.OrderTime as 'DiagDate'\
from DiagRecordFactDim as a \
left join VisitRecordFact b on a.VisitNo = b.VisitNo \
left join DiagBasicDim d on a.DiagCode = d.DiagCode \
left join DivisionDim e on b.DivNo = e.DivNo \
left join EmployeeFactDim f on b.DrNo = f.EmpNo \
Where b.PatientNo in {}".format(t)   # caution: a single-element tuple renders as (x,), which breaks the SQL
VisitRecord_case = pd.read_sql(text,conn)
VisitRecord_case
```
* Print a full array/list without truncation
```javascript!
np.set_printoptions(threshold=np.inf)
```
* Compute nunique / value_counts in the shell, without loading the file into Python
```javascript!
awk '{print $17}' snplist.annovar.hg38_multianno.txt | grep -v "avsnp150" | sort -u | wc -l   # nunique; uniq alone only collapses adjacent duplicates, so sort first
```
```javascript!
awk '{print $17}' snplist.annovar.hg38_multianno.txt | grep -v "avsnp150" | sort | uniq -c   # value_counts: occurrences per value
```
* Print loop progress
```javascript!
print('Running',GWAS_name,'(',filenames.index(filename)+1,'of 501)')
```
## Statistics
<https://docs.scipy.org/doc/scipy/reference/stats.html>
```javascript!
# Chi-square and Fisher's exact tests, applied row-wise to paired 2x2 inputs
import numpy as np
import pandas as pd
import statsmodels.api as sm
from scipy import stats
def fisher_exact_test(pDataFile1, pDataFile2):
    test_result = []
    for i, (group1, group2) in enumerate(zip(pDataFile1, pDataFile2)):
        # print(group1, group2)
        odds, p_value = stats.fisher_exact(np.ceil([group1, group2]))   # ceil: cell counts must be integers
        test_result.append([odds, p_value])
    return pd.DataFrame([test_result]).T
def chisquare_test(pDataFile1, pDataFile2):
test_result = []
for i, (group1, group2) in enumerate(zip(pDataFile1, pDataFile2)):
# print(group1)
# print(group2)
try:
obs = np.array([group1, group2])
# print(obs)
chi2, p_value, dof, ex = stats.chi2_contingency(obs,correction=True)
# print(chi2)
test_result.append([chi2,p_value,dof])
        except ValueError:
            test_result.append([np.nan, np.nan, np.nan])   # e.g. a zero row/column in the contingency table
return pd.DataFrame([test_result]).T
def oddsratio_95CI(pDataFile1, pDataFile2):
test_result = []
for i, (group1, group2) in enumerate(zip(pDataFile1, pDataFile2)):
# print(group1,group2)
confidence_95=sm.stats.Table2x2(np.ceil([group1, group2])).oddsratio_confint(alpha=0.05, method='normal')
test_result.append(confidence_95)
return pd.DataFrame([test_result]).T
```
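A minimal usage sketch with made-up counts: each paired element supplies one row of a 2×2 table (e.g. cases vs. controls), so the helpers run one test per row:
```javascript!
# hypothetical counts: row i of each list is [carriers, non-carriers] for group i
case_counts = [[12, 88], [5, 95]]
ctrl_counts = [[7, 93], [9, 91]]
res = fisher_exact_test(case_counts, ctrl_counts)   # one [odds_ratio, p] pair per input row
print(res)
```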
* Preview seaborn palette colors
<https://seaborn.pydata.org/tutorial/color_palettes.html>

```javascript!
color = ["azure", "steelblue", "lightsteelblue", "Yellow", "Green", "Grey"]
sns.set_palette(color)
sns.palplot(sns.color_palette())
```

### Logistic regression, printing parameters and odds ratios
```javascript!
import pandas as pd
import statsmodels.formula.api as smf
titanic = pd.read_csv("/home/amy/TPMI/Ischemic_stroke_project/titanic.csv")
titanic = titanic[["survived", "pclass", "sex", "age", "embark_town"]]
titanic = titanic.dropna()
log_reg = smf.logit("survived ~ sex + age + embark_town", data=titanic).fit()
# Summary of results
print(log_reg.summary())
# ... imports, load data, etc.
# Define and fit model
log_reg = smf.logit("survived ~ sex + age + embark_town", data=titanic).fit()
# Inspect parameters
print(log_reg.params)
print(dir(log_reg))
# ... imports, load data, etc.
import numpy as np
# ... Define and fit model
odds_ratios = pd.DataFrame(
    {
        "OR": log_reg.params,
        "Lower CI": log_reg.conf_int()[0],
        "Upper CI": log_reg.conf_int()[1],
        "p-value": log_reg.pvalues,
    }
)
# exponentiate only the log-odds columns; p-values must stay on their original scale
odds_ratios[["OR", "Lower CI", "Upper CI"]] = np.exp(odds_ratios[["OR", "Lower CI", "Upper CI"]])
print(odds_ratios)
```
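To turn the fitted model into predictions, `log_reg.predict` accepts a DataFrame with the original formula columns. A quick sketch with a made-up passenger (assumes the same titanic.csv columns as above):
```javascript!
# hypothetical passenger; column names must match the formula terms
new = pd.DataFrame({"sex": ["female"], "age": [30], "embark_town": ["Southampton"]})
print(log_reg.predict(new))   # predicted probability of survived = 1
```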