# Python Commands
###### tags: `Note`
## Print tool versions
```python!
import tensorflow as tf
import numpy as np

print("TensorFlow v" + tf.__version__)
print("NumPy v" + np.__version__)
```
* Print the versions of all installed packages
> /Users/ching/anaconda3/envs/IMG_course/lib/python3.10/site-packages/_distutils_hack/__init__.py:33: UserWarning: Setuptools is replacing distutils.
> warnings.warn("Setuptools is replacing distutils.")
> /var/folders/5n/ms4ll6f50sv31qy8rqt3l4l00000gn/T/ipykernel_69972/3724946357.py:14: DeprecationWarning: The '__version__' attribute is deprecated and will be removed in Werkzeug 3.1. Use feature detection or 'importlib.metadata.version("werkzeug")' instead.
> version = getattr(module, '__version__', 'unknown')
> These warnings are usually caused by package updates or compatibility changes. They normally do not affect how the code runs, but a few adjustments can be made for future compatibility.
>
> The first warning is about Setuptools replacing Distutils. Setuptools has become the primary build tool in Python and supersedes Distutils, so this warning can usually be ignored; Setuptools generally provides better functionality and compatibility.
>
> The second warning concerns the Werkzeug version. It recommends using importlib.metadata.version("werkzeug") to obtain the version instead of reading the __version__ attribute directly, because __version__ is deprecated and will be removed in Werkzeug 3.1.
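As the second warning suggests, a single package's version can be read through importlib.metadata; a minimal example:
```python!
import importlib.metadata

# preferred replacement for the deprecated __version__ attribute
print(importlib.metadata.version("werkzeug"))
```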
```python!
import pkg_resources
import importlib.metadata

# Packages installed in the current environment (pkg_resources is deprecated
# but still works; importlib.metadata.distributions() is the modern equivalent)
modules = list(pkg_resources.working_set)
module_names = [module.project_name for module in modules]

# Print each package's name and version
for name in module_names:
    try:
        version = importlib.metadata.version(name)
        print(f"{name}: {version}")
    except importlib.metadata.PackageNotFoundError:
        print(f"{name}: version information not available")
```
## Print files
```python!
import os

for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
```
```python!
import glob as gb
import os
import pandas as pd

path = f'/home/amy/TPMI/Ori_Apt_Adv/{FOLD}/Matched_codebook/'
filenames = gb.glob(path + 'Matched_case_control_codebook_*.txt')
alldata = pd.DataFrame()
for filename in filenames:
    print("Running", filenames.index(filename) + 1, "of", len(filenames))
    df = pd.read_csv(filename, sep='\t')
    # Alternative: whitespace-separated, no header, custom column names
    # df = pd.read_csv(filename, sep="\s+", header=None, names=['mode', 'affinity', 'rmsd l.b.', 'rmsd u.b.'])
    df.insert(0, "filename", os.path.basename(filename))
```
## Read file
* Check encoding type
```python!
import chardet

# Read the raw bytes and let chardet guess the encoding
with open('/Users/ching/Documents/work/03_CMUH/4_Manuscript/PRS/WebData_update_231220.txt', 'rb') as f:
    text = f.read()
en = chardet.detect(text)
print(en)
```
* Read vcf
```python!
import io
import pandas as pd

def read_vcf(path):
    # skip the '##' meta lines, keep the '#CHROM' header row
    with open(path, 'r') as f:
        lines = [l for l in f if not l.startswith('##')]
    return pd.read_csv(
        io.StringIO(''.join(lines)),
        dtype={'#CHROM': str, 'POS': int, 'ID': str, 'REF': str, 'ALT': str,
               'QUAL': str, 'FILTER': str, 'INFO': str},
        sep='\t'
    ).rename(columns={'#CHROM': 'CHROM'})

vcf_TPMI_b1tob22_ori = read_vcf('TPMI_b1tob22_original_ok_breast.vcf')
# keep ID plus the genotype columns, then transpose so samples become rows
TPMI_b1tob22_ori_df = pd.concat([vcf_TPMI_b1tob22_ori.iloc[:,2], vcf_TPMI_b1tob22_ori.iloc[:,9:]], axis=1).T
new_header = TPMI_b1tob22_ori_df.iloc[0]
TPMI_b1tob22_ori_df = TPMI_b1tob22_ori_df[1:]
TPMI_b1tob22_ori_df.columns = new_header
TPMI_b1tob22_ori_df.index.name = 'ID'
TPMI_b1tob22_ori_df.reset_index(inplace=True)
```
Combine the APT and ADV genotype tables, masking discordant calls as `./.`:
```python!
APT=read_vcf(f'/home/amy/TPMI/pheWAS/TPMI_b1tob22_APT_OK_{PROJ}.vcf')
ADV=read_vcf(f'/home/amy/TPMI/pheWAS/TPMI_b1tob22_advnorm_ok_{PROJ}.vcf')
APT_df = pd.concat([APT.iloc[:,2],APT.iloc[:,9:]], axis=1).T
new_header = APT_df.iloc[0]
APT_df = APT_df[1:]
APT_df.columns = new_header
APT_df.index.name = 'ID'
APT_df.reset_index(inplace=True)
# APT_df2=APT_df.T.sort_index().T
# display(APT_df)
ADV_df = pd.concat([ADV.iloc[:,2],ADV.iloc[:,9:]], axis=1).T
new_header = ADV_df.iloc[0]
ADV_df = ADV_df[1:]
ADV_df.columns = new_header
ADV_df.index.name = 'ID'
ADV_df.reset_index(inplace=True)
# ADV_df2=ADV_df.T.sort_index().T
# display(ADV_df)
col_var_list = ADV_df.columns.tolist()
col_var_list.remove('ID')
COM = pd.DataFrame({'ID': ADV_df['ID']})  # fresh frame holding just the sample IDs
for var in col_var_list:
APT_df[var]=APT_df[var].astype(str)
ADV_df[var]=ADV_df[var].astype(str)
COM[var] = np.where(APT_df[var]==ADV_df[var],ADV_df[var],'./.')
COM=COM.set_index('ID')
COM.reset_index(inplace=True)
COM['ID'] = COM['ID'].str.split("_", expand=True)[0]
display(COM)
```
Count the genotype calls (`./.`, `0/1`, `1/1`, `0/0`) per SNP:
```python!
SNP_count =COM
gtype = []
for col in SNP_count.columns[1:]:
tmp_series = SNP_count[col].value_counts()
tmp_series.name = col
gtype.append(tmp_series)
SNP_count = pd.concat(gtype, axis=1,sort=True).T
SNP_count = SNP_count.fillna(0)
SNP_count.reset_index(level=0, inplace=True)
SNP_count.rename(columns={'index':'SNP'}, inplace=True)
SNP_count
```
## Dataframe manipulation
* Fuzzy text search (substring match)
```python!
data[data['column'].str.contains('str')]
```
* Exact text match
```python!
data.loc[data['column']=='str']
```
* Fuzzy search for several strings
    * When case differences are not a concern
```python!
searchfor = ['A','B']
extract = data[data["column"].str.contains('|'.join(searchfor))]
```
    * When case differences are a concern
```python!
searchfor = ['A','B']
# lower-case both sides, otherwise upper-case patterns can never match
extract = data[data["column"].str.lower().str.contains('|'.join(searchfor).lower())]
```
* Match whole words only, e.g. GLA should not match CGLA or GLAO
```python=
# notnull() guards the NaN rows; case=False makes the match case-insensitive
ANNOVAR_unimputed[
    (ANNOVAR_unimputed["Gene.refGene"].notnull())&
    (ANNOVAR_unimputed["Gene.refGene"].str.contains(fr"\b(?:{'|'.join(searchfor)})\b", case=False))
]
```
* Search across the whole dataframe
```python!
searchfor = ['A','B']
extract = data[data.apply(lambda r: r.str.contains('|'.join(searchfor), case=False).any(), axis=1)]
```
* List, for each row, the columns that contain any string in searchfor
```python!
extract2 = extract.isin(searchfor).apply(lambda r: ','.join(r.index[r]), axis=1).reset_index()
extract_new = pd.concat([extract2['index'], extract2[0].str.split(',', expand=True)], axis=1)
New_columns=['ID']
Probe_num=range(1,(len(extract2.columns)))
for i in Probe_num:
New_columns.append(f'ID{i}')
extract_new.columns = New_columns
```
* Search for a perfect word match, e.g. GLA must not match DSGLA
```python!
ANNOVAR_unimputed[ANNOVAR_unimputed["Gene.refGene"].str.contains(fr"\b(?:{'|'.join(searchfor)})\b",case=False)]
```
* Exact match against several strings
```python!
searchfor=['Likely pathogenic','pathogenic']
extract=data[data['column'].isin(searchfor)]
```
* Calculations: take the log
    * log10
```python!
data['log']=abs(np.log10(data['column']))
```
    * log2 fold change
```python!
data['log2FC']=np.log2(data['b']/data['a'])
```
* Keep only rows whose value is numeric (drops rows with non-numeric characters)
```python!
data[pd.to_numeric(data['column'], errors='coerce').notnull()]
```
* Change a column's dtype
```python!
data.column = data.column.astype(str)
```
* Convert the element type of a list
```python!
pID_list_new = [float(x) for x in pID_list_new]
```
* Fill missing values with a number or string
```python!
data['column']=data['column'].fillna(0)
data['column']=data['column'].fillna('str')
```
* Bin the data into intervals
```python!
bins=[-1.0,0.0001, 0.0005, 0.001, 0.005, 0.01]
data['Categories']=pd.cut(data['column'],bins)
```
* List all unique values in the whole dataframe
```python!
print(list(pd.unique(data.values.ravel('K'))))
```
* List all unique values in a column
```python!
data.column.unique()
data['column'].unique()
```
* Replace text
    * Several at once
```python!
data['column'] = data['column'].replace({"A":"a","B":"b","C":"c"})
```
    * One at a time
```python!
data=data.replace('A','B')
data.column=data.column.replace('A','B')
data['column']=data['column'].replace('A','B')
data.loc[data['columnA']=='A','columnA']='B'
```
* Rename headers
```python!
data=data.rename(columns={'A':'a','B':'b','C':'c'})
```
* Pivot-table
```python!
data.pivot_table(values=['SNP'],
                 columns=['ClinicalSignificance'],
                 index=['Categories'],
                 # keep exactly ONE aggfunc; the commented lines are common alternatives
                 aggfunc=lambda x: len(x.unique()),
                 # aggfunc=lambda x: ', '.join(x),
                 # aggfunc=[np.mean, np.median, np.std],
                 # aggfunc=lambda x: ', '.join(x.unique()),
                 # aggfunc=lambda x: ', '.join(str(val) for val in x),
                 dropna=False,
                 margins=True,
                 margins_name='All',
                 fill_value=0)
```
* Combine multi-headers with "_"
```python!
APT_val_table.columns = ['_'.join((col)) for col in APT_val_table.columns]
```
* If condition → true or false
    * Single condition
```python!
data['TF'] = np.where(data['A']==data['B'], 'True', 'False')
```
    * Multiple conditions
```python!
data.loc[(data['A']== 'a') & (data['B'] == 'b'), 'TF'] = 'True'
```
* Conditionally transform values
```python!
Target_list['RecurrentDate_ad'] = Target_list.apply(lambda x: x['RecurrentDate'] + 19110000 if x['RecurrentDate']>0 else x['RecurrentDate'],axis=1)
```
* Split a string and keep one part
```python!
data['New_column']=data['column'].str.split("_", expand=True)[0]
```
* Reshape a table (melt)
```python!
data2=pd.melt(data, id_vars =['ID'], value_vars =columns,var_name ='catagories', value_name ='Value')
```
* Concatenate tables
```python!
pd.concat([table1,table2],axis=1,ignore_index=True)  # same index: add columns side by side
pd.concat([table1,table2],axis=0,ignore_index=True)  # same columns: stack rows
alldata = pd.concat([alldata, data])  # DataFrame.append was removed in pandas 2.0
```
* Extract the substring that matches any pattern in a list, to ease merging
```python!
pat = '|'.join(r"{}".format(x) for x in data['target'])
data['result'] = data.target.str.extract('('+ pat + ')', expand=False)
```
* Convert date formats
    * Date only, separated by '/'
```python!
data['DATE'] = data['DATE'].apply(pd.to_datetime, format='%Y/%m/%d', errors='coerce')
```
    * With time, separated by '-'
```python!
data['DATE'] = data['DATE'].apply(pd.to_datetime, format='%Y-%m-%d %H:%M', errors='coerce')
```
* Compute age
```python!
data['Age'] = data["Sampling"].dt.year-data["birth"].dt.year
```
* Sort (ascending)
```python!
data=data.sort_values(by=['Age'],ascending=True)
```
* Strip whitespace from text
```python!
data.column = data.column.str.strip()
```
* Split a text file every 3,000 lines
```python!
def Main():
    source_dir = f"/home/amy/TPMI/Ori_Apt_Adv/SNP_{PROJ}_plink.txt"
    target_dir = '/home/amy/TPMI/Ori_Apt_Adv/TXT_file/'
    flag = 0
    name = 1
    dataList = []
    print("start")
    with open(source_dir, 'r') as f_source:
        for line in f_source:
            flag += 1
            dataList.append(line)
            if flag == 3000:
                with open(target_dir + "list_" + str(name) + ".txt", 'w+') as f_target:
                    for data in dataList:
                        f_target.write(data)
                name += 1
                flag = 0
                dataList = []
    # write the remaining lines (skipped if the count was an exact multiple of 3000)
    if dataList:
        with open(target_dir + "list_" + str(name) + ".txt", 'w+') as f_target:
            for data in dataList:
                f_target.write(data)
    print("Done")

if __name__ == "__main__":
    Main()
```
* Groupby + agg
```python!
count=data.groupby(['SNP','RefSNP ID from dbSNP','A1A2']).agg({'Affy SNP ID':[np.size],'Probe Set ID':[np.unique],'MAF':[np.unique]})
```
* Join a multi-level header with '_'
```python!
count.columns = ['_'.join((col)) for col in count.columns]
```
* Split one column by a delimiter
```python!
anno=pd.DataFrame(select['MAF_unique'].str.split(" ",).tolist()).rename(lambda x: 'x-{}'.format(x + 1), axis=1)
```
* applymap applies a computation to every cell of a dataframe
  e.g. format every cell as a percentage
```python!
case_bim_bin_table_sep2=case_bim_bin_table_sep2.applymap(lambda x: '%.2f%%' % (x*100))
```
* read file in a folder & add a column with filename
```python!
import glob as gb
import os
import pandas as pd

path = f'/home/amy/TPMI/Ori_Apt_Adv/{FOLD}/Matched_codebook/'
filenames = gb.glob(path + 'Matched_case_control_codebook_*.txt')
alldata = pd.DataFrame()
for filename in filenames:
    print("Running", filenames.index(filename) + 1, "of", len(filenames))
    df = pd.read_csv(filename, sep='\t')
    # Alternative: whitespace-separated, no header, custom column names
    # df = pd.read_csv(filename, sep="\s+", header=None, names=['mode', 'affinity', 'rmsd l.b.', 'rmsd u.b.'])
    df.insert(0, "filename", os.path.basename(filename))
    df['filename'] = df['filename'].str.split("sample_|.txt", expand=True)[1]
    df.insert(1, "datatype", df['filename'].str.split("_").str[-1])
    alldata = pd.concat([alldata, df], ignore_index=True)  # DataFrame.append was removed in pandas 2.0
display(alldata.head())
```
* read file with multiple sheets
```python!
import os
import pandas as pd

DIS = 'FH'
FILE = 'FH_result_名單'
excel_name = '/home/amy/other_project/GWAS_validation result/FH result 名單.xlsx'
# xlrd >= 2.0 no longer reads .xlsx; let pandas use openpyxl instead
xls = pd.ExcelFile(excel_name)
print(xls.sheet_names)
alldata = pd.DataFrame()
for sheet in xls.sheet_names:
    # read_excel no longer accepts the old index/encoding arguments
    df = pd.read_excel(excel_name, sheet_name=sheet)
    df['filename'] = os.path.basename(excel_name)
    alldata = pd.concat([alldata, df], ignore_index=True)
display(alldata)
```
* set index
```python!
alldata=alldata.loc[alldata['Categories']=='All'].set_index(['filename','DIS','CATA']).reset_index()
```
* Extract rows whose genotype columns contain 0/1 or 1/1
```python!
mutation_select3 = APT_df2[APT_df2.apply(lambda r: r.str.contains('|'.join(mutation), case=False).any(), axis=1)]
```
* List duplicated rows
```python!
Target_list[Target_list.duplicated(['PatientID'],keep=False)]
```
* Strip leading/trailing whitespace in a column
```python!
ICD_diagnosis_diagage.DiagCode = ICD_diagnosis_diagage.DiagCode.str.strip()
```
* get unique element in a list
```python!
from functools import reduce

mylist = [u'nowplaying', u'PBS', u'PBS', u'nowplaying', u'job', u'debate', u'thenandnow']
unique = reduce(lambda l, x: l.append(x) or l if x not in l else l, mylist, [])
```
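If order-preserving de-duplication is all that is needed, a dict.fromkeys one-liner avoids the reduce trick:
```python!
mylist = [u'nowplaying', u'PBS', u'PBS', u'nowplaying', u'job', u'debate', u'thenandnow']
# dict keys are unique and keep insertion order (Python 3.7+)
unique = list(dict.fromkeys(mylist))
```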
* Reorder the index by a custom order
```python!
order=['other','not_provided','Uncertain_significance','protective','Benign','Benign/Likely_benign','Likely_benign','Conflicting_interpretations_of_pathogenicity','risk_factor','Likely_pathogenic','Pathogenic/Likely_pathogenic','Pathogenic']
CLNSIG_count = CLNSIG_count.reindex(order)
```
* Build zero-padded sequence numbers within each group (001, 002, 003)
```python!
ICDmap['group_count'] = ICDmap.groupby(['category']).cumcount()+1
ICDmap['group_count'] = ICDmap['group_count'].apply(lambda x: '{0:0>3}'.format(x))
```
* Cut percentile
```python!
PGS_score['percentile'] = pd.qcut(PGS_score['PRS'].rank(method='first'), q=10, labels=False,duplicates='raise')+1
```
## Read json files
```python=
import glob as gb
import json
import os
import pandas as pd

path=f'/media/volume2/data_base/aic_testing/14_LCMS/'
filenames=gb.glob(path+'7E2Y_4D_peptide_*/model_*_multimer_v2_pred_*.log')
alldata = pd.DataFrame()
for filename in filenames:
with open(filename, 'r') as file:
data = json.load(file)
# Create a dictionary to store the reorganized data
reorganized_data = {}
# Iterate over the rows in data
for row in data:
if isinstance(row, dict):
# If row is a dictionary, assume the keys are column names and values are corresponding values
column_names = list(row.keys())[:4]
values = list(row.values())[0:]
else:
# Handle unsupported row format
continue
# Assign the values to the corresponding columns
for i, column_name in enumerate(column_names):
reorganized_data.setdefault(column_name, []).append(values[i] if i < len(values) else None)
# Create the DataFrame from the reorganized data
df = pd.DataFrame(reorganized_data)
df.insert(0, "filepath", os.path.abspath(filename))
df.insert(1, "group", df['filepath'].str.split("/").str[6])
df.insert(2, "name", df['filepath'].str.split("/|.log").str[7])
df.insert(3, "model", df['name'].str.split("_").str[1])
df.insert(4, "cycle", df['name'].str.split("_|.log").str[5])
    alldata = pd.concat([alldata, df], ignore_index=True)  # DataFrame.append was removed in pandas 2.0
LOGfile=alldata.copy().reset_index(drop=True)
LOGfile
```
```python=
path=f'/media/volume2/data_base/aic_testing/14_LCMS/'
filenames=gb.glob(path+'7E2Y_4D_peptide_*/ranking_debug.json')
alldata = pd.DataFrame()
for filename in filenames:
    # the original hard-coded one file here; use the loop variable instead
    with open(filename) as file:
        data = json.load(file)
# Flatten data
df_nested_list = pd.json_normalize(data).T
df_nested_list = df_nested_list.drop('order').reset_index()
df_nested_list.columns = ['name','0.2 iptm + 0.8 ptm']
df_nested_list['name']=df_nested_list['name'].str.split(".", expand=True)[1]
df_nested_list['Rank'] = df_nested_list['0.2 iptm + 0.8 ptm'].rank(ascending=False).astype(int)
df_nested_list.insert(0, "filepath", os.path.abspath(filename))
df_nested_list.insert(1, "group", df_nested_list['filepath'].str.split("/").str[6])
# df_nested_list.insert(3, "model", df_nested_list['name'].str.split("_").str[1])
# df_nested_list.insert(4, "cycle", df_nested_list['name'].str.split("_|.log").str[5])
    alldata = pd.concat([alldata, df_nested_list], ignore_index=True)  # DataFrame.append was removed in pandas 2.0
RANKfile=alldata.copy().reset_index(drop=True)
RANKfile
```
## Figure adjustment
* Add a horizontal line
```python!
ax.hlines(19.5, -0.5, 4.5, linestyle='--', linewidth=1)
```
https://www.cntofu.com/book/172/docs/13.md
### ECDF plot
```python!
from matplotlib.ticker import FuncFormatter
fig, ax = plt.subplots(figsize=(5, 4))
ecdf_plot = sns.ecdfplot(
data=PGS_for_plot,
x="Number of Individuals",
hue="Used to build model",
palette=["blue", "green","orange"])
# ecdf_plot.set_size_inches(15,15)
# Define a function for formatting x-axis labels
def format_x_ticks(value, pos):
return '{:,.1f}M'.format(value * 1e-6)
# Set the x-axis label formatter
ax.xaxis.set_major_formatter(FuncFormatter(format_x_ticks))
lines = ecdf_plot.get_lines()
x_values = [line.get_xdata() for line in lines]
y_values = [line.get_ydata() for line in lines]
# Find the x-value at a given proportion for each hue category
for target_proportion in (0.3, 0.6, 0.5):
    x_values_at_proportion = [x[np.argmax(y >= target_proportion)] for x, y in zip(x_values, y_values)]
    print(target_proportion, x_values_at_proportion)
# x_values_at_proportion now holds the values for the last proportion (0.5), used below
plt.axhline(0.5, linestyle='--', linewidth=1, color='r')
plt.annotate('{:,.0f}'.format(x_values_at_proportion[0]),
xy = (x_values_at_proportion[0], 0.5),
xytext = (x_values_at_proportion[0]+400000, 0.35),
arrowprops = dict(facecolor = 'black', width = 0.2, headwidth = 10),
horizontalalignment = 'center')
plt.annotate('{:,.0f}'.format(x_values_at_proportion[2]),
xy = (x_values_at_proportion[2], 0.5),
xytext = (x_values_at_proportion[2]+350000, 0.28),
arrowprops = dict(facecolor = 'black', width = 0.2, headwidth = 10),
horizontalalignment = 'center')
ax.set_xlabel("\nNumber of Individuals",fontsize=12)
ax.set_ylabel("Proportion",fontsize=12)
sns.move_legend(ax, loc='upper left', bbox_to_anchor=(1, 1), title=None)
plt.savefig(
f'/Users/ching/Documents/work/03_CMUH/4_Manuscript/PRS/outfiles/240119_suppfig.png',
bbox_inches="tight",
dpi=150)
plt.show()
```

### Heatmap
https://zhuanlan.zhihu.com/p/35494575
example: https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-019-2744-2/figures/5
* normalization
with z-score: https://www.programsbuzz.com/article/python-seaborn-matrix-plots-cluster-map
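A minimal sketch of a z-score-normalized heatmap with seaborn's clustermap, on a built-in dataset:
```python=
import seaborn as sns
import matplotlib.pyplot as plt

iris = sns.load_dataset('iris').drop(columns='species')
# z_score=1 standardizes each column before clustering
sns.clustermap(iris, z_score=1, cmap='vlag', figsize=(7, 7))
plt.show()
```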
### Hierarchically-clustered Heatmap
https://www.geeksforgeeks.org/hierarchically-clustered-heatmap-in-python-with-seaborn-clustermap/
https://www.python-graph-gallery.com/404-dendrogram-with-heat-map/
https://cmdlinetips.com/2020/01/heatmaps-with-seaborns-clustermap/
example: https://stackoverflow.com/questions/60177861/plotting-annotated-heatmaps-clustermaps-with-multiple-legends-in-seaborn
example: https://www.chrisremmel.com/post/seaborn-color-labels/
workbook: https://www.cntofu.com/book/172/docs/31.md
* methods
https://medium.com/qiubingcheng/%E6%AD%90%E6%B0%8F%E8%B7%9D%E9%9B%A2%E8%88%87%E9%A4%98%E5%BC%A6%E7%9B%B8%E4%BC%BC%E5%BA%A6%E7%9A%84%E6%AF%94%E8%BC%83-c78163ad51b
https://www.twblogs.net/a/5cfe993fbd9eee14644ee0ce
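The links above compare Euclidean distance with cosine similarity; clustermap accepts these through its metric parameter, which maps onto scipy distances. A small sketch with toy vectors:
```python=
from scipy.spatial.distance import pdist

a = [[1, 2, 3], [2, 4, 6], [1, 0, 0]]
print(pdist(a, metric='euclidean'))
print(pdist(a, metric='cosine'))  # 0 for parallel vectors, e.g. rows 1 and 2
```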

* set color bar position
https://stackoverflow.com/questions/62882084/seaborn-clustermap-legend-overlap-with-figure
* add grouped color
https://datavizpyr.com/hierarchically-clustered-heatmap-with-seaborn-in-python/
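A minimal sketch of the grouped-color idea from the links: map each class to a color and pass it as row_colors so the clustered rows get a color strip:
```python=
import seaborn as sns

iris = sns.load_dataset('iris')
species = iris.pop('species')
# one color per class; passed alongside the data as a row annotation
lut = dict(zip(species.unique(), ['#66c2a5', '#fc8d62', '#8da0cb']))
sns.clustermap(iris, row_colors=species.map(lut), z_score=1, cmap='vlag')
```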
### Firth logistic regression
Paper: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4063169/
Paper: https://pubmed.ncbi.nlm.nih.gov/28295456/
Paper: https://www.sas.com/content/dam/SAS/support/en/sas-global-forum-proceedings/2020/4654-2020.pdf
knowledge: https://medium.datadriveninvestor.com/firths-logistic-regression-classification-with-datasets-that-are-small-imbalanced-or-separated-49d7782a13f1
R workbook: https://cran.r-project.org/web/packages/logistf/logistf.pdf
workbook: https://www.rdocumentation.org/packages/logistf/versions/1.24/topics/logistf
YouTube: https://www.youtube.com/watch?v=fVbrUz6V_uk
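The references above use R's logistf; for Python, the third-party firthlogist package offers a scikit-learn-style interface. A minimal sketch on toy data (the package name and API here are an assumption based on its documentation, not something used in these notes):
```python=
# pip install firthlogist  -- third-party package; API assumed from its README
import numpy as np
from firthlogist import FirthLogisticRegression

rng = np.random.default_rng(0)
X = rng.normal(size=(50, 3))     # toy design matrix
y = rng.integers(0, 2, size=50)  # toy binary outcome
fl = FirthLogisticRegression()
fl.fit(X, y)
print(fl.coef_, fl.intercept_)
```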
### Schedule plot
https://wurmen.github.io/Genetic-Algorithm-for-Job-Shop-Scheduling-and-NSGA-II/implementation%20with%20python/GA-jobshop/Example1.html
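The link draws a Gantt-style schedule chart for job-shop scheduling; a minimal matplotlib sketch of the same idea with made-up jobs:
```python=
import matplotlib.pyplot as plt

# (start, duration) bars per machine; toy data
schedule = {'Machine 1': [(0, 3), (4, 2)], 'Machine 2': [(1, 2), (3, 4)]}
fig, ax = plt.subplots(figsize=(6, 2.5))
for i, (machine, bars) in enumerate(schedule.items()):
    ax.broken_barh(bars, (i * 10, 8), facecolors=f'C{i}')
ax.set_yticks([i * 10 + 4 for i in range(len(schedule))])
ax.set_yticklabels(list(schedule))
ax.set_xlabel('Time')
plt.show()
```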
### swarmplot

https://seaborn.pydata.org/generated/seaborn.swarmplot.html
https://stackoverflow.com/questions/51826447/how-to-resize-a-seaborn-catplot
```python!
sns.catplot(data=PGS_sum_df[PGS_sum_df['AUROC'].notnull()], x="Broad Ancestry Category", y="AUROC", hue="contain_covar",kind="swarm",aspect=3)
```

### BOX plot
```python=
fig, ax = plt.subplots(figsize=(7,6))
ax = sns.boxplot(data=case_bim_bin_fig, x=x, y=y,order=order,\
width=0.3,showfliers=False,color='w',\
boxprops=dict(linewidth=1.2,edgecolor='black'),whiskerprops=dict(linewidth=1.2,color='black'),\
medianprops=dict(linewidth=1.7,color='orange'),capprops=dict(linewidth=1.2,color='black'))
add_stat_annotation(ax, data=case_bim_bin_fig, x=x, y=y,order=order,
box_pairs=[("(0.01%, 0.05%]", "(0.05%, 0.1%]"), \
("(0.01%, 0.05%]", "(0.1%, 0.5%]"), \
("(0.01%, 0.05%]", "(0.5%, 1%]")],
test='Mann-Whitney', text_format='star', loc='outside', verbose=2)
ax.yaxis.set_major_formatter(FuncFormatter(lambda y, _: '{:.2%}'.format(y)))
ax.grid(color = 'silver', linestyle = '--', linewidth = 1)
# ax.set(xlabel='Minor allele frequency in WGS (n=2170)', ylabel=f'{TAR}', title='')
plt.suptitle('')
plt.title('')
# plt.xlabel('Minor allele frequency in WGS (n=2,170)', fontsize=14,labelpad=15)
plt.ylabel(f'{TAR}', fontsize=14,labelpad=15)
ax.spines['bottom'].set_color('black')
ax.spines['top'].set_color('black')
ax.spines['right'].set_color('black')
ax.spines['left'].set_color('black')
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.tick_params(direction='out',bottom = 'on',left = 'on')
plt.rcParams['savefig.dpi'] = 300
```
### Density plot
```python=
comma = lambda y, _: format(int(y), ',')  # assumed helper: thousands-separated tick labels

def density_plot(a,c):
case_bim_bin_fig=case_bim_bin2[['SNP',a]]
# case_bim_bin_fig=case_bim_bin_fig.loc[case_bim_bin_fig['MAF_WGS']>0]
case_bim_bin_fig=pd.merge(case_bim_bin_fig,c,left_on='SNP',right_on='0',how='left')
case_bim_bin_fig['TF'] = np.where(case_bim_bin_fig['SNP']==case_bim_bin_fig['0'], 'True', 'False')
# case_bim_bin_fig=case_bim_bin_fig.loc[case_bim_bin_fig['TF']=='True']
x1=list(case_bim_bin_fig[case_bim_bin_fig['TF'] == 'True'][a])
x2=list(case_bim_bin_fig[case_bim_bin_fig['TF'] == 'False'][a])
fig, ax = plt.subplots(figsize=(12, 6))
# ax=case_bim_bin_fig[b].plot(kind='density',linewidth=3)
# sns.kdeplot(data=case_bim_bin_fig, x=a, hue="TF",multiple="stack")
# sns.kdeplot(case_bim_bin_fig[a], shade=True, color="r")
# colors = ['blue', 'red']
counts, bins, patches=ax.hist(x1, bins = int(10),
# color = colors,
edgecolor = 'black')
# sample = np.hstack((case_bim_bin_fig[b], case_bim_bin_fig[a]))
# ax.plot(sample, [0.01]*len(sample), '|', color='k', markersize=12)
# ax.plot(case_bim_bin_fig['MAF_WGS'], density(case_bim_bin_fig['MAF_WGS']))
plt.grid(linestyle = '--')
# plt.xlim(-0.004, 0.012)
# plt.legend(bbox_to_anchor=(1.05, 1.0), loc='upper left')
ax.spines['bottom'].set_color('black')
ax.spines['top'].set_color('black')
ax.spines['right'].set_color('black')
ax.spines['left'].set_color('black')
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.title(a, fontsize=16)
# plt.xlabel('Minor allele frequency in WGS (n=2,170)', fontsize=14,labelpad=15)
plt.ylabel('Count', fontsize=14,labelpad=15)
ax.xaxis.set_major_formatter(FuncFormatter(lambda x, _: '{:.2%}'.format(x)))
ax.yaxis.set_major_formatter(FuncFormatter(comma))
plt.tick_params(direction='out',bottom = 'on',left = 'on')
plt.rcParams['savefig.dpi'] = 300
bin_x_centers = 0.5 * np.diff(bins) + bins[:-1]
bin_y_centers = ax.get_yticks()[1] * 0.25
for i in range(len(bins)-1):
bin_label = "{0:,}".format(counts[i]) + " ({0:,.2f}%)".format((counts[i]/counts.sum())*100)
plt.text(bin_x_centers[i], bin_y_centers, bin_label, rotation=90,rotation_mode='anchor')
plt.tight_layout()
plt.show()
density_plot('MAF_WGS',APT_list)
density_plot('MAF_WGS',ADV_list)
```
### Distribution plot
```python=
palette = {1: "#30A9DE", 2:'#ef5285'}
ax1=sns.displot(data=best_practice, x="PRS", hue="caco", kind="kde",
palette=palette,common_norm=False, common_grid=True, height=3.5, aspect=1.25,
fill=True, alpha=.1)
ax1.map(plt.axvline,x=0,ls='--', c='gray')
plt.show()
sns.displot(data=best_practice, x="PRS", hue="caco", kind="ecdf",
palette=palette, height=3.5, aspect=1.25)
plt.show()
```
* Case-control comparison
```python=
sns.displot(merge_info_all, x="merge_age", hue="Sex", bins=30, fill=True,palette=["#0080FF", "#FF3399"],col='pheno',facet_kws={'sharey': False})
```

```python=
tIDcode = tIDcode[tIDcode['Age_bin']!='nan']
sns.displot(tIDcode, x="Age", hue="Sex", kind="kde", fill=True,palette=["#FF3399", "#0080FF"])
plt.savefig('/media/volume1/amy/TPMI/TMPI_29W_aged.pdf',bbox_inches="tight", dpi=300)
plt.savefig('/media/volume1/amy/TPMI/TMPI_29W_aged.png',bbox_inches="tight", dpi=300)
plt.show()
```

```python=
# Define the color palette
palette = {"Male": "#0080FF", "Female": "#FF3399"}
# Create the displot
g = sns.displot(df, x="DiagAge", hue="sex", bins=30, fill=True, palette=palette, facet_kws={'sharey': True}, col='sex')
g.set_titles("{col_name}", size=10)
# Get the axes of the subplots
axes = g.axes.flat
# Iterate over the subplots
for ax in axes:
# Get the gender of the subplot
gender = ax.get_title()
# Filter the DataFrame based on the gender
filtered_df = df[df['sex'] == gender]
# Calculate the count of each gender
count = filtered_df.shape[0]
# Add the count as text annotation on the plot
props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)
ax.text(0.8, 0.9, f"Count: {count}", transform=ax.transAxes, ha='center', va='top', fontsize=10, bbox=props)
# Show the plot
plt.show()
```

### Pyramid Chart
```python=
bins = [0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100, 180]
agelable = ["0-4", "5-9", "10-14", "15-19", "20-24", "25-29", "30-34", "35-39", "40-44", "45-49", "50-54", "55-59", "60-64", "65-69", "70-74", "75-79", "80-84", "85-89", "90-94", "95-99", "100+"]
df['group'] = pd.cut(df["DiagAge"], bins=bins, labels=agelable)
population_df = df.pivot_table(
values='patientID',
index='group',
columns='sex',
aggfunc=np.size,
margins=False,
fill_value=0
).reset_index()
population_df["Female_Left"] = 0
population_df["Female_Width"] = population_df["Female"]
population_df["Male_Left"] = -population_df["Male"]
population_df["Male_Width"] = population_df["Male"]
female_color = "#ee7a87"
male_color = "#4682b4"
fig, ax = plt.subplots(figsize=(9, 6))
plt.barh(y=population_df["group"], width=population_df["Female_Width"],color="#ee7a87", label="Female")
plt.barh(y=population_df["group"], width=population_df["Male_Width"],
left=population_df["Male_Left"],
color="#4682b4", label="Male")
for idx in range(len(population_df)):
plt.text(x=population_df["Male_Left"][idx]-0.1, y=idx, s=" {} ".format(population_df["Male"][idx]),ha="right", va="center",fontsize=10, color="#4682b4")
plt.text(x=population_df["Female_Width"][idx]+0.1, y=idx, s=" {} ".format(population_df["Female"][idx]),ha="left", va="center",fontsize=10, color="#ee7a87")
plt.xlim(-100,50)
ticks = ax.get_xticks()
ax.set_xticklabels([int(abs(tick)) for tick in ticks])
plt.xlabel("Count", fontsize=14, fontweight="bold")
plt.ylabel("Age Range", fontsize=14, fontweight="bold")
plt.legend(bbox_to_anchor=(1.05, 1.0), loc='upper left')
plt.title("Stroke Population Pyramid Chart", loc="center", pad=10, fontsize=14,fontweight="bold")
```

```python=
age = np.array(
["0 - 4", "5 - 9", "10 - 14", "15 - 19", "20 - 24", "25 - 29", "30 - 34", "35 - 39",
"40 - 44", "45 - 49", "50 - 54", "55 - 59", "60 - 64", "65 - 69", "70 - 74",
"75 - 79", "80 - 84", "85 - 89", "90 - 94", "95 - 99", "> 100"])
m = np.array(
[1915887, 2100931, 2494484, 2464805, 2361297, 2529633, 2669927, 2754129, 2753282,
2552531, 2211649, 1697221, 1311024, 902675, 722246, 482686, 273915, 108639, 35867,
10965, 1238])
f = np.array(
[1823981, 1980712, 2369795, 2435784, 2330223, 2562964, 2724990, 2860720, 2918730,
2713534, 2376384, 1869867, 1454373, 1030677, 885393, 640698, 388748, 172302, 64170,
19868, 2711])
x = np.arange(age.size)
# Modified before the function because you are shifting the bars left/right
tick_val = np.array([-3000000, -2000000, -1000000, 1000000, 2000000, 3000000])
shift = 300000
tick_val[0:3] -= shift
tick_val[3:] += shift
# Modified inside the function
plt.barh(x, -m, alpha=.75, height=.75, left=-shift, align='center' , color="deepskyblue")
plt.barh(x, f, alpha=.75, height=.75, left = shift, align='center', color="pink")
for i, j in enumerate(age):
if i==0 or i==1:
plt.text(-150000, x[i]-0.2, j, fontsize=10)
else:
plt.text(-230000, x[i]-0.2, j, fontsize=10)
```

### Hist plot
```python=
bins = [0, 0.0001, 0.0005, 0.001, 0.005, 0.01]
# data =   # (left unfinished in the original notes)
fig, ax = plt.subplots(figsize=(12, 6))
# # ax=case_bim_bin_fig[b].plot(kind='density',linewidth=3)
# # sns.kdeplot(data=case_bim_bin_fig, x=a, hue="TF",multiple="stack")
# # sns.kdeplot(case_bim_bin_fig[a], shade=True, color="r")
colors = ['blue', 'red','green']
ax.hist([list(case_bim_bin_fig['MAF_WGS']),
list(case_bim_bin_fig[case_bim_bin_fig['TF_APT'] == 'True']['MAF_WGS']),
list(case_bim_bin_fig[case_bim_bin_fig['TF_ADV'] == 'True']['MAF_WGS'])], bins = bins,
color = colors, edgecolor = 'black')
# # sample = np.hstack((case_bim_bin_fig[b], case_bim_bin_fig[a]))
# # ax.plot(sample, [0.01]*len(sample), '|', color='k', markersize=12)
# # ax.plot(case_bim_bin_fig['MAF_WGS'], density(case_bim_bin_fig['MAF_WGS']))
plt.grid(linestyle = '--')
plt.xscale('log')
# # plt.xlim(-0.004, 0.012)
# # plt.legend(bbox_to_anchor=(1.05, 1.0), loc='upper left')
ax.spines['bottom'].set_color('black')
ax.spines['top'].set_color('black')
ax.spines['right'].set_color('black')
ax.spines['left'].set_color('black')
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
# plt.title(a, fontsize=16)
# # plt.xlabel('Minor allele frequency in WGS (n=2,170)', fontsize=14,labelpad=15)
plt.ylabel('Count', fontsize=14,labelpad=15)
ax.xaxis.set_major_formatter(FuncFormatter(lambda x, _: '{:.2%}'.format(x)))
ax.yaxis.set_major_formatter(FuncFormatter(comma))
plt.tick_params(direction='out',bottom = 'on',left = 'on')
plt.rcParams['savefig.dpi'] = 300
plt.tight_layout()
plt.show()
```
```python=
fig, ax = plt.subplots(figsize=(12, 6))
# ax=case_bim_bin_fig.MAF_WGS.plot(kind='density',linewidth=3)
# plt.xlim(-0.1, 1.0)
sample = np.hstack((case_bim_bin_fig['MAF_WGS'], case_bim_bin_fig[f'{TAR}']))
ax.plot(sample, [0.1]*len(sample), '|', color='k', markersize=12)
ax.hist(case_bim_bin_fig['MAF_WGS'],1000, ec='red', fc='none', lw=1.5, histtype='step', label='Dist A')
# ax.plot(case_bim_bin_fig['MAF_WGS'], density(case_bim_bin_fig['MAF_WGS']))
plt.grid(linestyle = '--')
# plt.xlim(-0.1, 1.0)
# plt.legend(bbox_to_anchor=(1.05, 1.0), loc='upper left')
ax.spines['bottom'].set_color('black')
ax.spines['top'].set_color('black')
ax.spines['right'].set_color('black')
ax.spines['left'].set_color('black')
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.title(f'{TAR}', fontsize=16)
plt.xlabel('Minor allele frequency in WGS (n=2,170)', fontsize=14,labelpad=15)
plt.ylabel('Density', fontsize=14,labelpad=15)
# ax.xaxis.set_major_formatter(FuncFormatter(lambda x, _: '{:.2%}'.format(x)))
plt.tick_params(direction='out',bottom = 'on',left = 'on')
plt.rcParams['savefig.dpi'] = 300
plt.tight_layout()
plt.show()
```
```python=
# p = sns.displot(data=PGS_sum_df[PGS_sum_df['AUROC'].notnull()], x='AUROC', stat='percent', col='contain_covar', height=3)
fg = sns.displot(data=PGS_sum_df[(PGS_sum_df['AUROC'].notnull())], x='AUROC', stat='percent', height=3.5, aspect=1.25, bins=[0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0])
for ax in fg.axes.ravel():
# add annotations
for c in ax.containers:
# custom label calculates percent and add an empty string so 0 value bars don't have a number
labels = [f'{w:0.2f}%' if (w := v.get_height()) > 0 else '' for v in c]
print(labels)
percentile_list = pd.DataFrame({'AUC': ['(0.0, 0.1]', '(0.1, 0.2]', '(0.2, 0.3]', '(0.3, 0.4]','(0.4, 0.5]','(0.5, 0.6]','(0.6, 0.7]', '(0.7, 0.8]', '(0.8, 0.9]', '(0.9, 1.0]'],'percentage': labels})
ax.bar_label(c, labels=labels, label_type='edge', fontsize=8, rotation=90, padding=2)
ax.margins(y=0.2)
plt.show()
```

```python=
# p = sns.displot(data=PGS_sum_df[PGS_sum_df['AUROC'].notnull()], x='AUROC', stat='percent', col='contain_covar', height=3)
fg = sns.displot(data=PGS_sum_df[(PGS_sum_df['AUROC'].notnull())], x='Broad Ancestry Category', stat='percent', height=3.5, aspect=1.25, bins=[0,3,5,10,18])
fg.set_xticklabels(rotation=90)
for ax in fg.axes.ravel():
# add annotations
for c in ax.containers:
# custom label calculates percent and add an empty string so 0 value bars don't have a number
labels = [f'{w:0.3f}%' if (w := v.get_height()) > 0 else '' for v in c]
print(labels)
# percentile_list = pd.DataFrame({'AUC': ['(0.0, 0.1]', '(0.1, 0.2]', '(0.2, 0.3]', '(0.3, 0.4]','(0.4, 0.5]','(0.5, 0.6]','(0.6, 0.7]', '(0.7, 0.8]', '(0.8, 0.9]', '(0.9, 1.0]'],'percentage': labels})
ax.bar_label(c, labels=labels, label_type='edge', fontsize=8, rotation=90, padding=2)
ax.margins(y=0.2)
plt.show()
```

### Bar chart
```python=
bins = [0,0.0001, 0.0005, 0.001, 0.005, 0.01]
data1=list(case_bim_bin_fig[case_bim_bin_fig['TF_APT'] == 'True']['MAF_WGS'])
data2=list(case_bim_bin_fig[case_bim_bin_fig['TF_APT'] == 'False']['MAF_WGS'])
# print(data1)
hist1, bin_edges1 = np.histogram(data1,bins)
hist2, bin_edges2 = np.histogram(data2,bins)
# Setting the positions and width for the bars
pos = list(range(len(hist1)))
width = 0.3
print(range(len(hist1)))
print(range(len(hist2)))
fig, ax = plt.subplots(figsize=(12, 6))
ax.bar([p - width*0.5 for p in pos], hist1, width, edgecolor='black',linewidth=1.2,
color='black',error_kw=dict(ecolor='black',elinewidth=0.5,lolims=True,marker='o'))
ax.bar([p + width*0.5 for p in pos], hist2, width, edgecolor='black',linewidth=1.2,
color='white',hatch='\\\\',error_kw=dict(ecolor='black',elinewidth=0.5,lolims=True,marker='o'))
# df2=plt.bar([p - width*1 for p in pos], hist2, width, edgecolor='black',linewidth=1.2,
# label=case_bim_bin_fig['Categories'][1],color='black',error_kw=dict(ecolor='black',elinewidth=0.5,lolims=True,marker='o'))
# Set the xticklabels to a string that tells us what the bin edges were
ax.set_xticklabels(['{} - {}'.format(bins[i],bins[i+1]) for i,j in enumerate(hist1)])
plt.tight_layout()
```
```python=
Disease_dis = ICD_diag_CRC_all.pivot_table(
values=['PatientNo'],
index=['DiagName'],
aggfunc=lambda x: len(x.unique())
).reset_index()
Disease_dis.columns =['Disease', 'Count']
Disease_dis = Disease_dis.sort_values(by=['Count'],ascending=False)
fig, ax = plt.subplots(figsize=(4, 6))
sns.barplot(x="Count", y="Disease", data=Disease_dis.head(20),
label="Count", color="black")
```

```python=
plt.figure(figsize=(25,5))
sns.barplot(data = pheno_count_map[pheno_count_map['case']>5000].sort_values(by=['case'],ascending=False), x="phenotype", y='case')
plt.xticks(rotation=90,ha='center')
plt.savefig('/media/volume1/amy/TPMI/Phecode_samplecount.png', bbox_inches = 'tight', dpi = 300)
plt.savefig('/media/volume1/amy/TPMI/Phecode_samplecount.pdf', bbox_inches = 'tight', dpi = 300)
plt.show()
```

```python=
category = 'mental disorders'
fig, ax = plt.subplots(figsize=(25,5))
df_draw = pheno_count_map[pheno_count_map['category']==category].sort_values(by=['case'],ascending=False)
clrs = ['grey' if (x < 1000) else 'red' for x in df_draw['case'] ]
props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)
sns.barplot(
data = df_draw,
x="phenotype",
y='case',
palette=clrs)
plt.xticks(rotation=90,ha='center')
ax.text(.8, .85, "category = {}".format(category),transform=ax.transAxes,size=16,bbox=props)
plt.savefig(f'/media/volume1/amy/TPMI/phecode_result/Phecode_samplecount_{category}.png', bbox_inches = 'tight', dpi = 300)
plt.savefig(f'/media/volume1/amy/TPMI/phecode_result/Phecode_samplecount_{category}.pdf', bbox_inches = 'tight', dpi = 300)
plt.show()
```

```python=
PGS_sum_df['AUROC_TF']=PGS_sum_df['AUROC'].notnull()
PGS_sum_df['HR_TF']=PGS_sum_df['Hazard Ratio (HR)'].notnull()
PGS_sum_df['OR_TF']=PGS_sum_df['Odds Ratio (OR)'].notnull()
table = PGS_sum_df.pivot_table(
values=['PGS Performance Metric (PPM) ID'],
columns=['AUROC_TF','HR_TF','OR_TF'],
aggfunc=lambda x: len(x.unique()),
dropna=False
)
splot = sns.barplot(data=table,palette=['darkorange','darkorange','darkorange','darkorange','cornflowerblue','cornflowerblue','cornflowerblue','cornflowerblue'])
splot.set_ylim(0, 11000)
# plt.xticks(rotation=90, ha='center')
plt.tick_params(
axis='x', # changes apply to the x-axis
which='both', # both major and minor ticks are affected
bottom=True, # ticks along the bottom edge are off
top=False, # ticks along the top edge are off
labelbottom=False) # labels along the bottom edge are off
splot.set_ylabel("Number of performance records",fontsize=12)
for p in splot.patches:
splot.annotate(format(p.get_height(), '.0f'), (p.get_x() + p.get_width() / 2., p.get_height()), ha = 'center', va = 'center', xytext = (0, 10), textcoords = 'offset points')
plt.savefig(
f'Output/Barplot_of_PGS_catalog.png',
bbox_inches="tight", dpi=300)
```

```python=
df_stack = (PGS_score
.groupby("percentile")["Pheno"]
.value_counts(normalize=True)
.mul(100)
.round(2)
.unstack())
df_stack
fig, ax = plt.subplots(figsize = (9,9))
ax.bar(df_stack.index, df_stack["Case"], label = "Case", width = 0.9, color = "#FF5858")
ax.bar(df_stack.index, df_stack["Control"], bottom = df_stack.Case, label = "Control", width = 0.9, color = "#00ABB3")
# Add labels
for c in ax.containers:
labels = [str(round(v.get_height(), 2)) + "%" if v.get_height() > 0 else '' for v in c]
ax.bar_label(c,
label_type='center',
labels = labels,
size = 10) # add a container object "c" as first argument
# Remove spines
for s in ["top", "right"]:
ax.spines[s].set_visible(False)
# Add labels
ax.tick_params(labelsize = 12, labelrotation = 0)
ax.set_xticks(df_stack.index)
ax.set_ylabel("Percentage (%)", size = 12)
ax.set_xlabel("Percentile", size = 12)
plt.legend(bbox_to_anchor=(1.05, 1.0), loc='upper left',title="Pheno")
ax.spines['top'].set_visible(True)
ax.spines['right'].set_visible(True)
```

```python=
from scipy.stats import norm
from sklearn.datasets import make_blobs
import colorcet as cc
blobs, labels = make_blobs(n_samples=1000, centers=2, center_box=(-100, 100))
palette = sns.color_palette(cc.glasbey_hv, n_colors=10).as_hex()
PRS_name=plot_file.split("/")[8]
g = sns.displot(
np.array(PGS_score[PGS_score['Pheno']=='Case']['SCORE']),
bins = 40,
kde=False,
stat='percent',
height=3.5, aspect=1,
# hist_kws=dict(edgecolor="black", linewidth=1,alpha=0.9)
)
count_patient=PGS_score[PGS_score['Pheno']=='Case']['IID'].nunique()
print(count_patient)
# plt.text(4.8, 40,'n = {:.0f}'.format(count_patient),fontsize=10)
plt.text(2, 6.3,'n = {:.0f}'.format(count_patient),fontsize=10)
# for i in bins_SCORE:
# plt.axvline(x=i, linestyle='--')
for ax in g.axes.flat:
for rectangle in ax.patches:
if (rectangle.get_x() >= bins_SCORE[0])&(rectangle.get_x() < bins_SCORE[1]):
rectangle.set_facecolor(palette[0])
elif (rectangle.get_x() >= bins_SCORE[1])&(rectangle.get_x() < bins_SCORE[2]):
rectangle.set_facecolor(palette[1])
elif (rectangle.get_x() >= bins_SCORE[2])&(rectangle.get_x() < bins_SCORE[3]):
rectangle.set_facecolor(palette[2])
elif (rectangle.get_x() >= bins_SCORE[3])&(rectangle.get_x() < bins_SCORE[4]):
rectangle.set_facecolor(palette[3])
elif (rectangle.get_x() >= bins_SCORE[4])&(rectangle.get_x() < bins_SCORE[5]):
rectangle.set_facecolor(palette[4])
elif (rectangle.get_x() >= bins_SCORE[5])&(rectangle.get_x() < bins_SCORE[6]):
rectangle.set_facecolor(palette[5])
elif (rectangle.get_x() >= bins_SCORE[6])&(rectangle.get_x() < bins_SCORE[7]):
rectangle.set_facecolor(palette[6])
elif (rectangle.get_x() >= bins_SCORE[7])&(rectangle.get_x() < bins_SCORE[8]):
rectangle.set_facecolor(palette[7])
elif (rectangle.get_x() >= bins_SCORE[8])&(rectangle.get_x() < bins_SCORE[9]):
rectangle.set_facecolor(palette[8])
elif (rectangle.get_x() >= bins_SCORE[9]):
rectangle.set_facecolor(palette[9])
g.set_axis_labels("Polygenic risk score", "Density of disease cases (%)\n",fontsize=12)
plt.savefig(
f'Output/Displot_of_cases_{PRS_name}.png',
bbox_inches="tight", dpi=300)
plt.show()
```

### Violin plot
```python=
# Compare age distributions
plt.figure(figsize=(3,4))
ax = sns.violinplot(data=Target_codebook_inTPMI, x='Disease',y="Age_now", hue="SexName",
hue_order=['Female','Male'],palette=['#FF7777','#3DB2FF'], width=0.5)
add_stat_annotation(ax, data=Target_codebook_inTPMI,x='Disease', y="Age_now", hue="SexName",
hue_order=['Female','Male'],
box_pairs=[(('CRC','Female'),('CRC','Male'))
],
test='t-test_ind', text_format='simple', loc='inside', verbose=2)
plt.legend(loc='upper left', bbox_to_anchor=(1.03, 1))
# # ax.grid(color = 'silver', linestyle = '--', linewidth = 1)
# plt.xticks(rotation=30,ha='right' )
# plt.savefig('./Dementia_project/Output/TPMI/Diagnosis_age.pdf',bbox_inches="tight", dpi=300)
# plt.savefig('./Dementia_project/Output/TPMI/Diagnosis_age.png',bbox_inches="tight", dpi=300)
plt.show()
```
```python=
from statannot import add_stat_annotation
from statannotations.Annotator import Annotator
fig, ax = plt.subplots(figsize=(4,4))
ax = sns.violinplot(
data=PGS_for_plot,
y='Used to build model',
x='Number of Individuals',
width=0.9,showfliers=True,
palette=['blue','green','orange'],
# boxprops=dict(linewidth=1.2,edgecolor='black'),
# whiskerprops=dict(linewidth=1.2,color='black'),
# medianprops=dict(linewidth=1.7,color='orange'),
# capprops=dict(linewidth=1.2,color='black')
)
# Define a function for formatting x-axis labels
def format_x_ticks(value, pos):
return '{:,.1f}M'.format(value * 1e-6)
# Set the x-axis label formatter
ax.xaxis.set_major_formatter(FuncFormatter(format_x_ticks))
# add_stat_annotation(
# ax,
# data=PGS_for_plot,
# y='Used to build model',
# x='Number of Individuals',
# box_pairs=[('Original','Passed to the next step'),
# ('Original','Selected for optimized model'),
# ('Passed to the next step','Selected for optimized model')],
# test='Mann-Whitney',
# text_format='simple',
# loc='outside',
# verbose=0.5
# )
ax.set_ylabel("Used to build model",fontsize=12)
ax.set_xlabel("\nNumber of Individuals",fontsize=12)
# plt.yticks(rotation=30,ha='right')
plt.savefig(
f'/Users/ching/Documents/work/03_CMUH/4_Manuscript/PRS/outfiles/240118_suppfig2-2.png',
bbox_inches="tight",
dpi=150)
plt.show()
```

```python=
x = "Subtype"
y = "DiagAge"
hue = "SexName"
plt.figure(figsize=(7,4))
ax = sns.boxplot(data=test, x=x, y=y, hue=hue,palette=['#FF7777','#3DB2FF'])
add_stat_annotation(ax, data=test, x=x, y=y, hue=hue,
box_pairs=[(("Alzheimer\'s disease", "Female"), ("Alzheimer\'s disease", "Male")),
(("Lewy Body", "Female"), ("Lewy Body", "Male")),
(("Frontotemporal dementia", "Female"), ("Frontotemporal dementia", "Male")),
(("Vascular dementia", "Female"), ("Vascular dementia", "Male")),
(("Other cerebral degeneration", "Female"), ("Other cerebral degeneration", "Male")),
(("Non-specific dementias", "Female"), ("Non-specific dementias", "Male")),
(("Secondary Parkinsonism", "Female"), ("Secondary Parkinsonism", "Male")),
(("Not included in study definition of dementia", "Female"), ("Not included in study definition of dementia", "Male")),
(("other comorbidity", "Female"), ("other comorbidity", "Male"))
],
test='t-test_ind', text_format='star', loc='inside', verbose=2)
plt.legend(loc='upper left', bbox_to_anchor=(1.03, 1))
# ax.grid(color = 'silver', linestyle = '--', linewidth = 1)
plt.xticks(rotation=30,ha='right' )
plt.savefig('./Dementia_project/Output/TPMI/Diagnosis_age.pdf',bbox_inches="tight", dpi=300)
plt.savefig('./Dementia_project/Output/TPMI/Diagnosis_age.png',bbox_inches="tight", dpi=300)
plt.show()
```

```python=
# define custom plotting function
def custom_boxplot(*args, **kwargs):
ax = sns.violinplot(*args, **kwargs,palette=['#2EC4B6','#EE7785'])
statannot.add_stat_annotation(
ax, plot='boxplot',
data=kwargs['data'], x=kwargs['x'], y=kwargs['y'],
box_pairs=[('Control', 'LAA')],loc='outside',
test='Mann-Whitney', text_format='simple'
)
# create plot
g = sns.FacetGrid(
df_m1, col="item",sharey=False, sharex=False,
col_wrap=6, height=3, aspect=1,margin_titles=True)
g.map_dataframe(custom_boxplot, x='Group', y='value')
g.set_titles("{col_name}\n\n")
g.fig.subplots_adjust(hspace =0.5)
g.add_legend()
plt.savefig(
'LAA_project/subfigure_1.png',
bbox_inches="tight", dpi=300)
plt.show()
```

```python=
# single
for i in item_order:
x = "Age_bin"
y = "Age"
hue = "Sex"
plt.figure(figsize=(4,4))
ax = sns.violinplot(data=tIDcode[tIDcode['Age_bin']==i], x=x, y=y, hue=hue,palette=['#FF7777','#3DB2FF'],order=item_order)
add_stat_annotation(ax, data=tIDcode[tIDcode['Age_bin']!='nan'], x=x, y=y, hue=hue,order=item_order,comparisons_correction='bonferroni',
box_pairs=[((i, "Female"), (i, "Male")),
],
test='t-test_ind', text_format='star', loc='inside', verbose=2,line_offset_to_box=0.3)
plt.legend(loc='upper left', bbox_to_anchor=(1.03, 1))
# ax.grid(color = 'silver', linestyle = '--', linewidth = 1)
# plt.xticks(rotation=30,ha='right' )
# plt.savefig('/media/volume1/amy/TPMI/TMPI_29W_age.pdf',bbox_inches="tight", dpi=300)
# plt.savefig('/media/volume1/amy/TPMI/TMPI_29W_age.png',bbox_inches="tight", dpi=300)
plt.show()
```

### Venn plot
```python!
from matplotlib_venn import venn2, venn2_circles, venn3, venn3_circles

ICD_diag_cancer = ICD_diagnosis_diagage[(ICD_diagnosis_diagage['DiagName'].notnull())&
                                        (ICD_diagnosis_diagage['DiagName'].str.contains('惡性腫瘤'))]
set_CRC=set(ICD_diag_CRC.PatientNo.unique().tolist())
set_CANCER=set(ICD_diag_cancer.PatientNo.unique().tolist())
difference = list(set(set_CRC) - set(set_CANCER))
same = list(set_CRC & set_CANCER)  # overlap between the two patient sets
join = list(set(set_CRC) | set(set_CANCER))
set_join = set(join)
print('CRC:',len(set_CRC),'CANCER:',len(set_CANCER),'joined:',len(same),'not in ICD:',len(difference))
v=venn2([set_CRC, set_CANCER], ('CRC', 'Cancer'))
c=venn2_circles([set_CRC, set_CANCER], linestyle='dashed', linewidth=1, color="grey")
plt.show()
v=venn3([set_CRC, set_CANCER, set_join], ('CRC', 'Cancer','TPMI'))
c=venn3_circles([set_CRC, set_CANCER, set_join], linestyle='dashed', linewidth=1, color="grey")
plt.show()
```
```python!
set_unimputed = set(unimputed)
set_imputed = set(imputed)
difference = list(set(set_unimputed) - set(set_imputed))
same = list(set(set_unimputed) & set(set_imputed))
# join = list(set(set_CRC) | set(set_CANCER))
# # set_join = set(join)
print('unimputed:',len(unimputed),'imputed:',len(imputed))
print('joined:',len(same),'not in unimputed data:',len(difference))
v=venn2([set_unimputed, set_imputed], ('unimputed', 'imputed'))
c=venn2_circles([set_unimputed, set_imputed], linestyle='dashed', linewidth=1, color="grey")
plt.show()
```

```python=
v=venn3([set_17W_sig, set_29W_sig_un, set_29W_sig], ('17W sig', '29W sig + uncertain','29W sig'))
c=venn3_circles([set_17W_sig, set_29W_sig_un, set_29W_sig], linestyle='dashed', linewidth=1, color="grey")
plt.annotate(',\n'.join(set_17W_sig-set_29W_sig_un-set_29W_sig), xy=v.get_label_by_id('100').get_position() +
np.array([-0.04, 0]), xytext=(-130,-120), ha='right',
textcoords='offset points',
bbox=dict(boxstyle='round,pad=0.5', fc='lightgray', alpha=0.9),
arrowprops=dict(arrowstyle='->',
connectionstyle='arc',color='black'))
plt.annotate(',\n'.join(set_17W_sig&set_29W_sig_un-set_29W_sig), xy=v.get_label_by_id('110').get_position() +
np.array([-0.02, 0.02]), xytext=(-140,10), ha='right',
textcoords='offset points',
bbox=dict(boxstyle='round,pad=0.5', fc='lightgray', alpha=0.9),
arrowprops=dict(arrowstyle='->',
connectionstyle='arc',color='black'))
plt.annotate(',\n'.join(set_17W_sig&set_29W_sig_un&set_29W_sig), xy=v.get_label_by_id('111').get_position() +
np.array([0.02, 0]), xytext=(170,20), ha='left',
textcoords='offset points',
bbox=dict(boxstyle='round,pad=0.5', fc='lightgray', alpha=0.9),
arrowprops=dict(arrowstyle='->',
connectionstyle='arc',color='black'))
plt.annotate(',\n'.join(set_29W_sig_un&set_29W_sig-set_17W_sig), xy=v.get_label_by_id('011').get_position() +
np.array([0.02, 0]), xytext=(120,-30), ha='left',
textcoords='offset points',
bbox=dict(boxstyle='round,pad=0.5', fc='lightgray', alpha=0.9),
arrowprops=dict(arrowstyle='->',
connectionstyle='arc',color='black'))
plt.annotate(',\n'.join(set_29W_sig_un-set_17W_sig-set_29W_sig), xy=v.get_label_by_id('010').get_position() +
np.array([0.02, 0]), xytext=(160,-90), ha='left',
textcoords='offset points',
bbox=dict(boxstyle='round,pad=0.5', fc='lightgray', alpha=0.9),
arrowprops=dict(arrowstyle='->',
connectionstyle='arc',color='black'))
plt.savefig('/media/volume1/amy/TPMI/29W_fGWAS_result/Alzheimer_EEG_SNPcompare.png',dpi=300,bbox_inches="tight")
plt.savefig('/media/volume1/amy/TPMI/29W_fGWAS_result/Alzheimer_EEG_SNPcompare.pdf',dpi=300,bbox_inches="tight")
plt.show()
```

```python=
# four groups
import venn
set_AD=set(dementia_df[dementia_df['Alzheimer\'s disease']==2.0]['id'].unique().tolist())
set_senile=set(dementia_df[dementia_df['Senile dementia']==2.0]['id'].unique().tolist())
set_vascular=set(dementia_df[dementia_df['Vascular dementia']==2.0]['id'].unique().tolist())
set_cerebral=set(dementia_df[dementia_df['Dementia with cerebral degenerations']==2.0]['id'].unique().tolist())
# v=venn3([set_AD, set_senile, set_vascular], ('Alzheimer\'s disease', 'Senile dementia','Vascular dementia'))
# c=venn3_circles([set_AD, set_senile, set_vascular], linestyle='dashed', linewidth=1, color="grey")
# plt.show()
labels = venn.get_labels([set_AD, set_cerebral, set_senile, set_vascular], fill=['number'])
fig, ax = venn.venn4(labels, names=['Alzheimer\'s disease','Dementia with cerebral degenerations','Senile dementia','Vascular dementia'])
fig.show()
```
### Bar chart with a broken (wavy) axis
```python=
from matplotlib.path import Path
import matplotlib.patches as mpatches

CLNSIG_count=ANNOVAR_g[ANNOVAR_g['CLNALLELEID'].notnull()].pivot_table(
values=['avsnp150'],
columns=['filename'],
index=['Class'],
aggfunc=lambda x: len(x.unique()),
)
CLNSIG_count=CLNSIG_count.avsnp150
order=['other','not_provided','Uncertain_significance','protective','Benign','Benign/Likely_benign','Likely_benign','Conflicting_interpretations_of_pathogenicity','risk_factor','Likely_pathogenic','Pathogenic/Likely_pathogenic','Pathogenic']
CLNSIG_count = CLNSIG_count.reindex(order)
display(CLNSIG_count)
# Create the canvas: three x-axis segments sharing one y-axis
fig, ax = plt.subplots(ncols=3, figsize=(5,3), sharey=True,
gridspec_kw={'width_ratios': (2,1,1)} )
fig.subplots_adjust(wspace=0.0)
CLNSIG_count.plot.barh(ax = ax[0],color={"imputed": "#3399FF", "unimputed": "#FFB266"})
CLNSIG_count.plot.barh(ax = ax[1],color={"imputed": "#3399FF", "unimputed": "#FFB266"})
CLNSIG_count.plot.barh(ax = ax[2],color={"imputed": "#3399FF", "unimputed": "#FFB266"})
# Set the visible x-range and tick spacing of each segment
ax[0].set_xlim(0,450) # this segment shows 0-450
ax[0].set_xticks(np.arange(0,400+10,100)) # tick positions
ax[1].set_xlim(1000,7500)
ax[1].set_xticks(np.arange(3000,6000+1000,3000))
ax[2].set_xlim(22500,25000)
ax[2].set_xticks(np.arange(23300,24500+100,1200))
# Hide the borders between adjacent segments
ax[0].spines['right'].set_visible(False)
ax[1].spines['left'].set_visible(False)
ax[1].spines['right'].set_visible(False)
ax[2].spines['left'].set_visible(False)
# Hide the y tick marks/labels on the inner segments
ax[1].tick_params(axis='y', which='both', left=False, labelleft=False)
ax[2].tick_params(axis='y', which='both', left=False, labelleft=False)
# Adjust the size of the axis/category labels
ax[0].yaxis.set_tick_params(labelsize=10)
# Draw the wavy break marks
d1 = 0.04
d2 = 0.025 # wave amplitude
wn = 59 # number of wave points; must be odd
pp = (0,d2,0,-d2)
px = np.linspace(-d1,1+d1,wn)
py = np.array([1+pp[i%4] for i in range(0,wn)])
p = Path(list(zip(py,px)), [Path.MOVETO]+[Path.CURVE3]*(wn-1))
line1 = mpatches.PathPatch(p, lw=4, edgecolor='black',
facecolor='None', clip_on=False,
transform=ax[0].transAxes, zorder=10)
line2 = mpatches.PathPatch(p,lw=3, edgecolor='white',
facecolor='None', clip_on=False,
transform=ax[0].transAxes, zorder=10,
capstyle='round')
a = ax[1].add_patch(line1)
a = ax[1].add_patch(line2)
d1 = 0.04
d2 = 0.06
wn = 59
pp = (0,d2,0,-d2)
px = np.linspace(-d1,1+d1,wn)
py = np.array([1+pp[i%4] for i in range(0,wn)])
p = Path(list(zip(py,px)), [Path.MOVETO]+[Path.CURVE3]*(wn-1))
line1 = mpatches.PathPatch(p, lw=4, edgecolor='black',
facecolor='None', clip_on=False,
transform=ax[1].transAxes, zorder=10)
line2 = mpatches.PathPatch(p,lw=3, edgecolor='white',
facecolor='None', clip_on=False,
transform=ax[1].transAxes, zorder=10,
capstyle='round')
a = ax[2].add_patch(line1)
a = ax[2].add_patch(line2)
# Keep one legend and place it outside the axes
ax[0].get_legend().remove()
ax[1].get_legend().remove()
ax[2].legend(bbox_to_anchor=(1.05, 1.0), loc='upper left')
# Export the figure
plt.tight_layout()
plt.savefig('/media/volume1/amy/TPMI/TPMI_29W_ClinVar.png',dpi=300,bbox_inches="tight")
plt.savefig('/media/volume1/amy/TPMI/TPMI_29W_ClinVar.pdf',dpi=300,bbox_inches="tight")
plt.show()
```

### Pointplot
<https://stackoverflow.com/questions/45849028/seaborn-facetgrid-pointplot-label-data-points>
<https://stackoverflow.com/questions/61234555/how-to-remove-repeated-axes-labels-in-a-seaborn-facetgrid>
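No snippet was saved for this one; a minimal seaborn pointplot sketch on a built-in dataset (the two links above cover labeling points and de-duplicating FacetGrid axis labels):
```python=
import seaborn as sns
import matplotlib.pyplot as plt

tips = sns.load_dataset('tips')
ax = sns.pointplot(data=tips, x='day', y='total_bill', hue='sex', dodge=True)
ax.set_ylabel('Mean total bill')
plt.show()
```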
### Scatter plot
```python=
from sklearn.datasets import make_blobs
import colorcet as cc
blobs, labels = make_blobs(n_samples=1000, centers=2, center_box=(-100, 100))
palette = sns.color_palette(cc.glasbey_bw, n_colors=18)
fig, ax = plt.subplots(figsize=(20, 6))
sns.scatterplot(
data=p_lowest_metadata.sort_values('prevalence rate (per 100,000)', ascending=False),
x='logp.value_U',
y='prevalence rate (per 100,000)',
hue='category',
# style='sex',
palette=palette,
edgecolor=None,
alpha=0.8,
s=60,
# cmap="viridis"
)
# plt.title('Exploring Physical Attributes of Different Penguins')
# plt.xlabel('Bill Length (mm)')
# plt.ylabel('Bill Depth (mm)')
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0)
plt.tight_layout()
# save before plt.show(); once the figure is shown and closed, savefig writes a blank image
plt.savefig(
    f'Output/Disease_precalence_logp_all_huebyDisease.png',
    bbox_inches="tight", dpi=300)
plt.show()
```

```python=
from sklearn.datasets import make_blobs
import colorcet as cc
# blobs, labels = make_blobs(n_samples=1000, centers=2, center_box=(-100, 100))
# palette = sns.color_palette(cc.glasbey, n_colors=17)
g = sns.FacetGrid(
p_lowest_metadata,
col="category",
hue="category",
palette=palette,
height=2, aspect=1.5,
col_wrap = 6)
g = g.map(plt.scatter, "logp.value_U", "prevalence rate (per 100,000)", edgecolor=None, alpha=0.8, s=60)
grouped = p_lowest_metadata.groupby(["category"], sort=False)
for ax, (name, df) in zip(g.axes.flat, grouped):
SNP_raw_count = np.size(df['phenotype_codebook'])
sig_SNP_raw_count = np.size(df[df['logp.value_U']>-np.log10(0.0000025)]['phenotype_codebook'])
print(df['category'].unique(),SNP_raw_count,sig_SNP_raw_count)
ax.text(.7, .9, "n = "+'{:.0f} / {:.0f}'.format(sig_SNP_raw_count,SNP_raw_count),transform=ax.transAxes,size=10)
# ax.text(.8, .7, " n = "+'{:.0f}'.format(sig_SNP_raw_count),transform=ax.transAxes,size=8)
g.set_titles("{col_name}",size = 10,)
g.set_axis_labels("$-log_{10}{(P)}$", "Prevalence rate\n(per 100,000 people)")
plt.savefig(
f'Output/Disease_precalence_logp_all_byDisease.png',
bbox_inches="tight", dpi=300)
```

```python=
fig, ax = plt.subplots(figsize=(7,4))
PRS_name=plot_file.split("/")[8]
ax = sns.scatterplot(
x="percentile_100",
y="IID",
data=PGS_score_count[PGS_score_count['Pheno']=='Case'],
s=60,
alpha=0.9,
edgecolor=None,
hue='percentile',
palette=palette
)
ax.spines[['right', 'top']].set_visible(False)
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0)
ax.set_xlabel("Percentile of polygenic score",fontsize=12)
ax.set_ylabel("Prevalence of disease (%)",fontsize=12)
plt.savefig(
f'Output/Scatterplot_of_cases_{PRS_name}.png',
bbox_inches="tight", dpi=300)
plt.show()
```

```python=
import matplotlib.pyplot as plt
import numpy as np; np.random.seed(0)
import matplotlib.transforms as transforms
T_PGS_sum_df=PGS_sum_df[(PGS_sum_df['AUROC'].notnull())&(PGS_sum_df['contain_covar']==True)]
T_PGS_sum_df['Broad Ancestry Category'] = pd.Categorical(T_PGS_sum_df['Broad Ancestry Category'], ['European', 'European, African unspecified', 'European, Not reported', 'East Asian', 'South Asian', 'South East Asian', 'Asian unspecified', 'African American or Afro-Caribbean', 'African unspecified', 'Sub-Saharan African', 'Greater Middle Eastern', 'Native American', 'Hispanic or Latin American', 'Oceanian', 'Other admixed ancestry', 'Multi-ancestry', 'Not reported'])
F_PGS_sum_df=PGS_sum_df[(PGS_sum_df['AUROC'].notnull())&(PGS_sum_df['contain_covar']==False)]
F_PGS_sum_df['Broad Ancestry Category'] = pd.Categorical(F_PGS_sum_df['Broad Ancestry Category'], ['European', 'European, African unspecified', 'European, Not reported', 'East Asian', 'South Asian', 'South East Asian', 'Asian unspecified', 'African American or Afro-Caribbean','African unspecified', 'Sub-Saharan African', 'Greater Middle Eastern', 'Native American', 'Hispanic or Latin American', 'Oceanian', 'Other admixed ancestry', 'Multi-ancestry', 'Not reported'])
# year = T_PGS_sum_df['Broad Ancestry Category']
# values = T_PGS_sum_df['AUROC']
fig, ax = plt.subplots(figsize=(10,6))
offset = lambda p: transforms.ScaledTranslation(p/72.,0, plt.gcf().dpi_scale_trans)
trans = plt.gca().transData
# blue circles = with covariates (shifted 3 pt left); red = without covariates (3 pt right)
sc1 = plt.scatter(T_PGS_sum_df['Broad Ancestry Category'], T_PGS_sum_df['AUROC'], facecolors='none', edgecolors='blue', s=25, transform=trans+offset(-3), alpha=0.8)
plt.scatter(T_PGS_sum_df['Broad Ancestry Category'], T_PGS_sum_df['AUROC'], facecolors='none', edgecolors='none', s=25, alpha=1)   # invisible points at the unshifted positions (likely kept to anchor autoscaling)
plt.scatter(F_PGS_sum_df['Broad Ancestry Category'], F_PGS_sum_df['AUROC'], facecolors='none', edgecolors='red', s=25, alpha=0.8, transform=trans+offset(3))
plt.xticks(rotation=90, ha='center')
ax.xaxis.grid(ls='--', c='gray',linewidth=1,alpha=0.3)
plt.show()
```

### lmplot
```javascript=
from scipy import stats
# fig, ax = plt.subplots(figsize=(20, 6))
g = sns.lmplot(
data=p_lowest_metadata,
x='No.SNP_PGScatalog',
y='No.SNP_CMUH_caculate',
    fit_reg=True,
    lowess=True,      # LOWESS smoother; note that ci is ignored when lowess=True
    height=5, aspect=1,
    line_kws={'color':'b', "lw":2},
    scatter_kws={"color": "black", "alpha":0.3, "s":60},
# logx=True,
# color="g"
)
g.set_axis_labels("Number of SNPs list in PGS catalog", "Number of SNPs used for calculation",fontsize=12)
slope = stats.linregress(p_lowest_metadata['No.SNP_PGScatalog'], p_lowest_metadata['No.SNP_CMUH_caculate'])[0]   # stats.stats is deprecated
r, p = stats.pearsonr(p_lowest_metadata['No.SNP_PGScatalog'], p_lowest_metadata['No.SNP_CMUH_caculate'])
print('p =', p)
# plt.text(0, 4.5e6, 'slope = {:.2f}'.format(slope), fontsize=10)
plt.text(0, 4.0e6, 'r = {:.2f}'.format(r), fontsize=10)
plt.text(0, 3.5e6, 'p = {:.2e}'.format(p), fontsize=10)   # scientific notation; .2f would round a small p to 0.00
for ax in g.axes.flat:
    labels = ax.get_xticks()   # current x tick positions
    new_labels = ['{:,.0f}'.format(label) + 'M' for label in labels/1000000]
    ax.set_xticks(labels)      # fix the locator before relabeling (avoids a FixedFormatter warning)
    ax.set_xticklabels(new_labels, rotation=0)
    labels = ax.get_yticks()   # current y tick positions
    new_labels = ['{:,.0f}'.format(label) + 'M' for label in labels/1000000]
    ax.set_yticks(labels)
    ax.set_yticklabels(new_labels, rotation=0)
plt.savefig(
f'Output/p_lowest_metadata_SNP_count.png',
bbox_inches="tight", dpi=300)
```

### Jointplot
```javascript=
from scipy import stats
from sklearn.preprocessing import StandardScaler
scale = StandardScaler()   # z-score scaler object
sub_set = p_lowest_metadata[['logp.value_U','prevalence rate (per 100,000)']]
sub_set = sub_set[~sub_set.isin([np.nan, np.inf, -np.inf]).any(axis=1)]
train_set_scaled = pd.DataFrame(scale.fit_transform(sub_set),
                                columns=sub_set.keys())   # z-scored copy (not used by the plot below)
j = sns.jointplot(
    x=sub_set['logp.value_U'], y=sub_set['prevalence rate (per 100,000)'],   # keyword args; positional x/y were removed in seaborn 0.12
kind="reg",
marker='x',
color='b',
scatter_kws={'alpha':0.5},
marginal_kws=dict(bins=20,kde=True,color='b'),
)
r, p = stats.pearsonr(sub_set['logp.value_U'],sub_set['prevalence rate (per 100,000)'])
slope = stats.linregress(sub_set['logp.value_U'], sub_set['prevalence rate (per 100,000)'])[0]   # stats.stats is deprecated
# j.ax_joint.annotate('slope = {:.2f}'.format(slope), xy=(.83, .85), xycoords='axes fraction', fontsize=10)
j.ax_joint.annotate('r = {:.2f} '.format(r), xy=(.73, .85), xycoords='axes fraction', fontsize=10)
print('p =', p)
# the P annotations are hardcoded: the observed p printed above and the 2.5e-6 significance threshold
j.ax_joint.annotate(r'$P = 6.44 \times 10^{%d}$' % (-3), xy=(.73, .78), xycoords='axes fraction', fontsize=10)
j.ax_joint.annotate(r'$P = 2.5 \times 10^{%d}$' % (-6), xy=(.10, .8), xycoords='axes fraction', fontsize=10, color='crimson')
for ax in (j.ax_joint, j.ax_marg_x):
ax.axvline(-np.log10(0.0000025), color='crimson', ls='--', lw=1.5)
j.set_axis_labels("\n$-log_{10}{(P)}$", "Prevalence rate\n(per 100,000 people)\n",fontsize=12)
j.fig.set_size_inches(5,5)
plt.savefig(
f'Output/Disease_precalence_logp_all.png',
bbox_inches="tight", dpi=300)
plt.show()
```

```javascript=
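# dodge overlapping points: shift x by +0.2 when covariates are included, -0.2 otherwise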
PGS_sum_df['Category_num2']=np.where(PGS_sum_df['contain_covar']==True, PGS_sum_df['Category_num']+0.2, PGS_sum_df['Category_num']-0.2)
g=sns.jointplot(data=PGS_sum_df[PGS_sum_df['AUROC'].notnull()], x="Category_num2", y="AUROC",
hue="contain_covar",palette={True:'cornflowerblue',False:'darkorange'},marker='o', s = 50,alpha=0.4)
g.fig.set_size_inches((10, 4))
g.ax_marg_x.set_xlim(0, 18)
g.ax_joint.set_xticks([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18])
# mapping kept for reference only; the literal label list below is what is actually applied
value_label_map = {'1':'European',
'2':'European, African unspecified',
'3':'European, Not reported',
'4':'East Asian',
'5':'South Asian',
'6':'South East Asian',
'7':'Asian unspecified',
'8':'African American or Afro-Caribbean',
'9':'African unspecified',
'10':'Sub-Saharan African',
'11':'Greater Middle Eastern',
'12':'Native American',
'13':'Hispanic or Latin American',
'14':'Oceanian',
'15':'Other admixed ancestry',
'16':'Multi-ancestry',
'17':'Not reported'}
ticks = g.ax_joint.get_xticks()
print(ticks)
new_tick_labels = ['','European', 'European, African unspecified', 'European, Not reported', 'East Asian', 'South Asian', 'South East Asian', 'Asian unspecified', 'African American or Afro-Caribbean', 'African unspecified', 'Sub-Saharan African', 'Greater Middle Eastern', 'Native American', 'Hispanic or Latin American', 'Oceanian', 'Other admixed ancestry', 'Multi-ancestry', 'Not reported','']
print(new_tick_labels)
g.ax_joint.set_xticklabels(new_tick_labels,rotation=90)
g.ax_joint.legend(labels =['Covariates included','Without covariates'],bbox_to_anchor=(1,0.2), borderaxespad=0)
g.set_axis_labels("\nBroad Ancestry Category", "Area Under Curve (AUC)",fontsize=12)
plt.savefig(
f'Output/Scatterplot_of_PGS_catalog.png',
bbox_inches="tight", dpi=300)
```

### Heatmap
```javascript=
# AUC_category = p_lowest_metadata[p_lowest_metadata['logp.value_U']>-np.log10(0.0000025)]
AUC_category = p_lowest_metadata.copy()
bins = [0, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
# print(pd.cut(AUC_category['AUC'], bins))
AUC_category['categories_AUC'] = pd.cut(AUC_category['AUC'], bins)   # bin AUC values into intervals
AUC_category_count = AUC_category.pivot_table(
    index='categories_AUC',
    columns='category',
    values='phenotype_codebook',
    aggfunc=lambda x: len(x.unique()),   # distinct phenotypes per cell
    fill_value=0
)
# AUC_category_count = AUC_category_count[['endocrine/metabolic','circulatory system','musculoskeletal','neoplasms','respiratory','dermatologic','genitourinary','digestive','mental disorders','injuries & poisonings','sense organs','infectious diseases','neurological','symptoms','others','hematopoietic','pregnancy complications','congenital anomalies']]
ax = sns.heatmap(AUC_category_count, cmap='plasma', annot=True, fmt='d', square=False)
# ax.set(xlabel="category", ylabel="Area Under Curve (AUC)")
ax.set_xlabel("Disease category", fontsize=12)
ax.set_ylabel("Area Under Curve (AUC)\n", fontsize=12)
# plt.savefig(
# f'Output/Disease_precalence_logp_all.png',
# bbox_inches="tight", dpi=300)
plt.show()
```

### Pie chart
```javascript=
fig, ax = plt.subplots(figsize=(10, 4), subplot_kw=dict(aspect="equal"))
x = count['patientID']   # uncommented so the block runs; x holds the slice values
def func(s, d):
    t = int(round(s/100.*sum(d)))   # back-calculate the raw count from the percentage
    return f'{s:.1f}%\n( {t} )'     # formatted label: percentage plus count
wedges, texts, autotexts = ax.pie(x,
    radius=1.5,
    textprops={'color':'black', 'weight':'bold', 'size':12},   # label text style
    # explode=(0, 0, 0, 0, 0),
    pctdistance=0.7,
    autopct=lambda i: func(i, x),
    # labels=count['TOAST'],
    # wedgeprops={'linewidth':3,'edgecolor':'white'},   # outline around each wedge
)
ax.legend(wedges, count['TOAST'],
title="TOAST",
title_fontsize = 14,
fontsize="12",
loc="center left",
# bbox_to_anchor=(1.05, 1.0)
bbox_to_anchor=(1.2, 0, 0.5, 1)
)
plt.show()
```

### RFECV
```javascript=
RFECV = pd.read_csv('LAA_project/g_scores_LR.txt',sep='\t')
RFECV_arr = RFECV.to_numpy()
RFECV = RFECV.reset_index()
RFECV.columns = ['Number of features','cross1','cross2','cross3','cross4','cross5','cross6','cross7','cross8','cross9','cross10']
RFECV['Number of features'] = RFECV['Number of features'] +1
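# reshape wide (one column per CV fold) to long format for seaborn's lineplot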
RFECV_m1 = pd.melt(
RFECV, id_vars = ['Number of features'],
value_vars = ['cross1','cross2','cross3','cross4','cross5','cross6','cross7','cross8','cross9','cross10'],
var_name = 'cross', value_name ='AUC')
RFECV_avg = np.average(RFECV_arr, axis=1)
xmax = np.arange(0, len(RFECV_avg))[np.argmax(RFECV_avg)]
ymax = RFECV_avg.max()
fig, ax = plt.subplots(figsize=(5,3))
ax.title.set_text('RFECV for Logistic Regression')
g=sns.lineplot(
data=RFECV_m1, x="Number of features", y="AUC",
markers=True, dashes=False)
g.axvline(x=xmax, color='r', label='test lines',linestyle='--')
text= "AUC = {:.3f} \n Num. of features = {:.0f} ".format(ymax,xmax+1)
kw = dict(xycoords='data',textcoords="axes fraction",
ha="right", va="center")
g.annotate(text, xy=(xmax, ymax), xytext=(0.95, 0.1), **kw,fontsize=10)
plt.yticks(np.arange(0.5, 0.95, step=0.05))   # set ticks before saving so the file matches the display
plt.savefig(
    'LAA_project/figure_RFECV1.png',
    bbox_inches="tight", dpi=300)
plt.show()
```

### Manhattan plot
* Automatically annotate the top 15 genes
```javascript=
path=f'/media/volume1/amy/TPMI/2_29W_GWAS_result/new_script_result/'
imputedLOG=gb.glob(path+f'*_{PROJ}_imputed/Marge_{PROJ}_TPMI_imputed_adjGWAS.*.logistic')
imputedANNOT=gb.glob(path+f'*_{PROJ}_imputed/Marge_{PROJ}_TPMI_imputed_adjGWAS.*.annot')
rawLOG=gb.glob(path+f'*_{PROJ}_raw/Marge_{PROJ}_TPMI_raw_adjGWAS.*.logistic')
rawANNOT=gb.glob(path+f'*_{PROJ}_raw/Marge_{PROJ}_TPMI_raw_adjGWAS.*.annot')
statistic_df = pd.read_csv(imputedLOG[0],sep='\t')
statistic_df = statistic_df.sort_values('#CHROM', ascending=True).reset_index(drop=True)
# display(statistic_df.head(5))
annotate_df = pd.read_csv(imputedANNOT[0],sep='\t')
annotate_df['ANNOT'] = annotate_df['P ANNOT'].str.split(' ',expand=True)[1]
# display(annotate_df.head(5))
df1 = pd.merge(statistic_df,annotate_df[['SNP','ANNOT']].drop_duplicates(),left_on='ID',right_on='SNP',how='left')
display(df1.head(5))
statistic_df = pd.read_csv(rawLOG[0],sep='\t')
statistic_df = statistic_df.sort_values('#CHROM', ascending=True).reset_index(drop=True)
# display(statistic_df.head(5))
annotate_df = pd.read_csv(rawANNOT[0],sep='\t')
annotate_df['ANNOT'] = annotate_df['P ANNOT'].str.split(' ',expand=True)[1]
# display(annotate_df.head(5))
df2 = pd.merge(statistic_df,annotate_df[['SNP','ANNOT']].drop_duplicates(),left_on='ID',right_on='SNP',how='left')
display(df2.head(5))
df1['-logp'] = -np.log10(df1['P'])
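# genome-wide x coordinate: offset each chromosome's POS by the running sum of
# the preceding chromosomes' maximum positions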
running_pos = 0
cumulative_pos = []
for chrom, group_df in df1.groupby('#CHROM'):
cumulative_pos.append(group_df['POS']+running_pos)
# print(chrom,group_df['POS']+running_pos)
running_pos+=group_df['POS'].max()
df1['cumulative_pos'] = pd.concat(cumulative_pos)
df1['SNP number'] = df1.index
df2['-logp'] = -np.log10(df2['P'])
running_pos = 0
cumulative_pos = []
for chrom, group_df in df2.groupby('#CHROM'):
cumulative_pos.append(group_df['POS']+running_pos)
# print(chrom,group_df['POS']+running_pos)
running_pos+=group_df['POS'].max()
df2['cumulative_pos'] = pd.concat(cumulative_pos)
df2['SNP number'] = df2.index
df2['-logp']=-1*df2['-logp']
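# the raw-data track is negated so it hangs below zero, giving a mirrored
# (back-to-back) Manhattan plot against the imputed track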
fig, ax = plt.subplots(figsize=(12, 6))
sns.scatterplot(
data = df1,
x = 'cumulative_pos',
y = '-logp',
# aspect = 2,
hue = '#CHROM',
palette = ["#F0997D","#D3756B"]*11,
# palette = 'Set1',
# color="steelblue",
linewidth=0,
s=15,
legend=None,
ax = ax
)
sns.scatterplot(
data = df2,
x = 'cumulative_pos',
y = '-logp',
# aspect = 2,
hue = '#CHROM',
palette = ["#205295", "#5DA7DB"]*11,
# palette = 'Set1',
# palette=["red"],
linewidth=0,
s=15,
legend=None,
ax = ax
)
ax.set_xlabel('Chromosome')
ax.set_ylabel('$-log_{10}{(P)}$')
ax.set_xticks(df1.groupby('#CHROM')['cumulative_pos'].median())
ax.set_xticklabels(df1['#CHROM'].unique())
plt.tight_layout()
# g.fig.suptitle('GWAS plot showing association between SNPs on autosomes and speeding\n')
df3 = df1.sort_values(by='-logp', ascending=False)
annotations1 = df3[df3['ANNOT'].notnull()].drop_duplicates(subset=['ANNOT'], keep='first')
annotations1 = annotations1.head(15)
annotations1 = annotations1.apply(lambda p : ax.annotate(p['ANNOT'], (p['cumulative_pos'], p['-logp'])), axis=1).to_list()
df4 = df2.sort_values(by='-logp', ascending=True)
annotations2 = df4[df4['ANNOT'].notnull()].drop_duplicates(subset=['ANNOT'], keep='first')
annotations2 = annotations2.head(15)
annotations2 = annotations2.apply(lambda p : ax.annotate(p['ANNOT'], (p['cumulative_pos'], p['-logp'])), axis=1).to_list()
ax.axhline(0, linestyle='-', linewidth=1.5, color='black')                 # baseline between the two tracks
ax.axhline(-np.log10(0.00000005), linestyle='-', linewidth=1, color='r')   # genome-wide significance, P = 5e-8
ax.axhline(np.log10(0.00000005), linestyle='-', linewidth=1, color='r')    # mirrored threshold for the lower track
ax.axhline(5, linestyle='--', linewidth=1, color='b')                      # suggestive threshold, P = 1e-5
ax.axhline(-5, linestyle='--', linewidth=1, color='b')
df1_name='imputed data'
df2_name='raw data'
props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)
ax.text(.03, .92, "{}".format(df1_name),transform=ax.transAxes,size=12,bbox=props)
ax.text(.03, .05, "{}".format(df2_name),transform=ax.transAxes,size=12,bbox=props)
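# adjust_text (presumably from the adjustText package: from adjustText import adjust_text)
# iteratively nudges the gene labels to reduce overlap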
adjust_text(annotations1, arrowprops={'arrowstyle': '-', 'color': 'green'})   # one call per label set; calling twice re-adjusts already-placed labels
adjust_text(annotations2, arrowprops={'arrowstyle': '-', 'color': 'green'})
ticks = ax.get_yticks()
plt.yticks(np.arange(min(ticks), max(ticks)+3, 3.0))
ticks = ax.get_yticks()
ax.set_yticklabels([int(abs(tick)) for tick in ticks])
# plt.savefig(
# f'/media/volume1/amy/TPMI/2_29W_GWAS_result/{TIME}_Subgroups_{PROJ}_pyManhatton.png',
# bbox_inches="tight", dpi=100)
# plt.savefig(
# f'/media/volume1/amy/TPMI/2_29W_GWAS_result/{TIME}_Subgroups_{PROJ}_pyManhatton.pdf',
# bbox_inches="tight", dpi=100)
plt.show()
```

* Manually set the significance threshold for the figure annotations
```javascript=
# imputed and raw
statistic_df = pd.read_csv('/media/volume1/bioinfo/Terry-Packages/Result/20221216_PD_match/GWAS/20221216_PD_match.Pheno.glm.logistic.hybrid',sep='\t')
display(statistic_df.head(5))
annotate_df = pd.read_csv('/media/volume1/bioinfo/Terry-Packages/Result/20221216_PD_match/GWAS/20221216_PD_match.txt.annot',sep='\t')
annotate_df['ANNOT'] = annotate_df['ERRCODE ANNOT'].str.split('.',expand=True)[1]
display(annotate_df.head(5))
df1 = pd.merge(statistic_df,annotate_df[['SNP','ANNOT']].drop_duplicates(),left_on='ID',right_on='SNP',how='left')
display(df1.head(5))
statistic_df = pd.read_csv('/media/volume1/bioinfo/Terry-Packages/Result/20221220_PD_match_raw/GWAS/20221220_PD_match_raw.Pheno.glm.logistic.hybrid',sep='\t')
display(statistic_df.head(5))
annotate_df = pd.read_csv('/media/volume1/bioinfo/Terry-Packages/Result/20221220_PD_match_raw/GWAS/20221220_PD_match_raw.txt.annot',sep='\t')
annotate_df['ANNOT'] = annotate_df['ERRCODE ANNOT'].str.split('.',expand=True)[1]
display(annotate_df.head(5))
df2 = pd.merge(statistic_df,annotate_df[['SNP','ANNOT']].drop_duplicates(),left_on='ID',right_on='SNP',how='left')
display(df2.head(5))
df1['-logp'] = -np.log10(df1['P'])
running_pos = 0
cumulative_pos = []
for chrom, group_df in df1.groupby('#CHROM'):
cumulative_pos.append(group_df['POS']+running_pos)
# print(chrom,group_df['POS']+running_pos)
running_pos+=group_df['POS'].max()
df1['cumulative_pos'] = pd.concat(cumulative_pos)
df1['SNP number'] = df1.index
df2['-logp'] = -np.log10(df2['P'])
running_pos = 0
cumulative_pos = []
for chrom, group_df in df2.groupby('#CHROM'):
cumulative_pos.append(group_df['POS']+running_pos)
# print(chrom,group_df['POS']+running_pos)
running_pos+=group_df['POS'].max()
df2['cumulative_pos'] = pd.concat(cumulative_pos)
df2['SNP number'] = df2.index
df2['-logp']=-1*df2['-logp']
sig_set_1=5.5
sig_set_2=4.5
print('imputed')
print(df1[(df1['-logp'] > sig_set_1)&(df1['ANNOT'].notnull())].drop_duplicates(subset=['ANNOT'], keep='first').ANNOT.nunique())
print(df1[(df1['-logp'] > sig_set_1)&(df1['ANNOT'].notnull())].drop_duplicates(subset=['ANNOT'], keep='first').ANNOT.unique())
print("{:-^50s}".format("Split Line"))
print('unimputed')
print(df2[(df2['-logp'] < sig_set_2*-1)&(df2['ANNOT'].notnull())].drop_duplicates(subset=['ANNOT'], keep='first').ANNOT.nunique())
print(df2[(df2['-logp'] < sig_set_2*-1)&(df2['ANNOT'].notnull())].drop_duplicates(subset=['ANNOT'], keep='first').ANNOT.unique())
fig, ax = plt.subplots(figsize=(12, 6))
sns.scatterplot(
data = df1,
x = 'cumulative_pos',
y = '-logp',
# aspect = 2,
hue = '#CHROM',
palette = ["peachpuff", "sandybrown"]*11,
# palette = ["#F0997D","#D3756B"]*11,
# palette = 'Set1',
# color="steelblue",
linewidth=0,
s=10,
legend=None,
ax = ax
)
sns.scatterplot(
data = df2,
x = 'cumulative_pos',
y = '-logp',
# aspect = 2,
hue = '#CHROM',
palette = ["steelblue", "lightsteelblue"]*11,
# palette = ["#205295", "#5DA7DB"]*11,
# palette = 'Set1',
# palette=["red"],
linewidth=0,
s=10,
legend=None,
ax = ax
)
ax.set_xlabel('Chromosome')
ax.set_ylabel('$-log_{10}{(P)}$')
ax.set_xticks(df1.groupby('#CHROM')['cumulative_pos'].median())
ax.set_xticklabels(df1['#CHROM'].unique())
plt.tight_layout()
# g.fig.suptitle('GWAS plot showing association between SNPs on autosomes and speeding\n')
df3 = df1.sort_values(by='-logp', ascending=False)
annotations1 = df3[(df3['-logp'] > sig_set_1)&(df3['ANNOT'].notnull())].drop_duplicates(subset=['ANNOT'], keep='first')
annotations1 = annotations1.apply(lambda p : ax.annotate(p['ANNOT'], (p['cumulative_pos'], p['-logp'])), axis=1).to_list()
df4 = df2.sort_values(by='-logp', ascending=False)
annotations2 = df4[(df4['-logp'] < sig_set_2*-1)&(df4['ANNOT'].notnull())].drop_duplicates(subset=['ANNOT'], keep='first')
annotations2 = annotations2.apply(lambda p : ax.annotate(p['ANNOT'], (p['cumulative_pos'], p['-logp'])), axis=1).to_list()
ax.axhline(0, linestyle='-', linewidth=1.5, color='black')
ax.axhline(-np.log10(0.00000005), linestyle='-', linewidth=1, color='r')
ax.axhline(np.log10(0.00000005), linestyle='-', linewidth=1, color='r')
ax.axhline(5, linestyle='--', linewidth=1, color='b')
ax.axhline(-5, linestyle='--', linewidth=1, color='b')
adjust_text(annotations1)
# adjust_text(annotations1, arrowprops = {'arrowstyle' : '-', 'color' : 'blue'})
adjust_text(annotations2)
# adjust_text(annotations2, arrowprops = {'arrowstyle' : '-', 'color' : 'blue'})
ticks = ax.get_yticks()
plt.yticks(np.arange(min(ticks), max(ticks)+1, 3.0))
ticks = ax.get_yticks()
ax.set_yticklabels([int(abs(tick)) for tick in ticks])
plt.savefig(
f'/media/volume1/amy/TPMI/2_29W_GWAS_result/Subgroups_{PROJ}_pyManhatton.png',
bbox_inches="tight", dpi=100)
plt.savefig(
f'/media/volume1/amy/TPMI/2_29W_GWAS_result/Subgroups_{PROJ}_pyManhatton.pdf',
bbox_inches="tight", dpi=100)
plt.show()
```

```javascript=
# Single
statistic_df = pd.read_csv('/media/volume1/bioinfo/Terry-Packages/Result/20221215_terry_PD_GWAS/GWAS/20221215_PD.Pheno.glm.logistic.hybrid',sep='\t')
display(statistic_df.head(5))
annotate_df = pd.read_csv('/media/volume1/bioinfo/Terry-Packages/Result/20221215_terry_PD_GWAS/GWAS/20221215_PD.txt.annot',sep='\t')
annotate_df['ANNOT'] = annotate_df['ERRCODE ANNOT'].str.split('.',expand=True)[1]
display(annotate_df.head(5))
df = pd.merge(statistic_df,annotate_df[['SNP','ANNOT']].drop_duplicates(),left_on='ID',right_on='SNP',how='left')
display(df.head(5))
df['-logp'] = -np.log10(df['P'])
running_pos = 0
cumulative_pos = []
for chrom, group_df in df.groupby('#CHROM'):
cumulative_pos.append(group_df['POS']+running_pos)
# print(chrom,group_df['POS']+running_pos)
running_pos+=group_df['POS'].max()
df['cumulative_pos'] = pd.concat(cumulative_pos)
df['SNP number'] = df.index
df
sig_set=6.5
print(df[(df['-logp'] > sig_set)&(df['ANNOT'].notnull())].drop_duplicates(subset=['ANNOT'], keep='first').ANNOT.nunique())
print(df[(df['-logp'] > sig_set)&(df['ANNOT'].notnull())].drop_duplicates(subset=['ANNOT'], keep='first').ANNOT.unique())
g = sns.relplot(
data = df,
x = 'cumulative_pos',
y = '-logp',
aspect = 2,
hue = '#CHROM',
palette = ["steelblue", "lightsteelblue"] * 11,
# palette = 'Set1',
linewidth=0,
s=10,
legend=None
)
g.ax.set_xlabel('Chromosome')
g.ax.set_ylabel('$-log_{10}{(P)}$')
g.ax.set_xticks(df.groupby('#CHROM')['cumulative_pos'].median())
g.ax.set_xticklabels(df['#CHROM'].unique())
# g.fig.suptitle('GWAS plot showing association between SNPs on autosomes and speeding\n')
df2 = df.sort_values(by='-logp', ascending=False)
annotations = df2[(df2['-logp'] > sig_set)&(df2['ANNOT'].notnull())].drop_duplicates(subset=['ANNOT'], keep='first')   # mask built from df2 itself, not df
# annotations = df2[df2['-logp'] > -np.log10(0.00000005)].drop_duplicates(subset=['ANNOT'], keep='first')
annotations = annotations.apply(lambda p : g.ax.annotate(p['ANNOT'], (p['cumulative_pos'], p['-logp'])), axis=1).to_list()
for ax in g.axes.flat:
ax.axhline(-np.log10(0.00000005), linestyle='-', linewidth=1, color='r')
ax.axhline(5, linestyle='--', linewidth=1, color='b')
# adjust_text(annotations)
adjust_text(annotations, arrowprops = {'arrowstyle' : '-', 'color' : 'blue'})
plt.savefig(
f'/media/volume1/amy/TPMI/2_29W_GWAS_result/{PROJ}_pyManhatton.png',
bbox_inches="tight", dpi=300)
plt.savefig(
f'/media/volume1/amy/TPMI/2_29W_GWAS_result/{PROJ}_pyManhatton.pdf',
bbox_inches="tight", dpi=300)
plt.show()
```

### QQ plot
```javascript=
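# assumption: qqplot here matches the qmplot package's API (from qmplot import qqplot)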
kwargs = {
"marker": "o",
"s":20,
}
if __name__ == "__main__":
f, ax = plt.subplots(figsize=(4, 4), facecolor="w", edgecolor="k")
ax = qqplot(data=df1[df1['P'].notnull()]['P'],
xlabel=r"Expected $-log_{10}{(P)}$",
ylabel=r"Observed $-log_{10}{(P)}$",
alpha=0.5,
color="blue",
# title="Test",
ax=ax,
**kwargs
)
plt.savefig(
f'/media/volume1/amy/TPMI/2_29W_GWAS_result/Subgroups_{PROJ}_imputed_pyQQplot.png',
bbox_inches="tight", dpi=100)
plt.savefig(
f'/media/volume1/amy/TPMI/2_29W_GWAS_result/Subgroups_{PROJ}_imputed_pyQQplot.pdf',
bbox_inches="tight", dpi=100)
plt.show()
kwargs = {
"marker": "o",
"s":20,
}
if __name__ == "__main__":
f, ax = plt.subplots(figsize=(4, 4), facecolor="w", edgecolor="k")
ax = qqplot(data=df2[df2['P'].notnull()]['P'],
xlabel=r"Expected $-log_{10}{(P)}$",
ylabel=r"Observed $-log_{10}{(P)}$",
alpha=0.5,
color="blue",
# title="Test",
ax=ax,
**kwargs
)
plt.savefig(
f'/media/volume1/amy/TPMI/2_29W_GWAS_result/Subgroups_{PROJ}_raw_pyQQplot.png',
bbox_inches="tight", dpi=100)
plt.savefig(
f'/media/volume1/amy/TPMI/2_29W_GWAS_result/Subgroups_{PROJ}_raw_pyQQplot.pdf',
bbox_inches="tight", dpi=100)
plt.show()
```

### PGS plot
```javascript=
def displot_annotated(data_all,col_wrap, save=None):
palette = {"Control": "#30A9DE", "Case":'#ef5285'}
grid = sns.FacetGrid(
data_all, col='PGS_catalog',col_wrap=col_wrap, hue='Pheno',
height=2, aspect=2,sharex=False, sharey=False,palette=palette)
    grid.map(sns.kdeplot, 'SCORE', fill=True)   # shade= is deprecated in newer seaborn; fill= is the replacement
# grid.map(plt.axvline,x=0,ls='--', c='gray')
grouped = data_all.groupby(["PGS_catalog"])
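    # draws a vertical line at each hue group's mean score (mapped over the facets below)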
def vertical_mean_line_survived(x, **kwargs):
ls = {"Control":"-","Case":"-"}
plt.axvline(x.mean(), linestyle =ls[kwargs.get("label","0")],
color = kwargs.get("color", "g"))
# if kwargs.get("label","0") == "Control":
# txkw = dict(size=12, color = kwargs.get("color", "g"), rotation=90)
# tx = "$\mu$ = {:.2f}, $\sigma$ = {:.2f}".format(x.mean(),x.std())
# plt.text(x.mean()-0.65, 0.052, tx, **txkw,fontsize=8)
# else:
# txkw = dict(size=12, color = kwargs.get("color", "g"), rotation=90)
# tx = "$\mu$ = {:.2f}, $\sigma$ = {:.2f}".format(x.mean(),x.std())
# plt.text(x.mean()+0.45, 0.052, tx, **txkw,fontsize=8)
for ax, (name, df) in zip(grid.axes.flat, grouped):
U1, p = stats.mannwhitneyu(
df[(df['Pheno']=='Control')]['SCORE'],
df[(df['Pheno']=='Case')]['SCORE'], method="auto")
        if p < 0.000025:
            ax.text(.7, .75, r"$\it{p}$" + ' = {:.2e}'.format(p), transform=ax.transAxes, size=8, color='red', fontweight='bold')
        elif (0.000025 <= p) & (p < 0.001):
            ax.text(.7, .75, r"$\it{p}$" + ' = {:.2e}'.format(p), transform=ax.transAxes, size=8, color='blue', fontweight='bold')
        else:
            ax.text(.7, .75, r"$\it{p}$" + ' = {:.2e}'.format(p), transform=ax.transAxes, size=8, color='black')   # raw strings avoid invalid-escape warnings for \it
# ax.text(.7, .75, "$\it{p}$"+' = {:.2g}'.format(p),transform=ax.transAxes,size=8)
SNP_raw_count = np.mean(df['No.SNP_Raw'])
SNP_pgs_count = np.mean(df['No.SNP_caculate'])
ax.text(.7, .9, "SNPs = "+'{:.0f} / {:.0f}'.format(SNP_pgs_count, SNP_raw_count),transform=ax.transAxes,size=8)
df2 = df.groupby("Pheno") # group by each thing that has its own color and run stats on it
for i, (group, data) in enumerate(df2):
x = data["SCORE"]
# calculate stats and create label
n = len(x)
mean = np.mean(x)
std = np.std(x)
# label = r"%s: $\mu$=%.2f $\sigma$=%.2f" %(group, mean, std)
label = r"$\mu$ $_{%s}$ = %.2f " %(group.lower(), mean)
# label = r"$\mu$$_{m}$={:.2f}" .format("m":group, mean)
ax.annotate(label, xy=(0.7,0.5-(i*0.15)), xycoords='axes fraction', ha='left', size=8)
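        # Wasserstein (earth mover's) distance between control and case score distributions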
wd = stats.wasserstein_distance(
df[(df['Pheno']=='Control')]['SCORE'],
df[(df['Pheno']=='Case')]['SCORE'])
if wd>0.15:
ax.text(.7, .15, "$D_{w}$"+' = {:.2f}'.format(wd), transform=ax.transAxes, size=8, color='red', fontweight='bold')
elif (wd<=0.15)&(wd>0.1):
ax.text(.7, .15, "$D_{w}$"+' = {:.2f}'.format(wd), transform=ax.transAxes, size=8, color='blue', fontweight='bold')
else:
ax.text(.7, .15, "$D_{w}$"+' = {:.2f}'.format(wd), transform=ax.transAxes, size=8, color='black')
# wd_lable = r"D$\it{w}$ = %.2f " %(group.lower(), wd)
grid.map(vertical_mean_line_survived, 'SCORE')
grid.set_titles("{col_name}",size = 10)
grid.set_axis_labels("", "")
grid.fig.subplots_adjust(hspace =0.7)
grid.add_legend()
    if save is not None:
        plt.savefig(
            f'{save}.png',
            bbox_inches="tight", dpi=300)
        plt.savefig(
            f'{save}.pdf',
            bbox_inches="tight", dpi=300)
```

```javascript=
# stacked horizontal bar chart of SNP counts per gene, split by function
# fig, ax = plt.subplots(figsize=(9,6))
count.plot(kind='barh', stacked=True,figsize=(5, 15))
# labels for x & y axis
plt.xlabel('SNP count')
# plt.xticks(rotation=45,ha='right' )
plt.ylabel('Gene')
# title of plot
plt.title('SNP count in gene and function')
plt.savefig(
f'/media/volume1/amy/TPMI/{PROJ}/SNP_distribution.png',
bbox_inches="tight", dpi=300)
plt.show()
```

```javascript=
PGS_pheno = pd.read_csv('/media/volume1/amy/TEST/230118-AD/EEG-old/EEG.PRS.pheno.txt',sep='\t')
PGS_score = pd.read_csv('/media/volume1/amy/TEST/230118-AD/PRS-Raw-EEG/raw/PRS-Raw-EEG.best',sep=' ')
PGS_summary = pd.read_csv('/media/volume1/amy/TEST/230118-AD/PRS-Raw-EEG/raw/PRS-Raw-EEG.summary',sep='\t')
PGS_score = pd.merge(PGS_pheno, PGS_score, on=['FID','IID'],how='inner').drop_duplicates()
PGS_score['PGS_catalog'] = 'AIC'
PGS_score['Pheno'] = PGS_score['Pheno'].replace({2:"Case",1:"Control"})
display(PGS_score)
display(PGS_summary)
PGS_summary['Num_SNP'][0]   # notebook display of the SNP count used below
fig, ax = plt.subplots(figsize=(9,9))
palette = {"Control": "#30A9DE", "Case":'#ef5285'}
ax = sns.kdeplot(data=PGS_score, x='PRS', fill=True, hue='Pheno', palette=palette)   # fill= replaces the deprecated shade=
plt.axvline(PGS_score[PGS_score['Pheno']=='Case']['PRS'].mean(), linestyle ='-',
color = "#ef5285")
plt.axvline(PGS_score[PGS_score['Pheno']=='Control']['PRS'].mean(), linestyle ='-',
color = "#30A9DE")
# for ax, (name, df) in zip(grid.axes.flat, grouped):
U1, p = stats.mannwhitneyu(
PGS_score[(PGS_score['Pheno']=='Control')]['PRS'],
PGS_score[(PGS_score['Pheno']=='Case')]['PRS'], method="auto")
if p < 0.000025:
    ax.text(.7, .57, r"$\it{p}$" + ' = {:.2e}'.format(p), transform=ax.transAxes, size=12, color='red', fontweight='bold')
elif (0.000025 <= p) & (p < 0.001):
    ax.text(.7, .57, r"$\it{p}$" + ' = {:.2e}'.format(p), transform=ax.transAxes, size=12, color='blue', fontweight='bold')
else:
    ax.text(.7, .57, r"$\it{p}$" + ' = {:.2e}'.format(p), transform=ax.transAxes, size=12, color='black')
# ax.text(.7, .75, "$\it{p}$"+' = {:.2g}'.format(p),transform=ax.transAxes,size=8)
SNP_pgs_count = PGS_summary['Num_SNP'][0]
ax.text(.7, .65, "SNPs = "+'{:.0f}'.format(SNP_pgs_count),transform=ax.transAxes,size=12)
df2 = PGS_score.groupby("Pheno") # group by each thing that has its own color and run stats on it
for i, (group, data) in enumerate(df2):
x = data["PRS"]
# calculate stats and create label
n = len(x)
mean = np.mean(x)
std = np.std(x)
label = r"$\mu$ $_{%s}$ = %.2f " %(group.lower(), mean)
ax.annotate(label, xy=(0.7,0.5-(i*0.05)), xycoords='axes fraction', ha='left', size=12)
wd = stats.wasserstein_distance(
PGS_score[(PGS_score['Pheno']=='Control')]['PRS'],
PGS_score[(PGS_score['Pheno']=='Case')]['PRS'])
if wd>0.15:
ax.text(.7, .35, "$D_{w}$"+' = {:.2f}'.format(wd), transform=ax.transAxes, size=12, color='red', fontweight='bold')
elif (wd<=0.15)&(wd>0.1):
ax.text(.7, .35, "$D_{w}$"+' = {:.2f}'.format(wd), transform=ax.transAxes, size=12, color='blue', fontweight='bold')
else:
ax.text(.7, .35, "$D_{w}$"+' = {:.2f}'.format(wd), transform=ax.transAxes, size=12, color='black')
```

## Additional process
* Date
```javascript!
TIME = pd.Timestamp.now().strftime("%Y%m%d")
ICD9['DATE'] = pd.to_datetime(ICD9['DATE'], format='%Y-%m-%d', errors='coerce')   # vectorized; faster than .apply per element
SocialHum['now'] = pd.Timestamp.now().normalize()   # today at midnight; to_datetime('now', format=...) fails to parse the literal 'now'
```
* Sort
```javascript!
merge_cancer=merge_cancer.sort_values(by=['Age_merged'],ascending=True)
```
* TableOne
```javascript!
cancer_select=merge_cancer.drop_duplicates(subset=['cancer_registry_database','索引號'], keep='first')
columns = ['Entry_age','Diagnosis_age','性別','Age_bin','cancer_registry_database']
categorical = ['性別','Age_bin']
groupby = 'cancer_registry_database'
mytable = TableOne(cancer_select, columns, categorical, groupby, pval = True)
mytable.tableone.to_excel(f'/home/amy/TPMI/Ori_Apt_Adv/{FOLD}/Demographic characteristics_{TIME}.xlsx')
mytable.tableone
```
* Add color in Jupyter Markdown
```javascript!
Statistic of <span style="color:yellow">Title</span> in genes
```
* Split line
```javascript!
print("{:-^50s}".format("Split Line"))
```
* chunks
```javascript!
def chunks(lst, n):
"""Yield successive n-sized chunks from lst."""
for i in range(0, len(lst), n):
yield lst[i:i + n]
pID_chunks = list(chunks(pID_list, 1000))
```
* chunk list to file
```javascript=
path=f'/media/volume1/amy/TPMI/Dementia_re'
outpath=f'/media/volume1/amy/TPMI/Dementia_re/chunk'
num_size=5
def chunks(lst, n):
"""Yield successive n-sized chunks from lst."""
for i in range(0, len(lst), n):
yield lst[i:i + n]
filenames=gb.glob(path+"/"+'rs*_list.csv')
file_chunks = list(chunks(filenames, num_size))
for i in range(len(file_chunks)):
    print(i)
    with open(f"{outpath}/chunksample_{i}.txt", 'w') as f:   # context manager closes the file automatically
        for element in file_chunks[i]:
            f.write("%s\n" % element)
```
* SQL connection and data retrieval (in batches of 1,000 IDs)
```javascript=
pID_list = Target_list['PatientID'].unique().tolist()
pID_chunks = list(chunks(pID_list, 1000))
f_out=pd.DataFrame()
conn = pymssql.connect(server = '10.60.212.206' , user = 'ADCMUH\\T36536' , password = 'LLL123849578' , database = 'DataWareHouse')
for i in range(len(pID_chunks)):
    text = "select a.PatientNo, a.ChartNo, a.SexName, a.Birthday \
            from PatientFactDim as a \
            where a.PatientNo in ({})".format(str(pID_chunks[i])[1:-1])   # the original had an unbalanced '(' before a.PatientNo
    MedCode_Review = pd.read_sql(text, conn)
    # display(MedCode_Review.head(2))
    f_out = pd.concat([f_out, MedCode_Review])   # DataFrame.append was removed in pandas 2.0
# print('N in data warehouse:', f_out.PatientNo.nunique())
SocialHum = f_out
```
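Building the `IN (...)` list by string formatting works for trusted numeric IDs, but a parameterized variant is safer. A minimal sketch, assuming pymssql's `%s` placeholder style and the same `pID_chunks`/`conn` objects from the block above:
```javascript!
# sketch: the same chunked query with parameter placeholders instead of string formatting
chunk = pID_chunks[0]
placeholders = ','.join(['%s'] * len(chunk))   # one %s per ID
text = ("select a.PatientNo, a.ChartNo, a.SexName, a.Birthday "
        "from PatientFactDim as a "
        "where a.PatientNo in ({})".format(placeholders))
MedCode_Review = pd.read_sql(text, conn, params=tuple(chunk))
```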
* SQL connection and data retrieval (all IDs at once)
```javascript!
case_list = TPMI_fam_codebook_case['PatientNo'].unique().tolist()
t = tuple(case_list)
f_out=pd.DataFrame()
conn = pymssql.connect(server = '10.17.11.66' , user = 'ADCMUH\\T36536' , password = 'LL123849578' , database = 'DataWareHouse')
text="select b.PatientNo, a.VisitNo, b.VisitType, b.DrNo, f.EmpName, b.DivNo, e.ChineseName, e.EnglishName, a.DiagType, d.IcdType, a.DiagCode, d.DiagName, d.DiagEngName, a.OrderTime as 'DiagDate'\
from DiagRecordFactDim as a \
left join VisitRecordFact b on a.VisitNo = b.VisitNo \
left join DiagBasicDim d on a.DiagCode = d.DiagCode \
left join DivisionDim e on b.DivNo = e.DivNo \
left join EmployeeFactDim f on b.DrNo = f.EmpNo \
Where b.PatientNo in {}".format(t)   # caution: a single-element tuple renders as (x,), which breaks the SQL
VisitRecord_case = pd.read_sql(text,conn)
VisitRecord_case
```
* Print a full array/list without truncation
```javascript!
np.set_printoptions(threshold=np.inf)
```
* Compute nunique / value_counts in the shell, without loading the file into Python
```javascript!
awk '{print $17}' snplist.annovar.hg38_multianno.txt | grep -v "avsnp150" | sort -u | wc -l   # nunique; uniq alone only collapses adjacent duplicates, so sort first
```
```javascript!
awk '{print $17}' snplist.annovar.hg38_multianno.txt | grep -v "avsnp150" | sort | uniq -c   # value_counts: occurrences per value
```
* Print loop progress
```javascript!
print('Running',GWAS_name,'(',filenames.index(filename)+1,'of 501)')
```
## Statistics
<https://docs.scipy.org/doc/scipy/reference/stats.html>
```javascript!
# Chi-square and Fisher's exact tests, applied row-wise to paired 2x2 inputs
import numpy as np
import pandas as pd
import statsmodels.api as sm
from scipy import stats
def fisher_exact_test(pDataFile1, pDataFile2):
    test_result = []
    for i, (group1, group2) in enumerate(zip(pDataFile1, pDataFile2)):
        # print(group1, group2)
        odds, p_value = stats.fisher_exact(np.ceil([group1, group2]))   # ceil: cell counts must be integers
        test_result.append([odds, p_value])
    return pd.DataFrame([test_result]).T
def chisquare_test(pDataFile1, pDataFile2):
test_result = []
for i, (group1, group2) in enumerate(zip(pDataFile1, pDataFile2)):
# print(group1)
# print(group2)
try:
obs = np.array([group1, group2])
# print(obs)
chi2, p_value, dof, ex = stats.chi2_contingency(obs,correction=True)
# print(chi2)
test_result.append([chi2,p_value,dof])
        except ValueError:
            test_result.append([np.nan, np.nan, np.nan])   # e.g. a zero row/column in the contingency table
return pd.DataFrame([test_result]).T
def oddsratio_95CI(pDataFile1, pDataFile2):
test_result = []
for i, (group1, group2) in enumerate(zip(pDataFile1, pDataFile2)):
# print(group1,group2)
confidence_95=sm.stats.Table2x2(np.ceil([group1, group2])).oddsratio_confint(alpha=0.05, method='normal')
test_result.append(confidence_95)
return pd.DataFrame([test_result]).T
```
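A minimal usage sketch with made-up counts: each paired element supplies one row of a 2×2 table (e.g. cases vs. controls), so the helpers run one test per row:
```javascript!
# hypothetical counts: row i of each list is [carriers, non-carriers] for group i
case_counts = [[12, 88], [5, 95]]
ctrl_counts = [[7, 93], [9, 91]]
res = fisher_exact_test(case_counts, ctrl_counts)   # one [odds_ratio, p] pair per input row
print(res)
```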
* Preview seaborn palette colors
<https://seaborn.pydata.org/tutorial/color_palettes.html>

```javascript!
color = ["azure", "steelblue", "lightsteelblue", "Yellow", "Green", "Grey"]
sns.set_palette(color)
sns.palplot(sns.color_palette())
```

### Logistic regression, printing parameters and odds ratios
```javascript!
import pandas as pd
import statsmodels.formula.api as smf
titanic = pd.read_csv("/home/amy/TPMI/Ischemic_stroke_project/titanic.csv")
titanic = titanic[["survived", "pclass", "sex", "age", "embark_town"]]
titanic = titanic.dropna()
log_reg = smf.logit("survived ~ sex + age + embark_town", data=titanic).fit()
# Summary of results
print(log_reg.summary())
# ... imports, load data, etc.
# Define and fit model
log_reg = smf.logit("survived ~ sex + age + embark_town", data=titanic).fit()
# Inspect parameters
print(log_reg.params)
print(dir(log_reg))
# ... imports, load data, etc.
import numpy as np
# ... Define and fit model
odds_ratios = pd.DataFrame(
    {
        "OR": log_reg.params,
        "Lower CI": log_reg.conf_int()[0],
        "Upper CI": log_reg.conf_int()[1],
        "p-value": log_reg.pvalues,
    }
)
# exponentiate only the log-odds columns; p-values must stay on their original scale
odds_ratios[["OR", "Lower CI", "Upper CI"]] = np.exp(odds_ratios[["OR", "Lower CI", "Upper CI"]])
print(odds_ratios)
```
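To turn the fitted model into predictions, `log_reg.predict` accepts a DataFrame with the original formula columns. A quick sketch with a made-up passenger (assumes the same titanic.csv columns as above):
```javascript!
# hypothetical passenger; column names must match the formula terms
new = pd.DataFrame({"sex": ["female"], "age": [30], "embark_town": ["Southampton"]})
print(log_reg.predict(new))   # predicted probability of survived = 1
```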