# MonarchKG Dangling Edge Analysis Want to know, which curies from edge ingests are missing, and where are we losing those edges from. ``` q -t -H "\ select prefix, provided_by, count(*) as amount from ( select substr(subject,1,instr(subject,':')) as prefix, provided_by \ from monarch-kg-dangling-edges.tsv \ where subject not in (select id from monarch-kg_nodes.tsv) \ union all \ select substr(object,1,instr(object,':')) as prefix, provided_by \ from monarch-kg-dangling-edges.tsv \ where object not in (select id from monarch-kg_nodes.tsv) \ ) group by 1,2 having amount > 100 order by 3 desc \ " ``` Finds | prefix | provided_by | number of missing edges | | --- | --- | --- | | EMAPA: | alliance_gene_to_expression_edges.tsv | 728282 | | UniProtKB: | goa_go_annotation_edges.tsv | 555871 | | ZFA: | alliance_gene_to_expression_edges.tsv | 507141 | | AspGD: | goa_go_annotation_edges.tsv | 493755 | | FBbt: | alliance_gene_to_expression_edges.tsv | 394846 | | ENSEMBL: | panther_ref_genome_orthologs_edges.tsv | 209356 | | OBO: | phenio_edges.tsv | 128156 | | OMIM: | hpoa_disease_phenotype_edges.tsv | 113993 | | | phenio_edges.tsv | 113881 | | WBbt: | alliance_gene_to_expression_edges.tsv | 102069 | | dictyBase: | panther_ref_genome_orthologs_edges.tsv | 88368 | | dictyBase: | goa_go_annotation_edges.tsv | 84636 | | FYPO: | pombase_gene_to_phenotype_edges.tsv | 84282 | | CLO: | phenio_edges.tsv | 77712 | | NCBITaxon: | phenio_edges.tsv | 62826 | | MESH: | ctd_chemical_to_disease_edges.tsv | 51570 | | POMBASE: | panther_ref_genome_orthologs_edges.tsv | 48892 | | http: | phenio_edges.tsv | 47934 | | UMLS: | phenio_edges.tsv | 40143 | | NCBIGene: | string_protein_links_edges.tsv | 37092 | | DOID: | phenio_edges.tsv | 19949 | | NCIT: | phenio_edges.tsv | 14405 | | CL: | phenio_edges.tsv | 11047 | | CHEBI: | reactome_chemical_to_pathway_edges.tsv | 9184 | | OMIM: | omim_gene_to_disease_edges.tsv | 7134 | | ECO: | phenio_edges.tsv | 7017 | | PW: | phenio_edges.tsv | 6516 | | SO: | phenio_edges.tsv | 6486 | | PATO: | phenio_edges.tsv | 5720 | | https: | phenio_edges.tsv | 4994 | | IAO: | phenio_edges.tsv | 3545 | | ZP: | zfin_gene_to_phenotype_edges.tsv | 3072 | | EFO: | phenio_edges.tsv | 2843 | | RO: | phenio_edges.tsv | 2672 | | Xenbase: | xenbase_gene_to_phenotype_edges.tsv | 2592 | | PR: | phenio_edges.tsv | 2282 | | RGD: | panther_ref_genome_orthologs_edges.tsv | 1802 | | SGD: | goa_go_annotation_edges.tsv | 1705 | | OBI: | phenio_edges.tsv | 1692 | | XCO: | phenio_edges.tsv | 1637 | | ENVO: | phenio_edges.tsv | 1611 | | MEDDRA: | phenio_edges.tsv | 1455 | | PO: | phenio_edges.tsv | 1305 | | ORPHA: | hpoa_disease_phenotype_edges.tsv | 1000 | | RGD: | goa_go_annotation_edges.tsv | 899 | | NCBIGene: | omim_gene_to_disease_edges.tsv | 872 | | MGI: | alliance_gene_to_expression_edges.tsv | 869 | | NBO: | phenio_edges.tsv | 862 | | FBcv: | phenio_edges.tsv | 764 | | GENO: | phenio_edges.tsv | 736 | | BFO: | phenio_edges.tsv | 689 | | owl: | phenio_edges.tsv | 636 | | ECTO: | phenio_edges.tsv | 628 | | BSPO: | phenio_edges.tsv | 532 | | FOODON: | phenio_edges.tsv | 504 | | dct: | phenio_edges.tsv | 441 | | DDANAT: | phenio_edges.tsv | 353 | | ZFIN: | goa_go_annotation_edges.tsv | 321 | | ZFIN: | panther_ref_genome_orthologs_edges.tsv | 321 | | DECIPHER: | hpoa_disease_phenotype_edges.tsv | 292 | | SNOMED: | phenio_edges.tsv | 289 | | ZFIN: | alliance_gene_to_expression_edges.tsv | 284 | | MGI: | panther_ref_genome_orthologs_edges.tsv | 266 | | PCO: | phenio_edges.tsv | 241 | | foaf: | phenio_edges.tsv | 238 | | SEPIO: | phenio_edges.tsv | 235 | | GO: | goa_go_annotation_edges.tsv | 196 | | OMIMPS: | phenio_edges.tsv | 173 | | MAXO: | phenio_edges.tsv | 163 | | FB: | panther_ref_genome_orthologs_edges.tsv | 149 | | FB: | goa_go_annotation_edges.tsv | 147 | | WB: | goa_go_annotation_edges.tsv | 128 | | SCTID: | phenio_edges.tsv | 113 | | | #### The ENSEMBL: prefix alone needs more analysis to know more about what kind of IDs we're missing ``` q -t -H "\ select subject from monarch-kg-dangling-edges.tsv where subject like 'ENSEMBL:%' and subject not in (select id from monarch-kg_nodes.tsv) \ union \ select object from monarch-kg-dangling-edges.tsv where object like 'ENSEMBL:%' and object not in (select id from monarch-kg_nodes.tsv) \ " | sed "s/[0-9]*//g" | sort | uniq -c ``` Looking at ENSDARGs How many does ZFIN consider to be 1-1 with ZFIN genes? These we potentially want. ``` q "select subject from monarch-kg-dangling-edges.tsv where subject like 'ENSEMBL:ENSDARG%' and subject in (select ensdarg from zfin_ensembl_1_to_1.tsv) union select object from monarch-kg-dangling-edges.tsv where object like 'ENSEMBL:ENSDARG%' and object in (select ensdarg from zfin_ensembl_1_to_1.tsv)" | sort | uniq | wc -l 1092 ``` How many does ZFIN consider not to be 1-1 with ZFIN genes? These won't line up to any genes we'd get from ZFIN. ``` q "select subject from monarch-kg-dangling-edges.tsv where subject like 'ENSEMBL:ENSDARG%' and subject not in (select ensdarg from zfin_ensembl_1_to_1.tsv) union select object from monarch-kg-dangling-edges.tsv where object like 'ENSEMBL:ENSDARG%' and object not in (select ensdarg from zfin_ensembl_1_to_1.tsv)" | sort | uniq | wc -l 1610 ``` (these numbers are node IDs, not total number of edges) #### OBO ``` q "\ select subject from monarch-kg-dangling-edges.tsv where subject like 'OBO:%' union select object from monarch-kg-dangling-edges.tsv where object like 'OBO:%' " | sed 's@[0-9]@@g' | sort | uniq -c | sort -rn ``` and subject not like '%.owl' and object not like '%.owl' #### OMIM ``` x ``` #### paths from root ``` q -t -O -b -H "\ select prefix, provided_by, count(*) as amount from ( select substr(subject,1,instr(subject,':')) as prefix, provided_by \ from output/qc/monarch-kg-dangling-edges.tsv \ where subject not in (select id from output/monarch-kg_nodes.tsv) \ union all \ select substr(object,1,instr(object,':')) as prefix, provided_by \ from output/qc/monarch-kg-dangling-edges.tsv \ where object not in (select id from output/monarch-kg_nodes.tsv) \ ) group by 1,2 having amount > 100 order by 3 desc" | pbcopy ``` sqlite: ``` echo "select prefix, provided_by, count(*) as amount \ from ( \ select substr(subject,1,instr(subject,':')) as prefix, provided_by \ from dangling_edges \ where subject not in (select id from nodes) \ union all \ select substr(object,1,instr(object,':')) as prefix, provided_by \ from dangling_edges \ where object not in (select id from nodes) \ ) group by 1,2 having amount > 100 order by 3 desc; ``` Grouped by provided_by ``` q -t -H "\ select provided_by, prefix, count(*) as amount from ( select substr(subject,1,instr(subject,':')) as prefix, provided_by \ from output/qc/monarch-kg-dangling-edges.tsv \ where subject not in (select id from output/monarch-kg_nodes.tsv) \ union all \ select substr(object,1,instr(object,':')) as prefix, provided_by \ from output/qc/monarch-kg-dangling-edges.tsv \ where object not in (select id from output/monarch-kg_nodes.tsv) \ ) group by 1,2 having amount > 100 order by 1,3 desc \ " ``` Just provided_by ``` q -t -H "\ select provided_by, count(*) as amount from ( select substr(subject,1,instr(subject,':')) as prefix, provided_by \ from output/qc/monarch-kg-dangling-edges.tsv \ where subject not in (select id from output/monarch-kg_nodes.tsv) \ union all \ select substr(object,1,instr(object,':')) as prefix, provided_by \ from output/qc/monarch-kg-dangling-edges.tsv \ where object not in (select id from output/monarch-kg_nodes.tsv) \ ) group by 1 having amount > 100 order by 2 desc \ " ``` #### NCBIGene ``` q -t -H "\ select subject from output/qc/monarch-kg-dangling-edges.tsv where subject like 'NCBIGene:%' and subject not in (select id from output/monarch-kg_nodes.tsv) \ union \ select object from output/qc/monarch-kg-dangling-edges.tsv where object like 'NCBIGene:%' and object not in (select id from output/monarch-kg_nodes.tsv) " ``` #### GO Annotation q -O "select distinct provided_by, substr(original_subject,1,instr(original_subject,':')) as original_subject_prefix, substr(subject,1,instr(subject,':')) as subject_prefix, substr(object,1,instr(object,':')) as object_prefix, count(*) from monarch-kg_edges.tsv where provided_by like '%go_annotation%' group by 1,2,3 order by 4 desc" provided_by subject_prefix object_prefix count(*) output/transform_output/goa_go_annotation_edges.tsv ZFIN: GO: 260379 output/transform_output/goa_go_annotation_edges.tsv WB: GO: 135277 output/transform_output/goa_go_annotation_edges.tsv RGD: GO: 131937 output/transform_output/goa_go_annotation_edges.tsv SGD: GO: 120301 output/transform_output/goa_go_annotation_edges.tsv FB: GO: 120290 output/transform_output/goa_go_annotation_edges.tsv PomBase: GO: 56473 q -b -O "select distinct provided_by, substr(subject,1,instr(subject,':')) as subject_prefix, substr(object,1,instr(object,':')) as object_prefix, count(*) from qc/monarch-kg-dangling-edges.tsv where provided_by like '%go_annotation%' group by 1,2,3 order by 4 desc" provided_by subject_prefix object_prefix count(*) output/transform_output/goa_go_annotation_edges.tsv AspGD: GO: 493755 output/transform_output/goa_go_annotation_edges.tsv UniProtKB: GO: 446880 output/transform_output/goa_go_annotation_edges.tsv dictyBase: GO: 84483 output/transform_output/goa_go_annotation_edges.tsv SGD: GO: 1716 output/transform_output/goa_go_annotation_edges.tsv ZFIN: GO: 330 output/transform_output/goa_go_annotation_edges.tsv FB: GO: 270 output/transform_output/goa_go_annotation_edges.tsv WB: GO: 137 output/transform_output/goa_go_annotation_edges.tsv RGD: GO: 45 output/transform_output/goa_go_annotation_edges.tsv PomBase: GO: 41 q -b -O "subject from qc/monarch-kg-dangling-edges.tsv where provided_by like '%go_annotation%' group by 1,2,3 order by 4 desc"taxon from data/ncbi/gene_info/.tsv join #### Mapping ``` select distinct in_taxon as taxon, substr(subject_id,1,instr(subject_id,':')) as subject_prefix, substr(object_id,1,instr(object_id,':')) as object_prefix, count(*) as total from mapping join nodes on subject_id = id where in_taxon <> "" group by 1,2,3 order by 1,2,3; ``` ``` select provided_by, prefix, count(*) as amount from ( select substr(subject,1,instr(subject,':')) as prefix, provided_by from dangling_edges where subject not in (select id from nodes) union all select substr(object,1,instr(object,':')) as prefix, provided_by from dangling_edges where object not in (select id from nodes) ) group by 1,2 having amount > 100 order by 1,2; ``` ``` select provided_by, prefix, count(*) as amount from ( select substr(subject,1,instr(subject,':')) as prefix, provided_by from dangling_edges where subject not in (select id from nodes) union all select substr(object,1,instr(object,':')) as prefix, provided_by from dangling_edges where object not in (select id from nodes) ) where provided_by like 'hpoa_disease_phenotype%' group by 1,2 having amount > 100 order by 1,2; ``` ``` mdb " select prefix, provided_by, count(*) as amount \ from ( \ select substr(subject,1,instr(subject,':')) as prefix, provided_by \ from dangling_edges \ where subject not in (select id from nodes) \ union all \ select substr(object,1,instr(object,':')) as prefix, provided_by \ from dangling_edges \ where object not in (select id from nodes) \ ) group by 1,2 having amount > 100 order by 3 desc \ " ```