# MonarchKG Dangling Edge Analysis
Want to know, which curies from edge ingests are missing, and where are we losing those edges from.
```
q -t -H "\
select prefix, provided_by, count(*) as amount
from (
select substr(subject,1,instr(subject,':')) as prefix, provided_by \
from monarch-kg-dangling-edges.tsv \
where subject not in (select id from monarch-kg_nodes.tsv) \
union all \
select substr(object,1,instr(object,':')) as prefix, provided_by \
from monarch-kg-dangling-edges.tsv \
where object not in (select id from monarch-kg_nodes.tsv) \
) group by 1,2 having amount > 100 order by 3 desc \
"
```
Finds
| prefix | provided_by | number of missing edges |
| --- | --- | --- |
| EMAPA: | alliance_gene_to_expression_edges.tsv | 728282 |
| UniProtKB: | goa_go_annotation_edges.tsv | 555871 |
| ZFA: | alliance_gene_to_expression_edges.tsv | 507141 |
| AspGD: | goa_go_annotation_edges.tsv | 493755 |
| FBbt: | alliance_gene_to_expression_edges.tsv | 394846 |
| ENSEMBL: | panther_ref_genome_orthologs_edges.tsv | 209356 |
| OBO: | phenio_edges.tsv | 128156 |
| OMIM: | hpoa_disease_phenotype_edges.tsv | 113993 |
| | phenio_edges.tsv | 113881 |
| WBbt: | alliance_gene_to_expression_edges.tsv | 102069 |
| dictyBase: | panther_ref_genome_orthologs_edges.tsv | 88368 |
| dictyBase: | goa_go_annotation_edges.tsv | 84636 |
| FYPO: | pombase_gene_to_phenotype_edges.tsv | 84282 |
| CLO: | phenio_edges.tsv | 77712 |
| NCBITaxon: | phenio_edges.tsv | 62826 |
| MESH: | ctd_chemical_to_disease_edges.tsv | 51570 |
| POMBASE: | panther_ref_genome_orthologs_edges.tsv | 48892 |
| http: | phenio_edges.tsv | 47934 |
| UMLS: | phenio_edges.tsv | 40143 |
| NCBIGene: | string_protein_links_edges.tsv | 37092 |
| DOID: | phenio_edges.tsv | 19949 |
| NCIT: | phenio_edges.tsv | 14405 |
| CL: | phenio_edges.tsv | 11047 |
| CHEBI: | reactome_chemical_to_pathway_edges.tsv | 9184 |
| OMIM: | omim_gene_to_disease_edges.tsv | 7134 |
| ECO: | phenio_edges.tsv | 7017 |
| PW: | phenio_edges.tsv | 6516 |
| SO: | phenio_edges.tsv | 6486 |
| PATO: | phenio_edges.tsv | 5720 |
| https: | phenio_edges.tsv | 4994 |
| IAO: | phenio_edges.tsv | 3545 |
| ZP: | zfin_gene_to_phenotype_edges.tsv | 3072 |
| EFO: | phenio_edges.tsv | 2843 |
| RO: | phenio_edges.tsv | 2672 |
| Xenbase: | xenbase_gene_to_phenotype_edges.tsv | 2592 |
| PR: | phenio_edges.tsv | 2282 |
| RGD: | panther_ref_genome_orthologs_edges.tsv | 1802 |
| SGD: | goa_go_annotation_edges.tsv | 1705 |
| OBI: | phenio_edges.tsv | 1692 |
| XCO: | phenio_edges.tsv | 1637 |
| ENVO: | phenio_edges.tsv | 1611 |
| MEDDRA: | phenio_edges.tsv | 1455 |
| PO: | phenio_edges.tsv | 1305 |
| ORPHA: | hpoa_disease_phenotype_edges.tsv | 1000 |
| RGD: | goa_go_annotation_edges.tsv | 899 |
| NCBIGene: | omim_gene_to_disease_edges.tsv | 872 |
| MGI: | alliance_gene_to_expression_edges.tsv | 869 |
| NBO: | phenio_edges.tsv | 862 |
| FBcv: | phenio_edges.tsv | 764 |
| GENO: | phenio_edges.tsv | 736 |
| BFO: | phenio_edges.tsv | 689 |
| owl: | phenio_edges.tsv | 636 |
| ECTO: | phenio_edges.tsv | 628 |
| BSPO: | phenio_edges.tsv | 532 |
| FOODON: | phenio_edges.tsv | 504 |
| dct: | phenio_edges.tsv | 441 |
| DDANAT: | phenio_edges.tsv | 353 |
| ZFIN: | goa_go_annotation_edges.tsv | 321 |
| ZFIN: | panther_ref_genome_orthologs_edges.tsv | 321 |
| DECIPHER: | hpoa_disease_phenotype_edges.tsv | 292 |
| SNOMED: | phenio_edges.tsv | 289 |
| ZFIN: | alliance_gene_to_expression_edges.tsv | 284 |
| MGI: | panther_ref_genome_orthologs_edges.tsv | 266 |
| PCO: | phenio_edges.tsv | 241 |
| foaf: | phenio_edges.tsv | 238 |
| SEPIO: | phenio_edges.tsv | 235 |
| GO: | goa_go_annotation_edges.tsv | 196 |
| OMIMPS: | phenio_edges.tsv | 173 |
| MAXO: | phenio_edges.tsv | 163 |
| FB: | panther_ref_genome_orthologs_edges.tsv | 149 |
| FB: | goa_go_annotation_edges.tsv | 147 |
| WB: | goa_go_annotation_edges.tsv | 128 |
| SCTID: | phenio_edges.tsv | 113 |
| |
####
The ENSEMBL: prefix alone needs more analysis to know more about what kind of IDs we're missing
```
q -t -H "\
select subject from monarch-kg-dangling-edges.tsv where subject like 'ENSEMBL:%' and subject not in (select id from monarch-kg_nodes.tsv) \
union \
select object from monarch-kg-dangling-edges.tsv where object like 'ENSEMBL:%' and object not in (select id from monarch-kg_nodes.tsv) \
" | sed "s/[0-9]*//g" | sort | uniq -c
```
Looking at ENSDARGs
How many does ZFIN consider to be 1-1 with ZFIN genes? These we potentially want.
```
q "select subject from monarch-kg-dangling-edges.tsv where subject like 'ENSEMBL:ENSDARG%' and subject in (select ensdarg from zfin_ensembl_1_to_1.tsv) union select object from monarch-kg-dangling-edges.tsv where object like 'ENSEMBL:ENSDARG%' and object in (select ensdarg from zfin_ensembl_1_to_1.tsv)" | sort | uniq | wc -l
1092
```
How many does ZFIN consider not to be 1-1 with ZFIN genes? These won't line up to any genes we'd get from ZFIN.
```
q "select subject from monarch-kg-dangling-edges.tsv where subject like 'ENSEMBL:ENSDARG%' and subject not in (select ensdarg from zfin_ensembl_1_to_1.tsv) union select object from monarch-kg-dangling-edges.tsv where object like 'ENSEMBL:ENSDARG%' and object not in (select ensdarg from zfin_ensembl_1_to_1.tsv)" | sort | uniq | wc -l
1610
```
(these numbers are node IDs, not total number of edges)
#### OBO
```
q "\
select subject
from monarch-kg-dangling-edges.tsv
where subject like 'OBO:%'
union
select object
from monarch-kg-dangling-edges.tsv
where object like 'OBO:%'
" | sed 's@[0-9]@@g' | sort | uniq -c | sort -rn
```
and subject not like '%.owl'
and object not like '%.owl'
#### OMIM
```
x
```
#### paths from root
```
q -t -O -b -H "\
select prefix, provided_by, count(*) as amount
from (
select substr(subject,1,instr(subject,':')) as prefix, provided_by \
from output/qc/monarch-kg-dangling-edges.tsv \
where subject not in (select id from output/monarch-kg_nodes.tsv) \
union all \
select substr(object,1,instr(object,':')) as prefix, provided_by \
from output/qc/monarch-kg-dangling-edges.tsv \
where object not in (select id from output/monarch-kg_nodes.tsv) \
) group by 1,2 having amount > 100 order by 3 desc" | pbcopy
```
sqlite:
```
echo "select prefix, provided_by, count(*) as amount \
from ( \
select substr(subject,1,instr(subject,':')) as prefix, provided_by \
from dangling_edges \
where subject not in (select id from nodes) \
union all \
select substr(object,1,instr(object,':')) as prefix, provided_by \
from dangling_edges \
where object not in (select id from nodes) \
) group by 1,2 having amount > 100 order by 3 desc;
```
Grouped by provided_by
```
q -t -H "\
select provided_by, prefix, count(*) as amount
from (
select substr(subject,1,instr(subject,':')) as prefix, provided_by \
from output/qc/monarch-kg-dangling-edges.tsv \
where subject not in (select id from output/monarch-kg_nodes.tsv) \
union all \
select substr(object,1,instr(object,':')) as prefix, provided_by \
from output/qc/monarch-kg-dangling-edges.tsv \
where object not in (select id from output/monarch-kg_nodes.tsv) \
) group by 1,2 having amount > 100 order by 1,3 desc \
"
```
Just provided_by
```
q -t -H "\
select provided_by, count(*) as amount
from (
select substr(subject,1,instr(subject,':')) as prefix, provided_by \
from output/qc/monarch-kg-dangling-edges.tsv \
where subject not in (select id from output/monarch-kg_nodes.tsv) \
union all \
select substr(object,1,instr(object,':')) as prefix, provided_by \
from output/qc/monarch-kg-dangling-edges.tsv \
where object not in (select id from output/monarch-kg_nodes.tsv) \
) group by 1 having amount > 100 order by 2 desc \
"
```
#### NCBIGene
```
q -t -H "\
select subject from output/qc/monarch-kg-dangling-edges.tsv where subject like 'NCBIGene:%' and subject not in (select id from output/monarch-kg_nodes.tsv) \
union \
select object from output/qc/monarch-kg-dangling-edges.tsv where object like 'NCBIGene:%' and object not in (select id from output/monarch-kg_nodes.tsv) "
```
#### GO Annotation
q -O "select distinct provided_by, substr(original_subject,1,instr(original_subject,':')) as original_subject_prefix, substr(subject,1,instr(subject,':')) as subject_prefix, substr(object,1,instr(object,':')) as object_prefix, count(*) from monarch-kg_edges.tsv where provided_by like '%go_annotation%' group by 1,2,3 order by 4 desc"
provided_by subject_prefix object_prefix count(*)
output/transform_output/goa_go_annotation_edges.tsv ZFIN: GO: 260379
output/transform_output/goa_go_annotation_edges.tsv WB: GO: 135277
output/transform_output/goa_go_annotation_edges.tsv RGD: GO: 131937
output/transform_output/goa_go_annotation_edges.tsv SGD: GO: 120301
output/transform_output/goa_go_annotation_edges.tsv FB: GO: 120290
output/transform_output/goa_go_annotation_edges.tsv PomBase: GO: 56473
q -b -O "select distinct provided_by, substr(subject,1,instr(subject,':')) as subject_prefix, substr(object,1,instr(object,':')) as object_prefix, count(*) from qc/monarch-kg-dangling-edges.tsv where provided_by like '%go_annotation%' group by 1,2,3 order by 4 desc"
provided_by subject_prefix object_prefix count(*)
output/transform_output/goa_go_annotation_edges.tsv AspGD: GO: 493755
output/transform_output/goa_go_annotation_edges.tsv UniProtKB: GO: 446880
output/transform_output/goa_go_annotation_edges.tsv dictyBase: GO: 84483
output/transform_output/goa_go_annotation_edges.tsv SGD: GO: 1716
output/transform_output/goa_go_annotation_edges.tsv ZFIN: GO: 330
output/transform_output/goa_go_annotation_edges.tsv FB: GO: 270
output/transform_output/goa_go_annotation_edges.tsv WB: GO: 137
output/transform_output/goa_go_annotation_edges.tsv RGD: GO: 45
output/transform_output/goa_go_annotation_edges.tsv PomBase: GO: 41
q -b -O "subject from qc/monarch-kg-dangling-edges.tsv where provided_by like '%go_annotation%' group by 1,2,3 order by 4 desc"taxon from data/ncbi/gene_info/.tsv join
#### Mapping
```
select distinct
in_taxon as taxon,
substr(subject_id,1,instr(subject_id,':')) as subject_prefix,
substr(object_id,1,instr(object_id,':')) as object_prefix,
count(*) as total
from mapping
join nodes on subject_id = id
where in_taxon <> ""
group by 1,2,3
order by 1,2,3;
```
```
select provided_by, prefix, count(*) as amount
from (
select substr(subject,1,instr(subject,':')) as prefix, provided_by
from dangling_edges
where subject not in (select id from nodes)
union all
select substr(object,1,instr(object,':')) as prefix, provided_by
from dangling_edges
where object not in (select id from nodes)
) group by 1,2 having amount > 100 order by 1,2;
```
```
select provided_by, prefix, count(*) as amount
from (
select substr(subject,1,instr(subject,':')) as prefix, provided_by
from dangling_edges
where subject not in (select id from nodes)
union all
select substr(object,1,instr(object,':')) as prefix, provided_by
from dangling_edges
where object not in (select id from nodes)
)
where provided_by like 'hpoa_disease_phenotype%'
group by 1,2 having amount > 100 order by 1,2;
```
```
mdb "
select prefix, provided_by, count(*) as amount \
from ( \
select substr(subject,1,instr(subject,':')) as prefix, provided_by \
from dangling_edges \
where subject not in (select id from nodes) \
union all \
select substr(object,1,instr(object,':')) as prefix, provided_by \
from dangling_edges \
where object not in (select id from nodes) \
) group by 1,2 having amount > 100 order by 3 desc \
"
```