# Strawberry code
[TOC]
## Strategic approach
## Tangle stats
Count bp and nodes in tangles, in frontiers, and elsewhere (nodes in frontiers are, by definition, ends).
For the nodes in tangles, plot a distribution of node count and bp by the number of internal elements, to give an idea of how tangled the graph is and how complex the tangles are. It is probably a good idea to plot these numbers as a progression from 0% to 100% of the total count/size.
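A minimal sketch of how these distributions could be plotted, assuming a hypothetical `get_tangles(ws, min_frontier_size)` helper that returns, for each tangle, the list of its internal (non-frontier) nodeviews; the real tangle accessor may differ:
```python
import matplotlib.pyplot as plt

def tangle_complexity_plot(ws, min_frontier_size=1000):
    # get_tangles() is a hypothetical helper returning internal nodeviews per tangle
    tangles = get_tangles(ws, min_frontier_size)
    tangles.sort(key=len)                 # from simplest to most complex tangle
    cum_nodes, cum_bp = [], []
    nodes = bp = 0
    for t in tangles:
        nodes += len(t)
        bp += sum(nv.size() for nv in t)
        cum_nodes.append(nodes)
        cum_bp.append(bp)
    # express as progress from 0% to 100% of the totals, by internal element count
    xs = [len(t) for t in tangles]
    plt.plot(xs, [100 * x / nodes for x in cum_nodes], label='% of tangled nodes')
    plt.plot(xs, [100 * x / bp for x in cum_bp], label='% of tangled bp')
    plt.xlabel('internal elements per tangle')
    plt.ylabel('cumulative %')
    plt.legend()
    plt.show()
```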
### Improving the initial graph.
The initial graph needs improvement so that the tangle definition becomes easier to work with (i.e. smaller tangles within the graph). This being strawberry, the two most obvious ways to do this are to clip tips and to phase bubbles.
**Gonza is working on phasing bubbly paths and solving other trivial structures by just expanding canonical repeats**
**DONE**
- Simple stats for repeats, bubbles and tips
- Canonical repeat resolution with single-end short reads
**TO DO**
- Canonical repeat resolution with pairs (fallback after failing with single reads)
- Bubble popping (error and heterozygous bubble mode, use KCI and paths)
- Tip clipping (using paths)
Ideally we would want a single heuristic step that can do tip clipping, repeat expansion and bubble phasing in one go (a sketch of such a combined pass follows the code block below).
For now, we have this rather naive code that just does repeat expansion:
```python
from collections import Counter

def is_canonical_repeat(nv):
    # a canonical repeat has the same number (>1) of inputs and outputs,
    # and all of its neighbours connect only through this node
    if len(nv.prev())==len(nv.next())>1:
        for pl in nv.prev():
            if len(pl.node().next())>1: return False
        for nl in nv.next():
            if len(nl.node().prev())>1: return False
        return True
    return False

def is_bubble_side(nv):
    # one parallel node, one input and one output, both shared with exactly one other node
    if len(nv.parallels())==len(nv.prev())==len(nv.next())==1 and len(nv.prev()[0].node().next())==len(nv.next()[0].node().prev())==2: return True
    return False

def is_tip(nv):
    if len(nv.prev())==0 or len(nv.next())==0: return True
    return False

def simple_structures_stats(dg):
    total=canrep=bubsides=tips=0
    for nv in dg.get_all_nodeviews():
        total+=1
        if is_canonical_repeat(nv): canrep+=1
        if is_bubble_side(nv): bubsides+=1
        if is_tip(nv): tips+=1
    print("%d tips, %d canonical repeats and %d bubble sides out of %d nodes"%(tips,canrep,bubsides,total))

def solve_canonical_repeat(ge,nv,peds,min_support=5,max_noise=10,snr=10,verbose=False):
    # vote with read paths from each input node (FW) and each output node (BW);
    # only queue the expansion if both directions agree on a clean 1-to-1 pairing
    if verbose: print("\nSolving ",nv,nv.size(),"bp")
    out_nids=[x.node().node_id() for x in nv.next()]
    sols=[]
    for pnv in [x.node() for x in nv.prev()]:
        c=Counter()
        for p in peds.mapper.all_paths_fw(pnv.node_id(),False):
            if p[0]!=nv.node_id():
                c[p[0]]+=1
            elif len(p)>1:
                c[p[1]]+=1
        if verbose: print(c.most_common())
        t=sum(c.values())
        if t<min_support:
            if verbose: print("FW too few paths!")
            return False
        (winner,votes)=c.most_common(1)[0]
        if votes<min_support:
            if verbose: print("FW winner poorly supported!")
            return False
        noise=t-votes
        if winner in out_nids and noise*snr<=votes and noise<max_noise:
            sols.append((-pnv.node_id(),winner))
        else:
            if verbose: print("FW Not a clear winner!")
            return False
    if verbose: print(sols)
    if len(sols)<len(nv.prev()) or len(sols)>len(set([x[1] for x in sols])): return False
    in_nids=[-x.node().node_id() for x in nv.prev()]
    sols2=[]
    for nnv in [x.node() for x in nv.next()]:
        c=Counter()
        for p in peds.mapper.all_paths_fw(-nnv.node_id(),False):
            if p[0]!=-nv.node_id():
                c[p[0]]+=1
            elif len(p)>1:
                c[p[1]]+=1
        t=sum(c.values())
        if t<min_support:
            if verbose: print("BW too few paths!")
            return False
        (winner,votes)=c.most_common(1)[0]
        if votes<min_support:
            if verbose: print("BW winner poorly supported!")
            return False
        noise=t-votes
        if winner in in_nids and noise*snr<=votes and noise<max_noise:
            sols2.append((winner,nnv.node_id()))
        else:
            if verbose: print("BW Not a clear winner!")
            return False
    if verbose: print(sols2)
    sols2.sort()
    sols.sort()
    if sols!=sols2:
        if verbose: print("FW and BW solutions not the same")
        return False
    ge.queue_node_expansion(nv.node_id(),sols)
    return True

def solve_all_canonical(peds,size=1000,apply=False):
    # note: uses the global workspace ws
    ge=SDG.GraphEditor(ws)
    total=solved=0
    for nv in ws.sdg.get_all_nodeviews():
        if nv.size()<=size and is_canonical_repeat(nv):
            total+=1
            if solve_canonical_repeat(ge,nv,peds):
                solved+=1
            print("\r%d / %d canonical repeats solved" % (solved,total), end='')
    print("%d / %d canonical repeats solved" % (solved,total))
    if apply:
        ge.apply_all()
        ws.sdg.join_all_unitigs()

####### RUN THIS BIT
simple_structures_stats(ws.sdg)
solve_all_canonical(peds,apply=True)
simple_structures_stats(ws.sdg)
kc.update_graph_counts()
kc.compute_all_kcis()
print(ws.sdg.stats_by_kci())
```
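A rough sketch of what that single heuristic step could look like, chaining the routines above with `clip_all_tips()` (used in the code dumps below, defined in the linked notes) and a hypothetical `pop_all_bubbles()` standing in for the bubble-phasing step that is still TO DO:
```python
def simplify_pass(peds, rounds=3, repeat_size=1000):
    # combined clean-up pass: tips, then canonical repeats, then bubbles,
    # re-pathing reads between steps so each step sees the updated graph
    for r in range(rounds):
        peds.mapper.path_reads()
        clip_all_tips(peds, apply=True)                            # 1) remove short dead ends
        peds.mapper.path_reads()
        solve_all_canonical(peds, size=repeat_size, apply=True)    # 2) expand canonical repeats
        peds.mapper.path_reads()
        pop_all_bubbles(peds, apply=True)                          # 3) phase/pop bubbles (hypothetical, TO DO)
        simple_structures_stats(ws.sdg)
```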
### Tangle drill down
If we start by getting all tangles with frontiers of a large size, we can then decompose the unclassified tangles into their internal sub-tangles at a smaller size. By definition, every internal node of a smaller-frontier tangle is included in a single larger-frontier tangle. This allows different types of tangles to be defined at independently determined tangle sizes.
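A control-flow sketch of that drill-down, again assuming a hypothetical `get_tangles(ws, f)` accessor (returning each tangle as a set of internal node ids) and a hypothetical `classify_tangle()` heuristic; the point is only the re-running of detection with progressively smaller frontier sizes over the still-unclassified tangles:
```python
def tangle_drill_down(ws, frontier_sizes=(200, 100, 70)):
    # get_tangles() and classify_tangle() are hypothetical helpers
    classified = {}                           # node_id -> (frontier_size, label)
    for f in frontier_sizes:                  # start with the largest frontier size
        for tangle in get_tangles(ws, f):
            # internal nodes of a smaller-frontier tangle always fall inside a single
            # larger-frontier tangle; if that one was already classified, skip it
            if any(nid in classified for nid in tangle): continue
            label = classify_tangle(ws, tangle)
            if label is None: continue        # leave it for a smaller-frontier pass
            for nid in tangle:
                classified[nid] = (f, label)
    return classified
```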
## Code dumps
```python
import sys
sys.path.append('/hpc-home/ggarcia/git_sources/bsg/gonza_master/SDGpython/')
import SDGpython as SDG
ws = SDG.WorkSpace('./00-raw_graph.sdgws')
gc = SDG.GraphContigger(ws)
gc.clip_tips(200)
kc = ws.get_kmer_counter('kmers-pe-31')
# print the first 400 bins of the 'pe' k-mer count spectrum (to check the KCI peak)
for i, c in enumerate(kc.count_spectra('pe')[:400]):
    print("%s,%s" % (i, c))
kc.update_graph_counts()
kc.set_kci_peak(100)
kc.compute_all_kcis()
print(ws.sdg.stats_by_kci())
ws.dump('./01-tip_clipped.sdgws')
kc.dump_cache('./01-tip_clipped.kci.cache')
peds = ws.get_paired_reads_datastore('pe')
## Map reads
peds.mapper.path_reads()
peds.mapper.dump_readpaths('./01-tip_clipped.pe.paths')
```

```python
import sys
sys.path.append('/hpc-home/ggarcia/git_sources/bsg/gonza_master/SDGpython/')
import SDGpython as SDG
ws = SDG.WorkSpace('./01-tip_clipped.sdgws')
kc = ws.get_kmer_counter('kmers-pe-31')
kc.load_cache('./01-tip_clipped.kci.cache')
## Load mapped reads
peds = ws.get_paired_reads_datastore('pe')
peds.mapper.load_readpaths('./01-tip_clipped.pe.paths')
simple_structures_stats(ws.sdg)
solve_all_canonical(peds,apply=True)
simple_structures_stats(ws.sdg)
kc.update_graph_counts()
kc.compute_all_kcis()
print(ws.sdg.stats_by_kci())
```


Functions used below are [defined in this note](https://hackmd.io/7i2WQxdEQyOcOC3YLsY-VQ?view).
```python
STATS=True  # set to False to skip the per-round stats/KCI reports

for ii in range(3):
    print('\n\nRound',ii,'starting...')
    if STATS: simple_structures_stats(ws.sdg)
    # tip clipping rounds
    for i in range(3):
        peds.mapper.path_reads()
        clip_all_tips(peds,apply=True)
        if STATS: simple_structures_stats(ws.sdg)
    if STATS:
        kc.update_graph_counts()
        kc.compute_all_kcis()
        print(ws.sdg.stats_by_kci())
    # tangle solving at increasing frontier sizes
    for i in [70,100,200]:
        peds.mapper.path_reads()
        solve_all_tangles(ws,i,apply=True)
        kc.update_graph_counts()
        kc.compute_all_kcis()
        if STATS: print(ws.sdg.stats_by_kci())
    # stride from anchors
    peds.mapper.path_reads()
    s=SDG.Strider(ws)
    s.add_datastore(peds)
    s.stride_from_anchors(min_kci=.5,max_kci=1.5)
    strider_run_from_cpp()
    if STATS:
        kc.update_graph_counts()
        kc.compute_all_kcis()
        print(ws.sdg.stats_by_kci())
```
```python
lords = ws.add_long_reads_datastore('./nanoporeds.lords.loseq')
lrr = SDG.LongReadsRecruiter()
```
## Testing mapper with k=63 and pathing
```python
def mapping_report(peds,peds2,offset=1000000,total=1000000):
    # compare the read paths produced by two mappers over a window of reads
    same=old_longer=new_longer=old_empty=new_empty=same_difference=both_empty=0
    for x in range(offset,offset+total):
        p1=list(peds.mapper.read_paths[x].path)
        p2=list(peds2.mapper.read_paths[x].path)
        if p1==p2:
            same+=1
            if len(p1)==0: both_empty+=1
        else:
            if len(p1)>len(p2): old_longer+=1
            elif len(p2)>len(p1): new_longer+=1
            else: same_difference+=1
            if len(p1)==0: old_empty+=1
            if len(p2)==0: new_empty+=1
    print("same = %d (%.2f%%) --> both empty: %d (%.2f%%) " % (same,100*same/total,both_empty,100*both_empty/total))
    print("old longer = %d (%.2f%%)" % (old_longer-new_empty,100*(old_longer-new_empty)/total))
    print("new longer = %d (%.2f%%)" % (new_longer-old_empty,100*(new_longer-old_empty)/total))
    print("old empty = %d (%.2f%%)" % (old_empty,100*old_empty/total))
    print("new empty = %d (%.2f%%)" % (new_empty,100*new_empty/total))
    print("same length different = %d (%.2f%%)" % (same_difference,100*same_difference/total))

# load the same datastore twice: path peds1 with k=31, peds2 with the default settings, then compare
peds1=ws.add_paired_reads_datastore('test.prseq','test1')
peds2=ws.add_paired_reads_datastore('test.prseq','test2')
peds1.mapper.path_reads(31,50)
peds2.mapper.path_reads()
mapping_report(peds1,peds2,1,peds1.size())
```