# E. coli: development test 2020-08-19
## Graph Construction
Graph construction is done with a DBG at k=63 and a minimum coverage of 2. This produces a very noisy but well-connected graph.
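For reference, the construction itself reduces to a GraphMaker call over the paired-read datastore; a minimal sketch using the same parameters as the full recipe at the end of this note:
```python
import SDGpython as SDG

# minimal construction sketch: k=63, min coverage=2, single batch
# (same values as the full recipe below)
ws=SDG.WorkSpace()
peds=ws.add_paired_reads_datastore('ecoli_pe.prseq')
SDG.GraphMaker(ws.sdg).new_graph_from_paired_datastore(peds,63,2,1)
```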
### Graph simplification functions
The graph simplification functions use very simple heuristics based on read paths. These replace the old heuristics in GraphContigger.
```python
from collections import Counter
def is_canonical_repeat(nv):
if len(nv.prev())==len(nv.next())>1:
for pl in nv.prev():
if len(pl.node().next())>1: return False
for nl in nv.next():
if len(nl.node().prev())>1: return False
return True
return False
def is_bubble_side(nv):
if len(nv.parallels())==len(nv.prev())==len(nv.next())==1 and len(nv.prev()[0].node().next())==len(nv.next()[0].node().prev())==2: return True
return False
def is_tip(nv):
if (len(nv.prev())==0 and len(nv.next())>0) or (len(nv.next())==0 and len(nv.prev())>0): return True
return False
def simple_structures_stats(dg):
total=canrep=bubsides=tips=0
for nv in dg.get_all_nodeviews():
total+=1
if is_canonical_repeat(nv): canrep+=1
if is_bubble_side(nv): bubsides+=1
if is_tip(nv): tips+=1
print("%d tips, %d canonical repeats and %d bubble sides out of %d nodes"%(tips,canrep,bubsides,total))
def solve_canonical_repeat(ge,nv,peds,min_support=5,max_noise=10,snr=10,verbose=False):
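    #accepts a solution only if read paths give a unique in->out pairing, supported
    #in both directions, for every input and output of the repeat node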
if verbose: print("\nSolving ",nv,nv.size(),"bp")
out_nids=[x.node().node_id() for x in nv.next()]
sols=[]
for pnv in [x.node() for x in nv.prev()]:
c=Counter()
for p in peds.mapper.all_paths_fw(pnv.node_id(),False):
if p[0]!=nv.node_id():
c[p[0]]+=1
elif len(p)>1:
c[p[1]]+=1
if verbose: print(c.most_common())
t=sum(c.values())
if t<min_support:
if verbose: print("FW too few paths!")
return False
(winner,votes)=c.most_common(1)[0]
if votes<min_support:
if verbose: print("FW winner poorly supported!")
return False
noise=t-votes
if winner in out_nids and noise*snr<=votes and noise<max_noise:
sols.append((-pnv.node_id(),winner))
else:
if verbose: print("FW Not a clear winner!")
return False
if verbose: print(sols)
if len(sols)<len(nv.prev()) or len(sols)>len(set([x[1] for x in sols])): return False
in_nids=[-x.node().node_id() for x in nv.prev()]
sols2=[]
for nnv in [x.node() for x in nv.next()]:
c=Counter()
for p in peds.mapper.all_paths_fw(-nnv.node_id(),False):
if p[0]!=-nv.node_id():
c[p[0]]+=1
elif len(p)>1:
c[p[1]]+=1
t=sum(c.values())
if t<min_support:
if verbose: print("BW too few paths!")
return False
(winner,votes)=c.most_common(1)[0]
if votes<min_support:
if verbose: print("BW winner poorly supported!")
return False
noise=t-votes
if winner in in_nids and noise*snr<=votes and noise<max_noise:
sols2.append((winner,nnv.node_id()))
else:
if verbose: print("BW Not a clear winner!")
return False
if verbose: print(sols2)
sols2.sort()
sols.sort()
if sols!=sols2:
if verbose: print("FW and BW solutions not the same")
return False
ge.queue_node_expansion(nv.node_id(),sols)
#print(sols)
return True
def clip_tip(ge,nv,peds,min_support=5,max_noise=10,snr=3):
if len(nv.prev())==0: nv=nv.rc()
if len(nv.prev())!=1 or len(nv.next())!=0: return False
ayes=nays=0
for p in peds.mapper.all_paths_fw(nv.prev()[0].node().node_id(),False):
if p[0]==nv.node_id():
ayes+=1
else:
nays+=1
if ayes<=max_noise and ayes*snr<=nays and nays>=min_support:
ge.queue_node_deletion(abs(nv.node_id()))
return True
else:
return False
def pop_error_bubble(ge,nv1,nv2,peds,min_support=5,max_noise=10,snr=10):
if not is_bubble_side(nv1): return False
if nv1.parallels()[0]!=nv2 or abs(nv1.node_id())>abs(nv2.node_id()): return False
v1=v2=vo=0
for p in peds.mapper.all_paths_fw(nv1.prev()[0].node().node_id(),False):
if p[0]==nv1.node_id(): v1+=1
elif p[0]==nv2.node_id(): v2+=1
else: vo+=1
for p in peds.mapper.all_paths_fw(-nv1.next()[0].node().node_id(),False):
if p[0]==-nv1.node_id(): v1+=1
elif p[0]==-nv2.node_id(): v2+=1
else: vo+=1
if v1>min_support and v2+vo<max_noise and v1>=snr*(v2+vo):
ge.queue_node_deletion(nv2.node_id())
return True
if v2>min_support and v1+vo<max_noise and v2>=snr*(v1+vo):
ge.queue_node_deletion(nv1.node_id())
return True
return False
def solve_all_canonical(peds,size=1000,apply=False):
ge=SDG.GraphEditor(ws)
total=solved=0
for nv in ws.sdg.get_all_nodeviews():
if nv.size()<=size and is_canonical_repeat(nv):
total+=1
if solve_canonical_repeat(ge,nv,peds):
solved+=1
print("%d / %d canonical repeats solved" % (solved,total))
if apply:
ge.apply_all()
ws.sdg.join_all_unitigs()
def clip_all_tips(peds,size=300,apply=False):
ge=SDG.GraphEditor(ws)
total=solved=0
for nv in ws.sdg.get_all_nodeviews():
if nv.size()<=size and is_tip(nv):
total+=1
if clip_tip(ge,nv,peds):
solved+=1
print("%d / %d tips clipped as errors" % (solved,total))
if apply:
ge.apply_all()
ws.sdg.join_all_unitigs()
def pop_all_error_bubbles(peds,size=1000,apply=False):
ge=SDG.GraphEditor(ws)
total=solved=0
for nv in ws.sdg.get_all_nodeviews():
if nv.size()<=size and is_bubble_side(nv) and abs(nv.node_id())<abs(nv.parallels()[0].node_id()):
total+=1
if pop_error_bubble(ge,nv,nv.parallels()[0],peds):
solved+=1
print("%d / %d bubbles popped as error" % (solved,total))
if apply:
ge.apply_all()
ws.sdg.join_all_unitigs()
```
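A typical cleanup round re-paths the reads and applies the operations one at a time, re-pathing after each `apply=True` since applying edits changes the graph. The recipe at the end only exercises `clip_all_tips`; a sketch of a full round, using the globals `ws` and `peds` as above:
```python
# one cleanup round over the current graph
peds.mapper.path_reads()
clip_all_tips(peds,apply=True)
peds.mapper.path_reads()
solve_all_canonical(peds,apply=True)
peds.mapper.path_reads()
pop_all_error_bubbles(peds,apply=True)
simple_structures_stats(ws.sdg)
```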
## Tangle resolution
The tangle resolution heuristics use Strider in a local fashion, within a single tangle. If a tangle is solved, they modify the graph to include the solution and, if necessary, eliminate every edge in the tangle. To this effect, tangles are first classified and then analysed with a heuristic fit to the tangle type. Unclassified tangles are analysed by striding from every frontier and every internal node, looking for haplotype-specific point-to-point solutions between two frontiers; a solution set is only accepted if every frontier is part of one and only one solution.
```python
def solve_bubble(t,s,ge):
#This checks if a bubble can be due to a sequencing error and queues a node deletion on the error node
fapath=s.stride_out_in_order(t.frontiers[0].rc().node_id()).nodes[:3]
fbpath=[-x for x in s.stride_out_in_order(t.frontiers[1].rc().node_id()).nodes[:3][::-1]]
#print(fapath,fbpath)
if len(fapath)<2: fapath=fbpath
if len(fbpath)<2: fbpath=fapath
if len(fapath)<2: return False
if fapath==fbpath and abs(t.internals[0].node_id())==abs(fapath[1]):
return ge.queue_node_deletion(t.internals[1].node_id())
if fapath==fbpath and abs(t.internals[1].node_id())==abs(fapath[1]):
return ge.queue_node_deletion(t.internals[0].node_id())
def solve_repeat(t,s,ge):
#solves a single-node repeat
rnid=t.internals[0].node_id()
ins=[x.node() for x in t.internals[0].prev()]
inids=[x.node_id() for x in ins]
outs=[x.node() for x in t.internals[0].next()]
onids=[x.node_id() for x in outs]
solutions=[]
if len(ins)!=len(outs): return False
for inv in ins:
fpath=s.stride_out_in_order(inv.node_id()).nodes[:3]
if len(fpath)==3 and fpath[1]==rnid and fpath[2] in onids:
bpath=[-x for x in s.stride_out_in_order(-fpath[2]).nodes[:3][::-1]]
if bpath==fpath: solutions.append(fpath)
if len(solutions)!=len(ins): return False
ge.queue_node_expansion(rnid,[[-x[0],x[2]] for x in solutions])
return True
def solve_tip(t,s,ge):
#just check that each frontier connects to the other one through reads
f0s=s.stride_out_in_order(-t.frontiers[0].node_id()).nodes
f1s=s.stride_out_in_order(-t.frontiers[1].node_id()).nodes
if len(f0s)>1 and f0s[1]==t.frontiers[1].node_id() and len(f1s)>1 and f1s[1]==t.frontiers[0].node_id():
ge.queue_node_deletion(t.internals[0].node_id())
return True
return False
def end_to_end_solution(p,fnids):
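    #returns the sub-path of p running from one frontier into another, oriented so the
    #endpoint with the lower node id comes first, filling unambiguous single-node gaps
    #from the graph; returns the raw sub-path if a gap can't be filled, [] if no pair found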
for i in range(len(p)):
if -p[i] in fnids:
for j in range(i+1,len(p)):
if p[j] in fnids:
if abs(p[i])<abs(p[j]): sol=p[i:j+1]
else: sol=[-x for x in p[i:j+1][::-1]]
#go through sol, if there's a single-option missing node, add it
csol=[sol[0]]
for n in sol[1:]:
if n in ws.sdg.fw_reached_nodes(csol[-1],1):
csol.append(n)
elif n in ws.sdg.fw_reached_nodes(csol[-1],2):
a=[]
for skipped in ws.sdg.fw_reached_nodes(csol[-1],1):
if n in ws.sdg.fw_reached_nodes(skipped,1):
a.append(skipped)
if len(a)==1:
csol.append(a[0])
csol.append(n)
else:
return sol
else:
return sol
return csol
return []
return []
def solve_unclassified(t,s,ge):
#first stride from all frontiers:
fs={}
fnids=[x.node_id() for x in t.frontiers]
for f in t.frontiers:
fs[-f.node_id()]=s.stride_out_in_order(-f.node_id()).nodes
#for x in fs: print(x,fs[x])
ns={}
for nv in t.internals:
        ns[nv.node_id()]=[-x for x in s.stride_out_in_order(-nv.node_id()).nodes[:0:-1]]+s.stride_out_in_order(nv.node_id()).nodes #negated, reversed bw stride (minus the start node) joined with the fw stride
#for x in ns: print(x,ns[x])
sols=[]
for x in list(fs.values())+list(ns.values()):
sol=end_to_end_solution(x,fnids)
if sol and sol not in sols:
sols.append(sol)
#for x in sols: print(x)
if len(sols)==len(fnids)/2 and len(set([x[0] for x in sols]+[-x[-1] for x in sols]))==len(fnids):
#print("UNCLASSIFIED SOLVED:")
for x in sols:
try:
ps=SDG.SequenceDistanceGraphPath(ws.sdg,x).sequence()
except:
return False
for x in sols: ge.queue_path_detachment(x,True)
for x in t.internals: ge.queue_node_deletion(x.node_id())
return True
return False
def solve_all_tangles(ws,fsize=220,fminkci=-1,fmaxkci=-1,apply=False):
    total=0
s=SDG.Strider(ws)
s.add_datastore(peds)
ge=SDG.GraphEditor(ws)
solved=Counter()
unsolved=Counter()
for t in ws.sdg.get_all_tangleviews(fsize,fminkci,fmaxkci):
tc=classify_tangle(t)
if tc=="tip":
#TODO add tip to reconnection list
if solve_tip(t,s,ge): solved[tc]+=1
else: unsolved[tc]+=1
elif tc=="bubble":
if solve_bubble(t,s,ge): solved[tc]+=1
else: unsolved[tc]+=1
elif tc[:6]=="repeat":
if solve_repeat(t,s,ge): solved[tc]+=1
else: unsolved[tc]+=1
else:
if solve_unclassified(t,s,ge): solved[tc]+=1
else: unsolved[tc]+=1
total+=1
print("Total tangles: %d " % (total) )
print("Solved: ",solved.most_common())
print("Unsolved: ",unsolved.most_common())
if apply:
ge.apply_all()
ws.sdg.join_all_unitigs()
def classify_tangle(t):
if len(t.frontiers)==0:
return "debris"
if len(t.frontiers)==1:
nids=set([x.node_id() for x in t.internals])
seen_nids=set()
nexts=[t.frontiers[0].rc()]
abort=False
while nexts and not abort:
new_nexts=[]
for nv in nexts:
for nnl in nv.next():
if abs(nnl.node().node_id()) not in nids:
abort=True
break
if nnl.node().node_id() not in seen_nids:
seen_nids.add(nnl.node().node_id())
new_nexts.append(nnl.node())
if abort: break
nexts=new_nexts
if not abort: return "complex tip"
if len(t.internals)==1:
nv=t.internals[0]
if len(t.frontiers)==2 and len(nv.prev())+len(nv.next())==1:
return "tip"
if len(t.frontiers)==2*len(nv.prev()) and len(nv.prev())==len(nv.next()) and len(set([abs(nv.node_id())]+[abs(x.node().node_id()) for x in nv.prev()+nv.next()]))==len(nv.prev())*2+1:
return "repeat %d:%d"%(len(nv.prev()),len(nv.prev()))
if len(t.internals)==2 and len(t.frontiers)==2:
i0p=t.internals[0].parallels()
if len(i0p)==1 and i0p[0]==t.internals[1]:
return "bubble"
else:
i0n=t.internals[0].next()
i0p=t.internals[0].prev()
i1n=t.internals[1].next()
i1p=t.internals[1].prev()
if len(i0n)==2 and len(i0p)==2 and len(i1n)==1 and len(i1p)==1 and i1n[0].node()==i1p[0].node():
return "loop"
if len(i1n)==2 and len(i1p)==2 and len(i0n)==1 and len(i0p)==1 and i0n[0].node()==i0p[0].node():
return "loop"
return "unclassified"
```
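Before attempting resolution it can be useful to see what the classifier finds. A sketch of a small tallying helper (the function name is mine; it reuses the same `get_all_tangleviews` parameters as `solve_all_tangles` and modifies nothing):
```python
def tangle_class_stats(ws,fsize=220,fminkci=-1,fmaxkci=-1):
    #tally tangle classifications without modifying the graph
    c=Counter()
    for t in ws.sdg.get_all_tangleviews(fsize,fminkci,fmaxkci):
        c[classify_tangle(t)]+=1
    for tc,count in c.most_common(): print("%6d %s"%(count,tc))
```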
## Strider linear paths between anchors
The `run_strider` heuristic strides out from both ends of every anchor (anchors are defined by size and KCI). Anchor ends that are connected by the same strided path in both directions are disconnected from any other node, and the path between them is inserted as a new node. Two extra heuristics at the end unroll loops that only go around once and remove short, low-coverage, low-information nodes.
```python
def strider_run_from_cpp():
## TODO: 'main' kmer count only exists in sdg graph workspaces. make it nameable!
linear_anchors=set()
for nv in ws.sdg.get_all_nodeviews():
nid=nv.node_id()
if s.is_anchor[nid] and len(s.routes_fw[nid])>1 and len(s.routes_bw[nid])>1: linear_anchors.add(nid)
print("%d anchors are linear" % len(linear_anchors))
ge=SDG.GraphEditor(ws)
used_nodes=[]
paths=0
accepted=0
rejected=0
nopaths=0
for fnid in list(linear_anchors):
for nid in [fnid,-fnid]:
#print("\nNode",nid)
if nid>0: fp=s.routes_fw[nid][1:]
else: fp=s.routes_bw[-nid][1:]
#print("FW",fp)
p=[]
for x in fp:
if abs(x) in linear_anchors:
p=fp[:fp.index(x)+1]
#print(p)
if x<0: op= [-x for x in s.routes_fw[-x][1:][::-1]]
else: op= [-x for x in s.routes_bw[x][1:][::-1]]
#print(op)
#print(p[:-1],"==",op[op.index(nid):],"??")
if nid not in op or p[:-1]!=op[op.index(nid)+1:]: p=[]
break
#print(p)
if p and abs(p[-1])>abs(nid):
try:
SDG.SequenceDistanceGraphPath(ws.sdg,[nid]+p).sequence()
#print([nid]+p)
paths+=1
if ge.queue_path_detachment([nid]+p,True):
accepted+=1
else:
rejected+=1
except:
nopaths+=1
for r in p[:-1]: used_nodes.append(abs(r))
print("%d paths ( %d accepted, %d rejected) and %d nopaths"%(paths,accepted,rejected,nopaths))
ge.apply_all()
#delete "used up" tips
for r in range(10):
ge=SDG.GraphEditor(ws)
for x in used_nodes:
try:
nv=ws.sdg.get_nodeview(x)
if len(nv.prev())==0 or len(nv.next())==0:
ge.queue_node_deletion(x)
except: pass
ge.apply_all()
ws.sdg.join_all_unitigs()
ge.remove_small_components(10,1000,10000)
### Finishing off the assembly: typical heuristics that can be more aggressive when things already look fine:
#an incredibly crude loop resolver, only supporting one round across the loop:
kc.update_graph_counts()
peds.mapper.path_reads()
print(len(ws.sdg.get_all_nodeviews()))
ge=SDG.GraphEditor(ws)
for nv in ws.sdg.get_all_nodeviews():
if len(nv.prev())!=1 or len(nv.next())!=1 or nv.prev()[0].node().node_id()!=nv.next()[0].node().node_id(): continue
r=nv.next()[0].node().node_id()
orn=[x.node().node_id() for x in nv.next()[0].node().next() if x.node().node_id()!=nv.node_id()]
if len(orn)!=1: continue
orn=orn[0]
orp=[x.node().node_id() for x in nv.next()[0].node().prev() if x.node().node_id()!=nv.node_id()]
if len(orp)!=1: continue
orp=orp[0]
print("Loop on %d -> %d -> %d -> %d -> %d"%(orp,r,nv.node_id(),r,orn))
c=Counter()
nc=Counter()
nid=nv.node_id()
for rid in peds.mapper.paths_in_node[abs(nid)]:
if nid<0: rid=-rid
c.update([tuple(peds.mapper.read_paths[abs(rid)].path)])
nc.update(peds.mapper.path_fw(rid,nid))
#for x in c.most_common(): print(x)
#print()
#for x in nc.most_common(): print(x)
if nid not in nc and orn in nc:
print("loop goes around just once!")
ge.queue_node_expansion(r,[[-orp,nid],[-nid, orn]])
ge.apply_all()
ws.sdg.join_all_unitigs()
print(len(ws.sdg.get_all_nodeviews()))
## Short, low information node removal
from statistics import median
kc.update_graph_counts()
#peds.mapper.path_reads()
removed=0
for nv in ws.sdg.get_all_nodeviews():
if nv.size()>1000: continue
rc=nv.kmer_coverage("main","pe")
gc=nv.kmer_coverage("main","sdg")
ukci=[]
skci=[]
for i in range(len(rc)):
if gc[i]==1: ukci.append(rc[i])
else: skci.append(rc[i]/gc[i])
if ukci==[]: ukci=[-1]
if skci==[]: skci=[-1]
ms=median(skci)
mu=median(ukci)
#print(nv.node_id(),)
if mu>-1 and mu<=4 and ms>5*mu:
#print("Node %d (%d bp), shared_coverage=%0.2f, unique_coverage=%0.2f"%(nv.node_id(),nv.size(),ms,mu))
removed+=1
ws.sdg.remove_node(nv.node_id())
print("%d low coverage, low information nodes removed"%removed)
ws.sdg.join_all_unitigs()
```
## Running the recipe
The full recipe (defined empirically) for the test E. coli dataset is:
```python
import SDGpython as SDG
from collections import Counter
K=63
MIN_CVG=2
NUM_BATCHES=1
STATS=True
ws=SDG.WorkSpace()
peds=ws.add_paired_reads_datastore('ecoli_pe.prseq')
SDG.GraphMaker(ws.sdg).new_graph_from_paired_datastore(peds,K,MIN_CVG,NUM_BATCHES)
kc=ws.add_kmer_counter('main')
kc.add_count("pe",peds)
kc.set_kci_peak(24)
#ws.sdg.write_to_gfa1('initial.gfa')
if STATS:
kc.compute_all_kcis()
print(ws.sdg.stats_by_kci())
for ii in range(3):
print('\n\nRound',ii,'starting...')
if STATS: simple_structures_stats(ws.sdg)
for i in range(3):
peds.mapper.path_reads()
clip_all_tips(peds,apply=True)
if STATS: simple_structures_stats(ws.sdg)
if STATS:
kc.update_graph_counts()
kc.compute_all_kcis()
print(ws.sdg.stats_by_kci())
for i in [70,100,200]:
peds.mapper.path_reads()
solve_all_tangles(ws,i,apply=True)
kc.update_graph_counts()
kc.compute_all_kcis()
if STATS: print(ws.sdg.stats_by_kci())
peds.mapper.path_reads()
s=SDG.Strider(ws)
s.add_datastore(peds)
s.stride_from_anchors(min_kci=.5,max_kci=1.5)
strider_run_from_cpp()
if STATS:
kc.update_graph_counts()
kc.compute_all_kcis()
    print(ws.sdg.stats_by_kci())
```
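Once the recipe finishes, the graph can be written out for inspection, mirroring the commented-out `initial.gfa` line in the setup (the filename here is just an example):
```python
# dump the final graph; assumes the recipe above has run
ws.sdg.write_to_gfa1('final.gfa')
```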