# E.coli: development test 2020-08-19

## Graph Construction

Graph construction is done with a DBG at k=63 and coverage=2. This is a very noisy, but well-connected, graph.
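For reference, this is the construction step from the full recipe at the end of this document; a minimal sketch using the same calls and parameter values:

```python
import SDGpython as SDG

# Minimal construction sketch, duplicated from the full recipe below
# (K=63, MIN_CVG=2, NUM_BATCHES=1).
K, MIN_CVG, NUM_BATCHES = 63, 2, 1
ws = SDG.WorkSpace()
peds = ws.add_paired_reads_datastore('ecoli_pe.prseq')
SDG.GraphMaker(ws.sdg).new_graph_from_paired_datastore(peds, K, MIN_CVG, NUM_BATCHES)
```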
### Graph simplification functions

The graph simplification functions use very simple heuristics based on read paths. These replace the old heuristics in GraphContigger.

```python
from collections import Counter

def is_canonical_repeat(nv):
    #n inputs and n outputs (n>1), with no other connections on the neighbours
    if len(nv.prev())==len(nv.next())>1:
        for pl in nv.prev():
            if len(pl.node().next())>1: return False
        for nl in nv.next():
            if len(nl.node().prev())>1: return False
        return True
    return False

def is_bubble_side(nv):
    #one side of a two-node bubble: a single parallel node between a shared predecessor and successor
    if len(nv.parallels())==len(nv.prev())==len(nv.next())==1 and len(nv.prev()[0].node().next())==len(nv.next()[0].node().prev())==2:
        return True
    return False

def is_tip(nv):
    #connected on one end only
    if (len(nv.prev())==0 and len(nv.next())>0) or (len(nv.next())==0 and len(nv.prev())>0):
        return True
    return False

def simple_structures_stats(dg):
    #count the simple structures across the whole graph
    total=canrep=bubsides=tips=0
    for nv in dg.get_all_nodeviews():
        total+=1
        if is_canonical_repeat(nv): canrep+=1
        if is_bubble_side(nv): bubsides+=1
        if is_tip(nv): tips+=1
    print("%d tips, %d canonical repeats and %d bubble sides out of %d nodes"%(tips,canrep,bubsides,total))

def solve_canonical_repeat(ge,nv,peds,min_support=5,max_noise=10,snr=10,verbose=False):
    #pairs each input with an output by read-path votes; the FW and BW pairings must agree
    if verbose: print("\nSolving ",nv,nv.size(),"bp")
    out_nids=[x.node().node_id() for x in nv.next()]
    sols=[]
    for pnv in [x.node() for x in nv.prev()]:
        c=Counter()
        for p in peds.mapper.all_paths_fw(pnv.node_id(),False):
            if p[0]!=nv.node_id(): c[p[0]]+=1
            elif len(p)>1: c[p[1]]+=1
        if verbose: print(c.most_common())
        t=sum(c.values())
        if t<min_support:
            if verbose: print("FW too few paths!")
            return False
        (winner,votes)=c.most_common(1)[0]
        if votes<min_support:
            if verbose: print("FW winner poorly supported!")
            return False
        noise=t-votes
        if winner in out_nids and noise*snr<=votes and noise<max_noise:
            sols.append((-pnv.node_id(),winner))
        else:
            if verbose: print("FW Not a clear winner!")
            return False
    if verbose: print(sols)
    if len(sols)<len(nv.prev()) or len(sols)>len(set([x[1] for x in sols])): return False
    in_nids=[-x.node().node_id() for x in nv.prev()]
    sols2=[]
    for nnv in [x.node() for x in nv.next()]:
        c=Counter()
        for p in peds.mapper.all_paths_fw(-nnv.node_id(),False):
            if p[0]!=-nv.node_id(): c[p[0]]+=1
            elif len(p)>1: c[p[1]]+=1
        t=sum(c.values())
        if t<min_support:
            if verbose: print("BW too few paths!")
            return False
        (winner,votes)=c.most_common(1)[0]
        if votes<min_support:
            if verbose: print("BW winner poorly supported!")
            return False
        noise=t-votes
        if winner in in_nids and noise*snr<=votes and noise<max_noise:
            sols2.append((winner,nnv.node_id()))
        else:
            if verbose: print("BW Not a clear winner!")
            return False
    if verbose: print(sols2)
    sols2.sort()
    sols.sort()
    if sols!=sols2:
        if verbose: print("FW and BW solutions not the same")
        return False
    ge.queue_node_expansion(nv.node_id(),sols)
    #print(sols)
    return True

def clip_tip(ge,nv,peds,min_support=5,max_noise=10,snr=3):
    #clip a tip when read paths from its single neighbour overwhelmingly bypass it
    if len(nv.prev())==0: nv=nv.rc()
    if len(nv.prev())!=1 or len(nv.next())!=0: return False
    ayes=nays=0
    for p in peds.mapper.all_paths_fw(nv.prev()[0].node().node_id(),False):
        if p[0]==nv.node_id(): ayes+=1
        else: nays+=1
    if ayes<=max_noise and ayes*snr<=nays and nays>=min_support:
        ge.queue_node_deletion(abs(nv.node_id()))
        return True
    else:
        return False

def pop_error_bubble(ge,nv1,nv2,peds,min_support=5,max_noise=10,snr=10):
    #delete the bubble side with clearly weaker read-path support
    if not is_bubble_side(nv1): return False
    if nv1.parallels()[0]!=nv2 or abs(nv1.node_id())>abs(nv2.node_id()): return False
    v1=v2=vo=0
    for p in peds.mapper.all_paths_fw(nv1.prev()[0].node().node_id(),False):
        if p[0]==nv1.node_id(): v1+=1
        elif p[0]==nv2.node_id(): v2+=1
        else: vo+=1
    for p in peds.mapper.all_paths_fw(-nv1.next()[0].node().node_id(),False):
        if p[0]==-nv1.node_id(): v1+=1
        elif p[0]==-nv2.node_id(): v2+=1
        else: vo+=1
    if v1>min_support and v2+vo<max_noise and v1>=snr*(v2+vo):
        ge.queue_node_deletion(nv2.node_id())
        return True
    if v2>min_support and v1+vo<max_noise and v2>=snr*(v1+vo):
        ge.queue_node_deletion(nv1.node_id())
        return True
    return False

#the *_all drivers queue operations on a GraphEditor over the global ws, then optionally apply them
def solve_all_canonical(peds,size=1000,apply=False):
    ge=SDG.GraphEditor(ws)
    total=solved=0
    for nv in ws.sdg.get_all_nodeviews():
        if nv.size()<=size and is_canonical_repeat(nv):
            total+=1
            if solve_canonical_repeat(ge,nv,peds): solved+=1
    print("%d / %d canonical repeats solved" % (solved,total))
    if apply:
        ge.apply_all()
        ws.sdg.join_all_unitigs()

def clip_all_tips(peds,size=300,apply=False):
    ge=SDG.GraphEditor(ws)
    total=solved=0
    for nv in ws.sdg.get_all_nodeviews():
        if nv.size()<=size and is_tip(nv):
            total+=1
            if clip_tip(ge,nv,peds): solved+=1
    print("%d / %d tips clipped as errors" % (solved,total))
    if apply:
        ge.apply_all()
        ws.sdg.join_all_unitigs()

def pop_all_error_bubbles(peds,size=1000,apply=False):
    ge=SDG.GraphEditor(ws)
    total=solved=0
    for nv in ws.sdg.get_all_nodeviews():
        if nv.size()<=size and is_bubble_side(nv) and abs(nv.node_id())<abs(nv.parallels()[0].node_id()):
            total+=1
            if pop_error_bubble(ge,nv,nv.parallels()[0],peds): solved+=1
    print("%d / %d bubbles popped as error" % (solved,total))
    if apply:
        ge.apply_all()
        ws.sdg.join_all_unitigs()
```
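A sketch of how these passes can be driven (the recipe below only exercises `clip_all_tips` directly; calling `pop_all_error_bubbles` and `solve_all_canonical` the same way is an assumption based on their shared signature). Reads are re-pathed before each pass so the path evidence reflects the current graph:

```python
# One simplification round; assumes ws and peds are set up as in the recipe below.
peds.mapper.path_reads()                 # refresh read paths on the current graph
clip_all_tips(peds, apply=True)          # apply=True runs the editor and re-joins unitigs

peds.mapper.path_reads()
pop_all_error_bubbles(peds, apply=True)

peds.mapper.path_reads()
solve_all_canonical(peds, apply=True)

simple_structures_stats(ws.sdg)          # report remaining tips/repeats/bubble sides
```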
## Tangle resolution

The tangle resolution heuristics use Strider in a local fashion, within a single tangle. If the tangle is solved, they modify the graph to include the solution and, if necessary, eliminate every edge in the tangle. To this effect, tangles are first classified and then analysed with a heuristic fit to the tangle type. Unclassified tangles are analysed by looking for single nodes in the tangle whose strided path produces a haplotype-specific point-to-point solution between two frontiers, with every frontier being part of one and only one solution.

```python
def solve_bubble(t,s,ge):
    #This checks if a bubble can be due to a sequencing error and queues a node deletion on the error node
    fapath=s.stride_out_in_order(t.frontiers[0].rc().node_id()).nodes[:3]
    fbpath=[-x for x in s.stride_out_in_order(t.frontiers[1].rc().node_id()).nodes[:3][::-1]]
    #print(fapath,fbpath)
    if len(fapath)<2: fapath=fbpath
    if len(fbpath)<2: fbpath=fapath
    if len(fapath)<2: return False
    if fapath==fbpath and abs(t.internals[0].node_id())==abs(fapath[1]):
        return ge.queue_node_deletion(t.internals[1].node_id())
    if fapath==fbpath and abs(t.internals[1].node_id())==abs(fapath[1]):
        return ge.queue_node_deletion(t.internals[0].node_id())

def solve_repeat(t,s,ge):
    #solves a single-node repeat
    rnid=t.internals[0].node_id()
    ins=[x.node() for x in t.internals[0].prev()]
    inids=[x.node_id() for x in ins]
    outs=[x.node() for x in t.internals[0].next()]
    onids=[x.node_id() for x in outs]
    solutions=[]
    if len(ins)!=len(outs): return False
    for inv in ins:
        fpath=s.stride_out_in_order(inv.node_id()).nodes[:3]
        if len(fpath)==3 and fpath[1]==rnid and fpath[2] in onids:
            #only accept an in->repeat->out stride if the backwards stride confirms it
            bpath=[-x for x in s.stride_out_in_order(-fpath[2]).nodes[:3][::-1]]
            if bpath==fpath:
                solutions.append(fpath)
    if len(solutions)!=len(ins): return False
    ge.queue_node_expansion(rnid,[[-x[0],x[2]] for x in solutions])
    return True

def solve_tip(t,s,ge):
    #just check that each frontier connects to the other one through reads
    f0s=s.stride_out_in_order(-t.frontiers[0].node_id()).nodes
    f1s=s.stride_out_in_order(-t.frontiers[1].node_id()).nodes
    if len(f0s)>1 and f0s[1]==t.frontiers[1].node_id() and len(f1s)>1 and f1s[1]==t.frontiers[0].node_id():
        ge.queue_node_deletion(t.internals[0].node_id())
        return True
    return False

def end_to_end_solution(p,fnids):
    #extracts a frontier-to-frontier stretch from a strided path, in canonical orientation
    for i in range(len(p)):
        if -p[i] in fnids:
            for j in range(i+1,len(p)):
                if p[j] in fnids:
                    if abs(p[i])<abs(p[j]): sol=p[i:j+1]
                    else: sol=[-x for x in p[i:j+1][::-1]]
                    #go through sol, if there's a single-option missing node, add it
                    csol=[sol[0]]
                    for n in sol[1:]:
                        if n in ws.sdg.fw_reached_nodes(csol[-1],1):
                            csol.append(n)
                        elif n in ws.sdg.fw_reached_nodes(csol[-1],2):
                            a=[]
                            for skipped in ws.sdg.fw_reached_nodes(csol[-1],1):
                                if n in ws.sdg.fw_reached_nodes(skipped,1): a.append(skipped)
                            if len(a)==1:
                                csol.append(a[0])
                                csol.append(n)
                            else:
                                return sol
                        else:
                            return sol
                    return csol
            return []
    return []

def solve_unclassified(t,s,ge):
    #first stride from all frontiers:
    fs={}
    fnids=[x.node_id() for x in t.frontiers]
    sols={x:[] for x in fnids}
    for f in t.frontiers:
        fs[-f.node_id()]=s.stride_out_in_order(-f.node_id()).nodes
    #for x in fs: print(x,fs[x])
    ns={}
    for nv in t.internals:
        ns[nv.node_id()]=[-x for x in s.stride_out_in_order(-nv.node_id()).nodes[:1:-1]]+s.stride_out_in_order(nv.node_id()).nodes
    #for x in ns: print(x,ns[x])
    sols=[]
    for x in list(fs.values())+list(ns.values()):
        sol=end_to_end_solution(x,fnids)
        if sol and sol not in sols: sols.append(sol)
    #for x in sols: print(x)
    #accept only if there is one solution per frontier pair and every frontier is used exactly once
    if len(sols)==len(fnids)/2 and len(set([x[0] for x in sols]+[-x[-1] for x in sols]))==len(fnids):
        #print("UNCLASSIFIED SOLVED:")
        for x in sols:
            try:
                ps=SDG.SequenceDistanceGraphPath(ws.sdg,x).sequence()
            except:
                return False
        for x in sols:
            ge.queue_path_detachment(x,True)
        for x in t.internals:
            ge.queue_node_deletion(x.node_id())
        return True
    return False

def solve_all_tangles(ws,fsize=220,fminkci=-1,fmaxkci=-1,apply=False):
    total=solved=unsolved=0
    s=SDG.Strider(ws)
    s.add_datastore(peds)
    ge=SDG.GraphEditor(ws)
    solved=Counter()
    unsolved=Counter()
    for t in ws.sdg.get_all_tangleviews(fsize,fminkci,fmaxkci):
        tc=classify_tangle(t)
        if tc=="tip":
            #TODO add tip to reconnection list
            if solve_tip(t,s,ge): solved[tc]+=1
            else: unsolved[tc]+=1
        elif tc=="bubble":
            if solve_bubble(t,s,ge): solved[tc]+=1
            else: unsolved[tc]+=1
        elif tc[:6]=="repeat":
            if solve_repeat(t,s,ge): solved[tc]+=1
            else: unsolved[tc]+=1
        else:
            if solve_unclassified(t,s,ge): solved[tc]+=1
            else: unsolved[tc]+=1
        total+=1
    print("Total tangles: %d " % (total) )
    print("Solved: ",solved.most_common())
    print("Unsolved: ",unsolved.most_common())
    if apply:
        ge.apply_all()
        ws.sdg.join_all_unitigs()

def classify_tangle(t):
    #classifies a tangle by its frontier/internal topology
    if len(t.frontiers)==0: return "debris"
    if len(t.frontiers)==1:
        #BFS from the single frontier: if it never leaves the internals, this is a complex tip
        nids=set([x.node_id() for x in t.internals])
        seen_nids=set()
        nexts=[t.frontiers[0].rc()]
        abort=False
        while nexts and not abort:
            new_nexts=[]
            for nv in nexts:
                for nnl in nv.next():
                    if abs(nnl.node().node_id()) not in nids:
                        abort=True
                        break
                    if nnl.node().node_id() not in seen_nids:
                        seen_nids.add(nnl.node().node_id())
                        new_nexts.append(nnl.node())
                if abort: break
            nexts=new_nexts
        if not abort: return "complex tip"
    if len(t.internals)==1:
        nv=t.internals[0]
        if len(t.frontiers)==2 and len(nv.prev())+len(nv.next())==1: return "tip"
        if len(t.frontiers)==2*len(nv.prev()) and len(nv.prev())==len(nv.next()) and len(set([abs(nv.node_id())]+[abs(x.node().node_id()) for x in nv.prev()+nv.next()]))==len(nv.prev())*2+1:
            return "repeat %d:%d"%(len(nv.prev()),len(nv.prev()))
    if len(t.internals)==2 and len(t.frontiers)==2:
        i0p=t.internals[0].parallels()
        if len(i0p)==1 and i0p[0]==t.internals[1]: return "bubble"
        else:
            i0n=t.internals[0].next()
            i0p=t.internals[0].prev()
            i1n=t.internals[1].next()
            i1p=t.internals[1].prev()
            if len(i0n)==2 and len(i0p)==2 and len(i1n)==1 and len(i1p)==1 and i1n[0].node()==i1p[0].node(): return "loop"
            if len(i1n)==2 and len(i1p)==2 and len(i0n)==1 and len(i0p)==1 and i0n[0].node()==i0p[0].node(): return "loop"
    return "unclassified"
```
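Usage mirrors the recipe below: reads are re-pathed, `solve_all_tangles` is run at increasing tangle sizes, and the `kc` counter from the recipe is kept in sync afterwards. A sketch:

```python
# Tangle resolution rounds at increasing size cut-offs, as in the recipe below.
for fsize in [70, 100, 200]:
    peds.mapper.path_reads()        # re-path reads on the current graph
    solve_all_tangles(ws, fsize, apply=True)
    kc.update_graph_counts()        # keep the 'main' k-mer counter in sync
    kc.compute_all_kcis()
```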
## Strider linear paths between anchors

The `run_strider` heuristic (the `strider_run_from_cpp` function below) strides out from both ends of every anchor (anchors being defined by size and KCI). Anchor ends that are connected by the same strided path in both directions are disconnected from any other node, and the path between them is inserted as a new node. Two extra heuristics at the end eliminate low-coverage nodes and unroll loops that only go around once.

```python
from statistics import median

def strider_run_from_cpp():
    ## TODO: 'main' kmer count only exists in sdg graph workspaces. make it nameable!
    #connects linear anchors through mutually-confirmed strided routes; uses the globals ws, s, kc and peds
    linear_anchors=set()
    for nv in ws.sdg.get_all_nodeviews():
        nid=nv.node_id()
        if s.is_anchor[nid] and len(s.routes_fw[nid])>1 and len(s.routes_bw[nid])>1:
            linear_anchors.add(nid)
    print("%d anchors are linear" % len(linear_anchors))
    ge=SDG.GraphEditor(ws)
    used_nodes=[]
    paths=0
    accepted=0
    rejected=0
    nopaths=0
    for fnid in list(linear_anchors):
        for nid in [fnid,-fnid]:
            #print("\nNode",nid)
            if nid>0: fp=s.routes_fw[nid][1:]
            else: fp=s.routes_bw[-nid][1:]
            #print("FW",fp)
            p=[]
            for x in fp:
                if abs(x) in linear_anchors:
                    #accept the path to the first anchor only if that anchor's own route back confirms it
                    p=fp[:fp.index(x)+1]
                    #print(p)
                    if x<0: op=[-x for x in s.routes_fw[-x][1:][::-1]]
                    else: op=[-x for x in s.routes_bw[x][1:][::-1]]
                    #print(op)
                    #print(p[:-1],"==",op[op.index(nid):],"??")
                    if nid not in op or p[:-1]!=op[op.index(nid)+1:]: p=[]
                    break
            #print(p)
            if p and abs(p[-1])>abs(nid):
                #only process each anchor pair once, from the lower node id
                try:
                    SDG.SequenceDistanceGraphPath(ws.sdg,[nid]+p).sequence()
                    #print([nid]+p)
                    paths+=1
                    if ge.queue_path_detachment([nid]+p,True): accepted+=1
                    else: rejected+=1
                except:
                    nopaths+=1
                for r in p[:-1]:
                    used_nodes.append(abs(r))
    print("%d paths ( %d accepted, %d rejected) and %d nopaths"%(paths,accepted,rejected,nopaths))
    ge.apply_all()

    #delete "used up" tips
    for r in range(10):
        ge=SDG.GraphEditor(ws)
        for x in used_nodes:
            try:
                nv=ws.sdg.get_nodeview(x)
                if len(nv.prev())==0 or len(nv.next())==0: ge.queue_node_deletion(x)
            except:
                pass
        ge.apply_all()
        ws.sdg.join_all_unitigs()
    ge.remove_small_components(10,1000,10000)

    ### Finishing-off the assembly: typical heuristics that can be more aggressive when things already look fine:

    #an incredibly crude loop resolver, only supporting one round across the loop:
    kc.update_graph_counts()
    peds.mapper.path_reads()
    print(len(ws.sdg.get_all_nodeviews()))
    ge=SDG.GraphEditor(ws)
    for nv in ws.sdg.get_all_nodeviews():
        if len(nv.prev())!=1 or len(nv.next())!=1 or nv.prev()[0].node().node_id()!=nv.next()[0].node().node_id(): continue
        r=nv.next()[0].node().node_id()
        orn=[x.node().node_id() for x in nv.next()[0].node().next() if x.node().node_id()!=nv.node_id()]
        if len(orn)!=1: continue
        orn=orn[0]
        orp=[x.node().node_id() for x in nv.next()[0].node().prev() if x.node().node_id()!=nv.node_id()]
        if len(orp)!=1: continue
        orp=orp[0]
        print("Loop on %d -> %d -> %d -> %d -> %d"%(orp,r,nv.node_id(),r,orn))
        c=Counter()
        nc=Counter()
        nid=nv.node_id()
        for rid in peds.mapper.paths_in_node[abs(nid)]:
            if nid<0: rid=-rid
            c.update([tuple(peds.mapper.read_paths[abs(rid)].path)])
            nc.update(peds.mapper.path_fw(rid,nid))
        #for x in c.most_common(): print(x)
        #print()
        #for x in nc.most_common(): print(x)
        if nid not in nc and orn in nc:
            print("loop goes around just once!")
            ge.queue_node_expansion(r,[[-orp,nid],[-nid, orn]])
    ge.apply_all()
    ws.sdg.join_all_unitigs()
    print(len(ws.sdg.get_all_nodeviews()))

    ## Short, low information node removal
    kc.update_graph_counts()
    #peds.mapper.path_reads()
    removed=0
    for nv in ws.sdg.get_all_nodeviews():
        if nv.size()>1000: continue
        rc=nv.kmer_coverage("main","pe")
        gc=nv.kmer_coverage("main","sdg")
        ukci=[]
        skci=[]
        for i in range(len(rc)):
            if gc[i]==1: ukci.append(rc[i])
            else: skci.append(rc[i]/gc[i])
        if ukci==[]: ukci=[-1]
        if skci==[]: skci=[-1]
        ms=median(skci)
        mu=median(ukci)
        #print(nv.node_id(),)
        if mu>-1 and mu<=4 and ms>5*mu:
            #print("Node %d (%d bp), shared_coverage=%0.2f, unique_coverage=%0.2f"%(nv.node_id(),nv.size(),ms,mu))
            removed+=1
            ws.sdg.remove_node(nv.node_id())
    print("%d low coverage, low information nodes removed"%removed)
    ws.sdg.join_all_unitigs()
```
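As in the recipe below, anchors are strided first and then the heuristic is run; a sketch:

```python
# Stride from every anchor, then run the linear-path heuristic (as in the recipe below).
peds.mapper.path_reads()
s = SDG.Strider(ws)
s.add_datastore(peds)
s.stride_from_anchors(min_kci=.5, max_kci=1.5)   # anchors selected by KCI window
strider_run_from_cpp()                           # uses the global ws/s/kc/peds
```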
## Running the recipe

The full recipe (defined in an empirical fashion) for the test E. coli dataset is:
```python
import SDGpython as SDG
from collections import Counter

K=63
MIN_CVG=2
NUM_BATCHES=1
STATS=True

ws=SDG.WorkSpace()
peds=ws.add_paired_reads_datastore('ecoli_pe.prseq')
SDG.GraphMaker(ws.sdg).new_graph_from_paired_datastore(peds,K,MIN_CVG,NUM_BATCHES)
kc=ws.add_kmer_counter('main')
kc.add_count("pe",peds)
kc.set_kci_peak(24)
#ws.sdg.write_to_gfa1('initial.gfa')
if STATS:
    kc.compute_all_kcis()
    print(ws.sdg.stats_by_kci())

#three rounds of the full pipeline: tip clipping, tangle resolution at increasing sizes, strider
for ii in range(3):
    print('\n\nRound',ii,'starting...')
    if STATS: simple_structures_stats(ws.sdg)
    for i in range(3):
        peds.mapper.path_reads()
        clip_all_tips(peds,apply=True)
    if STATS: simple_structures_stats(ws.sdg)
    if STATS:
        kc.update_graph_counts()
        kc.compute_all_kcis()
        print(ws.sdg.stats_by_kci())
    for i in [70,100,200]:
        peds.mapper.path_reads()
        solve_all_tangles(ws,i,apply=True)
        kc.update_graph_counts()
        kc.compute_all_kcis()
        if STATS: print(ws.sdg.stats_by_kci())
    peds.mapper.path_reads()
    s=SDG.Strider(ws)
    s.add_datastore(peds)
    s.stride_from_anchors(min_kci=.5,max_kci=1.5)
    strider_run_from_cpp()
    if STATS:
        kc.update_graph_counts()
        kc.compute_all_kcis()
        print(ws.sdg.stats_by_kci())
```
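After the last round it can be useful to dump the final graph for inspection; `write_to_gfa1` is the same call that appears commented out after construction above, and `'final.gfa'` is just an arbitrary output name:

```python
# Optional: write the finished graph to GFA1 for inspection.
ws.sdg.write_to_gfa1('final.gfa')
```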