# Pink pigeon assembly ## Graph construction ```python import SDGpython as SDG K=63 MIN_CVG=2 NUM_BATCHES=1 ws=SDG.WorkSpace() peds=ws.add_paired_reads_datastore('prds.prseq') SDG.GraphMaker(ws.sdg).new_graph_from_paired_datastore(peds,K,MIN_CVG,NUM_BATCHES) ws.dump('pp_dbg63c2.sdgws') ``` ## Graph improvement (cleanup, tangles, strider) First a light tip clipping, as it removes more than 3/4 of the nodes (note graph is reloaded): ```python import SDGpython as SDG ws=SDG.WorkSpace('pp_dbg63c2.sdgws') print(ws.ls()) c=SDG.GraphContigger(ws) c.clip_tips(125,3) ws.sdg.write_to_gfa1('pp_dbg63c2_tc125x3.gfa') ws.sdg.load_from_gfa('pp_dbg63c2_tc125x3.gfa') ws.dump('pp_dbg63c2_tc125x3.sdgws') ``` Now map the reads, save this as starting point. ```python import SDGpython as SDG ws=SDG.WorkSpace('pp_dbg63c2_tc125x3.sdgws') peds=ws.get_paired_reads_datastore('pe_reads') peds.mapper.path_reads() peds.mapper.dump_readpaths('pp_dbg63c2_tc125x3_pepaths.dump') ``` Now run the simple structures simplification (tips, error bubbles and canonical repeats) ```python import SDGpython as SDG from graphcleaning import * ws=SDG.WorkSpace('pp_dbg63c2_tc125x3.sdgws') peds=ws.get_paired_reads_datastore('pe_reads') peds.mapper.load_readpaths('pp_dbg63c2_tc125x3_pepaths.dump') for ii in range(3): print('\n\nRound',ii,'starting...') simple_structures_stats(ws.sdg) if ii!=0: peds.mapper.path_reads() clip_all_tips(ws,peds,apply=True) ws.sdg.write_to_gfa1('round%d_after_clip.gfa'%(ii)) simple_structures_stats(ws.sdg) peds.mapper.path_reads() solve_all_canonical(ws,peds,size=300,apply=True) ws.sdg.write_to_gfa1('round%d_after_canonicals.gfa'%(ii)) simple_structures_stats(ws.sdg) ``` <!--```python from collections import Counter def is_canonical_repeat(nv): if len(nv.prev())==len(nv.next())>1: for pl in nv.prev(): if len(pl.node().next())>1: return False for nl in nv.next(): if len(nl.node().prev())>1: return False return True return False def is_bubble_side(nv): if len(nv.parallels())==len(nv.prev())==len(nv.next())==1 and len(nv.prev()[0].node().next())==len(nv.next()[0].node().prev())==2: return True return False def is_tip(nv): if (len(nv.prev())==0 and len(nv.next())>0) or (len(nv.next())==0 and len(nv.prev())>0): return True return False def simple_structures_stats(dg): total=canrep=bubsides=tips=0 for nv in dg.get_all_nodeviews(): total+=1 if is_canonical_repeat(nv): canrep+=1 if is_bubble_side(nv): bubsides+=1 if is_tip(nv): tips+=1 print("%d tips, %d canonical repeats and %d bubble sides out of %d nodes"%(tips,canrep,bubsides,total)) def solve_canonical_repeat(ge,nv,peds,min_support=5,max_noise=10,snr=10,verbose=False): if verbose: print("\nSolving ",nv,nv.size(),"bp") out_nids=[x.node().node_id() for x in nv.next()] sols=[] for pnv in [x.node() for x in nv.prev()]: c=Counter() for p in peds.mapper.all_paths_fw(pnv.node_id(),False): if p[0]!=nv.node_id(): c[p[0]]+=1 elif len(p)>1: c[p[1]]+=1 if verbose: print(c.most_common()) t=sum(c.values()) if t<min_support: if verbose: print("FW too few paths!") return False (winner,votes)=c.most_common(1)[0] if votes<min_support: if verbose: print("FW winner poorly supported!") return False noise=t-votes if winner in out_nids and noise*snr<=votes and noise<max_noise: sols.append((-pnv.node_id(),winner)) else: if verbose: print("FW Not a clear winner!") return False if verbose: print(sols) if len(sols)<len(nv.prev()) or len(sols)>len(set([x[1] for x in sols])): return False in_nids=[-x.node().node_id() for x in nv.prev()] sols2=[] for nnv in [x.node() for x in nv.next()]: c=Counter() for p in peds.mapper.all_paths_fw(-nnv.node_id(),False): if p[0]!=-nv.node_id(): c[p[0]]+=1 elif len(p)>1: c[p[1]]+=1 t=sum(c.values()) if t<min_support: if verbose: print("BW too few paths!") return False (winner,votes)=c.most_common(1)[0] if votes<min_support: if verbose: print("BW winner poorly supported!") return False noise=t-votes if winner in in_nids and noise*snr<=votes and noise<max_noise: sols2.append((winner,nnv.node_id())) else: if verbose: print("BW Not a clear winner!") return False if verbose: print(sols2) sols2.sort() sols.sort() if sols!=sols2: if verbose: print("FW and BW solutions not the same") return False ge.queue_node_expansion(nv.node_id(),sols) #print(sols) return True def clip_tip(ge,nv,peds,min_support=5,max_noise=10,snr=3): if len(nv.prev())==0: nv=nv.rc() if len(nv.prev())!=1 or len(nv.next())!=0: return False ayes=nays=0 for p in peds.mapper.all_paths_fw(nv.prev()[0].node().node_id(),False): if p[0]==nv.node_id(): ayes+=1 else: nays+=1 if ayes<=max_noise and ayes*snr<=nays and nays>=min_support: ge.queue_node_deletion(abs(nv.node_id())) return True else: return False def pop_error_bubble(ge,nv1,nv2,peds,min_support=5,max_noise=10,snr=10): if not is_bubble_side(nv1): return False if nv1.parallels()[0]!=nv2 or abs(nv1.node_id())>abs(nv2.node_id()): return False v1=v2=vo=0 for p in peds.mapper.all_paths_fw(nv1.prev()[0].node().node_id(),False): if p[0]==nv1.node_id(): v1+=1 elif p[0]==nv2.node_id(): v2+=1 else: vo+=1 for p in peds.mapper.all_paths_fw(-nv1.next()[0].node().node_id(),False): if p[0]==-nv1.node_id(): v1+=1 elif p[0]==-nv2.node_id(): v2+=1 else: vo+=1 if v1>min_support and v2+vo<max_noise and v1>=snr*(v2+vo): ge.queue_node_deletion(nv2.node_id()) return True if v2>min_support and v1+vo<max_noise and v2>=snr*(v1+vo): ge.queue_node_deletion(nv1.node_id()) return True return False def solve_all_canonical(peds,size=1000,apply=False): ge=SDG.GraphEditor(ws) total=solved=0 for nv in ws.sdg.get_all_nodeviews(): if nv.size()<=size and is_canonical_repeat(nv): total+=1 if solve_canonical_repeat(ge,nv,peds): solved+=1 print("%d / %d canonical repeats solved" % (solved,total)) if apply: ge.apply_all() ws.sdg.join_all_unitigs() def clip_all_tips(peds,size=300,apply=False): ge=SDG.GraphEditor(ws) total=solved=0 for nv in ws.sdg.get_all_nodeviews(): if nv.size()<=size and is_tip(nv): total+=1 if clip_tip(ge,nv,peds): solved+=1 print("%d / %d tips clipped as errors" % (solved,total)) if apply: ge.apply_all() ws.sdg.join_all_unitigs() def pop_all_error_bubbles(peds,size=1000,apply=False): ge=SDG.GraphEditor(ws) total=solved=0 for nv in ws.sdg.get_all_nodeviews(): if nv.size()<=size and is_bubble_side(nv) and abs(nv.node_id())<abs(nv.parallels()[0].node_id()): total+=1 if pop_error_bubble(ge,nv,nv.parallels()[0],peds): solved+=1 print("%d / %d bubbles popped as error" % (solved,total)) if apply: ge.apply_all() ws.sdg.join_all_unitigs() import SDGpython as SDG ws=SDG.WorkSpace('pp_dbg63c2.sdgws') peds=ws.get_paired_reads_datastore('pe_reads') peds.mapper.load_readpaths('pp_dbg63c2_pepaths.dump') for ii in range(3): print('\n\nRound',ii,'starting...') if STATS: simple_structures_stats(ws.sdg) for i in range(1): if ii!=0 or i!=0: peds.mapper.path_reads() clip_all_tips(peds,apply=True) if STATS: simple_structures_stats(ws.sdg) ws.sdg.write_to_gfa1('round%d_after_clip_%d.gfa'%(ii,i)) for i in range(1): peds.mapper.path_reads() solve_all_canonical(peds,size=230,apply=True) if STATS: simple_structures_stats(ws.sdg) ws.sdg.write_to_gfa1('round%d_after_canonical_%d.gfa'%(ii,i)) ws.dump('pp_dbg63c2_simple.sdgws') ``` --> ## Heterozygosity removal (for clean bubbles anyway) ## Aligning the 10x genomes # OLD STUFF ## SDG-dbg and bubble popping (by Cam) There seems to be a lot of leftover bubbles in the graph, so we need to pop them. Maybe it could be a good idea to phase them first, to make longer easier-to-pop bubbles. ## Tangle solving with PE reads (by bj) ```python #load Cam's workspace import SDGpython as SDG ws = SDG.WorkSpace('/ei/workarea/Research-Groups/RG-Bernardo-Clavijo/Cam_PP_SCAFF/SDG_DBG/Jul_final_ass/3bubblepop_LRR/pp_k63_bubble_pop_lrmap.sdgws') peds = ws.get_paired_reads_datastore('pe_reads') peds.mapper.load_readpaths('/ei/workarea/Research-Groups/RG-Bernardo-Clavijo/Cam_PP_SCAFF/SDG_DBG/Jul_final_ass/3bubblepop_LRR/2_3rdbubble_popped_join_readpaths.data') kc = ws.get_kmer_counter('kmers-pe-31') kc.set_kci_peak(46) kc.load_cache('/ei/workarea/Research-Groups/RG-Bernardo-Clavijo/Cam_PP_SCAFF/SDG_DBG/Jul_final_ass/3bubblepop_LRR/2_3rdbubble_pop_kci.data') #defining the tangle functions from collections import Counter def solve_bubble(t,s,ge): #This checks if a bubble can be due to a sequencing error and queues a node deletion on the error node fapath=s.stride_out_in_order(t.frontiers[0].rc().node_id()).nodes[:3] fbpath=[-x for x in s.stride_out_in_order(t.frontiers[1].rc().node_id()).nodes[:3][::-1]] #print(fapath,fbpath) if len(fapath)<2: fapath=fbpath if len(fbpath)<2: fbpath=fapath if len(fapath)<2: return False if fapath==fbpath and abs(t.internals[0].node_id())==abs(fapath[1]): return ge.queue_node_deletion(t.internals[1].node_id()) if fapath==fbpath and abs(t.internals[1].node_id())==abs(fapath[1]): return ge.queue_node_deletion(t.internals[0].node_id()) def solve_repeat(t,s,ge): #solves a single-node repeat rnid=t.internals[0].node_id() ins=[x.node() for x in t.internals[0].prev()] inids=[x.node_id() for x in ins] outs=[x.node() for x in t.internals[0].next()] onids=[x.node_id() for x in outs] solutions=[] if len(ins)!=len(outs): return False for inv in ins: fpath=s.stride_out_in_order(inv.node_id()).nodes[:3] if len(fpath)==3 and fpath[1]==rnid and fpath[2] in onids: bpath=[-x for x in s.stride_out_in_order(-fpath[2]).nodes[:3][::-1]] if bpath==fpath: solutions.append(fpath) if len(solutions)!=len(ins): return False ge.queue_node_expansion(rnid,[[-x[0],x[2]] for x in solutions]) return True def solve_tip(t,s,ge): #just check that each frontier connects to the other one through reads f0s=s.stride_out_in_order(-t.frontiers[0].node_id()).nodes f1s=s.stride_out_in_order(-t.frontiers[1].node_id()).nodes if len(f0s)>1 and f0s[1]==t.frontiers[1].node_id() and len(f1s)>1 and f1s[1]==t.frontiers[0].node_id(): ge.queue_node_deletion(t.internals[0].node_id()) return True return False def end_to_end_solution(p,fnids): for i in range(len(p)): if -p[i] in fnids: for j in range(i+1,len(p)): if p[j] in fnids: if abs(p[i])<abs(p[j]): sol=p[i:j+1] else: sol=[-x for x in p[i:j+1][::-1]] #go through sol, if there's a single-option missing node, add it csol=[sol[0]] for n in sol[1:]: if n in ws.sdg.fw_reached_nodes(csol[-1],1): csol.append(n) elif n in ws.sdg.fw_reached_nodes(csol[-1],2): a=[] for skipped in ws.sdg.fw_reached_nodes(csol[-1],1): if n in ws.sdg.fw_reached_nodes(skipped,1): a.append(skipped) if len(a)==1: csol.append(a[0]) csol.append(n) else: return sol else: return sol return csol return [] return [] def solve_unclassified(t,s,ge): #first stride from all frontiers: fs={} fnids=[x.node_id() for x in t.frontiers] sols={x:[] for x in fnids} for f in t.frontiers: fs[-f.node_id()]=s.stride_out_in_order(-f.node_id()).nodes #for x in fs: print(x,fs[x]) ns={} for nv in t.internals: ns[nv.node_id()]=[-x for x in s.stride_out_in_order(-nv.node_id()).nodes[:1:-1]]+s.stride_out_in_order(nv.node_id()).nodes #for x in ns: print(x,ns[x]) sols=[] for x in list(fs.values())+list(ns.values()): sol=end_to_end_solution(x,fnids) if sol and sol not in sols: sols.append(sol) #for x in sols: print(x) if len(sols)==len(fnids)/2 and len(set([x[0] for x in sols]+[-x[-1] for x in sols]))==len(fnids): #print("UNCLASSIFIED SOLVED:") for x in sols: try: ps=SDG.SequenceDistanceGraphPath(ws.sdg,x).sequence() except: return False for x in sols: ge.queue_path_detachment(x,True) for x in t.internals: ge.queue_node_deletion(x.node_id()) return True return False def solve_all_tangles(ws,fsize=220,fminkci=-1,fmaxkci=-1,apply=False): total=solved=unsolved=0 s=SDG.Strider(ws) s.add_datastore(peds) ge=SDG.GraphEditor(ws) solved=Counter() unsolved=Counter() for t in ws.sdg.get_all_tangleviews(fsize,fminkci,fmaxkci): tc=classify_tangle(t) if tc=="tip": #TODO add tip to reconnection list if solve_tip(t,s,ge): solved[tc]+=1 else: unsolved[tc]+=1 elif tc=="bubble": if solve_bubble(t,s,ge): solved[tc]+=1 else: unsolved[tc]+=1 elif tc[:6]=="repeat": if solve_repeat(t,s,ge): solved[tc]+=1 else: unsolved[tc]+=1 else: if len(t.internals)<100 and len(t.frontiers)<20 and solve_unclassified(t,s,ge): solved[tc]+=1 else: unsolved[tc]+=1 total+=1 print("Total tangles: %d " % (total) ) print("Solved: ",solved.most_common()) print("Unsolved: ",unsolved.most_common()) if apply: ge.apply_all() ws.sdg.join_all_unitigs() def classify_tangle(t): if len(t.frontiers)==0: return "debris" if len(t.frontiers)==1: nids=set([x.node_id() for x in t.internals]) seen_nids=set() nexts=[t.frontiers[0].rc()] abort=False while nexts and not abort: new_nexts=[] for nv in nexts: for nnl in nv.next(): if abs(nnl.node().node_id()) not in nids: abort=True break if nnl.node().node_id() not in seen_nids: seen_nids.add(nnl.node().node_id()) new_nexts.append(nnl.node()) if abort: break nexts=new_nexts if not abort: return "complex tip" if len(t.internals)==1: nv=t.internals[0] if len(t.frontiers)==2 and len(nv.prev())+len(nv.next())==1: return "tip" if len(t.frontiers)==2*len(nv.prev()) and len(nv.prev())==len(nv.next()) and len(set([abs(nv.node_id())]+[abs(x.node().node_id()) for x in nv.prev()+nv.next()]))==len(nv.prev())*2+1: return "repeat %d:%d"%(len(nv.prev()),len(nv.prev())) if len(t.internals)==2 and len(t.frontiers)==2: i0p=t.internals[0].parallels() if len(i0p)==1 and i0p[0]==t.internals[1]: return "bubble" else: i0n=t.internals[0].next() i0p=t.internals[0].prev() i1n=t.internals[1].next() i1p=t.internals[1].prev() if len(i0n)==2 and len(i0p)==2 and len(i1n)==1 and len(i1p)==1 and i1n[0].node()==i1p[0].node(): return "loop" if len(i1n)==2 and len(i1p)==2 and len(i0n)==1 and len(i0p)==1 and i0n[0].node()==i0p[0].node(): return "loop" return "unclassified" #run tangle functions print(ws.sdg.stats_by_kci()) solve_all_tangles(ws,124,apply=True) kc.update_graph_counts() kc.compute_all_kcis() print(ws.sdg.stats_by_kci()) ws.sdg.write_to_gfa1('after_tangles.gfa') ``` ```txt ----------------------------------------------------------------------------------- | KCI | Total bp | Nodes | Tips | N25 | N50 | N75 | |-------+---------------+----------+---------+------------+------------+------------| | None | 157818826 | 2151371 | 193616 | 83 | 69 | 64 | | < 0.5 | 83657359 | 434763 | 112333 | 456 | 211 | 125 | | ~ 1 | 1102123465 | 523808 | 99012 | 21834 | 11747 | 5431 | | ~ 2 | 6800703 | 41891 | 415 | 250 | 135 | 125 | | ~ 3 | 1546314 | 10606 | 70 | 177 | 125 | 119 | | > 3.5 | 1320335 | 10122 | 115 | 133 | 125 | 114 | |-------+---------------+----------+---------+------------+------------+------------| | All | 1353267002 | 3172561 | 405561 | 18748 | 8703 | 1300 | ----------------------------------------------------------------------------------- ----------------------------------------------------------------------------------- | KCI | Total bp | Nodes | Tips | N25 | N50 | N75 | |-------+---------------+----------+---------+------------+------------+------------| | None | 155576697 | 2124766 | 191681 | 82 | 69 | 64 | | < 0.5 | 83017277 | 412021 | 111875 | 514 | 232 | 125 | | ~ 1 | 1101463260 | 510162 | 98823 | 22033 | 11867 | 5501 | | ~ 2 | 6750984 | 41241 | 410 | 257 | 136 | 125 | | ~ 3 | 1537311 | 10472 | 68 | 181 | 125 | 119 | | > 3.5 | 1315451 | 10015 | 103 | 134 | 125 | 114 | |-------+---------------+----------+---------+------------+------------+------------| | All | 1349660980 | 3108677 | 402960 | 18979 | 8836 | 1417 | ----------------------------------------------------------------------------------- ##