# Assembling diploid diatoms from Illumina + nanopore

## The strategy

- Use Illumina reads to build a k=63 DBG, with minimal cleanup.
- Run Strider with the Illumina reads to obtain long KCI=1 unitigs.
- Resolve simple repeats with long reads to obtain even longer unitigs.
- Turn the unitigs into linked chains of unique k=31 anchors and map the long reads.
- Challenge each unitig (i.e. split uncertain unitigs) by the linkage among its anchors.
- Join unitigs, using the HappySorter between them.
- Go back to the k=63 DBG and untangle it using the chained anchors.

## Starting graph, cleanup, KCI

Build the k=63 DBG from the paired-end reads, clip short tips, and add a PE k-mer count that will drive the KCI computations:

```python=
import SDGpython as SDG
from collections import Counter

K=63
MIN_CVG=5
NUM_BATCHES=4

ws = SDG.WorkSpace()
peds = ws.add_paired_reads_datastore('pt_pe.prseq')
lords = ws.add_long_reads_datastore('pt_nano.loseq')
SDG.GraphMaker(ws.sdg).new_graph_from_paired_datastore(peds,K,MIN_CVG,NUM_BATCHES)
gc=SDG.GraphContigger(ws)
gc.clip_tips(200)
kc=ws.add_kmer_counter("main")
kc.add_count("pe",peds)
```

Plot the PE k-mer spectrum to locate the coverage peak; the reference lines mark 40x, 71x and 102x:

```python=
%pylab inline
plot(kc.count_spectra("pe",200)[:-1])
vlines([40,71,102],0,600000)
```

![PE k-mer spectrum](https://i.imgur.com/QadcSbY.png)
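The 71x peak above was picked by eye from the plot. As a minimal sketch of locating it programmatically, `find_spectrum_peak` below is a hypothetical helper (not part of SDG), assuming error k-mers only dominate the lowest frequencies:

```python=
# hypothetical helper, not part of SDG: take the argmax of the spectrum
# after skipping the low-frequency error region
def find_spectrum_peak(spectrum, min_f=20):
    return max(range(min_f, len(spectrum)), key=lambda f: spectrum[f])

print(find_spectrum_peak(kc.count_spectra("pe",200)[:-1]))   # ~71 here
```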
With the peak located, pop small bubbles where one side contains low-coverage k-mers and its parallel side does not:

```python=
BUBBLE_SIZE=125
LOW_CVG=50

to_delete=[]
for nv in ws.sdg.get_all_nodeviews():
    if nv.size()<=BUBBLE_SIZE and len(nv.parallels())==1:
        on=nv.parallels()[0]
        if on.size()<=BUBBLE_SIZE and abs(on.node_id())>nv.node_id():
            nvlc=len([x for x in nv.kmer_coverage('main','pe') if x<=LOW_CVG])
            onlc=len([x for x in on.kmer_coverage('main','pe') if x<=LOW_CVG])
            if nvlc and not onlc:
                #print('del',nv,nvlc,on,onlc)
                to_delete.append(nv.node_id())
            if onlc and not nvlc:
                #print('del',on,onlc,nv,nvlc)
                to_delete.append(abs(on.node_id()))
for x in to_delete:
    ws.sdg.remove_node(x)
ws.sdg.join_all_unitigs()
```

Set the KCI peak at 71x and summarise the graph by KCI:

```python=
kc.set_kci_peak(71)
kc.compute_all_kcis()
print(ws.sdg.stats_by_kci())
```

| KCI | Total bp | Nodes | Tips | N25 | N50 | N75 |
|-------|--------------:|---------:|--------:|-----------:|-----------:|-----------:|
| None | 8323282 | 109486 | 0 | 88 | 75 | 67 |
| < 0.5 | 258694 | 1662 | 117 | 214 | 125 | 125 |
| ~ 1 | 36953626 | 216237 | 174 | 233 | 159 | 125 |
| ~ 2 | 16625872 | 67265 | 40 | 494 | 285 | 171 |
| ~ 3 | 1390179 | 6879 | 2 | 357 | 210 | 140 |
| > 3.5 | 602563 | 3017 | 2 | 2842 | 163 | 124 |
| **All** | **64154216** | **404546** | **335** | **279** | **163** | **125** |

Dump the clean graph and the PE read paths:

```python=
ws.dump('pt_dbg_clean.sdgws')
ws.sdg.write_to_gfa1('pt_dbg_clean.gfa')
peds.mapper.path_reads()
peds.mapper.dump_readpaths('pt_dbg_clean.pe_paths')
```

## Strider

Reload the clean graph and its PE paths, recompute the KCIs, and stride from the anchors:

```python=
import SDGpython as SDG
from collections import Counter

ws=SDG.WorkSpace('pt_dbg_clean.sdgws')
peds=ws.get_paired_reads_datastore('pt_pe')
peds.mapper.load_readpaths('pt_dbg_clean.pe_paths')
kc=ws.get_kmer_counter('main')
kc.set_kci_peak(71)
kc.update_graph_counts()
kc.compute_all_kcis()
s=SDG.Strider(ws)
s.add_datastore(peds)
s.stride_from_anchors(min_size=1)
```

The routine below mirrors Strider's C++ run: it detaches paths between linear anchors when the forward and backward routes confirm each other, then deletes the used-up tips. It is followed by finishing-off heuristics (a crude one-round loop resolver and removal of short, low-information nodes) and a final striding pass:

```python=
def strider_run_from_cpp():
    linear_anchors=set()
    for nv in ws.sdg.get_all_nodeviews():
        nid=nv.node_id()
        if s.is_anchor[nid] and len(s.routes_fw[nid])>1 and len(s.routes_bw[nid])>1:
            linear_anchors.add(nid)
    print("%d anchors are linear" % len(linear_anchors))
    ge=SDG.GraphEditor(ws)
    used_nodes=[]
    paths=0
    accepted=0
    rejected=0
    nopaths=0
    for fnid in list(linear_anchors):
        for nid in [fnid,-fnid]:
            #print("\nNode",nid)
            if nid>0:
                fp=s.routes_fw[nid][1:]
            else:
                fp=s.routes_bw[-nid][1:]
            #print("FW",fp)
            p=[]
            for x in fp:
                if abs(x) in linear_anchors:
                    #cut the route at the first linear anchor, then check the reciprocal route
                    p=fp[:fp.index(x)+1]
                    #print(p)
                    if x<0:
                        op=[-x for x in s.routes_fw[-x][1:][::-1]]
                    else:
                        op=[-x for x in s.routes_bw[x][1:][::-1]]
                    #print(op)
                    #print(p[:-1],"==",op[op.index(nid):],"??")
                    if nid not in op or p[:-1]!=op[op.index(nid)+1:]:
                        p=[]
                    break
            #print(p)
            if p and abs(p[-1])>abs(nid):
                try:
                    SDG.SequenceDistanceGraphPath(ws.sdg,[nid]+p).sequence()
                    #print([nid]+p)
                    paths+=1
                    if ge.queue_path_detachment([nid]+p,True):
                        accepted+=1
                    else:
                        rejected+=1
                except:
                    nopaths+=1
                for r in p[:-1]:
                    used_nodes.append(abs(r))
    print("%d paths (%d accepted, %d rejected) and %d nopaths"%(paths,accepted,rejected,nopaths))
    ge.apply_all()
    #delete "used up" tips
    for r in range(10):
        ge=SDG.GraphEditor(ws)
        for x in used_nodes:
            try:
                nv=ws.sdg.get_nodeview(x)
                if len(nv.prev())==0 or len(nv.next())==0:
                    ge.queue_node_deletion(x)
            except:
                pass
        ge.apply_all()
        ws.sdg.join_all_unitigs()
    ge.remove_small_components(10,1000,10000)

### Finishing-off the assembly: typical heuristics that can be more aggressive when things already look fine.

#an incredibly crude loop resolver, only supporting one round across the loop:
kc.update_graph_counts()
peds.mapper.path_reads()
print(len(ws.sdg.get_all_nodeviews()))
ge=SDG.GraphEditor(ws)
for nv in ws.sdg.get_all_nodeviews():
    #look for a node whose single predecessor and single successor are the same repeat node r
    if len(nv.prev())!=1 or len(nv.next())!=1 or nv.prev()[0].node().node_id()!=nv.next()[0].node().node_id():
        continue
    r=nv.next()[0].node().node_id()
    orn=[x.node().node_id() for x in nv.next()[0].node().next() if x.node().node_id()!=nv.node_id()]
    if len(orn)!=1: continue
    orn=orn[0]
    orp=[x.node().node_id() for x in nv.next()[0].node().prev() if x.node().node_id()!=nv.node_id()]
    if len(orp)!=1: continue
    orp=orp[0]
    print("Loop on %d -> %d -> %d -> %d -> %d"%(orp,r,nv.node_id(),r,orn))
    c=Counter()
    nc=Counter()
    nid=nv.node_id()
    for rid in peds.mapper.paths_in_node[abs(nid)]:
        if nid<0: rid=-rid
        c.update([tuple(peds.mapper.read_paths[abs(rid)].path)])
        nc.update(peds.mapper.path_fw(rid,nid))
    #for x in c.most_common(): print(x)
    #print()
    #for x in nc.most_common(): print(x)
    if nid not in nc and orn in nc:
        print("loop goes around just once!")
        ge.queue_node_expansion(r,[[-orp,nid],[-nid,orn]])
ge.apply_all()
ws.sdg.join_all_unitigs()
print(len(ws.sdg.get_all_nodeviews()))

## Short, low information node removal
from statistics import median
kc.update_graph_counts()
#peds.mapper.path_reads()
removed=0
for nv in ws.sdg.get_all_nodeviews():
    if nv.size()>1000: continue
    rc=nv.kmer_coverage("main","pe")
    gc=nv.kmer_coverage("main","sdg")
    ukci=[]
    skci=[]
    for i in range(len(rc)):
        if gc[i]==1:
            ukci.append(rc[i])
        else:
            skci.append(rc[i]/gc[i])
    if ukci==[]: ukci=[-1]
    if skci==[]: skci=[-1]
    ms=median(skci)
    mu=median(ukci)
    #print(nv.node_id(),)
    if mu>-1 and mu<=4 and ms>5*mu:
        #print("Node %d (%d bp), shared_coverage=%0.2f, unique_coverage=%0.2f"%(nv.node_id(),nv.size(),ms,mu))
        removed+=1
        ws.sdg.remove_node(nv.node_id())
print("%d low coverage, low information nodes removed"%removed)
ws.sdg.join_all_unitigs()

strider_run_from_cpp()
```

```python=
ws.dump('pt_dbg_strided.sdgws')
ws.sdg.write_to_gfa1('pt_dbg_strided.gfa')
kc.update_graph_counts()
kc.compute_all_kcis()
print(ws.sdg.stats_by_kci())
```

| KCI | Total bp | Nodes | Tips | N25 | N50 | N75 |
|-------|--------------:|---------:|--------:|-----------:|-----------:|-----------:|
| None | 3708227 | 44929 | 76 | 100 | 79 | 68 |
| < 0.5 | 246947 | 1195 | 31 | 2074 | 125 | 125 |
| ~ 1 | 45776902 | 14833 | 178 | 14623 | 8037 | 3792 |
| ~ 2 | 4757366 | 7861 | 37 | 1321 | 814 | 609 |
| ~ 3 | 331153 | 1180 | 2 | 733 | 499 | 163 |
| > 3.5 | 400473 | 1419 | 2 | 39651 | 409 | 128 |
| **All** | **55221068** | **71417** | **326** | **13151** | **6310** | **1933** |

## Solve canonical repeats with long reads

Map the long reads onto the strided graph and thread them into a read-threads graph:

```python=
import SDGpython as SDG
from collections import Counter

ws=SDG.WorkSpace('pt_dbg_strided.sdgws')
kc=ws.get_kmer_counter('main')
kc.set_kci_peak(71)
lords=ws.get_long_reads_datastore('pt_nano')
lrr=SDG.LongReadsRecruiter(ws.sdg,lords)
lrr.map()
lrr.simple_thread_reads()
rtg=lrr.rtg_from_threads()
```

A canonical repeat is a node with exactly two predecessors and two successors, all five nodes distinct. `solve_with_llr2` has the long-read threads vote for the AA/BB or AB/BA pairing of the repeat's sides, and only returns an expansion when support is high and noise is low:

```python=
def is_canonical_repeat(nid):
    nv=ws.sdg.get_nodeview(nid)
    if len(nv.next())!=2 or len(nv.prev())!=2 or len(set([abs(x) for x in [y.node().node_id() for y in nv.next()+nv.prev()]+[abs(nid)]]))!=5:
        return False
    return True

def solve_with_llr2(nid,min_support=5,max_noise=2,snr=3,verbose=False):
    nA,nB=[x.node().node_id() for x in ws.sdg.get_nodeview(nid).next()]
    pA,pB=[x.node().node_id() for x in ws.sdg.get_nodeview(nid).prev()]
    tAA=0
    tAB=0
    tBA=0
    tBB=0
    all_voting_tids=set(rtg.node_threads(pA).union(rtg.node_threads(pB))).intersection(set(rtg.node_threads(nA).union(rtg.node_threads(nB))))
    for tid in all_voting_tids:
        apA,apB,anA,anB=abs(pA),abs(pB),abs(nA),abs(nB)
        c=Counter([abs(x.node) for x in lrr.read_perfect_matches[tid] if abs(x.node) in [apA,apB,anA,anB]])
        if c[apA]>c[apB]*snr:
            if c[anA]>c[anB]*snr: tAA+=1
            elif c[anB]>c[anA]*snr: tAB+=1
        elif c[apB]>c[apA]*snr:
            if c[anA]>c[anB]*snr: tBA+=1
            elif c[anB]>c[anA]*snr: tBB+=1
    if verbose: print(nid,": ",tAA,tBB,tAB,tBA)
    if min(tAA,tBB)>=min_support and max(tAB,tBA)<=max_noise and min(tAA,tBB)>=max(tAB,tBA)*snr:
        return [(-pA,nA),(-pB,nB)]
    if min(tAB,tBA)>=min_support and max(tAA,tBB)<=max_noise and min(tAB,tBA)>=max(tAA,tBB)*snr:
        return [(-pA,nB),(-pB,nA)]
    return []
```
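Before editing the graph, it can help to eyeball the vote counts on a few repeats; `solve_with_llr2` already takes a `verbose` flag, so a quick check only needs the functions defined above:

```python=
# print the AA/BB vs AB/BA thread votes for the first few canonical repeats
shown=0
for nv in ws.sdg.get_all_nodeviews():
    if is_canonical_repeat(nv.node_id()):
        solve_with_llr2(nv.node_id(),verbose=True)
        shown+=1
        if shown==5: break
```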
Queue an expansion for every solvable canonical repeat, apply the edits and re-join unitigs:

```python=
ge=SDG.GraphEditor(ws)
total_cr=0
solved_cr=0
for x in ws.sdg.get_all_nodeviews():
    nid=x.node_id()
    if is_canonical_repeat(nid):
        total_cr+=1
        sol=solve_with_llr2(nid)
        if sol!=[]:
            ge.queue_node_expansion(nid,sol)
            solved_cr+=1
print("%d / %d canonical repeats solved"%(solved_cr,total_cr))
ge.apply_all()
ws.sdg.join_all_unitigs()
```

```python=
kc.update_graph_counts()
print(ws.sdg.stats_by_kci())
```

| KCI | Total bp | Nodes | Tips | N25 | N50 | N75 |
|-------|--------------:|---------:|--------:|-----------:|-----------:|-----------:|
| None | 3689367 | 44694 | 76 | 100 | 79 | 68 |
| < 0.5 | 319624 | 1126 | 31 | 11296 | 4965 | 125 |
| ~ 1 | 50646430 | 7203 | 177 | 58666 | 35637 | 17486 |
| ~ 2 | 1996338 | 3650 | 37 | 15936 | 1421 | 545 |
| ~ 3 | 190553 | 891 | 2 | 564 | 223 | 125 |
| > 3.5 | 378439 | 1302 | 2 | 39651 | 469 | 128 |
| **All** | **57220751** | **58866** | **325** | **55879** | **31091** | **11469** |

```python=
ws.dump('pt_dbg_strided_lrrep.sdgws')
ws.sdg.write_to_gfa1('pt_dbg_strided_lrrep.gfa')
```

## Anchors and long read mapping

Reduce the graph to anchor contigs of unique k-mers in a second workspace:

```python=
import SDGpython as SDG
from collections import Counter

ws=SDG.WorkSpace('pt_dbg_strided_lrrep.sdgws')
kc=ws.get_kmer_counter('main')
kc.set_kci_peak(71)
ws2=SDG.WorkSpace()
SDG.GraphContigger(ws).contig_reduction_to_unique_kmers(ws2,"main","pe",50,92,100)
```

Map both read sets onto the anchor graph at k=31:

```python=
peds = ws2.add_paired_reads_datastore('pt_pe.prseq')
peds.mapper.path_reads(k=31)
lords = ws2.add_long_reads_datastore('pt_nano.loseq')
lrr=SDG.LongReadsRecruiter(ws2.sdg,lords,k=31)
lrr.map()
```

```python=
ws2.dump('pt_anchors.sdgws')
peds.mapper.dump_readpaths('pt_anchors.pe_paths')
lrr.dump('pt_anchors.lrr')
```

## HappySorter from anchor lines (i.e. unitigs)

Reload the anchor workspace, thread the long reads, and pop repeat nodes from the read-threads graph:

```python=
import SDGpython as SDG
from collections import Counter

ws=SDG.WorkSpace('pt_anchors.sdgws')
peds = ws.get_paired_reads_datastore('pt_pe')
peds.mapper.load_readpaths('pt_anchors.pe_paths')
lords = ws.get_long_reads_datastore('pt_nano')
lrr=SDG.LongReadsRecruiter(ws.sdg,lords,k=31)
lrr.load('pt_anchors.lrr')
lrr.simple_thread_reads()
rtg=lrr.rtg_from_threads(True)
rtg.apply_popping_list(rtg.clean_repeat_nodes_popping_list(200))
```

Run the HappySorter from the graph's lines and dump the orders:

```python=
hsr=SDG.HappySorterRunner(rtg)
hsr.run_from_dg_lines(ws.sdg,50)
hsr.dump("hsr_50.orders")
```

Turn each order into a thread of a new thread graph:

```python=
otg=SDG.ReadThreadsGraph(ws.sdg)
for i,o in hsr.orders.items():
    otg.add_thread(i+1,o.as_thread(ws.sdg))
otg.dump('otg.rtg')
```

## Create a thread graph from orders, pop repeated/conflicted nodes and recover connectivity across disconnected ends

This is just to get a rough idea of the resulting linkage:

```python=
lu=SDG.LinkageUntangler(otg)
lu.select_all()
slotg=lu.make_nextselected_linkage(1)
```
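To inspect that rough linkage visually (e.g. in Bandage), it can be written out as GFA; this assumes `slotg` exposes the same `write_to_gfa1` call used on `ws.sdg` above:

```python=
# dump the next-selected linkage for visual inspection;
# assumes the linkage graph shares write_to_gfa1 with the SDG as used earlier
slotg.write_to_gfa1('pt_slotg.gfa')
```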
## Plotting order growth gifs

```python=
#to check the orders from the sorters: map reference long reads and record,
#per node, its (read id, read position) hits
rlords=ws.add_long_reads_datastore('SC5314.loseq')
rlrr=SDG.LongReadsRecruiter(ws.sdg,rlords,k=31)
rlrr.map()
node_coords={x.node_id(): [] for x in ws.sdg.get_all_nodeviews()}
for cid,rpm in enumerate(rlrr.read_perfect_matches):
    for m in rpm:
        node_coords[abs(m.node)].append((cid,m.read_position))

%pylab inline
import imageio,os

def plot_order(order,old_coordinates={},_ylim=None):
    #order coordinates vs. reference read positions: unchanged nodes in blue,
    #new nodes in green, moved nodes in red
    xval=[[] for x in rlrr.read_perfect_matches]
    yval=[[] for x in rlrr.read_perfect_matches]
    new_xval=[[] for x in rlrr.read_perfect_matches]
    new_yval=[[] for x in rlrr.read_perfect_matches]
    updated_xval=[[] for x in rlrr.read_perfect_matches]
    updated_yval=[[] for x in rlrr.read_perfect_matches]
    for ix,x in enumerate(order.as_signed_nodes(),start=1):
        for nc in node_coords[abs(x)]:
            if x not in old_coordinates:
                new_xval[nc[0]].append(order.node_coordinates[x])
                new_yval[nc[0]].append(nc[1])
            elif old_coordinates[x]!=order.node_coordinates[x]:
                updated_xval[nc[0]].append(order.node_coordinates[x])
                updated_yval[nc[0]].append(nc[1])
            else:
                xval[nc[0]].append(order.node_coordinates[x])
                yval[nc[0]].append(nc[1])
    #print([len(x) for x in xval])
    MIN_HITS_TO_DRAW=5
    w=int(ceil(sqrt(len([i for i in range(len(xval)) if len(xval[i])+len(new_xval[i])+len(updated_xval[i])>=MIN_HITS_TO_DRAW]))))
    f,ax=plt.subplots(w,w,figsize=(10,10))
    used_i=-1
    for cid in range(1,len(xval)):
        xv=xval[cid]
        yv=yval[cid]
        nxv=new_xval[cid]
        nyv=new_yval[cid]
        uxv=updated_xval[cid]
        uyv=updated_yval[cid]
        if len(xv)+len(nxv)+len(uxv)<MIN_HITS_TO_DRAW: continue
        used_i+=1
        #print(f"w={w} used_i={used_i}",flush=True)
        if (w>1):
            a=ax[used_i//w][used_i%w]
        else:
            a=ax
        a.plot(yv,xv,'.',color='blue')
        a.plot(nyv,nxv,'.',color='green')
        a.plot(uyv,uxv,'.',color='red')
        a.title.set_text('Chr #%d'%(cid))
        if _ylim: a.set_ylim(_ylim)
        a.set_xlim(0,rlords.get_read_size(cid))

def animate_order_growth(start_line,_ylim=None):
    # frames between transitions
    n_frames = 3
    print("Creating animation of an order's growth",flush=True)
    hs=SDG.HappySorter(rtg)
    hs.run_from_nodelist(start_line,10,.3,4,6,1,True)
    last_coords=hs.order.node_coordinates
    plot_order(hs.order,last_coords,_ylim)
    suptitle("Order growth step 1")
    savefig('ordergrowth_0.png')
    close()
    index=0
    while hs.q_grow_loop(3,.7,4,6,1,True):
        index+=1
        plot_order(hs.order,last_coords,_ylim)
        suptitle(f"Order growth step {index+1}")
        savefig(f'ordergrowth_{index}.png')
        close()
        last_coords=hs.order.node_coordinates
    print('Charts saved\n',flush=True)
    # Build GIF, holding each step for n_frames and the final step for longer
    print('Creating gif\n',flush=True)
    images_slow=[]
    for x in range(index+1):
        for f in range(n_frames):
            images_slow.append(imageio.imread(f'ordergrowth_{x}.png'))
    for f in range(2*n_frames):
        images_slow.append(imageio.imread(f'ordergrowth_{index}.png'))
    imageio.mimsave('ordergrowth.gif', images_slow, 'GIF-FI')
    print('Gif saved\n',flush=True)
    print('Removing images\n',flush=True)
    # Remove the intermediate frame files
    for x in range(index+1):
        os.remove(f'ordergrowth_{x}.png')
    print('DONE',flush=True)
```
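As a usage sketch, the animation can be seeded with the start of one of the orders computed earlier; the choice of order and the slice length of 10 are arbitrary, and this assumes `hsr` from the previous section is still in memory:

```python=
# seed the growth animation with the first 10 signed nodes of an arbitrary order
seed = next(iter(hsr.orders.values())).as_signed_nodes()[:10]
animate_order_growth(seed)
```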
## Careful grow_fw node placement prototype

- Uses only threads with a minimum number of hits to the end nodes.
- Uses local offsets to place nodes in a coordinate system synced to the order.
- Offsets are updated every time a node at the end of the order is crossed (other nodes with order coordinates do not update them).
- All of this localises the problem to individual threads and nodes, reducing the effect of past mistakes in the order.
- This function still requires happiness, which still requires thread recruitment into the order; that part is not yet fully replaced.
- There is no similar function for internal recruitment, so growth may eventually get stuck and need a single iteration of the standard grow loop to add nodes in lower-density regions. A usage sketch follows the function below.

```python=
def careful_grow(hs,end_size=30,thread_hits=4,node_hits=3,min_happiness=.7):
    #1 find threads that have hits to the end, cut only the relevant part of each (from its first hit onwards)
    end_nodes=hs.order.as_signed_nodes()[-end_size:]
    end_positions=[hs.order.node_coordinates[n] for n in end_nodes]
    tc=Counter()
    for nid in end_nodes:
        tc.update(rtg.node_threads(nid,True))
    tids=[t for t,c in tc.items() if c>=thread_hits]
    cut_threads=[]
    for tid in tids:
        t=rtg.get_thread(tid)
        #TODO: remove threads that do not follow order
        ct=[]
        for np in t:
            if ct or np.node in end_nodes:
                ct.append(np)
        if ct:
            cut_threads.append(ct)
    #2 find nodes with enough hits
    nc=Counter()
    for ct in cut_threads:
        #print([(x.start,x.node) for x in ct])
        nc.update([x.node for x in ct])
    candidates=[x for x,c in nc.items() if c>=node_hits and hs.node_happiness(x,True,False)>=min_happiness]
    #print(candidates)
    #for x in candidates:
    #    print(x,x in end_nodes,hs.node_happiness(x,True,False))
    #3 keep only order nodes and nodes with enough hits in threads, sync thread coordinates to order coordinates (fw only)
    npos={}
    for ct in cut_threads:
        offset=hs.order.node_coordinates[ct[0].node]-ct[0].start
        for i in range(len(ct)):
            if ct[i].node in end_nodes:
                offset=hs.order.node_coordinates[ct[i].node]-ct[i].start
            elif ct[i].node not in candidates:
                continue
            ct[i].start+=offset
            ct[i].end+=offset
            if ct[i].node not in hs.order.node_coordinates:
                if ct[i].node not in npos:
                    npos[ct[i].node]=[]
                npos[ct[i].node].append(ct[i].start)
    #for ct in cut_threads:
    #    print([(x.start,x.node) for x in ct if x.node not in end_nodes])
    print("First/last coordinate of order end:",hs.order.node_coordinates[end_nodes[0]],hs.order.node_coordinates[end_nodes[-1]])
    pnodes=[]
    for x in npos.items():
        sp=sorted(x[1])
        #print(x[0],sp[-1]-sp[0],len(sp),sp[len(sp)//2])
        pnodes.append((x[0],sp[len(sp)//2]))   #place each node at the median of its synced starts
    return pnodes
```
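`careful_grow` only proposes placements; it does not modify the order. A minimal driver sketch under the same session as the sections above, which just prints the proposals (the actual insertion step is omitted because the order-editing API is not shown in this write-up):

```python=
# propose forward placements from a freshly seeded sorter and print them;
# `seed` is the placeholder node list from the animation example above
hs=SDG.HappySorter(rtg)
hs.run_from_nodelist(seed,10,.3,4,6,1,True)
for nid,coord in careful_grow(hs):
    print("candidate node %d at order coordinate %d"%(nid,coord))
```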