# BASF
## Pacbio Scaffolding
Approach:
1 - Map reads, thread reads via 2+ matches.
2 - Build a dg from the threads.
3 - Anchors on nodes >500bp with kci<1.5, nextselected linkage.
4 - Linearise by reconnecting tips, bubbles and simple transitions. BW6_pbthreads.gfa -> 143Kbp N50.
5 - Iterate over disconnected anchor nodes, checking whether they connect to lines (at their ends or in gaps, through either PE or LR).
6 - Reconnect disconnected lines (but mark their unions as "dangerous").
7 - Line to sequence:
7.a - fill with non-anchor nodes
7.b - check for overlaps on non-dangerous joins
7.c - strider on non-dangerous joins
7.d - put Ns in non-dangerous joins
7.e - put ns in dangerous joins
```python
import SDGpython as SDG
ws = SDG.WorkSpace('basf.pb.pe.kc.rf.sdgws')
print ("Calculating/loading KCI")
kc=ws.get_kmer_counter("all_pcrf_reads")
kc.load_cache('kc.dump')
print("Load PE mappings")
peds = ws.get_paired_reads_datastore('ei_pe')
peds.mapper.load_readpaths('pe_read_paths_k63.dump')
# Create PE strider
spe=SDG.Strider(ws)
spe.add_datastore(peds)
########################
# run lrr on pacbio reads
print ("Loading mapped PB reads and recruiting")
lords = ws.get_long_reads_datastore('pacbio')
k=31
lrr=SDG.LongReadsRecruiter(ws.sdg,lords,k,50)
lrr.load("lrr_pb_k{0}_all_v3_r3.dump".format(k))
# long read strider
print("Run LRR")
s=SDG.Strider(ws)
s.add_datastore(lrr)
s.link_from_anchors(1000,.75,1.25,10000,2,2,0)
# min_size, min_kci, max_kci, distance, min_reads, group_size, small_node_size
def lines_stats(dg,min_size=1000):
used_nodes=set()
line_sizes=[]
for nv in dg.get_all_nodeviews():
n=nv
if n.node_id() in used_nodes: continue
line_nodes=set()
while n.node_id() not in line_nodes and len(n.prev())==1 and len(n.prev()[0].node().next())==1:
line_nodes.add(n.node_id())
n=n.prev()[0].node()
linebp=n.size()
used_nodes.add(abs(n.node_id()))
line_nodes=set()
while n.node_id() not in line_nodes and len(n.next())==1 and len(n.next()[0].node().prev())==1:
line_nodes.add(n.node_id())
linebp+=n.next()[0].distance()
n=n.next()[0].node()
linebp+=n.size()
used_nodes.add(abs(n.node_id()))
if linebp>=min_size:
line_sizes.append(linebp)
line_sizes.sort(reverse=True)
total_bp=sum(line_sizes)
print("Total lines size %d bp\n"%total_bp)
next_stop=0
acc=0
for count,size in enumerate(line_sizes,1):
acc+=size
if acc>=next_stop*total_bp/100:
print("N%d: %d bp at line # %d"%(next_stop,size,count))
next_stop+=10
from collections import Counter
#recruit on 2 matches (for later), create threads on 2 matches, make graph
print("recruiting")
lrr.recruit(1,2)
print("threading reads")
lrr.thread_reads(20000,2)
print("Constructing dg from threads")
trmldg2m=lrr.dg_from_threads(True)
#getting rid of all links not supported by at least two threads, self and big-overlap links, and links to inversions of better links
print("Cleaning links")
for nv in trmldg2m.get_all_nodeviews(both_directions=True):
for l in nv.next():
if l.distance()<-500 or abs(l.node().node_id())==abs(nv.node_id()):
trmldg2m.remove_link(-nv.node_id(),l.node().node_id(),l.distance(),l.support())
onf=Counter([x.node().node_id() for x in nv.next()])
for onid in onf.keys():
if onf[onid]<2: trmldg2m.remove_link(-nv.node_id(),onid)
if onf[onid]<onf[-onid]: trmldg2m.remove_link(-nv.node_id(),onid)
print("Untangling linkage between anchors")
#Select anchor nodes by size and kci, create graph
lu=SDG.LinkageUntangler(trmldg2m)
for nv in trmldg2m.get_all_nodeviews(include_disconnected=False):
if nv.size()>500 and nv.kci()<1.5:
lu.selected_nodes[nv.node_id()]=True
trnsl=lu.make_nextselected_linkage(2)
trnsl.remove_transitive_links(3)
trnsl.write_to_gfa1('trnsl.gfa',selected_nodes=[x.node_id() for x in trnsl.get_all_nodeviews(include_disconnected=False)])
lines_stats(trnsl,1000)
# GRAPH linearisation heuristics
#find matches to other nodes between two nodes (in order) in the LR.
def nodes_matches_in_between(n1,n2):
tc=Counter()
for rid in set([abs(x) for x in lrr.node_reads[abs(n1)] + lrr.node_reads[abs(n2)]]):
fwg=False
c=Counter()
bwg=False
for pm in lrr.read_perfect_matches[rid]:
if pm.node==n1:
c=Counter()
fwg=True
bwg=False
elif pm.node==-n2:
c=Counter()
fwg=False
bwg=True
elif pm.node==-n1:
if bwg: tc+=c
bwg=fwg=False
c=Counter()
elif pm.node==n2:
if fwg: tc+=c
bwg=fwg=False
c=Counter()
elif fwg: c[pm.node]+=1
elif bwg: c[-pm.node]+=1
return tc
print("Solving simple tips")
# Solve simple tips, where the tip is connected to a node and comes before the other connection.
simple_tips=0
simple_tips_solved=0
for nv in trnsl.get_all_nodeviews(include_disconnected=False,both_directions=True):
if len(nv.next())==2:
if len(nv.next()[0].node().next())==0 and len(nv.next()[1].node().prev())==1 and nv.next()[0].node().size()+nv.next()[0].distance()<nv.next()[1].distance()+1000:
simple_tips+=1
nmb=nodes_matches_in_between(nv.node_id(),nv.next()[1].node().node_id())
if nmb[nv.next()[0].node().node_id()]>2:
simple_tips_solved+=1
trnsl.add_link(-nv.next()[0].node().node_id(),nv.next()[1].node().node_id(),nv.next()[1].distance()-nv.next()[0].node().size()-nv.next()[0].distance())
trnsl.remove_link(-nv.node_id(),nv.next()[1].node().node_id())
print("Simple tips:",simple_tips,simple_tips_solved)
trnsl.remove_transitive_links(3)
trnsl.write_to_gfa1('trnsl_tj.gfa',selected_nodes=[x.node_id() for x in trnsl.get_all_nodeviews(include_disconnected=False)])
lines_stats(trnsl,1000)
print("Solving simple bubbles")
#Solve simple bubbles created by two small nodes who connect to their big neighbours
simple_bubbles=0
simple_bubbles_solved=0
for nv in trnsl.get_all_nodeviews(include_disconnected=False,both_directions=True):
if len(nv.next())==2 and len(nv.next()[0].node().parallels())==1 and nv.next()[0].node().parallels()[0].node_id()==nv.next()[1].node().node_id():
simple_bubbles+=1
nmb=nodes_matches_in_between(nv.node_id(),nv.next()[1].node().node_id())
if nmb[nv.next()[0].node().node_id()]>2:
simple_bubbles_solved+=1
trnsl.add_link(-nv.next()[0].node().node_id(),nv.next()[1].node().node_id(),nv.next()[1].distance()-nv.next()[0].node().size()-nv.next()[0].distance())
trnsl.remove_link(-nv.next()[0].node().node_id(),nv.next()[1].node().next()[0].node().node_id())
trnsl.remove_link(-nv.node_id(),nv.next()[1].node().node_id())
print("Simple bubbles:",simple_bubbles,simple_bubbles_solved)
trnsl.remove_transitive_links(3)
trnsl.write_to_gfa1('trnsl_tj_bp.gfa',selected_nodes=[x.node_id() for x in trnsl.get_all_nodeviews(include_disconnected=False)])
lines_stats(trnsl,1000)
for i in range(3):
print("Solving simple transitions")
    # Solve simple transitions, where one successor node fits before the other connection.
simple_transitions=0
simple_transitions_solved=0
for nv in trnsl.get_all_nodeviews(include_disconnected=False,both_directions=True):
if len(nv.next())==2:
if len(nv.next()[1].node().prev())==1 and len(nv.next()[0].node().prev())==1 and nv.next()[0].node().size()+nv.next()[0].distance()<nv.next()[1].distance()+1000:
simple_transitions+=1
nmb=nodes_matches_in_between(nv.node_id(),nv.next()[1].node().node_id())
if nmb[nv.next()[0].node().node_id()]>2:
simple_transitions_solved+=1
trnsl.add_link(-nv.next()[0].node().node_id(),nv.next()[1].node().node_id(),nv.next()[1].distance()-nv.next()[0].node().size()-nv.next()[0].distance())
trnsl.remove_link(-nv.node_id(),nv.next()[1].node().node_id())
print("Simple transitions:",simple_transitions,simple_transitions_solved)
trnsl.remove_transitive_links(3)
trnsl.write_to_gfa1('BW6_pbthreads.gfa',selected_nodes=[x.node_id() for x in trnsl.get_all_nodeviews(include_disconnected=False)])
lines_stats(trnsl,1000)
def stride_between(s,n1,n2,verbose=False):
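    # Try four stride candidates: in-order and plain strides from n1 forwards and from -n2 backwards (reversed);
    # return the first candidate's node path from n1 to n2, or [] if none contains both in order.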
spnn=[s.stride_out_in_order(n1).nodes,[-x for x in s.stride_out_in_order(-n2).nodes[::-1]],s.stride_out(n1).nodes,[-x for x in s.stride_out(-n2).nodes[::-1]]]
for spn in spnn:
if verbose: print(spn)
if not (n1 in spn and n2 in spn): continue
in1=spn.index(n1)
in2=spn.index(n2)
if verbose: print(in1,in2)
if not in1<in2: continue
return spn[in1:in2+1]
return []
def line_to_sequence(dg,s,line,verbose=False):
seq=dg.get_nodeview(line[0]).sequence()
if verbose: print(len(seq))
strided=overlapped=jumped=disconnected=0
for i in range(len(line)-1):
n1=line[i]
nv1=dg.get_nodeview(n1)
n2=line[i+1]
nv2=dg.get_nodeview(n2)
if verbose: print("+",len(nv2.sequence()))
if dg.are_connected(-n1,n2):
d=[x.distance() for x in nv1.next() if x.node().node_id()==n2][0]
joined=False
if d<-60:
e1=nv1.sequence()[-200:]
e2=nv2.sequence()[:200]
for i in range (200,60,-1):
if e1[-i:]==e2[:i]:
if verbose: print("OVL ",i)
seq+=nv2.sequence()[i:]
joined=True
overlapped+=1
break
if not joined and d<1000:
spn=stride_between(s,n1,n2)
if spn!=[]:
strided+=1
seq+=SDG.SequenceDistanceGraphPath(dg.sdg, spn[1:]).sequence()[199:]
joined=True
if not joined:
if verbose: print("Jump ",d)
#TODO: try striding
seq+='N'*max(100,d)+nv2.sequence()
jumped+=1
else:
if verbose: print("Not connected ",d)
seq+='n'*1000+nv2.sequence()
disconnected+=1
if verbose: print(len(seq))
if verbose: print("strided",strided,"overlapped",overlapped,"jumped",jumped,"disconnected",disconnected)
return seq
lines=trnsl.get_all_lines(1,2000)
with open("BW6_pbthreads_lines.fasta","w") as f:
for i in range(len(lines)):
f.write(">line%d "%i+str(lines[i])+"\n")
f.write(line_to_sequence(trnsl,spe,lines[i])+"\n")
for i in range(3):
print("Solving multi-transitions")
    # Solve multi-transitions: like simple transitions, but allowing more than two successors.
simple_transitions=0
simple_transitions_solved=0
for nv in trnsl.get_all_nodeviews(include_disconnected=False,both_directions=True):
if len(nv.next())>=2:
if len(nv.next()[1].node().prev())==1 and len(nv.next()[0].node().prev())==1 and nv.next()[0].node().size()+nv.next()[0].distance()<nv.next()[1].distance()+1000:
simple_transitions+=1
nmb=nodes_matches_in_between(nv.node_id(),nv.next()[1].node().node_id())
if nmb[nv.next()[0].node().node_id()]>2:
simple_transitions_solved+=1
trnsl.add_link(-nv.next()[0].node().node_id(),nv.next()[1].node().node_id(),nv.next()[1].distance()-nv.next()[0].node().size()-nv.next()[0].distance())
trnsl.remove_link(-nv.node_id(),nv.next()[1].node().node_id())
print("Multi transitions:",simple_transitions,simple_transitions_solved)
trnsl.remove_transitive_links(3)
def count_connections(nv,n2):
return len([x for x in nv.next() if x.node().node_id()==n2])
print("Disconnecting low support crosses")
t=0
ec=0
for nv in trnsl.get_all_nodeviews(include_disconnected=False,both_directions=True):
if len(nv.prev())==1 and len(nv.next())==2:
nn0=nv.next()[0].node()
nn1=nv.next()[1].node()
if len(nn0.next())!=1 or len(nn1.next())!=1: continue
if len(nn0.prev())==2 and len(nn1.prev())==1:
nn1,nn0=nn0,nn1
if len(nn0.prev())==1 and len(nn1.prev())==2:
t+=1
o=[x.node().node_id() for x in nn1.prev() if x.node().node_id()!=nv.node_id()][0]
l1c= count_connections(trmldg2m.get_nodeview(nv.node_id()),nn0.node_id())
crossc= count_connections(trmldg2m.get_nodeview(nv.node_id()),nn1.node_id())
l2c= count_connections(trmldg2m.get_nodeview(o),nn1.node_id())
if l1c>6 and crossc<3 and l2c>6:
ec+=1
trnsl.remove_link(-nv.node_id(),nn1.node_id())
print("Low support crosses:",t,ec)
trnsl.remove_transitive_links(3)
print("Prunning tips with well-supported alternatives")
ec=0
for nv in trnsl.get_all_nodeviews(include_disconnected=False,both_directions=True):
if len(nv.prev())==1 and len(nv.next())>=2:
abort=False
to_keep=[]
to_delete=[]
for nl in nv.next():
if len(nl.node().prev())!=1:
abort=True
break
if len(nl.node().next())>0 and count_connections(trmldg2m.get_nodeview(nv.node_id()),nl.node().node_id())>2:
to_keep.append(nl)
elif nl.node().size()<2000 and len(nl.node().next())==0:
to_delete.append(nl)
if not abort and len(to_keep)==1 and len(to_delete)==len(nv.next())-1:
ec+=1
for ln in to_delete:
trnsl.remove_link(-nv.node_id(),ln.node().node_id())
print("nodes with tips fw prunned:",ec)
trnsl.remove_transitive_links(3)
trnsl.write_to_gfa1('BW6_pbthreads_cleaner.gfa',selected_nodes=[x.node_id() for x in trnsl.get_all_nodeviews(include_disconnected=False)])
def get_node_pos(nid):
try:
return node_pos[nid]
    except KeyError:
return (0,0,0)
def significant_next(lid,max_noise=2,min_ratio=5):
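    # Return the dominant next line for line lid, or 0 if any competing link has
    # count > max_noise or a count within a factor of min_ratio of the winner's.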
if lid>0:
if len(fw_links[lid])==0: return 0
wn,wc=fw_links[lid].most_common(1)[0]
for n,c in fw_links[lid].items():
if n!=wn:
if c>max_noise or c*min_ratio>wc:
return 0
return wn
else:
if len(bw_links[-lid])==0: return 0
wn,wc=bw_links[-lid].most_common(1)[0]
for n,c in bw_links[-lid].items():
if n!=wn:
if c>max_noise or c*min_ratio>wc:
return 0
return -wn
def get_next_line(lid,max_noise=2,min_ratio=5):
sn=significant_next(lid,max_noise,min_ratio)
if sn!=0 and significant_next(-sn,max_noise,min_ratio)==-lid:
return sn
return 0
def get_prev_line(lid,max_noise=2,min_ratio=5):
return -get_next_line(-lid,max_noise,min_ratio)
def metaline(lid,max_noise=2,min_ratio=5):
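    # Grow a metaline from lid: extend backwards then forwards through reciprocal
    # best links, falling back to [lid] if a cycle is detected.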
ml=[]
p=lid
while p!=0:
if p in ml or -p in ml: return [lid]
ml=[p]+ml
p=get_prev_line(ml[0],max_noise,min_ratio)
while ml[-1]!=0:
n=get_next_line(ml[-1],max_noise,min_ratio)
if n in ml or -n in ml: return [lid]
ml.append(n)
return ml[:-1]
##NEW METALINE DUMP
for min_line in [2000,5000]:
print("Constructing metalines from %dbp+ lines with skipped connections"%min_line)
lines=[[]]+[list(x) for x in trnsl.get_all_lines(1,min_line)]
node_pos={}
for lid in range(len(lines)):
ll=len(lines[lid])
for nidx in range(len(lines[lid])):
node_pos[lines[lid][nidx]]=(lid,nidx,-ll+nidx)
node_pos[-lines[lid][nidx]]=(-lid,ll-nidx,-nidx)
fw_links=[]
bw_links=[]
for i in range(len(lines)):
fw_links.append(Counter())
bw_links.append(Counter())
for nid in lines[i][:4]:
for pl in trmldg2m.get_nodeview(nid).prev():
plnp=get_node_pos(pl.node().node_id())
if plnp==(0,0,0): continue
if plnp[0]!=i: bw_links[-1][plnp[0]]+=1
for nid in lines[i][-4:]:
for nl in trmldg2m.get_nodeview(nid).next():
nlnp=get_node_pos(nl.node().node_id())
if nlnp==(0,0,0): continue
if nlnp[0]!=i: fw_links[-1][nlnp[0]]+=1
used=[False for x in lines]
metalines=[]
for i in range(1,len(lines)):
if used[i]: continue
metalines.append([])
for lid in metaline(i):
used[abs(lid)]=True
if lid>0:
metalines[-1]+=lines[lid]
else:
metalines[-1]+=[-x for x in lines[-lid][::-1]]
print("dumping %d metalines"%len(metalines))
with open("BW6_metalines%dk.fasta"%int(min_line/1000),"w") as f:
for i in range(len(metalines)):
f.write(">BW6_metaline%dk_%d "%(int(min_line/1000),i)+str(metalines[i])+"\n")
f.write(line_to_sequence(trnsl,spe,metalines[i])+"\n")
```
```python
def count_duplicatednode_lines(mlines):
c=Counter()
for l in mlines:
for x in l:
c[abs(x)]+=1
dl=[]
for l in mlines:
for x in l:
if c[abs(x)]>1:
dl.append(l)
break
print(len(dl))
```
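A hypothetical invocation, counting how many metalines contain a node that also appears in another line:
```python
count_duplicatednode_lines(metalines)
```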
```python
node_pos={}
for lid in range(len(lines)):
ll=len(lines[lid])
for nidx in range(len(lines[lid])):
node_pos[lines[lid][nidx]]=(lid,nidx,-ll+nidx)
node_pos[-lines[lid][nidx]]=(-lid,ll-nidx,-nidx)
def get_node_pos(nid):
try:
return node_pos[nid]
    except KeyError:
return (0,0,0)
def line_between_report(line):
for i in range(len(line)-1):
print("\n%d -> %d bp -> %d"%(line[i],trnsl.get_nodeview(line[i]).next()[0].distance(),line[i+1]))
mmc= [x for x in nodes_matches_in_between(line[i],line[i+1]).most_common() if x[1]>2]
for x in mmc:
cnv=trnsl.get_nodeview(x[0])
print(x[1],"hits to node",x[0],cnv.size(),"bp",len(cnv.next()+cnv.prev()))
print(spe.stride_out_in_order(line[i]).nodes)
print([-x for x in spe.stride_out_in_order(-line[i+1]).nodes[::-1]])
```
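A hypothetical invocation on the first line, inspecting the evidence at each join:
```python
# For every consecutive pair: distance, well-supported in-between matches, and PE strides from both ends.
line_between_report(lines[0])
```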
# OLD CODE: Merging both graphs
We can use a threading method that is very unreliable on its own, because we can always pop a node from the read later:
```python
def lines_stats(dg,min_size=1000):
used_nodes=set()
line_sizes=[]
for nv in dg.get_all_nodeviews():
n=nv
if n.node_id() in used_nodes: continue
line_nodes=set()
while n.node_id() not in line_nodes and len(n.prev())==1 and len(n.prev()[0].node().next())==1:
line_nodes.add(n.node_id())
n=n.prev()[0].node()
linebp=n.size()
used_nodes.add(abs(n.node_id()))
line_nodes=set()
while n.node_id() not in line_nodes and len(n.next())==1 and len(n.next()[0].node().prev())==1:
line_nodes.add(n.node_id())
linebp+=n.next()[0].distance()
n=n.next()[0].node()
linebp+=n.size()
used_nodes.add(abs(n.node_id()))
if linebp>=min_size:
line_sizes.append(linebp)
line_sizes.sort(reverse=True)
total_bp=sum(line_sizes)
print("Total lines size %d bp\n"%total_bp)
next_stop=0
acc=0
for count,size in enumerate(line_sizes,1):
acc+=size
if acc>=next_stop*total_bp/100:
print("N%d: %d bp at line # %d"%(next_stop,size,count))
next_stop+=10
from collections import Counter
#recruit on 2 matches (for later), create threads on 2 matches, make graph
print("recruiting")
lrr.recruit(1,2)
print("threading reads")
lrr.thread_reads(20000,2)
print("Constructing dg from threads")
trmldg2m=lrr.dg_from_threads(True)
#getting rid of all links not supported by at least two threads, self and big-overlap links, and links to inversions of better links
print("Cleaning links")
for nv in trmldg2m.get_all_nodeviews(both_directions=True):
for l in nv.next():
if l.distance()<-500 or abs(l.node().node_id())==abs(nv.node_id()):
trmldg2m.remove_link(-nv.node_id(),l.node().node_id(),l.distance(),l.support())
onf=Counter([x.node().node_id() for x in nv.next()])
for onid in onf.keys():
if onf[onid]<2: trmldg2m.remove_link(-nv.node_id(),onid)
if onf[onid]<onf[-onid]: trmldg2m.remove_link(-nv.node_id(),onid)
print("Untangling linkage between anchors")
#Select anchor nodes by size and kci, create graph
lu=SDG.LinkageUntangler(trmldg2m)
for nv in trmldg2m.get_all_nodeviews(include_disconnected=False):
if nv.size()>500 and nv.kci()<1.5:
lu.selected_nodes[nv.node_id()]=True
trnsl=lu.make_nextselected_linkage(2)
trnsl.remove_transitive_links(1)
trnsl.write_to_gfa1('trnsl.gfa',selected_nodes=[x.node_id() for x in trnsl.get_all_nodeviews(include_disconnected=False)])
lines_stats(trnsl,1000)
# GRAPH linearisation heuristics
#find matches to other nodes between two nodes (in order) in the LR.
def nodes_matches_in_between(n1,n2):
tc=Counter()
for rid in set([abs(x) for x in lrr.node_reads[abs(n1)] + lrr.node_reads[abs(n2)]]):
fwg=False
c=Counter()
bwg=False
for pm in lrr.read_perfect_matches[rid]:
if pm.node==n1:
c=Counter()
fwg=True
bwg=False
elif pm.node==-n2:
c=Counter()
fwg=False
bwg=True
elif pm.node==-n1:
if bwg: tc+=c
bwg=fwg=False
c=Counter()
elif pm.node==n2:
if fwg: tc+=c
bwg=fwg=False
c=Counter()
elif fwg: c[pm.node]+=1
elif bwg: c[-pm.node]+=1
return tc
print("Solving simple tips")
# Solve simple tips, where the tip is connected to a node and comes before the other connection.
simple_tips=0
simple_tips_solved=0
for nv in trnsl.get_all_nodeviews(include_disconnected=False,both_directions=True):
if len(nv.next())==2:
if len(nv.next()[0].node().next())==0 and len(nv.next()[1].node().prev())==1 and nv.next()[0].node().size()+nv.next()[0].distance()<nv.next()[1].distance()+1000:
simple_tips+=1
nmb=nodes_matches_in_between(nv.node_id(),nv.next()[1].node().node_id())
if nmb[nv.next()[0].node().node_id()]>2:
simple_tips_solved+=1
trnsl.add_link(-nv.next()[0].node().node_id(),nv.next()[1].node().node_id(),nv.next()[1].distance()-nv.next()[0].node().size()-nv.next()[0].distance())
trnsl.remove_link(-nv.node_id(),nv.next()[1].node().node_id())
print("Simple tips:",simple_tips,simple_tips_solved)
trnsl.write_to_gfa1('trnsl_tj.gfa',selected_nodes=[x.node_id() for x in trnsl.get_all_nodeviews(include_disconnected=False)])
lines_stats(trnsl,1000)
print("Solving simple bubbles")
#Solve simple bubbles created by two small nodes who connect to their big neighbours
simple_bubbles=0
simple_bubbles_solved=0
for nv in trnsl.get_all_nodeviews(include_disconnected=False,both_directions=True):
if len(nv.next())==2 and len(nv.next()[0].node().parallels())==1 and nv.next()[0].node().parallels()[0].node_id()==nv.next()[1].node().node_id():
simple_bubbles+=1
nmb=nodes_matches_in_between(nv.node_id(),nv.next()[1].node().node_id())
if nmb[nv.next()[0].node().node_id()]>2:
simple_bubbles_solved+=1
trnsl.add_link(-nv.next()[0].node().node_id(),nv.next()[1].node().node_id(),nv.next()[1].distance()-nv.next()[0].node().size()-nv.next()[0].distance())
trnsl.remove_link(-nv.next()[0].node().node_id(),nv.next()[1].node().next()[0].node().node_id())
trnsl.remove_link(-nv.node_id(),nv.next()[1].node().node_id())
print("Simple bubbles:",simple_bubbles,simple_bubbles_solved)
trnsl.write_to_gfa1('trnsl_tj_bp.gfa',selected_nodes=[x.node_id() for x in trnsl.get_all_nodeviews(include_disconnected=False)])
lines_stats(trnsl,1000)
for i in range(3):
print("Solving simple transition")
# Solve simple tips, where the tip is connected to a node and comes before the other connection.
simple_transitions=0
simple_transitions_solved=0
for nv in trnsl.get_all_nodeviews(include_disconnected=False,both_directions=True):
if len(nv.next())==2:
if len(nv.next()[1].node().prev())==1 and len(nv.next()[0].node().prev())==1 and nv.next()[0].node().size()+nv.next()[0].distance()<nv.next()[1].distance()+1000:
simple_transitions+=1
nmb=nodes_matches_in_between(nv.node_id(),nv.next()[1].node().node_id())
if nmb[nv.next()[0].node().node_id()]>2:
simple_transitions_solved+=1
trnsl.add_link(-nv.next()[0].node().node_id(),nv.next()[1].node().node_id(),nv.next()[1].distance()-nv.next()[0].node().size()-nv.next()[0].distance())
trnsl.remove_link(-nv.node_id(),nv.next()[1].node().node_id())
print("Simple transitions:",simple_transitions,simple_transitions_solved)
trnsl.write_to_gfa1('trnsl_tj_bp_st.gfa',selected_nodes=[x.node_id() for x in trnsl.get_all_nodeviews(include_disconnected=False)])
lines_stats(trnsl,1000)
#TODO: more aggressive here!
print("Solving and discarding simple tips")
# Solve simple tips, where the tip is connected to a node and comes before the other connection.
simple_tips=0
simple_tips_solved=0
simple_tips_discarded=0
for nv in trnsl.get_all_nodeviews(include_disconnected=False,both_directions=True):
if len(nv.next())==2:
if len(nv.next()[0].node().next())==0 and len(nv.next()[1].node().prev())==1 and nv.next()[0].node().size()+nv.next()[0].distance()<nv.next()[1].distance()+1000:
simple_tips+=1
nmb=nodes_matches_in_between(nv.node_id(),nv.next()[1].node().node_id())
if nmb[nv.next()[0].node().node_id()]>2:
simple_tips_solved+=1
trnsl.add_link(-nv.next()[0].node().node_id(),nv.next()[1].node().node_id(),nv.next()[1].distance()-nv.next()[0].node().size()-nv.next()[0].distance())
trnsl.remove_link(-nv.node_id(),nv.next()[1].node().node_id())
elif nv.next()[0].node().size()<2000:
simple_tips_discarded+=1
trnsl.remove_link(-nv.node_id(),nv.next()[0].node().node_id())
print("Simple tips:",simple_tips,simple_tips_solved)
trnsl.write_to_gfa1('trnsl_tj_bp_st_tj.gfa',selected_nodes=[x.node_id() for x in trnsl.get_all_nodeviews(include_disconnected=False)])
lines_stats(trnsl,1000)
```
```python
node_pos={}
for lid in range(len(lines)):
ll=len(lines[lid])
for nidx in range(len(lines[lid])):
node_pos[lines[lid][nidx]]=(lid,nidx,-ll+nidx)
node_pos[-lines[lid][nidx]]=(-lid,ll-nidx,-nidx)
def get_node_pos(nid):
try:
return node_pos[nid]
    except KeyError:
return (0,0,0)
for x in trmldg2m.get_nodeview(lines[0][-2]).next(): print(x,x.node().size(),x.node().kci(),get_node_pos(x.node().node_id()))
for x in trmldg2m.get_nodeview(lines[0][-1]).next(): print(x,x.node().size(),x.node().kci(),get_node_pos(x.node().node_id()))
def other_line_connectivity_report(lid):
print("\nother line connectivity for line",lid)
print('seq'+',seq'.join([str(abs(x)) for x in lines[lid]]))
print("BW")
for nid in lines[lid]:
c=Counter([get_node_pos(x.node().node_id())[0] for x in trmldg2m.get_nodeview(nid).prev()])
print(c.most_common())
print("FW")
for nid in lines[lid]:
c=Counter([get_node_pos(x.node().node_id())[0] for x in trmldg2m.get_nodeview(nid).next()])
print(c.most_common())
for nid in nodes_3b:
cb=Counter([get_node_pos(x.node().node_id())[0] for x in trmldg2m.get_nodeview(nid).prev()])
cf=Counter([get_node_pos(x.node().node_id())[0] for x in trmldg2m.get_nodeview(nid).next()])
print(get_node_pos(nid),nid,"BW:",cb.most_common(),"FW:",cf.most_common())
```
```python
for nid in nodes_3b:
print("\nNode",nid, len(ws.sdg.get_nodeview(nid).prev()), len(ws.sdg.get_nodeview(nid).next()))
print("BW:")
for l in [(x.node().node_id(),x.distance()) for x in trmldg2m.get_nodeview(nid).prev()]: print(l)
if nid>0: print([(-x.dest,x.dist) for x in s.links_bw[nid]])
else: print([(-x.dest,x.dist) for x in s.links_fw[-nid]])
print("FW:")
for l in [(x.node().node_id(),x.distance()) for x in trmldg2m.get_nodeview(nid).next()]: print(l)
if nid>0: print([(x.dest,x.dist) for x in s.links_fw[nid]])
else: print([(x.dest,x.dist) for x in s.links_bw[-nid]])
for nid in nodes_3b:
print("\nNode",nid, len(ws.sdg.get_nodeview(nid).prev()), len(ws.sdg.get_nodeview(nid).next()))
print("BW:")
for l in [(x.node().node_id(),x.distance()) for x in trnsl.get_nodeview(nid).prev()]: print(l)
print("FW:")
for l in [(x.node().node_id(),x.distance()) for x in trnsl.get_nodeview(nid).next()]: print(l)
```
```python
def dump_lrdg_to_text(dg,filename):
with open(filename,"w") as f:
for nv in dg.get_all_nodeviews( both_directions=True, include_disconnected=False):
for l in nv.next():
if -nv.node_id()<l.node().node_id():
f.write("%d %d %d\n"%(-nv.node_id(),l.node().node_id(),l.distance()))
def load_lrdg_from_text(dg,filename):
    for l in open(filename).readlines():
        if not l.strip(): continue
        il=[int(x) for x in l.strip().split()]
        dg.add_link(il[0],il[1],il[2])
def clone_dg(odg):
dg=SDG.DistanceGraph(odg.sdg)
for nv in odg.get_all_nodeviews( both_directions=True, include_disconnected=False):
for l in nv.next():
if -nv.node_id()<l.node().node_id():
dg.add_link(-nv.node_id(),l.node().node_id(),l.distance(),l.support())
return dg
```
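A minimal round-trip sketch, assuming `trnsl` and `ws` from above are in scope (the filename is hypothetical):
```python
# Persist the untangled graph's links as text, rebuild them onto a fresh
# DistanceGraph over the same SDG, and keep an in-memory copy as backup.
dump_lrdg_to_text(trnsl, 'trnsl_links.txt')
restored = SDG.DistanceGraph(ws.sdg)
load_lrdg_from_text(restored, 'trnsl_links.txt')
backup = clone_dg(trnsl)
```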
# Pacbio scaffolding procedure:
## First scaffolding round (on the w2rap graph)
1) Take the contigs_raw gfa from w2rap and create an SDG workspace with EI PE and PB datastores.
2) Generate a kmer counter using EI and BASF PE reads, add to workspace.
3) Calculate KCI.
4) Path PE reads (k=63).
5) Map long reads and recruit, run strider.
```python
#This runs all steps 1-5, where there's no uncertainty
import SDGpython as SDG
ws = SDG.WorkSpace('basf.pb.pe.sdgws')
kc=ws.get_kmer_counter("all_pcrf_reads")
kc.set_kci_peak(88)
kc.compute_all_kcis()
peds = ws.get_paired_reads_datastore('ei_pe')
peds.mapper.path_reads(63,200)
lords = ws.get_long_reads_datastore('pacbio')
lrr=SDG.LongReadsRecruiter(ws.sdg,lords,31,50)
lrr.map(31)
lrr.recruit(1,2)
s=SDG.Strider(ws)
s.add_datastore(lrr)
s.link_from_anchors(1000,.5,1.25,10000,2,2,0)
```
6) Create an LDG from strider links, find ALL LINES >1Kbp, stride between consecutive nodes, dump linear consensus.
```python
#This loads all steps 1-5, where there's no uncertainty
import SDGpython as SDG
ws = SDG.WorkSpace('basf.pb.pe.kc.rf.sdgws')
print ("Calculating/loading KCI")
kc=ws.get_kmer_counter("all_pcrf_reads")
kc.load_cache('kc.dump')
print("Load PE mappings")
peds = ws.get_paired_reads_datastore('ei_pe')
peds.mapper.load_readpaths('pe_read_paths_k63.dump')
# run PE strider
spe=SDG.Strider(ws)
spe.add_datastore(peds)
########################
# run lrr on pacbio reads
print ("Loading mapped PB reads and recruiting")
lords = ws.get_long_reads_datastore('pacbio')
k=31
lrr=SDG.LongReadsRecruiter(ws.sdg,lords,k,50)
lrr.load("lrr_pb_k{0}_all_v3_r3.dump".format(k))
# long read strider
print("Run LRR")
s=SDG.Strider(ws)
s.add_datastore(lrr)
s.link_from_anchors(1000,.75,1.25,10000,2,2,0)
# min_size, min_kci, max_kci, distance, min_reads, group_size, small_node_size
```
Creating a graph:
```python
###Option A: only create reciprocal links:
def check_link_in_strider(s,end1,end2):
r=[False,False]
if end1<0:
if [1 for l in s.links_fw[abs(end1)] if l.source==end1 and l.dest==end2]: r[0]=True
else:
if [1 for l in s.links_bw[abs(end1)] if l.source==end1 and l.dest==end2]: r[0]=True
if end2<0:
if [1 for l in s.links_fw[abs(end2)] if l.source==end2 and l.dest==end1]: r[1]=True
else:
if [1 for l in s.links_bw[abs(end2)] if l.source==end2 and l.dest==end1]: r[1]=True
return r
def reciprocal_linked_anchors_dg(sdg,s):
ag=SDG.DistanceGraph(ws.sdg)
for nid,is_anchor in enumerate(s.is_anchor):
if is_anchor:
for l in s.links_bw[nid]+s.links_fw[nid]:
if nid<abs(l.dest) and check_link_in_strider(s,l.source,l.dest)==[True,True]:
ag.add_link(l.source, l.dest, l.dist)
return ag
```
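A sketch of step 6 on top of Option A, assuming `line_to_sequence` and the PE strider `spe` from the first section are in scope (the output filename is hypothetical):
```python
# Build the reciprocal-links anchor graph, extract all 1Kbp+ lines and dump
# their linear consensus using line_to_sequence.
anchor_dg = reciprocal_linked_anchors_dg(ws.sdg, s)
anchor_dg.remove_transitive_links(3)
lines = anchor_dg.get_all_lines(1, 1000)
with open('round1_lines.fasta', 'w') as f:
    for i, line in enumerate(lines):
        f.write('>line%d %s\n' % (i, str(line)))
        f.write(line_to_sequence(anchor_dg, spe, line) + '\n')
```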
## Second scaffolding round (on lines)
1) Load the lines FASTA into SDG as a graph of disconnected nodes (use lines with A's rather than N's).
2) Add EI PE and PB datastores.
3) Create KCI (skip on the TEST); check in the test whether there are repeated nodes.
4) Map long reads and recruit (same parameters as before), run strider.
5) Create an LDG and dump lines with Ns; never try to recover connectivity here, as it is TAINTED by the fact that we threw all repeats away (use lowercase ns to indicate these joins).
6) Replace stretches of 100 or more 'A's with 'N's (see the sketch after the code below).
```python
ws.sdg.load_from_fasta("anchor_graph.fa")
ws.add_paired_reads_datastore("")  # datastore paths elided in the original notes
ws.add_long_reads_datastore("")
ws.add_kmer_counter("")
kc=ws.get_kmer_counter("all_pcrf_reads")
kc.set_kci_peak(88)
kc.compute_all_kcis()
lords = ws.get_long_reads_datastore('pacbio')
lrr=SDG.LongReadsRecruiter(ws.sdg,lords,31,50)
lrr.map(31)
lrr.recruit(1,2)
```
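For step 6, a minimal sketch (the helper name is hypothetical) that turns the A-run spacers back into explicit gaps:
```python
import re

# Replace every run of 100+ 'A's (the spacer used when loading the lines FASTA)
# with the same number of 'N's, so downstream tools treat them as gaps.
def mask_a_runs(seq, min_run=100):
    return re.sub('A{%d,}' % min_run, lambda m: 'N' * len(m.group(0)), seq)
```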
# Code dumps
```python
#BIG IDEA: tangle lr filter: take all reads recruited to the frontiers, filter their matches to only include internal nodes from the tangle, then tap?
def tangle_filtered_lrs(t,ws,lrr):
print("\n\nAnaysing tangle with %d internals and %d frontiers")
print('\nINTERNALS: seq'+',seq'.join([str(abs(x.node_id())) for x in t.internals]))
print('\nFRONTIERS: seq'+',seq'.join([str(abs(x.node_id())) for x in t.frontiers])+"\n")
for f in t.frontiers: print(f,f.kci())
pmms=SDG.PerfectMatchesFilter(ws)
nids=set()
for x in list(t.internals)+list(t.frontiers):
nids.add(x.node_id())
nids.add(-x.node_id())
onids=[-x.node_id() for x in t.frontiers]
for f in t.frontiers:
fdest=Counter()
print("\n\nFrontier ", f,len(lrr.node_reads[abs(f.node_id())]))
for rid in lrr.node_reads[abs(f.node_id())]:
if f.node_id()>0: rid=-rid
#print("\nRead ",rid," RAW")
#for x in lrr.read_perfect_matches[abs(rid)]: print(x)
if rid>0:
om=lrr.read_perfect_matches[abs(rid)]
else:
om=lrr.reverse_perfect_matches(lrr.read_perfect_matches[abs(rid)])
#print("\nRead ",rid," oriented matches")
#for x in om: print(x)
mfn=[x for x in pmms.matches_fw_from_node(-f.node_id(),pmms.truncate_turnaround(om)) if x.node in nids]
#print(len(mfn))
mfn=[x for x in mfn if x.node in nids]
#print(len(mfn))
if len(mfn)==0 or mfn[0].node==f.node_id(): continue
for x in mfn:
if x.node in onids:
fdest[x.node]+=1
print("\nRead ",rid)
for x in mfn: print(x)
print(fdest.most_common())
```
```python
lrr.recruit(1,2)
tangle_filtered_lrs(t,ws,lrr)
```
```python
def print_path(l):
for nid in l:
print(nid, ws.sdg.get_nodeview(nid).size())
def test_node(nid):
print("\n\nNode connectivity for node",nid)
print("\nBW in anchor_graph")
for lf in anchor_graph.get_nodeview(nid).prev():
print(lf,lf.node().size())
print("stride_out_in_order (bw reversed)")
for nn in spe.stride_out_in_order(-nid).nodes[1:][::-1]:
print(-nn,ws.sdg.get_nodeview(nn).size())
print("\nFW in anchor_graph")
for lf in anchor_graph.get_nodeview(nid).next():
print(lf,lf.node().size())
print("stride_out_in_order")
for nn in spe.stride_out_in_order(nid).nodes[1:]:
print(nn,ws.sdg.get_nodeview(nn).size())
def test_node2(nid):
print("\n\nNode connectivity for node",nid)
npairs=set([int((abs(x)+1)/2)*2 for x in peds.mapper.paths_in_node[abs(nid)]])
print("\nBW in anchor_graph")
for lf in anchor_graph2.get_nodeview(nid).prev():
fpairs=set([int((abs(x)+1)/2)*2 for x in peds.mapper.paths_in_node[abs(lf.node().node_id())]])
print(lf,lf.node().size(),"\t\t%d shared pairs"%len(fpairs.intersection(npairs)))
print("\nBW in multi graph")
for lf in anchor_graph_multi.get_nodeview(nid).prev():
fpairs=set([int((abs(x)+1)/2)*2 for x in peds.mapper.paths_in_node[abs(lf.node().node_id())]])
print(lf,lf.node().size(),"\t\t%d shared pairs"%len(fpairs.intersection(npairs)))
print("stride_out_in_order (bw reversed)")
for nn in spe.stride_out_in_order(-nid).nodes[1:][::-1]:
print(-nn,ws.sdg.get_nodeview(nn).size())
print("\nFW in anchor_graph")
for lf in anchor_graph2.get_nodeview(nid).next():
fpairs=set([int((abs(x)+1)/2)*2 for x in peds.mapper.paths_in_node[abs(lf.node().node_id())]])
print(lf,lf.node().size(),"\t\t%d shared pairs"%len(fpairs.intersection(npairs)))
print("\nFW in multi graph")
for lf in anchor_graph_multi.get_nodeview(nid).next():
fpairs=set([int((abs(x)+1)/2)*2 for x in peds.mapper.paths_in_node[abs(lf.node().node_id())]])
print(lf,lf.node().size(),"\t\t%d shared pairs"%len(fpairs.intersection(npairs)))
print("stride_out_in_order")
for nn in spe.stride_out_in_order(nid).nodes[1:]:
print(nn,ws.sdg.get_nodeview(nn).size())
```
```python
to_bridge=set()
for fnv in anchor_graph2.get_all_nodeviews():
for nv in [fnv,fnv.rc()]:
if len(nv.next())==1 and len(nv.next()[0].node().prev())==1:
nid=-nv.node_id()
onid=nv.next()[0].node().node_id()
to_bridge.add((min(nid,onid),max(nid,onid)))
```
```python
valid=invalid=too_short=not_ovl=0
#for n1,n2 in list(to_bridge)[:10000]:
for n1,n2 in list(to_bridge):
nv=anchor_graph2.get_nodeview(-n1)
d=nv.next()[0].distance()
if d>-1: not_ovl+=1
elif d>-20: too_short+=1
elif d<-200: invalid+=1
else:
nvs=nv.sequence()
nns=nv.next()[0].node().sequence()
for pd in range(max(-199,d-50),min(-20,d+50)):
if nvs[pd:]==nns[:-pd]:
valid+=1
break
else:
invalid+=1
print(valid,invalid,too_short,not_ovl)
```
```python
def lines_stats(dg,min_size=1000):
used_nodes=set()
line_sizes=[]
for nv in dg.get_all_nodeviews():
n=nv
if n.node_id() in used_nodes: continue
line_nodes=set()
while n.node_id() not in line_nodes and len(n.prev())==1 and len(n.prev()[0].node().next())==1:
line_nodes.add(n.node_id())
n=n.prev()[0].node()
linebp=n.size()
used_nodes.add(abs(n.node_id()))
line_nodes=set()
while n.node_id() not in line_nodes and len(n.next())==1 and len(n.next()[0].node().prev())==1:
line_nodes.add(n.node_id())
linebp+=n.next()[0].distance()
n=n.next()[0].node()
linebp+=n.size()
used_nodes.add(abs(n.node_id()))
if linebp>=min_size:
line_sizes.append(linebp)
line_sizes.sort(reverse=True)
total_bp=sum(line_sizes)
print("Total lines size %d bp\n"%total_bp)
next_stop=0
acc=0
for count,size in enumerate(line_sizes,1):
acc+=size
if acc>=next_stop*total_bp/100:
print("N%d: %d bp at line # %d"%(next_stop,size,count))
next_stop+=10
```
```python
def dump_lines(dg,filename,min_size=1000,spacing="N"*100):
with open(filename,"w") as f:
used_nodes=set()
line_sizes=[]
for nv in dg.get_all_nodeviews():
n=nv
if n.node_id() in used_nodes: continue
line_nodes=set()
while n.node_id() not in line_nodes and len(n.prev())==1 and len(n.prev()[0].node().next())==1:
line_nodes.add(n.node_id())
n=n.prev()[0].node()
linebp=n.size()
used_nodes.add(abs(n.node_id()))
lineseq=n.sequence()
line_nodes=set()
while n.node_id() not in line_nodes and len(n.next())==1 and len(n.next()[0].node().prev())==1:
line_nodes.add(n.node_id())
linebp+=n.next()[0].distance()
n=n.next()[0].node()
linebp+=n.size()
lineseq+=spacing+n.sequence()
used_nodes.add(abs(n.node_id()))
if linebp>=min_size:
line_sizes.append(linebp)
f.write(">%d\n%s\n"%(nv.node_id(),lineseq))
line_sizes.sort(reverse=True)
total_bp=sum(line_sizes)
print("Total lines size %d bp\n"%total_bp)
next_stop=0
acc=0
for count,size in enumerate(line_sizes,1):
acc+=size
if acc>=next_stop*total_bp/100:
print("N%d: %d bp at line # %d"%(next_stop,size,count))
next_stop+=10
```
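Hypothetical usage, writing every 1Kbp+ line with the default 100-N spacing:
```python
# Each qualifying line becomes one FASTA record, named after its seed node.
dump_lines(trnsl, 'trnsl_lines.fasta', min_size=1000)
```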
```python
for nv in anchor_graph2.get_all_nodeviews():
for l in list(nv.next())+list(nv.prev()):
if l.distance()<-500 or abs(l.node().node_id())==nv.node_id():
anchor_graph2.remove_link(-nv.node_id(),l.node().node_id())
```
```python
def matches_to_links(dg, matches, READSIZE=0, ENDSIZE=4000, ENDMATCHES=2, MAXDIFF=.1, MAXOVLP=1000, verbose=False):
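    # Project each node onto the read: matches near a node's start give a projected
    # node start position on the read, matches near its end give a projected end;
    # every projected end is then linked to every projected start following it within MAXOVLP.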
nstarts=[]
nends=[]
nodes=set([m.node for m in matches])
if READSIZE==0: READSIZE=matches[-1].read_position
for node in nodes:
nmatches = [m for m in matches if m.node==node]
smatches = [m for m in nmatches if m.node_position<ENDSIZE]
ematches = [m for m in nmatches if m.node_position>dg.sdg.get_node_size(node) - ENDSIZE]
if verbose and len(nmatches)>ENDMATCHES:
print(node, dg.sdg.get_node_size(node), len(nmatches), len(smatches), len(ematches))
if len(smatches) >= ENDMATCHES:
nstart = smatches[0].read_position - smatches[0].node_position
else:
nstart = None
if len(ematches) >= ENDMATCHES:
nend = ematches[-1].read_position + dg.sdg.get_node_size(node) - ematches[-1].node_position
else:
nend = None
if nstart is not None and nend is not None:
if abs(nend - nstart - dg.sdg.get_node_size(node))/dg.sdg.get_node_size(node) < MAXDIFF:
nstarts.append((nstart,node))
nends.append((nend,node))
if nstart is None and nend is not None:
if nend - dg.sdg.get_node_size(node) < 0:
nends.append((nend,node))
if nstart is not None and nend is None:
if nstart + dg.sdg.get_node_size(node) > READSIZE:
nstarts.append((nstart,node))
if verbose: print(nends,nstarts)
for nend in nends:
for nstart in nstarts:
if nstart[0] - nend[0] >= -MAXOVLP:
if verbose: print(-nend[1], nstart[1], nstart[0] - nend[0])
dg.add_link( -nend[1], nstart[1], nstart[0] - nend[0] )
def analyse_recruited_reads(lrr,nodes):
rids=set()
for n in nodes:
rids.update(lrr.node_reads[abs(n)])
dg=SDG.DistanceGraph(ws.sdg)
for rid in rids:
#TODO: break looping reads
matches_to_links(dg,lrr.read_perfect_matches[rid],ENDMATCHES=2)
return dg
```
```python
def compare_graph_connectivity(g1,g2,min_size=1000):
both=only_g1=only_g2=0
for nv1 in g1.get_all_nodeviews():
if nv1.size()<min_size: continue
for ln in nv1.next():
if ln.node().size()<min_size: continue
if g2.are_connected(-nv1.node_id(),ln.node().node_id()):
both+=1
else:
only_g1+=1
for lp in nv1.prev():
if lp.node().size()<min_size: continue
if g2.are_connected(nv1.node_id(),-lp.node().node_id()):
both+=1
else:
only_g1+=1
for nv2 in g2.get_all_nodeviews():
if nv2.size()<min_size: continue
for ln in nv2.next():
if ln.node().size()<min_size: continue
if not g1.are_connected(-nv2.node_id(),ln.node().node_id()):
only_g2+=1
for lp in nv2.prev():
if lp.node().size()<min_size: continue
if not g1.are_connected(nv2.node_id(),-lp.node().node_id()):
only_g2+=1
print("both:",both," only_g1:",only_g1," only_g2:",only_g2)
def combine_graph_connectivity(g1,g2):
g=SDG.DistanceGraph(ws.sdg)
for gx in [g1,g2]:
for nv in gx.get_all_nodeviews():
for ln in nv.next():
if not g.are_connected(-nv.node_id(),ln.node().node_id()):
g.add_link(-nv.node_id(),ln.node().node_id(),ln.distance())
for lp in nv.prev():
if not g.are_connected(nv.node_id(),-lp.node().node_id()):
g.add_link(nv.node_id(),-lp.node().node_id(),lp.distance())
return g
```
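A hypothetical comparison between the untangled next-selected graph and the raw thread graph, followed by a graph combining links from both:
```python
# Shared vs graph-specific links for 1Kbp+ nodes, then a merged graph.
compare_graph_connectivity(trnsl, trmldg2m, min_size=1000)
combined = combine_graph_connectivity(trnsl, trmldg2m)
```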
```python
#ultimate graph connectivity:
#start from reciprocal Strider links
#add non-reciprocal links if those linearise the graph (i.e. missed connections)
#add thread-created links if those linearise the graph
rladg=reciprocal_linked_anchors_dg(ws.sdg,s)
rladg.remove_transitive_links(3)
lrr.thread_reads(2000,2)
def classify_nodes(dg,min_size=0):
disconnected=linear=branching=tips=multi_tips=repeats=0
for nv in dg.get_all_nodeviews():
if nv.size()<min_size: continue
cn=len(nv.next())
cp=len(nv.prev())
if cp==0 and cn==0:
disconnected+=1
elif cp==0 or cn==0:
if cp==1 or cn==1:
tips+=1
else:
multi_tips+=1
elif cp==1 and cn==1:
linear+=1
elif cp>1 and cn>1:
repeats+=1
else:
branching+=1
print("disconnected:",disconnected," linear:",linear," branching:",branching," tips:",tips," multi_tips:",multi_tips," repeats:",repeats)
```
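Hypothetical usage on the reciprocal-links graph built in the same block:
```python
# Node-class breakdown for 1Kbp+ nodes after transitive reduction.
classify_nodes(rladg, min_size=1000)
```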
```python
#add missed transitive connections
from statistics import median
for fnv in g3.get_all_nodeviews():
for nv in [fnv,fnv.rc()]:
if len(nv.next())==2:
if not g3.are_connected(-nv.next()[0].node().node_id(),nv.next()[1].node().node_id()) and thdg.are_connected(-nv.next()[0].node().node_id(),nv.next()[1].node().node_id()):
d=int(median([x.distance() for x in thdg.get_nodeview(nv.next()[0].node().node_id()).next() if x.node().node_id()==nv.next()[1].node().node_id()]))
g3.add_link(-nv.next()[0].node().node_id(),nv.next()[1].node().node_id(),d)
#validate tips next!
#do the tips have connections on threads? and on strider links?
tconn=notconn=0
from statistics import median
for fnv in g3.get_all_nodeviews():
for nv in [fnv,fnv.rc()]:
if len(nv.prev())==1 and len(nv.next())==0:
if len(thdg.get_nodeview(nv.node_id()).next())>0:
tconn+=1
print('\n',nv)
for x in thdg.get_nodeview(nv.node_id()).next():
print(x)
if nv.node_id()>0:
for x in s2.links_fw[nv.node_id()]: print(x)
else:
for x in s2.links_bw[-nv.node_id()]: print(x)
if tconn>20: break
else: notconn+=1
if tconn>20: break
print(tconn,notconn)
#evaluate effect of reconnecting tips and connecting currently disconnected ends, when connection is unique forward
def extend_tips(sdg,dg1,dg2):
g=SDG.DistanceGraph(sdg)
connected_ends=reconnected_tips=joining_lines=0
for fnv in dg1.get_all_nodeviews():
for nv in [fnv,fnv.rc()]:
for ln in nv.next():
if not g.are_connected(-nv.node_id(),ln.node().node_id()):
g.add_link(-nv.node_id(),ln.node().node_id(),ln.distance())
if len(nv.prev())==1 and len(nv.next())==0:
tn=dg2.get_nodeview(nv.node_id()).next()
if len(tn)==1 and len(tn[0].node().prev())==1 and tn[0].node().kci()>.5 and tn[0].node().kci()<1.5:
nnid=tn[0].node().node_id()
dg1pnext=nv.prev()[0].node().next()
dg1nprev=dg1.get_nodeview(nnid).prev()
dg1nnext=dg1.get_nodeview(nnid).next()
if len(dg1nprev)==0 and len(dg1pnext)==1:
connected_ends+=1
if len(dg1nnext)==1: joining_lines+=1
g.add_link(-nv.node_id(),nnid,tn[0].distance())
if len(dg1nprev)==1 and len(dg1pnext)==2 and dg1.are_connected(-nv.prev()[0].node().node_id(),nnid):
reconnected_tips+=1
g.add_link(-nv.node_id(),nnid,tn[0].distance())
print(connected_ends,joining_lines,reconnected_tips)
def annihilate_tips(dg):
c=0
for fnv in dg.get_all_nodeviews():
for nv in [fnv,fnv.rc()]:
if len(nv.prev())==1 and len(nv.next())==2:
n1=nv.next()[0]
n2=nv.next()[1]
if len(n1.node().next())>0: n2,n1=n1,n2
if len(n1.node().next())==0 and len(n2.node().next())>0 and n1.distance()+n1.node().size()<n2.distance()+500:
dg.remove_link(-nv.node_id(),n1.node().node_id())
c+=1
print(c)
u=SDG.LinkageUntangler(mthdg)
u.select_all()
mthnsl2=u.make_nextselected_linkage(2)
```