# BASF
## Pacbio Scaffolding
Approach:
1 - Map reads, thread reads via 2+ matches.
2 - Build a dg from the threads.
3 - Anchors on nodes >500bp with kci<1.5, nextselected linkage.
4 - Linearise by reconnecting tips, bubbles and simple transitions. BW6_pbthreads.gfa -> 143Kbp N50.
5 - Iterate over disconnected anchor nodes, checking whether they connect to lines (at their ends or in gaps, through either PE or LR).
6 - Reconnect disconnected lines (but mark their unions as "dangerous").
7 - Line to sequence:
7.a - fill with non-anchor nodes
7.b - check for overlaps on non-dangerous joins
7.c - strider on non-dangerous joins
7.d - put Ns in non-dangerous joins
7.e - put ns in dangerous joins
```python
import SDGpython as SDG
ws = SDG.WorkSpace('basf.pb.pe.kc.rf.sdgws')
print ("Calculating/loading KCI")
kc=ws.get_kmer_counter("all_pcrf_reads")
kc.load_cache('kc.dump')
print("Load PE mappings")
peds = ws.get_paired_reads_datastore('ei_pe')
peds.mapper.load_readpaths('pe_read_paths_k63.dump')
# Create PE strider
spe=SDG.Strider(ws)
spe.add_datastore(peds)
########################
# run lrr on pacbio reads
print ("Loading mapped PB reads and recruiting")
lords = ws.get_long_reads_datastore('pacbio')
k=31
lrr=SDG.LongReadsRecruiter(ws.sdg,lords,k,50)
lrr.load("lrr_pb_k{0}_all_v3_r3.dump".format(k))
# long read strider
print("Run LRR")
s=SDG.Strider(ws)
s.add_datastore(lrr)
s.link_from_anchors(1000,.75,1.25,10000,2,2,0)
# min_size, min_kci, max_kci, distance, min_reads, group_size, small_node_size
def lines_stats(dg,min_size=1000):
used_nodes=set()
line_sizes=[]
for nv in dg.get_all_nodeviews():
n=nv
if n.node_id() in used_nodes: continue
line_nodes=set()
while n.node_id() not in line_nodes and len(n.prev())==1 and len(n.prev()[0].node().next())==1:
line_nodes.add(n.node_id())
n=n.prev()[0].node()
linebp=n.size()
used_nodes.add(abs(n.node_id()))
line_nodes=set()
while n.node_id() not in line_nodes and len(n.next())==1 and len(n.next()[0].node().prev())==1:
line_nodes.add(n.node_id())
linebp+=n.next()[0].distance()
n=n.next()[0].node()
linebp+=n.size()
used_nodes.add(abs(n.node_id()))
if linebp>=min_size:
line_sizes.append(linebp)
line_sizes.sort(reverse=True)
total_bp=sum(line_sizes)
print("Total lines size %d bp\n"%total_bp)
next_stop=0
acc=0
for count,size in enumerate(line_sizes,1):
acc+=size
if acc>=next_stop*total_bp/100:
print("N%d: %d bp at line # %d"%(next_stop,size,count))
next_stop+=10
from collections import Counter
#recruit on 2 matches (for later), create threads on 2 matches, make graph
print("recruiting")
lrr.recruit(1,2)
print("threading reads")
lrr.thread_reads(20000,2)
print("Constructing dg from threads")
trmldg2m=lrr.dg_from_threads(True)
#getting rid of all links not supported by at least two threads, self and big-overlap links, and links to inversions of better links
print("Cleaning links")
for nv in trmldg2m.get_all_nodeviews(both_directions=True):
for l in nv.next():
if l.distance()<-500 or abs(l.node().node_id())==abs(nv.node_id()):
trmldg2m.remove_link(-nv.node_id(),l.node().node_id(),l.distance(),l.support())
onf=Counter([x.node().node_id() for x in nv.next()])
for onid in onf.keys():
if onf[onid]<2: trmldg2m.remove_link(-nv.node_id(),onid)
if onf[onid]<onf[-onid]: trmldg2m.remove_link(-nv.node_id(),onid)
print("Untangling linkage between anchors")
#Select anchor nodes by size and kci, create graph
lu=SDG.LinkageUntangler(trmldg2m)
for nv in trmldg2m.get_all_nodeviews(include_disconnected=False):
if nv.size()>500 and nv.kci()<1.5:
lu.selected_nodes[nv.node_id()]=True
trnsl=lu.make_nextselected_linkage(2)
trnsl.remove_transitive_links(3)
trnsl.write_to_gfa1('trnsl.gfa',selected_nodes=[x.node_id() for x in trnsl.get_all_nodeviews(include_disconnected=False)])
lines_stats(trnsl,1000)
# GRAPH linearisation heuristics
#find matches to other nodes between two nodes (in order) in the LR.
def nodes_matches_in_between(n1,n2):
tc=Counter()
for rid in set([abs(x) for x in lrr.node_reads[abs(n1)] + lrr.node_reads[abs(n2)]]):
fwg=False
c=Counter()
bwg=False
for pm in lrr.read_perfect_matches[rid]:
if pm.node==n1:
c=Counter()
fwg=True
bwg=False
elif pm.node==-n2:
c=Counter()
fwg=False
bwg=True
elif pm.node==-n1:
if bwg: tc+=c
bwg=fwg=False
c=Counter()
elif pm.node==n2:
if fwg: tc+=c
bwg=fwg=False
c=Counter()
elif fwg: c[pm.node]+=1
elif bwg: c[-pm.node]+=1
return tc
print("Solving simple tips")
# Solve simple tips, where the tip is connected to a node and comes before the other connection.
simple_tips=0
simple_tips_solved=0
for nv in trnsl.get_all_nodeviews(include_disconnected=False,both_directions=True):
if len(nv.next())==2:
if len(nv.next()[0].node().next())==0 and len(nv.next()[1].node().prev())==1 and nv.next()[0].node().size()+nv.next()[0].distance()<nv.next()[1].distance()+1000:
simple_tips+=1
nmb=nodes_matches_in_between(nv.node_id(),nv.next()[1].node().node_id())
if nmb[nv.next()[0].node().node_id()]>2:
simple_tips_solved+=1
trnsl.add_link(-nv.next()[0].node().node_id(),nv.next()[1].node().node_id(),nv.next()[1].distance()-nv.next()[0].node().size()-nv.next()[0].distance())
trnsl.remove_link(-nv.node_id(),nv.next()[1].node().node_id())
print("Simple tips:",simple_tips,simple_tips_solved)
trnsl.remove_transitive_links(3)
trnsl.write_to_gfa1('trnsl_tj.gfa',selected_nodes=[x.node_id() for x in trnsl.get_all_nodeviews(include_disconnected=False)])
lines_stats(trnsl,1000)
print("Solving simple bubbles")
#Solve simple bubbles created by two small nodes who connect to their big neighbours
simple_bubbles=0
simple_bubbles_solved=0
for nv in trnsl.get_all_nodeviews(include_disconnected=False,both_directions=True):
if len(nv.next())==2 and len(nv.next()[0].node().parallels())==1 and nv.next()[0].node().parallels()[0].node_id()==nv.next()[1].node().node_id():
simple_bubbles+=1
nmb=nodes_matches_in_between(nv.node_id(),nv.next()[1].node().node_id())
if nmb[nv.next()[0].node().node_id()]>2:
simple_bubbles_solved+=1
trnsl.add_link(-nv.next()[0].node().node_id(),nv.next()[1].node().node_id(),nv.next()[1].distance()-nv.next()[0].node().size()-nv.next()[0].distance())
trnsl.remove_link(-nv.next()[0].node().node_id(),nv.next()[1].node().next()[0].node().node_id())
trnsl.remove_link(-nv.node_id(),nv.next()[1].node().node_id())
print("Simple bubbles:",simple_bubbles,simple_bubbles_solved)
trnsl.remove_transitive_links(3)
trnsl.write_to_gfa1('trnsl_tj_bp.gfa',selected_nodes=[x.node_id() for x in trnsl.get_all_nodeviews(include_disconnected=False)])
lines_stats(trnsl,1000)
for i in range(3):
print("Solving simple transitions")
    # Solve simple transitions, where one successor node fits before the other connection.
simple_transitions=0
simple_transitions_solved=0
for nv in trnsl.get_all_nodeviews(include_disconnected=False,both_directions=True):
if len(nv.next())==2:
if len(nv.next()[1].node().prev())==1 and len(nv.next()[0].node().prev())==1 and nv.next()[0].node().size()+nv.next()[0].distance()<nv.next()[1].distance()+1000:
simple_transitions+=1
nmb=nodes_matches_in_between(nv.node_id(),nv.next()[1].node().node_id())
if nmb[nv.next()[0].node().node_id()]>2:
simple_transitions_solved+=1
trnsl.add_link(-nv.next()[0].node().node_id(),nv.next()[1].node().node_id(),nv.next()[1].distance()-nv.next()[0].node().size()-nv.next()[0].distance())
trnsl.remove_link(-nv.node_id(),nv.next()[1].node().node_id())
print("Simple transitions:",simple_transitions,simple_transitions_solved)
trnsl.remove_transitive_links(3)
trnsl.write_to_gfa1('BW6_pbthreads.gfa',selected_nodes=[x.node_id() for x in trnsl.get_all_nodeviews(include_disconnected=False)])
lines_stats(trnsl,1000)
def stride_between(s,n1,n2,verbose=False):
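    # Try four stride candidates: in-order and plain strides from n1 forwards and from -n2 backwards (reversed);
    # return the first candidate's node path from n1 to n2, or [] if none contains both in order.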
spnn=[s.stride_out_in_order(n1).nodes,[-x for x in s.stride_out_in_order(-n2).nodes[::-1]],s.stride_out(n1).nodes,[-x for x in s.stride_out(-n2).nodes[::-1]]]
for spn in spnn:
if verbose: print(spn)
if not (n1 in spn and n2 in spn): continue
in1=spn.index(n1)
in2=spn.index(n2)
if verbose: print(in1,in2)
if not in1<in2: continue
return spn[in1:in2+1]
return []
def line_to_sequence(dg,s,line,verbose=False):
seq=dg.get_nodeview(line[0]).sequence()
if verbose: print(len(seq))
strided=overlapped=jumped=disconnected=0
for i in range(len(line)-1):
n1=line[i]
nv1=dg.get_nodeview(n1)
n2=line[i+1]
nv2=dg.get_nodeview(n2)
if verbose: print("+",len(nv2.sequence()))
if dg.are_connected(-n1,n2):
d=[x.distance() for x in nv1.next() if x.node().node_id()==n2][0]
joined=False
if d<-60:
e1=nv1.sequence()[-200:]
e2=nv2.sequence()[:200]
for i in range (200,60,-1):
if e1[-i:]==e2[:i]:
if verbose: print("OVL ",i)
seq+=nv2.sequence()[i:]
joined=True
overlapped+=1
break
if not joined and d<1000:
spn=stride_between(s,n1,n2)
if spn!=[]:
strided+=1
seq+=SDG.SequenceDistanceGraphPath(dg.sdg, spn[1:]).sequence()[199:]
joined=True
if not joined:
if verbose: print("Jump ",d)
#TODO: try striding
seq+='N'*max(100,d)+nv2.sequence()
jumped+=1
else:
if verbose: print("Not connected ",d)
seq+='n'*1000+nv2.sequence()
disconnected+=1
if verbose: print(len(seq))
if verbose: print("strided",strided,"overlapped",overlapped,"jumped",jumped,"disconnected",disconnected)
return seq
lines=trnsl.get_all_lines(1,2000)
with open("BW6_pbthreads_lines.fasta","w") as f:
for i in range(len(lines)):
f.write(">line%d "%i+str(lines[i])+"\n")
f.write(line_to_sequence(trnsl,spe,lines[i])+"\n")
for i in range(3):
print("Solving multi-transitions")
    # Solve multi-transitions: like simple transitions, but allowing more than two successors.
simple_transitions=0
simple_transitions_solved=0
for nv in trnsl.get_all_nodeviews(include_disconnected=False,both_directions=True):
if len(nv.next())>=2:
if len(nv.next()[1].node().prev())==1 and len(nv.next()[0].node().prev())==1 and nv.next()[0].node().size()+nv.next()[0].distance()<nv.next()[1].distance()+1000:
simple_transitions+=1
nmb=nodes_matches_in_between(nv.node_id(),nv.next()[1].node().node_id())
if nmb[nv.next()[0].node().node_id()]>2:
simple_transitions_solved+=1
trnsl.add_link(-nv.next()[0].node().node_id(),nv.next()[1].node().node_id(),nv.next()[1].distance()-nv.next()[0].node().size()-nv.next()[0].distance())
trnsl.remove_link(-nv.node_id(),nv.next()[1].node().node_id())
print("Multi transitions:",simple_transitions,simple_transitions_solved)
trnsl.remove_transitive_links(3)
def count_connections(nv,n2):
return len([x for x in nv.next() if x.node().node_id()==n2])
print("Disconnecting low support crosses")
t=0
ec=0
for nv in trnsl.get_all_nodeviews(include_disconnected=False,both_directions=True):
if len(nv.prev())==1 and len(nv.next())==2:
nn0=nv.next()[0].node()
nn1=nv.next()[1].node()
if len(nn0.next())!=1 or len(nn1.next())!=1: continue
if len(nn0.prev())==2 and len(nn1.prev())==1:
nn1,nn0=nn0,nn1
if len(nn0.prev())==1 and len(nn1.prev())==2:
t+=1
o=[x.node().node_id() for x in nn1.prev() if x.node().node_id()!=nv.node_id()][0]
l1c= count_connections(trmldg2m.get_nodeview(nv.node_id()),nn0.node_id())
crossc= count_connections(trmldg2m.get_nodeview(nv.node_id()),nn1.node_id())
l2c= count_connections(trmldg2m.get_nodeview(o),nn1.node_id())
if l1c>6 and crossc<3 and l2c>6:
ec+=1
trnsl.remove_link(-nv.node_id(),nn1.node_id())
print("Low support crosses:",t,ec)
trnsl.remove_transitive_links(3)
print("Prunning tips with well-supported alternatives")
ec=0
for nv in trnsl.get_all_nodeviews(include_disconnected=False,both_directions=True):
if len(nv.prev())==1 and len(nv.next())>=2:
abort=False
to_keep=[]
to_delete=[]
for nl in nv.next():
if len(nl.node().prev())!=1:
abort=True
break
if len(nl.node().next())>0 and count_connections(trmldg2m.get_nodeview(nv.node_id()),nl.node().node_id())>2:
to_keep.append(nl)
elif nl.node().size()<2000 and len(nl.node().next())==0:
to_delete.append(nl)
if not abort and len(to_keep)==1 and len(to_delete)==len(nv.next())-1:
ec+=1
for ln in to_delete:
trnsl.remove_link(-nv.node_id(),ln.node().node_id())
print("nodes with tips fw prunned:",ec)
trnsl.remove_transitive_links(3)
trnsl.write_to_gfa1('BW6_pbthreads_cleaner.gfa',selected_nodes=[x.node_id() for x in trnsl.get_all_nodeviews(include_disconnected=False)])
def get_node_pos(nid):
try:
return node_pos[nid]
    except KeyError:
return (0,0,0)
def significant_next(lid,max_noise=2,min_ratio=5):
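    # Return the dominant next line for line lid, or 0 if any competing link has
    # count > max_noise or a count within a factor of min_ratio of the winner's.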
if lid>0:
if len(fw_links[lid])==0: return 0
wn,wc=fw_links[lid].most_common(1)[0]
for n,c in fw_links[lid].items():
if n!=wn:
if c>max_noise or c*min_ratio>wc:
return 0
return wn
else:
if len(bw_links[-lid])==0: return 0
wn,wc=bw_links[-lid].most_common(1)[0]
for n,c in bw_links[-lid].items():
if n!=wn:
if c>max_noise or c*min_ratio>wc:
return 0
return -wn
def get_next_line(lid,max_noise=2,min_ratio=5):
sn=significant_next(lid,max_noise,min_ratio)
if sn!=0 and significant_next(-sn,max_noise,min_ratio)==-lid:
return sn
return 0
def get_prev_line(lid,max_noise=2,min_ratio=5):
return -get_next_line(-lid,max_noise,min_ratio)
def metaline(lid,max_noise=2,min_ratio=5):
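    # Grow a metaline from lid: extend backwards then forwards through reciprocal
    # best links, falling back to [lid] if a cycle is detected.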
ml=[]
p=lid
while p!=0:
if p in ml or -p in ml: return [lid]
ml=[p]+ml
p=get_prev_line(ml[0],max_noise,min_ratio)
while ml[-1]!=0:
n=get_next_line(ml[-1],max_noise,min_ratio)
if n in ml or -n in ml: return [lid]
ml.append(n)
return ml[:-1]
##NEW METALINE DUMP
for min_line in [2000,5000]:
print("Constructing metalines from %dbp+ lines with skipped connections"%min_line)
lines=[[]]+[list(x) for x in trnsl.get_all_lines(1,min_line)]
node_pos={}
for lid in range(len(lines)):
ll=len(lines[lid])
for nidx in range(len(lines[lid])):
node_pos[lines[lid][nidx]]=(lid,nidx,-ll+nidx)
node_pos[-lines[lid][nidx]]=(-lid,ll-nidx,-nidx)
fw_links=[]
bw_links=[]
for i in range(len(lines)):
fw_links.append(Counter())
bw_links.append(Counter())
for nid in lines[i][:4]:
for pl in trmldg2m.get_nodeview(nid).prev():
plnp=get_node_pos(pl.node().node_id())
if plnp==(0,0,0): continue
if plnp[0]!=i: bw_links[-1][plnp[0]]+=1
for nid in lines[i][-4:]:
for nl in trmldg2m.get_nodeview(nid).next():
nlnp=get_node_pos(nl.node().node_id())
if nlnp==(0,0,0): continue
if nlnp[0]!=i: fw_links[-1][nlnp[0]]+=1
used=[False for x in lines]
metalines=[]
for i in range(1,len(lines)):
if used[i]: continue
metalines.append([])
for lid in metaline(i):
used[abs(lid)]=True
if lid>0:
metalines[-1]+=lines[lid]
else:
metalines[-1]+=[-x for x in lines[-lid][::-1]]
print("dumping %d metalines"%len(metalines))
with open("BW6_metalines%dk.fasta"%int(min_line/1000),"w") as f:
for i in range(len(metalines)):
f.write(">BW6_metaline%dk_%d "%(int(min_line/1000),i)+str(metalines[i])+"\n")
f.write(line_to_sequence(trnsl,spe,metalines[i])+"\n")
```
```python
def count_duplicatednode_lines(mlines):
c=Counter()
for l in mlines:
for x in l:
c[abs(x)]+=1
dl=[]
for l in mlines:
for x in l:
if c[abs(x)]>1:
dl.append(l)
break
print(len(dl))
```
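A hypothetical invocation, counting how many metalines contain a node that also appears in another line:
```python
count_duplicatednode_lines(metalines)
```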
```python
node_pos={}
for lid in range(len(lines)):
ll=len(lines[lid])
for nidx in range(len(lines[lid])):
node_pos[lines[lid][nidx]]=(lid,nidx,-ll+nidx)
node_pos[-lines[lid][nidx]]=(-lid,ll-nidx,-nidx)
def get_node_pos(nid):
try:
return node_pos[nid]
    except KeyError:
return (0,0,0)
def line_between_report(line):
for i in range(len(line)-1):
print("\n%d -> %d bp -> %d"%(line[i],trnsl.get_nodeview(line[i]).next()[0].distance(),line[i+1]))
mmc= [x for x in nodes_matches_in_between(line[i],line[i+1]).most_common() if x[1]>2]
for x in mmc:
cnv=trnsl.get_nodeview(x[0])
print(x[1],"hits to node",x[0],cnv.size(),"bp",len(cnv.next()+cnv.prev()))
print(spe.stride_out_in_order(line[i]).nodes)
print([-x for x in spe.stride_out_in_order(-line[i+1]).nodes[::-1]])
```
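A hypothetical invocation on the first line, inspecting the evidence at each join:
```python
# For every consecutive pair: distance, well-supported in-between matches, and PE strides from both ends.
line_between_report(lines[0])
```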
# OLD CODE: Merging both graphs
We can use a threading method that is very unreliable on its own, because we can always pop a node from the read later:
```python
def lines_stats(dg,min_size=1000):
used_nodes=set()
line_sizes=[]
for nv in dg.get_all_nodeviews():
n=nv
if n.node_id() in used_nodes: continue
line_nodes=set()
while n.node_id() not in line_nodes and len(n.prev())==1 and len(n.prev()[0].node().next())==1:
line_nodes.add(n.node_id())
n=n.prev()[0].node()
linebp=n.size()
used_nodes.add(abs(n.node_id()))
line_nodes=set()
while n.node_id() not in line_nodes and len(n.next())==1 and len(n.next()[0].node().prev())==1:
line_nodes.add(n.node_id())
linebp+=n.next()[0].distance()
n=n.next()[0].node()
linebp+=n.size()
used_nodes.add(abs(n.node_id()))
if linebp>=min_size:
line_sizes.append(linebp)
line_sizes.sort(reverse=True)
total_bp=sum(line_sizes)
print("Total lines size %d bp\n"%total_bp)
next_stop=0
acc=0
for count,size in enumerate(line_sizes,1):
acc+=size
if acc>=next_stop*total_bp/100:
print("N%d: %d bp at line # %d"%(next_stop,size,count))
next_stop+=10
from collections import Counter
#recruit on 2 matches (for later), create threads on 2 matches, make graph
print("recruiting")
lrr.recruit(1,2)
print("threading reads")
lrr.thread_reads(20000,2)
print("Constructing dg from threads")
trmldg2m=lrr.dg_from_threads(True)
#getting rid of all links not supported by at least two threads, self and big-overlap links, and links to inversions of better links
print("Cleaning links")
for nv in trmldg2m.get_all_nodeviews(both_directions=True):
for l in nv.next():
if l.distance()<-500 or abs(l.node().node_id())==abs(nv.node_id()):
trmldg2m.remove_link(-nv.node_id(),l.node().node_id(),l.distance(),l.support())
onf=Counter([x.node().node_id() for x in nv.next()])
for onid in onf.keys():
if onf[onid]<2: trmldg2m.remove_link(-nv.node_id(),onid)
if onf[onid]<onf[-onid]: trmldg2m.remove_link(-nv.node_id(),onid)
print("Untangling linkage between anchors")
#Select anchor nodes by size and kci, create graph
lu=SDG.LinkageUntangler(trmldg2m)
for nv in trmldg2m.get_all_nodeviews(include_disconnected=False):
if nv.size()>500 and nv.kci()<1.5:
lu.selected_nodes[nv.node_id()]=True
trnsl=lu.make_nextselected_linkage(2)
trnsl.remove_transitive_links(1)
trnsl.write_to_gfa1('trnsl.gfa',selected_nodes=[x.node_id() for x in trnsl.get_all_nodeviews(include_disconnected=False)])
lines_stats(trnsl,1000)
# GRAPH linearisation heuristics
#find matches to other nodes between two nodes (in order) in the LR.
def nodes_matches_in_between(n1,n2):
tc=Counter()
for rid in set([abs(x) for x in lrr.node_reads[abs(n1)] + lrr.node_reads[abs(n2)]]):
fwg=False
c=Counter()
bwg=False
for pm in lrr.read_perfect_matches[rid]:
if pm.node==n1:
c=Counter()
fwg=True
bwg=False
elif pm.node==-n2:
c=Counter()
fwg=False
bwg=True
elif pm.node==-n1:
if bwg: tc+=c
bwg=fwg=False
c=Counter()
elif pm.node==n2:
if fwg: tc+=c
bwg=fwg=False
c=Counter()
elif fwg: c[pm.node]+=1
elif bwg: c[-pm.node]+=1
return tc
print("Solving simple tips")
# Solve simple tips, where the tip is connected to a node and comes before the other connection.
simple_tips=0
simple_tips_solved=0
for nv in trnsl.get_all_nodeviews(include_disconnected=False,both_directions=True):
if len(nv.next())==2:
if len(nv.next()[0].node().next())==0 and len(nv.next()[1].node().prev())==1 and nv.next()[0].node().size()+nv.next()[0].distance()<nv.next()[1].distance()+1000:
simple_tips+=1
nmb=nodes_matches_in_between(nv.node_id(),nv.next()[1].node().node_id())
if nmb[nv.next()[0].node().node_id()]>2:
simple_tips_solved+=1
trnsl.add_link(-nv.next()[0].node().node_id(),nv.next()[1].node().node_id(),nv.next()[1].distance()-nv.next()[0].node().size()-nv.next()[0].distance())
trnsl.remove_link(-nv.node_id(),nv.next()[1].node().node_id())
print("Simple tips:",simple_tips,simple_tips_solved)
trnsl.write_to_gfa1('trnsl_tj.gfa',selected_nodes=[x.node_id() for x in trnsl.get_all_nodeviews(include_disconnected=False)])
lines_stats(trnsl,1000)
print("Solving simple bubbles")
#Solve simple bubbles created by two small nodes who connect to their big neighbours
simple_bubbles=0
simple_bubbles_solved=0
for nv in trnsl.get_all_nodeviews(include_disconnected=False,both_directions=True):
if len(nv.next())==2 and len(nv.next()[0].node().parallels())==1 and nv.next()[0].node().parallels()[0].node_id()==nv.next()[1].node().node_id():
simple_bubbles+=1
nmb=nodes_matches_in_between(nv.node_id(),nv.next()[1].node().node_id())
if nmb[nv.next()[0].node().node_id()]>2:
simple_bubbles_solved+=1
trnsl.add_link(-nv.next()[0].node().node_id(),nv.next()[1].node().node_id(),nv.next()[1].distance()-nv.next()[0].node().size()-nv.next()[0].distance())
trnsl.remove_link(-nv.next()[0].node().node_id(),nv.next()[1].node().next()[0].node().node_id())
trnsl.remove_link(-nv.node_id(),nv.next()[1].node().node_id())
print("Simple bubbles:",simple_bubbles,simple_bubbles_solved)
trnsl.write_to_gfa1('trnsl_tj_bp.gfa',selected_nodes=[x.node_id() for x in trnsl.get_all_nodeviews(include_disconnected=False)])
lines_stats(trnsl,1000)
for i in range(3):
print("Solving simple transition")
# Solve simple tips, where the tip is connected to a node and comes before the other connection.
simple_transitions=0
simple_transitions_solved=0
for nv in trnsl.get_all_nodeviews(include_disconnected=False,both_directions=True):
if len(nv.next())==2:
if len(nv.next()[1].node().prev())==1 and len(nv.next()[0].node().prev())==1 and nv.next()[0].node().size()+nv.next()[0].distance()<nv.next()[1].distance()+1000:
simple_transitions+=1
nmb=nodes_matches_in_between(nv.node_id(),nv.next()[1].node().node_id())
if nmb[nv.next()[0].node().node_id()]>2:
simple_transitions_solved+=1
trnsl.add_link(-nv.next()[0].node().node_id(),nv.next()[1].node().node_id(),nv.next()[1].distance()-nv.next()[0].node().size()-nv.next()[0].distance())
trnsl.remove_link(-nv.node_id(),nv.next()[1].node().node_id())
print("Simple transitions:",simple_transitions,simple_transitions_solved)
trnsl.write_to_gfa1('trnsl_tj_bp_st.gfa',selected_nodes=[x.node_id() for x in trnsl.get_all_nodeviews(include_disconnected=False)])
lines_stats(trnsl,1000)
#TODO: more aggressive here!
print("Solving and discarding simple tips")
# Solve simple tips, where the tip is connected to a node and comes before the other connection.
simple_tips=0
simple_tips_solved=0
simple_tips_discarded=0
for nv in trnsl.get_all_nodeviews(include_disconnected=False,both_directions=True):
if len(nv.next())==2:
if len(nv.next()[0].node().next())==0 and len(nv.next()[1].node().prev())==1 and nv.next()[0].node().size()+nv.next()[0].distance()<nv.next()[1].distance()+1000:
simple_tips+=1
nmb=nodes_matches_in_between(nv.node_id(),nv.next()[1].node().node_id())
if nmb[nv.next()[0].node().node_id()]>2:
simple_tips_solved+=1
trnsl.add_link(-nv.next()[0].node().node_id(),nv.next()[1].node().node_id(),nv.next()[1].distance()-nv.next()[0].node().size()-nv.next()[0].distance())
trnsl.remove_link(-nv.node_id(),nv.next()[1].node().node_id())
elif nv.next()[0].node().size()<2000:
simple_tips_discarded+=1
trnsl.remove_link(-nv.node_id(),nv.next()[0].node().node_id())
print("Simple tips:",simple_tips,simple_tips_solved)
trnsl.write_to_gfa1('trnsl_tj_bp_st_tj.gfa',selected_nodes=[x.node_id() for x in trnsl.get_all_nodeviews(include_disconnected=False)])
lines_stats(trnsl,1000)
```
```python
node_pos={}
for lid in range(len(lines)):
ll=len(lines[lid])
for nidx in range(len(lines[lid])):
node_pos[lines[lid][nidx]]=(lid,nidx,-ll+nidx)
node_pos[-lines[lid][nidx]]=(-lid,ll-nidx,-nidx)
def get_node_pos(nid):
try:
return node_pos[nid]
    except KeyError:
return (0,0,0)
for x in trmldg2m.get_nodeview(lines[0][-2]).next(): print(x,x.node().size(),x.node().kci(),get_node_pos(x.node().node_id()))
for x in trmldg2m.get_nodeview(lines[0][-1]).next(): print(x,x.node().size(),x.node().kci(),get_node_pos(x.node().node_id()))
def other_line_connectivity_report(lid):
print("\nother line connectivity for line",lid)
print('seq'+',seq'.join([str(abs(x)) for x in lines[lid]]))
print("BW")
for nid in lines[lid]:
c=Counter([get_node_pos(x.node().node_id())[0] for x in trmldg2m.get_nodeview(nid).prev()])
print(c.most_common())
print("FW")
for nid in lines[lid]:
c=Counter([get_node_pos(x.node().node_id())[0] for x in trmldg2m.get_nodeview(nid).next()])
print(c.most_common())
for nid in nodes_3b:
cb=Counter([get_node_pos(x.node().node_id())[0] for x in trmldg2m.get_nodeview(nid).prev()])
cf=Counter([get_node_pos(x.node().node_id())[0] for x in trmldg2m.get_nodeview(nid).next()])
print(get_node_pos(nid),nid,"BW:",cb.most_common(),"FW:",cf.most_common())
```
```python
for nid in nodes_3b:
print("\nNode",nid, len(ws.sdg.get_nodeview(nid).prev()), len(ws.sdg.get_nodeview(nid).next()))
print("BW:")
for l in [(x.node().node_id(),x.distance()) for x in trmldg2m.get_nodeview(nid).prev()]: print(l)
if nid>0: print([(-x.dest,x.dist) for x in s.links_bw[nid]])
else: print([(-x.dest,x.dist) for x in s.links_fw[-nid]])
print("FW:")
for l in [(x.node().node_id(),x.distance()) for x in trmldg2m.get_nodeview(nid).next()]: print(l)
if nid>0: print([(x.dest,x.dist) for x in s.links_fw[nid]])
else: print([(x.dest,x.dist) for x in s.links_bw[-nid]])
for nid in nodes_3b:
print("\nNode",nid, len(ws.sdg.get_nodeview(nid).prev()), len(ws.sdg.get_nodeview(nid).next()))
print("BW:")
for l in [(x.node().node_id(),x.distance()) for x in trnsl.get_nodeview(nid).prev()]: print(l)
print("FW:")
for l in [(x.node().node_id(),x.distance()) for x in trnsl.get_nodeview(nid).next()]: print(l)
```
```python
def dump_lrdg_to_text(dg,filename):
with open(filename,"w") as f:
for nv in dg.get_all_nodeviews( both_directions=True, include_disconnected=False):
for l in nv.next():
if -nv.node_id()<l.node().node_id():
f.write("%d %d %d\n"%(-nv.node_id(),l.node().node_id(),l.distance()))
def load_lrdg_from_text(dg,filename):
    for l in open(filename).readlines():
        if not l.strip(): continue
        il=[int(x) for x in l.strip().split()]
        dg.add_link(il[0],il[1],il[2])
def clone_dg(odg):
dg=SDG.DistanceGraph(odg.sdg)
for nv in odg.get_all_nodeviews( both_directions=True, include_disconnected=False):
for l in nv.next():
if -nv.node_id()<l.node().node_id():
dg.add_link(-nv.node_id(),l.node().node_id(),l.distance(),l.support())
return dg
```
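A minimal round-trip sketch, assuming `trnsl` and `ws` from above are in scope (the filename is hypothetical):
```python
# Persist the untangled graph's links as text, rebuild them onto a fresh
# DistanceGraph over the same SDG, and keep an in-memory copy as backup.
dump_lrdg_to_text(trnsl, 'trnsl_links.txt')
restored = SDG.DistanceGraph(ws.sdg)
load_lrdg_from_text(restored, 'trnsl_links.txt')
backup = clone_dg(trnsl)
```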
# Pacbio scaffolding procedure:
## First scaffolding round (on the w2rap graph)
1) Take the contigs_raw gfa from w2rap and create an SDG workspace with EI PE and PB datastores.
2) Generate a kmer counter using EI and BASF PE reads, add to workspace.
3) Calculate KCI.
4) Path PE reads (k=63).
5) Map long reads and recruit, run strider.
```python
#This runs all steps 1-5, where there's no uncertainty
import SDGpython as SDG
ws = SDG.WorkSpace('basf.pb.pe.sdgws')
kc=ws.get_kmer_counter("all_pcrf_reads")
kc.set_kci_peak(88)
kc.compute_all_kcis()
peds = ws.get_paired_reads_datastore('ei_pe')
peds.mapper.path_reads(63,200)
lords = ws.get_long_reads_datastore('pacbio')
lrr=SDG.LongReadsRecruiter(ws.sdg,lords,31,50)
lrr.map(31)
lrr.recruit(1,2)
s=SDG.Strider(ws)
s.add_datastore(lrr)
s.link_from_anchors(1000,.5,1.25,10000,2,2,0)
```
6) Create an LDG from strider links, find ALL LINES >1Kbp, stride between consecutive nodes, dump linear consensus.
```python
#This loads all steps 1-5, where there's no uncertainty
import SDGpython as SDG
ws = SDG.WorkSpace('basf.pb.pe.kc.rf.sdgws')
print ("Calculating/loading KCI")
kc=ws.get_kmer_counter("all_pcrf_reads")
kc.load_cache('kc.dump')
print("Load PE mappings")
peds = ws.get_paired_reads_datastore('ei_pe')
peds.mapper.load_readpaths('pe_read_paths_k63.dump')
# run PE strider
spe=SDG.Strider(ws)
spe.add_datastore(peds)
########################
# run lrr on pacbio reads
print ("Loading mapped PB reads and recruiting")
lords = ws.get_long_reads_datastore('pacbio')
k=31
lrr=SDG.LongReadsRecruiter(ws.sdg,lords,k,50)
lrr.load("lrr_pb_k{0}_all_v3_r3.dump".format(k))
# long read strider
print("Run LRR")
s=SDG.Strider(ws)
s.add_datastore(lrr)
s.link_from_anchors(1000,.75,1.25,10000,2,2,0)
# min_size, min_kci, max_kci, distance, min_reads, group_size, small_node_size
```
Creating a graph:
```python
###Option A: only create reciprocal links:
def check_link_in_strider(s,end1,end2):
r=[False,False]
if end1<0:
if [1 for l in s.links_fw[abs(end1)] if l.source==end1 and l.dest==end2]: r[0]=True
else:
if [1 for l in s.links_bw[abs(end1)] if l.source==end1 and l.dest==end2]: r[0]=True
if end2<0:
if [1 for l in s.links_fw[abs(end2)] if l.source==end2 and l.dest==end1]: r[1]=True
else:
if [1 for l in s.links_bw[abs(end2)] if l.source==end2 and l.dest==end1]: r[1]=True
return r
def reciprocal_linked_anchors_dg(sdg,s):
ag=SDG.DistanceGraph(ws.sdg)
for nid,is_anchor in enumerate(s.is_anchor):
if is_anchor:
for l in s.links_bw[nid]+s.links_fw[nid]:
if nid<abs(l.dest) and check_link_in_strider(s,l.source,l.dest)==[True,True]:
ag.add_link(l.source, l.dest, l.dist)
return ag
```
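A sketch of step 6 on top of Option A, assuming `line_to_sequence` and the PE strider `spe` from the first section are in scope (the output filename is hypothetical):
```python
# Build the reciprocal-links anchor graph, extract all 1Kbp+ lines and dump
# their linear consensus using line_to_sequence.
anchor_dg = reciprocal_linked_anchors_dg(ws.sdg, s)
anchor_dg.remove_transitive_links(3)
lines = anchor_dg.get_all_lines(1, 1000)
with open('round1_lines.fasta', 'w') as f:
    for i, line in enumerate(lines):
        f.write('>line%d %s\n' % (i, str(line)))
        f.write(line_to_sequence(anchor_dg, spe, line) + '\n')
```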
## Second scaffolding round (on lines)
1) Load the lines FASTA into SDG as a graph of disconnected nodes (use lines with A's rather than N's).
2) Add EI PE and PB datastores.
3) Create KCI (skip on the TEST); check in the test whether there are repeated nodes.
4) Map long reads and recruit (same parameters as before), run strider.
5) Create an LDG and dump lines with Ns; never try to recover connectivity here, as it is TAINTED by the fact that we threw all repeats away (use lowercase ns to indicate these joins).
6) Replace stretches of 100 or more 'A's with 'N's (see the sketch after the code below).
```python
ws.sdg.load_from_fasta("anchor_graph.fa")
ws.add_paired_reads_datastore("")  # datastore paths elided in the original notes
ws.add_long_reads_datastore("")
ws.add_kmer_counter("")
kc=ws.get_kmer_counter("all_pcrf_reads")
kc.set_kci_peak(88)
kc.compute_all_kcis()
lords = ws.get_long_reads_datastore('pacbio')
lrr=SDG.LongReadsRecruiter(ws.sdg,lords,31,50)
lrr.map(31)
lrr.recruit(1,2)
```
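For step 6, a minimal sketch (the helper name is hypothetical) that turns the A-run spacers back into explicit gaps:
```python
import re

# Replace every run of 100+ 'A's (the spacer used when loading the lines FASTA)
# with the same number of 'N's, so downstream tools treat them as gaps.
def mask_a_runs(seq, min_run=100):
    return re.sub('A{%d,}' % min_run, lambda m: 'N' * len(m.group(0)), seq)
```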
# Code dumps
```python
#BIG IDEA: tangle lr filter: take all reads recruited to the frontiers, filter their matches to only include internal nodes from the tangle, then tap?
def tangle_filtered_lrs(t,ws,lrr):
print("\n\nAnaysing tangle with %d internals and %d frontiers")
print('\nINTERNALS: seq'+',seq'.join([str(abs(x.node_id())) for x in t.internals]))
print('\nFRONTIERS: seq'+',seq'.join([str(abs(x.node_id())) for x in t.frontiers])+"\n")
for f in t.frontiers: print(f,f.kci())
pmms=SDG.PerfectMatchesFilter(ws)
nids=set()
for x in list(t.internals)+list(t.frontiers):
nids.add(x.node_id())
nids.add(-x.node_id())
onids=[-x.node_id() for x in t.frontiers]
for f in t.frontiers:
fdest=Counter()
print("\n\nFrontier ", f,len(lrr.node_reads[abs(f.node_id())]))
for rid in lrr.node_reads[abs(f.node_id())]:
if f.node_id()>0: rid=-rid
#print("\nRead ",rid," RAW")
#for x in lrr.read_perfect_matches[abs(rid)]: print(x)
if rid>0:
om=lrr.read_perfect_matches[abs(rid)]
else:
om=lrr.reverse_perfect_matches(lrr.read_perfect_matches[abs(rid)])
#print("\nRead ",rid," oriented matches")
#for x in om: print(x)
mfn=[x for x in pmms.matches_fw_from_node(-f.node_id(),pmms.truncate_turnaround(om)) if x.node in nids]
#print(len(mfn))
mfn=[x for x in mfn if x.node in nids]
#print(len(mfn))
if len(mfn)==0 or mfn[0].node==f.node_id(): continue
for x in mfn:
if x.node in onids:
fdest[x.node]+=1
print("\nRead ",rid)
for x in mfn: print(x)
print(fdest.most_common())
```
```python
lrr.recruit(1,2)
tangle_filtered_lrs(t,ws,lrr)
```
```python
def print_path(l):
for nid in l:
print(nid, ws.sdg.get_nodeview(nid).size())
def test_node(nid):
print("\n\nNode connectivity for node",nid)
print("\nBW in anchor_graph")
for lf in anchor_graph.get_nodeview(nid).prev():
print(lf,lf.node().size())
print("stride_out_in_order (bw reversed)")
for nn in spe.stride_out_in_order(-nid).nodes[1:][::-1]:
print(-nn,ws.sdg.get_nodeview(nn).size())
print("\nFW in anchor_graph")
for lf in anchor_graph.get_nodeview(nid).next():
print(lf,lf.node().size())
print("stride_out_in_order")
for nn in spe.stride_out_in_order(nid).nodes[1:]:
print(nn,ws.sdg.get_nodeview(nn).size())
def test_node2(nid):
print("\n\nNode connectivity for node",nid)
npairs=set([int((abs(x)+1)/2)*2 for x in peds.mapper.paths_in_node[abs(nid)]])
print("\nBW in anchor_graph")
for lf in anchor_graph2.get_nodeview(nid).prev():
fpairs=set([int((abs(x)+1)/2)*2 for x in peds.mapper.paths_in_node[abs(lf.node().node_id())]])
print(lf,lf.node().size(),"\t\t%d shared pairs"%len(fpairs.intersection(npairs)))
print("\nBW in multi graph")
for lf in anchor_graph_multi.get_nodeview(nid).prev():
fpairs=set([int((abs(x)+1)/2)*2 for x in peds.mapper.paths_in_node[abs(lf.node().node_id())]])
print(lf,lf.node().size(),"\t\t%d shared pairs"%len(fpairs.intersection(npairs)))
print("stride_out_in_order (bw reversed)")
for nn in spe.stride_out_in_order(-nid).nodes[1:][::-1]:
print(-nn,ws.sdg.get_nodeview(nn).size())
print("\nFW in anchor_graph")
for lf in anchor_graph2.get_nodeview(nid).next():
fpairs=set([int((abs(x)+1)/2)*2 for x in peds.mapper.paths_in_node[abs(lf.node().node_id())]])
print(lf,lf.node().size(),"\t\t%d shared pairs"%len(fpairs.intersection(npairs)))
print("\nFW in multi graph")
for lf in anchor_graph_multi.get_nodeview(nid).next():
fpairs=set([int((abs(x)+1)/2)*2 for x in peds.mapper.paths_in_node[abs(lf.node().node_id())]])
print(lf,lf.node().size(),"\t\t%d shared pairs"%len(fpairs.intersection(npairs)))
print("stride_out_in_order")
for nn in spe.stride_out_in_order(nid).nodes[1:]:
print(nn,ws.sdg.get_nodeview(nn).size())
```
```python
to_bridge=set()
for fnv in anchor_graph2.get_all_nodeviews():
for nv in [fnv,fnv.rc()]:
if len(nv.next())==1 and len(nv.next()[0].node().prev())==1:
nid=-nv.node_id()
onid=nv.next()[0].node().node_id()
to_bridge.add((min(nid,onid),max(nid,onid)))
```
```python
valid=invalid=too_short=not_ovl=0
#for n1,n2 in list(to_bridge)[:10000]:
for n1,n2 in list(to_bridge):
nv=anchor_graph2.get_nodeview(-n1)
d=nv.next()[0].distance()
if d>-1: not_ovl+=1
elif d>-20: too_short+=1
elif d<-200: invalid+=1
else:
nvs=nv.sequence()
nns=nv.next()[0].node().sequence()
for pd in range(max(-199,d-50),min(-20,d+50)):
if nvs[pd:]==nns[:-pd]:
valid+=1
break
else:
invalid+=1
print(valid,invalid,too_short,not_ovl)
```
```python
def lines_stats(dg,min_size=1000):
used_nodes=set()
line_sizes=[]
for nv in dg.get_all_nodeviews():
n=nv
if n.node_id() in used_nodes: continue
line_nodes=set()
while n.node_id() not in line_nodes and len(n.prev())==1 and len(n.prev()[0].node().next())==1:
line_nodes.add(n.node_id())
n=n.prev()[0].node()
linebp=n.size()
used_nodes.add(abs(n.node_id()))
line_nodes=set()
while n.node_id() not in line_nodes and len(n.next())==1 and len(n.next()[0].node().prev())==1:
line_nodes.add(n.node_id())
linebp+=n.next()[0].distance()
n=n.next()[0].node()
linebp+=n.size()
used_nodes.add(abs(n.node_id()))
if linebp>=min_size:
line_sizes.append(linebp)
line_sizes.sort(reverse=True)
total_bp=sum(line_sizes)
print("Total lines size %d bp\n"%total_bp)
next_stop=0
acc=0
for count,size in enumerate(line_sizes,1):
acc+=size
if acc>=next_stop*total_bp/100:
print("N%d: %d bp at line # %d"%(next_stop,size,count))
next_stop+=10
```
```python
def dump_lines(dg,filename,min_size=1000,spacing="N"*100):
with open(filename,"w") as f:
used_nodes=set()
line_sizes=[]
for nv in dg.get_all_nodeviews():
n=nv
if n.node_id() in used_nodes: continue
line_nodes=set()
while n.node_id() not in line_nodes and len(n.prev())==1 and len(n.prev()[0].node().next())==1:
line_nodes.add(n.node_id())
n=n.prev()[0].node()
linebp=n.size()
used_nodes.add(abs(n.node_id()))
lineseq=n.sequence()
line_nodes=set()
while n.node_id() not in line_nodes and len(n.next())==1 and len(n.next()[0].node().prev())==1:
line_nodes.add(n.node_id())
linebp+=n.next()[0].distance()
n=n.next()[0].node()
linebp+=n.size()
lineseq+=spacing+n.sequence()
used_nodes.add(abs(n.node_id()))
if linebp>=min_size:
line_sizes.append(linebp)
f.write(">%d\n%s\n"%(nv.node_id(),lineseq))
line_sizes.sort(reverse=True)
total_bp=sum(line_sizes)
print("Total lines size %d bp\n"%total_bp)
next_stop=0
acc=0
for count,size in enumerate(line_sizes,1):
acc+=size
if acc>=next_stop*total_bp/100:
print("N%d: %d bp at line # %d"%(next_stop,size,count))
next_stop+=10
```
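Hypothetical usage, writing every 1Kbp+ line with the default 100-N spacing:
```python
# Each qualifying line becomes one FASTA record, named after its seed node.
dump_lines(trnsl, 'trnsl_lines.fasta', min_size=1000)
```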
```python
for nv in anchor_graph2.get_all_nodeviews():
for l in list(nv.next())+list(nv.prev()):
if l.distance()<-500 or abs(l.node().node_id())==nv.node_id():
anchor_graph2.remove_link(-nv.node_id(),l.node().node_id())
```
```python
def matches_to_links(dg, matches, READSIZE=0, ENDSIZE=4000, ENDMATCHES=2, MAXDIFF=.1, MAXOVLP=1000, verbose=False):
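    # Project each node onto the read: matches near a node's start give a projected
    # node start position on the read, matches near its end give a projected end;
    # every projected end is then linked to every projected start following it within MAXOVLP.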
nstarts=[]
nends=[]
nodes=set([m.node for m in matches])
if READSIZE==0: READSIZE=matches[-1].read_position
for node in nodes:
nmatches = [m for m in matches if m.node==node]
smatches = [m for m in nmatches if m.node_position<ENDSIZE]
ematches = [m for m in nmatches if m.node_position>dg.sdg.get_node_size(node) - ENDSIZE]
if verbose and len(nmatches)>ENDMATCHES:
print(node, dg.sdg.get_node_size(node), len(nmatches), len(smatches), len(ematches))
if len(smatches) >= ENDMATCHES:
nstart = smatches[0].read_position - smatches[0].node_position
else:
nstart = None
if len(ematches) >= ENDMATCHES:
nend = ematches[-1].read_position + dg.sdg.get_node_size(node) - ematches[-1].node_position
else:
nend = None
if nstart is not None and nend is not None:
if abs(nend - nstart - dg.sdg.get_node_size(node))/dg.sdg.get_node_size(node) < MAXDIFF:
nstarts.append((nstart,node))
nends.append((nend,node))
if nstart is None and nend is not None:
if nend - dg.sdg.get_node_size(node) < 0:
nends.append((nend,node))
if nstart is not None and nend is None:
if nstart + dg.sdg.get_node_size(node) > READSIZE:
nstarts.append((nstart,node))
if verbose: print(nends,nstarts)
for nend in nends:
for nstart in nstarts:
if nstart[0] - nend[0] >= -MAXOVLP:
if verbose: print(-nend[1], nstart[1], nstart[0] - nend[0])
dg.add_link( -nend[1], nstart[1], nstart[0] - nend[0] )
def analyse_recruited_reads(lrr,nodes):
rids=set()
for n in nodes:
rids.update(lrr.node_reads[abs(n)])
dg=SDG.DistanceGraph(ws.sdg)
for rid in rids:
#TODO: break looping reads
matches_to_links(dg,lrr.read_perfect_matches[rid],ENDMATCHES=2)
return dg
```
```python
def compare_graph_connectivity(g1,g2,min_size=1000):
both=only_g1=only_g2=0
for nv1 in g1.get_all_nodeviews():
if nv1.size()<min_size: continue
for ln in nv1.next():
if ln.node().size()<min_size: continue
if g2.are_connected(-nv1.node_id(),ln.node().node_id()):
both+=1
else:
only_g1+=1
for lp in nv1.prev():
if lp.node().size()<min_size: continue
if g2.are_connected(nv1.node_id(),-lp.node().node_id()):
both+=1
else:
only_g1+=1
for nv2 in g2.get_all_nodeviews():
if nv2.size()<min_size: continue
for ln in nv2.next():
if ln.node().size()<min_size: continue
if not g1.are_connected(-nv2.node_id(),ln.node().node_id()):
only_g2+=1
for lp in nv2.prev():
if lp.node().size()<min_size: continue
if not g1.are_connected(nv2.node_id(),-lp.node().node_id()):
only_g2+=1
print("both:",both," only_g1:",only_g1," only_g2:",only_g2)
def combine_graph_connectivity(g1,g2):
g=SDG.DistanceGraph(ws.sdg)
for gx in [g1,g2]:
for nv in gx.get_all_nodeviews():
for ln in nv.next():
if not g.are_connected(-nv.node_id(),ln.node().node_id()):
g.add_link(-nv.node_id(),ln.node().node_id(),ln.distance())
for lp in nv.prev():
if not g.are_connected(nv.node_id(),-lp.node().node_id()):
g.add_link(nv.node_id(),-lp.node().node_id(),lp.distance())
return g
```
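A hypothetical comparison between the untangled next-selected graph and the raw thread graph, followed by a graph combining links from both:
```python
# Shared vs graph-specific links for 1Kbp+ nodes, then a merged graph.
compare_graph_connectivity(trnsl, trmldg2m, min_size=1000)
combined = combine_graph_connectivity(trnsl, trmldg2m)
```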
```python
#ultimate graph connectivity:
#start from reciprocal Strider links
#add non-reciprocal links if those linearise the graph (i.e. missed connections)
#add thread-created links if those linearise the graph
rladg=reciprocal_linked_anchors_dg(ws.sdg,s)
rladg.remove_transitive_links(3)
lrr.thread_reads(2000,2)
def classify_nodes(dg,min_size=0):
disconnected=linear=branching=tips=multi_tips=repeats=0
for nv in dg.get_all_nodeviews():
if nv.size()<min_size: continue
cn=len(nv.next())
cp=len(nv.prev())
if cp==0 and cn==0:
disconnected+=1
elif cp==0 or cn==0:
if cp==1 or cn==1:
tips+=1
else:
multi_tips+=1
elif cp==1 and cn==1:
linear+=1
elif cp>1 and cn>1:
repeats+=1
else:
branching+=1
print("disconnected:",disconnected," linear:",linear," branching:",branching," tips:",tips," multi_tips:",multi_tips," repeats:",repeats)
```
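Hypothetical usage on the reciprocal-links graph built in the same block:
```python
# Node-class breakdown for 1Kbp+ nodes after transitive reduction.
classify_nodes(rladg, min_size=1000)
```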
```python
#add missed transitive connections
from statistics import median
for fnv in g3.get_all_nodeviews():
for nv in [fnv,fnv.rc()]:
if len(nv.next())==2:
if not g3.are_connected(-nv.next()[0].node().node_id(),nv.next()[1].node().node_id()) and thdg.are_connected(-nv.next()[0].node().node_id(),nv.next()[1].node().node_id()):
d=int(median([x.distance() for x in thdg.get_nodeview(nv.next()[0].node().node_id()).next() if x.node().node_id()==nv.next()[1].node().node_id()]))
g3.add_link(-nv.next()[0].node().node_id(),nv.next()[1].node().node_id(),d)
#validate tips next!
#do the tips have connections on threads? and on strider links?
tconn=notconn=0
from statistics import median
for fnv in g3.get_all_nodeviews():
for nv in [fnv,fnv.rc()]:
if len(nv.prev())==1 and len(nv.next())==0:
if len(thdg.get_nodeview(nv.node_id()).next())>0:
tconn+=1
print('\n',nv)
for x in thdg.get_nodeview(nv.node_id()).next():
print(x)
if nv.node_id()>0:
for x in s2.links_fw[nv.node_id()]: print(x)
else:
for x in s2.links_bw[-nv.node_id()]: print(x)
if tconn>20: break
else: notconn+=1
if tconn>20: break
print(tconn,notconn)
#evaluate effect of reconnecting tips and connecting currently disconnected ends, when connection is unique forward
def extend_tips(sdg,dg1,dg2):
g=SDG.DistanceGraph(sdg)
connected_ends=reconnected_tips=joining_lines=0
for fnv in dg1.get_all_nodeviews():
for nv in [fnv,fnv.rc()]:
for ln in nv.next():
if not g.are_connected(-nv.node_id(),ln.node().node_id()):
g.add_link(-nv.node_id(),ln.node().node_id(),ln.distance())
if len(nv.prev())==1 and len(nv.next())==0:
tn=dg2.get_nodeview(nv.node_id()).next()
if len(tn)==1 and len(tn[0].node().prev())==1 and tn[0].node().kci()>.5 and tn[0].node().kci()<1.5:
nnid=tn[0].node().node_id()
dg1pnext=nv.prev()[0].node().next()
dg1nprev=dg1.get_nodeview(nnid).prev()
dg1nnext=dg1.get_nodeview(nnid).next()
if len(dg1nprev)==0 and len(dg1pnext)==1:
connected_ends+=1
if len(dg1nnext)==1: joining_lines+=1
g.add_link(-nv.node_id(),nnid,tn[0].distance())
if len(dg1nprev)==1 and len(dg1pnext)==2 and dg1.are_connected(-nv.prev()[0].node().node_id(),nnid):
reconnected_tips+=1
g.add_link(-nv.node_id(),nnid,tn[0].distance())
print(connected_ends,joining_lines,reconnected_tips)
def annihilate_tips(dg):
c=0
for fnv in dg.get_all_nodeviews():
for nv in [fnv,fnv.rc()]:
if len(nv.prev())==1 and len(nv.next())==2:
n1=nv.next()[0]
n2=nv.next()[1]
if len(n1.node().next())>0: n2,n1=n1,n2
if len(n1.node().next())==0 and len(n2.node().next())>0 and n1.distance()+n1.node().size()<n2.distance()+500:
dg.remove_link(-nv.node_id(),n1.node().node_id())
c+=1
print(c)
u=SDG.LinkageUntangler(mthdg)
u.select_all()
mthnsl2=u.make_nextselected_linkage(2)
```