# Strawberry Sort: clean version
**Note: this note is to be maintained by bj only; please do not muddle with it. It will contain the current "best version" of the full RGxHapil strawberry pipeline.**
Index:
[toc]
---
## Step 1: DBG and anchor runs
The idea behind this step is two-fold:
- Create an "inventory" of the 31-mer spectrum that will be used in subsequent steps to define what a "unique anchor" is. The unique anchors are unitig regions whose 31-mers all have frequencies between MIN_UKMER_FREQ and MAX_UKMER_FREQ (i.e. a "single peak").
- To make the graph simpler to start with, we create unitigs from a k=63 DBG built with a minimum k-mer coverage cutoff (MIN_CVG) to reduce noise, and then clip short tips.
```python=
import sys
sys.path.append('/hpc-home/ggarcia/git_sources/bsg/build/SDGpython')
import SDGpython as SDG
import os
MIN_UKMER_FREQ=57
MAX_UKMER_FREQ=135
K=31
## SDG-DBG assembly k=63
## Min coverage is important to reduce the noise
GK=63
MIN_CVG=30
NUM_BATCHES=8
ws=SDG.WorkSpace()
peds=ws.add_paired_reads_datastore('./data/strbrg_pe.prseq', 'pe')
gm=SDG.GraphMaker(ws.sdg)
gm.new_graph_from_paired_datastore(peds,K,MIN_CVG,NUM_BATCHES); #this is so the initial kmer count includes all 31-mers
print("First graph done")
kc = ws.add_kmer_counter('pek31', K)
kc.add_count('pe', peds)
print("kmers counted")
gm.new_graph_from_paired_datastore(peds,GK,MIN_CVG,NUM_BATCHES); #this is the real DBG k
print("second graph done")
ws.sdg.write_to_gfa1("./00-sdg_dbg_graph_MC%s.gfa" %MIN_CVG)
gc=SDG.GraphContigger(ws)
gc.clip_tips(2*GK-1,3)
print("Remaining nodes: %s" %(len(ws.sdg.get_all_nodeviews())))
kc.update_graph_counts()
ws.dump("./00-sdg_dbg_graph_MC%s.sdgws" %MIN_CVG)
## Check step1
passed=True
ccc=zip(kc.count_spectra('pe',present_in_graph=False),kc.count_spectra('pe',present_in_graph=True))
for i,x in list(enumerate(ccc))[MIN_UKMER_FREQ:MAX_UKMER_FREQ+1]:
if x[0]!=x[1]:
print(x[0]-x[1],"Kmers are missing at f=",i," -> ",x)
passed=False
if passed: print("Check passed!!!!")
else: print("Check FAILED!!!!")
## Make unique kmer runs
seqs = gc.contig_reduction_to_unique_kmers(MIN_UKMER_FREQ,MAX_UKMER_FREQ)
print(len(seqs),"sequences for anchoring")
with open("./pseudoLong2.fasta","w") as f:
for i, s in enumerate(seqs):
f.write(">seq_%s\n%s\n"%(i,s))
```
---
## Step 2: workspace, KmerCounters, and nanopore mapping
Nanopore reads are mapped against the anchor sequences (pseudoLong2.fasta) with a k=31 index; every unique mapping is kept.
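No code was captured for this step. Below is a minimal sketch of what it most likely does, reconstructed from the objects and files that Step 4 loads; the mapping call (`perfect_mappings`) is an assumption, and the `rgxh31` KmerCounter that Step 3's CountFilter relies on would also be built on this workspace, but its inputs are not recorded here.
```python=
import SDGpython as SDG
K=31
ws=SDG.WorkSpace()
#the anchor runs from step 1 become the nodes of this graph
ws.sdg.load_from_fasta('pseudoLong2.fasta')
lords = ws.add_long_reads_datastore('data/strbrg_nanopore.loseq','nanopore')
#recruit nanopore reads against the anchors with a k=31 index, keeping every unique perfect match
lrr=SDG.LongReadsRecruiter(ws.sdg,lords,K)
lrr.perfect_mappings(K) #assumption: this is the call that populates lrr.read_perfect_matches
lrr.dump('lords_kmerruns.mapped.data') #loaded back in step 4
```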
---
## Step 3: filtering nanopore reads by F1 pattern
```python=
from collections import Counter
def best_patterns(patterns,verbose=False):
if len(patterns)<10:
if verbose: print('less than 10 matches, giving up!')
return []
all_1s='1'*len(patterns[0])
pc=Counter(patterns)
t=len(patterns)
if pc[all_1s]>=t*.75:
if verbose: print('Too many 1s, giving up!')
return []
t-=pc[all_1s]
pc[all_1s]=0
hc=sum([c for p,c in pc.items() if p[-1]=='1']) #count patterns with hapil
if hc>t*.7:
if verbose: print('Too many patterns with hapil, using full pattern')
fpc=pc
ft=t
else:
if verbose: print('Discarding',hc,'/',t,'patterns with hapil')
fpc=Counter()
ft=0
for p,c in pc.items():
if p[-1]!='1':
fpc[p]=c
ft+=c
w=fpc.most_common(1)[0][0]
if verbose: print("most common pattern:",fpc.most_common(1),'/',ft)
if fpc[w]>.60*ft:
if w[-1]=='1': return [w]
else: return [w,w[:-1]+'1']
else:
if verbose: print('Winner not 60% of filtered matches, giving up')
return []
def filter_read(rid,lrr,cf,verbose=False):
patterns=[cf.get_pattern(ws.sdg.get_nodeview(m.node)) for m in lrr.read_perfect_matches[rid]]
bp=best_patterns(patterns,verbose)
if verbose: print("winning patterns:",bp)
fm=[]
for i,p in enumerate(patterns):
if p in bp:
fm.append(lrr.read_perfect_matches[rid][i])
return fm
cf3=SDG.CountFilter('rgxh31','sdg',100,kc.list_names()[1:],[2 for x in kc.list_names()[1:-1]]+[30]) #thresholds: 2 for every count but the last, 30 for the last one
srpm2=[filter_read(rid,lrr,cf3) for rid in range(len(lrr.read_perfect_matches))]
for i in range(len(srpm2)):
lrr.read_perfect_matches[i]=srpm2[i]
lrr.dump('lrr_after_segfilterv2.dump')
```
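To spot-check the pattern voting on a single read before committing to the bulk filter, the verbose path can be called directly (the read id below is an arbitrary example):
```python=
#inspect the pattern voting for one read; read 0 is just an arbitrary example
filter_read(0,lrr,cf3,verbose=True)
```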
---
## Step 4: multi-component thread dg partial orders
Load previous status:
```python=
import SDGpython as SDG
ws=SDG.WorkSpace()
ws.sdg.load_from_fasta('pseudoLong2.fasta')
lords = ws.add_long_reads_datastore('data/strbrg_nanopore.loseq','nanopore')
lrr=SDG.LongReadsRecruiter(ws.sdg,lords,31)
lrr.load('lords_kmerruns.mapped.data')
lrrf=SDG.LongReadsRecruiter(ws.sdg,lords,31)
lrrf.load('lrr_after_segfilterv2.dump')
```
```python=
def extend_solution(tgs, min_support=2,min_shared=10,min_new=10):
finished=False
for step in range(10000):
print("\nStarting expansion step",step)
s=sorted(tgs.sort_graph().items(), key=lambda x: x[1])
if s:
print("Current sorted size:",s[-1][1]-s[0][1])
else:
print("No sorted size")
# print("Current sorted size:",s[-1][1]-s[0][1])
added=0
for nodes in [[x[0] for x in s[:30]],[x[0] for x in s[-30:]]]:
rids=set()
for nid in nodes:
rids.update(tgs.rids_from_node(trg_nt.get_nodeview(nid)))
while rids:
to_add=0
largest_new=min_new-1
to_add_old=0
for rid in rids:
if rid in tgs.used_reads: continue
old,new=tgs.evaluate_read(rid)
if old>=min_shared and new>largest_new:
to_add=rid
largest_new=new
to_add_old=old
if to_add:
print("adding read %d with %d new and %d old nodes" % (to_add,largest_new,to_add_old))
rno=list(tgs.evaluate_read_nodeorder(to_add))
print(rno)
print("%d ordered, %d unordered and %d unused nodes" %(rno[0], rno[1], rno[2]))
if rno[1]:
print("skipping rid with unordered nodes")
rids.remove(to_add)
else:
tgs.add_read(to_add,min_support)
added+=1
break
else:
print("can't find any read to add!!!")
break
if not added:
print("can't find any read to add on both directions!!!")
finished=True
break
if finished: break
class CCSorter():
"""Takes an initial node and builds all TGSs needed to sort most long reads"""
def __init__(self,trg_nt,founding_node,min_nodes_per_tracked_read=30):
self.sorters=[]
self.trg_nt=trg_nt
self.founding_node=founding_node
self.master_tgs=SDG.TheGreedySorter(trg_nt,founding_node)
#for every read, check how many nodes it's got, and keep track of every read with more than X nodes
self.read_nodes={}
for rid,ends in self.master_tgs.read_ends.items():
rnodes=set()
nv=self.trg_nt.get_nodeview(ends[0])
while True:
try:
rnodes.add(abs(nv.node_id()))
nv=[x for x in nv.next() if x.support().id==rid][0].node()
except:
break
if len(rnodes)>=min_nodes_per_tracked_read:
self.read_nodes[rid]=rnodes
def read_status(self,rid):
used=unused=0
sortersets=[set(list(s.used_nodes)) for s in self.sorters]
for nid in self.read_nodes[rid]:
for s in sortersets:
if nid in s:
used+=1
break
else:
unused+=1
return used,unused
def run_new_sorter(self):
possible_starts=[]
for rid in self.read_nodes.keys():
used,unused=self.read_status(rid)
if used<3:
possible_starts.append((rid,used,unused))
possible_starts.sort(key=lambda x:(-x[1],x[2]),reverse=True)
if len(possible_starts)<10:
print("less than 10 possible starts, aborting")
return
self.sorters.append(SDG.TheGreedySorter(self.trg_nt,self.founding_node))
self.sorters[-1].start_from_read(possible_starts[0][0])
extend_solution(self.sorters[-1])
def read_usage(self):
possible_starts=[]
used_reads=unused_reads=0
for rid in self.read_nodes.keys():
used,unused=self.read_status(rid)
if used>10: used_reads+=1
else: unused_reads+=1
return used_reads,unused_reads
def make_all_orders(self):
old_order_count=-1
while len(self.sorters)>old_order_count:
old_order_count=len(self.sorters)
self.run_new_sorter()
```
```python=
lrrf.thread_reads(end_matches=1)
trg_nt=lrrf.dg_from_threads(False,True)
ccs100=trg_nt.get_all_connected_components(100)
multisorters=[]
for x in ccs100:
multisorters.append(CCSorter(trg_nt,list(x)[0]))
multisorters[-1].make_all_orders()
```
---
## Step 5: total order
```python=
lrr.thread_reads(end_matches=1)
trg_ntu=lrr.dg_from_threads(False,True,20)
tgsu=SDG.TheGreedySorter(trg_ntu)
rtgsu=SDG.TheGreedySorter(trg_ntu)
```
**TODO**
- Project read links across the order.
- Find unordered nodes already included in the order and pop them.
- For each read mapped to the node, walk fw and bw until hitting a node included in the order, and check the consistency (a sketch of this walk follows below).
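A minimal sketch of the fw/bw walk from the last TODO item, using the same `nv.next()` / `support().id` walking idiom used elsewhere in this note; the actual consistency check against the order's positions is left out.
```python=
def walk_to_ordered_node(reads_tg,order,nid,rid,fw=True):
    """Walk along read rid from node nid until hitting a node already in order
    (a node_id -> position dict); returns that node's id, or None if the read
    ends first. Sketch only: checking consistency with the order is not done here."""
    nv=reads_tg.get_nodeview(nid)
    if not fw: nv=nv.rc() #walk the read backwards
    while True:
        nexts=[l for l in nv.next() if l.support().id==rid]
        if not nexts: return None #ran off the read without reaching the order
        nv=nexts[0].node()
        if nv.node_id() in order or -nv.node_id() in order:
            return nv.node_id()
```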
```python=
from datetime import datetime
def write_connected(dg,filename):
dg.write_to_gfa1(filename,selected_nodes=[x.node_id() for x in dg.get_all_nodeviews(include_disconnected=False)])
def printlog(*args, **kwargs):
print(datetime.now().strftime('%H:%M:%S'),*args, **kwargs)
def local_happiness_assembler(tgs, rtgs, reads_tg, starting_rid, min_support=2, max_unhappy=0, max_disconnected_perc=.4,min_shared=10,min_new=10,max_unhappy_nodes_in_new_read=10,max_steps=10000000):
tgs.start_from_read(starting_rid,min_support)
for stepnumber in range(max_steps):
printlog("step #",stepnumber,"starting")
write_connected(tgs.dg,"step%d_a-starting.gfa"%(stepnumber))
# remove unhappy nodes from order
printlog("sorting graph")
order=tgs.sort_graph()
printlog("full order hapiness assessment")
nbh=[(x[0],x[1],SDG.assess_node_happiness(x[0],order,reads_tg)) for x in sorted(order.items(),key=lambda x:x[1])] # (nid, pos, (happy, unhappy, disconnected))
#first remove all nodes that are too disconnected: they won't get any better
printlog("removing disconnected")
unhappy_nodes=[]
removed_disconnected=removed_unhappy=0
for x in nbh:
if x[2][0]==0 or 1.0*x[2][2]/sum(x[2])>max_disconnected_perc:
#pop_node_from_all(tgs.dg,)
#print(x, "should go!")
tgs.remove_node(x[0])
removed_disconnected+=1
elif x[2][1]>max_unhappy:
unhappy_nodes.append(x)
printlog("removing unhappy")
#now remove any unhappy nodes
unhappy_nodes.sort(key=lambda x:x[2][1])
for x in unhappy_nodes:
h=SDG.assess_node_happiness(x[0],tgs.sort_graph(),reads_tg)
if h[1]>max_unhappy:
tgs.remove_node(x[0])
removed_unhappy+=1
printlog("%d disconnected and %d unhappy nodes removed from order"%(removed_disconnected,removed_unhappy))
write_connected(tgs.dg,"step%d_b-cleaned.gfa"%(stepnumber))
###TODO: THIS IS OLD CODE!
added=0
s=sorted(tgs.sort_graph().items(),key=lambda x:x[1])
for nodes in [[x[0] for x in s[:30]],[x[0] for x in s[-30:]]]:
rids=set()
for nid in nodes:
rids.update(tgs.rids_from_node(reads_tg.get_nodeview(nid)))
printlog("choosing among %d candidate reads" % len(rids))
while rids:
to_add=0
largest_new=min_new-1
to_add_old=0
for rid in rids:
if rid in tgs.used_reads: continue
old,new=tgs.evaluate_read(rid)
if old>=min_shared and new>largest_new:
to_add=rid
largest_new=new
to_add_old=old
if to_add:
printlog("evaluating read %d with %d new and %d old nodes" % (to_add,largest_new,to_add_old))
rnv=reads_tg.get_nodeview(tgs.read_ends[to_add][0])
unhappy=[]
while True:
if rnv.node_id() in tgs.sort_graph() or -rnv.node_id() in tgs.sort_graph():
h=SDG.assess_node_happiness(rnv.node_id(), tgs.sort_graph(), reads_tg)
if h[1]>max_unhappy: unhappy.append(rnv.node_id())
rnvn=[x for x in rnv.next() if x.support().id==to_add]
if not rnvn:break
rnv=rnvn[0].node()
if len(unhappy)>max_unhappy_nodes_in_new_read:
print("skipping rid with unordered nodes")
rids.remove(to_add)
else:
printlog("adding read")
tgs.add_read(to_add,min_support)
write_connected(tgs.dg,"step%d_c-read_added%d.gfa"%(stepnumber,added))
for u in unhappy:
tgs.pop_node(u,to_add)
added+=1
write_connected(tgs.dg,"step%d_c-read_added%d_popped.gfa"%(stepnumber,added))
break
else:
print("can't find any read to add!!!")
break
if added==0:
print("no read was added")
break
# find a read to add that makes the order grow and has few unhappy nodes
printlog("resorting order")
s=sorted(tgs.sort_graph().items(),key=lambda x:x[1])
printlog("order is now",s[-1][1],"bp long")
# if no read found: STOP
# remove unhappy nodes from read until all read is happy
# add read to order
```
```python=
local_happiness_assembler(tgsu,rtgsu,trg_ntu,11676965,max_steps=30)
```
TODO: before adding a read, measure its nodes' future happiness, remove future-unhappy nodes, but only use reads that have few of them.
TODO: Choose reads not by "nodes shared/new" but by growing the order in one direction.
Future happiness: a node with good connections fw/bw to the order (in the raw graph).
Keep a "nodes used per read" count for all reads in the TGS. Future happy nodes appear in reads that have a high "nodes used per read" count.
In TheGreedySorter:
- NodesUsedPerRead: Keep a uint64_t vector, one entry per read, count how many nodes of that read are included in the order.
- If a node "belongs" in the order, all the reads mapping to the node in the raw graph will have HIGH NodesUsedPerRead values.
- If a node "belongs" in a parallel order, all the reads mapping to the node will have low or null NodesUsedPerRead values.
- Nodes towards the end of the order will have some reads that span in the direction the order has not yet expanded to, so their values will always be low. We need to judge a node belonging only on reads that go towards the order (or, allow for some of the reads to have a small value, which would not hurt anyway).
To add a read to the order, pick one where all its nodes have a high belonging value.
```python=
c=Counter()
for nv in tgsu.dg.get_all_nodeviews(include_disconnected=False):
nv=trg_ntu.get_nodeview(nv.node_id())
c.update(list(set([x.support().id for x in nv.next()+nv.prev()])))
```
Now the counter has, for each read, how many nodes it shares with the order.
If a node should go in the order, all its reads should have a high shared value with the order.
Node future happiness: how many reads the node shares with other nodes in the haplotype (ish).
Read future happiness: sum of all its nodes' happiness.
You want to add reads that have a certain amount of shared nodes (e.g. 10/20) and a really high read future happiness.
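A minimal sketch of these two scores (one possible concretisation, not the final method), reusing the counter `c` from the snippet above, `tgs.thread_nodes()` as used in `score_read_to_add` below, and the all-threads graph as `reads_tg`:
```python=
def node_future_happiness(nv,c):
    #sum, over the reads touching this node, of how much each read already shares with the order
    return sum(c[rid] for rid in set(l.support().id for l in nv.next()+nv.prev()))
def read_future_happiness(tgs,reads_tg,rid,c):
    #a read's future happiness: the sum of its nodes' future happiness
    return sum(node_future_happiness(reads_tg.get_nodeview(abs(nid)),c) for nid in tgs.thread_nodes(rid))
```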
```python=
def reverse_order(dg,order):
end=max([x[1]+dg.get_nodeview(x[0]).size() for x in order.items()])
reverse_order={}
for x in order.items():
reverse_order[-x[0]]=end-x[1]-dg.get_nodeview(x[0]).size()
return reverse_order
def score_read_to_add(tgs,rid,reverse=False,):
"""gives a score to add the read fw, or bw if reversed"""
order=tgs.sort_graph()
if reverse:
order=reverse_order(trg_ntu,order)
rtn=tgs.thread_nodes(rid)
#first figure out what the relative position of the read is
fw=bw=0
for nid in rtn:
if nid in order: fw+=1
elif -nid in order: bw+=1
#print("read",rid,"shared nodes with order:",fw,"fw,",bw,"bw")
if bw>fw:
#print("reversing read")
rtn=[-x for x in rtn[::-1]]
rid=-rid
bw,fw=fw,bw
if fw<10 or bw>2:
#print("read does not have 10+ nodes FW and <3 BW, scoring None")
return None
tnodes_in_order=[x for x in enumerate(rtn) if x[1] in order]
last_order_nodes=[x[0] for x in sorted(order.items(),key=lambda x:x[1])[-30:]]
if tnodes_in_order[-1][1] not in last_order_nodes: return None
#TODO: validate the order and such?
#nodes in order, position in read of last_node_in_order, nodes after
return (len(tnodes_in_order),tnodes_in_order[-1][0]+1,len(rtn)-tnodes_in_order[-1][0])
def find_reads_to_add(tgs,reverse=False,last_nodes=30, min_hit_perc=.30, min_new=20):
rids=set()
order=sorted(tgs.sort_graph().items(),key=lambda x:x[1])
if reverse:
nodes=[x[0] for x in order[:last_nodes]]
else:
nodes=[x[0] for x in order[-last_nodes:]]
for nid in nodes:
for rid in tgs.rids_from_node(trg_ntu.get_nodeview(nid)):
rids.add(rid)
rta=[]
for rid in rids:
s=score_read_to_add(tgs,rid,reverse)
if s is None: continue
if s[0]/s[1]>=min_hit_perc and s[2]>=min_new:
rta.append((rid,s))
#print(rid,s,'%.2f'%(100.0*s[0]/s[1]))
#else: print(rid,s)
return sorted(rta,key=lambda x:-x[1][2])
from datetime import datetime
def write_connected(dg,filename):
dg.write_to_gfa1(filename,selected_nodes=[x.node_id() for x in dg.get_all_nodeviews(include_disconnected=False)])
def printlog(*args, **kwargs):
print(datetime.now().strftime('%H:%M:%S'),*args, **kwargs)
def local_happiness_assembler(tgs, rtgs, reads_tg, starting_rid,
min_support=2, max_unhappy=0, max_disconnected_perc=.4, min_shared=10, min_new=10, max_unhappy_nodes_in_new_read=10, max_steps=10000000):
tgs.start_from_read(starting_rid,min_support)
for stepnumber in range(max_steps):
printlog("step #",stepnumber,"starting")
write_connected(tgs.dg,"step%d_a-starting.gfa"%(stepnumber))
# remove unhappy nodes from order
printlog("sorting graph")
order=tgs.sort_graph()
printlog("full order hapiness assessment")
nbh=[(x[0],x[1],SDG.assess_node_happiness(x[0],order,reads_tg)) for x in sorted(order.items(),key=lambda x:x[1])] # (nid, pos, (happy, unhappy, disconnected))
#first remove all nodes that are too disconnected: they won't get any better
printlog("removing disconnected")
unhappy_nodes=[]
removed_disconnected=removed_unhappy=0
for x in nbh:
if x[2][0]==0 or 1.0*x[2][2]/sum(x[2])>max_disconnected_perc:
#pop_node_from_all(tgs.dg,)
#print(x, "should go!")
tgs.remove_node(x[0])
removed_disconnected+=1
elif x[2][1]>max_unhappy:
unhappy_nodes.append(x)
printlog("removing unhappy")
#now remove any unhappy nodes
unhappy_nodes.sort(key=lambda x:x[2][1])
for x in unhappy_nodes:
h=SDG.assess_node_happiness(x[0],tgs.sort_graph(),reads_tg)
if h[1]>max_unhappy:
tgs.remove_node(x[0])
removed_unhappy+=1
printlog("%d disconnected and %d unhappy nodes removed from order"%(removed_disconnected,removed_unhappy))
write_connected(tgs.dg,"step%d_b-cleaned.gfa"%(stepnumber))
added=0
for rids in [find_reads_to_add(tgs),find_reads_to_add(tgs,True)]:
while rids:
to_add=rids[0][0]
if to_add in tgs.used_reads:
print("skipping already-used read")
rids=rids[1:]
continue
printlog("evaluating read",rids[0])
rnv=reads_tg.get_nodeview(tgs.read_ends[to_add][0])
unhappy=[]
while True:
if rnv.node_id() in tgs.sort_graph() or -rnv.node_id() in tgs.sort_graph():
h=SDG.assess_node_happiness(rnv.node_id(), tgs.sort_graph(), reads_tg)
if h[1]>max_unhappy: unhappy.append(rnv.node_id())
rnvn=[x for x in rnv.next() if x.support().id==to_add]
if not rnvn:break
rnv=rnvn[0].node()
if len(unhappy)>max_unhappy_nodes_in_new_read:
print("skipping rid with unordered nodes")
rids=rids[1:]
else:
printlog("adding read")
tgs.add_read(to_add,min_support)
write_connected(tgs.dg,"step%d_c-read_added%d.gfa"%(stepnumber,added))
for u in unhappy:
tgs.pop_node(u,to_add)
write_connected(tgs.dg,"step%d_c-read_added%d_popped.gfa"%(stepnumber,added))
added+=1
break
if added==0:
print("no read was added")
break
# find a read to add that makes the order grow and has few unhappy nodes
printlog("resorting order")
s=sorted(tgs.sort_graph().items(),key=lambda x:x[1])
printlog("order is now",s[-1][1],"bp long")
# if no read found: STOP
# remove unhappy nodes from read until all read is happy
# add read to order
```
Strategy #1: Partial orders from Longer reads
1 - Take all reads that have more than x nodes.
2 - Filter their nodes to a curated list of "happy nodes in the read".
Use this on a single read:
```python=
# def sort_a_read(tgs,rid,reads_tg,max_unhappy=1,max_disconnected_perc=.4):
# tgs.start_from_read(rid,0)
# printlog("sorting graph")
# order=tgs.sort_graph()
# printlog("full order hapiness assessment")
# nbh=[(x[0],x[1],SDG.assess_node_happiness(x[0],order,reads_tg)) for x in sorted(order.items(),key=lambda x:x[1])] # (nid, pos, (happy, unhappy, disconnected))
# #first remove all nodes that are too disconnected: they won't get any better
# printlog("removing disconnected")
# unhappy_nodes=[]
# removed_disconnected=removed_unhappy=0
# for x in nbh:
# if x[2][0]==0 or 1.0*x[2][2]/sum(x[2])>max_disconnected_perc:
# #pop_node_from_all(tgs.dg,)
# #print(x, "should go!")
# tgs.remove_node(x[0])
# removed_disconnected+=1
# elif x[2][1]>max_unhappy:
# unhappy_nodes.append(x)
# printlog("removing unhappy")
# #now remove any unhappy nodes
# unhappy_nodes.sort(key=lambda x:x[2][1])
# for x in unhappy_nodes:
# h=SDG.assess_node_happiness(x[0],tgs.sort_graph(),reads_tg)
# if h[1]>max_unhappy:
# tgs.remove_node(x[0])
# removed_unhappy+=1
# printlog("%d disconnected and %d unhappy nodes removed from order"%(removed_disconnected,removed_unhappy))
# return tgs.sort_graph()
class HappySorter():
def __init__(self,sorter,all_threads_graph):
self.sorted_graph=SDG.DistanceGraph(all_threads_graph.sdg)
self.used_reads=set()
self.sorter=sorter
self.all_threads_graph=all_threads_graph
def thread_size(self,rid):
ends=self.sorter.get_thread_ends(rid)
e0ix=[x.support().index for x in self.all_threads_graph.get_nodeview(ends[0]).next() if x.support().id==rid][0]
e1ix=[x.support().index for x in self.all_threads_graph.get_nodeview(ends[1]).next() if x.support().id==rid][0]
#print(e0ix,e1ix)
return max([e0ix,e1ix])+1
# def sort_a_read(self,rid,max_unhappy=1,max_disconnected_perc=.4):
# self.sorter.start_from_read(rid,0)
# printlog("sorting graph")
# order=self.sorter.sort_graph()
# printlog("full order hapiness assessment")
# nbh=[(x[0],x[1],SDG.assess_node_happiness(x[0],order,self.all_threads_graph)) for x in sorted(order.items(),key=lambda x:x[1])] # (nid, pos, (happy, unhappy, disconnected))
# #first remove all nodes that are too disconnected: they won't get any better
# printlog("removing disconnected")
# unhappy_nodes=[]
# removed_disconnected=removed_unhappy=0
# for x in nbh:
# if x[2][0]==0 or 1.0*x[2][2]/sum(x[2])>max_disconnected_perc:
# #pop_node_from_all(tgs.dg,)
# #print(x, "should go!")
# self.sorter.remove_node(x[0])
# removed_disconnected+=1
# elif x[2][1]>max_unhappy:
# unhappy_nodes.append(x)
# printlog("removing unhappy")
# #now remove any unhappy nodes
# unhappy_nodes.sort(key=lambda x:x[2][1])
# for x in unhappy_nodes:
# h=SDG.assess_node_happiness(x[0],self.sorter.sort_graph(), self.all_threads_graph)
# if h[1]>max_unhappy:
# self.sorter.remove_node(x[0])
# removed_unhappy+=1
# printlog("%d disconnected and %d unhappy nodes removed from order"%(removed_disconnected,removed_unhappy))
# return self.sorter.sort_graph()
def start_from_read(self,rid):
self.sorted_graph=SDG.DistanceGraph(self.all_threads_graph.sdg)
self.used_reads=set()
self.add_read(rid)
#def add_read(self,rid):
# self.used_reads.add(rid)
# for x in sorted(self.sort_a_read(rid).items(),key=lambda x:x[1])[:-1]:
# nid=x[0]
# l=self.sorter.dg.get_nodeview(nid).next()[0]
# self.sorted_graph.add_link(-nid,l.node().node_id(),l.distance(),l.support())
def add_read(self,rid):
self.used_reads.add(rid)
hr=SDG.make_thread_happy(lrr.read_threads[abs(rid)],self.all_threads_graph)
for i in range(len(hr)-1):
self.sorted_graph.add_link(-hr[i].node,hr[i+1].node,hr[i+1].start-hr[i].end,SDG.Support(id=rid))
def find_longest_threads_per_node(self,include_used=False,top=2):
c=Counter()
for nv1 in self.sorted_graph.get_all_nodeviews(False,False):
nv=self.all_threads_graph.get_nodeview(nv1.node_id())
ml=0
mlrid=0
for l in nv.prev()+nv.next():
if include_used or l.support().id not in self.used_reads:
if len(lrr.read_threads[l.support().id])>ml:
mlrid=l.support().id
ml=len(lrr.read_threads[l.support().id])
if ml: c[mlrid]+=1
return c
def sorted_order(self):
cc=self.sorted_graph.get_connected_component( self.sorted_graph.get_all_nodeviews(include_disconnected=False)[0].node_id(), signed_nodes=True)
#print(cc)
order = sorted(SDG.sort_cc(self.sorted_graph,cc).items(),key=lambda x:x[1])
#print(order)
if len(self.sorted_graph.get_all_nodeviews(include_disconnected=False))!=len(order):
print("WARNING: it appears there's more than one component in HappySorter's sorted_graph")
return order
def pick_for_extension(self,happy_in_order=4,end_size=10,fw=True):
"""picks a read fw/bw for extension, these are the furthest-reaching reads that have at least X happy nodes among the last Y nodes of the current order"""
order=self.sorted_order()
if fw: nids=[x[0] for x in order[-end_size:]]
else: nids=[x[0] for x in order[:end_size]][::-1]
rnids=[-x for x in nids]
#print(len(nids))
#print(nids)
c=Counter()
last_link={}
for nid in nids:
for l in self.all_threads_graph.get_nodeview(nid).next():
tid=l.support().id
if tid in self.used_reads: continue
tidx=l.support().index
if tid not in last_link or last_link[tid]<tidx: last_link[tid]=tidx
c[tid]+=1
#print(c.most_common(10))
c2=Counter()
for rid,oldc in c.items():
if oldc<happy_in_order: continue
hn=[x.node for x in SDG.make_thread_happy(lrr.read_threads[abs(rid)],self.all_threads_graph)]
c2[rid]= max([ len(set(hn).intersection(set(nids))), len(set(hn).intersection(set(rnids)))])
fts=[(x,self.thread_size(x)-last_link[x]) for x in c2.keys() if c2[x]>=happy_in_order]
return sorted(fts,key=lambda x:x[1],reverse=True)
#for x in :
# print(x)
# for nv1 in self.sorted_graph.get_all_nodeviews(False,False):
# nv=self.all_threads_graph.get_nodeview(nv1.node_id())
# ml=0
# mlrid=0
def write_connected(self,filename):
self.sorted_graph.write_to_gfa1(filename,selected_nodes=[x.node_id() for x in self.sorted_graph.get_all_nodeviews(include_disconnected=False)])
def extension_cycles(self,reads_per_end=1,cycles=10,happy_in_order=4,end_size=10):
print("Running %d extension cycles with the farthest reaching %d reads"%(cycles,reads_per_end))
order=self.sorted_order()
print("starting sorted length: %dbp"%(order[-1][1]-order[0][1]))
for c in range(cycles):
print("\n\nCycle", c)
pef=self.pick_for_extension(happy_in_order=happy_in_order,end_size=end_size,fw=True)
for x in range(reads_per_end):
if len(pef)>x:
print("read #%d fw: %d"%(x,pef[x][0]))
self.add_read(pef[x][0])
peb=self.pick_for_extension(happy_in_order=happy_in_order,end_size=end_size,fw=False)
for x in range(reads_per_end):
if len(peb)>x:
print("read #%d bw: %d"%(x,peb[x][0]))
self.add_read(peb[x][0])
order=self.sorted_order()
print("Sorted length: %dbp"%(order[-1][1]-order[0][1]))
    def pick_for_thickening(self):
        pass #TODO: placeholder, not implemented yet
    def remove_read(self,rid):
        pass #TODO: placeholder, not implemented yet
```
```
Read nodes (rtn): [A, B, C, D, F, G, I, K]
Order nodes: [P, Q, R, S, T, A, B, D, E, F]
tnodes_in_order = [(0, A), (1, B), (3, D), (4, F)]
return (4, 5, 4)  # (nodes in order, position after the last in-order node, nodes from it to the end)
```
```python=
hs=HappySorter(tgsu,trg_ntu);hs.start_from_read(491667)
hs.extension_cycles(2,6)
hs.add_read(3785826)
hs.add_read(13253989)
hs.write_connected("hsbefore.gfa")
hs.add_read(14272921)
hs.write_connected("hsafter.gfa")
```
```python=
#lrr2 will get modified and will have only threads with their happy nodes
lrr2.simple_thread_reads()
SDG.make_all_threads_happy(lrr2,trg_ntu)
htrg=lrr.dg_from_threads(False,True,10)#this is largely irrelevant for the happysorter, but useful
#htgs below is presumably a TheGreedySorter; its construction is not captured in this note
hs=HappySorter(htgs,trg_ntu);hs.start_from_read(11676965)
hs.extension_cycles(10,50,4,20)
```
```python=
def all_nodes_and_distances_across_rid(nv,rid,max_distance=10000000000):
nodes=[]
d=-nv.size()
while True:
try:
lf=[l for l in nv.next() if l.support().id==rid][0]
except:
break
d+=nv.size()+lf.distance()
if d>max_distance: break
nodes.append((d,lf.node().node_id()))
nv=lf.node()
return nodes
def get_all_nodes_up_to_dist(fnv,dist):
all_nodes=Counter()
all_rids=set()
for x in fnv.next()+fnv.prev():
all_rids.add(x.support().id)
for rid in all_rids:
for ndist,nid in all_nodes_and_distances_across_rid(fnv,rid,dist):
all_nodes[nid]+=1
for ndist,nid in all_nodes_and_distances_across_rid(fnv.rc(),rid,dist):
all_nodes[-nid]+=1
return all_nodes
def nodes_by_dist_and_support(fnv,dist,links):
nodes=[]
for nid,count in get_all_nodes_up_to_dist(fnv,dist).items():
if count>=links: nodes.append(nid)
return nodes
```
```python=
def rtg_to_tube(rtg):
tube={"nodes":[],"tracks":[]}
tids=set()
for xx in sorted(SDG.sort_cc( rtg,rtg.get_connected_component(rtg.get_all_nodeviews(include_disconnected=False)[0].node_id(),signed_nodes=True) ).items(),key=lambda x:x[1]):
x=rtg.get_nodeview(abs(xx[0]))
tube["nodes"].append( dict(name = "%d"%x.node_id(), width = 10, seq = "seq%d"%x.node_id() ) )
for l in x.next()+x.prev():
tids.add(l.support().id)
for tnum,tid in enumerate(tids):
tube["tracks"].append(dict(
id="%d"%tnum,
name="%d"%tid,
indexOfFirstBase=1,
sequence=[str(abs(np.node)) for np in rtg.get_thread(tid)],
freq=1
))
return tube
```
# Cleaning a raw graph
## Removing nodes from threads where they OBVIOUSLY do not belong
```python=
from collections import Counter
def score_threads(rtg,nid,support=1):
n1tn=rtg.node_thread_neighbours(nid)
for tid in rtg.node_threads(nid):
c=Counter([n1tn[abs(np.node)]>support for np in rtg.get_thread(tid)])
c[True]-=1
print("%10d %3d %3d %.2f%%"%(tid,c[True],c[False],c[True]*100.0/(c[True]+c[False])))
def clean_threads(rtg,nid,support=1):
n1tn=rtg.node_thread_neighbours(nid)
to_pop=[]
for tid in rtg.node_threads(nid):
c=Counter([n1tn[abs(np.node)]>support for np in rtg.get_thread(tid)])
c[True]-=1
if c[True]<3: to_pop.append(tid)
for tid in to_pop:
rtg.pop_node(nid,tid)
return len(to_pop)
```
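A hypothetical bulk application of `clean_threads`, assuming `rtg` is the full ReadThreadsGraph used in the blocks below (this mutates the graph in place):
```python=
#pop every node from the threads where it obviously does not belong
total_pops=0
for nv in rtg.get_all_nodeviews(include_disconnected=False):
    total_pops+=clean_threads(rtg,nv.node_id())
print(total_pops,"node pops applied")
```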
# from here on
```python=
def sort_rtg(rtg):
cc=rtg.get_connected_component( rtg.get_all_nodeviews(include_disconnected=False)[0].node_id(), signed_nodes=True)
#print(cc)
order = sorted(SDG.sort_cc(rtg,cc).items(),key=lambda x:x[1])
#print(order)
if len(order)==0: print("WARNING: graph can't be sorted")
elif len(rtg.get_all_nodeviews(include_disconnected=False))!=len(order):
print("WARNING: there's more than one component the graph")
return []
return order
def thread_internal_linkage(rtg,tid):
nids=[x.node for x in rtg.get_thread(tid)]
l=[]
for n1 in nids:
l.append([])
for n2 in nids:
l[-1].append(len(set(rtg.node_threads(n1))&set(rtg.node_threads(n2))))
return l
def thread_linkage_support(rtg,tid):
t=rtg.get_thread(tid)
ot={}
for i in range(len(t)):
for nt in rtg.node_threads(t[i].node):
if nt==tid: continue
if nt not in ot: ot[nt]=[]
ot[nt].append(i)
covd=[0 for x in t]
for x in ot.values():
if len(x)>1:
covd[x[0]+1]+=1
covd[x[-1]]-=1
cc=0
tls=[]
for x in covd:
cc+=x
tls.append(cc)
return tls
def is_chimeric(rtg,tid,min_support=1):
if [x for x in thread_linkage_support(rtg,tid)[1:-1] if x<min_support]:
return True
return False
def add_threads_from_node(rtgdest,rtgsource,node_id, discard_chimeric=True):
print("adding threads from node %d"%node_id)
tids=rtgsource.node_threads(node_id)
if not tids:
print("node has no threads in source rtg")
return
for tid in tids:
if rtgdest.get_thread(tid):
print(" x skipping thread %d already present in dest rtg"%tid)
elif discard_chimeric and is_chimeric(rtgsource,tid):
print(" x skipping thread %d flagged as chimeric"%tid)
else:
print(" v adding thread %d"%tid)
rtgdest.add_thread(tid, rtgsource.get_thread(tid), False, 2)
```
```python=
rpl=rtg.clean_repeat_nodes_popping_list()
len(rpl) # --> 7318860
rtg.apply_popping_list(rpl)
pl=rtg.clean_all_nodes_popping_list()
len(pl) # --> 14176414 (it has to be more!)
rtg.apply_popping_list(pl)
rtg.dump("rtg_rep_and_clean.dump")
lrtg=SDG.ReadThreadsGraph(ws.sdg)
add_threads_from_node(lrtg,rtg,1)
sort_rtg(lrtg) #-> check orders for other nodes ordered here.
```
```python=
from collections import Counter
class RTGSorter():
def __init__(self,all_rtg,safe_node_cv=5,thread_anchor_nodes=5,last_safe_nodes=10):
self.safe_node_cv=safe_node_cv
self.thread_anchor_nodes=thread_anchor_nodes
self.last_safe_nodes=last_safe_nodes
self.all_rtg=all_rtg
self.rtg=SDG.ReadThreadsGraph(self.all_rtg.sdg)
self.used_threads=set()
def find_safe_nodes(self):
safe_nodes=[nv.node_id() for nv in self.rtg.get_all_nodeviews(include_disconnected=False)
if len(self.rtg.node_threads(nv.node_id()))>=self.safe_node_cv]
return safe_nodes
def find_fw_extension_threads(self,rev=False,tan=None,last_safe_nodes=None):
if last_safe_nodes is None: last_safe_nodes=self.last_safe_nodes
if tan is None: tan=self.thread_anchor_nodes
sn=self.find_safe_nodes()
safesort=self.sort_safe_rtg()
#stupidly re-make this into a dict
ssd={}
if rev:
lsn=[x[0] for x in safesort[:last_safe_nodes]]
else:
lsn=[x[0] for x in safesort[-last_safe_nodes:]]
for x in safesort:
ssd[x[0]]=x[1]
c=Counter()
for x in sn:
c.update([x for x in self.all_rtg.node_threads(x) if x not in self.used_threads])
ct=[x[0] for x in c.items() if x[1]>=tan]
st=[]
for tid in ct:
thread=[x.node for x in self.all_rtg.get_thread(tid)]
#print("\nanalysing candidate thread",tid," length=",len(thread))
if rev: thread=[-x for x in thread[::-1]]
f=[]
b=[]
for n in enumerate(thread):
if n[1] in ssd:
f.append(n)
if -n[1] in ssd:
b.append(n)
#print("B:",b)
#print("F:",f)
if len(f)==0:
st.append((tid,b[0][0], len([1 for x in thread if len(self.rtg.node_threads(x))==tan-1]), len([1 for x in thread if x in lsn or -x in lsn])))
if len(b)==0:
st.append((tid,len(thread)-f[-1][0]+1, len([1 for x in thread if len(self.rtg.node_threads(x))==tan-1]), len([1 for x in thread if x in lsn or -x in lsn])))
return sorted(st,key=lambda x:(-x[3],-x[2],-x[1]))
def thread_linkage_support(self,rtg,tid):
t=rtg.get_thread(tid)
ot={}
for i in range(len(t)):
for nt in rtg.node_threads(t[i].node):
if nt==tid: continue
if nt not in ot: ot[nt]=[]
ot[nt].append(i)
covd=[0 for x in t]
for x in ot.values():
if len(x)>1:
covd[x[0]+1]+=1
covd[x[-1]]-=1
cc=0
tls=[]
for x in covd:
cc+=x
tls.append(cc)
return tls
def is_chimeric(self,tid,min_support=1):
if [x for x in self.thread_linkage_support(self.all_rtg,tid)[1:-1] if x<min_support]:
return True
return False
def add_thread(self,tid,discard_chimeric=True):
if tid in self.used_threads or self.rtg.get_thread(tid):
print(" x skipping thread %d already present in dest rtg"%tid)
return False
elif discard_chimeric and self.is_chimeric(tid):
print(" x skipping thread %d flagged as chimeric"%tid)
return False
else:
print(" v adding thread %d"%tid)
self.rtg.add_thread(tid, self.all_rtg.get_thread(tid), False, 2)
self.used_threads.add(tid)
return True
def remove_thread(self,tid):
self.used_threads.remove(tid)
self.rtg.remove_thread(tid)
def sort_rtg(self,rtg=None):
if rtg is None: rtg=self.rtg
all_connected=rtg.get_all_nodeviews(include_disconnected=False)
cc=rtg.get_connected_component( all_connected[0].node_id(), signed_nodes=True)
order = sorted(SDG.sort_cc(rtg,cc).items(),key=lambda x:x[1])
if len(order)==0: print("WARNING: graph can't be sorted")
elif len(all_connected)!=len(order):
print("WARNING: there's more than one component the graph")
return []
return order
def sort_safe_rtg(self,safe_cv=None):
if safe_cv is None: safe_cv=self.safe_node_cv
srtg=SDG.ReadThreadsGraph(self.rtg.sdg)
for tid in self.used_threads:
srtg.add_thread(tid, self.rtg.get_thread(tid), False, 2)
to_disconnect=[nv.node_id() for nv in srtg.get_all_nodeviews(include_disconnected=False) if len(srtg.node_threads(nv.node_id()))<safe_cv]
for nid in to_disconnect:
srtg.pop_node_from_all(nid)
return self.sort_rtg(srtg)
def auto_extend(self,steps=10000):
chimeric_reads=set()
for step in range(steps):
print("\nAuto extension step",step)
ssort=self.sort_safe_rtg()
print("Starting with a safe graph from %d to %d, total distance %d bp" % (ssort[0][0],ssort[-1][0],ssort[-1][1]-ssort[0][1]))
extended=0
for fc in [ [x[0] for x in self.find_fw_extension_threads() if x not in chimeric_reads], [x[0] for x in self.find_fw_extension_threads(rev=True) if x not in chimeric_reads]]:
for tid in fc:
if tid in chimeric_reads: continue
if not self.add_thread(tid):
chimeric_reads.add(tid)
continue
ssort=self.sort_safe_rtg()
if ssort==[]:
print("last thread ruined safe order! marking as chimeric and backtracking")
chimeric_reads.add(tid)
self.remove_thread(tid)
continue
print("Extended safe graph from %d to %d, total distance %d bp" % (ssort[0][0],ssort[-1][0],ssort[-1][1]-ssort[0][1]))
extended+=1
break
            if extended==0:
                print("can't find a thread to add on any end, quitting")
                break
```
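No invocation of `RTGSorter` is recorded here; a hypothetical seeding-and-extension run, assuming the seed node has at least `safe_node_cv` threads so the initial safe sort is non-empty (node 1 mirrors the `add_threads_from_node(lrtg,rtg,1)` example above):
```python=
rtgs=RTGSorter(rtg,safe_node_cv=5)
for tid in rtg.node_threads(1): #seed with every thread through node 1 (hypothetical seed)
    rtgs.add_thread(tid)
rtgs.auto_extend(steps=100)
safe_order=rtgs.sort_safe_rtg() #(node, position) pairs of the safe backbone
```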
```python=
from collections import Counter
class RTGThreadsCluster:
def __init__(self,all_rtg):
self.all_rtg=all_rtg
self.rtg=SDG.ReadThreadsGraph(self.all_rtg.sdg)
self.threads=set()
self.used_threads=set()
self.chimeric_threads=set()
def thread_linkage_support(self,rtg,tid):
t=rtg.get_thread(tid)
ot={}
for i in range(len(t)):
for nt in rtg.node_threads(t[i].node):
if nt==tid: continue
if nt not in ot: ot[nt]=[]
ot[nt].append(i)
covd=[0 for x in t]
for x in ot.values():
if len(x)>1:
covd[x[0]+1]+=1
covd[x[-1]]-=1
cc=0
tls=[]
for x in covd:
cc+=x
tls.append(cc)
return tls
def is_chimeric(self,tid,min_support=1):
if [x for x in self.thread_linkage_support(self.all_rtg,tid)[1:-1] if x<min_support]:
return True
return False
def add_thread(self,tid,discard_chimeric=True):
if tid in self.used_threads or self.rtg.get_thread(tid):
print(" x skipping thread %d already present in dest rtg"%tid)
return False
elif discard_chimeric and self.is_chimeric(tid):
print(" x skipping thread %d flagged as chimeric"%tid)
self.chimeric_threads.add(tid)
return False
else:
print(" v adding thread %d"%tid)
self.rtg.add_thread(tid, self.all_rtg.get_thread(tid), False, 2)
self.used_threads.add(tid)
return True
def remove_thread(self,tid):
self.used_threads.remove(tid)
self.rtg.remove_thread(tid)
def find_threads_to_add(self, safe_cvg=3, min_nodes=5, max_span=20):
print("Collecting safe nodes and candidate threads...")
tc=Counter()
nc=Counter()
for nv in self.rtg.get_all_nodeviews(include_disconnected=False):
nt=self.rtg.node_threads( nv.node_id() )
nc[nv.node_id()]=len(nt)
if len(nt)<safe_cvg: continue
for tid in self.all_rtg.node_threads( nv.node_id() ):
if tid in nt or tid in self.chimeric_threads: continue
tc[tid]+=1
print("Checking threads to add")
tcc=0
to_add=[]
for tid,c in tc.items():
if c<min_nodes: continue
tcc+=1
tpa=[x[0] for x in enumerate(self.all_rtg.get_thread(tid)) if nc[x[1].node]>=safe_cvg]
for i in range(len(tpa)-min_nodes):
if tpa[i+min_nodes]-tpa[i]<=max_span:
to_add.append(tid)
break
print("There was",tcc,"candidates")
print("There was",len(to_add),"candidates with",min_nodes,"in",max_span,"consecutive nodes included")
return to_add
```
```python=
from statistics import median
def make_summary_connection_graph(mdg,links_to_connect=3):
"""returns a new dg with only one connection per node end pair (i.e. mdg to dg)"""
conn={}
dg=SDG.DistanceGraph(mdg.sdg)
for nv in mdg.get_all_nodeviews(include_disconnected=False,both_directions=True):
for ln in nv.next():
ck=(-nv.node_id(),ln.node().node_id())
if ck[0]<=ck[1]:
if ck not in conn: conn[ck]=[]
conn[ck].append(ln.distance())
for c,d in conn.items():
if len(d)>=links_to_connect:
dg.add_link(c[0],c[1],int(median(d)))
return dg
```
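A hypothetical usage, collapsing a HappySorter's multi-link `sorted_graph` into a one-link-per-pair summary and writing it out for inspection:
```python=
summary_dg=make_summary_connection_graph(hs.sorted_graph,links_to_connect=3)
summary_dg.write_to_gfa1("summary_connections.gfa",selected_nodes=[x.node_id() for x in summary_dg.get_all_nodeviews(include_disconnected=False)])
```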
Check if a node's threads can be clustered into independent haplotypes (i.e. a false anchor!)
```python=
#use this on every node: if the threads cluster into many complicated clusters... pop this shit!
def cluster_threads_in_node(rtg,nid,min_shared=4):
tnodes={}
for tid in rtg.node_threads(nid):
tnodes[tid]=set([abs(x.node) for x in rtg.get_thread(tid) if abs(x.node)!=abs(nid)])
tconns={}
for tid1,nodes1 in tnodes.items():
tconns[tid1]=[]
for tid2,nodes2 in tnodes.items():
if len(nodes1&nodes2)<min_shared: continue
tconns[tid1].append(tid2)
used_tids=set()
clusters=[]
for tid,conns in tconns.items():
if tid in used_tids: continue
clusters.append(set([tid]))
used_tids.add(tid)
to_explore_tids=conns
while to_explore_tids:
ntet=[]
for tid2 in to_explore_tids:
if tid2 in used_tids: continue
clusters[-1].add(tid2)
used_tids.add(tid2)
for tid3 in tconns[tid2]:
if tid3 not in used_tids:
to_explore_tids.append(tid3)
to_explore_tids=ntet
return clusters
```
# Computing multi-clustering nodes fast
- Create all sets of nodes for all threads
- For every node:
- Cluster threads by node sets, ordered by size
- Compare size of first and second clusters:
- one significant cluster: remove node from threads not in the significant cluster.
- two or more significant clusters: remove node from all threads.
```python=
def threads_to_nodesets(rtg):
nodesets={}
for nv in rtg.get_all_nodeviews(include_disconnected=False):
nid=nv.node_id()
for tid in rtg.node_threads(nid):
if tid not in nodesets:
nodesets[tid]=set()
nodesets[tid].add(nid)
return nodesets
def cluster_threads_in_node(rtg,tnodes,nid,min_shared=4):
tconns={}
ntids=rtg.node_threads(nid)
for tid1 in ntids:
nodes1=tnodes[tid1]
tconns[tid1]=[]
for tid2 in ntids:
nodes2=tnodes[tid2]
if len(nodes1&nodes2)<min_shared: continue
tconns[tid1].append(tid2)
used_tids=set()
clusters=[]
for tid,conns in tconns.items():
if tid in used_tids: continue
clusters.append(set([tid]))
used_tids.add(tid)
to_explore_tids=conns
while to_explore_tids:
ntet=[]
for tid2 in to_explore_tids:
if tid2 in used_tids: continue
clusters[-1].add(tid2)
used_tids.add(tid2)
for tid3 in tconns[tid2]:
if tid3 not in used_tids:
to_explore_tids.append(tid3)
to_explore_tids=ntet
return clusters
```
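The decision step from the list above (compare the two largest clusters and act accordingly) is not implemented in the block above. A minimal sketch of it, reusing `cluster_threads_in_node` and `rtg.pop_node()` as used earlier; the `dominance` ratio is a hypothetical parameter:
```python=
def pop_multi_cluster_nodes(rtg,tnodes,nid,min_shared=4,dominance=4):
    #dominance: how many times larger the biggest cluster must be than the runner-up
    #to count as "one significant cluster" (hypothetical knob)
    clusters=sorted(cluster_threads_in_node(rtg,tnodes,nid,min_shared),key=len,reverse=True)
    if len(clusters)<2: return 0 #zero or one cluster: nothing to decide
    if len(clusters[0])>=dominance*len(clusters[1]):
        to_pop=[tid for cl in clusters[1:] for tid in cl] #keep the node only in the dominant cluster
    else:
        to_pop=[tid for cl in clusters for tid in cl] #ambiguous node: pop it from every thread
    for tid in to_pop:
        rtg.pop_node(nid,tid)
    return len(to_pop)
```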
```python=
rtg #this graph has all the threads
single_node_rtg=SDG.ReadThreadsGraph(ws.sdg)
for tid in rtg.node_threads(nid): #nid: the node being inspected
single_node_rtg.add_thread(tid, rtg.get_thread(tid))
single_node_rtg.write_to_gfa1("single_node.gfa",selected_nodes=[x.node_id() for x in single_node_rtg.get_all_nodeviews(include_disconnected=False)])
```
```python=
def sift_candidates(
```