# Pink pigeon assembly
## Graph construction
```python
import SDGpython as SDG
K=63
MIN_CVG=2
NUM_BATCHES=1
ws=SDG.WorkSpace()
peds=ws.add_paired_reads_datastore('prds.prseq')
SDG.GraphMaker(ws.sdg).new_graph_from_paired_datastore(peds,K,MIN_CVG,NUM_BATCHES)
ws.dump('pp_dbg63c2.sdgws')
```
## Graph improvement (cleanup, tangles, strider)
First a light tip clipping, as it removes more than 3/4 of the nodes (note graph is reloaded):
```python
import SDGpython as SDG
ws=SDG.WorkSpace('pp_dbg63c2.sdgws')
print(ws.ls())
c=SDG.GraphContigger(ws)
c.clip_tips(125,3)
ws.sdg.write_to_gfa1('pp_dbg63c2_tc125x3.gfa')
ws.sdg.load_from_gfa('pp_dbg63c2_tc125x3.gfa')
ws.dump('pp_dbg63c2_tc125x3.sdgws')
```
Now map the reads, save this as starting point.
```python
import SDGpython as SDG
ws=SDG.WorkSpace('pp_dbg63c2_tc125x3.sdgws')
peds=ws.get_paired_reads_datastore('pe_reads')
peds.mapper.path_reads()
peds.mapper.dump_readpaths('pp_dbg63c2_tc125x3_pepaths.dump')
```
Now run the simple structures simplification (tips, error bubbles and canonical repeats)
```python
import SDGpython as SDG
from graphcleaning import *
ws=SDG.WorkSpace('pp_dbg63c2_tc125x3.sdgws')
peds=ws.get_paired_reads_datastore('pe_reads')
peds.mapper.load_readpaths('pp_dbg63c2_tc125x3_pepaths.dump')
for ii in range(3):
print('\n\nRound',ii,'starting...')
simple_structures_stats(ws.sdg)
if ii!=0: peds.mapper.path_reads()
clip_all_tips(ws,peds,apply=True)
ws.sdg.write_to_gfa1('round%d_after_clip.gfa'%(ii))
simple_structures_stats(ws.sdg)
peds.mapper.path_reads()
solve_all_canonical(ws,peds,size=300,apply=True)
ws.sdg.write_to_gfa1('round%d_after_canonicals.gfa'%(ii))
simple_structures_stats(ws.sdg)
```
<!--```python
from collections import Counter
def is_canonical_repeat(nv):
if len(nv.prev())==len(nv.next())>1:
for pl in nv.prev():
if len(pl.node().next())>1: return False
for nl in nv.next():
if len(nl.node().prev())>1: return False
return True
return False
def is_bubble_side(nv):
if len(nv.parallels())==len(nv.prev())==len(nv.next())==1 and len(nv.prev()[0].node().next())==len(nv.next()[0].node().prev())==2: return True
return False
def is_tip(nv):
if (len(nv.prev())==0 and len(nv.next())>0) or (len(nv.next())==0 and len(nv.prev())>0): return True
return False
def simple_structures_stats(dg):
total=canrep=bubsides=tips=0
for nv in dg.get_all_nodeviews():
total+=1
if is_canonical_repeat(nv): canrep+=1
if is_bubble_side(nv): bubsides+=1
if is_tip(nv): tips+=1
print("%d tips, %d canonical repeats and %d bubble sides out of %d nodes"%(tips,canrep,bubsides,total))
def solve_canonical_repeat(ge,nv,peds,min_support=5,max_noise=10,snr=10,verbose=False):
if verbose: print("\nSolving ",nv,nv.size(),"bp")
out_nids=[x.node().node_id() for x in nv.next()]
sols=[]
for pnv in [x.node() for x in nv.prev()]:
c=Counter()
for p in peds.mapper.all_paths_fw(pnv.node_id(),False):
if p[0]!=nv.node_id():
c[p[0]]+=1
elif len(p)>1:
c[p[1]]+=1
if verbose: print(c.most_common())
t=sum(c.values())
if t<min_support:
if verbose: print("FW too few paths!")
return False
(winner,votes)=c.most_common(1)[0]
if votes<min_support:
if verbose: print("FW winner poorly supported!")
return False
noise=t-votes
if winner in out_nids and noise*snr<=votes and noise<max_noise:
sols.append((-pnv.node_id(),winner))
else:
if verbose: print("FW Not a clear winner!")
return False
if verbose: print(sols)
if len(sols)<len(nv.prev()) or len(sols)>len(set([x[1] for x in sols])): return False
in_nids=[-x.node().node_id() for x in nv.prev()]
sols2=[]
for nnv in [x.node() for x in nv.next()]:
c=Counter()
for p in peds.mapper.all_paths_fw(-nnv.node_id(),False):
if p[0]!=-nv.node_id():
c[p[0]]+=1
elif len(p)>1:
c[p[1]]+=1
t=sum(c.values())
if t<min_support:
if verbose: print("BW too few paths!")
return False
(winner,votes)=c.most_common(1)[0]
if votes<min_support:
if verbose: print("BW winner poorly supported!")
return False
noise=t-votes
if winner in in_nids and noise*snr<=votes and noise<max_noise:
sols2.append((winner,nnv.node_id()))
else:
if verbose: print("BW Not a clear winner!")
return False
if verbose: print(sols2)
sols2.sort()
sols.sort()
if sols!=sols2:
if verbose: print("FW and BW solutions not the same")
return False
ge.queue_node_expansion(nv.node_id(),sols)
#print(sols)
return True
def clip_tip(ge,nv,peds,min_support=5,max_noise=10,snr=3):
if len(nv.prev())==0: nv=nv.rc()
if len(nv.prev())!=1 or len(nv.next())!=0: return False
ayes=nays=0
for p in peds.mapper.all_paths_fw(nv.prev()[0].node().node_id(),False):
if p[0]==nv.node_id():
ayes+=1
else:
nays+=1
if ayes<=max_noise and ayes*snr<=nays and nays>=min_support:
ge.queue_node_deletion(abs(nv.node_id()))
return True
else:
return False
def pop_error_bubble(ge,nv1,nv2,peds,min_support=5,max_noise=10,snr=10):
if not is_bubble_side(nv1): return False
if nv1.parallels()[0]!=nv2 or abs(nv1.node_id())>abs(nv2.node_id()): return False
v1=v2=vo=0
for p in peds.mapper.all_paths_fw(nv1.prev()[0].node().node_id(),False):
if p[0]==nv1.node_id(): v1+=1
elif p[0]==nv2.node_id(): v2+=1
else: vo+=1
for p in peds.mapper.all_paths_fw(-nv1.next()[0].node().node_id(),False):
if p[0]==-nv1.node_id(): v1+=1
elif p[0]==-nv2.node_id(): v2+=1
else: vo+=1
if v1>min_support and v2+vo<max_noise and v1>=snr*(v2+vo):
ge.queue_node_deletion(nv2.node_id())
return True
if v2>min_support and v1+vo<max_noise and v2>=snr*(v1+vo):
ge.queue_node_deletion(nv1.node_id())
return True
return False
def solve_all_canonical(peds,size=1000,apply=False):
ge=SDG.GraphEditor(ws)
total=solved=0
for nv in ws.sdg.get_all_nodeviews():
if nv.size()<=size and is_canonical_repeat(nv):
total+=1
if solve_canonical_repeat(ge,nv,peds):
solved+=1
print("%d / %d canonical repeats solved" % (solved,total))
if apply:
ge.apply_all()
ws.sdg.join_all_unitigs()
def clip_all_tips(peds,size=300,apply=False):
ge=SDG.GraphEditor(ws)
total=solved=0
for nv in ws.sdg.get_all_nodeviews():
if nv.size()<=size and is_tip(nv):
total+=1
if clip_tip(ge,nv,peds):
solved+=1
print("%d / %d tips clipped as errors" % (solved,total))
if apply:
ge.apply_all()
ws.sdg.join_all_unitigs()
def pop_all_error_bubbles(peds,size=1000,apply=False):
ge=SDG.GraphEditor(ws)
total=solved=0
for nv in ws.sdg.get_all_nodeviews():
if nv.size()<=size and is_bubble_side(nv) and abs(nv.node_id())<abs(nv.parallels()[0].node_id()):
total+=1
if pop_error_bubble(ge,nv,nv.parallels()[0],peds):
solved+=1
print("%d / %d bubbles popped as error" % (solved,total))
if apply:
ge.apply_all()
ws.sdg.join_all_unitigs()
import SDGpython as SDG
ws=SDG.WorkSpace('pp_dbg63c2.sdgws')
peds=ws.get_paired_reads_datastore('pe_reads')
peds.mapper.load_readpaths('pp_dbg63c2_pepaths.dump')
for ii in range(3):
print('\n\nRound',ii,'starting...')
if STATS: simple_structures_stats(ws.sdg)
for i in range(1):
if ii!=0 or i!=0: peds.mapper.path_reads()
clip_all_tips(peds,apply=True)
if STATS: simple_structures_stats(ws.sdg)
ws.sdg.write_to_gfa1('round%d_after_clip_%d.gfa'%(ii,i))
for i in range(1):
peds.mapper.path_reads()
solve_all_canonical(peds,size=230,apply=True)
if STATS: simple_structures_stats(ws.sdg)
ws.sdg.write_to_gfa1('round%d_after_canonical_%d.gfa'%(ii,i))
ws.dump('pp_dbg63c2_simple.sdgws')
```
-->
## Heterozygosity removal (for clean bubbles anyway)
## Aligning the 10x genomes
# OLD STUFF
## SDG-dbg and bubble popping (by Cam)
There seems to be a lot of leftover bubbles in the graph, so we need to pop them. Maybe it could be a good idea to phase them first, to make longer easier-to-pop bubbles.
## Tangle solving with PE reads (by bj)
```python
#load Cam's workspace
import SDGpython as SDG
ws = SDG.WorkSpace('/ei/workarea/Research-Groups/RG-Bernardo-Clavijo/Cam_PP_SCAFF/SDG_DBG/Jul_final_ass/3bubblepop_LRR/pp_k63_bubble_pop_lrmap.sdgws')
peds = ws.get_paired_reads_datastore('pe_reads')
peds.mapper.load_readpaths('/ei/workarea/Research-Groups/RG-Bernardo-Clavijo/Cam_PP_SCAFF/SDG_DBG/Jul_final_ass/3bubblepop_LRR/2_3rdbubble_popped_join_readpaths.data')
kc = ws.get_kmer_counter('kmers-pe-31')
kc.set_kci_peak(46)
kc.load_cache('/ei/workarea/Research-Groups/RG-Bernardo-Clavijo/Cam_PP_SCAFF/SDG_DBG/Jul_final_ass/3bubblepop_LRR/2_3rdbubble_pop_kci.data')
#defining the tangle functions
from collections import Counter
def solve_bubble(t,s,ge):
#This checks if a bubble can be due to a sequencing error and queues a node deletion on the error node
fapath=s.stride_out_in_order(t.frontiers[0].rc().node_id()).nodes[:3]
fbpath=[-x for x in s.stride_out_in_order(t.frontiers[1].rc().node_id()).nodes[:3][::-1]]
#print(fapath,fbpath)
if len(fapath)<2: fapath=fbpath
if len(fbpath)<2: fbpath=fapath
if len(fapath)<2: return False
if fapath==fbpath and abs(t.internals[0].node_id())==abs(fapath[1]):
return ge.queue_node_deletion(t.internals[1].node_id())
if fapath==fbpath and abs(t.internals[1].node_id())==abs(fapath[1]):
return ge.queue_node_deletion(t.internals[0].node_id())
def solve_repeat(t,s,ge):
#solves a single-node repeat
rnid=t.internals[0].node_id()
ins=[x.node() for x in t.internals[0].prev()]
inids=[x.node_id() for x in ins]
outs=[x.node() for x in t.internals[0].next()]
onids=[x.node_id() for x in outs]
solutions=[]
if len(ins)!=len(outs): return False
for inv in ins:
fpath=s.stride_out_in_order(inv.node_id()).nodes[:3]
if len(fpath)==3 and fpath[1]==rnid and fpath[2] in onids:
bpath=[-x for x in s.stride_out_in_order(-fpath[2]).nodes[:3][::-1]]
if bpath==fpath: solutions.append(fpath)
if len(solutions)!=len(ins): return False
ge.queue_node_expansion(rnid,[[-x[0],x[2]] for x in solutions])
return True
def solve_tip(t,s,ge):
#just check that each frontier connects to the other one through reads
f0s=s.stride_out_in_order(-t.frontiers[0].node_id()).nodes
f1s=s.stride_out_in_order(-t.frontiers[1].node_id()).nodes
if len(f0s)>1 and f0s[1]==t.frontiers[1].node_id() and len(f1s)>1 and f1s[1]==t.frontiers[0].node_id():
ge.queue_node_deletion(t.internals[0].node_id())
return True
return False
def end_to_end_solution(p,fnids):
for i in range(len(p)):
if -p[i] in fnids:
for j in range(i+1,len(p)):
if p[j] in fnids:
if abs(p[i])<abs(p[j]): sol=p[i:j+1]
else: sol=[-x for x in p[i:j+1][::-1]]
#go through sol, if there's a single-option missing node, add it
csol=[sol[0]]
for n in sol[1:]:
if n in ws.sdg.fw_reached_nodes(csol[-1],1):
csol.append(n)
elif n in ws.sdg.fw_reached_nodes(csol[-1],2):
a=[]
for skipped in ws.sdg.fw_reached_nodes(csol[-1],1):
if n in ws.sdg.fw_reached_nodes(skipped,1):
a.append(skipped)
if len(a)==1:
csol.append(a[0])
csol.append(n)
else:
return sol
else:
return sol
return csol
return []
return []
def solve_unclassified(t,s,ge):
#first stride from all frontiers:
fs={}
fnids=[x.node_id() for x in t.frontiers]
sols={x:[] for x in fnids}
for f in t.frontiers:
fs[-f.node_id()]=s.stride_out_in_order(-f.node_id()).nodes
#for x in fs: print(x,fs[x])
ns={}
for nv in t.internals:
ns[nv.node_id()]=[-x for x in s.stride_out_in_order(-nv.node_id()).nodes[:1:-1]]+s.stride_out_in_order(nv.node_id()).nodes
#for x in ns: print(x,ns[x])
sols=[]
for x in list(fs.values())+list(ns.values()):
sol=end_to_end_solution(x,fnids)
if sol and sol not in sols:
sols.append(sol)
#for x in sols: print(x)
if len(sols)==len(fnids)/2 and len(set([x[0] for x in sols]+[-x[-1] for x in sols]))==len(fnids):
#print("UNCLASSIFIED SOLVED:")
for x in sols:
try:
ps=SDG.SequenceDistanceGraphPath(ws.sdg,x).sequence()
except:
return False
for x in sols: ge.queue_path_detachment(x,True)
for x in t.internals: ge.queue_node_deletion(x.node_id())
return True
return False
def solve_all_tangles(ws,fsize=220,fminkci=-1,fmaxkci=-1,apply=False):
total=solved=unsolved=0
s=SDG.Strider(ws)
s.add_datastore(peds)
ge=SDG.GraphEditor(ws)
solved=Counter()
unsolved=Counter()
for t in ws.sdg.get_all_tangleviews(fsize,fminkci,fmaxkci):
tc=classify_tangle(t)
if tc=="tip":
#TODO add tip to reconnection list
if solve_tip(t,s,ge): solved[tc]+=1
else: unsolved[tc]+=1
elif tc=="bubble":
if solve_bubble(t,s,ge): solved[tc]+=1
else: unsolved[tc]+=1
elif tc[:6]=="repeat":
if solve_repeat(t,s,ge): solved[tc]+=1
else: unsolved[tc]+=1
else:
if len(t.internals)<100 and len(t.frontiers)<20 and solve_unclassified(t,s,ge): solved[tc]+=1
else: unsolved[tc]+=1
total+=1
print("Total tangles: %d " % (total) )
print("Solved: ",solved.most_common())
print("Unsolved: ",unsolved.most_common())
if apply:
ge.apply_all()
ws.sdg.join_all_unitigs()
def classify_tangle(t):
if len(t.frontiers)==0:
return "debris"
if len(t.frontiers)==1:
nids=set([x.node_id() for x in t.internals])
seen_nids=set()
nexts=[t.frontiers[0].rc()]
abort=False
while nexts and not abort:
new_nexts=[]
for nv in nexts:
for nnl in nv.next():
if abs(nnl.node().node_id()) not in nids:
abort=True
break
if nnl.node().node_id() not in seen_nids:
seen_nids.add(nnl.node().node_id())
new_nexts.append(nnl.node())
if abort: break
nexts=new_nexts
if not abort: return "complex tip"
if len(t.internals)==1:
nv=t.internals[0]
if len(t.frontiers)==2 and len(nv.prev())+len(nv.next())==1:
return "tip"
if len(t.frontiers)==2*len(nv.prev()) and len(nv.prev())==len(nv.next()) and len(set([abs(nv.node_id())]+[abs(x.node().node_id()) for x in nv.prev()+nv.next()]))==len(nv.prev())*2+1:
return "repeat %d:%d"%(len(nv.prev()),len(nv.prev()))
if len(t.internals)==2 and len(t.frontiers)==2:
i0p=t.internals[0].parallels()
if len(i0p)==1 and i0p[0]==t.internals[1]:
return "bubble"
else:
i0n=t.internals[0].next()
i0p=t.internals[0].prev()
i1n=t.internals[1].next()
i1p=t.internals[1].prev()
if len(i0n)==2 and len(i0p)==2 and len(i1n)==1 and len(i1p)==1 and i1n[0].node()==i1p[0].node():
return "loop"
if len(i1n)==2 and len(i1p)==2 and len(i0n)==1 and len(i0p)==1 and i0n[0].node()==i0p[0].node():
return "loop"
return "unclassified"
#run tangle functions
print(ws.sdg.stats_by_kci())
solve_all_tangles(ws,124,apply=True)
kc.update_graph_counts()
kc.compute_all_kcis()
print(ws.sdg.stats_by_kci())
ws.sdg.write_to_gfa1('after_tangles.gfa')
```
```txt
-----------------------------------------------------------------------------------
| KCI | Total bp | Nodes | Tips | N25 | N50 | N75 |
|-------+---------------+----------+---------+------------+------------+------------|
| None | 157818826 | 2151371 | 193616 | 83 | 69 | 64 |
| < 0.5 | 83657359 | 434763 | 112333 | 456 | 211 | 125 |
| ~ 1 | 1102123465 | 523808 | 99012 | 21834 | 11747 | 5431 |
| ~ 2 | 6800703 | 41891 | 415 | 250 | 135 | 125 |
| ~ 3 | 1546314 | 10606 | 70 | 177 | 125 | 119 |
| > 3.5 | 1320335 | 10122 | 115 | 133 | 125 | 114 |
|-------+---------------+----------+---------+------------+------------+------------|
| All | 1353267002 | 3172561 | 405561 | 18748 | 8703 | 1300 |
-----------------------------------------------------------------------------------
-----------------------------------------------------------------------------------
| KCI | Total bp | Nodes | Tips | N25 | N50 | N75 |
|-------+---------------+----------+---------+------------+------------+------------|
| None | 155576697 | 2124766 | 191681 | 82 | 69 | 64 |
| < 0.5 | 83017277 | 412021 | 111875 | 514 | 232 | 125 |
| ~ 1 | 1101463260 | 510162 | 98823 | 22033 | 11867 | 5501 |
| ~ 2 | 6750984 | 41241 | 410 | 257 | 136 | 125 |
| ~ 3 | 1537311 | 10472 | 68 | 181 | 125 | 119 |
| > 3.5 | 1315451 | 10015 | 103 | 134 | 125 | 114 |
|-------+---------------+----------+---------+------------+------------+------------|
| All | 1349660980 | 3108677 | 402960 | 18979 | 8836 | 1417 |
-----------------------------------------------------------------------------------
##