# Assembling diploid diatoms from Illumina + nanopore
## The strategy
- Build a k=63 DBG from the Illumina reads, with minimal cleanup.
- Run Strider with the Illumina reads to obtain long KCI=1 unitigs.
- Resolve simple repeats with long reads to get even longer unitigs.
- Turn the unitigs into linked chains of unique k=31 anchors and map the long reads.
- Challenge each unitig by linkage among its anchors (i.e. split uncertain unitigs).
- Join unitigs / run HappySorter between them.
- Go back to the k=63 DBG and untangle it using the chained anchors.
## Starting graph, cleanup, KCI
```python=
import SDGpython as SDG
from collections import Counter

K=63
MIN_CVG=5
NUM_BATCHES=4

ws = SDG.WorkSpace()
peds = ws.add_paired_reads_datastore('pt_pe.prseq')
lords = ws.add_long_reads_datastore('pt_nano.loseq') # registered now, used in later stages

# build the k=63 DBG from the paired-end reads, then clip short tips
SDG.GraphMaker(ws.sdg).new_graph_from_paired_datastore(peds,K,MIN_CVG,NUM_BATCHES)
gc=SDG.GraphContigger(ws)
gc.clip_tips(200)

# PE k-mer counts, used for coverage-based cleanup and KCI throughout
kc=ws.add_kmer_counter("main")
kc.add_count("pe",peds)
```
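A quick sanity check on the raw graph is useful before cleanup; this is a minimal sketch using only calls already shown in this section:
```python=
# optional sanity check on the raw DBG
nvs=ws.sdg.get_all_nodeviews()
print("%d nodes, %d bp total"%(len(nvs),sum(nv.size() for nv in nvs)))
print("%d tips remain after clipping"%len([nv for nv in nvs if len(nv.prev())==0 or len(nv.next())==0]))
```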
```python=
%pylab inline
# PE k-mer spectrum; reference lines bracket the unique-coverage peak (~71x)
plot(kc.count_spectra("pe",200)[:-1])
vlines([40,71,102],0,600000)
```
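The 71x peak used below can also be picked programmatically rather than by eyeballing the plot; a minimal sketch, assuming the spectrum is indexed by k-mer frequency (as the plot above suggests):
```python=
spectrum=kc.count_spectra("pe",200)
# take the maximum above the error/unique valley (assumed here to sit above ~20x)
peak=max(range(20,200),key=lambda f: spectrum[f])
print("unique-coverage peak at ~%dx"%peak)
```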

```python=
# pop small heterozygous bubbles: in each parallel pair of short nodes,
# delete the side with low-coverage k-mers when the other side has none
BUBBLE_SIZE=125
LOW_CVG=50
to_delete=[]
for nv in ws.sdg.get_all_nodeviews():
    if nv.size()<=BUBBLE_SIZE and len(nv.parallels())==1:
        on=nv.parallels()[0]
        if on.size()<=BUBBLE_SIZE and abs(on.node_id())>nv.node_id():
            nvlc=len([x for x in nv.kmer_coverage('main','pe') if x<=LOW_CVG])
            onlc=len([x for x in on.kmer_coverage('main','pe') if x<=LOW_CVG])
            if nvlc and not onlc:
                to_delete.append(nv.node_id())
            if onlc and not nvlc:
                to_delete.append(abs(on.node_id()))
for x in to_delete:
    ws.sdg.remove_node(x)
ws.sdg.join_all_unitigs()
```
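To confirm the pop worked, the same parallel-pair test can be re-run; a minimal sketch reusing the constants above:
```python=
# count nodes still sitting in small parallel pairs after the pop
remaining=len([nv for nv in ws.sdg.get_all_nodeviews()
               if nv.size()<=BUBBLE_SIZE and len(nv.parallels())==1])
print("%d nodes still in small parallel pairs"%remaining)
```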
```python=
kc.set_kci_peak(71)   # haploid coverage peak from the spectrum above
kc.compute_all_kcis() # KCI = node k-mer coverage relative to the haploid peak (~1 haplotype-unique, ~2 shared)
print(ws.sdg.stats_by_kci())
```
| KCI | Total bp | Nodes | Tips | N25 | N50 | N75 |
|-------|--------------:|---------:|--------:|-----------:|-----------:|-----------:|
| None | 8323282 | 109486 | 0 | 88 | 75 | 67 |
| < 0.5 | 258694 | 1662 | 117 | 214 | 125 | 125 |
| ~ 1 | 36953626 | 216237 | 174 | 233 | 159 | 125 |
| ~ 2 | 16625872 | 67265 | 40 | 494 | 285 | 171 |
| ~ 3 | 1390179 | 6879 | 2 | 357 | 210 | 140 |
| > 3.5 | 602563 | 3017 | 2 | 2842 | 163 | 124 |
| **All** | **64154216** | **404546** | **335** | **279** | **163** | **125** |
```python=
ws.dump('pt_dbg_clean.sdgws')
ws.sdg.write_to_gfa1('pt_dbg_clean.gfa')
peds.mapper.path_reads()
peds.mapper.dump_readpaths('pt_dbg_clean.pe_paths')
```
## Strider
```python=
import SDGpython as SDG
from collections import Counter
ws=SDG.WorkSpace('pt_dbg_clean.sdgws')
peds=ws.get_paired_reads_datastore('pt_pe')
peds.mapper.load_readpaths('pt_dbg_clean.pe_paths')
kc=ws.get_kmer_counter('main')
kc.set_kci_peak(71)
kc.update_graph_counts()
kc.compute_all_kcis()
s=SDG.Strider(ws)
s.add_datastore(peds)
# stride out from every anchor using the paired-read paths
s.stride_from_anchors(min_size=1)
```
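Before running the detachment below, it can help to eyeball what Strider produced; this sketch uses only `s.is_anchor`, `s.routes_fw` and `s.routes_bw`, which the function below consumes:
```python=
# peek at the first few anchors' forward/backward routes (signed node-id sequences)
shown=0
for nv in ws.sdg.get_all_nodeviews():
    nid=nv.node_id()
    if s.is_anchor[nid]:
        print(nid,"fw:",s.routes_fw[nid],"bw:",s.routes_bw[nid])
        shown+=1
        if shown==5: break
```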
```python=
def strider_run_from_cpp():
    # 1) collect "linear" anchors: anchors with a non-trivial route in both directions
    linear_anchors=set()
    for nv in ws.sdg.get_all_nodeviews():
        nid=nv.node_id()
        if s.is_anchor[nid] and len(s.routes_fw[nid])>1 and len(s.routes_bw[nid])>1: linear_anchors.add(nid)
    print("%d anchors are linear" % len(linear_anchors))

    # 2) detach anchor-to-anchor paths confirmed by striding from both ends
    ge=SDG.GraphEditor(ws)
    used_nodes=[]
    paths=0
    accepted=0
    rejected=0
    nopaths=0
    for fnid in list(linear_anchors):
        for nid in [fnid,-fnid]:
            if nid>0: fp=s.routes_fw[nid][1:]
            else: fp=s.routes_bw[-nid][1:]
            p=[]
            for x in fp:
                if abs(x) in linear_anchors:
                    p=fp[:fp.index(x)+1]
                    # the route striding back from the other anchor must mirror this one
                    if x<0: op=[-y for y in s.routes_fw[-x][1:][::-1]]
                    else: op=[-y for y in s.routes_bw[x][1:][::-1]]
                    if nid not in op or p[:-1]!=op[op.index(nid)+1:]: p=[]
                    break
            if p and abs(p[-1])>abs(nid):
                try:
                    SDG.SequenceDistanceGraphPath(ws.sdg,[nid]+p).sequence()
                    paths+=1
                    if ge.queue_path_detachment([nid]+p,True):
                        accepted+=1
                    else:
                        rejected+=1
                except:
                    nopaths+=1
                for r in p[:-1]: used_nodes.append(abs(r))
    print("%d paths (%d accepted, %d rejected) and %d no-paths"%(paths,accepted,rejected,nopaths))
    ge.apply_all()

    # 3) delete "used-up" tips left behind by the detachments, then re-unitig
    for r in range(10):
        ge=SDG.GraphEditor(ws)
        for x in used_nodes:
            try:
                nv=ws.sdg.get_nodeview(x)
                if len(nv.prev())==0 or len(nv.next())==0:
                    ge.queue_node_deletion(x)
            except: pass
        ge.apply_all()
        ws.sdg.join_all_unitigs()
    ge.remove_small_components(10,1000,10000)

    ### Finishing off the assembly: typical heuristics that can be more aggressive once things already look fine.

    # 4) an incredibly crude loop resolver, only supporting one round across the loop
    kc.update_graph_counts()
    peds.mapper.path_reads()
    print(len(ws.sdg.get_all_nodeviews()))
    ge=SDG.GraphEditor(ws)
    for nv in ws.sdg.get_all_nodeviews():
        if len(nv.prev())!=1 or len(nv.next())!=1 or nv.prev()[0].node().node_id()!=nv.next()[0].node().node_id(): continue
        r=nv.next()[0].node().node_id()
        orn=[x.node().node_id() for x in nv.next()[0].node().next() if x.node().node_id()!=nv.node_id()]
        if len(orn)!=1: continue
        orn=orn[0]
        orp=[x.node().node_id() for x in nv.next()[0].node().prev() if x.node().node_id()!=nv.node_id()]
        if len(orp)!=1: continue
        orp=orp[0]
        print("Loop on %d -> %d -> %d -> %d -> %d"%(orp,r,nv.node_id(),r,orn))
        nc=Counter()
        nid=nv.node_id()
        for rid in peds.mapper.paths_in_node[abs(nid)]:
            if nid<0: rid=-rid
            nc.update(peds.mapper.path_fw(rid,nid))
        # if no read path re-enters the loop node but some reach the exit, the loop runs just once
        if nid not in nc and orn in nc:
            print("loop goes around just once!")
            ge.queue_node_expansion(r,[[-orp,nid],[-nid,orn]])
    ge.apply_all()
    ws.sdg.join_all_unitigs()
    print(len(ws.sdg.get_all_nodeviews()))

    # 5) short, low-information node removal: drop short nodes whose unique k-mers
    # have far lower coverage than their shared k-mers
    from statistics import median
    kc.update_graph_counts()
    removed=0
    for nv in ws.sdg.get_all_nodeviews():
        if nv.size()>1000: continue
        rc=nv.kmer_coverage("main","pe")
        gcov=nv.kmer_coverage("main","sdg")
        ukci=[]
        skci=[]
        for i in range(len(rc)):
            if gcov[i]==1: ukci.append(rc[i])
            else: skci.append(rc[i]/gcov[i])
        if ukci==[]: ukci=[-1]
        if skci==[]: skci=[-1]
        ms=median(skci)
        mu=median(ukci)
        if mu>-1 and mu<=4 and ms>5*mu:
            removed+=1
            ws.sdg.remove_node(nv.node_id())
    print("%d low-coverage, low-information nodes removed"%removed)
    ws.sdg.join_all_unitigs()

strider_run_from_cpp()
```
```python=
ws.dump('pt_dbg_strided.sdgws')
ws.sdg.write_to_gfa1('pt_dbg_strided.gfa')
kc.update_graph_counts()
kc.compute_all_kcis()
print(ws.sdg.stats_by_kci())
```
| KCI | Total bp | Nodes | Tips | N25 | N50 | N75 |
|-------|--------------:|---------:|--------:|-----------:|-----------:|-----------:|
| None | 3708227 | 44929 | 76 | 100 | 79 | 68 |
| < 0.5 | 246947 | 1195 | 31 | 2074 | 125 | 125 |
| ~ 1 | 45776902 | 14833 | 178 | 14623 | 8037 | 3792 |
| ~ 2 | 4757366 | 7861 | 37 | 1321 | 814 | 609 |
| ~ 3 | 331153 | 1180 | 2 | 733 | 499 | 163 |
| > 3.5 | 400473 | 1419 | 2 | 39651 | 409 | 128 |
| **All** | **55221068** | **71417** | **326** | **13151** | **6310** | **1933** |
## Solve canonical repeats with long reads
```python=
import SDGpython as SDG
from collections import Counter
ws=SDG.WorkSpace('pt_dbg_strided.sdgws')
kc=ws.get_kmer_counter('main')
kc.set_kci_peak(71)
lords=ws.get_long_reads_datastore('pt_nano')
lrr=SDG.LongReadsRecruiter(ws.sdg,lords)
lrr.map()
# thread each long read through the nodes it matches, then build a read-threads graph
lrr.simple_thread_reads()
rtg=lrr.rtg_from_threads()
```
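A rough mapping-yield check is worthwhile before attempting repeat resolution; `read_perfect_matches` holds one match list per read, as used in `solve_with_llr2` below:
```python=
# how many long reads picked up a useful number of perfect matches
mlens=[len(rpm) for rpm in lrr.read_perfect_matches]
print("%d reads, %d with >=10 perfect matches"%(len(mlens),sum(1 for x in mlens if x>=10)))
```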
```python=
def is_canonical_repeat(nid):
    # a canonical repeat has exactly 2 inputs and 2 outputs, all 5 nodes involved distinct
    nv=ws.sdg.get_nodeview(nid)
    if len(nv.next())!=2 or len(nv.prev())!=2 or len(set([abs(x) for x in [y.node().node_id() for y in nv.next()+nv.prev()]+[abs(nid)]]))!=5: return False
    return True

def solve_with_llr2(nid,min_support=5,max_noise=2,snr=3,verbose=False):
    # vote with long-read threads: each thread crossing the repeat supports an in/out pairing
    nA,nB=[x.node().node_id() for x in ws.sdg.get_nodeview(nid).next()]
    pA,pB=[x.node().node_id() for x in ws.sdg.get_nodeview(nid).prev()]
    tAA=0
    tAB=0
    tBA=0
    tBB=0
    all_voting_tids=set(rtg.node_threads(pA).union(rtg.node_threads(pB))).intersection(set(rtg.node_threads(nA).union(rtg.node_threads(nB))))
    for tid in all_voting_tids:
        apA,apB,anA,anB=abs(pA),abs(pB),abs(nA),abs(nB)
        c=Counter([abs(x.node) for x in lrr.read_perfect_matches[tid] if abs(x.node) in [apA,apB,anA,anB]])
        if c[apA]>c[apB]*snr:
            if c[anA]>c[anB]*snr: tAA+=1
            elif c[anB]>c[anA]*snr: tAB+=1
        elif c[apB]>c[apA]*snr:
            if c[anA]>c[anB]*snr: tBA+=1
            elif c[anB]>c[anA]*snr: tBB+=1
    if verbose: print(nid,": ",tAA,tBB,tAB,tBA)
    # accept a solution only with enough support, little noise, and a clear signal-to-noise margin
    if min(tAA,tBB)>=min_support and max(tAB,tBA)<=max_noise and min(tAA,tBB)>=max(tAB,tBA)*snr:
        return [(-pA,nA),(-pB,nB)]
    if min(tAB,tBA)>=min_support and max(tAA,tBB)<=max_noise and min(tAB,tBA)>=max(tAA,tBB)*snr:
        return [(-pA,nB),(-pB,nA)]
    return []
```
```python=
ge=SDG.GraphEditor(ws)
total_cr=0
solved_cr=0
for x in ws.sdg.get_all_nodeviews():
    nid=x.node_id()
    if is_canonical_repeat(nid):
        total_cr+=1
        sol=solve_with_llr2(nid)
        if sol!=[]:
            ge.queue_node_expansion(nid,sol)
            solved_cr+=1
print("%d / %d canonical repeats solved"%(solved_cr,total_cr))
ge.apply_all()
ws.sdg.join_all_unitigs()
```
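Re-running the detection after `join_all_unitigs` shows how many canonical repeats are left (some will simply not pass the support thresholds):
```python=
# recount canonical repeats on the joined graph
left=sum(1 for x in ws.sdg.get_all_nodeviews() if is_canonical_repeat(x.node_id()))
print("%d canonical repeats remain"%left)
```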
```python=
kc.update_graph_counts()
print(ws.sdg.stats_by_kci())
```
| KCI | Total bp | Nodes | Tips | N25 | N50 | N75 |
|-------|--------------:|---------:|--------:|-----------:|-----------:|-----------:|
| None | 3689367 | 44694 | 76 | 100 | 79 | 68 |
| < 0.5 | 319624 | 1126 | 31 | 11296 | 4965 | 125 |
| ~ 1 | 50646430 | 7203 | 177 | 58666 | 35637 | 17486 |
| ~ 2 | 1996338 | 3650 | 37 | 15936 | 1421 | 545 |
| ~ 3 | 190553 | 891 | 2 | 564 | 223 | 125 |
| > 3.5 | 378439 | 1302 | 2 | 39651 | 469 | 128 |
| **All** | **57220751** | **58866** | **325** | **55879** | **31091** | **11469** |
```python=
ws.dump('pt_dbg_strided_lrrep.sdgws')
ws.sdg.write_to_gfa1('pt_dbg_strided_lrrep.gfa')
```
## Anchors and long read mapping
```python=
import SDGpython as SDG
from collections import Counter
ws=SDG.WorkSpace('pt_dbg_strided_lrrep.sdgws')
kc=ws.get_kmer_counter('main')
kc.set_kci_peak(71)
ws2=SDG.WorkSpace()
# reduce the graph to its unique-anchor backbone in a fresh workspace
SDG.GraphContigger(ws).contig_reduction_to_unique_kmers(ws2,"main","pe",50,92,100)
```
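`ws2` now holds the anchor graph; a quick look at its size, using the same nodeview calls as before:
```python=
# size of the unique-anchor graph
anvs=ws2.sdg.get_all_nodeviews()
print("%d anchor nodes, %d bp total"%(len(anvs),sum(nv.size() for nv in anvs)))
```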
```python=
peds = ws2.add_paired_reads_datastore('pt_pe.prseq')
peds.mapper.path_reads(k=31)
lords = ws2.add_long_reads_datastore('pt_nano.loseq')
lrr=SDG.LongReadsRecruiter(ws2.sdg,lords,k=31)
lrr.map()
```
```python=
ws2.dump('pt_anchors.sdgws')
peds.mapper.dump_readpaths('pt_anchors.pe_paths')
lrr.dump('pt_anchors.lrr')
```
## HappySorter from anchor lines (i.e. unitigs)
```python=
import SDGpython as SDG
from collections import Counter
ws=SDG.WorkSpace('pt_anchors.sdgws')
peds = ws.get_paired_reads_datastore('pt_pe')
peds.mapper.load_readpaths('pt_anchors.pe_paths')
lords = ws.get_long_reads_datastore('pt_nano')
lrr=SDG.LongReadsRecruiter(ws.sdg,lords,k=31)
lrr.load('pt_anchors.lrr')
lrr.simple_thread_reads()
rtg=lrr.rtg_from_threads(True)
# pop nodes that behave as repeats within threads before sorting
rtg.apply_popping_list(rtg.clean_repeat_nodes_popping_list(200))
```
```python=
hsr=SDG.HappySorterRunner(rtg)
hsr.run_from_dg_lines(ws.sdg,50)
hsr.dump("hsr_50.orders")
```
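A quick summary of the resulting orders; this assumes `hsr.orders` values expose `as_signed_nodes()` like the HappySorter orders used further down:
```python=
print("%d orders produced"%len(hsr.orders))
# sizes of the first few orders
for i,o in list(hsr.orders.items())[:5]:
    print("order %d: %d nodes"%(i,len(o.as_signed_nodes())))
```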
```python=
otg=SDG.ReadThreadsGraph(ws.sdg)
for i,o in hsr.orders.items():
otg.add_thread(i+1,o.as_thread(ws.sdg))
otg.dump('otg.rtg')
```
## Create a thread graph from the orders, pop repeated/conflicted nodes and recover connectivity across disconnected ends
This is just to get a rough idea of the overall structure.
```python=
lu=SDG.LinkageUntangler(otg)
lu.select_all()
slotg=lu.make_nextselected_linkage(1)
```
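Dumping the linkage makes it easy to inspect visually (e.g. in Bandage); this assumes the returned linkage graph exposes `write_to_gfa1` like the SDG does:
```python=
# rough-idea output: next-selected linkage as GFA (write_to_gfa1 availability assumed)
slotg.write_to_gfa1('pt_orders_linkage.gfa')
```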
## Plotting order-growth GIFs
```python=
# check the orders from the sorters against a reference long-read datastore
rlords=ws.add_long_reads_datastore('SC5314.loseq')
rlrr=SDG.LongReadsRecruiter(ws.sdg,rlords,k=31)
rlrr.map()
# per node: every (reference read, position) it perfectly matches
node_coords={x.node_id(): [] for x in ws.sdg.get_all_nodeviews()}
for cid,rpm in enumerate(rlrr.read_perfect_matches):
    for m in rpm:
        node_coords[abs(m.node)].append((cid,m.read_position))

%pylab inline
import imageio,os

def plot_order(order,old_coordinates={},_ylim=None):
    # classify each node's reference hits: unchanged (blue), newly placed (green), moved (red)
    xval=[[] for x in rlrr.read_perfect_matches]
    yval=[[] for x in rlrr.read_perfect_matches]
    new_xval=[[] for x in rlrr.read_perfect_matches]
    new_yval=[[] for x in rlrr.read_perfect_matches]
    updated_xval=[[] for x in rlrr.read_perfect_matches]
    updated_yval=[[] for x in rlrr.read_perfect_matches]
    for x in order.as_signed_nodes():
        for nc in node_coords[abs(x)]:
            if x not in old_coordinates:
                new_xval[nc[0]].append(order.node_coordinates[x])
                new_yval[nc[0]].append(nc[1])
            elif old_coordinates[x]!=order.node_coordinates[x]:
                updated_xval[nc[0]].append(order.node_coordinates[x])
                updated_yval[nc[0]].append(nc[1])
            else:
                xval[nc[0]].append(order.node_coordinates[x])
                yval[nc[0]].append(nc[1])
    MIN_HITS_TO_DRAW=5
    w=int(ceil(sqrt(len([i for i in range(len(xval)) if len(xval[i])+len(new_xval[i])+len(updated_xval[i])>=MIN_HITS_TO_DRAW]))))
    f,ax=plt.subplots(w,w,figsize=(10,10))
    used_i=-1
    for cid in range(1,len(xval)):
        xv=xval[cid]
        yv=yval[cid]
        nxv=new_xval[cid]
        nyv=new_yval[cid]
        uxv=updated_xval[cid]
        uyv=updated_yval[cid]
        if len(xv)+len(nxv)+len(uxv)<MIN_HITS_TO_DRAW: continue
        used_i+=1
        if w>1:
            a=ax[used_i//w][used_i%w]
        else:
            a=ax
        a.plot(yv,xv,'.',color='blue')
        a.plot(nyv,nxv,'.',color='green')
        a.plot(uyv,uxv,'.',color='red')
        a.title.set_text('Chr #%d'%(cid))
        if _ylim: a.set_ylim(_ylim)
        a.set_xlim(0,rlords.get_read_size(cid))

def animate_order_growth(start_line,_ylim=None):
    n_frames = 3 # frames between transitions
    print("Creating animation of an order's growth",flush=True)
    hs=SDG.HappySorter(rtg)
    hs.run_from_nodelist(start_line,10,.3,4,6,1,True)
    last_coords=hs.order.node_coordinates
    plot_order(hs.order,last_coords,_ylim)
    suptitle("Order growth step 1")
    savefig('ordergrowth_0.png')
    close()
    index=0
    while hs.q_grow_loop(3,.7,4,6,1,True):
        index+=1
        plot_order(hs.order,last_coords,_ylim)
        suptitle(f"Order growth step {index+1}")
        savefig(f'ordergrowth_{index}.png')
        close()
        last_coords=hs.order.node_coordinates
    print('Charts saved\n',flush=True)
    # build the GIF, holding the final frame a bit longer
    print('Creating gif\n',flush=True)
    images_slow=[]
    for x in range(index+1):
        for f in range(n_frames):
            images_slow.append(imageio.imread(f'ordergrowth_{x}.png'))
    for f in range(2*n_frames):
        images_slow.append(imageio.imread(f'ordergrowth_{index}.png'))
    imageio.mimsave('ordergrowth.gif', images_slow, 'GIF-FI')
    print('Gif saved\n',flush=True)
    # remove the intermediate frames
    print('Removing images\n',flush=True)
    for x in range(index+1):
        os.remove(f'ordergrowth_{x}.png')
    print('DONE',flush=True)
```
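An example invocation; the node list here is hypothetical and should be replaced with a real anchor line from the graph:
```python=
# hypothetical starting line (signed node ids); substitute a real line
start_line=[123,-456,789]
animate_order_growth(start_line,_ylim=(0,1000000))
```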
## Careful grow_fw node placement prototype
- Uses only threads with a minimum number of hits to the order's end nodes.
- Uses local offsets to place nodes in a coordinate system synced to the order.
- Offsets are updated every time a thread crosses a node at the end of the order (not other nodes that already have order coordinates).
- All this localises the problem to individual threads and nodes, reducing the effect of past mistakes in the order.
- This function still requires happiness, which still requires thread recruitment into the order, which is not yet fully replaced.
- There is no similar function for internal recruitment yet, so this may eventually get stuck and need a single iteration of the standard grow loop to add nodes in lower-density regions.
```python=
def careful_grow(hs,end_size=30,thread_hits=4,node_hits=3,min_happiness=.7):
    #1 find threads with enough hits to the order's end; keep only the relevant
    #  part of each (from its first hit to an end node onwards)
    end_nodes=hs.order.as_signed_nodes()[-end_size:]
    end_positions=[hs.order.node_coordinates[n] for n in end_nodes]
    tc=Counter()
    for nid in end_nodes:
        tc.update(rtg.node_threads(nid,True))
    tids=[t for t,c in tc.items() if c>=thread_hits]
    cut_threads=[]
    for tid in tids:
        t=rtg.get_thread(tid)
        #TODO: remove threads that do not follow the order
        ct=[]
        for tnp in t:
            if ct or tnp.node in end_nodes:
                ct.append(tnp)
        if ct: cut_threads.append(ct)
    #2 find candidate nodes with enough hits that are still "happy" in the order
    nc=Counter()
    for ct in cut_threads:
        nc.update([x.node for x in ct])
    candidates=[x for x,c in nc.items() if c>=node_hits and hs.node_happiness(x,True,False)>=min_happiness]
    #3 keep only order nodes and candidates; sync thread coordinates to order coordinates (fw only)
    new_pos={}
    for ct in cut_threads:
        offset=hs.order.node_coordinates[ct[0].node]-ct[0].start
        for i in range(len(ct)):
            if ct[i].node in end_nodes:
                # crossing an end node refreshes the offset, keeping coordinates locally synced
                offset=hs.order.node_coordinates[ct[i].node]-ct[i].start
            elif ct[i].node not in candidates: continue
            ct[i].start+=offset
            ct[i].end+=offset
            if ct[i].node not in hs.order.node_coordinates:
                if ct[i].node not in new_pos: new_pos[ct[i].node]=[]
                new_pos[ct[i].node].append(ct[i].start)
    print("First/last coordinate of order end:",hs.order.node_coordinates[end_nodes[0]],hs.order.node_coordinates[end_nodes[-1]])
    #4 place each new node at the median of its proposed positions
    pnodes=[]
    for x in new_pos.items():
        sp=sorted(x[1])
        pnodes.append((x[0],sp[len(sp)//2]))
    return pnodes
```
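A usage sketch: grow an order, then ask `careful_grow` for new placements. The `run_from_nodelist` parameters are copied from `animate_order_growth` above, and `start_line` is the hypothetical node list from the previous example:
```python=
# sketch: carefully place nodes at the forward end of a growing order
hs=SDG.HappySorter(rtg)
hs.run_from_nodelist(start_line,10,.3,4,6,1,True)
for nid,pos in careful_grow(hs):
    print("would place node %d at coordinate %d"%(nid,pos))
```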