# Strawberry sort
[TOC]
## Script 1
```python
import sys
sys.path.append('/hpc-home/ggarcia/git_sources/bsg/build/SDGpython')
import SDGpython as SDG
import os
MIN_UKMER_FREQ=57
MAX_UKMER_FREQ=135
K=31
## SDG-DBG assembly k=63
## Min coverage is important to reduce the noise
GK=63
MIN_CVG=30
NUM_BATCHES=8
ws=SDG.WorkSpace()
peds=ws.add_paired_reads_datastore('./data/strbrg_pe.prseq', 'pe')
gm=SDG.GraphMaker(ws.sdg)
gm.new_graph_from_paired_datastore(peds,K,MIN_CVG,NUM_BATCHES); #this is so the initial kmer count includes all 31-mers
print("First graph done")
kc = ws.add_kmer_counter('pek31', K)
kc.add_count('pe', peds)
print("kmers counted")
gm.new_graph_from_paired_datastore(peds,GK,MIN_CVG,NUM_BATCHES); #this is the real DBG k
print("second graph done")
ws.sdg.write_to_gfa1("./00-sdg_dbg_graph_MC%s.gfa" %MIN_CVG)
gc=SDG.GraphContigger(ws)
gc.clip_tips(2*GK-1,3)
print("Remaining nodes: %s" %(len(ws.sdg.get_all_nodeviews())))
kc.update_graph_counts()
ws.dump("./00-sdg_dbg_graph_MC%s.sdgws" %MIN_CVG)
## Check step1
passed=True
ccc=zip(kc.count_spectra('pe',present_in_graph=False),kc.count_spectra('pe',present_in_graph=True))
for i,x in list(enumerate(ccc))[MIN_UKMER_FREQ:MAX_UKMER_FREQ+1]:
    if x[0]!=x[1]:
        print(x[0]-x[1],"Kmers are missing at f=",i," -> ",x)
        passed=False
if passed: print("Check passed!!!!")
else: print("Check FAILED!!!!")
## Make unique kmer runs
seqs = gc.contig_reduction_to_unique_kmers(MIN_UKMER_FREQ,MAX_UKMER_FREQ)
print(len(seqs),"sequences for anchoring")
with open("./pseudoLong2.fasta","w") as f:
for i, s in enumerate(seqs):
f.write(">seq_%s\n%s\n"%(i,s))
```
## Script 2
```python
import sys
sys.path.append('/hpc-home/ggarcia/git_sources/bsg/build/SDGpython')
import SDGpython as SDG
import os
## TODO (try on octo):
## - Save the path reads
## - Make the path reads store the offsets
## - Save the graphs in the workspace, not in GFAs, to keep the support. Raw sedg and pedg.
## -- ws.add_distance_graph or something like that, it can go inside the graph
ws=SDG.WorkSpace()
ws.sdg.load_from_fasta("./pseudoLong2.fasta")
peds=ws.add_paired_reads_datastore('./data/strbrg_pe.prseq', 'pe')
peds.mapper.path_reads(31,fill_offsets=True)
def dg_from_paths(peds,g,add_pair_jumps=False,pair_jump_distance=300):
    dg=SDG.DistanceGraph(g)
    for rid in range(1,len(peds.mapper.read_path_offsets)):
        p=peds.mapper.read_paths[rid].path
        o=peds.mapper.read_path_offsets[rid]
        for i in range(len(p)-1):
            dg.add_link(-p[i],p[i+1],o[i+1][0]-o[i+1][1]-(o[i][0]-o[i][1])-dg.get_nodeview(p[i]).size(),SDG.Support(SDG.SupportType.stReadPath,0,rid))
    if add_pair_jumps:
        for rid1 in range(1,len(peds.mapper.read_path_offsets),2):
            p1=peds.mapper.read_paths[rid1].path
            p2=peds.mapper.read_paths[rid1+1].path
            if p1 and p2:
                dg.add_link(-p1[-1],-p2[-1],pair_jump_distance,SDG.Support(SDG.SupportType.stReadPath,1,rid1))
    return dg
sedg=dg_from_paths(peds,ws.sdg)
u=SDG.LinkageUntangler(sedg)
u.select_all()
sedg_nsl=u.make_nextselected_linkage(5)
sedg_nsl.write_to_gfa1("./sedg_nsl.gfa")
pedg=dg_from_paths(peds,ws.sdg,add_pair_jumps=True)
u=SDG.LinkageUntangler(pedg)
u.select_all()
pedg_nsl=u.make_nextselected_linkage(5)
pedg_nsl.remove_transitive_links()
pedg_nsl.write_to_gfa1("./pedg_nsl.gfa")
ws.dump("./01-unique_kmer_runs_pemapped.sdgws")
```
## Script 3
```python
import sys
sys.path.append('/hpc-home/ggarcia/git_sources/bsg/build/SDGpython')
import SDGpython as SDG
import os
import pickle
ws = SDG.WorkSpace()
ws.sdg.load_from_gfa('./sedg_nsl.gfa')
## group components
def get_components(dg):
    """ Transform lines to groups """
    all_nodes = set([n.node_id() for n in dg.get_all_nodeviews()])
    groups={}
    while len(all_nodes)>0:
        node = all_nodes.pop()
        group = [node, ]
        fwn = dg.get_nodeview(node).next()
        while len(fwn)==1:
            next_node = fwn[0].node().node_id()
            if next_node in group: break
            group.append(next_node)
            try: all_nodes.remove(abs(next_node))
            except: pass
            fwn = dg.get_nodeview(next_node).next()
        group = [x*-1 for x in group[::-1]]
        fwn = dg.get_nodeview(-node).next()
        while len(fwn)==1:
            next_node = fwn[0].node().node_id()
            if next_node in group: break
            group.append(next_node)
            try: all_nodes.remove(abs(next_node))
            except: pass
            fwn = dg.get_nodeview(next_node).next()
        groups[node] = group
    return groups
groups = get_components(ws.sdg)
## transformation table from node to group
print("Found %s groups" %len(groups))
transformation = {}
for nodo_central, integrantes in groups.items():
    for integrante in integrantes:
        transformation[integrante] = nodo_central
        transformation[-integrante] = -nodo_central
## Pickle transformation dict
with open('./transformation_dict.pickle', 'wb') as tf:
    pickle.dump(transformation, tf, protocol=pickle.HIGHEST_PROTOCOL)
## Pickle groups dict
with open('./groups_dict.pickle', 'wb') as tf:
    pickle.dump(groups, tf, protocol=pickle.HIGHEST_PROTOCOL)
```
## Script 4
```python
import sys
sys.path.append('/hpc-home/ggarcia/git_sources/bsg/build/SDGpython')
import SDGpython as SDG
import os
import pickle
from collections import Counter
## TODO:
## - remove all links at the start (no, it does not affect the mapper now)
## - Consider using the threads instead of the read perfect matches for the part below
## Load transformation dict
with open('./transformation_dict.pickle', 'rb') as tf:
    transformation = pickle.load(tf)
## Load groups dict
with open('./groups_dict.pickle', 'rb') as tf:
    groups = pickle.load(tf)
LOAD_FROM_DISK=False
if not LOAD_FROM_DISK:
    ws = SDG.WorkSpace()
    ws.sdg.load_from_gfa('./sedg_nsl.gfa')
    deleted_links = 0
    for nv in ws.sdg.get_all_nodeviews():
        if len(nv.prev())>1:
            for pn in nv.prev():
                deleted_links+=1
                ws.sdg.remove_link(-pn.node().node_id(), nv.node_id())
        if len(nv.next())>1:
            for nn in nv.next():
                deleted_links+=1
                ws.sdg.remove_link(-nv.node_id(), nn.node().node_id())
        if len(nv.prev()) == 1 and len(nv.next()) == 1 and abs(nv.prev()[0].node().node_id()) == abs(nv.next()[0].node().node_id()):
            ws.sdg.remove_link(-nv.node_id(), nv.next()[0].node().node_id())
            ws.sdg.remove_link(-nv.prev()[0].node().node_id(), nv.node_id())
            deleted_links+=2
    print("Deleted %s links" %(deleted_links))
    lords = ws.add_long_reads_datastore('./data/strbrg_nanopore.loseq')
    lrr=SDG.LongReadsRecruiter(ws.sdg,lords,31)
    lrr.map(31)
    #for x in range(1,17972097, 50000):
    #    print("Mapping %s-%s" %(x, x+50000))
    #    lrr.map(31, x, x+50000)
    lrr.thread_reads(end_matches=1)
    ws.dump('./sedg_nsl.sdgws')
    lrr.dump('./sedg_nsl.lrr.data')
### -------- ###
else:
    ws = SDG.WorkSpace('./sedg_nsl.sdgws')
    lords = ws.get_long_reads_datastore('long_reads')
    lrr=SDG.LongReadsRecruiter(ws.sdg,lords,31)
    lrr.load('./sedg_nsl.lrr.data')
## transform reads to group mappings XXX LOSES DISTANCE XXX
trpm=[]
for rpm in lrr.read_perfect_matches:
    if len(rpm)<2:
        trpm.append([])
        continue
    ms=[transformation[rpm[0].node]]
    for ix in range(1,len(rpm)-1):
        x=rpm[ix]
        g=transformation[x.node]
        if g==ms[-1]: continue #same group as before
        if ms[-1]==transformation[rpm[ix+1].node]: continue #sandwich
        ms.append(g)
    if transformation[rpm[-1].node]!=ms[-1]: ms.append(transformation[rpm[-1].node])
    trpm.append(ms)
## Pickle the transformed read mappings
with open('./trpm.pickle', 'wb') as tf:
    pickle.dump(trpm, tf, protocol=pickle.HIGHEST_PROTOCOL)
def gid_reads():
    """Map from absolute group id to the set of reads (trpm indices) mapped to that group, to translate later."""
    gid2reads={abs(gid):set() for gid in groups.keys()}
    for i, read in enumerate(trpm):
        for gid in read:
            gid2reads[abs(gid)].add(i)
    return gid2reads
gid2reads = gid_reads()
## TODO: instead of voting for the next, build an MLDG with the long-read information
## One with and one without all the connections.
## Distance = number of groups
def get_next(gid, min_links=5, black_list=[]):
    """ Given a group id return the next group using long reads """
    c=Counter()
    trpm_ids = gid2reads[abs(gid)]
    ttrpm = [trpm[i] for i in trpm_ids]
    for x in ttrpm:
        if gid in black_list or -gid in black_list: continue ## If node is a super connected one don't vote (should be abs??)
        if -gid in x:
            x= [y*-1 for y in x[::-1]]
        if gid in x and x[-1] != gid:
            c[x[x.index(gid)+1]]+=1
    if len(c.most_common())>0 and c.most_common(1)[0][1]>min_links:
        return c.most_common(1)[0][0]
    else:
        return None
ws2 = SDG.WorkSpace()
## Add all groups as nodes
group2node={}
node2group={}
all_nodes = [nodo for k, group in groups.items() for nodo in group]
for gid in groups.keys():
    group_size = sum([ws.sdg.get_nodeview(x).size() for x in groups[gid]])
    new_node = ws2.sdg.add_node("A"*group_size)
    group2node[gid] = new_node
    node2group[new_node] = gid
print("%s nodes in ws2 graph" %len(ws2.sdg.get_all_nodeviews()))
# def runall():
##
ndg = SDG.DistanceGraph(ws2.sdg)
for i, g in enumerate(groups.keys()):
    visited=set()
    prev_group = None
    next_group = get_next(g)
    while next_group is not None and next_group not in visited:
        if prev_group is not None:
            if prev_group<0: n1=-group2node[abs(prev_group)]
            else: n1=group2node[abs(prev_group)]
            if next_group<0: n2=-group2node[abs(next_group)]
            else: n2=group2node[abs(next_group)]
            # print("%s --> %s" %(n1, n2))
            # if not ws.sdg.are_connected(-n1, n2):
            ndg.add_link(-n1, n2, 0, SDG.Support())
        visited.add(next_group)
        prev_group = next_group
        next_group = get_next(next_group)
    print("\r%s/%s" %(i, len(groups)), end='')
    if i%20000==1:
        ndg.write_to_gfa1('./dummygraph_%i.gfa' %i)
ndg.write_to_gfa1('./dummygraph2.gfa')
```
## BJ's threaded graph
```python
lrr.thread_reads(end_matches=1)
trg=lrr.dg_from_threads(False)
#trgm=lrr.dg_from_threads(True)
lu=SDG.LinkageUntangler(trg)
lu.select_all()
nsl=lu.make_nextselected_linkage(5)
nodes = []
for nv in nsl.get_all_nodeviews():
    if len(nv.next())==2 and len(nv.prev())==2:
        nodes.append(nv.node_id())
        lu.selected_nodes[nv.node_id()]=False
nsl=lu.make_nextselected_linkage(5)
wsg = SDG.WorkSpace()
## Add all groups as nodes
group2node={}
node2group={}
all_nodes = [nodo for k, group in groups.items() for nodo in group]
for gid in groups.keys():
    group_size = sum([ws.sdg.get_nodeview(x).size() for x in groups[gid]])
    new_node = wsg.sdg.add_node("A"*group_size)
    group2node[gid] = new_node
    node2group[new_node] = gid
dg=SDG.DistanceGraph(wsg.sdg)
trpm=[]  # (re)build the group-transformed matches from the re-threaded lrr
for rpm in lrr.read_perfect_matches:
    if len(rpm)<2:
        trpm.append([])
        continue
    ms=[transformation[rpm[0].node]]
    for ix in range(1,len(rpm)-1):
        x=rpm[ix]
        g=transformation[x.node]
        if g==ms[-1]: continue #same group as before
        if ms[-1]==transformation[rpm[ix+1].node]: continue #sandwich
        ms.append(g)
    if transformation[rpm[-1].node]!=ms[-1]: ms.append(transformation[rpm[-1].node])
    trpm.append(ms)
```
```python
lu.select_all()
lu.make_nextselected_linkage(5)
```
```python
def dump_partial_graph(filename,dg,nid, radius=10, max_nodes=10000):
    nodes=set([abs(nid)])
    for x in range(radius):
        nn=list(nodes)
        for nid in nn:
            for l in dg.get_nodeview(nid).prev()+dg.get_nodeview(nid).next():
                nodes.add(abs(l.node().node_id()))
            if len(nodes)>=max_nodes: break
        if len(nodes)>=max_nodes: break
    dg.write_to_gfa1(filename,selected_nodes=list(nodes))
    #print(nodes)
```
```python
from collections import Counter

def nodes_in_threads_fw(nv):
    c=Counter()
    reads=set([x.support().id for x in nv.next()])
    to_explore=[nv]
    while to_explore:
        #print("exploring nodes:",[x.node_id() for x in to_explore])
        new_to_explore=[]
        for nnv in to_explore:
            for ln in nnv.next():
                if ln.support().id in reads:
                    if c[ln.node().node_id()]==0:
                        new_to_explore.append(ln.node())
                    c[ln.node().node_id()]+=1
        to_explore=new_to_explore
    return c

class HaplotypePuller(object):
    """Pulls a haplotype (set of nodes/reads) from a thread dg:
    - A node is a good candidate to belong if:
      - most (a high %) of its connections are to nodes in the haplotype
      - most (a high %) of its reads are reads in the haplotype
    - A read is a good candidate to belong if:
      - most (a high %) of its nodes are in the haplotype
    - To start you need a node set that is a reasonable hypothesis for a haplotype region; you can get that straight away from a good long read.
    """
    def __init__(self,dg,lrr):
        self.dg=dg
        self.lrr=lrr
        self.node_ids=set()
        self.read_ids=set()
    def start_from_read_nodes(self,rid):
        """starting from all nodes from a single read gives a set of "mostly correct haplotype" nodes"""
        self.node_ids.update([m.node for m in self.lrr.read_threads[rid]])
        self.read_ids.add(rid)
    def start_node_neighbourhood(self,nid,min_reads=10):
        for nnid,c in nodes_in_threads_fw(self.dg.get_nodeview(nid)).items():
            if c>=min_reads:
                self.node_ids.add(nnid)
        for nnid,c in nodes_in_threads_fw(self.dg.get_nodeview(-nid)).items():
            if c>=min_reads:
                self.node_ids.add(-nnid)
    def nodes_fw_inout(self,nid,min_c=2):
        """count fw thread connections that land inside vs. outside the current haplotype node set"""
        nv=self.dg.get_nodeview(nid)
        nodes_in=0
        nodes_out=0
        for nnid,c in nodes_in_threads_fw(nv).items():
            if c<min_c: continue
            if nnid in self.node_ids:
                nodes_in+=c
            else:
                nodes_out+=c
        return (nodes_in,nodes_out)
    def nodes_fw_perc(self,nid,min_c=2):
        """fraction of fw thread connections that stay inside the haplotype"""
        nodes_in,nodes_out=self.nodes_fw_inout(nid,min_c)
        return nodes_in/(nodes_in+nodes_out)
    def nodes_all_perc(self,nid,min_c=2):
        """fraction of thread connections in both directions that stay inside the haplotype"""
        fnodes_in,fnodes_out=self.nodes_fw_inout(nid,min_c)
        bnodes_in,bnodes_out=self.nodes_fw_inout(-nid,min_c)
        nodes_in=bnodes_in+fnodes_in
        nodes_out=bnodes_out+fnodes_out
        return nodes_in/(nodes_in+nodes_out)
```
## BJ's segregation filtering
The idea is to use coverage on Hapil and the 20 F1s to filter nodes in each read, making sure only haplotype-specific nodes are left in.
For each read we first need to check what the typical profile of segregation is:
- This is easier to do on nodes/kmers that are absent from hapil. Since we are already picking nodes that are "single copy" on RG, nodes completely absent from Hapil should have f=0.5.
```python
rg_only_count=0
rg_only_bp=0
total_count=0
total_bp=0
for nv in ws.sdg.get_all_nodeviews():
    if [x for x in nv.kmer_coverage("rgxh31","hapil") if x<=5]:
        rg_only_count+=1
        rg_only_bp+=nv.size()
    total_count+=1
    total_bp+=nv.size()
print("%d / %d nodes, totalling %dbp / %dbp, had RG-only k-mers" % (rg_only_count,total_count,rg_only_bp,total_bp))
```
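To sanity-check the f=0.5 expectation, here is a minimal sketch only, assuming the F1 count names are everything in `kc.list_names()` except `sdg` and `hapil`, and reusing the mean-coverage>5 presence rule from the next block:
```python
## Sketch: estimate the segregation fraction f of the RG-only nodes found above.
## Assumptions: F1 count names = kc.list_names() minus 'sdg' and 'hapil';
## mean k-mer coverage > 5 in a count means "present in that F1".
from statistics import mean
f1_names = [x for x in kc.list_names() if x not in ("sdg", "hapil")]
fractions = []
for nv in ws.sdg.get_all_nodeviews():
    if [x for x in nv.kmer_coverage("rgxh31", "hapil") if x <= 5]:  # same RG-only test as above
        present = sum(mean(nv.kmer_coverage("rgxh31", x)) > 5 for x in f1_names)
        fractions.append(present / len(f1_names))
if fractions:
    print("RG-only nodes: mean segregation fraction %.3f (expected ~0.5)" % mean(fractions))
```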
```python
from statistics import mean
from collections import Counter

countnames=[x for x in kc.list_names() if x!="sdg" and x!="hapil"]
node_seg={}
for nv in ws.sdg.get_all_nodeviews():
    node_seg[nv.node_id()] = "".join( [ str(int(mean(nv.kmer_coverage("rgxh31",x))>5)) for x in countnames])
    if nv.node_id()%10000==0: print(nv.node_id())

def fnodeseg(nv):
    if (mean(nv.kmer_coverage("rgxh31","hapil"))>5): return "11111111111111111111"
    return "".join( [ str(int(mean(nv.kmer_coverage("rgxh31",x))>5)) for x in countnames])

def find_read_pattern(matches):
    c=Counter()
    t=0
    for m in matches:
        sp=fnodeseg(ws.sdg.get_nodeview(m.node))
        if sp!="11111111111111111111":
            c[sp]+=1
            t+=1
    if t==0: return ""
    p=c.most_common(1)[0]
    if p[1]<t*.51: return ""
    return p
```
```python
def filter_read_by_segregation(rid):
    c=Counter([cf.get_pattern(ws.sdg.get_nodeview(m.node)) for m in lrr.read_perfect_matches[rid]])
    if len(c)==0: return []
    w=c.most_common(1)[0]
    if w[1]<sum(c.values())*.75: return []
    return [m for m in lrr.read_perfect_matches[rid] if cf.get_pattern(ws.sdg.get_nodeview(m.node))==w[0]]

srpm=[filter_read_by_segregation(rid) for rid in range(len(lrr.read_perfect_matches))]  # start at 0 so srpm indices line up with read_perfect_matches
for i in range(len(srpm)):
    lrr.read_perfect_matches[i]=srpm[i]
lrr.recruit(1,1)
lrr.dump('llr_after_segfilterv1.dump')

def filter_read_by_segregation2(rid):
    c=Counter([cf3.get_pattern(ws.sdg.get_nodeview(m.node)) for m in lrr.read_perfect_matches[rid]])
    if c['11111111111111111111']: c.pop('11111111111111111111')
    if len(c)==0: return []
    w=c.most_common(1)[0]
    if w[1]<sum(c.values())*.75: return []
    return [m for m in lrr.read_perfect_matches[rid] if cf3.get_pattern(ws.sdg.get_nodeview(m.node))==w[0]]

srpm2=[filter_read_by_segregation2(rid) for rid in range(len(lrrraw.read_perfect_matches))]
for i in range(len(srpm2)):
    lrrraw.read_perfect_matches[i]=srpm2[i]
```
## Analysing haplotype-segregated node lines.
### Proposed method:
1) Create a set of "anchors" (the gonza way works fine so far).
2) LRR -> generate perfect mappings.
3) Generate "segregation signatures" for all nodes.
4) Find the "prevalent signature" for each read, remove all nodes with any other signatures from the read.
5) Now we should be able to grab a set of nodes with the same signature, and create a linear order for each connected component in them.
- If a node belongs to "more than one component", or "more than one location" in the same component, we need to remove it from the problem.
- One way to check is to validate that the node divides its neighbours into "before" and "after", and that these sets of nodes are connected between them (i.e. all before and all after nodes form a single component, even if you remove the node). Nodes crossing lines will generate two "before" and two "after" components, and nodes that appear twice in the genome within a short distance will generate "before" and "after" sets that share some nodes:
- For each connection FW and BW from a node:
- Get all the reached nodes, up to a distance from the original node.
- If the node is a cross, it will have two subsets of connections fw and two subsets bw, with no connections between bw1 and bw2, nor between fw1 and fw2.
- If bw1 connects to fw1 without going through the node, and bw2 connects to fw2 without going through the node, then it is safe to delete the node.
- Otherwise, the reads can be partitioned by creating a cluster of all reads that share any number of nodes, without counting the "joining" node (i.e. start with all reads in independent sets of nodes, if two sets share X nodes join them, and repeat ad nauseam; if at the end there's more than one set, there you are). See the sketch after this list.
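A minimal, SDG-free sketch of that read-partitioning idea, assuming a hypothetical `read_nodes` dict of `{read_id: set of abs(node_id)}` built from the filtered matches (e.g. from `lrr.read_perfect_matches`):
```python
## Sketch only: cluster reads that share nodes, ignoring the "joining" node.
## read_nodes is a hypothetical {read_id: set(abs(node_id))} built from the filtered matches.
def partition_reads_by_shared_nodes(read_nodes, joining_node):
    parent = {rid: rid for rid in read_nodes}
    def find(r):
        while parent[r] != r:
            parent[r] = parent[parent[r]]  # path halving
            r = parent[r]
        return r
    def union(a, b):
        ra, rb = find(a), find(b)
        if ra != rb:
            parent[ra] = rb
    # index reads by node, skipping the joining node so it cannot glue clusters together
    node2reads = {}
    for rid, nodes in read_nodes.items():
        for n in nodes:
            if n == abs(joining_node):
                continue
            node2reads.setdefault(n, []).append(rid)
    # any shared node joins the reads that contain it
    for rids in node2reads.values():
        for other in rids[1:]:
            union(rids[0], other)
    clusters = {}
    for rid in read_nodes:
        clusters.setdefault(find(rid), set()).add(rid)
    # more than one cluster left -> the joining node crosses lines or sits on a short-range repeat
    return list(clusters.values())
```
The note above allows requiring X shared nodes before joining; that would simply replace the single-shared-node criterion with a shared-count threshold.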
### Open "problems"
- Can we find proper thresholds for filtering?
- Can we use a "clever" filter that differentiates when things are and aren't in hapil?
- Given a sortable "line", create an order -> linearise
- solve the "local repeat" problem -> two similar unique motifs in the same haplotype -> errors in one "land" in the other. IF A NODE DOES NOT HAVE A SINGLE POSITION IN A TOTAL SORT FOR THE HAPLOTYPE, THROW AWAY (SAME NEIGHBOURS, SAME RELATIVE POSITION) -> THROW THE NODE AWAY ->
- If pattern '11100111111101111101' gets linearised/split, it's done.
- solve the "node classification vs. read classification problem" -> nodes will get classified to the wrong pattern, if all the reads they appear on have a close-by pattern, we can safely reclassify the node.
- reads that span multiple patterns / re-joining through breakpoints / whatever -> current view is "we do this afterwards". We could technically allow for small distances from the main pattern in the read, or even allow for ambiguity in the read pattern, which would eliminate this problem, but would bring back cross-haplotype talking.
```python
cf2=SDG.CountFilter('rgxh31','sdg',100,kc.list_names()[1:-1],[5 for x in kc.list_names()[1:-1]])
def filter_read_by_segregation2(rid):
    c=Counter([cf2.get_pattern(ws.sdg.get_nodeview(m.node)) for m in lrrraw.read_perfect_matches[rid]])
    if c['11111111111111111111']: c.pop('11111111111111111111')
    if len(c)==0: return []
    w=c.most_common(1)[0]
    if w[1]<sum(c.values())*.75: return []
    return [m for m in lrrraw.read_perfect_matches[rid] if cf2.get_pattern(ws.sdg.get_nodeview(m.node))==w[0]]

srpm2=[filter_read_by_segregation2(rid) for rid in range(len(lrrraw.read_perfect_matches))]
for i in range(len(srpm2)):
    lrrraw.read_perfect_matches[i]=srpm2[i]
lrrraw.thread_reads(end_matches=1)
trg3=lrrraw.dg_from_threads(True)
lu3=SDG.LinkageUntangler(trg3)
lu3.select_all()
nsl3=lu3.make_nextselected_linkage(2)
```
### Finding the order of a linear component
1) take the closest neighbour.
2) if the end is hit, backtrack
3) complete intermediate neighbours.
Or:
1) start with an order containing a single node
2) pick the longest-reaching read fw from the last node and add all its nodes
3) pick the longest-reaching read bw from the first node and add all its nodes
4) repeat till no reads remain from the last nodes
5) fill in-between nodes (see the sketch below).
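A rough sketch of this second approach, assuming the `trpm` group paths and the `gid2reads` index from Script 4 are still in scope (forward extension only; the backward pass is the same procedure run from the negated first node):
```python
## Sketch only: greedy ordering by the longest-reaching read, using trpm and gid2reads
## from Script 4 (assumed to be in scope). Forward direction only.
def order_component_greedy(start_gid):
    order = [start_gid]
    seen = {abs(start_gid)}
    while True:
        last = order[-1]
        best_ext = []
        for rid in gid2reads[abs(last)]:
            r = trpm[rid]
            if -last in r:                      # orient the read to match the current order
                r = [-g for g in r[::-1]]
            if last not in r:
                continue
            # groups this read reaches past the current last node
            ext = [g for g in r[r.index(last)+1:] if abs(g) not in seen]
            if len(ext) > len(best_ext):
                best_ext = ext
        if not best_ext:                        # no read reaches past the last node
            break
        order += best_ext
        seen.update(abs(g) for g in best_ext)
    return order
```
Step 5 (in-between nodes) would then be filled by placing, between consecutive ordered groups, the groups traversed by the reads spanning them.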
## General order of different signatures
```python
def pdistance(p1,p2):
    return len([1 for i,x in enumerate(p1) if x!=p2[i]])

def find_related_signatures(p1):
    rids=Counter()
    for pnode in pattern_map[p1]:
        for rid in lrr.node_reads[pnode]:
            rids[abs(rid)]+=1
    rids=[k for k,v in rids.items() if v>5]
    nc=Counter()
    print("Processing", len(rids), "reads")
    t=0
    for rid in rids:
        t+=1
        if t%1000==0: print(t,len(nc))
        #print(rid)
        pcounter = Counter( [cf2.get_pattern(ws.sdg.get_nodeview(m.node)) for m in lrr.read_perfect_matches[rid]])
        for k,v in pcounter.items():
            if v>5:
                nc[k]+=1
    for x in nc.most_common(10): print(x,pdistance(x[0],p1))
```
```python
def plrseg(rid,cf,drop_last=False):
    for x in zip(*[cf.get_pattern(ws.sdg.get_nodeview(m.node)) for m in lrr.read_perfect_matches[rid] if not drop_last or cf.get_pattern(ws.sdg.get_nodeview(m.node))[-1]=='0']):
        print('> ',''.join(x).replace('1','*').replace('0',' '),'\t','|' * int(10*x.count('1')/len(x)))

def print_spectra(kc,lim):
    print(', '.join(kc.list_names()))
    s=[kc.count_spectra(n,lim) for n in kc.list_names()]
    for xs in zip(*s):
        print(', '.join([str(x) for x in xs]))
```
## Creating a better pattern filter for a read
Strategy:
- Pre-filtering of "present-everywhere" signal:
- if there are up to 75% present-everywhere matches, just ignore them. More than 75% -> discard the read.
- Pre-filtering of hapil noise:
- In regions where hapil is often absent (30%), just discard all nodes that have a hapil component and work on a signal for the rest. Use that signal to filter, regardless of whether hapil is present or not.
- Signal finding:
```python
from collections import Counter
def best_patterns(patterns,verbose=False):
    if len(patterns)<10:
        if verbose: print('less than 10 matches, giving up!')
        return []
    all_1s='1'*len(patterns[0])
    pc=Counter(patterns)
    t=len(patterns)
    if pc[all_1s]>=t*.75:
        if verbose: print('Too many 1s, giving up!')
        return []
    t-=pc[all_1s]
    pc[all_1s]=0
    hc=sum([c for p,c in pc.items() if p[-1]=='1']) #count patterns with hapil
    if hc>t*.7:
        if verbose: print('Too many patterns with hapil, using full pattern')
        fpc=pc
        ft=t
    else:
        if verbose: print('Discarding',hc,'/',t,'patterns with hapil')
        fpc=Counter()
        ft=0
        for p,c in pc.items():
            if p[-1]!='1':
                fpc[p]=c
                ft+=c
    w=fpc.most_common(1)[0][0]
    if verbose: print("most common pattern:",fpc.most_common(1),'/',ft)
    if fpc[w]>.60*ft:
        if w[-1]=='1': return [w]
        else: return [w,w[:-1]+'1']
    else:
        if verbose: print('Winner not 60% of filtered matches, giving up')
        return []

def filter_read(rid,lrr,cf,verbose=False):
    patterns=[cf.get_pattern(ws.sdg.get_nodeview(m.node)) for m in lrr.read_perfect_matches[rid]]
    bp=best_patterns(patterns,verbose)
    if verbose: print("winning patterns:",bp)
    fm=[]
    for i,p in enumerate(patterns):
        if p in bp:
            fm.append(lrr.read_perfect_matches[rid][i])
    return fm

cf3=SDG.CountFilter('rgxh31','sdg',100,kc.list_names()[1:],[2 for x in kc.list_names()[1:-1]]+[30])
srpm2=[filter_read(rid,lrr,cf3) for rid in range(len(lrr.read_perfect_matches))]
for i in range(len(srpm2)):
    lrr.read_perfect_matches[i]=srpm2[i]
lrr.recruit(1,1)
lrr.thread_reads(end_matches=1)
trg=lrr.dg_from_threads(True)
lu=SDG.LinkageUntangler(trg)
lu.select_all()
nsl3=lu.make_nextselected_linkage(3)
nsl5=lu.make_nextselected_linkage(5)
nsl8=lu.make_nextselected_linkage(8)
def get_connected_component(dg,nid,signed=True):
    component_nodes=set([nid])
    last_added=set([nid])
    while last_added:
        new_last_added=set()
        for nid in last_added:
            nv=dg.get_nodeview(nid)
            for o in nv.next()+nv.prev():
                onid=o.node().node_id()
                if not signed: onid=abs(onid)
                if onid not in component_nodes:
                    component_nodes.add(onid)
                    new_last_added.add(onid)
        last_added=new_last_added
    return component_nodes

def dump_dg_to_text(filename, dg, node_id=None):
    if node_id is not None:
        print("Collecting subgraph")
        all_nodes = [dg.get_nodeview(node) for node in get_connected_component(dg, node_id, signed=False)]
    else:
        print("Collecting entire graph")
        all_nodes = dg.get_all_nodeviews()
    print('Writing output')
    with open(filename, 'w') as dg_file:
        for nv in all_nodes:
            ## Nodes fw
            for nf in nv.next():
                source=-nv.node_id()
                dest=nf.node().node_id()
                distance=nf.distance()
                support_id=nf.support().id
                support_index=nf.support().index
                if abs(source)<=abs(dest):
                    dg_file.write("%s %s %s %s %s\n" %(source, dest, distance, support_index, support_id))
            ## Nodes bw
            for nf in nv.prev():
                dest=-nf.node().node_id()
                source=nv.node_id()
                distance=nf.distance()
                support_id=nf.support().id
                support_index=nf.support().index
                if abs(source)<=abs(dest):
                    dg_file.write("%s %s %s %s %s\n" %(source, dest, distance, support_index, support_id))
    return None

def load_dg_from_text(dg, filename):
    with open(filename, 'r') as linkfile:
        for line in linkfile:
            sline = line.strip().split(" ")
            sup = SDG.Support(SDG.SupportType.stUndefined, int(sline[3]), int(sline[4]))
            dg.add_link(int(sline[0]), int(sline[1]), int(sline[2]), sup)
    return None
```
```python
def process_reads(first,end):
    cf3=SDG.CountFilter('rgxh31','sdg',100,kc.list_names()[1:],[2 for x in kc.list_names()[1:-1]]+[30])
    return [filter_read(rid,lrr,cf3) for rid in range(first,end)]
```
## Next problem up
Do a trg (multi=False).
Then basically disconnecting (i.e. popping) the nodes from the trg should solve the graph. Every time a noisy node is popped out we recover information in the non-noisy nodes.
Only problem: popping only works if there is a clear next/prev; if a read goes twice through the same node, it is impossible to pop. We can either discard these reads when making the threaded graph, or accept the disconnection imposed by removing rather than popping in that particular read (i.e. if a read goes twice through the same node, we disconnect it into the subcomponents between the node).
```python
from statistics import median
def pop_node(dg,node_id,read):
    #if read==8577436: print("pop %d from %d"%(node_id,read))
    nv=dg.get_nodeview(node_id)
    ln=[l for l in nv.next() if l.support().id==read]
    lp=[l for l in nv.prev() if l.support().id==read]
    if len(ln)!=1 or len(lp)!=1 or abs(ln[0].node().node_id())==abs(node_id) or abs(lp[0].node().node_id())==abs(node_id):
        #print("can't pop %d from %d"%(node_id,read))
        return False
    td=ln[0].distance()+nv.size()+lp[0].distance()
    dg.add_link(-lp[0].node().node_id(),ln[0].node().node_id(),td,lp[0].support())
    dg.remove_link(-node_id,ln[0].node().node_id(),ln[0].distance(),ln[0].support())
    dg.remove_link(-lp[0].node().node_id(),node_id,lp[0].distance(),lp[0].support())
    return True

def make_summary_connection_graph(mdg,links_to_connect=3):
    """returns a new dg with only one connection per node end pair (i.e. mdg to dg)"""
    conn={}
    dg=SDG.DistanceGraph(mdg.sdg)
    for nv in mdg.get_all_nodeviews(include_disconnected=False,both_directions=True):
        for ln in nv.next():
            ck=(-nv.node_id(),ln.node().node_id())
            if ck[0]<=ck[1]:
                if ck not in conn: conn[ck]=[]
                conn[ck].append(ln.distance())
    for c,d in conn.items():
        if len(d)>=links_to_connect:
            dg.add_link(c[0],c[1],int(median(d)))
    return dg

def write_connected_nodes_graph(filename,dg):
    dg.write_to_gfa1(filename,selected_nodes=[x.node_id() for x in dg.get_all_nodeviews(include_disconnected=False)])
```