# TAP v2
## Random notes
The problem with anchor graphs has been trying to make them too complicated. Instead:
- Take unique 63-mers from the distribution.
- Take perfect matches from the long reads.
- Discard anything that is not unique (i.e. double positions) -> TAP.
If short reads are available, distances from the short reads can be used too. Once a local subset is ordered (with the short reads), its k-mers can refer to a central point (i.e. a hash of k-mers that refers to a hash of central k-mers plus short offsets). Once the central k-mers are ordered, the offsets can be used to fill in the non-central k-mers.
Because the anchor points are unique k-mers, they can be single positions (offsets) in the read. So, when tapping, the original distance in the read is recovered.
To make it easier to work with real-life long reads, all of this can be done in homopolymer-compressed space.
The problem has always been that we focused on the wrong conditions. Fixing copy number 1 on contigs was idiotic. We can fix copy number 1 on k-mers, which we can guess reasonably well, and then order gives position. The Sequence Distance Graph is only an intermediate concept on the way to a unique-k-mer distance graph.
Pan-genome versions of this could use the unique k-mers in different haplotypes/colors to find disorder in the set to call variants.
Integration with markers is as easy as mapping a marker to a position.
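A minimal sketch of the core anchoring idea in plain Python (hypothetical inputs: `unique_kmers` is the set of unique 63-mers from the count distribution, `read` a homopolymer-compressed long read):
```python
K = 63

def tap_anchors(read, unique_kmers):
    """Return (offset, kmer) anchors: perfect matches of the read to unique
    k-mers. Because each anchor k-mer is unique, the read offsets recover the
    original distance between any two anchors."""
    return [(off, read[off:off + K])
            for off in range(len(read) - K + 1)
            if read[off:off + K] in unique_kmers]

# distances between consecutive anchors come straight from the offsets:
# dists = [b[0] - a[0] for a, b in zip(anchors, anchors[1:])]
```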
## Example run
Try with the octoploid from pseudoseq.
```python
# rough TAP v2 implementation
%cd /Users/clavijob/pseudoseq_paper/new
import SDGpython as SDG
ws=SDG.WorkSpace()
peds=ws.add_paired_reads_datastore('octo_pe.prseq')
lords=ws.add_long_reads_datastore('octo_long.loseq')
# new function: kc.kmer_unlinked_graph(count_name,min_freq,max_freq) -> sdg with kmers as nodes.
SDG.count_kmers_as_graph_nodes(ws.sdg,peds,31,50,100,1)
print(len(ws.sdg.get_all_nodeviews()))
kc=ws.add_kmer_counter("main")
kc.add_count("pe",peds)
kc.set_kci_peak(75)
```
```txt
/Users/clavijob/pseudoseq_paper/new
248880
```
```python
from datetime import datetime

print(ws.sdg.stats_by_kci())
s=SDG.Strider(ws)
s.add_datastore(peds)
peds.mapper.path_reads(31)
# collapse nodes into unique-31-mer unitigs --> warning: could connect across collapsed 30-mers if strider fails.
print("starting... %s" % datetime.now())
total_nodes=len(ws.sdg.get_all_nodeviews())
for nv in ws.sdg.get_all_nodeviews():
    if nv.node_id()%100000 == 0: print("\rnodes: %s/%s" % (nv.node_id(), total_nodes), end='')
    fw=s.stride_single_strict(nv.node_id()).nodes
    bw=s.stride_single_strict(-nv.node_id()).nodes
    for path in [fw,bw]:
        if len(path)>1:
            # only link when the two 31-mers really overlap by 30bp, and avoid duplicate links
            if ws.sdg.get_nodeview(path[0]).sequence()[1:]==ws.sdg.get_nodeview(path[1]).sequence()[:30]:
                if not ws.sdg.are_connected(-path[0],path[1]):
                    ws.sdg.add_link(-path[0],path[1],-30)
print("Done for loop... %s" % datetime.now())
ws.sdg.join_all_unitigs()
print("Done join all unitigs... %s" % datetime.now())
kc.update_graph_counts()
print(ws.sdg.stats_by_kci())
```
```txt
-----------------------------------------------------------------------------------
| KCI | Total bp | Nodes | Tips | N25 | N50 | N75 |
|-------+---------------+----------+---------+------------+------------+------------|
| None | 7715280 | 248880 | 248880 | 31 | 31 | 31 |
| < 0.5 | 0 | 0 | 0 | 0 | 0 | 0 |
| ~ 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| ~ 2 | 0 | 0 | 0 | 0 | 0 | 0 |
| ~ 3 | 0 | 0 | 0 | 0 | 0 | 0 |
| > 3.5 | 0 | 0 | 0 | 0 | 0 | 0 |
|-------+---------------+----------+---------+------------+------------+------------|
| All | 7715280 | 248880 | 248880 | 31 | 31 | 31 |
-----------------------------------------------------------------------------------
-----------------------------------------------------------------------------------
| KCI | Total bp | Nodes | Tips | N25 | N50 | N75 |
|-------+---------------+----------+---------+------------+------------+------------|
| None | 37288 | 1058 | 1058 | 38 | 36 | 33 |
| < 0.5 | 0 | 0 | 0 | 0 | 0 | 0 |
| ~ 1 | 478502 | 7839 | 7839 | 61 | 61 | 61 |
| ~ 2 | 0 | 0 | 0 | 0 | 0 | 0 |
| ~ 3 | 0 | 0 | 0 | 0 | 0 | 0 |
| > 3.5 | 0 | 0 | 0 | 0 | 0 | 0 |
|-------+---------------+----------+---------+------------+------------+------------|
| All | 515790 | 8897 | 8897 | 61 | 61 | 61 |
-----------------------------------------------------------------------------------
```
```python
lrr=SDG.LongReadsRecruiter(ws.sdg,lords,31)
lrr.map(31)
lrr.thread_reads(end_matches=1)
tdg=lrr.dg_from_threads()
```

```python
u=SDG.LinkageUntangler(tdg)
u.select_all()
print(len([1 for x in u.selected_nodes if x==True]),"nodes selected to start with")
conflicted_ends=set()
for x in range(2):
    nsl=u.make_nextselected_linkage(7)
    nsl.remove_transitive_links()
    for fnv in nsl.get_all_nodeviews():
        for nv in [fnv,fnv.rc()]:
            if len(nv.next())>1:
                conflicted_ends.add(-nv.node_id())
                for lfw in nv.next():
                    conflicted_ends.add(lfw.node().node_id())
    for nid in conflicted_ends:
        if nid>0 and -nid in conflicted_ends:
            u.selected_nodes[nid]=False
    print(len([1 for x in u.selected_nodes if x==True]),"nodes selected after cleaning up multi-connected")
print('leftover nodes with multiple connections: seq'+',seq'.join([str(abs(nv.node_id())) for nv in nsl.get_all_nodeviews() if len(nv.next())>1 or len(nv.prev())>1]))
print('leftover nodes with multiple connections: %s' % len([nv for nv in nsl.get_all_nodeviews() if len(nv.next())>1 or len(nv.prev())>1]))
nsl.write_to_gfa1('nsl.gfa',selected_nodes=[x[0] for x in enumerate(u.selected_nodes) if x[1]])
```
```txt
257630 nodes selected to start with
257620 nodes selected after cleaning up multi-connected
257619 nodes selected after cleaning up multi-connected
leftover nodes with multiple connections: seq60264,seq185450,seq196603,seq250395,seq250558,seq251849,seq253004,seq253714,seq254169,seq254564,seq254895,seq256825,seq256963,seq257437,seq257602
```

## Trying with the PB HiFi dmel / strawberry datasets
1) construct graph at k=63
2) pop homopolymer bubbles
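The bubble-popping loop below assumes an `is_bubble_side` helper; a possible definition (a sketch, built only from the NodeView calls already used in these notes) is:
```python
def is_bubble_side(nv):
    """Hypothetical helper: nv is one side of a simple bubble if it has
    exactly one parallel node and a single neighbour on each end."""
    return (len(nv.parallels()) == 1
            and len(nv.prev()) == 1 and len(nv.next()) == 1)
```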
```python
def homocomp(s):
    """Homopolymer-compress a sequence: collapse each run to a single base."""
    r=s[0]
    for c in s:
        if r[-1]!=c: r+=c
    return r
bb=[1]
while bb:
    bb=[]
    for nv in ws.sdg.get_all_nodeviews():
        if is_bubble_side(nv):
            pv=nv.parallels()[0]
            # keep the longer side when both compress to the same sequence
            if nv.size()>pv.size() and homocomp(nv.sequence())==homocomp(pv.sequence()):
                bb.append(pv.node_id())
    for x in bb:
        ws.sdg.remove_node(x)
    ws.sdg.join_all_unitigs()
    print(len(bb))
```
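Quick sanity check for `homocomp` (hypothetical values):
```python
assert homocomp("AAACCGG") == "ACG"   # runs collapse to single bases
assert homocomp("ACGT") == "ACGT"     # already-compressed sequences are unchanged
```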
3) Expand canonical repeats
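The loop below assumes an `is_canonical_repeat` helper; a possible definition (a sketch, not necessarily the exact SDG call) is:
```python
def is_canonical_repeat(nv):
    """Hypothetical helper: a 2-in/2-out node whose four neighbours are all
    distinct nodes (the classic X-shaped repeat)."""
    if len(nv.prev()) != 2 or len(nv.next()) != 2:
        return False
    neighbours = set()
    for l in nv.prev():
        neighbours.add(abs(l.node().node_id()))
    for l in nv.next():
        neighbours.add(abs(l.node().node_id()))
    return len(neighbours) == 4 and abs(nv.node_id()) not in neighbours
```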
```python
ge=SDG.GraphEditor(ws)
solved=unsolved=0
for nv in ws.sdg.get_all_nodeviews():
    if is_canonical_repeat(nv):
        nidp0=nv.prev()[0].node().node_id()
        nidp1=nv.prev()[1].node().node_id()
        nidn0=nv.next()[0].node().node_id()
        nidn1=nv.next()[1].node().node_id()
        # read sets over each neighbour (lrr19: a LongReadsRecruiter defined elsewhere in the session)
        pA=set([abs(x) for x in lrr19.node_reads[abs(nidp0)]])
        pB=set([abs(x) for x in lrr19.node_reads[abs(nidp1)]])
        pC=set([abs(x) for x in lrr19.node_reads[abs(nidn0)]])
        pD=set([abs(x) for x in lrr19.node_reads[abs(nidn1)]])
        # discard reads that touch both inputs or both outputs
        pAB=pA.intersection(pB)
        pCD=pC.intersection(pD)
        pA=pA-pAB
        pB=pB-pAB
        pC=pC-pCD
        pD=pD-pCD
        pAC=len(pA.intersection(pC))
        pAD=len(pA.intersection(pD))
        pBC=len(pB.intersection(pC))
        pBD=len(pB.intersection(pD))
        if pAC<5 and pAD>20 and pBC>20 and pBD<5:
            solved+=1
            ge.queue_node_expansion(nv.node_id(),[[-nidp0,nidn1],[-nidp1,nidn0]])
        elif pAD<5 and pAC>20 and pBD>20 and pBC<5:
            solved+=1
            ge.queue_node_expansion(nv.node_id(),[[-nidp0,nidn0],[-nidp1,nidn1]])
        else: unsolved+=1
```
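The same read-set vote (shared long reads between each in/out pair, after discarding ambiguous reads) is reused for the double joins below; a helper capturing it could look like this (a sketch over the same `lrr19.node_reads` accesses; thresholds mirror the ones above):
```python
def vote_pairing(nidp0, nidp1, nidn0, nidn1, node_reads, lo=5, hi=20):
    """Return 'straight' (p0-n0 / p1-n1), 'crossed' (p0-n1 / p1-n0) or None,
    by counting long reads shared between each input/output pair after
    discarding reads seen on both inputs or both outputs."""
    pA={abs(x) for x in node_reads[abs(nidp0)]}
    pB={abs(x) for x in node_reads[abs(nidp1)]}
    pC={abs(x) for x in node_reads[abs(nidn0)]}
    pD={abs(x) for x in node_reads[abs(nidn1)]}
    pAB=pA & pB; pCD=pC & pD
    pA-=pAB; pB-=pAB; pC-=pCD; pD-=pCD
    pAC=len(pA & pC); pAD=len(pA & pD)
    pBC=len(pB & pC); pBD=len(pB & pD)
    if pAC<lo and pBD<lo and pAD>hi and pBC>hi: return 'crossed'
    if pAD<lo and pBC<lo and pAC>hi and pBD>hi: return 'straight'
    return None
```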
4) Solve "double joins"
```python
def is_double_join(nv):
    if len(nv.next())!=2: return False
    if len(nv.next()[0].node().prev())!=2 or len(nv.next()[1].node().prev())!=2: return False
    # both successors must share the same two predecessors...
    n0ps=set([x.node().node_id() for x in nv.next()[0].node().prev()])
    n1ps=set([x.node().node_id() for x in nv.next()[1].node().prev()])
    if n0ps!=n1ps: return False
    # ...and both predecessors the same two successors
    p0ns=set([x.node().node_id() for x in nv.next()[0].node().prev()[0].node().next()])
    p1ns=set([x.node().node_id() for x in nv.next()[0].node().prev()[1].node().next()])
    if p0ns!=p1ns: return False
    return True
solved=unsolved=0
for fnv in ws.sdg.get_all_nodeviews():
    for nv in [fnv,fnv.rc()]:
        if is_double_join(nv):
            nidn0=nv.next()[0].node().node_id()
            nidn1=nv.next()[1].node().node_id()
            nidp0=nv.next()[0].node().prev()[0].node().node_id()
            nidp1=nv.next()[0].node().prev()[1].node().node_id()
            pA=set([abs(x) for x in lrr19.node_reads[abs(nidp0)]])
            pB=set([abs(x) for x in lrr19.node_reads[abs(nidp1)]])
            pC=set([abs(x) for x in lrr19.node_reads[abs(nidn0)]])
            pD=set([abs(x) for x in lrr19.node_reads[abs(nidn1)]])
            pAB=pA.intersection(pB)
            pCD=pC.intersection(pD)
            pA=pA-pAB
            pB=pB-pAB
            pC=pC-pCD
            pD=pD-pCD
            pAC=len(pA.intersection(pC))
            pAD=len(pA.intersection(pD))
            pBC=len(pB.intersection(pC))
            pBD=len(pB.intersection(pD))
            if pAC<5 and pAD>20 and pBC>20 and pBD<5:
                solved+=1
                ws.sdg.remove_link(-nidp0,nidn0)
                ws.sdg.remove_link(-nidp1,nidn1)
            elif pAD<5 and pAC>20 and pBD>20 and pBC<5:
                solved+=1
                ws.sdg.remove_link(-nidp0,nidn1)
                ws.sdg.remove_link(-nidp1,nidn0)
            else: unsolved+=1
```
5) Strider links?
```python
anchor_graph=SDG.DistanceGraph(ws.sdg)
anchor_graph_multi=SDG.DistanceGraph(ws.sdg)
for x in range(len(s.is_anchor)):
    if s.is_anchor[x]:
        for l in s.links_bw[x] + s.links_fw[x]:
            if not anchor_graph_multi.are_connected(l.source,l.dest):
                anchor_graph_multi.add_link(l.source, l.dest, l.dist)
            if s.is_anchor[abs(l.dest)]:
                if not anchor_graph.are_connected(l.source,l.dest):
                    anchor_graph.add_link(l.source, l.dest, l.dist)
anchor_graph.remove_transitive_links()
```
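To eyeball the result, both graphs can be dumped to GFA like the other distance graphs in these notes (filenames are just suggestions):
```python
anchor_graph.write_to_gfa1('anchor_graph.gfa')
anchor_graph_multi.write_to_gfa1('anchor_graph_multi.gfa')
```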
```python
def get_node_inmediate_neighbours(node):
    """One node forward and one node back."""
    group=[]
    for x in Counter([-x for path in peds.mapper.all_paths_fw(-node) for x in path]).most_common(3):
        if x[0]!=0 and x[1]>10:
            group.append(x)
    for x in Counter([x for path in peds.mapper.all_paths_fw(node) for x in path]).most_common(3):
        if x[0]!=0 and x[1]>10:
            group.append(x)
    return group
## Grouping algo
groups = {abs(n.node_id()):[abs(n.node_id()),] for n in tdg.get_all_nodeviews()}
print(len(groups))
cont=0
nodos=[abs(x) for x in groups.keys()]
for nodo in nodos:
    # changes = 0
    if nodo not in groups: continue
    neighbours = get_node_inmediate_neighbours(nodo)
    for nn in neighbours:
        if abs(nn[0]) not in groups:
            continue
        elif len(groups[abs(nn[0])]) > 1:
            break
        else:
            if nodo not in groups: continue
            groups[nodo].append(nn[0])
            groups.pop(abs(nn[0]))
            # changes+=1
            # nodo=random.choice(list(groups.keys()))
    print("\r%s" %(cont,), end='')
    cont += 1
len(groups)
transformation = {}
for nodo_central, integrantes in groups.items():
    for integrante in integrantes:
        transformation[integrante] = nodo_central
        transformation[-integrante] = -nodo_central
trpm=[]
for rpm in lrr.read_perfect_matches:
    if len(rpm)<2:
        trpm.append([])
        continue
    ms=[transformation[rpm[0].node]]
    for ix in range(1,len(rpm)-1):
        x=rpm[ix]
        g=transformation[x.node]
        if g==ms[-1]: continue #same group as before
        if ms[-1]==transformation[rpm[ix+1].node]: continue #sandwich
        ms.append(g)
    if transformation[rpm[-1].node]!=ms[-1]: ms.append(transformation[rpm[-1].node])
    trpm.append(ms)
```
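The same collapse on plain node lists, to illustrate the deduplicate-and-sandwich rules (a sketch; the `transformation` values are hypothetical):
```python
def collapse_to_groups(nodes, transformation):
    """Map nodes to groups, merging consecutive repeats and dropping a single
    stray group sandwiched between two matches of the same group."""
    ms=[transformation[nodes[0]]]
    for ix in range(1,len(nodes)-1):
        g=transformation[nodes[ix]]
        if g==ms[-1]: continue                            # same group as before
        if ms[-1]==transformation[nodes[ix+1]]: continue  # sandwich
        ms.append(g)
    if transformation[nodes[-1]]!=ms[-1]: ms.append(transformation[nodes[-1]])
    return ms

assert collapse_to_groups([1,2,3,2], {1:10,2:10,3:20}) == [10]
```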
# Gonza's implementation log
## Step 1: Create DBG
**Input:** PE reads
**Output:** DBG.
**Aim:** create a haplotype-specific DBG to have as a basis for unitig processing.
```python
K=63
MIN_CVG=10
NUM_BATCHES=1
ws=SDG.WorkSpace()
peds=ws.add_paired_reads_datastore('./pseudo_triploid_octoploid_new/octoploid_pe.prseq', 'pe')
gm=SDG.GraphMaker(ws.sdg)
gm.new_graph_from_paired_datastore(peds,K,MIN_CVG,NUM_BATCHES);
gc=SDG.GraphContigger(ws)
gc.clip_tips(2*K-1,3)
print("Remaining nodes: %s" %(len(ws.sdg.get_all_nodeviews())))
```
**Validation:**
- Kmer spectra

- Unitig validity (i.e. the sequence of all unitigs needs to be present in the reference): count and bp of unitigs found as perfect hits in the reference.
  Found contigs 23705 with 2486607bp
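A sketch of this check (assuming the reference haplotypes are loaded as a dict `ref` of name -> sequence; both strands are searched):
```python
# Hypothetical validation: every unitig must be a perfect substring of the
# reference, on either strand.
def rc(s):
    return s[::-1].translate(str.maketrans("ACGT", "TGCA"))

found = bp = 0
for nv in ws.sdg.get_all_nodeviews():
    s = nv.sequence()
    if any(s in r or rc(s) in r for r in ref.values()):
        found += 1
        bp += nv.size()
print("Found contigs %s with %sbp" % (found, bp))
```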
```
# runs sdg-dbg with a relatively low coverage threshold and tip clipping. Too low a threshold introduces a ton of errors as standalone contigs; MIN_CVG changed from 2 to 10 and now produces a better initial spectrum and no errors in contigs
```
### Results in simulated Octoploid
## Step 2: Extract unique k-mer runs from unitigs
**Input:** DBG
**Output:** Strings with all their kmers being unique as per read frequencies.
**Aim:** create all unique k-mer runs, to be used as anchors
**Validation:**
- Each string must appear once and only once in the reference.
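`encontrados` in the check below is assumed to map each node id to the haplotypes containing its sequence; a sketch of how it could be built (same `ref` dict assumption as in Step 1):
```python
# Hypothetical construction of `encontrados`: node id -> haplotypes that
# contain the node's sequence on either strand.
def rc(s):
    return s[::-1].translate(str.maketrans("ACGT", "TGCA"))

encontrados = {}
for nv in ws.sdg.get_all_nodeviews():
    s = nv.sequence()
    encontrados[nv.node_id()] = [name for name, r in ref.items() if s in r or rc(s) in r]
```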
```python
# - Each string must appear once and only once in the reference.
multimatch=[]
bps = 0
for k, v in encontrados.items():
    if len(v)!=1:
        print(k, v, ws.sdg.get_nodeview(k).sequence())
        bps += ws.sdg.get_nodeview(k).size()
        multimatch.append(k)
print("Multi matching kmer runs %s/%s with %s bps" %(len(multimatch), len(ws.sdg.get_all_nodeviews()), bps))
```
```
244 ['haplotype_1', 'haplotype_2'] GAATCTTTCACAGAAAAGCAGCTAGTAAATGA
473 ['haplotype_7', 'haplotype_8'] AAAGAAGGTTTTTTACAGCAAGAAGAAGAAGAGAA
1022 ['haplotype_1', 'haplotype_2'] CTCAGGAGATGCCAACAAATCAACTTCTGCCA
1023 ['haplotype_1', 'haplotype_2'] GAGATGCCAACAAATCAACTTCTGCCATTAA
1228 ['haplotype_2', 'haplotype_3'] AGTGTGGCGGTCACTGGGCCCACTGGTGGAAG
1973 ['haplotype_2', 'haplotype_3'] AGTGTGGCGGTCACTGGGCCCACTGGTGGAAG
3194 ['haplotype_1', 'haplotype_2'] ATGTTAATAGCAGTTTCCTGTCTGTCACCAGTTA
3244 ['haplotype_1', 'haplotype_2'] GCAGGGGATAGATGGTTGTTGGGGTGTGGTGATGGA
3497 ['haplotype_1', 'haplotype_2'] AACGATCATTATAGGCTAACTCTACCAGAAGACAAGA
3643 ['haplotype_2', 'haplotype_3'] ATCTCCCGGAAAGCTTATTTCTAAATAAAAT
3768 ['haplotype_2', 'haplotype_3'] ATCTCCCGGAAAGCTTATTTCTAAATAAAAT
4062 ['haplotype_1', 'haplotype_2'] AACGATCATTATAGGCTAACTCTACCAGAAGACAAGA
4756 ['haplotype_7', 'haplotype_8'] AACAAGGTCAAGAAAAAAATTAAAAAGGTTA
4820 ['haplotype_1', 'haplotype_2'] ACCTGGGTAATATTATGGTTACTACCAAGCC
4821 ['haplotype_1', 'haplotype_2'] CTTGGTAGTAACCATAATATTACCCAGGTAC
5013 ['haplotype_2', 'haplotype_3'] AGTGTGGCGGTCACTGGGCCCACTGGTGGAAG
6128 ['haplotype_2', 'haplotype_3'] ATCTCCCGGAAAGCTTATTTCTAAATAAAAT
7930 ['haplotype_1', 'haplotype_2'] CTTGGTAGTAACCATAATATTACCCAGGTAC
7931 ['haplotype_1', 'haplotype_2'] ACCTGGGTAATATTATGGTTACTACCAAGCC
8506 ['haplotype_7', 'haplotype_8'] AATGAAGAGATATTTAGAAACAGATTTTTTAGCACAAAGGCA
8668 ['haplotype_1', 'haplotype_2'] ACCTGGGTAATATTATGGTTACTACCAAGCC
8669 ['haplotype_1', 'haplotype_2'] CTTGGTAGTAACCATAATATTACCCAGGTAC
8711 ['haplotype_1', 'haplotype_2'] ATGTTAATAGCAGTTTCCTGTCTGTCACCAGTTA
8714 ['haplotype_1', 'haplotype_2'] AACGATCATTATAGGCTAACTCTACCAGAAGACAAGA
Multi matching kmer runs 24/8768 with 793 bps
```
- All kmers between fmin and fmax must be used ONCE in the strings.
```python
# - All kmers between fmin and fmax must be used ONCE in the strings.
nodes_with_repeated_content=[]
cant = 0
for i, nv in enumerate(ws.sdg.get_all_nodeviews()):
    kcov = nv.kmer_coverage('pek31', 'sdg')
    if sum(kcov) != len(kcov):
        cant += 1
        print(nv, kcov, nv.node_id() in multimatch)
print("Kmers multi represented in the graph %s" %cant)
```
```
<NodeView: node 1228 in graph SDG> [3, 3] True
<NodeView: node 1973 in graph SDG> [3, 3] True
<NodeView: node 3194 in graph SDG> [2, 2, 2, 2] True
<NodeView: node 3497 in graph SDG> [3, 3, 3, 3, 3, 3, 3] True
<NodeView: node 3643 in graph SDG> [3] True
<NodeView: node 3768 in graph SDG> [3] True
<NodeView: node 4062 in graph SDG> [3, 3, 3, 3, 3, 3, 3] True
<NodeView: node 4820 in graph SDG> [3] True
<NodeView: node 4821 in graph SDG> [3] True
<NodeView: node 5013 in graph SDG> [3, 3] True
<NodeView: node 6128 in graph SDG> [3] True
<NodeView: node 7930 in graph SDG> [3] True
<NodeView: node 7931 in graph SDG> [3] True
<NodeView: node 8668 in graph SDG> [3] True
<NodeView: node 8669 in graph SDG> [3] True
<NodeView: node 8711 in graph SDG> [2, 2, 2, 2] True
<NodeView: node 8714 in graph SDG> [3, 3, 3, 3, 3, 3, 3] True
Kmers multi represented in the graph 17
```
- All strings should ONLY be formed by k-mers between fmin and fmax.
```python
cant = 0
for i, nv in enumerate(ws.sdg.get_all_nodeviews()):
    kcov = [40 <= x <= 100 for x in nv.kmer_coverage('pek31', 'pe')]
    if sum(kcov) != len(kcov):
        print(nv, nv.kmer_coverage('pek31', 'pe'))
        cant += 1
```
```
None present
```
## Step 3: Group kmer runs with PE reads
**Input:** unique kmer runs (as nodes) + PE datastore.
**Output:** groups as lists of SIGNED nodes.
**Aim:** group immediate neighbouring unique kmer runs into longer structures to allow better sensitivity when linking with long reads.
**Validation:**
- Each node can ONLY APPEAR in one group.
```python
## Property check: no node in more than one group
count = Counter()
for k, v in groups.items():
    for x in v:
        count[abs(x)]+=1
for c in count.most_common():
    if c[1]>1:
        print(c)
```
```
Nothing reported; checks out.
```
- Each group should have no more than MAX_SIZE total k-mers.
  - Total size does not appear to be a problem atm.
- Unique kmer runs of each group should appear consecutively and in-order in the reference (no scrambling). I.e.: you can express each chromosome as a list of groups.
  - All groups seem to be mutually exclusive per the superposition check.
  - Order within groups seems to be OK; all the groups I checked grow monotonically.
```python
## Check group superposition
## group_c: list of (central node, start, end) per group, assumed built from
## reference positions elsewhere in the session.
for g in group_c:
    for og in group_c:
        if g[0] == og[0]: continue
        if g[1]<og[1]<g[2] or g[1]<og[2]<g[2]:
            print("Groups %s and %s overlap" %(g, og))
## Check order (posicion: node -> reference positions, assumed from mappings)
group_c = []
for nc, group in groups.items():
    print("GROUP %s" %nc)
    print('seq'+',seq'.join([str(abs(x)) for x in group]))
    for i, node in enumerate(group[:-1]):
        pos=posicion[abs(group[i])][0][3]
        prev_pos=posicion[abs(group[i+1])][0][3]
        print('{:<4} {:>10}'.format(node, pos-prev_pos>0))
```
```python
sedg=SDG.DistanceGraph(ws.sdg)
pedg=SDG.DistanceGraph(ws.sdg) # only links when there is no SE next.
for nv in ws.sdg.get_all_nodeviews(reverse=True):
    for p in peds.mapper.all_paths_fw(nv.node_id()):
        if len(p)==0: continue
        if p[0]!=0:
            sedg.add_link(-nv.node_id(), p[0], 1)
        else:
            pedg.add_link(-nv.node_id(), p[1], 1)
# Now linearise this graph. (pick the single most-connected node FW in SE first,
# and on PE if no connection passes the threshold in SE)
nextrundg=
#
```
**Something interesting: the errors join haplotypes even in the simulated octoploid.**
## Step 4: Link groups with long reads
**Input:** unique kmer runs (as nodes) + groups + LR datastore.
**Output:** a graph of group connectivity.
**Aim:** To discover the group composition of each haplotype.
**Validation:**
- On the graph: the best/closest next link should point to the group that comes next in the reference. Same for prev.
- Linearly: express a chromosome as a list of anchors, put N's in the middle, it should be the same as masking all non-unique (fmin/fmax) k-mers from the reference.
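A sketch of the linear check (hypothetical names: `anchors_in_order` is a chromosome's anchor node list in reference order, `ref_masked` the reference with all non-unique k-mers masked to N; N-run lengths are normalised since gap sizes won't match exactly, and orientation is ignored here):
```python
import re

# join anchors with Ns, then compare after collapsing every N run to a single N
reconstruction = "N".join(ws.sdg.get_nodeview(a).sequence() for a in anchors_in_order)
assert re.sub("N+", "N", reconstruction) == re.sub("N+", "N", ref_masked)
```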
## Code dump
```python
import sys
sys.path.append('/Users/ggarcia/Documents/git_sources/sdg/cmake-build-relwithdebinfo/SDGpython/')
import SDGpython as SDG
from collections import Counter
## SDG-DBG assembly k=63
## Min coverage is important to reduce the noise
K=63
MIN_CVG=10
NUM_BATCHES=1
ws=SDG.WorkSpace()
peds=ws.add_paired_reads_datastore('./pseudo_triploid_octoploid_new/octoploid_pe.prseq', 'pe')
gm=SDG.GraphMaker(ws.sdg)
gm.new_graph_from_paired_datastore(peds,K,MIN_CVG,NUM_BATCHES);
gc=SDG.GraphContigger(ws)
gc.clip_tips(2*K-1,3)
print("Remaining nodes: %s" %(len(ws.sdg.get_all_nodeviews())))
ws.sdg.write_to_gfa1("./sdg_dbg_graph.gfa")
kc = ws.add_kmer_counter('pek31', 31)
kc.add_count('pe', peds)
seqs = gc.contig_reduction_to_unique_kmers(40, 100)
print(len(seqs),"sequences for anchoring")
with open("pseudoLong2.fastq","w") as f:
    for s in seqs:
        f.write("@seq\n%s\n+\n%s\n"%(s,s))  # dummy qualities: reuse the sequence
## Recreate workspace using only unique kmer runs
ws=SDG.WorkSpace()
peds=ws.add_paired_reads_datastore('./pseudo_triploid_octoploid_new/octoploid_pe.prseq')
lords=ws.add_long_reads_datastore('./pseudo_triploid_octoploid_new/octoploid_long.loseq')
for s in seqs:
    ws.sdg.add_node(s)
print(len(ws.sdg.get_all_nodeviews()))
kc = ws.add_kmer_counter('pek31', 31)
kc.add_count('pe', peds)
lrr=SDG.LongReadsRecruiter(ws.sdg,lords,31)
lrr.map(31)
lrr.thread_reads(end_matches=1)
tdg=lrr.dg_from_threads()
peds.mapper.path_reads(31)
## Group nodes using the next neighbour
def make_neighbours_graph():
    sedg=SDG.DistanceGraph(ws.sdg)
    pedg=SDG.DistanceGraph(ws.sdg) # only links when there is no SE next.
    for nv in ws.sdg.get_all_nodeviews(both_directions=True):
        for p in peds.mapper.all_paths_fw(nv.node_id()):
            if len(p)==0: continue
            if p[0]!=0:
                sedg.add_link(-nv.node_id(), p[0], 0)
            else:
                pedg.add_link(-nv.node_id(), p[1], 0)
    return sedg, pedg
sedg, pedg = make_neighbours_graph()
sedg.write_to_gfa1("./neghbour_sedg.gfa")
pedg.write_to_gfa1("./neghbour_pedg.gfa")
## Remove multiple connections by voting for the most popular
def pick_next(nv):
    winner = None
    poll = Counter([x.node().node_id() for x in nv.next()]).most_common(1)
    if len(poll)>0 and poll[0][1]>10:
        winner = poll[0][0]
    return winner
def clean_conections(idg):
    ndg = SDG.DistanceGraph(ws.sdg)
    for nv in idg.get_all_nodeviews(both_directions=True):
        winner = pick_next(nv)
        if winner is not None:
            if not ndg.are_connected(-nv.node_id(), winner):
                ndg.add_link(-nv.node_id(), winner, 0)
    return ndg
dg = clean_conections(sedg)
dg2 = clean_conections(pedg)
dg.write_to_gfa1("./neghbour_dg.gfa")
dg2.write_to_gfa1("./neghbour_dg2.gfa")
## Disconnect remaining multiple connections
for nv in dg.get_all_nodeviews(both_directions=True):
    if len(nv.next())>1:
        for con in nv.next():
            dg.remove_link(-nv.node_id(), con.node().node_id())
dg.write_to_gfa1("./neghbour_uniruns.gfa")
## Group components
def get_components(dg):
    all_nodes = set([n.node_id() for n in dg.get_all_nodeviews()])
    groups={}
    while len(all_nodes)>0:
        node = all_nodes.pop()
        group = [node, ]
        fwn = dg.get_nodeview(node).next()
        while len(fwn)==1:
            next_node = fwn[0].node().node_id()
            if next_node in group: break
            group.append(next_node)
            all_nodes.remove(abs(next_node))
            fwn = dg.get_nodeview(next_node).next()
        group = [x*-1 for x in group[::-1]]   # now walk backwards from the seed node
        fwn = dg.get_nodeview(-node).next()
        while len(fwn)==1:
            next_node = fwn[0].node().node_id()
            if next_node in group: break
            group.append(next_node)
            all_nodes.remove(abs(next_node))
            fwn = dg.get_nodeview(next_node).next()
        groups[node] = group
    return groups
groups = get_components(dg)
## Transformation table from node to group
print(len(groups))
transformation = {}
for nodo_central, integrantes in groups.items():
    for integrante in integrantes:
        transformation[integrante] = nodo_central
        transformation[-integrante] = -nodo_central
## transform reads to group mappings
trpm=[]
for rpm in lrr.read_perfect_matches:
    if len(rpm)<2:
        trpm.append([])
        continue
    ms=[transformation[rpm[0].node]]
    for ix in range(1,len(rpm)-1):
        x=rpm[ix]
        g=transformation[x.node]
        if g==ms[-1]: continue #same group as before
        if ms[-1]==transformation[rpm[ix+1].node]: continue #sandwich
        ms.append(g)
    if transformation[rpm[-1].node]!=ms[-1]: ms.append(transformation[rpm[-1].node])
    trpm.append(ms)
##
def gid_reads():
    """Map each group id (in absolute value) to the set of reads that touch it, to translate later."""
    gid2reads={abs(gid):set() for gid in groups.keys()}
    for i, read in enumerate(trpm):
        for gid in read:
            gid2reads[abs(gid)].add(i)
    return gid2reads
gid2reads = gid_reads()
def get_next(gid):
    c=Counter()
    trpm_ids = gid2reads[abs(gid)]
    ttrpm = [trpm[i] for i in trpm_ids]
    for x in ttrpm:
        if -gid in x:
            x = [y*-1 for y in x[::-1]]   # orient the read so gid appears forward
        if gid in x and x[-1] != gid:
            c[x[x.index(gid)+1]]+=1
    if len(c.most_common())>0 and c.most_common(1)[0][1]>3:
        return c.most_common(1)[0][0]
    else:
        return None
ws2 = SDG.WorkSpace()
## Add all groups as nodes (dummy sequence of the group's total size)
group2node={}
node2group={}
all_nodes = [nodo for k, group in groups.items() for nodo in group]
for gid in groups.keys():
    group_size = sum([ws.sdg.get_nodeview(x).size() for x in groups[gid]])
    new_node = ws2.sdg.add_node("A"*group_size)
    group2node[gid] = new_node
    node2group[new_node] = gid
print("%s nodes in ws2 graph" %len(ws2.sdg.get_all_nodeviews()))
# def runall():
for i, g in enumerate(groups.keys()):
    # walk the get_next chain from each group, adding links between consecutive groups
    visited=set()
    prev_group = None
    next_group = get_next(g)
    while next_group is not None and next_group not in visited:
        if prev_group is not None:
            if prev_group<0: n1=-group2node[abs(prev_group)]
            else: n1=group2node[abs(prev_group)]
            if next_group<0: n2=-group2node[abs(next_group)]
            else: n2=group2node[abs(next_group)]
            # print("%s --> %s" %(n1, n2))
            ws2.sdg.add_link(-n1, n2, 0, SDG.Support())
        visited.add(next_group)
        prev_group = next_group
        next_group = get_next(next_group)
    print("\r%s/%s" %(i, len(groups)), end='')
    # if i%50==1:
    #     ws2.sdg.write_to_gfa1('./dummygraph.gfa')
ws2.sdg.write_to_gfa1('./dummygraph.gfa')
```