# TAP v2
## Random notes
The problem with anchor graphs has been trying to make them too complicated. Instead:
- Take unique 63-mers from the distribution.
- Take perfect matches from the long reads.
- Discard anything that is not unique (i.e. double positions) -> TAP.
If short reads are available, distances from the short reads can be used too. Once a local subset is ordered (with the short reads), its k-mers can refer to a central point (i.e. a hash of k-mers that refers to a hash of central k-mers plus short offsets). Once the central k-mers are ordered, the offsets can be used to fill in the non-central k-mers.
Because the anchor points are unique k-mers, they can be single positions (offsets) in the read. So, when tapping, the original distance in the read is recovered.
To make it easier to work with real-life long reads, all of this can be done in homopolymer-compressed space.
The problem has always been that we focused on the wrong conditions. Fixing copy number 1 on contigs was idiotic. We can fix copy number 1 on k-mers, which we can guess reasonably well, and then order gives position. The Sequence Distance Graph is only an intermediate concept on the way to a unique-k-mer distance graph.
Pan-genome versions of this could use the unique k-mers in different haplotypes/colors to find disorder in the set to call variants.
Integration with markers is as easy as mapping a marker to a position.
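A minimal sketch of the core anchoring idea in plain Python (hypothetical inputs: `unique_kmers` is the set of unique 63-mers from the count distribution, `read` a homopolymer-compressed long read):
```python
K = 63

def tap_anchors(read, unique_kmers):
    """Return (offset, kmer) anchors: perfect matches of the read to unique
    k-mers. Because each anchor k-mer is unique, the read offsets recover the
    original distance between any two anchors."""
    return [(off, read[off:off + K])
            for off in range(len(read) - K + 1)
            if read[off:off + K] in unique_kmers]

# distances between consecutive anchors come straight from the offsets:
# dists = [b[0] - a[0] for a, b in zip(anchors, anchors[1:])]
```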
## Example run
Try with the octoploid from pseudoseq.
```python
# rough TAP v2 implementation
%cd /Users/clavijob/pseudoseq_paper/new
import SDGpython as SDG
ws=SDG.WorkSpace()
peds=ws.add_paired_reads_datastore('octo_pe.prseq')
lords=ws.add_long_reads_datastore('octo_long.loseq')
# new function: kc.kmer_unlinked_graph(count_name,min_freq,max_freq) -> sdg with kmers as nodes.
SDG.count_kmers_as_graph_nodes(ws.sdg,peds,31,50,100,1)
print(len(ws.sdg.get_all_nodeviews()))
kc=ws.add_kmer_counter("main")
kc.add_count("pe",peds)
kc.set_kci_peak(75)
```
```txt
/Users/clavijob/pseudoseq_paper/new
248880
```
```python
from datetime import datetime

print(ws.sdg.stats_by_kci())
s=SDG.Strider(ws)
s.add_datastore(peds)
peds.mapper.path_reads(31)
# collapse nodes into unique-31-mer unitigs --> warning: could connect across collapsed 30-mers if strider fails.
print("starting... %s" % datetime.now())
total_nodes=len(ws.sdg.get_all_nodeviews())
for nv in ws.sdg.get_all_nodeviews():
    if nv.node_id()%100000 == 0: print("\rnodes: %s/%s" % (nv.node_id(), total_nodes), end='')
    fw=s.stride_single_strict(nv.node_id()).nodes
    bw=s.stride_single_strict(-nv.node_id()).nodes
    for path in [fw,bw]:
        if len(path)>1:
            # only link when the two 31-mers really overlap by 30bp, and avoid duplicate links
            if ws.sdg.get_nodeview(path[0]).sequence()[1:]==ws.sdg.get_nodeview(path[1]).sequence()[:30]:
                if not ws.sdg.are_connected(-path[0],path[1]):
                    ws.sdg.add_link(-path[0],path[1],-30)
print("Done for loop... %s" % datetime.now())
ws.sdg.join_all_unitigs()
print("Done join all unitigs... %s" % datetime.now())
kc.update_graph_counts()
print(ws.sdg.stats_by_kci())
```
```txt
-----------------------------------------------------------------------------------
| KCI | Total bp | Nodes | Tips | N25 | N50 | N75 |
|-------+---------------+----------+---------+------------+------------+------------|
| None | 7715280 | 248880 | 248880 | 31 | 31 | 31 |
| < 0.5 | 0 | 0 | 0 | 0 | 0 | 0 |
| ~ 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| ~ 2 | 0 | 0 | 0 | 0 | 0 | 0 |
| ~ 3 | 0 | 0 | 0 | 0 | 0 | 0 |
| > 3.5 | 0 | 0 | 0 | 0 | 0 | 0 |
|-------+---------------+----------+---------+------------+------------+------------|
| All | 7715280 | 248880 | 248880 | 31 | 31 | 31 |
-----------------------------------------------------------------------------------
-----------------------------------------------------------------------------------
| KCI | Total bp | Nodes | Tips | N25 | N50 | N75 |
|-------+---------------+----------+---------+------------+------------+------------|
| None | 37288 | 1058 | 1058 | 38 | 36 | 33 |
| < 0.5 | 0 | 0 | 0 | 0 | 0 | 0 |
| ~ 1 | 478502 | 7839 | 7839 | 61 | 61 | 61 |
| ~ 2 | 0 | 0 | 0 | 0 | 0 | 0 |
| ~ 3 | 0 | 0 | 0 | 0 | 0 | 0 |
| > 3.5 | 0 | 0 | 0 | 0 | 0 | 0 |
|-------+---------------+----------+---------+------------+------------+------------|
| All | 515790 | 8897 | 8897 | 61 | 61 | 61 |
-----------------------------------------------------------------------------------
```
```python
lrr=SDG.LongReadsRecruiter(ws.sdg,lords,31)
lrr.map(31)
lrr.thread_reads(end_matches=1)
tdg=lrr.dg_from_threads()
```

```python
u=SDG.LinkageUntangler(tdg)
u.select_all()
print(len([1 for x in u.selected_nodes if x==True]),"nodes selected to start with")
conflicted_ends=set()
for x in range(2):
    nsl=u.make_nextselected_linkage(7)
    nsl.remove_transitive_links()
    for fnv in nsl.get_all_nodeviews():
        for nv in [fnv,fnv.rc()]:
            if len(nv.next())>1:
                conflicted_ends.add(-nv.node_id())
                for lfw in nv.next():
                    conflicted_ends.add(lfw.node().node_id())
    for nid in conflicted_ends:
        if nid>0 and -nid in conflicted_ends:
            u.selected_nodes[nid]=False
    print(len([1 for x in u.selected_nodes if x==True]),"nodes selected after cleaning up multi-connected")
print('leftover nodes with multiple connections: seq'+',seq'.join([str(abs(nv.node_id())) for nv in nsl.get_all_nodeviews() if len(nv.next())>1 or len(nv.prev())>1]))
print('leftover nodes with multiple connections: %s' % len([nv for nv in nsl.get_all_nodeviews() if len(nv.next())>1 or len(nv.prev())>1]))
nsl.write_to_gfa1('nsl.gfa',selected_nodes=[x[0] for x in enumerate(u.selected_nodes) if x[1]])
```
```txt
257630 nodes selected to start with
257620 nodes selected after cleaning up multi-connected
257619 nodes selected after cleaning up multi-connected
leftover nodes with multiple connections: seq60264,seq185450,seq196603,seq250395,seq250558,seq251849,seq253004,seq253714,seq254169,seq254564,seq254895,seq256825,seq256963,seq257437,seq257602
```

## Trying with the PB HiFi dmel / strawberry datasets
1) construct graph at k=63
2) pop homopolymer bubbles
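The bubble-popping loop below assumes an `is_bubble_side` helper; a possible definition (a sketch, built only from the NodeView calls already used in these notes) is:
```python
def is_bubble_side(nv):
    """Hypothetical helper: nv is one side of a simple bubble if it has
    exactly one parallel node and a single neighbour on each end."""
    return (len(nv.parallels()) == 1
            and len(nv.prev()) == 1 and len(nv.next()) == 1)
```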
```python
def homocomp(s):
    """Homopolymer-compress a sequence: collapse each run to a single base."""
    r=s[0]
    for c in s:
        if r[-1]!=c: r+=c
    return r
bb=[1]
while bb:
    bb=[]
    for nv in ws.sdg.get_all_nodeviews():
        if is_bubble_side(nv):
            pv=nv.parallels()[0]
            # keep the longer side when both compress to the same sequence
            if nv.size()>pv.size() and homocomp(nv.sequence())==homocomp(pv.sequence()):
                bb.append(pv.node_id())
    for x in bb:
        ws.sdg.remove_node(x)
    ws.sdg.join_all_unitigs()
    print(len(bb))
```
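Quick sanity check for `homocomp` (hypothetical values):
```python
assert homocomp("AAACCGG") == "ACG"   # runs collapse to single bases
assert homocomp("ACGT") == "ACGT"     # already-compressed sequences are unchanged
```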
3) Expand canonical repeats
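The loop below assumes an `is_canonical_repeat` helper; a possible definition (a sketch, not necessarily the exact SDG call) is:
```python
def is_canonical_repeat(nv):
    """Hypothetical helper: a 2-in/2-out node whose four neighbours are all
    distinct nodes (the classic X-shaped repeat)."""
    if len(nv.prev()) != 2 or len(nv.next()) != 2:
        return False
    neighbours = set()
    for l in nv.prev():
        neighbours.add(abs(l.node().node_id()))
    for l in nv.next():
        neighbours.add(abs(l.node().node_id()))
    return len(neighbours) == 4 and abs(nv.node_id()) not in neighbours
```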
```python
ge=SDG.GraphEditor(ws)
solved=unsolved=0
for nv in ws.sdg.get_all_nodeviews():
    if is_canonical_repeat(nv):
        nidp0=nv.prev()[0].node().node_id()
        nidp1=nv.prev()[1].node().node_id()
        nidn0=nv.next()[0].node().node_id()
        nidn1=nv.next()[1].node().node_id()
        # read sets over each neighbour (lrr19: a LongReadsRecruiter defined elsewhere in the session)
        pA=set([abs(x) for x in lrr19.node_reads[abs(nidp0)]])
        pB=set([abs(x) for x in lrr19.node_reads[abs(nidp1)]])
        pC=set([abs(x) for x in lrr19.node_reads[abs(nidn0)]])
        pD=set([abs(x) for x in lrr19.node_reads[abs(nidn1)]])
        # discard reads that touch both inputs or both outputs
        pAB=pA.intersection(pB)
        pCD=pC.intersection(pD)
        pA=pA-pAB
        pB=pB-pAB
        pC=pC-pCD
        pD=pD-pCD
        pAC=len(pA.intersection(pC))
        pAD=len(pA.intersection(pD))
        pBC=len(pB.intersection(pC))
        pBD=len(pB.intersection(pD))
        if pAC<5 and pAD>20 and pBC>20 and pBD<5:
            solved+=1
            ge.queue_node_expansion(nv.node_id(),[[-nidp0,nidn1],[-nidp1,nidn0]])
        elif pAD<5 and pAC>20 and pBD>20 and pBC<5:
            solved+=1
            ge.queue_node_expansion(nv.node_id(),[[-nidp0,nidn0],[-nidp1,nidn1]])
        else: unsolved+=1
```
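The same read-set vote (shared long reads between each in/out pair, after discarding ambiguous reads) is reused for the double joins below; a helper capturing it could look like this (a sketch over the same `lrr19.node_reads` accesses; thresholds mirror the ones above):
```python
def vote_pairing(nidp0, nidp1, nidn0, nidn1, node_reads, lo=5, hi=20):
    """Return 'straight' (p0-n0 / p1-n1), 'crossed' (p0-n1 / p1-n0) or None,
    by counting long reads shared between each input/output pair after
    discarding reads seen on both inputs or both outputs."""
    pA={abs(x) for x in node_reads[abs(nidp0)]}
    pB={abs(x) for x in node_reads[abs(nidp1)]}
    pC={abs(x) for x in node_reads[abs(nidn0)]}
    pD={abs(x) for x in node_reads[abs(nidn1)]}
    pAB=pA & pB; pCD=pC & pD
    pA-=pAB; pB-=pAB; pC-=pCD; pD-=pCD
    pAC=len(pA & pC); pAD=len(pA & pD)
    pBC=len(pB & pC); pBD=len(pB & pD)
    if pAC<lo and pBD<lo and pAD>hi and pBC>hi: return 'crossed'
    if pAD<lo and pBC<lo and pAC>hi and pBD>hi: return 'straight'
    return None
```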
4) Solve "double joins"
```python
def is_double_join(nv):
    if len(nv.next())!=2: return False
    if len(nv.next()[0].node().prev())!=2 or len(nv.next()[1].node().prev())!=2: return False
    # both successors must share the same two predecessors...
    n0ps=set([x.node().node_id() for x in nv.next()[0].node().prev()])
    n1ps=set([x.node().node_id() for x in nv.next()[1].node().prev()])
    if n0ps!=n1ps: return False
    # ...and both predecessors the same two successors
    p0ns=set([x.node().node_id() for x in nv.next()[0].node().prev()[0].node().next()])
    p1ns=set([x.node().node_id() for x in nv.next()[0].node().prev()[1].node().next()])
    if p0ns!=p1ns: return False
    return True
solved=unsolved=0
for fnv in ws.sdg.get_all_nodeviews():
    for nv in [fnv,fnv.rc()]:
        if is_double_join(nv):
            nidn0=nv.next()[0].node().node_id()
            nidn1=nv.next()[1].node().node_id()
            nidp0=nv.next()[0].node().prev()[0].node().node_id()
            nidp1=nv.next()[0].node().prev()[1].node().node_id()
            pA=set([abs(x) for x in lrr19.node_reads[abs(nidp0)]])
            pB=set([abs(x) for x in lrr19.node_reads[abs(nidp1)]])
            pC=set([abs(x) for x in lrr19.node_reads[abs(nidn0)]])
            pD=set([abs(x) for x in lrr19.node_reads[abs(nidn1)]])
            pAB=pA.intersection(pB)
            pCD=pC.intersection(pD)
            pA=pA-pAB
            pB=pB-pAB
            pC=pC-pCD
            pD=pD-pCD
            pAC=len(pA.intersection(pC))
            pAD=len(pA.intersection(pD))
            pBC=len(pB.intersection(pC))
            pBD=len(pB.intersection(pD))
            if pAC<5 and pAD>20 and pBC>20 and pBD<5:
                solved+=1
                ws.sdg.remove_link(-nidp0,nidn0)
                ws.sdg.remove_link(-nidp1,nidn1)
            elif pAD<5 and pAC>20 and pBD>20 and pBC<5:
                solved+=1
                ws.sdg.remove_link(-nidp0,nidn1)
                ws.sdg.remove_link(-nidp1,nidn0)
            else: unsolved+=1
```
5) Strider links?
```python
anchor_graph=SDG.DistanceGraph(ws.sdg)
anchor_graph_multi=SDG.DistanceGraph(ws.sdg)
for x in range(len(s.is_anchor)):
    if s.is_anchor[x]:
        for l in s.links_bw[x] + s.links_fw[x]:
            if not anchor_graph_multi.are_connected(l.source,l.dest):
                anchor_graph_multi.add_link(l.source, l.dest, l.dist)
            if s.is_anchor[abs(l.dest)]:
                if not anchor_graph.are_connected(l.source,l.dest):
                    anchor_graph.add_link(l.source, l.dest, l.dist)
anchor_graph.remove_transitive_links()
```
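To eyeball the result, both graphs can be dumped to GFA like the other distance graphs in these notes (filenames are just suggestions):
```python
anchor_graph.write_to_gfa1('anchor_graph.gfa')
anchor_graph_multi.write_to_gfa1('anchor_graph_multi.gfa')
```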
```python
def get_node_inmediate_neighbours(node):
    """One node forward and one node back."""
    group=[]
    for x in Counter([-x for path in peds.mapper.all_paths_fw(-node) for x in path]).most_common(3):
        if x[0]!=0 and x[1]>10:
            group.append(x)
    for x in Counter([x for path in peds.mapper.all_paths_fw(node) for x in path]).most_common(3):
        if x[0]!=0 and x[1]>10:
            group.append(x)
    return group
## Grouping algo
groups = {abs(n.node_id()):[abs(n.node_id()),] for n in tdg.get_all_nodeviews()}
print(len(groups))
cont=0
nodos=[abs(x) for x in groups.keys()]
for nodo in nodos:
    # changes = 0
    if nodo not in groups: continue
    neighbours = get_node_inmediate_neighbours(nodo)
    for nn in neighbours:
        if abs(nn[0]) not in groups:
            continue
        elif len(groups[abs(nn[0])]) > 1:
            break
        else:
            if nodo not in groups: continue
            groups[nodo].append(nn[0])
            groups.pop(abs(nn[0]))
            # changes+=1
            # nodo=random.choice(list(groups.keys()))
    print("\r%s" %(cont,), end='')
    cont += 1
len(groups)
transformation = {}
for nodo_central, integrantes in groups.items():
    for integrante in integrantes:
        transformation[integrante] = nodo_central
        transformation[-integrante] = -nodo_central
trpm=[]
for rpm in lrr.read_perfect_matches:
    if len(rpm)<2:
        trpm.append([])
        continue
    ms=[transformation[rpm[0].node]]
    for ix in range(1,len(rpm)-1):
        x=rpm[ix]
        g=transformation[x.node]
        if g==ms[-1]: continue #same group as before
        if ms[-1]==transformation[rpm[ix+1].node]: continue #sandwich
        ms.append(g)
    if transformation[rpm[-1].node]!=ms[-1]: ms.append(transformation[rpm[-1].node])
    trpm.append(ms)
```
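The same collapse on plain node lists, to illustrate the deduplicate-and-sandwich rules (a sketch; the `transformation` values are hypothetical):
```python
def collapse_to_groups(nodes, transformation):
    """Map nodes to groups, merging consecutive repeats and dropping a single
    stray group sandwiched between two matches of the same group."""
    ms=[transformation[nodes[0]]]
    for ix in range(1,len(nodes)-1):
        g=transformation[nodes[ix]]
        if g==ms[-1]: continue                            # same group as before
        if ms[-1]==transformation[nodes[ix+1]]: continue  # sandwich
        ms.append(g)
    if transformation[nodes[-1]]!=ms[-1]: ms.append(transformation[nodes[-1]])
    return ms

assert collapse_to_groups([1,2,3,2], {1:10,2:10,3:20}) == [10]
```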
# Gonza's implementation log
## Step 1: Create DBG
**Input:** PE reads
**Output:** DBG.
**Aim:** create a haplotype-specific DBG to have as a basis for unitig processing.
```python
K=63
MIN_CVG=10
NUM_BATCHES=1
ws=SDG.WorkSpace()
peds=ws.add_paired_reads_datastore('./pseudo_triploid_octoploid_new/octoploid_pe.prseq', 'pe')
gm=SDG.GraphMaker(ws.sdg)
gm.new_graph_from_paired_datastore(peds,K,MIN_CVG,NUM_BATCHES);
gc=SDG.GraphContigger(ws)
gc.clip_tips(2*K-1,3)
print("Remaining nodes: %s" %(len(ws.sdg.get_all_nodeviews())))
```
**Validation:**
- Kmer spectra

- Unitig validity (i.e. the sequence of all unitigs needs to be present in the reference): count and bp of unitigs found as perfect hits in the reference.
  Found contigs 23705 with 2486607bp
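A sketch of this check (assuming the reference haplotypes are loaded as a dict `ref` of name -> sequence; both strands are searched):
```python
# Hypothetical validation: every unitig must be a perfect substring of the
# reference, on either strand.
def rc(s):
    return s[::-1].translate(str.maketrans("ACGT", "TGCA"))

found = bp = 0
for nv in ws.sdg.get_all_nodeviews():
    s = nv.sequence()
    if any(s in r or rc(s) in r for r in ref.values()):
        found += 1
        bp += nv.size()
print("Found contigs %s with %sbp" % (found, bp))
```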
```
# runs sdg-dbg with a relatively low coverage threshold and tip clipping. Too low a threshold introduces a ton of errors as standalone contigs; MIN_CVG changed from 2 to 10 and now produces a better initial spectrum and no errors in contigs
```
### Results in simulated Octoploid
## Step 2: Extract unique k-mer runs from unitigs
**Input:** DBG
**Output:** Strings with all their kmers being unique as per read frequencies.
**Aim:** create all unique k-mer runs, to be used as anchors
**Validation:**
- Each string must appear once and only once in the reference.
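`encontrados` in the check below is assumed to map each node id to the haplotypes containing its sequence; a sketch of how it could be built (same `ref` dict assumption as in Step 1):
```python
# Hypothetical construction of `encontrados`: node id -> haplotypes that
# contain the node's sequence on either strand.
def rc(s):
    return s[::-1].translate(str.maketrans("ACGT", "TGCA"))

encontrados = {}
for nv in ws.sdg.get_all_nodeviews():
    s = nv.sequence()
    encontrados[nv.node_id()] = [name for name, r in ref.items() if s in r or rc(s) in r]
```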
```python
# - Each string must appear once and only once in the reference.
multimatch=[]
bps = 0
for k, v in encontrados.items():
    if len(v)!=1:
        print(k, v, ws.sdg.get_nodeview(k).sequence())
        bps += ws.sdg.get_nodeview(k).size()
        multimatch.append(k)
print("Multi matching kmer runs %s/%s with %s bps" %(len(multimatch), len(ws.sdg.get_all_nodeviews()), bps))
```
```
244 ['haplotype_1', 'haplotype_2'] GAATCTTTCACAGAAAAGCAGCTAGTAAATGA
473 ['haplotype_7', 'haplotype_8'] AAAGAAGGTTTTTTACAGCAAGAAGAAGAAGAGAA
1022 ['haplotype_1', 'haplotype_2'] CTCAGGAGATGCCAACAAATCAACTTCTGCCA
1023 ['haplotype_1', 'haplotype_2'] GAGATGCCAACAAATCAACTTCTGCCATTAA
1228 ['haplotype_2', 'haplotype_3'] AGTGTGGCGGTCACTGGGCCCACTGGTGGAAG
1973 ['haplotype_2', 'haplotype_3'] AGTGTGGCGGTCACTGGGCCCACTGGTGGAAG
3194 ['haplotype_1', 'haplotype_2'] ATGTTAATAGCAGTTTCCTGTCTGTCACCAGTTA
3244 ['haplotype_1', 'haplotype_2'] GCAGGGGATAGATGGTTGTTGGGGTGTGGTGATGGA
3497 ['haplotype_1', 'haplotype_2'] AACGATCATTATAGGCTAACTCTACCAGAAGACAAGA
3643 ['haplotype_2', 'haplotype_3'] ATCTCCCGGAAAGCTTATTTCTAAATAAAAT
3768 ['haplotype_2', 'haplotype_3'] ATCTCCCGGAAAGCTTATTTCTAAATAAAAT
4062 ['haplotype_1', 'haplotype_2'] AACGATCATTATAGGCTAACTCTACCAGAAGACAAGA
4756 ['haplotype_7', 'haplotype_8'] AACAAGGTCAAGAAAAAAATTAAAAAGGTTA
4820 ['haplotype_1', 'haplotype_2'] ACCTGGGTAATATTATGGTTACTACCAAGCC
4821 ['haplotype_1', 'haplotype_2'] CTTGGTAGTAACCATAATATTACCCAGGTAC
5013 ['haplotype_2', 'haplotype_3'] AGTGTGGCGGTCACTGGGCCCACTGGTGGAAG
6128 ['haplotype_2', 'haplotype_3'] ATCTCCCGGAAAGCTTATTTCTAAATAAAAT
7930 ['haplotype_1', 'haplotype_2'] CTTGGTAGTAACCATAATATTACCCAGGTAC
7931 ['haplotype_1', 'haplotype_2'] ACCTGGGTAATATTATGGTTACTACCAAGCC
8506 ['haplotype_7', 'haplotype_8'] AATGAAGAGATATTTAGAAACAGATTTTTTAGCACAAAGGCA
8668 ['haplotype_1', 'haplotype_2'] ACCTGGGTAATATTATGGTTACTACCAAGCC
8669 ['haplotype_1', 'haplotype_2'] CTTGGTAGTAACCATAATATTACCCAGGTAC
8711 ['haplotype_1', 'haplotype_2'] ATGTTAATAGCAGTTTCCTGTCTGTCACCAGTTA
8714 ['haplotype_1', 'haplotype_2'] AACGATCATTATAGGCTAACTCTACCAGAAGACAAGA
Multi matching kmer runs 24/8768 with 793 bps
```
- All kmers between fmin and fmax must be used ONCE in the strings.
```python
# - All kmers between fmin and fmax must be used ONCE in the strings.
nodes_with_repeated_content=[]
cant = 0
for i, nv in enumerate(ws.sdg.get_all_nodeviews()):
    kcov = nv.kmer_coverage('pek31', 'sdg')
    if sum(kcov) != len(kcov):
        cant += 1
        print(nv, kcov, nv.node_id() in multimatch)
print("Kmers multi represented in the graph %s" %cant)
```
```
<NodeView: node 1228 in graph SDG> [3, 3] True
<NodeView: node 1973 in graph SDG> [3, 3] True
<NodeView: node 3194 in graph SDG> [2, 2, 2, 2] True
<NodeView: node 3497 in graph SDG> [3, 3, 3, 3, 3, 3, 3] True
<NodeView: node 3643 in graph SDG> [3] True
<NodeView: node 3768 in graph SDG> [3] True
<NodeView: node 4062 in graph SDG> [3, 3, 3, 3, 3, 3, 3] True
<NodeView: node 4820 in graph SDG> [3] True
<NodeView: node 4821 in graph SDG> [3] True
<NodeView: node 5013 in graph SDG> [3, 3] True
<NodeView: node 6128 in graph SDG> [3] True
<NodeView: node 7930 in graph SDG> [3] True
<NodeView: node 7931 in graph SDG> [3] True
<NodeView: node 8668 in graph SDG> [3] True
<NodeView: node 8669 in graph SDG> [3] True
<NodeView: node 8711 in graph SDG> [2, 2, 2, 2] True
<NodeView: node 8714 in graph SDG> [3, 3, 3, 3, 3, 3, 3] True
Kmers multi represented in the graph 17
```
- All strings should ONLY be formed by k-mers between fmin and fmax.
```python
cant = 0
for i, nv in enumerate(ws.sdg.get_all_nodeviews()):
    kcov = [40 <= x <= 100 for x in nv.kmer_coverage('pek31', 'pe')]
    if sum(kcov) != len(kcov):
        print(nv, nv.kmer_coverage('pek31', 'pe'))
        cant += 1
```
```
None present
```
## Step 3: Group kmer runs with PE reads
**Input:** unique kmer runs (as nodes) + PE datastore.
**Output:** groups as lists of SIGNED nodes.
**Aim:** group immediate neighbouring unique kmer runs into longer structures to allow better sensitivity when linking with long reads.
**Validation:**
- Each node can ONLY APPEAR in one group.
```python
## Property check: no node in more than one group
count = Counter()
for k, v in groups.items():
    for x in v:
        count[abs(x)]+=1
for c in count.most_common():
    if c[1]>1:
        print(c)
```
```
Nothing reported; checks out.
```
- Each group should have no more than MAX_SIZE total k-mers.
  - Total size does not appear to be a problem atm.
- Unique kmer runs of each group should appear consecutively and in-order in the reference (no scrambling). I.e.: you can express each chromosome as a list of groups.
  - All groups seem to be mutually exclusive per the superposition check.
  - Order within groups seems to be OK; all the groups I checked grow monotonically.
```python
## Check group superposition
## group_c: list of (central node, start, end) per group, assumed built from
## reference positions elsewhere in the session.
for g in group_c:
    for og in group_c:
        if g[0] == og[0]: continue
        if g[1]<og[1]<g[2] or g[1]<og[2]<g[2]:
            print("Groups %s and %s overlap" %(g, og))
## Check order (posicion: node -> reference positions, assumed from mappings)
group_c = []
for nc, group in groups.items():
    print("GROUP %s" %nc)
    print('seq'+',seq'.join([str(abs(x)) for x in group]))
    for i, node in enumerate(group[:-1]):
        pos=posicion[abs(group[i])][0][3]
        prev_pos=posicion[abs(group[i+1])][0][3]
        print('{:<4} {:>10}'.format(node, pos-prev_pos>0))
```
```python
sedg=SDG.DistanceGraph(ws.sdg)
pedg=SDG.DistanceGraph(ws.sdg) # only links when there is no SE next.
for nv in ws.sdg.get_all_nodeviews(reverse=True):
    for p in peds.mapper.all_paths_fw(nv.node_id()):
        if len(p)==0: continue
        if p[0]!=0:
            sedg.add_link(-nv.node_id(), p[0], 1)
        else:
            pedg.add_link(-nv.node_id(), p[1], 1)
# Now linearise this graph. (pick the single most-connected node FW in SE first,
# and on PE if no connection passes the threshold in SE)
nextrundg=
#
```
**Something interesting: the errors join haplotypes even in the simulated octoploid.**
## Step 4: Link groups with long reads
**Input:** unique kmer runs (as nodes) + groups + LR datastore.
**Output:** a graph of group connectivity.
**Aim:** To discover the group composition of each haplotype.
**Validation:**
- On the graph: the best/closest next link should point to the group that comes next in the reference. Same for prev.
- Linearly: express a chromosome as a list of anchors, put N's in the middle, it should be the same as masking all non-unique (fmin/fmax) k-mers from the reference.
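A sketch of the linear check (hypothetical names: `anchors_in_order` is a chromosome's anchor node list in reference order, `ref_masked` the reference with all non-unique k-mers masked to N; N-run lengths are normalised since gap sizes won't match exactly, and orientation is ignored here):
```python
import re

# join anchors with Ns, then compare after collapsing every N run to a single N
reconstruction = "N".join(ws.sdg.get_nodeview(a).sequence() for a in anchors_in_order)
assert re.sub("N+", "N", reconstruction) == re.sub("N+", "N", ref_masked)
```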
## Code dump
```python
import sys
sys.path.append('/Users/ggarcia/Documents/git_sources/sdg/cmake-build-relwithdebinfo/SDGpython/')
import SDGpython as SDG
from collections import Counter
## SDG-DBG assembly k=63
## Min coverage is important to reduce the noise
K=63
MIN_CVG=10
NUM_BATCHES=1
ws=SDG.WorkSpace()
peds=ws.add_paired_reads_datastore('./pseudo_triploid_octoploid_new/octoploid_pe.prseq', 'pe')
gm=SDG.GraphMaker(ws.sdg)
gm.new_graph_from_paired_datastore(peds,K,MIN_CVG,NUM_BATCHES);
gc=SDG.GraphContigger(ws)
gc.clip_tips(2*K-1,3)
print("Remaining nodes: %s" %(len(ws.sdg.get_all_nodeviews())))
ws.sdg.write_to_gfa1("./sdg_dbg_graph.gfa")
kc = ws.add_kmer_counter('pek31', 31)
kc.add_count('pe', peds)
seqs = gc.contig_reduction_to_unique_kmers(40, 100)
print(len(seqs),"sequences for anchoring")
with open("pseudoLong2.fastq","w") as f:
    for s in seqs:
        f.write("@seq\n%s\n+\n%s\n"%(s,s))  # dummy qualities: reuse the sequence
## Recreate workspace using only unique kmer runs
ws=SDG.WorkSpace()
peds=ws.add_paired_reads_datastore('./pseudo_triploid_octoploid_new/octoploid_pe.prseq')
lords=ws.add_long_reads_datastore('./pseudo_triploid_octoploid_new/octoploid_long.loseq')
for s in seqs:
    ws.sdg.add_node(s)
print(len(ws.sdg.get_all_nodeviews()))
kc = ws.add_kmer_counter('pek31', 31)
kc.add_count('pe', peds)
lrr=SDG.LongReadsRecruiter(ws.sdg,lords,31)
lrr.map(31)
lrr.thread_reads(end_matches=1)
tdg=lrr.dg_from_threads()
peds.mapper.path_reads(31)
## Group nodes using the next neighbour
def make_neighbours_graph():
    sedg=SDG.DistanceGraph(ws.sdg)
    pedg=SDG.DistanceGraph(ws.sdg) # only links when there is no SE next.
    for nv in ws.sdg.get_all_nodeviews(both_directions=True):
        for p in peds.mapper.all_paths_fw(nv.node_id()):
            if len(p)==0: continue
            if p[0]!=0:
                sedg.add_link(-nv.node_id(), p[0], 0)
            else:
                pedg.add_link(-nv.node_id(), p[1], 0)
    return sedg, pedg
sedg, pedg = make_neighbours_graph()
sedg.write_to_gfa1("./neghbour_sedg.gfa")
pedg.write_to_gfa1("./neghbour_pedg.gfa")
## Remove multiple connections by voting for the most popular
def pick_next(nv):
    winner = None
    poll = Counter([x.node().node_id() for x in nv.next()]).most_common(1)
    if len(poll)>0 and poll[0][1]>10:
        winner = poll[0][0]
    return winner
def clean_conections(idg):
    ndg = SDG.DistanceGraph(ws.sdg)
    for nv in idg.get_all_nodeviews(both_directions=True):
        winner = pick_next(nv)
        if winner is not None:
            if not ndg.are_connected(-nv.node_id(), winner):
                ndg.add_link(-nv.node_id(), winner, 0)
    return ndg
dg = clean_conections(sedg)
dg2 = clean_conections(pedg)
dg.write_to_gfa1("./neghbour_dg.gfa")
dg2.write_to_gfa1("./neghbour_dg2.gfa")
## Disconnect remaining multiple connections
for nv in dg.get_all_nodeviews(both_directions=True):
    if len(nv.next())>1:
        for con in nv.next():
            dg.remove_link(-nv.node_id(), con.node().node_id())
dg.write_to_gfa1("./neghbour_uniruns.gfa")
## Group components
def get_components(dg):
    all_nodes = set([n.node_id() for n in dg.get_all_nodeviews()])
    groups={}
    while len(all_nodes)>0:
        node = all_nodes.pop()
        group = [node, ]
        fwn = dg.get_nodeview(node).next()
        while len(fwn)==1:
            next_node = fwn[0].node().node_id()
            if next_node in group: break
            group.append(next_node)
            all_nodes.remove(abs(next_node))
            fwn = dg.get_nodeview(next_node).next()
        group = [x*-1 for x in group[::-1]]   # now walk backwards from the seed node
        fwn = dg.get_nodeview(-node).next()
        while len(fwn)==1:
            next_node = fwn[0].node().node_id()
            if next_node in group: break
            group.append(next_node)
            all_nodes.remove(abs(next_node))
            fwn = dg.get_nodeview(next_node).next()
        groups[node] = group
    return groups
groups = get_components(dg)
## Transformation table from node to group
print(len(groups))
transformation = {}
for nodo_central, integrantes in groups.items():
    for integrante in integrantes:
        transformation[integrante] = nodo_central
        transformation[-integrante] = -nodo_central
## transform reads to group mappings
trpm=[]
for rpm in lrr.read_perfect_matches:
    if len(rpm)<2:
        trpm.append([])
        continue
    ms=[transformation[rpm[0].node]]
    for ix in range(1,len(rpm)-1):
        x=rpm[ix]
        g=transformation[x.node]
        if g==ms[-1]: continue #same group as before
        if ms[-1]==transformation[rpm[ix+1].node]: continue #sandwich
        ms.append(g)
    if transformation[rpm[-1].node]!=ms[-1]: ms.append(transformation[rpm[-1].node])
    trpm.append(ms)
##
def gid_reads():
    """Map each group id (in absolute value) to the set of reads that touch it, to translate later."""
    gid2reads={abs(gid):set() for gid in groups.keys()}
    for i, read in enumerate(trpm):
        for gid in read:
            gid2reads[abs(gid)].add(i)
    return gid2reads
gid2reads = gid_reads()
def get_next(gid):
    c=Counter()
    trpm_ids = gid2reads[abs(gid)]
    ttrpm = [trpm[i] for i in trpm_ids]
    for x in ttrpm:
        if -gid in x:
            x = [y*-1 for y in x[::-1]]   # orient the read so gid appears forward
        if gid in x and x[-1] != gid:
            c[x[x.index(gid)+1]]+=1
    if len(c.most_common())>0 and c.most_common(1)[0][1]>3:
        return c.most_common(1)[0][0]
    else:
        return None
ws2 = SDG.WorkSpace()
## Add all groups as nodes (dummy sequence of the group's total size)
group2node={}
node2group={}
all_nodes = [nodo for k, group in groups.items() for nodo in group]
for gid in groups.keys():
    group_size = sum([ws.sdg.get_nodeview(x).size() for x in groups[gid]])
    new_node = ws2.sdg.add_node("A"*group_size)
    group2node[gid] = new_node
    node2group[new_node] = gid
print("%s nodes in ws2 graph" %len(ws2.sdg.get_all_nodeviews()))
# def runall():
for i, g in enumerate(groups.keys()):
    # walk the get_next chain from each group, adding links between consecutive groups
    visited=set()
    prev_group = None
    next_group = get_next(g)
    while next_group is not None and next_group not in visited:
        if prev_group is not None:
            if prev_group<0: n1=-group2node[abs(prev_group)]
            else: n1=group2node[abs(prev_group)]
            if next_group<0: n2=-group2node[abs(next_group)]
            else: n2=group2node[abs(next_group)]
            # print("%s --> %s" %(n1, n2))
            ws2.sdg.add_link(-n1, n2, 0, SDG.Support())
        visited.add(next_group)
        prev_group = next_group
        next_group = get_next(next_group)
    print("\r%s/%s" %(i, len(groups)), end='')
    # if i%50==1:
    #     ws2.sdg.write_to_gfa1('./dummygraph.gfa')
ws2.sdg.write_to_gfa1('./dummygraph.gfa')
```