# Strawberry sort
[TOC]
## Script 1
```python
import sys
sys.path.append('/hpc-home/ggarcia/git_sources/bsg/build/SDGpython')
import SDGpython as SDG
import os
MIN_UKMER_FREQ=57
MAX_UKMER_FREQ=135
K=31
## SDG-DBG assembly k=63
## Min coverage is important to reduce the noise
GK=63
MIN_CVG=30
NUM_BATCHES=8
ws=SDG.WorkSpace()
peds=ws.add_paired_reads_datastore('./data/strbrg_pe.prseq', 'pe')
gm=SDG.GraphMaker(ws.sdg)
gm.new_graph_from_paired_datastore(peds,K,MIN_CVG,NUM_BATCHES); #this is so the initial kmer count includes all 31-mers
print("First graph done")
kc = ws.add_kmer_counter('pek31', K)
kc.add_count('pe', peds)
print("kmers counted")
gm.new_graph_from_paired_datastore(peds,GK,MIN_CVG,NUM_BATCHES); #this is the real DBG k
print("second graph done")
ws.sdg.write_to_gfa1("./00-sdg_dbg_graph_MC%s.gfa" %MIN_CVG)
gc=SDG.GraphContigger(ws)
gc.clip_tips(2*GK-1,3)
print("Remaining nodes: %s" %(len(ws.sdg.get_all_nodeviews())))
kc.update_graph_counts()
ws.dump("./00-sdg_dbg_graph_MC%s.sdgws" %MIN_CVG)
## Check step1
passed=True
ccc=zip(kc.count_spectra('pe',present_in_graph=False),kc.count_spectra('pe',present_in_graph=True))
for i,x in list(enumerate(ccc))[MIN_UKMER_FREQ:MAX_UKMER_FREQ+1]:
    if x[0]!=x[1]:
        print(x[0]-x[1],"Kmers are missing at f=",i," -> ",x)
        passed=False
if passed: print("Check passed!!!!")
else: print("Check FAILED!!!!")
## Make unique kmer runs
seqs = gc.contig_reduction_to_unique_kmers(MIN_UKMER_FREQ,MAX_UKMER_FREQ)
print(len(seqs),"sequences for anchoring")
with open("./pseudoLong2.fasta","w") as f:
for i, s in enumerate(seqs):
f.write(">seq_%s\n%s\n"%(i,s))
```
## Script 2
```python
import sys
sys.path.append('/hpc-home/ggarcia/git_sources/bsg/build/SDGpython')
import SDGpython as SDG
import os
## TODO (try on octo):
## - Save the path reads
## - Make the path reads store the offsets
## - Save the graphs in the workspace, not in GFAs, to keep the support. Raw sedg and pedg.
## -- ws.add_distance_graph or something like that, it can go inside the graph
ws=SDG.WorkSpace()
ws.sdg.load_from_fasta("./pseudoLong2.fasta")
peds=ws.add_paired_reads_datastore('./data/strbrg_pe.prseq', 'pe')
peds.mapper.path_reads(31,fill_offsets=True)
def dg_from_paths(peds,g,add_pair_jumps=False,pair_jump_distance=300):
    dg=SDG.DistanceGraph(g)
    for rid in range(1,len(peds.mapper.read_path_offsets)):
        p=peds.mapper.read_paths[rid].path
        o=peds.mapper.read_path_offsets[rid]
        for i in range(len(p)-1):
            dg.add_link(-p[i],p[i+1],o[i+1][0]-o[i+1][1]-(o[i][0]-o[i][1])-dg.get_nodeview(p[i]).size(),SDG.Support(SDG.SupportType.stReadPath,0,rid))
    if add_pair_jumps:
        for rid1 in range(1,len(peds.mapper.read_path_offsets),2):
            p1=peds.mapper.read_paths[rid1].path
            p2=peds.mapper.read_paths[rid1+1].path
            if p1 and p2:
                dg.add_link(-p1[-1],-p2[-1],pair_jump_distance,SDG.Support(SDG.SupportType.stReadPath,1,rid1))
    return dg
sedg=dg_from_paths(peds,ws.sdg)
u=SDG.LinkageUntangler(sedg)
u.select_all()
sedg_nsl=u.make_nextselected_linkage(5)
sedg_nsl.write_to_gfa1("./sedg_nsl.gfa")
pedg=dg_from_paths(peds,ws.sdg,add_pair_jumps=True)
u=SDG.LinkageUntangler(pedg)
u.select_all()
pedg_nsl=u.make_nextselected_linkage(5)
pedg_nsl.remove_transitive_links()
pedg_nsl.write_to_gfa1("./pedg_nsl.gfa")
ws.dump("./01-unique_kmer_runs_pemapped.sdgws")
```
## Script 3
```python
import sys
sys.path.append('/hpc-home/ggarcia/git_sources/bsg/build/SDGpython')
import SDGpython as SDG
import os
import pickle
ws = SDG.WorkSpace()
ws.sdg.load_from_gfa('./sedg_nsl.gfa')
## group components
def get_components(dg):
    """ Transform lines to groups """
    all_nodes = set([n.node_id() for n in dg.get_all_nodeviews()])
    groups={}
    while len(all_nodes)>0:
        node = all_nodes.pop()
        group = [node, ]
        fwn = dg.get_nodeview(node).next()
        while len(fwn)==1:
            next_node = fwn[0].node().node_id()
            if next_node in group: break
            group.append(next_node)
            try: all_nodes.remove(abs(next_node))
            except: pass
            fwn = dg.get_nodeview(next_node).next()
        group = [x*-1 for x in group[::-1]]
        fwn = dg.get_nodeview(-node).next()
        while len(fwn)==1:
            next_node = fwn[0].node().node_id()
            if next_node in group: break
            group.append(next_node)
            try: all_nodes.remove(abs(next_node))
            except: pass
            fwn = dg.get_nodeview(next_node).next()
        groups[node] = group
    return groups
groups = get_components(ws.sdg)
## transformation table from node to group
print("Found %s groups" %len(groups))
transformation = {}
for nodo_central, integrantes in groups.items():
    for integrante in integrantes:
        transformation[integrante] = nodo_central
        transformation[-integrante] = -nodo_central
## Pickle transformation dict
with open('./transformation_dict.pickle', 'wb') as tf:
    pickle.dump(transformation, tf, protocol=pickle.HIGHEST_PROTOCOL)
## Pickle groups dict
with open('./groups_dict.pickle', 'wb') as tf:
    pickle.dump(groups, tf, protocol=pickle.HIGHEST_PROTOCOL)
```
## Script 4
```python
import sys
sys.path.append('/hpc-home/ggarcia/git_sources/bsg/build/SDGpython')
import SDGpython as SDG
import os
import pickle
from collections import Counter
## TODO:
## - remove all links at the start (no, it does not affect the mapper now)
## - Consider using the threads instead of the read perfect matches for the part below
## Load transformation dict
with open('./transformation_dict.pickle', 'rb') as tf:
    transformation = pickle.load(tf)
## Load groups dict
with open('./groups_dict.pickle', 'rb') as tf:
    groups = pickle.load(tf)
LOAD_FROM_DISK=False
if not LOAD_FROM_DISK:
    ws = SDG.WorkSpace()
    ws.sdg.load_from_gfa('./sedg_nsl.gfa')
    deleted_links = 0
    for nv in ws.sdg.get_all_nodeviews():
        if len(nv.prev())>1:
            for pn in nv.prev():
                deleted_links+=1
                ws.sdg.remove_link(-pn.node().node_id(), nv.node_id())
        if len(nv.next())>1:
            for nn in nv.next():
                deleted_links+=1
                ws.sdg.remove_link(-nv.node_id(), nn.node().node_id())
        if len(nv.prev()) == 1 and len(nv.next()) == 1 and abs(nv.prev()[0].node().node_id()) == abs(nv.next()[0].node().node_id()):
            ws.sdg.remove_link(-nv.node_id(), nv.next()[0].node().node_id())
            ws.sdg.remove_link(-nv.prev()[0].node().node_id(), nv.node_id())
            deleted_links+=2
    print("Deleted %s links" %(deleted_links))
    lords = ws.add_long_reads_datastore('./data/strbrg_nanopore.loseq')
    lrr=SDG.LongReadsRecruiter(ws.sdg,lords,31)
    lrr.map(31)
    #for x in range(1,17972097, 50000):
    #    print("Mapping %s-%s" %(x, x+50000))
    #    lrr.map(31, x, x+50000)
    lrr.thread_reads(end_matches=1)
    ws.dump('./sedg_nsl.sdgws')
    lrr.dump('./sedg_nsl.lrr.data')
### -------- ###
else:
    ws = SDG.WorkSpace('./sedg_nsl.sdgws')
    lords = ws.get_long_reads_datastore('long_reads')
    lrr=SDG.LongReadsRecruiter(ws.sdg,lords,31)
    lrr.load('./sedg_nsl.lrr.data')
## transform reads to group mappings XXX LOSES DISTANCE XXX
trpm=[]
for rpm in lrr.read_perfect_matches:
    if len(rpm)<2:
        trpm.append([])
        continue
    ms=[transformation[rpm[0].node]]
    for ix in range(1,len(rpm)-1):
        x=rpm[ix]
        g=transformation[x.node]
        if g==ms[-1]: continue #same group as before
        if ms[-1]==transformation[rpm[ix+1].node]: continue #sandwich
        ms.append(g)
    if transformation[rpm[-1].node]!=ms[-1]: ms.append(transformation[rpm[-1].node])
    trpm.append(ms)
## Pickle the transformed read mappings
with open('./trpm.pickle', 'wb') as tf:
    pickle.dump(trpm, tf, protocol=pickle.HIGHEST_PROTOCOL)
def gid_reads():
    """Map from absolute group id to the set of reads (trpm indices) mapped to that group, to translate later."""
    gid2reads={abs(gid):set() for gid in groups.keys()}
    for i, read in enumerate(trpm):
        for gid in read:
            gid2reads[abs(gid)].add(i)
    return gid2reads
gid2reads = gid_reads()
## TODO: instead of voting for the next, build an MLDG with the long-read information
## One with and one without all the connections.
## Distance = number of groups
def get_next(gid, min_links=5, black_list=[]):
    """ Given a group id return the next group using long reads """
    c=Counter()
    trpm_ids = gid2reads[abs(gid)]
    ttrpm = [trpm[i] for i in trpm_ids]
    for x in ttrpm:
        if gid in black_list or -gid in black_list: continue ## If node is a super connected one don't vote (should be abs??)
        if -gid in x:
            x= [y*-1 for y in x[::-1]]
        if gid in x and x[-1] != gid:
            c[x[x.index(gid)+1]]+=1
    if len(c.most_common())>0 and c.most_common(1)[0][1]>min_links:
        return c.most_common(1)[0][0]
    else:
        return None
ws2 = SDG.WorkSpace()
## Add all groups as nodes
group2node={}
node2group={}
all_nodes = [nodo for k, group in groups.items() for nodo in group]
for gid in groups.keys():
    group_size = sum([ws.sdg.get_nodeview(x).size() for x in groups[gid]])
    new_node = ws2.sdg.add_node("A"*group_size)
    group2node[gid] = new_node
    node2group[new_node] = gid
print("%s nodes in ws2 graph" %len(ws2.sdg.get_all_nodeviews()))
# def runall():
##
ndg = SDG.DistanceGraph(ws2.sdg)
for i, g in enumerate(groups.keys()):
    visited=set()
    prev_group = None
    next_group = get_next(g)
    while next_group is not None and next_group not in visited:
        if prev_group is not None:
            if prev_group<0: n1=-group2node[abs(prev_group)]
            else: n1=group2node[abs(prev_group)]
            if next_group<0: n2=-group2node[abs(next_group)]
            else: n2=group2node[abs(next_group)]
            # print("%s --> %s" %(n1, n2))
            # if not ws.sdg.are_connected(-n1, n2):
            ndg.add_link(-n1, n2, 0, SDG.Support())
        visited.add(next_group)
        prev_group = next_group
        next_group = get_next(next_group)
    print("\r%s/%s" %(i, len(groups)), end='')
    if i%20000==1:
        ndg.write_to_gfa1('./dummygraph_%i.gfa' %i)
ndg.write_to_gfa1('./dummygraph2.gfa')
```
## BJ's threaded graph
```python
lrr.thread_reads(end_matches=1)
trg=lrr.dg_from_threads(False)
#trgm=lrr.dg_from_threads(True)
lu=SDG.LinkageUntangler(trg)
lu.select_all()
nsl=lu.make_nextselected_linkage(5)
nodes = []
for nv in nsl.get_all_nodeviews():
    if len(nv.next())==2 and len(nv.prev())==2:
        nodes.append(nv.node_id())
        lu.selected_nodes[nv.node_id()]=False
nsl=lu.make_nextselected_linkage(5)
wsg = SDG.WorkSpace()
## Add all groups as nodes
group2node={}
node2group={}
all_nodes = [nodo for k, group in groups.items() for nodo in group]
for gid in groups.keys():
    group_size = sum([ws.sdg.get_nodeview(x).size() for x in groups[gid]])
    new_node = wsg.sdg.add_node("A"*group_size)
    group2node[gid] = new_node
    node2group[new_node] = gid
dg=SDG.DistanceGraph(wsg.sdg)
trpm=[]  # (re)build the group-transformed matches from the re-threaded lrr
for rpm in lrr.read_perfect_matches:
    if len(rpm)<2:
        trpm.append([])
        continue
    ms=[transformation[rpm[0].node]]
    for ix in range(1,len(rpm)-1):
        x=rpm[ix]
        g=transformation[x.node]
        if g==ms[-1]: continue #same group as before
        if ms[-1]==transformation[rpm[ix+1].node]: continue #sandwich
        ms.append(g)
    if transformation[rpm[-1].node]!=ms[-1]: ms.append(transformation[rpm[-1].node])
    trpm.append(ms)
```
```python
lu.select_all()
lu.make_nextselected_linkage(5)
```
```python
def dump_partial_graph(filename,dg,nid, radius=10, max_nodes=10000):
    nodes=set([abs(nid)])
    for x in range(radius):
        nn=list(nodes)
        for nid in nn:
            for l in dg.get_nodeview(nid).prev()+dg.get_nodeview(nid).next():
                nodes.add(abs(l.node().node_id()))
            if len(nodes)>=max_nodes: break
        if len(nodes)>=max_nodes: break
    dg.write_to_gfa1(filename,selected_nodes=list(nodes))
    #print(nodes)
```
```python
from collections import Counter

def nodes_in_threads_fw(nv):
    c=Counter()
    reads=set([x.support().id for x in nv.next()])
    to_explore=[nv]
    while to_explore:
        #print("exploring nodes:",[x.node_id() for x in to_explore])
        new_to_explore=[]
        for nnv in to_explore:
            for ln in nnv.next():
                if ln.support().id in reads:
                    if c[ln.node().node_id()]==0:
                        new_to_explore.append(ln.node())
                    c[ln.node().node_id()]+=1
        to_explore=new_to_explore
    return c

class HaplotypePuller(object):
    """Pulls a haplotype (set of nodes/reads) from a thread dg:
    - A node is a good candidate to belong if:
      - most (a high %) of its connections are to nodes in the haplotype
      - most (a high %) of its reads are reads in the haplotype
    - A read is a good candidate to belong if:
      - most (a high %) of its nodes are in the haplotype
    - To start you need a node set that is a reasonable hypothesis for a haplotype region; you can get that straight away from a good long read.
    """
    def __init__(self,dg,lrr):
        self.dg=dg
        self.lrr=lrr
        self.node_ids=set()
        self.read_ids=set()
    def start_from_read_nodes(self,rid):
        """starting from all nodes from a single read gives a set of "mostly correct haplotype" nodes"""
        self.node_ids.update([m.node for m in self.lrr.read_threads[rid]])
        self.read_ids.add(rid)
    def start_node_neighbourhood(self,nid,min_reads=10):
        for nnid,c in nodes_in_threads_fw(self.dg.get_nodeview(nid)).items():
            if c>=min_reads:
                self.node_ids.add(nnid)
        for nnid,c in nodes_in_threads_fw(self.dg.get_nodeview(-nid)).items():
            if c>=min_reads:
                self.node_ids.add(-nnid)
    def nodes_fw_inout(self,nid,min_c=2):
        """count fw thread connections that land inside vs. outside the current haplotype node set"""
        nv=self.dg.get_nodeview(nid)
        nodes_in=0
        nodes_out=0
        for nnid,c in nodes_in_threads_fw(nv).items():
            if c<min_c: continue
            if nnid in self.node_ids:
                nodes_in+=c
            else:
                nodes_out+=c
        return (nodes_in,nodes_out)
    def nodes_fw_perc(self,nid,min_c=2):
        """fraction of fw thread connections that stay inside the haplotype"""
        nodes_in,nodes_out=self.nodes_fw_inout(nid,min_c)
        return nodes_in/(nodes_in+nodes_out)
    def nodes_all_perc(self,nid,min_c=2):
        """fraction of thread connections in both directions that stay inside the haplotype"""
        fnodes_in,fnodes_out=self.nodes_fw_inout(nid,min_c)
        bnodes_in,bnodes_out=self.nodes_fw_inout(-nid,min_c)
        nodes_in=bnodes_in+fnodes_in
        nodes_out=bnodes_out+fnodes_out
        return nodes_in/(nodes_in+nodes_out)
```
## BJ's segregation filtering
The idea is to use coverage on Hapil and the 20 F1s to filter nodes in each read, making sure only haplotype-specific nodes are left in.
For each read we first need to check what the typical profile of segregation is:
- This is easier to do on nodes/kmers that are absent from hapil. Since we are already picking nodes that are "single copy" on RG, nodes completely absent from Hapil should have f=0.5.
```python
rg_only_count=0
rg_only_bp=0
total_count=0
total_bp=0
for nv in ws.sdg.get_all_nodeviews():
    if [x for x in nv.kmer_coverage("rgxh31","hapil") if x<=5]:
        rg_only_count+=1
        rg_only_bp+=nv.size()
    total_count+=1
    total_bp+=nv.size()
print("%d / %d nodes, totalling %dbp / %dbp, had RG-only k-mers" % (rg_only_count,total_count,rg_only_bp,total_bp))
```
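To sanity-check the f=0.5 expectation, here is a minimal sketch only, assuming the F1 count names are everything in `kc.list_names()` except `sdg` and `hapil`, and reusing the mean-coverage>5 presence rule from the next block:
```python
## Sketch: estimate the segregation fraction f of the RG-only nodes found above.
## Assumptions: F1 count names = kc.list_names() minus 'sdg' and 'hapil';
## mean k-mer coverage > 5 in a count means "present in that F1".
from statistics import mean
f1_names = [x for x in kc.list_names() if x not in ("sdg", "hapil")]
fractions = []
for nv in ws.sdg.get_all_nodeviews():
    if [x for x in nv.kmer_coverage("rgxh31", "hapil") if x <= 5]:  # same RG-only test as above
        present = sum(mean(nv.kmer_coverage("rgxh31", x)) > 5 for x in f1_names)
        fractions.append(present / len(f1_names))
if fractions:
    print("RG-only nodes: mean segregation fraction %.3f (expected ~0.5)" % mean(fractions))
```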
```python
from statistics import mean
from collections import Counter

countnames=[x for x in kc.list_names() if x!="sdg" and x!="hapil"]
node_seg={}
for nv in ws.sdg.get_all_nodeviews():
    node_seg[nv.node_id()] = "".join( [ str(int(mean(nv.kmer_coverage("rgxh31",x))>5)) for x in countnames])
    if nv.node_id()%10000==0: print(nv.node_id())

def fnodeseg(nv):
    if (mean(nv.kmer_coverage("rgxh31","hapil"))>5): return "11111111111111111111"
    return "".join( [ str(int(mean(nv.kmer_coverage("rgxh31",x))>5)) for x in countnames])

def find_read_pattern(matches):
    c=Counter()
    t=0
    for m in matches:
        sp=fnodeseg(ws.sdg.get_nodeview(m.node))
        if sp!="11111111111111111111":
            c[sp]+=1
            t+=1
    if t==0: return ""
    p=c.most_common(1)[0]
    if p[1]<t*.51: return ""
    return p
```
```python
def filter_read_by_segregation(rid):
    c=Counter([cf.get_pattern(ws.sdg.get_nodeview(m.node)) for m in lrr.read_perfect_matches[rid]])
    if len(c)==0: return []
    w=c.most_common(1)[0]
    if w[1]<sum(c.values())*.75: return []
    return [m for m in lrr.read_perfect_matches[rid] if cf.get_pattern(ws.sdg.get_nodeview(m.node))==w[0]]

srpm=[filter_read_by_segregation(rid) for rid in range(len(lrr.read_perfect_matches))]  # start at 0 so srpm indices line up with read_perfect_matches
for i in range(len(srpm)):
    lrr.read_perfect_matches[i]=srpm[i]
lrr.recruit(1,1)
lrr.dump('llr_after_segfilterv1.dump')

def filter_read_by_segregation2(rid):
    c=Counter([cf3.get_pattern(ws.sdg.get_nodeview(m.node)) for m in lrr.read_perfect_matches[rid]])
    if c['11111111111111111111']: c.pop('11111111111111111111')
    if len(c)==0: return []
    w=c.most_common(1)[0]
    if w[1]<sum(c.values())*.75: return []
    return [m for m in lrr.read_perfect_matches[rid] if cf3.get_pattern(ws.sdg.get_nodeview(m.node))==w[0]]

srpm2=[filter_read_by_segregation2(rid) for rid in range(len(lrrraw.read_perfect_matches))]
for i in range(len(srpm2)):
    lrrraw.read_perfect_matches[i]=srpm2[i]
```
## Analysing haplotype-segregated node lines.
### Proposed method:
1) Create a set of "anchors" (the gonza way works fine so far).
2) LRR -> generate perfect mappings.
3) Generate "segregation signatures" for all nodes.
4) Find the "prevalent signature" for each read, remove all nodes with any other signatures from the read.
5) Now we should be able to grab a set of nodes with the same signature, and create a linear order for each connected component in them.
- If a node belongs to "more than one component", or "more than one location" in the same component, we need to remove it from the problem.
- One way to check is to validate that the node divides its neighbours into "before" and "after", and that these sets of nodes are connected between them (i.e. all before and all after nodes form a single component, even if you remove the node). Nodes crossing lines will generate two "before" and two "after" components, and nodes that appear twice in the genome within a short distance will generate "before" and "after" sets that share some nodes:
- For each connection FW and BW from a node:
- Get all the reached nodes, up to a distance from the original node.
- If the node is a cross, it will have two subsets of connections fw and two subsets bw, with no connections between bw1 and bw2, nor between fw1 and fw2.
- If bw1 connects to fw1 without going through the node, and bw2 connects to fw2 without going through the node, then it is safe to delete the node.
- Otherwise, the reads can be partitioned by creating a cluster of all reads that share any number of nodes, without counting the "joining" node (i.e. start with all reads in independent sets of nodes, if two sets share X nodes join them, and repeat ad nauseam; if at the end there's more than one set, there you are). See the sketch after this list.
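A minimal, SDG-free sketch of that read-partitioning idea, assuming a hypothetical `read_nodes` dict of `{read_id: set of abs(node_id)}` built from the filtered matches (e.g. from `lrr.read_perfect_matches`):
```python
## Sketch only: cluster reads that share nodes, ignoring the "joining" node.
## read_nodes is a hypothetical {read_id: set(abs(node_id))} built from the filtered matches.
def partition_reads_by_shared_nodes(read_nodes, joining_node):
    parent = {rid: rid for rid in read_nodes}
    def find(r):
        while parent[r] != r:
            parent[r] = parent[parent[r]]  # path halving
            r = parent[r]
        return r
    def union(a, b):
        ra, rb = find(a), find(b)
        if ra != rb:
            parent[ra] = rb
    # index reads by node, skipping the joining node so it cannot glue clusters together
    node2reads = {}
    for rid, nodes in read_nodes.items():
        for n in nodes:
            if n == abs(joining_node):
                continue
            node2reads.setdefault(n, []).append(rid)
    # any shared node joins the reads that contain it
    for rids in node2reads.values():
        for other in rids[1:]:
            union(rids[0], other)
    clusters = {}
    for rid in read_nodes:
        clusters.setdefault(find(rid), set()).add(rid)
    # more than one cluster left -> the joining node crosses lines or sits on a short-range repeat
    return list(clusters.values())
```
The note above allows requiring X shared nodes before joining; that would simply replace the single-shared-node criterion with a shared-count threshold.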
### Open "problems"
- Can we find proper thresholds for filtering?
- Can we use a "clever" filter that differentiates when things are and aren't in hapil?
- Given a sortable "line", create an order -> linearise
- solve the "local repeat" problem -> two similar unique motifs in the same haplotype -> errors in one "land" in the other. IF A NODE DOES NOT HAVE A SINGLE POSITION IN A TOTAL SORT FOR THE HAPLOTYPE, THROW AWAY (SAME NEIGHBOURS, SAME RELATIVE POSITION) -> THROW THE NODE AWAY ->
- If pattern '11100111111101111101' gets linearised/split, it's done.
- solve the "node classification vs. read classification problem" -> nodes will get classified to the wrong pattern, if all the reads they appear on have a close-by pattern, we can safely reclassify the node.
- reads that span multiple patterns / re-joining through breakpoints / whatever -> current view is "we do this afterwards". We could technically allow for small distances from the main pattern in the read, or even allow for ambiguity in the read pattern, which would eliminate this problem, but would bring back cross-haplotype talking.
```python
cf2=SDG.CountFilter('rgxh31','sdg',100,kc.list_names()[1:-1],[5 for x in kc.list_names()[1:-1]])
def filter_read_by_segregation2(rid):
    c=Counter([cf2.get_pattern(ws.sdg.get_nodeview(m.node)) for m in lrrraw.read_perfect_matches[rid]])
    if c['11111111111111111111']: c.pop('11111111111111111111')
    if len(c)==0: return []
    w=c.most_common(1)[0]
    if w[1]<sum(c.values())*.75: return []
    return [m for m in lrrraw.read_perfect_matches[rid] if cf2.get_pattern(ws.sdg.get_nodeview(m.node))==w[0]]

srpm2=[filter_read_by_segregation2(rid) for rid in range(len(lrrraw.read_perfect_matches))]
for i in range(len(srpm2)):
    lrrraw.read_perfect_matches[i]=srpm2[i]
lrrraw.thread_reads(end_matches=1)
trg3=lrrraw.dg_from_threads(True)
lu3=SDG.LinkageUntangler(trg3)
lu3.select_all()
nsl3=lu3.make_nextselected_linkage(2)
```
### Finding the order of a linear component
1) take the closest neighbour.
2) if the end is hit, backtrack
3) complete intermediate neighbours.
Or:
1) start with an order containing a single node
2) pick the longest-reaching read fw from the last node and add all its nodes
3) pick the longest-reaching read bw from the first node and add all its nodes
4) repeat till no reads remain from the last nodes
5) fill in-between nodes (see the sketch below).
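A rough sketch of this second approach, assuming the `trpm` group paths and the `gid2reads` index from Script 4 are still in scope (forward extension only; the backward pass is the same procedure run from the negated first node):
```python
## Sketch only: greedy ordering by the longest-reaching read, using trpm and gid2reads
## from Script 4 (assumed to be in scope). Forward direction only.
def order_component_greedy(start_gid):
    order = [start_gid]
    seen = {abs(start_gid)}
    while True:
        last = order[-1]
        best_ext = []
        for rid in gid2reads[abs(last)]:
            r = trpm[rid]
            if -last in r:                      # orient the read to match the current order
                r = [-g for g in r[::-1]]
            if last not in r:
                continue
            # groups this read reaches past the current last node
            ext = [g for g in r[r.index(last)+1:] if abs(g) not in seen]
            if len(ext) > len(best_ext):
                best_ext = ext
        if not best_ext:                        # no read reaches past the last node
            break
        order += best_ext
        seen.update(abs(g) for g in best_ext)
    return order
```
Step 5 (in-between nodes) would then be filled by placing, between consecutive ordered groups, the groups traversed by the reads spanning them.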
## General order of different signatures
```python
def pdistance(p1,p2):
    return len([1 for i,x in enumerate(p1) if x!=p2[i]])

def find_related_signatures(p1):
    rids=Counter()
    for pnode in pattern_map[p1]:
        for rid in lrr.node_reads[pnode]:
            rids[abs(rid)]+=1
    rids=[k for k,v in rids.items() if v>5]
    nc=Counter()
    print("Processing", len(rids), "reads")
    t=0
    for rid in rids:
        t+=1
        if t%1000==0: print(t,len(nc))
        #print(rid)
        pcounter = Counter( [cf2.get_pattern(ws.sdg.get_nodeview(m.node)) for m in lrr.read_perfect_matches[rid]])
        for k,v in pcounter.items():
            if v>5:
                nc[k]+=1
    for x in nc.most_common(10): print(x,pdistance(x[0],p1))
```
```python
def plrseg(rid,cf,drop_last=False):
    for x in zip(*[cf.get_pattern(ws.sdg.get_nodeview(m.node)) for m in lrr.read_perfect_matches[rid] if not drop_last or cf.get_pattern(ws.sdg.get_nodeview(m.node))[-1]=='0']):
        print('> ',''.join(x).replace('1','*').replace('0',' '),'\t','|' * int(10*x.count('1')/len(x)))

def print_spectra(kc,lim):
    print(', '.join(kc.list_names()))
    s=[kc.count_spectra(n,lim) for n in kc.list_names()]
    for xs in zip(*s):
        print(', '.join([str(x) for x in xs]))
```
## Creating a better pattern filter for a read
Strategy:
- Pre-filtering of "present-everywhere" signal:
- if there are up to 75% present-everywhere matches, just ignore them. More than 75% -> discard the read.
- Pre-filtering of hapil noise:
- In regions where hapil is often absent (30%), just discard all nodes that have a hapil component and work on a signal for the rest. Use that signal to filter, regardless of whether hapil is present or not.
- Signal finding:
```python
from collections import Counter
def best_patterns(patterns,verbose=False):
    if len(patterns)<10:
        if verbose: print('less than 10 matches, giving up!')
        return []
    all_1s='1'*len(patterns[0])
    pc=Counter(patterns)
    t=len(patterns)
    if pc[all_1s]>=t*.75:
        if verbose: print('Too many 1s, giving up!')
        return []
    t-=pc[all_1s]
    pc[all_1s]=0
    hc=sum([c for p,c in pc.items() if p[-1]=='1']) #count patterns with hapil
    if hc>t*.7:
        if verbose: print('Too many patterns with hapil, using full pattern')
        fpc=pc
        ft=t
    else:
        if verbose: print('Discarding',hc,'/',t,'patterns with hapil')
        fpc=Counter()
        ft=0
        for p,c in pc.items():
            if p[-1]!='1':
                fpc[p]=c
                ft+=c
    w=fpc.most_common(1)[0][0]
    if verbose: print("most common pattern:",fpc.most_common(1),'/',ft)
    if fpc[w]>.60*ft:
        if w[-1]=='1': return [w]
        else: return [w,w[:-1]+'1']
    else:
        if verbose: print('Winner not 60% of filtered matches, giving up')
        return []

def filter_read(rid,lrr,cf,verbose=False):
    patterns=[cf.get_pattern(ws.sdg.get_nodeview(m.node)) for m in lrr.read_perfect_matches[rid]]
    bp=best_patterns(patterns,verbose)
    if verbose: print("winning patterns:",bp)
    fm=[]
    for i,p in enumerate(patterns):
        if p in bp:
            fm.append(lrr.read_perfect_matches[rid][i])
    return fm

cf3=SDG.CountFilter('rgxh31','sdg',100,kc.list_names()[1:],[2 for x in kc.list_names()[1:-1]]+[30])
srpm2=[filter_read(rid,lrr,cf3) for rid in range(len(lrr.read_perfect_matches))]
for i in range(len(srpm2)):
    lrr.read_perfect_matches[i]=srpm2[i]
lrr.recruit(1,1)
lrr.thread_reads(end_matches=1)
trg=lrr.dg_from_threads(True)
lu=SDG.LinkageUntangler(trg)
lu.select_all()
nsl3=lu.make_nextselected_linkage(3)
nsl5=lu.make_nextselected_linkage(5)
nsl8=lu.make_nextselected_linkage(8)
def get_connected_component(dg,nid,signed=True):
    component_nodes=set([nid])
    last_added=set([nid])
    while last_added:
        new_last_added=set()
        for nid in last_added:
            nv=dg.get_nodeview(nid)
            for o in nv.next()+nv.prev():
                onid=o.node().node_id()
                if not signed: onid=abs(onid)
                if onid not in component_nodes:
                    component_nodes.add(onid)
                    new_last_added.add(onid)
        last_added=new_last_added
    return component_nodes

def dump_dg_to_text(filename, dg, node_id=None):
    if node_id is not None:
        print("Collecting subgraph")
        all_nodes = [dg.get_nodeview(node) for node in get_connected_component(dg, node_id, signed=False)]
    else:
        print("Collecting entire graph")
        all_nodes = dg.get_all_nodeviews()
    print('Writing output')
    with open(filename, 'w') as dg_file:
        for nv in all_nodes:
            ## Nodes fw
            for nf in nv.next():
                source=-nv.node_id()
                dest=nf.node().node_id()
                distance=nf.distance()
                support_id=nf.support().id
                support_index=nf.support().index
                if abs(source)<=abs(dest):
                    dg_file.write("%s %s %s %s %s\n" %(source, dest, distance, support_index, support_id))
            ## Nodes bw
            for nf in nv.prev():
                dest=-nf.node().node_id()
                source=nv.node_id()
                distance=nf.distance()
                support_id=nf.support().id
                support_index=nf.support().index
                if abs(source)<=abs(dest):
                    dg_file.write("%s %s %s %s %s\n" %(source, dest, distance, support_index, support_id))
    return None

def load_dg_from_text(dg, filename):
    with open(filename, 'r') as linkfile:
        for line in linkfile:
            sline = line.strip().split(" ")
            sup = SDG.Support(SDG.SupportType.stUndefined, int(sline[3]), int(sline[4]))
            dg.add_link(int(sline[0]), int(sline[1]), int(sline[2]), sup)
    return None
```
```python
def process_reads(first,end):
    cf3=SDG.CountFilter('rgxh31','sdg',100,kc.list_names()[1:],[2 for x in kc.list_names()[1:-1]]+[30])
    return [filter_read(rid,lrr,cf3) for rid in range(first,end)]
```
## Next problem up
Do a trg (multi=False).
Then basically disconnecting (i.e. popping) the nodes from the trg should solve the graph. Every time a noisy node is popped out we recover information in the non-noisy nodes.
Only problem: popping only works if there is a clear next/prev; if a read goes twice through the same node, it is impossible to pop. We can either discard these reads when making the threaded graph, or accept the disconnection imposed by removing rather than popping in that particular read (i.e. if a read goes twice through the same node, we disconnect it into the subcomponents between the node).
```python
from statistics import median
def pop_node(dg,node_id,read):
    #if read==8577436: print("pop %d from %d"%(node_id,read))
    nv=dg.get_nodeview(node_id)
    ln=[l for l in nv.next() if l.support().id==read]
    lp=[l for l in nv.prev() if l.support().id==read]
    if len(ln)!=1 or len(lp)!=1 or abs(ln[0].node().node_id())==abs(node_id) or abs(lp[0].node().node_id())==abs(node_id):
        #print("can't pop %d from %d"%(node_id,read))
        return False
    td=ln[0].distance()+nv.size()+lp[0].distance()
    dg.add_link(-lp[0].node().node_id(),ln[0].node().node_id(),td,lp[0].support())
    dg.remove_link(-node_id,ln[0].node().node_id(),ln[0].distance(),ln[0].support())
    dg.remove_link(-lp[0].node().node_id(),node_id,lp[0].distance(),lp[0].support())
    return True

def make_summary_connection_graph(mdg,links_to_connect=3):
    """returns a new dg with only one connection per node end pair (i.e. mdg to dg)"""
    conn={}
    dg=SDG.DistanceGraph(mdg.sdg)
    for nv in mdg.get_all_nodeviews(include_disconnected=False,both_directions=True):
        for ln in nv.next():
            ck=(-nv.node_id(),ln.node().node_id())
            if ck[0]<=ck[1]:
                if ck not in conn: conn[ck]=[]
                conn[ck].append(ln.distance())
    for c,d in conn.items():
        if len(d)>=links_to_connect:
            dg.add_link(c[0],c[1],int(median(d)))
    return dg

def write_connected_nodes_graph(filename,dg):
    dg.write_to_gfa1(filename,selected_nodes=[x.node_id() for x in dg.get_all_nodeviews(include_disconnected=False)])
```