# Strawberry Sort: clean version

**Note: this note is to be maintained by bj only, please do not muddle with it. It will contain the current "best version" of the full RGxHapil strawberry pipeline.**

Index:

[toc]

---

## Step 1: DBG and anchor runs

The idea behind this step is two-fold:

- Create an "inventory" of the 31-mer spectrum that will be used in subsequent steps to define what a "unique anchor" is. The unique anchors are unitig regions that have all their 31-mers between MIN_UKMER_FREQ and MAX_UKMER_FREQ (i.e. a "single peak").
- To make the graph simpler to start with, we create unitigs from a k=63 DBG built with a minimum k-mer coverage (MIN_CVG), which removes most of the sequencing noise.

```python=
import sys
sys.path.append('/hpc-home/ggarcia/git_sources/bsg/build/SDGpython')
import SDGpython as SDG
import os

MIN_UKMER_FREQ=57
MAX_UKMER_FREQ=135
K=31
## SDG-DBG assembly k=63
## Min coverage is important to reduce the noise
GK=63
MIN_CVG=30
NUM_BATCHES=8

ws=SDG.WorkSpace()
peds=ws.add_paired_reads_datastore('./data/strbrg_pe.prseq', 'pe')
gm=SDG.GraphMaker(ws.sdg)
gm.new_graph_from_paired_datastore(peds,K,MIN_CVG,NUM_BATCHES); #this is so the initial kmer count includes all 31-mers
print("First graph done")
kc = ws.add_kmer_counter('pek31', K)
kc.add_count('pe', peds)
print("kmers counted")
gm.new_graph_from_paired_datastore(peds,GK,MIN_CVG,NUM_BATCHES); #this is the real DBG k
print("second graph done")
ws.sdg.write_to_gfa1("./00-sdg_dbg_graph_MC%s.gfa" %MIN_CVG)
gc=SDG.GraphContigger(ws)
gc.clip_tips(2*GK-1,3)
print("Remaining nodes: %s" %(len(ws.sdg.get_all_nodeviews())))
kc.update_graph_counts()
ws.dump("./00-sdg_dbg_graph_MC%s.sdgws" %MIN_CVG)

## Check step1
passed=True
ccc=zip(kc.count_spectra('pe',present_in_graph=False),kc.count_spectra('pe',present_in_graph=True))
for i,x in list(enumerate(ccc))[MIN_UKMER_FREQ:MAX_UKMER_FREQ+1]:
    if x[0]!=x[1]:
        print(x[0]-x[1],"Kmers are missing at f=",i," -> ",x)
        passed=False
if passed: print("Check passed!!!!")
else: print("Check FAILED!!!!")

## Make unique kmer runs
seqs = gc.contig_reduction_to_unique_kmers(MIN_UKMER_FREQ,MAX_UKMER_FREQ)
print(len(seqs),"sequences for anchoring")
with open("./pseudoLong2.fasta","w") as f:
    for i, s in enumerate(seqs):
        f.write(">seq_%s\n%s\n"%(i,s))
```

---

## Step 2: workspace, KmerCounters, and nanopore mapping

The nanopore reads are mapped against the anchor sequences with a k=31 index; every unique mapping is kept.
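The commands for this step are not recorded in this note yet. The sketch below is only a reconstruction inferred from the objects that Steps 3 and 4 rely on (`ws`, `kc`, `lords`, `lrr` and the `lords_kmerruns.mapped.data` dump); the counter name and the mapping call itself are assumptions, not the commands that were actually run.

```python=
# Hypothetical reconstruction of Step 2 -- to be replaced with the real commands.
import SDGpython as SDG

ws=SDG.WorkSpace()
ws.sdg.load_from_fasta('pseudoLong2.fasta')  # anchor runs produced by Step 1

# k=31 counts used by the CountFilter in Step 3 (counter name taken from that call;
# the counts added to it are not recorded here)
kc=ws.add_kmer_counter('rgxh31', 31)

# nanopore datastore and recruiter, as re-created at the start of Step 4
lords=ws.add_long_reads_datastore('data/strbrg_nanopore.loseq','nanopore')
lrr=SDG.LongReadsRecruiter(ws.sdg,lords,31)

# ... run the k=31 perfect mapping here, keeping every unique mapping
# (the exact call is not captured in this note) ...

lrr.dump('lords_kmerruns.mapped.data')  # this dump is loaded back in Step 4
```

Step 3 then filters `lrr.read_perfect_matches` and dumps the filtered recruiter as `lrr_after_segfilterv2.dump`, which Step 4 loads into `lrrf` alongside the unfiltered `lrr`.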
---

## Step 3: filtering nanopore reads by F1 pattern

```python=
from collections import Counter

def best_patterns(patterns,verbose=False):
    if len(patterns)<10:
        if verbose: print('less than 10 matches, giving up!')
        return []
    all_1s='1'*len(patterns[0])
    pc=Counter(patterns)
    t=len(patterns)
    if pc[all_1s]>=t*.75:
        if verbose: print('Too many 1s, giving up!')
        return []
    t-=pc[all_1s]
    pc[all_1s]=0
    hc=sum([c for p,c in pc.items() if p[-1]=='1']) #count patterns with hapil
    if hc>t*.7:
        if verbose: print('Too many patterns with hapil, using full pattern')
        fpc=pc
        ft=t
    else:
        if verbose: print('Discarding',hc,'/',t,'patterns with hapil')
        fpc=Counter()
        ft=0
        for p,c in pc.items():
            if p[-1]!='1':
                fpc[p]=c
                ft+=c
    w=fpc.most_common(1)[0][0]
    if verbose: print("most common pattern:",fpc.most_common(1),'/',ft)
    if fpc[w]>.60*ft:
        if w[-1]=='1': return [w]
        else: return [w,w[:-1]+'1']
    else:
        if verbose: print('Winner not 60% of filtered matches, giving up')
        return []

def filter_read(rid,lrr,cf,verbose=False):
    patterns=[cf.get_pattern(ws.sdg.get_nodeview(m.node)) for m in lrr.read_perfect_matches[rid]]
    bp=best_patterns(patterns,verbose)
    if verbose: print("winning patterns:",bp)
    fm=[]
    for i,p in enumerate(patterns):
        if p in bp: fm.append(lrr.read_perfect_matches[rid][i])
    return fm

cf3=SDG.CountFilter('rgxh31','sdg',100,kc.list_names()[1:],[2 for x in kc.list_names()[1:-1]+[30]])
srpm2=[filter_read(rid,lrr,cf3) for rid in range(len(lrr.read_perfect_matches))]
for i in range(len(srpm2)):
    lrr.read_perfect_matches[i]=srpm2[i]
lrr.dump('lrr_after_segfilterv2.dump')
```

---

## Step 4: multi-component thread dg partial orders

Load previous status:

```python=
import SDGpython as SDG

ws=SDG.WorkSpace()
ws.sdg.load_from_fasta('pseudoLong2.fasta')
lords = ws.add_long_reads_datastore('data/strbrg_nanopore.loseq','nanopore')
lrr=SDG.LongReadsRecruiter(ws.sdg,lords,31)
lrr.load('lords_kmerruns.mapped.data')
lrrf=SDG.LongReadsRecruiter(ws.sdg,lords,31)
lrrf.load('lrr_after_segfilterv2.dump')
```

```python=
def extend_solution(tgs, min_support=2,min_shared=10,min_new=10):
    finished=False
    for step in range(10000):
        print("\nStarting expansion step",step)
        s=sorted(tgs.sort_graph().items(), key=lambda x: x[1])
        if s: print("Current sorted size:",s[-1][1]-s[0][1])
        else: print("No sorted size")
        # print("Current sorted size:",s[-1][1]-s[0][1])
        added=0
        for nodes in [[x[0] for x in s[:30]],[x[0] for x in s[-30:]]]:
            rids=set()
            for nid in nodes:
                rids.update(tgs.rids_from_node(trg_nt.get_nodeview(nid))) #trg_nt is the global thread graph built below
            while rids:
                to_add=0
                largest_new=min_new-1
                to_add_old=0
                for rid in rids:
                    if rid in tgs.used_reads: continue
                    old,new=tgs.evaluate_read(rid)
                    if old>=min_shared and new>largest_new:
                        to_add=rid
                        largest_new=new
                        to_add_old=old
                if to_add:
                    print("adding read %d with %d new and %d old nodes" % (to_add,largest_new,to_add_old))
                    rno=list(tgs.evaluate_read_nodeorder(to_add))
                    print(rno)
                    print("%d ordered, %d unordered and %d unused nodes" %(rno[0], rno[1], rno[2]))
                    if rno[1]:
                        print("skipping rid with unordered nodes")
                        rids.remove(to_add)
                    else:
                        tgs.add_read(to_add,min_support)
                        added+=1
                        break
                else:
                    print("can't find any read to add!!!")
                    break
        if not added:
            print("can't find any read to add on both directions!!!")
            finished=True
            break
        if finished: break

class CCSorter():
    """Takes an initial node and builds all TGSs needed to sort most long reads"""
    def __init__(self,trg_nt,founding_node,min_nodes_per_tracked_read=30):
        self.sorters=[]
        self.trg_nt=trg_nt
        self.founding_node=founding_node
        self.master_tgs=SDG.TheGreedySorter(trg_nt,founding_node)
        #for every read, check how many nodes it's got, and keep track of every read with more than X nodes
        self.read_nodes={}
        for rid,ends in self.master_tgs.read_ends.items():
            rnodes=set()
            nv=self.trg_nt.get_nodeview(ends[0])
            while True:
                try:
                    rnodes.add(abs(nv.node_id()))
                    nv=[x for x in nv.next() if x.support().id==rid][0].node()
                except:
                    break
            if len(rnodes)>=min_nodes_per_tracked_read:
                self.read_nodes[rid]=rnodes

    def read_status(self,rid):
        used=unused=0
        sortersets=[set(list(s.used_nodes)) for s in self.sorters]
        for nid in self.read_nodes[rid]:
            for s in sortersets:
                if nid in s:
                    used+=1
                    break
            else:
                unused+=1
        return used,unused

    def run_new_sorter(self):
        possible_starts=[]
        for rid in self.read_nodes.keys():
            used,unused=self.read_status(rid)
            if used<3:
                possible_starts.append((rid,used,unused))
        possible_starts.sort(key=lambda x:(-x[1],x[2]),reverse=True)
        if len(possible_starts)<10:
            print("less than 10 possible starts, aborting")
            return
        self.sorters.append(SDG.TheGreedySorter(self.trg_nt,self.founding_node))
        self.sorters[-1].start_from_read(possible_starts[0][0])
        extend_solution(self.sorters[-1])

    def read_usage(self):
        possible_starts=[]
        used_reads=unused_reads=0
        for rid in self.read_nodes.keys():
            used,unused=self.read_status(rid)
            if used>10: used_reads+=1
            else: unused_reads+=1
        return used_reads,unused_reads

    def make_all_orders(self):
        old_order_count=-1
        while len(self.sorters)>old_order_count:
            old_order_count=len(self.sorters)
            self.run_new_sorter()
```

```python=
lrrf.thread_reads(end_matches=1)
trg_nt=lrrf.dg_from_threads(False,True)
ccs100=trg_nt.get_all_connected_components(100)
multisorters=[]
for x in ccs100:
    multisorters.append(CCSorter(trg_nt,list(x)[0]))
    multisorters[-1].make_all_orders()
```

---

## Step 5: total order

```python=
lrr.thread_reads(end_matches=1)
trg_ntu=lrr.dg_from_threads(False,True,20)
tgsu=SDG.TheGreedySorter(trg_ntu)
rtgsu=SDG.TheGreedySorter(trg_ntu)
```

**TODO**

- Project read links across the order.
- Find unordered nodes already included in the order and pop them.
- For each read mapped to the node, walk fw and bw until hitting a node included in the order, and check the consistency (a sketch of this walk is given below).
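As a starting point for the last TODO item, the snippet below is a minimal, untested sketch of that walk: from a suspect node it follows each read's links in the thread graph until it reaches a node that is already in the order, and reports the ordered node/position it lands on fw and bw so the placements can be compared. It only uses nodeview/link accessors that appear elsewhere in this note; `walk_to_order` and `check_node_consistency` are hypothetical helpers, not existing SDG calls.

```python=
# Hypothetical sketch for the fw/bw consistency walk described in the TODO above.
def walk_to_order(rtg, order, nid, rid, fw=True, max_steps=1000):
    """Walk fw (or bw) from node nid along read rid in the thread graph rtg and
    return (node, position) for the first node already present in `order`."""
    nv = rtg.get_nodeview(nid) if fw else rtg.get_nodeview(nid).rc()
    for _ in range(max_steps):
        ln = [l for l in nv.next() if l.support().id == rid]  # follow this read only
        if not ln: return None  # the read ends before reaching the order
        nv = ln[0].node()
        for onid in (nv.node_id(), -nv.node_id()):
            if onid in order: return (onid, order[onid])
    return None

def check_node_consistency(rtg, order, nid):
    """For every read touching nid, report the ordered node/position reached fw and bw."""
    nv = rtg.get_nodeview(nid)
    hits = []
    for rid in set(l.support().id for l in nv.next() + nv.prev()):
        hits.append((rid,
                     walk_to_order(rtg, order, nid, rid, fw=True),
                     walk_to_order(rtg, order, nid, rid, fw=False)))
    return hits
```

Something like `check_node_consistency(trg_ntu, tgsu.sort_graph(), nid)` would then show, per read, where a suspect node lands relative to the current order; wildly different positions across reads point at a node that should be popped.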
```python= from datetime import datetime def write_connected(dg,filename): dg.write_to_gfa1(filename,selected_nodes=[x.node_id() for x in dg.get_all_nodeviews(include_disconnected=False)]) def printlog(*args, **kwargs): print(datetime.now().strftime('%H:%M:%S'),*args, **kwargs) def local_happiness_assembler(tgs, rtgs, reads_tg, starting_rid, min_support=2, max_unhappy=0, max_disconnected_perc=.4,min_shared=10,min_new=10,max_unhappy_nodes_in_new_read=10,max_steps=10000000): tgs.start_from_read(starting_rid,min_support) for stepnumber in range(max_steps): printlog("step #",stepnumber,"starting") write_connected(tgs.dg,"step%d_a-starting.gfa"%(stepnumber)) # remove unhappy nodes from order printlog("sorting graph") order=tgs.sort_graph() printlog("full order hapiness assessment") nbh=[(x[0],x[1],SDG.assess_node_happiness(x[0],order,reads_tg)) for x in sorted(order.items(),key=lambda x:x[1])] # (nid, pos, (happy, unhappy, disconnected)) #first remove all nodes that are too disconnected: they won't get any better printlog("removing disconnected") unhappy_nodes=[] removed_disconnected=removed_unhappy=0 for x in nbh: if x[2][0]==0 or 1.0*x[2][2]/sum(x[2])>max_disconnected_perc: #pop_node_from_all(tgs.dg,) #print(x, "should go!") tgs.remove_node(x[0]) removed_disconnected+=1 elif x[2][1]>max_unhappy: unhappy_nodes.append(x) printlog("removing unhappy") #now remove any unhappy nodes unhappy_nodes.sort(key=lambda x:x[2][1]) for x in unhappy_nodes: h=SDG.assess_node_happiness(x[0],tgs.sort_graph(),reads_tg) if h[1]>max_unhappy: tgs.remove_node(x[0]) removed_unhappy+=1 printlog("%d disconnected and %d unhappy nodes removed from order"%(removed_disconnected,removed_unhappy)) write_connected(tgs.dg,"step%d_b-cleaned.gfa"%(stepnumber)) ###TODO: THIS IS OLD CODE! added=0 s=sorted(tgs.sort_graph().items(),key=lambda x:x[1]) for nodes in [[x[0] for x in s[:30]],[x[0] for x in s[-30:]]]: rids=set() for nid in nodes: rids.update(tgs.rids_from_node(reads_tg.get_nodeview(nid))) printlog("choosing among %d candidate reads" % len(rids)) while rids: to_add=0 largest_new=min_new-1 to_add_old=0 for rid in rids: if rid in tgs.used_reads: continue old,new=tgs.evaluate_read(rid) if old>=min_shared and new>largest_new: to_add=rid largest_new=new to_add_old=old if to_add: printlog("evaluating read %d with %d new and %d old nodes" % (to_add,largest_new,to_add_old)) rnv=reads_tg.get_nodeview(tgs.read_ends[to_add][0]) unhappy=[] while True: if rnv.node_id() in tgs.sort_graph() or -rnv.node_id() in tgs.sort_graph(): h=SDG.assess_node_happiness(rnv.node_id(), tgs.sort_graph(), reads_tg) if h[1]>max_unhappy: unhappy.append(rnv.node_id()) rnvn=[x for x in rnv.next() if x.support().id==to_add] if not rnvn:break rnv=rnvn[0].node() if len(unhappy)>max_unhappy_nodes_in_new_read: print("skipping rid with unordered nodes") rids.remove(to_add) else: printlog("adding read") tgs.add_read(to_add,min_support) write_connected(tgs.dg,"step%d_c-read_added%d.gfa"%(stepnumber,added)) for u in unhappy: tgs.pop_node(u,to_add) added+=1 write_connected(tgs.dg,"step%d_c-read_added%d_popped.gfa"%(stepnumber,added)) break else: print("can't find any read to add!!!") break if added==0: print("no read was added") break # find a read to add that makes the order grow and has few unhappy nodes printlog("resorting order") s=sorted(tgs.sort_graph().items(),key=lambda x:x[1]) printlog("order is now",s[-1][1],"bp long") # if no read found: STOP # remove unhappy nodes from read until all read is happy # add read to order ``` ```python= 
local_happiness_assembler(tgsu,rtgsu,trg_ntu,11676965,max_steps=30)
```

TODO: before adding a read, measure its nodes' future happiness, remove future unhappy nodes, but only use reads that have few of them.

TODO: choose reads not by "nodes shared/new" but by growing the order in one direction.

Future happiness: a node with good connections fw/bw to the order (in the raw graph). Keep a "nodes used per read" count for all reads in the TGS. Future happy nodes appear in reads that have a high "nodes used per read" count.

In TheGreedySorter:

- NodesUsedPerRead: keep a uint64_t vector, one entry per read, counting how many nodes of that read are included in the order.
- If a node "belongs" in the order, all the reads mapping to the node in the raw graph will have HIGH NodesUsedPerRead values.
- If a node "belongs" in a parallel order, all the reads mapping to the node will have low or null NodesUsedPerRead values.
- Nodes towards the end of the order will have some reads that span in the direction the order has not yet expanded to, so their values will always be low. We need to judge a node's belonging only on reads that go towards the order (or allow some of the reads to have a small value, which would not hurt anyway).

To add a read to the order, pick one where all its nodes have a high belonging value.

```
c=Counter()
for nv in tgsu.dg.get_all_nodeviews(include_disconnected=False):
    nv=trg_ntu.get_nodeview(nv.node_id())
    c.update(list(set([x.support().id for x in nv.next()+nv.prev()])))
```

Now the counter has, for every read, how many nodes it shares with the order. If a node should go in the order, all its reads should have a high shared value with the order.

Node future happiness: how many reads the node shares with other nodes in the haplotype (ish). Read future happiness: the sum of all its nodes' happiness. You want to add reads that have a certain amount of shared nodes (i.e. 10/20) and a really high read future happiness.

```python=
def reverse_order(dg,order):
    end=max([x[1]+dg.get_nodeview(x[0]).size() for x in order.items()])
    reverse_order={}
    for x in order.items():
        reverse_order[-x[0]]=end-x[1]-dg.get_nodeview(x[0]).size()
    return reverse_order

def score_read_to_add(tgs,rid,reverse=False):
    """gives a score to add the read fw, or bw if reversed"""
    order=tgs.sort_graph()
    if reverse: order=reverse_order(trg_ntu,order)
    rtn=tgs.thread_nodes(rid)
    #first figure out what the relative position of the read is
    fw=bw=0
    for nid in rtn:
        if nid in order: fw+=1
        elif -nid in order: bw+=1
    #print("read",rid,"shared nodes with order:",fw,"fw,",bw,"bw")
    if bw>fw:
        #print("reversing read")
        rtn=[-x for x in rtn[::-1]]
        rid=-rid
        bw,fw=fw,bw
    if fw<10 or bw>2:
        #print("read does not have 10+ nodes FW and <3 BW, scoring None")
        return None
    tnodes_in_order=[x for x in enumerate(rtn) if x[1] in order]
    last_order_nodes=[x[0] for x in sorted(order.items(),key=lambda x:x[1])[-30:]]
    if tnodes_in_order[-1][1] not in last_order_nodes: return None
    #TODO: validate the order and such?
#nodes in order, position in read of last_node_in_order, nodes after return (len(tnodes_in_order),tnodes_in_order[-1][0]+1,len(rtn)-tnodes_in_order[-1][0]) def find_reads_to_add(tgs,reverse=False,last_nodes=30, min_hit_perc=.30, min_new=20): rids=set() order=sorted(tgs.sort_graph().items(),key=lambda x:x[1]) if reverse: nodes=[x[0] for x in order[:last_nodes]] else: nodes=[x[0] for x in order[-last_nodes:]] for nid in nodes: for rid in tgs.rids_from_node(trg_ntu.get_nodeview(nid)): rids.add(rid) rta=[] for rid in rids: s=score_read_to_add(tgs,rid,reverse) if s is None: continue if s[0]/s[1]>=min_hit_perc and s[2]>=min_new: rta.append((rid,s)) #print(rid,s,'%.2f'%(100.0*s[0]/s[1])) #else: print(rid,s) return sorted(rta,key=lambda x:-x[1][2]) from datetime import datetime def write_connected(dg,filename): dg.write_to_gfa1(filename,selected_nodes=[x.node_id() for x in dg.get_all_nodeviews(include_disconnected=False)]) def printlog(*args, **kwargs): print(datetime.now().strftime('%H:%M:%S'),*args, **kwargs) def local_happiness_assembler(tgs, rtgs, reads_tg, starting_rid, min_support=2, max_unhappy=0, max_disconnected_perc=.4, min_shared=10, min_new=10, max_unhappy_nodes_in_new_read=10, max_steps=10000000): tgs.start_from_read(starting_rid,min_support) for stepnumber in range(max_steps): printlog("step #",stepnumber,"starting") write_connected(tgs.dg,"step%d_a-starting.gfa"%(stepnumber)) # remove unhappy nodes from order printlog("sorting graph") order=tgs.sort_graph() printlog("full order hapiness assessment") nbh=[(x[0],x[1],SDG.assess_node_happiness(x[0],order,reads_tg)) for x in sorted(order.items(),key=lambda x:x[1])] # (nid, pos, (happy, unhappy, disconnected)) #first remove all nodes that are too disconnected: they won't get any better printlog("removing disconnected") unhappy_nodes=[] removed_disconnected=removed_unhappy=0 for x in nbh: if x[2][0]==0 or 1.0*x[2][2]/sum(x[2])>max_disconnected_perc: #pop_node_from_all(tgs.dg,) #print(x, "should go!") tgs.remove_node(x[0]) removed_disconnected+=1 elif x[2][1]>max_unhappy: unhappy_nodes.append(x) printlog("removing unhappy") #now remove any unhappy nodes unhappy_nodes.sort(key=lambda x:x[2][1]) for x in unhappy_nodes: h=SDG.assess_node_happiness(x[0],tgs.sort_graph(),reads_tg) if h[1]>max_unhappy: tgs.remove_node(x[0]) removed_unhappy+=1 printlog("%d disconnected and %d unhappy nodes removed from order"%(removed_disconnected,removed_unhappy)) write_connected(tgs.dg,"step%d_b-cleaned.gfa"%(stepnumber)) added=0 for rids in [find_reads_to_add(tgs),find_reads_to_add(tgs,True)]: while rids: to_add=rids[0][0] if to_add in tgs.used_reads: print("skipping already-used read") rids=rids[1:] continue printlog("evaluating read",rids[0]) rnv=reads_tg.get_nodeview(tgs.read_ends[to_add][0]) unhappy=[] while True: if rnv.node_id() in tgs.sort_graph() or -rnv.node_id() in tgs.sort_graph(): h=SDG.assess_node_happiness(rnv.node_id(), tgs.sort_graph(), reads_tg) if h[1]>max_unhappy: unhappy.append(rnv.node_id()) rnvn=[x for x in rnv.next() if x.support().id==to_add] if not rnvn:break rnv=rnvn[0].node() if len(unhappy)>max_unhappy_nodes_in_new_read: print("skipping rid with unordered nodes") rids=rids[1:] else: printlog("adding read") tgs.add_read(to_add,min_support) write_connected(tgs.dg,"step%d_c-read_added%d.gfa"%(stepnumber,added)) for u in unhappy: tgs.pop_node(u,to_add) write_connected(tgs.dg,"step%d_c-read_added%d_popped.gfa"%(stepnumber,added)) added+=1 break if added==0: print("no read was added") break # find a read to add that makes the order grow 
and has few unhappy nodes printlog("resorting order") s=sorted(tgs.sort_graph().items(),key=lambda x:x[1]) printlog("order is now",s[-1][1],"bp long") # if no read found: STOP # remove unhappy nodes from read until all read is happy # add read to order ``` Strategy #1: Partial orders from Longer reads 1 - Take all reads that have more than x nodes. 2 - Filter their nodes to a curated list of "happy nodes in the read". use this in a single read: ```python= # def sort_a_read(tgs,rid,reads_tg,max_unhappy=1,max_disconnected_perc=.4): # tgs.start_from_read(rid,0) # printlog("sorting graph") # order=tgs.sort_graph() # printlog("full order hapiness assessment") # nbh=[(x[0],x[1],SDG.assess_node_happiness(x[0],order,reads_tg)) for x in sorted(order.items(),key=lambda x:x[1])] # (nid, pos, (happy, unhappy, disconnected)) # #first remove all nodes that are too disconnected: they won't get any better # printlog("removing disconnected") # unhappy_nodes=[] # removed_disconnected=removed_unhappy=0 # for x in nbh: # if x[2][0]==0 or 1.0*x[2][2]/sum(x[2])>max_disconnected_perc: # #pop_node_from_all(tgs.dg,) # #print(x, "should go!") # tgs.remove_node(x[0]) # removed_disconnected+=1 # elif x[2][1]>max_unhappy: # unhappy_nodes.append(x) # printlog("removing unhappy") # #now remove any unhappy nodes # unhappy_nodes.sort(key=lambda x:x[2][1]) # for x in unhappy_nodes: # h=SDG.assess_node_happiness(x[0],tgs.sort_graph(),reads_tg) # if h[1]>max_unhappy: # tgs.remove_node(x[0]) # removed_unhappy+=1 # printlog("%d disconnected and %d unhappy nodes removed from order"%(removed_disconnected,removed_unhappy)) # return tgs.sort_graph() class HappySorter(): def __init__(self,sorter,all_threads_graph): self.sorted_graph=SDG.DistanceGraph(all_threads_graph.sdg) self.used_reads=set() self.sorter=sorter self.all_threads_graph=all_threads_graph def thread_size(self,rid): ends=self.sorter.get_thread_ends(rid) e0ix=[x.support().index for x in self.all_threads_graph.get_nodeview(ends[0]).next() if x.support().id==rid][0] e1ix=[x.support().index for x in self.all_threads_graph.get_nodeview(ends[1]).next() if x.support().id==rid][0] #print(e0ix,e1ix) return max([e0ix,e1ix])+1 # def sort_a_read(self,rid,max_unhappy=1,max_disconnected_perc=.4): # self.sorter.start_from_read(rid,0) # printlog("sorting graph") # order=self.sorter.sort_graph() # printlog("full order hapiness assessment") # nbh=[(x[0],x[1],SDG.assess_node_happiness(x[0],order,self.all_threads_graph)) for x in sorted(order.items(),key=lambda x:x[1])] # (nid, pos, (happy, unhappy, disconnected)) # #first remove all nodes that are too disconnected: they won't get any better # printlog("removing disconnected") # unhappy_nodes=[] # removed_disconnected=removed_unhappy=0 # for x in nbh: # if x[2][0]==0 or 1.0*x[2][2]/sum(x[2])>max_disconnected_perc: # #pop_node_from_all(tgs.dg,) # #print(x, "should go!") # self.sorter.remove_node(x[0]) # removed_disconnected+=1 # elif x[2][1]>max_unhappy: # unhappy_nodes.append(x) # printlog("removing unhappy") # #now remove any unhappy nodes # unhappy_nodes.sort(key=lambda x:x[2][1]) # for x in unhappy_nodes: # h=SDG.assess_node_happiness(x[0],self.sorter.sort_graph(), self.all_threads_graph) # if h[1]>max_unhappy: # self.sorter.remove_node(x[0]) # removed_unhappy+=1 # printlog("%d disconnected and %d unhappy nodes removed from order"%(removed_disconnected,removed_unhappy)) # return self.sorter.sort_graph() def start_from_read(self,rid): self.sorted_graph=SDG.DistanceGraph(self.all_threads_graph.sdg) self.used_reads=set() 
self.add_read(rid) #def add_read(self,rid): # self.used_reads.add(rid) # for x in sorted(self.sort_a_read(rid).items(),key=lambda x:x[1])[:-1]: # nid=x[0] # l=self.sorter.dg.get_nodeview(nid).next()[0] # self.sorted_graph.add_link(-nid,l.node().node_id(),l.distance(),l.support()) def add_read(self,rid): self.used_reads.add(rid) hr=SDG.make_thread_happy(lrr.read_threads[abs(rid)],self.all_threads_graph) for i in range(len(hr)-1): self.sorted_graph.add_link(-hr[i].node,hr[i+1].node,hr[i+1].start-hr[i].end,SDG.Support(id=rid)) def find_longest_threads_per_node(self,include_used=False,top=2): c=Counter() for nv1 in self.sorted_graph.get_all_nodeviews(False,False): nv=self.all_threads_graph.get_nodeview(nv1.node_id()) ml=0 mlrid=0 for l in nv.prev()+nv.next(): if include_used or l.support().id not in self.used_reads: if len(lrr.read_threads[l.support().id])>ml: mlrid=l.support().id ml=len(lrr.read_threads[l.support().id]) if ml: c[mlrid]+=1 return c def sorted_order(self): cc=self.sorted_graph.get_connected_component( self.sorted_graph.get_all_nodeviews(include_disconnected=False)[0].node_id(), signed_nodes=True) #print(cc) order = sorted(SDG.sort_cc(self.sorted_graph,cc).items(),key=lambda x:x[1]) #print(order) if len(self.sorted_graph.get_all_nodeviews(include_disconnected=False))!=len(order): print("WARNING: it appears there's more than one component in HappySorter's sorted_graph") return order def pick_for_extension(self,happy_in_order=4,end_size=10,fw=True): """picks a read fw/bw for extension, these are the furthest-reaching reads that have at least X happy nodes among the last Y nodes of the current order""" order=self.sorted_order() if fw: nids=[x[0] for x in order[-end_size:]] else: nids=[x[0] for x in order[:end_size]][::-1] rnids=[-x for x in nids] #print(len(nids)) #print(nids) c=Counter() last_link={} for nid in nids: for l in self.all_threads_graph.get_nodeview(nid).next(): tid=l.support().id if tid in self.used_reads: continue tidx=l.support().index if tid not in last_link or last_link[tid]<tidx: last_link[tid]=tidx c[tid]+=1 #print(c.most_common(10)) c2=Counter() for rid,oldc in c.items(): if oldc<happy_in_order: continue hn=[x.node for x in SDG.make_thread_happy(lrr.read_threads[abs(rid)],self.all_threads_graph)] c2[rid]= max([ len(set(hn).intersection(set(nids))), len(set(hn).intersection(set(rnids)))]) fts=[(x,self.thread_size(x)-last_link[x]) for x in c2.keys() if c2[x]>=happy_in_order] return sorted(fts,key=lambda x:x[1],reverse=True) #for x in : # print(x) # for nv1 in self.sorted_graph.get_all_nodeviews(False,False): # nv=self.all_threads_graph.get_nodeview(nv1.node_id()) # ml=0 # mlrid=0 def write_connected(self,filename): self.sorted_graph.write_to_gfa1(filename,selected_nodes=[x.node_id() for x in self.sorted_graph.get_all_nodeviews(include_disconnected=False)]) def extension_cycles(self,reads_per_end=1,cycles=10,happy_in_order=4,end_size=10): print("Running %d extension cycles with the farthest reaching %d reads"%(cycles,reads_per_end)) order=self.sorted_order() print("starting sorted length: %dbp"%(order[-1][1]-order[0][1])) for c in range(cycles): print("\n\nCycle", c) pef=self.pick_for_extension(happy_in_order=happy_in_order,end_size=end_size,fw=True) for x in range(reads_per_end): if len(pef)>x: print("read #%d fw: %d"%(x,pef[x][0])) self.add_read(pef[x][0]) peb=self.pick_for_extension(happy_in_order=happy_in_order,end_size=end_size,fw=False) for x in range(reads_per_end): if len(peb)>x: print("read #%d bw: %d"%(x,peb[x][0])) self.add_read(peb[x][0]) 
order=self.sorted_order() print("Sorted length: %dbp"%(order[-1][1]-order[0][1])) def pick_for_thickening(): def remove_read(): ``` Read nodes (rtn): [A, B, C, D, F, G, I, K] Order nodes: [P, Q, R, S, T, A, B, D, E, F] tnodes_in_order= [(0, A), (1, B), (3, D), (4, F)] return (4, 4, 4) ```python= hs=HappySorter(tgsu,trg_ntu);hs.start_from_read(491667) hs.extension_cycles(2,6) hs.add_read(3785826) hs.add_read(13253989) hs.write_connected("hsbefore.gfa") hs.add_read(14272921) hs.write_connected("hsafter.gfa") ``` ```python= #lrr2 will get modified and will have only threads with their happy nodes lrr2.simple_thread_reads() SDG.make_all_threads_happy(lrr2,trg_ntu) htrg=lrr.dg_from_threads(False,True,10)#this is largely irrelevant for the happysorter, but useful hs=HappySorter(htgs,trg_ntu);hs.start_from_read(11676965) hs.extension_cycles(10,50,4,20) ``` ```python= def all_nodes_and_distances_across_rid(nv,rid,max_distance=10000000000): nodes=[] d=-nv.size() while True: try: lf=[l for l in nv.next() if l.support().id==rid][0] except: break d+=nv.size()+lf.distance() if d>max_distance: break nodes.append((d,lf.node().node_id())) nv=lf.node() return nodes def get_all_nodes_up_to_dist(fnv,dist): all_nodes=Counter() all_rids=set() for x in fnv.next()+fnv.prev(): all_rids.add(x.support().id) for rid in all_rids: for ndist,nid in all_nodes_and_distances_across_rid(fnv,rid,dist): all_nodes[nid]+=1 for ndist,nid in all_nodes_and_distances_across_rid(fnv.rc(),rid,dist): all_nodes[-nid]+=1 return all_nodes def nodes_by_dist_and_support(fnv,dist,links): nodes=[] for nid,count in get_all_nodes_up_to_dist(fnv,dist).items(): if count>=links: nodes.append(nid) return nodes ``` ```python= def rtg_to_tube(rtg): tube={"nodes":[],"tracks":[]} tids=set() for xx in sorted(SDG.sort_cc( rtg,rtg.get_connected_component(rtg.get_all_nodeviews(include_disconnected=False)[0].node_id(),signed_nodes=True) ).items(),key=lambda x:x[1]): x=rtg.get_nodeview(abs(xx[0])) tube["nodes"].append( dict(name = "%d"%x.node_id(), width = 10, seq = "seq%d"%x.node_id() ) ) for l in x.next()+x.prev(): tids.add(l.support().id) for tnum,tid in enumerate(tids): tube["tracks"].append(dict( id="%d"%tnum, name="%d"%tid, indexOfFirstBase=1, sequence=[str(abs(np.node)) for np in rtg.get_thread(tid)], freq=1 )) return tube ``` # Cleaning a raw graph ## Removing nodes from threads where they OBVIOUSLY do not belong ```python= from collections import Counter def score_threads(rtg,nid,support=1): n1tn=rtg.node_thread_neighbours(nid) for tid in rtg.node_threads(nid): c=Counter([n1tn[abs(np.node)]>support for np in rtg.get_thread(tid)]) c[True]-=1 print("%10d %3d %3d %.2f%%"%(tid,c[True],c[False],c[True]*100.0/(c[True]+c[False]))) def clean_threads(rtg,nid,support=1): n1tn=rtg.node_thread_neighbours(nid) to_pop=[] for tid in rtg.node_threads(nid): c=Counter([n1tn[abs(np.node)]>support for np in rtg.get_thread(tid)]) c[True]-=1 if c[True]<3: to_pop.append(tid) for tid in to_pop: rtg.pop_node(nid,tid) return len(to_pop) ``` # from here on ```python= def sort_rtg(rtg): cc=rtg.get_connected_component( rtg.get_all_nodeviews(include_disconnected=False)[0].node_id(), signed_nodes=True) #print(cc) order = sorted(SDG.sort_cc(rtg,cc).items(),key=lambda x:x[1]) #print(order) if len(order)==0: print("WARNING: graph can't be sorted") elif len(rtg.get_all_nodeviews(include_disconnected=False))!=len(order): print("WARNING: there's more than one component the graph") return [] return order def thread_internal_linkage(rtg,tid): nids=[x.node for x in rtg.get_thread(tid)] 
l=[] for n1 in nids: l.append([]) for n2 in nids: l[-1].append(len(set(rtg.node_threads(n1))&set(rtg.node_threads(n2)))) return l def thread_linkage_support(rtg,tid): t=rtg.get_thread(tid) ot={} for i in range(len(t)): for nt in rtg.node_threads(t[i].node): if nt==tid: continue if nt not in ot: ot[nt]=[] ot[nt].append(i) covd=[0 for x in t] for x in ot.values(): if len(x)>1: covd[x[0]+1]+=1 covd[x[-1]]-=1 cc=0 tls=[] for x in covd: cc+=x tls.append(cc) return tls def is_chimeric(rtg,tid,min_support=1): if [x for x in thread_linkage_support(rtg,tid)[1:-1] if x<min_support]: return True return False def add_threads_from_node(rtgdest,rtgsource,node_id, discard_chimeric=True): print("adding threads from node %d"%node_id) tids=rtgsource.node_threads(node_id) if not tids: print("node has no threads in source rtg") return for tid in tids: if rtgdest.get_thread(tid): print(" x skipping thread %d already present in dest rtg"%tid) elif discard_chimeric and is_chimeric(rtgsource,tid): print(" x skipping thread %d flagged as chimeric"%tid) else: print(" v adding thread %d"%tid) rtgdest.add_thread(tid, rtgsource.get_thread(tid), False, 2) ``` ```python= rpl=rtg.clean_repeat_nodes_popping_list() len(rpl) # --> 7318860 rtg.apply_popping_list(rpl) pl=rtg.clean_all_nodes_popping_list() len(pl) # --> 14176414 (tiene que ser mas!) rtg.apply_popping_list(pl) rtg.dump("rtg_rep_and_clean.dump") lrtg=SDG.ReadThreadsGraph(ws.sdg) add_threads_from_node(lrtg,rtg,1) sort_rtg(lrtg) #-> check orders for other nodes ordered here. ``` ```python= from collections import Counter class RTGSorter(): def __init__(self,all_rtg,safe_node_cv=5,thread_anchor_nodes=5,last_safe_nodes=10): self.safe_node_cv=safe_node_cv self.thread_anchor_nodes=thread_anchor_nodes self.last_safe_nodes=last_safe_nodes self.all_rtg=all_rtg self.rtg=SDG.ReadThreadsGraph(self.all_rtg.sdg) self.used_threads=set() def find_safe_nodes(self): safe_nodes=[nv.node_id() for nv in self.rtg.get_all_nodeviews(include_disconnected=False) if len(self.rtg.node_threads(nv.node_id()))>=self.safe_node_cv] return safe_nodes def find_fw_extension_threads(self,rev=False,tan=None,last_safe_nodes=None): if last_safe_nodes is None: last_safe_nodes=self.last_safe_nodes if tan is None: tan=self.thread_anchor_nodes sn=self.find_safe_nodes() safesort=self.sort_safe_rtg() #stupidly re-make this into a dict ssd={} if rev: lsn=[x[0] for x in safesort[:last_safe_nodes]] else: lsn=[x[0] for x in safesort[-last_safe_nodes:]] for x in safesort: ssd[x[0]]=x[1] c=Counter() for x in sn: c.update([x for x in self.all_rtg.node_threads(x) if x not in self.used_threads]) ct=[x[0] for x in c.items() if x[1]>=tan] st=[] for tid in ct: thread=[x.node for x in self.all_rtg.get_thread(tid)] #print("\nanalysing candidate thread",tid," length=",len(thread)) if rev: thread=[-x for x in thread[::-1]] f=[] b=[] for n in enumerate(thread): if n[1] in ssd: f.append(n) if -n[1] in ssd: b.append(n) #print("B:",b) #print("F:",f) if len(f)==0: st.append((tid,b[0][0], len([1 for x in thread if len(self.rtg.node_threads(x))==tan-1]), len([1 for x in thread if x in lsn or -x in lsn]))) if len(b)==0: st.append((tid,len(thread)-f[-1][0]+1, len([1 for x in thread if len(self.rtg.node_threads(x))==tan-1]), len([1 for x in thread if x in lsn or -x in lsn]))) return sorted(st,key=lambda x:(-x[3],-x[2],-x[1])) def thread_linkage_support(self,rtg,tid): t=rtg.get_thread(tid) ot={} for i in range(len(t)): for nt in rtg.node_threads(t[i].node): if nt==tid: continue if nt not in ot: ot[nt]=[] ot[nt].append(i) covd=[0 for 
x in t] for x in ot.values(): if len(x)>1: covd[x[0]+1]+=1 covd[x[-1]]-=1 cc=0 tls=[] for x in covd: cc+=x tls.append(cc) return tls def is_chimeric(self,tid,min_support=1): if [x for x in self.thread_linkage_support(self.all_rtg,tid)[1:-1] if x<min_support]: return True return False def add_thread(self,tid,discard_chimeric=True): if tid in self.used_threads or self.rtg.get_thread(tid): print(" x skipping thread %d already present in dest rtg"%tid) return False elif discard_chimeric and self.is_chimeric(tid): print(" x skipping thread %d flagged as chimeric"%tid) return False else: print(" v adding thread %d"%tid) self.rtg.add_thread(tid, self.all_rtg.get_thread(tid), False, 2) self.used_threads.add(tid) return True def remove_thread(self,tid): self.used_threads.remove(tid) self.rtg.remove_thread(tid) def sort_rtg(self,rtg=None): if rtg is None: rtg=self.rtg all_connected=rtg.get_all_nodeviews(include_disconnected=False) cc=rtg.get_connected_component( all_connected[0].node_id(), signed_nodes=True) order = sorted(SDG.sort_cc(rtg,cc).items(),key=lambda x:x[1]) if len(order)==0: print("WARNING: graph can't be sorted") elif len(all_connected)!=len(order): print("WARNING: there's more than one component the graph") return [] return order def sort_safe_rtg(self,safe_cv=None): if safe_cv is None: safe_cv=self.safe_node_cv srtg=SDG.ReadThreadsGraph(self.rtg.sdg) for tid in self.used_threads: srtg.add_thread(tid, self.rtg.get_thread(tid), False, 2) to_disconnect=[nv.node_id() for nv in srtg.get_all_nodeviews(include_disconnected=False) if len(srtg.node_threads(nv.node_id()))<safe_cv] for nid in to_disconnect: srtg.pop_node_from_all(nid) return self.sort_rtg(srtg) def auto_extend(self,steps=10000): chimeric_reads=set() for step in range(steps): print("\nAuto extension step",step) ssort=self.sort_safe_rtg() print("Starting with a safe graph from %d to %d, total distance %d bp" % (ssort[0][0],ssort[-1][0],ssort[-1][1]-ssort[0][1])) extended=0 for fc in [ [x[0] for x in self.find_fw_extension_threads() if x not in chimeric_reads], [x[0] for x in self.find_fw_extension_threads(rev=True) if x not in chimeric_reads]]: for tid in fc: if tid in chimeric_reads: continue if not self.add_thread(tid): chimeric_reads.add(tid) continue ssort=self.sort_safe_rtg() if ssort==[]: print("last thread ruined safe order! 
marking as chimeric and backtracking") chimeric_reads.add(tid) self.remove_thread(tid) continue print("Extended safe graph from %d to %d, total distance %d bp" % (ssort[0][0],ssort[-1][0],ssort[-1][1]-ssort[0][1])) extended+=1 break if extended==0: print("can't find a thread to add on any end, quitting") ``` ```python= from collections import Counter class RTGThreadsCluster: def __init__(self,all_rtg): self.all_rtg=all_rtg self.rtg=SDG.ReadThreadsGraph(self.all_rtg.sdg) self.threads=set() self.used_threads=set() self.chimeric_threads=set() def thread_linkage_support(self,rtg,tid): t=rtg.get_thread(tid) ot={} for i in range(len(t)): for nt in rtg.node_threads(t[i].node): if nt==tid: continue if nt not in ot: ot[nt]=[] ot[nt].append(i) covd=[0 for x in t] for x in ot.values(): if len(x)>1: covd[x[0]+1]+=1 covd[x[-1]]-=1 cc=0 tls=[] for x in covd: cc+=x tls.append(cc) return tls def is_chimeric(self,tid,min_support=1): if [x for x in self.thread_linkage_support(self.all_rtg,tid)[1:-1] if x<min_support]: return True return False def add_thread(self,tid,discard_chimeric=True): if tid in self.used_threads or self.rtg.get_thread(tid): print(" x skipping thread %d already present in dest rtg"%tid) return False elif discard_chimeric and self.is_chimeric(tid): print(" x skipping thread %d flagged as chimeric"%tid) self.chimeric_threads.add(tid) return False else: print(" v adding thread %d"%tid) self.rtg.add_thread(tid, self.all_rtg.get_thread(tid), False, 2) self.used_threads.add(tid) return True def remove_thread(self,tid): self.used_threads.remove(tid) self.rtg.remove_thread(tid) def find_threads_to_add(self, safe_cvg=3, min_nodes=5, max_span=20): print("Collecting safe nodes and candidate threads...") tc=Counter() nc=Counter() for nv in self.rtg.get_all_nodeviews(include_disconnected=False): nt=self.rtg.node_threads( nv.node_id() ) nc[nv.node_id()]=len(nt) if len(nt)<safe_cvg: continue for tid in self.all_rtg.node_threads( nv.node_id() ): if tid in nt or tid in self.chimeric_threads: continue tc[tid]+=1 print("Checking threads to add") tcc=0 to_add=[] for tid,c in tc.items(): if c<min_nodes: continue tcc+=1 tpa=[x[0] for x in enumerate(self.all_rtg.get_thread(tid)) if nc[x[1].node]>=safe_cvg] for i in range(len(tpa)-min_nodes): if tpa[i+min_nodes]-tpa[i]<=max_span: to_add.append(tid) break print("There was",tcc,"candidates") print("There was",len(to_add),"candidates with",min_nodes,"in",max_span,"consecutive nodes included") return to_add ``` ```python= from statistics import median def make_summary_connection_graph(mdg,links_to_connect=3): """returns a new dg with only one connection per node end pair (i.e. mdg to dg)""" conn={} dg=SDG.DistanceGraph(mdg.sdg) for nv in mdg.get_all_nodeviews(include_disconnected=False,both_directions=True): for ln in nv.next(): ck=(-nv.node_id(),ln.node().node_id()) if ck[0]<=ck[1]: if ck not in conn: conn[ck]=[] conn[ck].append(ln.distance()) for c,d in conn.items(): if len(d)>=links_to_connect: dg.add_link(c[0],c[1],int(median(d))) return dg ``` Check if a node's thread can be clustered in independent haplotypes (i.e. false anchor!) ```python= #use this on every node, if the clusters many and complicated... pop this shit! 
def cluster_threads_in_node(rtg,nid,min_shared=4): tnodes={} for tid in rtg.node_threads(nid): tnodes[tid]=set([abs(x.node) for x in rtg.get_thread(tid) if abs(x.node)!=abs(nid)]) tconns={} for tid1,nodes1 in tnodes.items(): tconns[tid1]=[] for tid2,nodes2 in tnodes.items(): if len(nodes1&nodes2)<min_shared: continue tconns[tid1].append(tid2) used_tids=set() clusters=[] for tid,conns in tconns.items(): if tid in used_tids: continue clusters.append(set([tid])) used_tids.add(tid) to_explore_tids=conns while to_explore_tids: ntet=[] for tid2 in to_explore_tids: if tid2 in used_tids: continue clusters[-1].add(tid2) used_tids.add(tid2) for tid3 in tconns[tid2]: if tid3 not in used_tids: to_explore_tids.append(tid3) to_explore_tids=ntet return clusters ``` # Computing multi-clustering nodes fast - Create all sets of nodes for all threads - For every node: - Cluster threads by node sets, ordered by size - Compare size of first and second clusters: - one significant cluster: remove node from threads not in the significant cluster. - two or more significant clusters: remove node from all threads. ```python= def threads_to_nodesets(rtg): nodesets={} for nv in rtg.get_all_nodeviews(include_disconnected=False): nid=nv.node_id() for tid in rtg.node_threads(nid): if tid not in nodesets: nodesets[tid]=set() nodesets[tid].add(nid) return nodesets def cluster_threads_in_node(rtg,tnodes,nid,min_shared=4): tconns={} ntids=rtg.node_threads(nid) for tid1 in ntids: nodes1=tnodes[tid1] tconns[tid1]=[] for tid2 in ntids: nodes2=tnodes[tid2] if len(nodes1&nodes2)<min_shared: continue tconns[tid1].append(tid2) used_tids=set() clusters=[] for tid,conns in tconns.items(): if tid in used_tids: continue clusters.append(set([tid])) used_tids.add(tid) to_explore_tids=conns while to_explore_tids: ntet=[] for tid2 in to_explore_tids: if tid2 in used_tids: continue clusters[-1].add(tid2) used_tids.add(tid2) for tid3 in tconns[tid2]: if tid3 not in used_tids: to_explore_tids.append(tid3) to_explore_tids=ntet return clusters ``` ```python= rtg #this graph has all the threads single_node_rtg=SDG.ReadThreadsGraph(ws.sdg) for tid in trg.node_threads(nid): single_node_rtg.add_thread(tid, rtg.get_thread(tid)) single_node_rtg.write_to_gfa1("single_node.gfa",selected_nodes=[x.node_id() for x in single_node_rtg.get_all_nodeviews(include_disconnected=False)]) ``` ```python= def sift_candidates( ```