# Strawberry code

[TOC]

## Strategic approach

## Tangle stats

Count: bp and nodes in tangles, in frontiers and others (nodes in frontiers are ENDS, obviously). For the nodes in tangles, plot a distribution of count and bp by count of internal elements (i.e. to give an idea of how tangled the graph is and how complex these tangles are). Probably a good idea to plot these numbers as a progress from 0% to 100% of a total count/size.

### Improving the initial graph

The initial graph needs improvement, so the tangle definition gets easier to work with (i.e. smaller tangles within it). This being strawberry, the two more obvious ways to do this are to clip tips and to phase bubbles.

**Gonza is working on phasing bubbly paths and solving other trivial structures by just expanding canonical repeats**

**DONE**

- Simple stats for repeats, bubbles and tips
- Canonical repeat resolution with single-end short reads

**TO DO**

- Canonical repeat resolution with pairs (fallback after failing with single reads)
- Bubble popping (error and heterozygous bubble mode, use KCI and paths)
- Tip clipping (using paths)

Ideally we would want to have a single heuristic step that can do: tip clipping, repeat expansion and bubble phasing in one go.
For now, we have this pretty stupid simple code to just do repeat expansion: ```python from collections import Counter def is_canonical_repeat(nv): if len(nv.prev())==len(nv.next())>1: for pl in nv.prev(): if len(pl.node().next())>1: return False for nl in nv.next(): if len(nl.node().prev())>1: return False return True return False def is_bubble_side(nv): if len(nv.parallels())==len(nv.prev())==len(nv.next())==1 and len(nv.prev()[0].node().next())==len(nv.next()[0].node().prev())==2: return True return False def is_tip(nv): if len(nv.prev())==0 or len(nv.next())==0: return True return False def simple_structures_stats(dg): total=canrep=bubsides=tips=0 for nv in dg.get_all_nodeviews(): total+=1 if is_canonical_repeat(nv): canrep+=1 if is_bubble_side(nv): bubsides+=1 if is_tip(nv): tips+=1 print("%d tips, %d canonical repeats and %d bubble sides out of %d nodes"%(tips,canrep,bubsides,total)) def solve_canonical_repeat(ge,nv,peds,min_support=5,max_noise=10,snr=10,verbose=False): if verbose: print("\nSolving ",nv,nv.size(),"bp") out_nids=[x.node().node_id() for x in nv.next()] sols=[] for pnv in [x.node() for x in nv.prev()]: c=Counter() for p in peds.mapper.all_paths_fw(pnv.node_id(),False): if p[0]!=nv.node_id(): c[p[0]]+=1 elif len(p)>1: c[p[1]]+=1 if verbose: print(c.most_common()) t=sum(c.values()) if t<min_support: if verbose: print("FW too few paths!") return False (winner,votes)=c.most_common(1)[0] if votes<min_support: if verbose: print("FW winner poorly supported!") return False noise=t-votes if winner in out_nids and noise*snr<=votes and noise<max_noise: sols.append((-pnv.node_id(),winner)) else: if verbose: print("FW Not a clear winner!") return False if verbose: print(sols) if len(sols)<len(nv.prev()) or len(sols)>len(set([x[1] for x in sols])): return False in_nids=[-x.node().node_id() for x in nv.prev()] sols2=[] for nnv in [x.node() for x in nv.next()]: c=Counter() for p in peds.mapper.all_paths_fw(-nnv.node_id(),False): if p[0]!=-nv.node_id(): 
c[p[0]]+=1 elif len(p)>1: c[p[1]]+=1 t=sum(c.values()) if t<min_support: if verbose: print("BW too few paths!") return False (winner,votes)=c.most_common(1)[0] if votes<min_support: if verbose: print("BW winner poorly supported!") return False noise=t-votes if winner in in_nids and noise*snr<=votes and noise<max_noise: sols2.append((winner,nnv.node_id())) else: if verbose: print("BW Not a clear winner!") return False if verbose: print(sols2) sols2.sort() sols.sort() if sols!=sols2: if verbose: print("FW and BW solutions not the same") return False ge.queue_node_expansion(nv.node_id(),sols) #print(sols) return True def solve_all_canonical(peds,size=1000,apply=False): ge=SDG.GraphEditor(ws) total=solved=0 for nv in ws.sdg.get_all_nodeviews(): if nv.size()<=size and is_canonical_repeat(nv): total+=1 if solve_canonical_repeat(ge,nv,peds): solved+=1 print("\r%d / %d canonical repeats solved" % (solved,total), end='') print("%d / %d canonical repeats solved" % (solved,total)) if apply: ge.apply_all() ws.sdg.join_all_unitigs() #######RUN THIS BIT simple_structures_stats(ws.sdg) solve_all_canonical(peds,apply=True) simple_structures_stats(ws.sdg) kc.update_graph_counts() kc.compute_all_kcis() print(ws.sdg.stats_by_kci()) ``` ### Tangle drill down If we start by getting all tangles with frontiers of a large size, we can then decompose the unclassified tangles in their internal sub-tangles for a smaller size. By definition every internal node on a smaller-frontier defined tangle will be included in a single larger-frontier tangle. This allows for different types of tangles to be definied at independently-determined tangle sizes. # Code dumps!!! 
```python import sys sys.path.append('/hpc-home/ggarcia/git_sources/bsg/gonza_master/SDGpython/') import SDGpython as SDG ws = SDG.WorkSpace('./00-raw_graph.sdgws') gc = SDG.GraphContigger(ws) gc.clip_tips(200) kc = ws.get_kmer_counter('kmers-pe-31') for i, c in enumerate(kc.count_spectra('pe')[:400]): print("%s,%s" %(i, c)) kc.update_graph_counts() kc.set_kci_peak(100) kc.compute_all_kcis() print(ws.sdg.stats_by_kci()) ws.dump('./01-tip_clipped.sdgws') kc.dump_cache('./01-tip_clipped.kci.cache') peds = ws.get_paired_reads_datastore('pe') ## Map reads peds.mapper.path_reads() peds.mapper.dump_readpaths('./01-tip_clipped.pe.paths') ``` ![](https://i.imgur.com/dBdC5MW.png) ```python import sys sys.path.append('/hpc-home/ggarcia/git_sources/bsg/gonza_master/SDGpython/') import SDGpython as SDG ws = SDG.WorkSpace('./01-tip_clipped.sdgws') kc = ws.get_kmer_counter('kmers-pe-31') kc.load_cache('./01-tip_clipped.kci.cache') ## Load mapped reads peds = ws.get_paired_reads_datastore('pe') peds.mapper.load_readpaths('./01-tip_clipped.pe.paths') simple_structures_stats(ws.sdg) solve_all_canonical(peds,True) simple_structures_stats(ws.sdg) kc.update_graph_counts() kc.compute_all_kcis() print(ws.sdg.stats_by_kci()) ``` ![](https://i.imgur.com/DWDtsBt.png) ![](https://i.imgur.com/5rmSkly.png) [Functions defined in](https://hackmd.io/7i2WQxdEQyOcOC3YLsY-VQ?view) ```python for ii in range(3): print('\n\nRound',ii,'starting...') if STATS: simple_structures_stats(ws.sdg) for i in range(3): peds.mapper.path_reads() clip_all_tips(peds,apply=True) if STATS: simple_structures_stats(ws.sdg) if STATS: kc.update_graph_counts() kc.compute_all_kcis() print(ws.sdg.stats_by_kci()) for i in [70,100,200]: peds.mapper.path_reads() solve_all_tangles(ws,i,apply=True) kc.update_graph_counts() kc.compute_all_kcis() if STATS: print(ws.sdg.stats_by_kci()) peds.mapper.path_reads() s=SDG.Strider(ws) s.add_datastore(peds) s.stride_from_anchors(min_kci=.5,max_kci=1.5) strider_run_from_cpp() if STATS: 
kc.update_graph_counts() kc.compute_all_kcis() print(ws.sdg.stats_by_kci()) ``` ```python lords = ws.add_long_reads_datastore('./nanoporeds.lords.loseq') lrr = SDG.LongReadsRecruiter() ``` ## testing mapper with k=63 and pathing ```python def mapping_report(peds,peds2,offset=1000000,total=1000000): same=old_longer=new_longer=old_empty=new_empty=same_difference=both_empty=0 for x in range(offset,offset+total): p1=list(peds.mapper.read_paths[x].path) p2=list(peds2.mapper.read_paths[x].path) if p1==p2: same+=1 if len(p1)==0: both_empty+=1 else: if len(p1)>len(p2): old_longer+=1 elif len(p2)>len(p1): new_longer+=1 else: same_difference+=1 if len(p1)==0: old_empty+=1 if len(p2)==0: new_empty+=1 print("same = %d (%.2f%%) --> both empty: %d (%.2f%%) " % (same,100*same/total,both_empty,100*both_empty/total)) print("old longer = %d (%.2f%%)" % (old_longer-new_empty,100*(old_longer-new_empty)/total)) print("new longer = %d (%.2f%%)" % (new_longer-old_empty,100*(new_longer-old_empty)/total)) print("old empty = %d (%.2f%%)" % (old_empty,100*old_empty/total)) print("new empty = %d (%.2f%%)" % (new_empty,100*new_empty/total)) print("same length different = %d (%.2f%%)" % (same_difference,100*same_difference/total)) peds1=ws.add_paired_reads_datastore('test.prseq','test1') peds2=ws.add_paired_reads_datastore('test.prseq','test2') peds1.mapper.path_reads(31,50) peds2.mapper.path_reads() mapping_report(peds1,peds2,1,peds1.size()) ```