shared stuff - HackMD

# shared stuff [TOC] ```python= from collections import Counter def get_hs_tags(lirds, hs): c=Counter() for nid in hs.order.as_signed_nodes(): for rid in lirds.mapper.paths_in_node[abs(nid)]: c[lirds.get_read_tag(abs(rid))]+=1 return set([x for x,c in c.most_common() if c>1]) def node_tags_in_tagset(lirds, nid, tagset): f=t=0 for rid in lirds.mapper.paths_in_node[abs(nid)]: if lirds.get_read_tag(abs(rid)) in tagset: f+=1 t+=1 if t==0: return 0 return f/t tagset1=get_hs_tags(lirds, hs1) tagset2=get_hs_tags(lirds, hs2) print("order 1") for n in hs1.order.as_signed_nodes(): print("%20d\t%.2f\t%.2f"%(n,node_tags_in_tagset(lirds,n,tagset1 ),node_tags_in_tagset(lirds,n,tagset2))) print("\n\n\norder 2") for n in hs2.order.as_signed_nodes(): print("%20d\t%.2f\t%.2f"%(n,node_tags_in_tagset(lirds,n,tagset1 ),node_tags_in_tagset(lirds,n,tagset2))) ``` # Thread plots vs. node orders: ```python= def thread_plot(rtg,nodes,tids=[],min_nodes=4): if not tids: tc=Counter() for nid in nodes: tc.update(rtg.node_threads(nid,True)) tids=[tid for tid,c in tc.items() if c>=min_nodes] figure(figsize=(40,40)) #each node gets an x position. y=0 node_x={} for xp,nid in enumerate(nodes): node_x[nid]=xp #reorder the tids by their first node tf=[] for tid in tids: for np in rtg.get_thread(tid): if np.node in node_x: tf.append([node_x[np.node],tid]) break tids=[x[1] for x in sorted(tf)] yticks_y=[] yticks_l=[] for tid in tids: y-=1 yticks_y.append(y) yticks_l.append("%d (%d crosses)" % (tid, hs_fail.order.thread_order_crosses(rtg.get_thread(tid)))) t_x=[] t_y=[] for np in rtg.get_thread(tid): if np.node in node_x: pos=node_x[np.node] elif -np.node in node_x: pos=-node_x[-np.node] else: continue if len(t_x) and ( ((last_pos>0) != (pos>0)) or (last_pos>pos)): y-=1 last_pos=pos t_x.append(abs(pos)) t_y.append(y) #print(t_y,t_x) plot(t_x,t_y,'o-',linewidth=3) grid(color='black', axis='x',which='both', linestyle=':', linewidth=1) grid(color='black', axis='y',which='both', linestyle=':', linewidth=1, alpha=.5) xticks(range(len(node_x)),labels=[str(x[0]) for x in sorted(node_x.items(),key=lambda v:v[1])],rotation=90) yticks(yticks_y,labels=yticks_l) ``` # Checking LR + 10 integration: ```python= hss={} tags={} for i, nid in enumerate(nodos): print(i, nid) hss[nid]=SDG.HappySorter(rtg,.5,.7) hss[nid].start_from_node(nid) hss[nid].grow_loop(10) tags[nid]=get_hs_tags(lirds, hss[nid]) def greedy_cluster_nodes(nodos, nid): cnodes=set([nid]) last_len=0 while len(cnodes)>last_len: last_len=len(cnodes) for onid in nodos: if onid in cnodes: continue for oonid in cnodes: if len(tags[oonid]&tags[onid])>10: cnodes.add(onid) # print(onid) break return cnodes def all_nodes_touched_by(nodos, nid): nodes=set() for o in greedy_cluster_nodes(nodos, nid): for n in hss[o].order.as_signed_nodes(): nodes.add(n) return (n) ``` # creating the reference positions for nodes: ```shell= ~/software/mummer-4.0.0rc1/nucmer -t16 -c2 -l15 -p anchors_to_fv Fragaria_vesca_v4.0.a1.fasta sdg_raw.fasta ~/software/mummer-4.0.0rc1/delta-filter -q -l20 anchors_to_fv.delta > anchor_positions.delta ~/software/mummer-4.0.0rc1/show-coords -qH anchor_positions.delta > anchor_positions.coords ``` ```python= def load_npos(): npos={} for l in open('anchor_positions.coords'): sl=l.split() rstart=int(sl[0]) rend=int(sl[1]) qstart=int(sl[3]) qend=int(sl[4]) tname=sl[11] if tname[:3]!="Fvb": continue chrid=int(tname[3:]) if qstart>qend: chrid=-chrid if rstart>rend: chrid=-chrid qlen=abs(qend-qstart)+1 nid=int(sl[12][3:]) if nid not in npos or npos[nid][2]<qlen: npos[nid]=(chrid,rstart,qlen) return npos ``` # Problems to solve ## 1 - Nodes with haplotype collapsing/switch ```python= %%time hs_fail=SDG.HappySorter(rtg,.5,.7) hs_fail.start_from_node(6736641) hs_fail.fast_grow_loop(verbose=True) print(hs_fail.order.size()) plot_F1(hs_fail.order,[6736641]) ``` ![](https://i.imgur.com/SQMxc0k.png) ```python= def plot_F1(order,mark_nodes=[],exclude_happil_shares=True): patterns=[] to_mark=[] ni=[] for i,nid in enumerate(order.as_signed_nodes()): if cf.get_pattern(rtg.get_nodeview(nid))[-1]!='1' or not exclude_happil_shares: patterns.append([int(x) for x in cf.get_pattern(rtg.get_nodeview(nid))]) ni.append((len(patterns)-1,nid)) if nid in mark_nodes: to_mark.append((len(patterns)-1,' %d'% nid)) step=len(ni)//20 for i in range(0,20*step,step): to_mark.append(ni[i]) to_mark.append(ni[-1]) to_mark.sort() figure(figsize=(4,10)) ax=subplot(facecolor='w') ax.imshow(patterns,aspect='auto',interpolation='none') #for i,nid in to_mark: #plotpatterns[i]=[5 for x in range(21)] # ax.right_ax. rax=ax.axes.twinx() #help(rax) rax.set_yticks([x[0] for x in to_mark]) rax.set_yticklabels([x[1] for x in to_mark]) rax.imshow(patterns,aspect='auto',interpolation='none') for i,tm in enumerate(to_mark): if int(tm[1]) in mark_nodes: rax.get_yticklabels()[i].set_color('red') rax.get_yticklabels()[i].set_weight('bold') show() ``` ### Conclusion For a haplotype to switch there must be recruitment of threads form the alternative haplotype. Since taking into account a larger section of the haplotype for node happiness solved all those problems for nodes (with only occasional switches now incurred by the nodes), we need to implement something similar for thread recruitment: a more stringent happiness condition that computes hits towards all the potential length of the thread that could fall into the order. Then the identity for recruitment will need to be lowered, but it will still be more precise. ## 2 - Node update creates "drifts" on some nodes ```python= START_NODE=694137 hs_fail=SDG.HappySorter(rtg,.5,.7) hs_fail.start_from_node(START_NODE) hs_fail.fast_grow_loop(verbose=True,steps=2) for i in range(3): hs_fail.update_positions() nc=dict(hs_fail.order.node_coordinates) figure() plot([nc[nid] for nid in hs_fail.order.as_signed_nodes()]) ``` ![](https://i.imgur.com/iWxVQ92.png) # Finding complementary haplotypes in a region of chromosome ```python= def sdist(s1,s2): return len([1 for x in range(len(s1)) if s1[x]!=s2[x]]) from collections import Counter c=Counter() for x in selected_anchors: if ws.sdg.get_nodeview(x).size()<70: continue p=cf.get_pattern(ws.sdg.get_nodeview(x)) if p[-1]=='1': continue #print(p) c[p[:-2]]+=1 used=set() for x in c.most_common()[:20]: if x[0] in used: continue #print("\n",x) closest="" closest_score=30 furthest="" furthest_score=0 for x2 in c.keys(): if x2 !=x[0]: s=sdist(x[0],x2) if s<closest_score: closest_score=s closest=x2 if s>furthest_score: furthest_score=s furthest=x2 #print("closest:",closest,c[closest],"s:",closest_score) #print("furthest",furthest,c[furthest],"s:",furthest_score) if furthest_score==19: #print("furthest",furthest,c[furthest],"s:",furthest_score) print(max(x[0],furthest),min(x[0],furthest),c[max(x[0],furthest)],c[min(x[0],furthest)]) used.add(x[0]) used.add(furthest) ```