# *F. cylindrus* methods
## Step 1 -> the gonza scripts
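
This step builds everything up to a cleaned read-threads graph: a DBG from the PE reads (k=31, then k=63, minimum coverage 30), tip clipping, reduction of contigs to unique k-mer runs, PE read pathing into distance graphs, and finally recruitment and threading of the nanopore reads.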

```python=
import SDGpython as SDG  # SDG Python bindings

# Parameters: unique k-mer frequency window, graph ks, min coverage, batching
MIN_UKMER_FREQ = 20
MAX_UKMER_FREQ = 80
K = 31
GK = 63
MIN_CVG = 30
NUM_BATCHES = 8

ws = SDG.WorkSpace()
peds = ws.add_paired_reads_datastore('./PE_reads.prseq', 'pe')
gm = SDG.GraphMaker(ws.sdg)
gm.new_graph_from_paired_datastore(peds, K, MIN_CVG, NUM_BATCHES)
print("First graph done")
kc = ws.add_kmer_counter('pek31', K)
kc.add_count('pe', peds)
print("kmers counted")
gm.new_graph_from_paired_datastore(peds, GK, MIN_CVG, NUM_BATCHES)
print("Second graph done")
ws.sdg.write_to_gfa1("./00-sdg_dbg_graph_MC%s.gfa" % MIN_CVG)
gc = SDG.GraphContigger(ws)
gc.clip_tips(2 * GK - 1, 3)  # clip tips shorter than 2*GK-1
print("Remaining nodes: %s" % len(ws.sdg.get_all_nodeviews()))
kc.update_graph_counts()
ws.dump("./00-sdg_dbg_graph_MC%s.sdgws" % MIN_CVG)
# Reduce contigs to runs of unique k-mers within the frequency window
seqs = gc.contig_reduction_to_unique_kmers(MIN_UKMER_FREQ, MAX_UKMER_FREQ)
print(len(seqs), "sequences for anchoring")
with open("./pseudoLong2.fasta", "w") as f:
    for i, s in enumerate(seqs):
        f.write(">seq_%s\n%s\n" % (i, s))

# Path the PE reads through the graph, keeping per-node offsets
peds.mapper.path_reads(31, fill_offsets=True)
def dg_from_paths(peds, g, add_pair_jumps=False, pair_jump_distance=300):
    """Build a DistanceGraph from read paths; optionally add read-pair jumps."""
    dg = SDG.DistanceGraph(g)
    for rid in range(1, len(peds.mapper.read_path_offsets)):
        p = peds.mapper.read_paths[rid].path
        o = peds.mapper.read_path_offsets[rid]
        for i in range(len(p) - 1):
            # Link consecutive nodes at the distance implied by their offsets in the read
            dg.add_link(-p[i], p[i + 1],
                        o[i + 1][0] - o[i + 1][1] - (o[i][0] - o[i][1]) - dg.get_nodeview(p[i]).size(),
                        SDG.Support(SDG.SupportType.stReadPath, 0, rid))
    if add_pair_jumps:
        for rid1 in range(1, len(peds.mapper.read_path_offsets), 2):
            p1 = peds.mapper.read_paths[rid1].path
            p2 = peds.mapper.read_paths[rid1 + 1].path
            if p1 and p2:
                # Jump from the end of read 1 to the end of its mate at a fixed insert estimate
                dg.add_link(-p1[-1], -p2[-1], pair_jump_distance,
                            SDG.Support(SDG.SupportType.stReadPath, 1, rid1))
    return dg
# Single-end distance graph, then next-selected linkage
sedg = dg_from_paths(peds, ws.sdg)
u = SDG.LinkageUntangler(sedg)
u.select_all()
sedg_nsl = u.make_nextselected_linkage(5)
sedg_nsl.write_to_gfa1("./sedg_nsl.gfa")

# Same again, adding read-pair jumps
pedg = dg_from_paths(peds, ws.sdg, add_pair_jumps=True)
u = SDG.LinkageUntangler(pedg)
u.select_all()
pedg_nsl = u.make_nextselected_linkage(5)
pedg_nsl.remove_transitive_links()
pedg_nsl.write_to_gfa1("./pedg_nsl.gfa")
ws.dump("./01-unique_kmer_runs_pemapped.sdgws")
# Recruit and thread the nanopore reads
lords = ws.add_long_reads_datastore('./nanopore.loseq')
lrr = SDG.LongReadsRecruiter(ws.sdg, lords, 31)
lrr.map(31)
lrr.dump('./lrr_mapped_nanopore.data')
ws.dump('./kmer_runs_mapped_nanopore_toload.sdgws')
lrr.simple_thread_reads()
rtg = lrr.rtg_from_threads(True)

# Clean the read-threads graph: pop repeat nodes, then thread-clustering cleanup
rpl = rtg.clean_repeat_nodes_popping_list(200)
print(len(rpl), len(set([x[1] for x in rpl])))
rtg.apply_popping_list(rpl)
cpl = rtg.clean_all_nodes_by_thread_clustering_popping_list(5, 0.5)
print(len(cpl), len(set([x[1] for x in cpl])))
rtg.apply_popping_list(cpl)
hs = SDG.HappySorter(rtg)  # sorter instance; anchor-seeded orders are produced in Step 3
```
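
The dumps above make this step restartable. A minimal sketch of reloading in a later session, assuming `LongReadsRecruiter.load()` is the counterpart of the `dump()` used above (the reload pattern itself mirrors Step 2):

```python=
# Sketch: resume from the dumped artifacts (assumes lrr.load() mirrors lrr.dump())
import SDGpython as SDG

ws = SDG.WorkSpace('./kmer_runs_mapped_nanopore_toload.sdgws')
lords = ws.add_long_reads_datastore('./nanopore.loseq')
lrr = SDG.LongReadsRecruiter(ws.sdg, lords, 31)
lrr.load('./lrr_mapped_nanopore.data')
```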

## Step 2 -> contig classification (A/B, C, AB, ABC)
```python=
import SDGpython as SDG  # SDG Python bindings
from statistics import median

ws = SDG.WorkSpace("./00-sdg_dbg_graph_MC30.sdgws")
K = 31
peds = ws.add_paired_reads_datastore('./PE_reads.prseq', 'pe')
kcount = ws.get_kmer_counter('pek31')
kcount.compute_all_kcis()

# "C" nodes: median PE k-mer coverage between 25 and 80, length over 200bp
cnodes = []
for node in ws.sdg.get_all_nodeviews():
    kcov = node.kmer_coverage("pek31", "pe")
    if 25 < median(kcov) < 80 and len(node.sequence()) > 200:
        cnodes.append(node)

with open("C_seqs.00-sdg_dbg_graph_MC30.fasta", "w") as f:
    for node in cnodes:
        f.write(">seq{0}\n{1}\n".format(node.node_id(), node.sequence()))
```
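
The 25-80 window above is a coverage-based single-copy filter. Before committing to those cutoffs, a histogram of per-node median coverage (using only the calls already shown) is a quick way to check where the haplotype and repeat peaks fall:

```python=
# Sketch: per-node median k-mer coverage histogram, binned by 5
from statistics import median
from collections import Counter

hist = Counter()
for node in ws.sdg.get_all_nodeviews():
    hist[int(median(node.kmer_coverage("pek31", "pe"))) // 5 * 5] += 1
for cov in sorted(hist):
    print(cov, hist[cov])
```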

## Step 3 -> finding "C" parts on the DBG -> creating a list of used anchors
```python=
import SDGpython as SDG  # SDG Python bindings

# Recover the "C" node ids from the Step 2 fasta headers (">seq<id>")
cnode_ids = set()
with open('./C_seqs.00-sdg_dbg_graph_MC30.fasta') as Cseqs:
    for line in Cseqs:
        if line.startswith('>'):
            cnode_ids.add(int(line[4:].strip()))

# Pull the matching sequences from the long-read cleanup workspace
canchors = {}
ws = SDG.WorkSpace('./long_reads_cleanup.sdgws')
for node in ws.sdg.get_all_nodeviews():
    if node.node_id() in cnode_ids:
        canchors[node.node_id()] = node.sequence()

with open("C_anchors.00-sdg_dbg_graph_MC30.fasta", "w") as f:
    for key, value in canchors.items():
        f.write(">seq{0}\n{1}\n".format(key, value))
```
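
A one-line sanity check that every id parsed from the headers was actually found in the cleanup workspace:

```python=
# Sketch: all header ids should have been recovered
print(len(canchors), "of", len(cnode_ids), "C node ids found in the cleanup workspace")
```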

```python=
import SDGpython as SDG  # SDG Python bindings
import statistics

ws = SDG.WorkSpace('./long_reads_cleanup.sdgws')
kc = ws.add_kmer_counter('C_sequence_collection', 31)
kc.add_count('c_contigs', ['./C_anchors.00-sdg_dbg_graph_MC30.fasta'], fastq=False)
rtg = SDG.ReadThreadsGraph(ws.sdg)
rtg.load('./cleanup.rtg')

# Nodes whose mean k-mer count in the C collection is exactly 1 become C anchors
C_anchors = []
for node in ws.sdg.get_all_nodeviews():
    nodekc = node.kmer_coverage("C_sequence_collection", "c_contigs")
    if statistics.mean(nodekc) == 1:
        C_anchors.append(node.node_id())
print("C anchors listed")

# Grow a HappySorter order from each not-yet-used C anchor, one sequence per order
used_anchors = set()
i = 1
with open("Corders_.8.8mtn20.fa", "w") as f:
    for node in C_anchors:
        if abs(node) not in used_anchors:
            hs = SDG.HappySorter(rtg, .8, .8, min_thread_nodes=10)
            hs.start_from_node(node, min_links=5)
            hs.grow_loop()
            # order_to_sequence() is a helper defined elsewhere
            f.write(">lo%d\n%s\n" % (i, order_to_sequence(hs.order)))
            i += 1
            for x in hs.order.as_signed_nodes():
                used_anchors.add(abs(x))
print("C orders happy")
```
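
A quick follow-up check that the loop is consuming anchors rather than re-seeding the same regions:

```python=
# Sketch: anchor consumption summary
print(len(used_anchors), "node ids consumed across", i - 1, "orders, from", len(C_anchors), "C anchors")
```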

## Step 4 -> the "separated C" genome (filter threads, recover reads, minimap2)
```python=
# Continues from the session above: ws and C_anchors are already in memory
rtg = SDG.ReadThreadsGraph(ws.sdg)
rtg.load('./cleanup.rtg')

# Collect every thread touching a C anchor
threadlist = set()
for node in C_anchors:
    for x in rtg.get_nodeview(node).next():
        threadlist.add(x.support().id)

# Keep threads sharing more than 3 nodes with the C anchor set
# (the shared-node set is computed per thread)
b = set(C_anchors)
for threadid in threadlist:
    thread = rtg.get_thread(threadid)
    a = set(node.node for node in thread)
    shared = b.intersection(a)
    if len(shared) > 3:
        print(thread)
```
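
The "recover reads" and minimap2 parts of this step are not shown above. A sketch of one plausible completion, assuming thread ids correspond to read ids in the nanopore datastore and that the datastore exposes `get_read_sequence()` (both are assumptions, not confirmed by the source):

```python=
# Sketch (assumes: thread id == nanopore read id; lords.get_read_sequence() exists)
lords = ws.add_long_reads_datastore('./nanopore.loseq')
with open("C_recovered_reads.fasta", "w") as f:  # hypothetical output name
    for threadid in threadlist:
        thread = rtg.get_thread(threadid)
        if len(b.intersection(node.node for node in thread)) > 3:
            f.write(">read%d\n%s\n" % (threadid, lords.get_read_sequence(threadid)))
# Then align the recovered reads to the C orders, e.g.:
#   minimap2 -ax map-ont Corders_.8.8mtn20.fa C_recovered_reads.fasta > C_recovered.sam
```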
## Step 5 -> the "anchor consumption C" genome
## Step 6 -> running happy sorter in all anchors to produce a full genome
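
A sketch of Step 6 following the Step 3 loop, seeding HappySorter from every node rather than only the C anchors; the output name is hypothetical and `ws`, `rtg` and `order_to_sequence()` are as in the earlier steps:

```python=
# Sketch: grow a HappySorter order from every not-yet-used node (Step 3 pattern)
used_anchors = set()
i = 1
with open("all_orders.fa", "w") as f:  # hypothetical output name
    for node in ws.sdg.get_all_nodeviews():
        if node.node_id() not in used_anchors:
            hs = SDG.HappySorter(rtg, .8, .8, min_thread_nodes=10)
            hs.start_from_node(node.node_id(), min_links=5)
            hs.grow_loop()
            f.write(">lo%d\n%s\n" % (i, order_to_sequence(hs.order)))
            i += 1
            for x in hs.order.as_signed_nodes():
                used_anchors.add(abs(x))
```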