# *F. cylindrus* methods
## Step 1 -> the gonza scripts
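
This step builds everything up to a cleaned read-threads graph: a DBG from the PE reads (k=31, then k=63, minimum coverage 30), tip clipping, reduction of contigs to unique k-mer runs, PE read pathing into distance graphs, and finally recruitment and threading of the nanopore reads.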

```python=
import SDGpython as SDG  # SDG Python bindings

# Parameters: unique k-mer frequency window, graph ks, min coverage, batching
MIN_UKMER_FREQ = 20
MAX_UKMER_FREQ = 80
K = 31
GK = 63
MIN_CVG = 30
NUM_BATCHES = 8

ws = SDG.WorkSpace()
peds = ws.add_paired_reads_datastore('./PE_reads.prseq', 'pe')
gm = SDG.GraphMaker(ws.sdg)
gm.new_graph_from_paired_datastore(peds, K, MIN_CVG, NUM_BATCHES)
print("First graph done")
kc = ws.add_kmer_counter('pek31', K)
kc.add_count('pe', peds)
print("kmers counted")
gm.new_graph_from_paired_datastore(peds, GK, MIN_CVG, NUM_BATCHES)
print("Second graph done")
ws.sdg.write_to_gfa1("./00-sdg_dbg_graph_MC%s.gfa" % MIN_CVG)
gc = SDG.GraphContigger(ws)
gc.clip_tips(2 * GK - 1, 3)  # clip tips shorter than 2*GK-1
print("Remaining nodes: %s" % len(ws.sdg.get_all_nodeviews()))
kc.update_graph_counts()
ws.dump("./00-sdg_dbg_graph_MC%s.sdgws" % MIN_CVG)
# Reduce contigs to runs of unique k-mers within the frequency window
seqs = gc.contig_reduction_to_unique_kmers(MIN_UKMER_FREQ, MAX_UKMER_FREQ)
print(len(seqs), "sequences for anchoring")
with open("./pseudoLong2.fasta", "w") as f:
    for i, s in enumerate(seqs):
        f.write(">seq_%s\n%s\n" % (i, s))

# Path the PE reads through the graph, keeping per-node offsets
peds.mapper.path_reads(31, fill_offsets=True)
def dg_from_paths(peds, g, add_pair_jumps=False, pair_jump_distance=300):
    """Build a DistanceGraph from read paths; optionally add read-pair jumps."""
    dg = SDG.DistanceGraph(g)
    for rid in range(1, len(peds.mapper.read_path_offsets)):
        p = peds.mapper.read_paths[rid].path
        o = peds.mapper.read_path_offsets[rid]
        for i in range(len(p) - 1):
            # Link consecutive nodes at the distance implied by their offsets in the read
            dg.add_link(-p[i], p[i + 1],
                        o[i + 1][0] - o[i + 1][1] - (o[i][0] - o[i][1]) - dg.get_nodeview(p[i]).size(),
                        SDG.Support(SDG.SupportType.stReadPath, 0, rid))
    if add_pair_jumps:
        for rid1 in range(1, len(peds.mapper.read_path_offsets), 2):
            p1 = peds.mapper.read_paths[rid1].path
            p2 = peds.mapper.read_paths[rid1 + 1].path
            if p1 and p2:
                # Jump from the end of read 1 to the end of its mate at a fixed insert estimate
                dg.add_link(-p1[-1], -p2[-1], pair_jump_distance,
                            SDG.Support(SDG.SupportType.stReadPath, 1, rid1))
    return dg
# Single-end distance graph, then next-selected linkage
sedg = dg_from_paths(peds, ws.sdg)
u = SDG.LinkageUntangler(sedg)
u.select_all()
sedg_nsl = u.make_nextselected_linkage(5)
sedg_nsl.write_to_gfa1("./sedg_nsl.gfa")

# Same again, adding read-pair jumps
pedg = dg_from_paths(peds, ws.sdg, add_pair_jumps=True)
u = SDG.LinkageUntangler(pedg)
u.select_all()
pedg_nsl = u.make_nextselected_linkage(5)
pedg_nsl.remove_transitive_links()
pedg_nsl.write_to_gfa1("./pedg_nsl.gfa")
ws.dump("./01-unique_kmer_runs_pemapped.sdgws")
# Recruit and thread the nanopore reads
lords = ws.add_long_reads_datastore('./nanopore.loseq')
lrr = SDG.LongReadsRecruiter(ws.sdg, lords, 31)
lrr.map(31)
lrr.dump('./lrr_mapped_nanopore.data')
ws.dump('./kmer_runs_mapped_nanopore_toload.sdgws')
lrr.simple_thread_reads()
rtg = lrr.rtg_from_threads(True)

# Clean the read-threads graph: pop repeat nodes, then thread-clustering cleanup
rpl = rtg.clean_repeat_nodes_popping_list(200)
print(len(rpl), len(set([x[1] for x in rpl])))
rtg.apply_popping_list(rpl)
cpl = rtg.clean_all_nodes_by_thread_clustering_popping_list(5, 0.5)
print(len(cpl), len(set([x[1] for x in cpl])))
rtg.apply_popping_list(cpl)
hs = SDG.HappySorter(rtg)  # sorter instance; anchor-seeded orders are produced in Step 3
```
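
The dumps above make this step restartable. A minimal sketch of reloading in a later session, assuming `LongReadsRecruiter.load()` is the counterpart of the `dump()` used above (the reload pattern itself mirrors Step 2):

```python=
# Sketch: resume from the dumped artifacts (assumes lrr.load() mirrors lrr.dump())
import SDGpython as SDG

ws = SDG.WorkSpace('./kmer_runs_mapped_nanopore_toload.sdgws')
lords = ws.add_long_reads_datastore('./nanopore.loseq')
lrr = SDG.LongReadsRecruiter(ws.sdg, lords, 31)
lrr.load('./lrr_mapped_nanopore.data')
```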

## Step 2 -> contig classification (A/B, C, AB, ABC)
```python=
import SDGpython as SDG  # SDG Python bindings
from statistics import median

ws = SDG.WorkSpace("./00-sdg_dbg_graph_MC30.sdgws")
K = 31
peds = ws.add_paired_reads_datastore('./PE_reads.prseq', 'pe')
kcount = ws.get_kmer_counter('pek31')
kcount.compute_all_kcis()

# "C" nodes: median PE k-mer coverage between 25 and 80, length over 200bp
cnodes = []
for node in ws.sdg.get_all_nodeviews():
    kcov = node.kmer_coverage("pek31", "pe")
    if 25 < median(kcov) < 80 and len(node.sequence()) > 200:
        cnodes.append(node)

with open("C_seqs.00-sdg_dbg_graph_MC30.fasta", "w") as f:
    for node in cnodes:
        f.write(">seq{0}\n{1}\n".format(node.node_id(), node.sequence()))
```
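
The 25-80 window above is a coverage-based single-copy filter. Before committing to those cutoffs, a histogram of per-node median coverage (using only the calls already shown) is a quick way to check where the haplotype and repeat peaks fall:

```python=
# Sketch: per-node median k-mer coverage histogram, binned by 5
from statistics import median
from collections import Counter

hist = Counter()
for node in ws.sdg.get_all_nodeviews():
    hist[int(median(node.kmer_coverage("pek31", "pe"))) // 5 * 5] += 1
for cov in sorted(hist):
    print(cov, hist[cov])
```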

## Step 3 -> finding "C" parts on the DBG -> creating a list of used anchors
```python=
import SDGpython as SDG  # SDG Python bindings

# Recover the "C" node ids from the Step 2 fasta headers (">seq<id>")
cnode_ids = set()
with open('./C_seqs.00-sdg_dbg_graph_MC30.fasta') as Cseqs:
    for line in Cseqs:
        if line.startswith('>'):
            cnode_ids.add(int(line[4:].strip()))

# Pull the matching sequences from the long-read cleanup workspace
canchors = {}
ws = SDG.WorkSpace('./long_reads_cleanup.sdgws')
for node in ws.sdg.get_all_nodeviews():
    if node.node_id() in cnode_ids:
        canchors[node.node_id()] = node.sequence()

with open("C_anchors.00-sdg_dbg_graph_MC30.fasta", "w") as f:
    for key, value in canchors.items():
        f.write(">seq{0}\n{1}\n".format(key, value))
```
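
A one-line sanity check that every id parsed from the headers was actually found in the cleanup workspace:

```python=
# Sketch: all header ids should have been recovered
print(len(canchors), "of", len(cnode_ids), "C node ids found in the cleanup workspace")
```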

```python=
import SDGpython as SDG  # SDG Python bindings
import statistics

ws = SDG.WorkSpace('./long_reads_cleanup.sdgws')
kc = ws.add_kmer_counter('C_sequence_collection', 31)
kc.add_count('c_contigs', ['./C_anchors.00-sdg_dbg_graph_MC30.fasta'], fastq=False)
rtg = SDG.ReadThreadsGraph(ws.sdg)
rtg.load('./cleanup.rtg')

# Nodes whose mean k-mer count in the C collection is exactly 1 become C anchors
C_anchors = []
for node in ws.sdg.get_all_nodeviews():
    nodekc = node.kmer_coverage("C_sequence_collection", "c_contigs")
    if statistics.mean(nodekc) == 1:
        C_anchors.append(node.node_id())
print("C anchors listed")

# Grow a HappySorter order from each not-yet-used C anchor, one sequence per order
used_anchors = set()
i = 1
with open("Corders_.8.8mtn20.fa", "w") as f:
    for node in C_anchors:
        if abs(node) not in used_anchors:
            hs = SDG.HappySorter(rtg, .8, .8, min_thread_nodes=10)
            hs.start_from_node(node, min_links=5)
            hs.grow_loop()
            # order_to_sequence() is a helper defined elsewhere
            f.write(">lo%d\n%s\n" % (i, order_to_sequence(hs.order)))
            i += 1
            for x in hs.order.as_signed_nodes():
                used_anchors.add(abs(x))
print("C orders happy")
```
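
A quick follow-up check that the loop is consuming anchors rather than re-seeding the same regions:

```python=
# Sketch: anchor consumption summary
print(len(used_anchors), "node ids consumed across", i - 1, "orders, from", len(C_anchors), "C anchors")
```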

## Step 4 -> the "separated C" genome (filter threads, recover reads, minimap2)
```python=
# Continues from the session above: ws and C_anchors are already in memory
rtg = SDG.ReadThreadsGraph(ws.sdg)
rtg.load('./cleanup.rtg')

# Collect every thread touching a C anchor
threadlist = set()
for node in C_anchors:
    for x in rtg.get_nodeview(node).next():
        threadlist.add(x.support().id)

# Keep threads sharing more than 3 nodes with the C anchor set
# (the shared-node set is computed per thread)
b = set(C_anchors)
for threadid in threadlist:
    thread = rtg.get_thread(threadid)
    a = set(node.node for node in thread)
    shared = b.intersection(a)
    if len(shared) > 3:
        print(thread)
```
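
The "recover reads" and minimap2 parts of this step are not shown above. A sketch of one plausible completion, assuming thread ids correspond to read ids in the nanopore datastore and that the datastore exposes `get_read_sequence()` (both are assumptions, not confirmed by the source):

```python=
# Sketch (assumes: thread id == nanopore read id; lords.get_read_sequence() exists)
lords = ws.add_long_reads_datastore('./nanopore.loseq')
with open("C_recovered_reads.fasta", "w") as f:  # hypothetical output name
    for threadid in threadlist:
        thread = rtg.get_thread(threadid)
        if len(b.intersection(node.node for node in thread)) > 3:
            f.write(">read%d\n%s\n" % (threadid, lords.get_read_sequence(threadid)))
# Then align the recovered reads to the C orders, e.g.:
#   minimap2 -ax map-ont Corders_.8.8mtn20.fa C_recovered_reads.fasta > C_recovered.sam
```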
## Step 5 -> the "anchor consumption C" genome
## Step 6 -> running happy sorter in all anchors to produce a full genome
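
A sketch of Step 6 following the Step 3 loop, seeding HappySorter from every node rather than only the C anchors; the output name is hypothetical and `ws`, `rtg` and `order_to_sequence()` are as in the earlier steps:

```python=
# Sketch: grow a HappySorter order from every not-yet-used node (Step 3 pattern)
used_anchors = set()
i = 1
with open("all_orders.fa", "w") as f:  # hypothetical output name
    for node in ws.sdg.get_all_nodeviews():
        if node.node_id() not in used_anchors:
            hs = SDG.HappySorter(rtg, .8, .8, min_thread_nodes=10)
            hs.start_from_node(node.node_id(), min_links=5)
            hs.grow_loop()
            f.write(">lo%d\n%s\n" % (i, order_to_sequence(hs.order)))
            i += 1
            for x in hs.order.as_signed_nodes():
                used_anchors.add(abs(x))
```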