# TOGA installation and running (Slurm)
1. Install nextflow and add it to $PATH
```
cd /scratch/ddepanis/Software/
curl -fsSL https://get.nextflow.io | bash
cd /home/ddepanis/bin
ln -s /scratch/ddepanis/Software/nextflow nextflow
# log out / log in to have nextflow in the path
```
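After logging back in, an optional quick check confirms that nextflow is resolvable from `$PATH`:
```
# should print the path of the symlink and the Nextflow version
which nextflow
nextflow -version
```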
2. Install [make_lastz_chains](https://github.com/hillerlab/make_lastz_chains) and [TOGA](https://github.com/hillerlab/TOGA)
```
conda create -n TOGA_env python=3.8
conda activate TOGA_env
conda install -c bioconda bedparse ucsc-fatotwobit ucsc-twobitinfo
cd /scratch/ddepanis/Software/
git clone https://github.com/hillerlab/make_lastz_chains.git
cd make_lastz_chains
pip3 install -r requirements.txt >> install.log 2>&1
python3 install_dependencies.py >> install.log 2>&1
# workaround for a libssl/libcrypto dependency problem that appeared during installation:
cd /scratch/ddepanis/Software/anaconda3/lib
ln -s libssl.so.1.1 libssl.so.1.0.0
ln -s libcrypto.so.1.1 libcrypto.so.1.0.0
# remember to add kent_binaries to path in the job script, like:
export PATH=/scratch/ddepanis/Software/make_lastz_chains/kent_binaries:$PATH
cd /scratch/ddepanis/Software/
git clone https://github.com/hillerlab/TOGA.git
cd TOGA
python3 -m pip install -r requirements.txt --user
./configure.sh
./run_test.sh micro
```
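Optionally, before submitting any jobs it can help to verify that the tools installed above are actually reachable from the environment; a minimal sketch (the `PATH` export mirrors the line used in the job script):
```
# check that the conda tools and Kent binaries resolve from the TOGA environment
conda activate TOGA_env
export PATH=/scratch/ddepanis/Software/make_lastz_chains/kent_binaries:$PATH
for tool in faToTwoBit twoBitInfo bedparse nextflow; do
    command -v "$tool" || echo "MISSING: $tool"
done
```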
:::warning
Files required to run:
* soft-masked reference assembly fasta, gz compressed
* reference annotation gtf, gz compressed
* reference isoforms tsv file
* soft-masked query assembly fasta
*Example:*
Download the reference data from www.ensembl.org (the reference fasta should be the `<...>dna_sm.toplevel.fa.gz` file).
To get the isoforms file, go to www.ensembl.org/biomart/martview:
1. In Dataset, choose "Ensembl Genes" and the species of interest.
2. In Filters, select "Gene type": protein_coding.
3. In Attributes, select only "Gene stable ID" and "Transcript stable ID".
4. Go to Results and download the isoforms tsv file.
5. Check the isoforms file: if the IDs end in a version suffix like `.1`, remove it so they match the bed file (see the example after this box).
:warning: The isoforms file is not mandatory; if you are not using it, remove the `-i ${ISOFORMS_REF_TSV}` flag from the `toga.py` command in the [full_chains_toga.job](https://github.com/diegomics/Pahuel_assembly_pipeline/blob/main/extras/full_chains_toga.job) script.
:::
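As a worked illustration of the box above, a hedged sketch of preparing the reference files; the species, Ensembl release, and file names (including BioMart's default `mart_export.txt`) are placeholders to adapt:
```
# example reference download (human, Ensembl release 110, used only as an illustration)
wget https://ftp.ensembl.org/pub/release-110/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna_sm.toplevel.fa.gz
wget https://ftp.ensembl.org/pub/release-110/gtf/homo_sapiens/Homo_sapiens.GRCh38.110.gtf.gz

# strip trailing version suffixes (e.g. ".1") from both ID columns of the BioMart export
awk 'BEGIN{FS=OFS="\t"} {sub(/\.[0-9]+$/, "", $1); sub(/\.[0-9]+$/, "", $2); print}' mart_export.txt > isoforms.tsv
```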
3. Run make_lastz_chains and TOGA
:::success
Use the [full_chains_toga.job](https://github.com/diegomics/Pahuel_assembly_pipeline/blob/main/extras/full_chains_toga.job) script to run both **make_lastz_chains** and **TOGA** in one go :wink: (a rough sketch of such a job is shown after this box)
:::
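For orientation only, a stripped-down sketch of what such a combined Slurm job can look like. The resource values, file names, and option names below are illustrative (recalled from the two tools' READMEs, not verified here), so always defer to the linked `full_chains_toga.job` and to `make_chains.py --help` / `toga.py --help` for the real flags:
```
#!/bin/bash
#SBATCH --job-name=chains_toga
#SBATCH --partition=begendiv,main
#SBATCH --qos=standard
#SBATCH --cpus-per-task=32
#SBATCH --mem=64G
#SBATCH --time=5-00:00:00

# environment (conda path assumed from the anaconda3 location used above)
source /scratch/ddepanis/Software/anaconda3/etc/profile.d/conda.sh
conda activate TOGA_env
export PATH=/scratch/ddepanis/Software/make_lastz_chains/kent_binaries:$PATH

ISOFORMS_REF_TSV=isoforms.tsv  # as prepared above; drop -i below if not used

# 1) build the pairwise chains (names/flags illustrative, check make_chains.py --help)
/scratch/ddepanis/Software/make_lastz_chains/make_chains.py \
    ref query ref.dna_sm.fa query.dna_sm.fa \
    --executor slurm --project_dir chains_out

# 2) run TOGA on the resulting chain file (paths/flags illustrative, check toga.py --help)
/scratch/ddepanis/Software/TOGA/toga.py \
    chains_out/ref.query.final.chain.gz ref_annotation.bed \
    ref.2bit query.2bit \
    -i ${ISOFORMS_REF_TSV} --project_dir toga_out
```
Submit it with `sbatch` as usual.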
## TOGA installation 2024
```
conda create -n TOGA_env python=3.11
conda activate TOGA_env
cd /scratch/ddepanis/Software
git clone https://github.com/hillerlab/TOGA.git
cd TOGA
python3 -m pip install -r requirements.txt
bash configure.sh
module load Nextflow
./run_test.sh micro
cd /scratch/ddepanis/Software
git clone https://github.com/hillerlab/make_lastz_chains.git
cd make_lastz_chains
# The pipeline requires many UCSC Kent binaries,
# they can be downloaded using this script,
# unless they are already in the $PATH:
./install_dependencies.py
conda install bioconda::lastz bioconda::ucsc-genepredtobed bioconda::ucsc-gff3togenepred \
              bioconda::ucsc-bedtogenepred bioconda::ucsc-genepredtogtf \
              conda-forge::rust conda-forge::matplotlib
# Modified parallelization/nextflow_wrapper.py (adds a Slurm --qos option to the generated Nextflow configs):
cat parallelization/nextflow_wrapper.py

"""Module to manage Nextflow processes."""
import os
import shutil
import subprocess
from constants import Constants
from modules.make_chains_logging import to_log
from modules.error_classes import NextflowProcessError


class NextflowConfig:
    """Model for a config file."""

    def __init__(self, executor, memory, time, label, config_dir, **kwargs):
        self.executor = executor
        self.memory = memory
        self.time = time
        self.label = label
        self.config_dir = config_dir
        self.queue = kwargs.get("queue", None)
        self.qos = kwargs.get("qos", "standard")  # Added qos parameter with default value
        self.cpus = 1  # always a fixed number
        self.config_path = None
        self.queue_size = Constants.NextflowConstants.DEFAULT_QUEUE_SIZE

    def dump_to_file(self):
        """Write the respective config file."""
        filename = f"{self.label}_config.nf"
        self.config_path = os.path.join(self.config_dir, filename)
        f = open(self.config_path, "w")
        f.write(f"// Nextflow config for {self.label} jobs\n")
        f.write(f"process.executor = '{self.executor}'\n")
        f.write(f"process.memory = '{self.memory} G'\n")
        f.write(f"process.time = '{self.time}'\n")
        f.write(f"process.cpus = '{self.cpus}'\n")
        if self.queue:
            f.write(f"process.queue = '{self.queue}'\n")
        f.write(f"executor.queueSize = '{self.queue_size}'\n")
        f.write(f"process.clusterOptions = '--qos={self.qos}'\n")  # Write qos setting
        f.close()
        return self.config_path

    def remove_config(self):
        if self.config_path is None:
            return
        os.remove(self.config_path) if os.path.isfile(self.config_path) else None


class NextflowWrapper:
    """Nextflow manager."""

    def __init__(self, nextflow_exec):
        self.nextflow_exec = nextflow_exec
        self._process = None
        self.joblist_path = None
        self.config_file = None
        self.config_instance = None
        self.return_code = None
        self.execute_dir = None
        self.label = None
        self.nf_master_script = Constants.NextflowConstants.NF_SCRIPT_PATH

    def execute(self, joblist_path: str, config_instance: NextflowConfig, execute_dir: str, wait=False, **kwargs):
        """Implementation for Nextflow."""
        # define parameters
        self.joblist_path = joblist_path
        self.config_instance = config_instance
        self.execute_dir = execute_dir
        self.label = kwargs.get("label", "")
        # create the nextflow process
        self.config_file = config_instance.dump_to_file()
        cmd = f"{self.nextflow_exec} {self.nf_master_script} --joblist {joblist_path} -c {self.config_file}"
        os.makedirs(self.execute_dir, exist_ok=True)
        to_log(f"Parallel manager: pushing job {cmd}")
        self._process = subprocess.Popen(cmd,
                                         shell=True,
                                         # stdout=log_file,
                                         # stderr=log_file,
                                         cwd=self.execute_dir)
        if wait:
            self._process.wait()

    def _acquire_return_code(self):
        running = self._process.poll() is None
        if running:
            return
        self.return_code = self._process.returncode

    def check_status(self):
        """Check if nextflow jobs are done."""
        if self.return_code:
            return self.return_code
        self._acquire_return_code()
        return self.return_code

    def check_failed(self, dont_clean_logs=False):
        self._acquire_return_code()
        if self.return_code is None:
            return
        if self.return_code == 0:
            to_log(f"\n### Nextflow process {self.label} finished successfully")
            return
        to_log(f"\n### Error! The nextflow process {self.label} crashed!")
        if dont_clean_logs is False:
            to_log(f"Please look at the logs in the {self.execute_dir}")
        else:
            self.cleanup()
        raise NextflowProcessError(f"Jobs for {self.label} at {self.joblist_path} died")

    def cleanup(self):
        """Nextflow produces a bunch of files: to be removed."""
        nf_dir = os.path.join(self.execute_dir, ".nextflow")
        work_dir = os.path.join(self.execute_dir, "work")
        shutil.rmtree(nf_dir)
        shutil.rmtree(work_dir)
        self.config_instance.remove_config()


def execute_nextflow_step(nextflow_exec,
                          executor,
                          memory_req,
                          time_req,
                          step_label,
                          config_dir,
                          queue,
                          joblist,
                          run_dir,
                          qos="standard"):  # Added qos parameter with default value
    """
    Execute Nextflow Step

    Executes a Nextflow step using the specified parameters.
    Facilitates the cooperation between NextflowConfig and NextflowWrapper classes.

    Parameters:
    - nextflow_exec (str): The path to the Nextflow executable.
    - executor (str): The Nextflow executor to use.
    - memory_req (str): The memory requirement for the step.
    - time_req (str): The time requirement for the step.
    - step_label (str): The label for the step.
    - config_dir (str): The directory to find the Nextflow configuration files.
    - queue (str): The queue to submit the job to.
    - joblist (str): The list of jobs to execute.
    - run_dir (str): The directory where the Nextflow run will be executed.
    - qos (str): The quality of service for the job. Defaults to 'standard'.

    Returns:
    None

    Raises:
    - NextflowProcessError: If the Nextflow process fails.
    """
    nextflow_config = NextflowConfig(executor, memory_req, time_req, step_label, config_dir=config_dir, queue=queue, qos=qos)
    nextflow_manager = NextflowWrapper(nextflow_exec)
    nextflow_manager.execute(joblist, nextflow_config, run_dir, wait=True, label=step_label)
    nextflow_manager.check_failed()
    nextflow_manager.cleanup()
```
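Given `dump_to_file()` above, each step's generated config ends up looking roughly like the snippet below; the label, memory, time, and queue size are made-up example values, and the last line is what the added `qos` parameter contributes:
```
# inspect one of the generated per-step configs (name and location depend on the run)
cat chain_run_config.nf
# // Nextflow config for chain_run jobs
# process.executor = 'slurm'
# process.memory = '16 G'
# process.time = '24h'
# process.cpus = '1'
# process.queue = 'begendiv,main'
# executor.queueSize = '100'
# process.clusterOptions = '--qos=standard'
```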
```
conda install bioconda::bedparse
conda install bioconda::ucsc-fatotwobit
conda install bioconda::ucsc-twobitinfo
```
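If the `.2bit` files ever need to be prepared by hand, the two UCSC tools installed above are used like this (file names are placeholders):
```
# convert a soft-masked fasta to 2bit and write a table of sequence sizes
faToTwoBit query.dna_sm.fa query.2bit
twoBitInfo query.2bit query.chrom.sizes
```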
## TOGA data downloading 2024
```
```
## TOGA running script 2024
```
```
Cluster-specific Nextflow settings used here: `process.queue = 'begendiv,main'` and `process.clusterOptions = '--qos=standard'`.