TOGA installation and running (slurm)

# TOGA installation and running (slurm) 1. Install nextflow and add it to $PATH ``` cd /scratch/ddepanis/Software/ curl -fsSL https://get.nextflow.io | bash cd /home/ddepanis/bin ln -s /scratch/ddepanis/Software/nextflow nextflow # log out / log in to have nextflow in the path ``` 2. Install [make_lastz_chains](https://github.com/hillerlab/make_lastz_chains) and [TOGA](https://github.com/hillerlab/TOGA) ``` conda create -n TOGA_env python=3.8 conda activate TOGA_env conda install -c bioconda bedparse ucsc-fatotwobit ucsc-twobitinfo cd /scratch/ddepanis/Software/ git clone https://github.com/hillerlab/make_lastz_chains.git cd make_lastz_chains pip3 install -r requirements.txt >> install.log 2>&1 python3 install_dependencies.py >> install.log 2>&1 #this is to solve a dependency problem that appeared: cd /scratch/ddepanis/Software/anaconda3/lib ln -s libssl.so.1.1 libssl.so.1.0.0 ln -s libcrypto.so.1.1 libcrypto.so.1.0.0 # remember to add kent_binaries to path in the job script, like: export PATH=/scratch/ddepanis/Software/make_lastz_chains/kent_binaries:$PATH cd /scratch/ddepanis/Software/ git clone https://github.com/hillerlab/TOGA.git cd TOGA python3 -m pip install -r requirements.txt --user ./configure.sh ./run_test.sh micro ``` :::warning Files required to run: * soft-masked reference gz compressed * reference gtf gz compressed * reference isoforms tsv file * soft-masked query assembly fasta *Example:* Download reference data from www.ensembl.org (reference should be <...>dna_sm.toplevel.fa.gz) To get the isoform file, go to www.ensembl.org/biomart/martview . In Dataset, choose "Ensembl Genes" and the species of interest . In Filters, select "Gene type" - protein_coding . In Attributes only select "Gene stable ID" and "Transcript stable ID" . Go to Results and download the isoforms tsv file . Check the isoforms file, if the names have something like .1 at the end, remove it to match the bed file :warning: Isoforms file is not mandatory, if not using it remove the `-i ${ISOFORMS_REF_TSV}` flag from the `toga.py` command in the [full_chains_toga.job](https://github.com/diegomics/Pahuel_assembly_pipeline/blob/main/extras/full_chains_toga.job) script ::: 3. Run make_lastz_chains and Run TOGA :::success Use the [full_chains_toga.job](https://github.com/diegomics/Pahuel_assembly_pipeline/blob/main/extras/full_chains_toga.job) script to run both **make_lastz_chains** and **TOGA** in one take :wink: ::: TOGA installation 2024 ``` conda create -n TOGA_env python=3.11 conda activate TOGA_env cd /scratch/ddepanis/Software git clone https://github.com/hillerlab/TOGA.git cd TOGA python3 -m pip install -r requirements.txt bash configure.sh module load Nextflow ./run_test.sh micro cd /scratch/ddepanis/Software git clone https://github.com/hillerlab/make_lastz_chains.git cd make_lastz_chains # The pipeline requires many UCSC Kent binaries, # they can be downloaded using this script, # unless they are already in the $PATH: ./install_dependencies.py conda install bioconda::lastz bioconda::ucsc-genepredtobed bioconda::ucsc-gff3togenepred bioconda::ucsc-bedtogenepred bioconda::ucsc-genepredtogtf conda-forge::rust conda-forge::matplotlib (base) [ddepanis@login make_lastz_chains]$ cat parallelization/nextflow_wrapper.py """Module to manage Nextflow processes.""" import os import shutil import subprocess from constants import Constants from modules.make_chains_logging import to_log from modules.error_classes import NextflowProcessError class NextflowConfig: """Model for a config file.""" def __init__(self, executor, memory, time, label, config_dir, **kwargs): self.executor = executor self.memory = memory self.time = time self.label = label self.config_dir = config_dir self.queue = kwargs.get("queue", None) self.qos = kwargs.get("qos", "standard") # Added qos parameter with default value self.cpus = 1 # always a fixed number self.config_path = None self.queue_size = Constants.NextflowConstants.DEFAULT_QUEUE_SIZE def dump_to_file(self): """Write the respective config file,""" filename = f"{self.label}_config.nf" self.config_path = os.path.join(self.config_dir, filename) f = open(self.config_path, "w") f.write(f"// Nextflow config for {self.label} jobs\n") f.write(f"process.executor = '{self.executor}'\n") f.write(f"process.memory = '{self.memory} G'\n") f.write(f"process.time = '{self.time}'\n") f.write(f"process.cpus = '{self.cpus}'\n") if self.queue: f.write(f"process.queue = '{self.queue}'\n") f.write(f"executor.queueSize = '{self.queue_size}'\n") f.write(f"process.clusterOptions = '--qos={self.qos}'\n") # Write qos setting f.close() return self.config_path def remove_config(self): if self.config_path is None: return os.remove(self.config_path) if os.path.isfile(self.config_path) else None class NextflowWrapper: """ Nextflow manager. """ def __init__(self, nextflow_exec): self.nextflow_exec = nextflow_exec self._process = None self.joblist_path = None self.config_file = None self.config_instance = None self.return_code = None self.execute_dir = None self.label = None self.nf_master_script = Constants.NextflowConstants.NF_SCRIPT_PATH def execute(self, joblist_path: str, config_instance: NextflowConfig, execute_dir: str, wait=False, **kwargs): """Implementation for Nextflow.""" # define parameters self.joblist_path = joblist_path self.config_instance = config_instance self.execute_dir = execute_dir self.label = kwargs.get("label", "") # create the nextflow process self.config_file = config_instance.dump_to_file() cmd = f"{self.nextflow_exec} {self.nf_master_script} --joblist {joblist_path} -c {self.config_file}" os.makedirs(self.execute_dir, exist_ok=True) to_log(f"Parallel manager: pushing job {cmd}") self._process = subprocess.Popen(cmd, shell=True, # stdout=log_file, # stderr=log_file, cwd=self.execute_dir) if wait: self._process.wait() def _acquire_return_code(self): running = self._process.poll() is None if running: return self.return_code = self._process.returncode def check_status(self): """Check if nextflow jobs are done.""" if self.return_code: return self.return_code self._acquire_return_code() return self.return_code def check_failed(self, dont_clean_logs=False): self._acquire_return_code() if self.return_code is None: return if self.return_code == 0: to_log(f"\n### Nextflow process {self.label} finished successfully") return to_log(f"\n### Error! The nextflow process {self.label} crashed!") if dont_clean_logs is False: to_log(f"Please look at the logs in the {self.execute_dir}") else: self.cleanup() raise NextflowProcessError(f"Jobs for {self.label} at {self.joblist_path} died") def cleanup(self): """Nextflow produces a bunch of files: to be removed.""" nf_dir = os.path.join(self.execute_dir, ".nextflow") work_dir = os.path.join(self.execute_dir, "work") shutil.rmtree(nf_dir) shutil.rmtree(work_dir) self.config_instance.remove_config() def execute_nextflow_step(nextflow_exec, executor, memory_req, time_req, step_label, config_dir, queue, joblist, run_dir, qos="standard"): # Added qos parameter with default value """ Execute Nextflow Step Executes a Nextflow step using the specified parameters. Facilitates the cooperation between NextflowConfig and NextflowWrapper classes. Parameters: - nextflow_exec (str): The path to the Nextflow executable. - executor (str): The Nextflow executor to use. - memory_req (str): The memory requirement for the step. - time_req (str): The time requirement for the step. - step_label (str): The label for the step. - config_dir (str): The directory to find the Nextflow configuration files. - queue (str): The queue to submit the job to. - joblist (str): The list of jobs to execute. - run_dir (str): The directory where the Nextflow run will be executed. - qos (str): The quality of service for the job. Defaults to 'standard'. Returns: None Raises: - NextflowProcessError: If the Nextflow process fails. """ nextflow_config = NextflowConfig(executor, memory_req, time_req, step_label, config_dir=config_dir, queue=queue, qos=qos) nextflow_manager = NextflowWrapper(nextflow_exec) nextflow_manager.execute(joblist, nextflow_config, run_dir, wait=True, label=step_label) nextflow_manager.check_failed() nextflow_manager.cleanup() ``` `` conda install bioconda::bedparse conda install bioconda::ucsc-fatotwobit conda install bioconda::ucsc-twobitinfo ``` TOGA data downloading 2024 ``` ``` TOGA running script 2024 ``` ``` process.queue = 'begendiv,main' process.clusterOptions = '--qos=standard'