## 1. Read cleaning This cleaning code was used on each file: #SBATCH -J TrimGal #SBATCH -p core #SBATCH -n 1 #SBATCH -t 06:00:00 #SBATCH -A snic2020-5-20 #SBATCH --mail-user peter.pruisscher@ebc.uu.se #SBATCH --mail-type=ALL #load modules module load bioinfo-tools module load TrimGalore/0.4.1 module load cutadapt/1.9.1 module load FastQC/0.11.5 #module load MultiQC/0.9 #read name of files containing paired reads READ1=${1:?msg} READ2=${2:?msg} #location of output folder SRCDIR=/proj/uppstore2017185/b2014034/nobackup/PeterP/balancingselection/1_ReadTrimming/TrimmingQCOutputs #use local scratch disk #cd $SNIC_TMP #USAGE: trim_galore [options] <filename(s)> trim_galore \ --quality 30 \ --paired \ --illumina \ --phred33 \ --stringency 1 \ -e 0.1 \ --clip_R1 12 \ --clip_R2 12 \ --three_prime_clip_R1 3 \ --three_prime_clip_R2 3 \ --length 30 \ --gzip \ --output_dir $SRCDIR \ --fastqc \ $READ1 \ $READ2 #multiqc $SRCDIR/ # creates single report ## 2. Mapping #Prepare working directory: mkdir /proj/uppstore2017185/b2014034/nobackup/Aleix/recombination_test cd /proj/uppstore2017185/b2014034/nobackup/Aleix/recombination_test mkdir nextgenmap cd nextgenmap ln -s /proj/uppstore2017185/b2014034/private/raw_data/Genome_assembly_data/HiC_Lars_Hook/delivery03527/INBOX/P14503/01-Results/02-3DDNA/P14503_103_3DDNA.fasta ./ nano submit_mapping.sh echo echo "Starting Uppmax jobs ..." echo #input data files #PATH_RAW=/proj #path to where raw files are located (top folder) #PATH_MAIN=/proj/uppstore2017185/b2014034/nobackup/Aleix/recombination_test/nextgenmap/ #path to project folder (my analysis) RAW_FN=/proj/uppstore2017185/b2014034/nobackup/Aleix/recombination_test/nextgenmap/sample_list.txt #file with names of files containing raw reads N_LINES=$(wc -l < "$RAW_FN") N_SAMPLES=$((N_LINES / 2)) #number of samples echo $N_SAMPLES declare -A READS #declare variable as array i=1 j=1 while read -r LINE #reads each filename at a time, stores names (paired) in array do if [ "$j" -eq 1 ] then READS[$i,1]=$LINE let "j=2" else READS[$i,2]=$LINE let "i+=1" let "j=1" fi # echo $j#${READS[$i,$j]} done < ${RAW_FN} #echo $READS # prepare output folder cd /proj/uppstore2017185/b2014034/nobackup/Aleix/recombination_test/nextgenmap/ #cd to project folder for i in `seq 1 1 $N_SAMPLES`; do DNAFILE_A=${READS[$i,1]} DNAFILE_B=${READS[$i,2]} echo $DNAFILE_A #display fastq file names (paired) echo $DNAFILE_B sbatch /proj/uppstore2017185/b2014034/nobackup/Aleix/recombination_test/nextgenmap/run_mapping.sh $DNAFILE_A $DNAFILE_B #starts job sleep 1 #pauses for 1 sec echo done chmod u+x submit_mapping.sh # next script for mapping nano run_mapping.sh #!/bin/bash -l #SBATCH -A snic2020-5-20 #specify the job name #SBATCH --job-name=NextGenMap #how many cpus are requested #SBATCH --ntasks=19 #run on one node, importand if you have more than 1 ntasks #SBATCH -n 1 #SBATCH --time=30:05:00 #maximum requested memory #SBATCH --mem=128G #write std out and std error to these files #send a mail for job start, end, fail, etc. #SBATCH --mail-type=ALL #SBATCH --mail-user=aleix.palahi@ebc.uu.se #SBATCH -p node date #load modules: module load bioinfo-tools module load NextGenMap/0.5.4 READ1=${1:?msg} #read name of files containing paired reads READ2=${2:?msg} SRCDIR=/proj/uppstore2017185/b2014034/nobackup/Aleix/recombination_test/nextgenmap/${READ1:102:114}.bam #location of output folder #USAGE: ngm -r reference.fa -1 -2 -o file.bam -t 19 -p -b -i 0.65 ngm -r /proj/uppstore2017185/b2014034/private/raw_data/Genome_assembly_data/HiC_Lars_Hook/delivery03527/INBOX/P14503/01-Results/02-3DDNA/P14503_103_3DDNA.fasta \ -1 $READ1 \ -2 $READ2 \ -o $SRCDIR \ -t 19 -p -b -i 0.65 chmod u+x run_mapping.sh #run the mapping script: ./submit_mapping.sh ## Filter for correctly aligned pairs only, sort and index the bamfiles: cd /data/projects/pruisscher/poolseq/napi samtools flagstat ./ngm/Aiguamolls_ngm.bam > ./ngm/Aiguamolls_ngm.bam.flagstats & samtools flagstat ./ngm/Abisko_ngm.bam > ./ngm/Abisko_ngm.bam.flagstats & samtools view -f 3 -b ./ngm/Aiguamolls_ngm.bam > ./ngm/Aiguamolls_ngm.pairs.bam & samtools view -f 3 -b ./ngm/Abisko_ngm.bam > ./ngm/Abisko_ngm.pairs.bam & samtools flagstat ./ngm/Aiguamolls_ngm.pairs.bam > ./ngm/Aiguamolls_ngm.pairs.bam.flagstats & samtools flagstat ./ngm/Abisko_ngm.pairs.bam > ./ngm/Abisko_ngm.pairs.bam.flagstats & samtools sort ./ngm/Aiguamolls_ngm.pairs.bam ./ngm/Aiguamolls_ngm.pairs.sorted & samtools sort ./ngm/Abisko_ngm.pairs.bam ./ngm/Abisko_ngm.pairs.sorted & samtools index ./ngm/Aiguamolls_ngm.pairs.sorted.bam & samtools index ./ngm/Abisko_ngm.pairs.sorted.bam & #### Mapping check by comparing read depth across the genome using idxstats: ## Prepare sorted and indexed bamfile for samtools: cd /data/projects/pruisscher/poolseq/pararge samtools sort ./ngm/Aiguamolls_ngm.bam ./ngm/Aiguamolls_ngm.sorted & samtools sort ./ngm/Abisko_ngm.bam ./ngm/Abisko_ngm.sorted & samtools index ./ngm/Aiguamolls_ngm.sorted.bam & samtools index ./ngm/Abisko_ngm.sorted.bam & ## Run samtools idxstats for quality check: samtools idxstats ./ngm/Aiguamolls_ngm.sorted.bam > ./ngm/Aiguamolls_ngm.sorted.bam.idxstats samtools idxstats ./ngm/Abisko_ngm.sorted.bam > ./ngm/Abisko_ngm.sorted.bam.idxstats join ./ngm/Aiguamolls_ngm.sorted.bam.idxstats ./ngm/Abisko_ngm.sorted.bam.idxstats > ./ngm/AiguamollsAbisko.idxstats ## Run samtools idxstats for quality check on the paired reads: samtools idxstats ./ngm/Aiguamolls_ngm.pairs.sorted.bam > ./ngm/Aiguamolls_ngm.pairs.sorted.bam.idxstats samtools idxstats ./ngm/Abisko_ngm.pairs.sorted.bam > ./ngm/Abisko_ngm.pairs.sorted.bam.idxstats join ./ngm/Aiguamolls_ngm.pairs.sorted.bam.idxstats ./ngm/Abisko_ngm.pairs.sorted.bam.idxstats > ./ngm/AiguamollsAbisko.pairs.idxstats