## 1. Read cleaning
This cleaning code was used on each file:
#SBATCH -J TrimGal
#SBATCH -p core
#SBATCH -n 1
#SBATCH -t 06:00:00
#SBATCH -A snic2020-5-20
#SBATCH --mail-user peter.pruisscher@ebc.uu.se
#SBATCH --mail-type=ALL
#load modules
module load bioinfo-tools
module load TrimGalore/0.4.1
module load cutadapt/1.9.1
module load FastQC/0.11.5
#module load MultiQC/0.9
#read name of files containing paired reads
READ1=${1:?msg}
READ2=${2:?msg}
#location of output folder
SRCDIR=/proj/uppstore2017185/b2014034/nobackup/PeterP/balancingselection/1_ReadTrimming/TrimmingQCOutputs
#use local scratch disk
#cd $SNIC_TMP
#USAGE: trim_galore [options] <filename(s)>
trim_galore \
--quality 30 \
--paired \
--illumina \
--phred33 \
--stringency 1 \
-e 0.1 \
--clip_R1 12 \
--clip_R2 12 \
--three_prime_clip_R1 3 \
--three_prime_clip_R2 3 \
--length 30 \
--gzip \
--output_dir $SRCDIR \
--fastqc \
$READ1 \
$READ2
#multiqc $SRCDIR/ # creates single report
## 2. Mapping
#Prepare working directory:
mkdir /proj/uppstore2017185/b2014034/nobackup/Aleix/recombination_test
cd /proj/uppstore2017185/b2014034/nobackup/Aleix/recombination_test
mkdir nextgenmap
cd nextgenmap
ln -s /proj/uppstore2017185/b2014034/private/raw_data/Genome_assembly_data/HiC_Lars_Hook/delivery03527/INBOX/P14503/01-Results/02-3DDNA/P14503_103_3DDNA.fasta ./
nano submit_mapping.sh
echo
echo "Starting Uppmax jobs ..."
echo
#input data files
#PATH_RAW=/proj #path to where raw files are located (top folder)
#PATH_MAIN=/proj/uppstore2017185/b2014034/nobackup/Aleix/recombination_test/nextgenmap/ #path to project folder (my analysis)
RAW_FN=/proj/uppstore2017185/b2014034/nobackup/Aleix/recombination_test/nextgenmap/sample_list.txt #file with names of files containing raw reads
N_LINES=$(wc -l < "$RAW_FN")
N_SAMPLES=$((N_LINES / 2))
#number of samples
echo $N_SAMPLES
declare -A READS #declare variable as array
i=1
j=1
while read -r LINE #reads each filename at a time, stores names (paired) in array
do
if [ "$j" -eq 1 ]
then
READS[$i,1]=$LINE
let "j=2"
else
READS[$i,2]=$LINE
let "i+=1"
let "j=1"
fi
# echo $j#${READS[$i,$j]}
done < ${RAW_FN}
#echo $READS
# prepare output folder
cd /proj/uppstore2017185/b2014034/nobackup/Aleix/recombination_test/nextgenmap/ #cd to project folder
for i in `seq 1 1 $N_SAMPLES`; do
DNAFILE_A=${READS[$i,1]}
DNAFILE_B=${READS[$i,2]}
echo $DNAFILE_A #display fastq file names (paired)
echo $DNAFILE_B
sbatch /proj/uppstore2017185/b2014034/nobackup/Aleix/recombination_test/nextgenmap/run_mapping.sh $DNAFILE_A $DNAFILE_B #starts job
sleep 1 #pauses for 1 sec
echo
done
chmod u+x submit_mapping.sh
# next script for mapping
nano run_mapping.sh
#!/bin/bash -l
#SBATCH -A snic2020-5-20
#specify the job name
#SBATCH --job-name=NextGenMap
#how many cpus are requested
#SBATCH --ntasks=19
#run on one node, importand if you have more than 1 ntasks
#SBATCH -n 1
#SBATCH --time=30:05:00
#maximum requested memory
#SBATCH --mem=128G
#write std out and std error to these files
#send a mail for job start, end, fail, etc.
#SBATCH --mail-type=ALL
#SBATCH --mail-user=aleix.palahi@ebc.uu.se
#SBATCH -p node
date
#load modules:
module load bioinfo-tools
module load NextGenMap/0.5.4
READ1=${1:?msg} #read name of files containing paired reads
READ2=${2:?msg}
SRCDIR=/proj/uppstore2017185/b2014034/nobackup/Aleix/recombination_test/nextgenmap/${READ1:102:114}.bam #location of output folder
#USAGE: ngm -r reference.fa -1 -2 -o file.bam -t 19 -p -b -i 0.65
ngm -r /proj/uppstore2017185/b2014034/private/raw_data/Genome_assembly_data/HiC_Lars_Hook/delivery03527/INBOX/P14503/01-Results/02-3DDNA/P14503_103_3DDNA.fasta \
-1 $READ1 \
-2 $READ2 \
-o $SRCDIR \
-t 19 -p -b -i 0.65
chmod u+x run_mapping.sh
#run the mapping script:
./submit_mapping.sh
## Filter for correctly aligned pairs only, sort and index the bamfiles:
cd /data/projects/pruisscher/poolseq/napi
samtools flagstat ./ngm/Aiguamolls_ngm.bam > ./ngm/Aiguamolls_ngm.bam.flagstats &
samtools flagstat ./ngm/Abisko_ngm.bam > ./ngm/Abisko_ngm.bam.flagstats &
samtools view -f 3 -b ./ngm/Aiguamolls_ngm.bam > ./ngm/Aiguamolls_ngm.pairs.bam &
samtools view -f 3 -b ./ngm/Abisko_ngm.bam > ./ngm/Abisko_ngm.pairs.bam &
samtools flagstat ./ngm/Aiguamolls_ngm.pairs.bam > ./ngm/Aiguamolls_ngm.pairs.bam.flagstats &
samtools flagstat ./ngm/Abisko_ngm.pairs.bam > ./ngm/Abisko_ngm.pairs.bam.flagstats &
samtools sort ./ngm/Aiguamolls_ngm.pairs.bam ./ngm/Aiguamolls_ngm.pairs.sorted &
samtools sort ./ngm/Abisko_ngm.pairs.bam ./ngm/Abisko_ngm.pairs.sorted &
samtools index ./ngm/Aiguamolls_ngm.pairs.sorted.bam &
samtools index ./ngm/Abisko_ngm.pairs.sorted.bam &
#### Mapping check by comparing read depth across the genome using idxstats:
## Prepare sorted and indexed bamfile for samtools:
cd /data/projects/pruisscher/poolseq/pararge
samtools sort ./ngm/Aiguamolls_ngm.bam ./ngm/Aiguamolls_ngm.sorted &
samtools sort ./ngm/Abisko_ngm.bam ./ngm/Abisko_ngm.sorted &
samtools index ./ngm/Aiguamolls_ngm.sorted.bam &
samtools index ./ngm/Abisko_ngm.sorted.bam &
## Run samtools idxstats for quality check:
samtools idxstats ./ngm/Aiguamolls_ngm.sorted.bam > ./ngm/Aiguamolls_ngm.sorted.bam.idxstats
samtools idxstats ./ngm/Abisko_ngm.sorted.bam > ./ngm/Abisko_ngm.sorted.bam.idxstats
join ./ngm/Aiguamolls_ngm.sorted.bam.idxstats ./ngm/Abisko_ngm.sorted.bam.idxstats > ./ngm/AiguamollsAbisko.idxstats
## Run samtools idxstats for quality check on the paired reads:
samtools idxstats ./ngm/Aiguamolls_ngm.pairs.sorted.bam > ./ngm/Aiguamolls_ngm.pairs.sorted.bam.idxstats
samtools idxstats ./ngm/Abisko_ngm.pairs.sorted.bam > ./ngm/Abisko_ngm.pairs.sorted.bam.idxstats
join ./ngm/Aiguamolls_ngm.pairs.sorted.bam.idxstats ./ngm/Abisko_ngm.pairs.sorted.bam.idxstats > ./ngm/AiguamollsAbisko.pairs.idxstats