# Create Mouse RNA-seq
### Environment
* Environment is named `BCH709_RNASeq`
```
conda create -y -n BCH709_RNASeq -c bioconda -c conda-forge -c anaconda python=3.7 mamba
conda activate BCH709_RNASeq
mamba install -c bioconda -c conda-forge -c anaconda trim-galore=0.6.7 sra-tools=2.11.0 STAR htseq=1.99.2 subread=2.0.1 multiqc=1.11 snakemake=7.5.0 parallel-fastq-dump=0.6.7 bioconductor-tximport samtools=1.14 r-ggplot2 trinity=2.13.2 hisat2 bioconductor-qvalue sambamba graphviz gffread tpmcalculator lxml rsem
```
### Working Directory
* Working directory will be `mouse` with sub directories like `fastq`, `ref`, `trim`, etc.
```
cd /data/gpfs/assoc/bch709-4/joelshin/mouse/{working_directory}
```
### Reference directory
* Reference Directory is called `ref`
```
cd /data/gpfs/assoc/bch709-4/joelshin/mouse/ref
```
* Download reference mouse files in this directory
```
wget https://hgdownload.soe.ucsc.edu/goldenPath/mm39/bigZips/mm39.fa.gz
wget https://hgdownload.soe.ucsc.edu/goldenPath/mm39/bigZips/genes/refGene.gtf.gz
```
* Decompress the fasta and gtf files
```
gunzip mm39.fa.gz
gunzip refGene.gtf.gz
```
* Copy a batch file from course material
```
cp /data/gpfs/assoc/bch709-4/Course_materials/mouse/run.sh /data/gpfs/assoc/bch709-4/joelshin/mouse/ref/ref_build.sh
```
* Change configuration:
1) Job name too - STAR_Mouse_joelshin
2) Change email too - joelshin4@gmail.com
3) Change memory too - 64g
4) Add this command to the bottom of the slurm file:
```
STAR --runThreadN 4 --runMode genomeGenerate --genomeDir . --genomeFastaFiles mm39.fa --sjdbGTFfile refGene.gtf --sjdbOverhang 99 --genomeSAindexNbases 12
```
* It should look something like this:
```
#!/bin/bash
#SBATCH --job-name=STAR_Mouse_Joel
#SBATCH --cpus-per-task=4
#SBATCH --mem=64g
#SBATCH --time=2-15:00:00
#SBATCH --mail-type=all
#SBATCH --mail-user=joelshin@nevada.unr.edu
#SBATCH -o log.%x.%j.out # STDOUT & STDERR
#SBATCH -p cpu-core-0
#SBATCH -A cpu-s5-bch709-4
####SBATCH --dependency=afterok:<PREVIOUS_JOBID>
STAR --runThreadN 4 --runMode genomeGenerate --genomeDir . --genomeFastaFiles mm39.fa --sjdbGTFfile refGene.gtf --sjdbOverhang 99 --genomeSAindexNbases 12
```
* Submit the slurm file
```
sbatch ref_build.sh
```
### FASTQ File
* Change directory
```
cd /data/gpfs/assoc/bch709-4/joelshin/mouse/
```
* Link all fastq files from course materials to your made fastq directory
```
ln -s /data/gpfs/assoc/bch709-4/Course_materials/mouse/fastq/* /data/gpfs/assoc/bch709-4/joelshin/mouse/fastq
```
* Look at all the list of fastq files
```
cd /data/gpfs/assoc/bch709-4/joelshin/mouse/fastq
ls
12WK_R6-2_Rep_1_R1.fastq.gz 1YR_zQ175_Rep_3_R2.fastq.gz
12WK_R6-2_Rep_1_R2.fastq.gz
...
...
...
...
```
* Create file list that contains the sorted and organized files
```
ls -1 *.gz | sed 's/_R.\.fastq\.gz//g' | sort -u > /data/gpfs/assoc/bch709-4/joelshin/mouse/filelist
```
* Look at the filelist
```
cat /data/gpfs/assoc/bch709-4/joelshin/mouse/filelist
12WK_R6-2_Rep_1
12WK_R6-2_Rep_2
12WK_R6-2_Rep_3
12WK_R6-2_Rep_4
12WK_R6-2_Rep_5
12WK_R6-2_Rep_6
12WK_R6-2_Rep_7
12WK_WT_-R6-2_Littermate_Rep_1
12WK_WT_-R6-2_Littermate_Rep_2
12WK_WT_-R6-2_Littermate_Rep_3
12WK_WT_-R6-2_Littermate_Rep_4
12WK_WT_-R6-2_Littermate_Rep_5
12WK_WT_-R6-2_Littermate_Rep_6
12WK_WT_-R6-2_Littermate_Rep_7
12WK_WT_-zQ175_Littermate_Rep_1
12WK_WT_-zQ175_Littermate_Rep_2
12WK_WT_-zQ175_Littermate_Rep_3
12WK_WT_-zQ175_Littermate_Rep_4
12WK_WT_-zQ175_Littermate_Rep_5
12WK_WT_-zQ175_Littermate_Rep_6
12WK_zQ175_Rep_1
12WK_zQ175_Rep_2
12WK_zQ175_Rep_3
12WK_zQ175_Rep_4
12WK_zQ175_Rep_5
12WK_zQ175_Rep_6
1YR_WT_-zQ175_Littermate_Rep_2
1YR_WT_-zQ175_Littermate_Rep_3
1YR_WT_-zQ175_Littermate_Rep_4
1YR_WT_-zQ175_Littermate_Rep_5
1YR_zQ175_Rep_2
1YR_zQ175_Rep_3
1YR_zQ175_Rep_4
1YR_zQ175_Rep_5
1YR_zQ175_Rep_6
6WK_R6-2_Rep_1
6WK_R6-2_Rep_2
6WK_R6-2_Rep_3
6WK_R6-2_Rep_4
6WK_R6-2_Rep_5
6WK_R6-2_Rep_6
6WK_WT_-R6-2_Littermate_Rep_1
6WK_WT_-R6-2_Littermate_Rep_2
6WK_WT_-R6-2_Littermate_Rep_3
6WK_WT_-R6-2_Littermate_Rep_4
6WK_WT_-R6-2_Littermate_Rep_6
```
### Trim the reads
* copy trim slurm file form course material
```
cp /data/gpfs/assoc/bch709-4/Course_materials/mouse/run.sh /data/gpfs/assoc/bch709-4/joelshin/mouse/fastq/trim.sh
```
* Change configuration of the trim slrum file
```
sed -i "s/\-\-cpus\-per\-task\=2/\-\-cpus\-per\-task\=4/g; s/\[NAME\]/Trim/g; s/\[youremail\]/${USER}\@unr.edu\,${USER}\@nevada.unr.edu/g" /data/gpfs/assoc/bch709-4/${USER}/mouse/fastq/trim.sh
```
* merge trim-galore command for each of the reads with their respective slurm command
```
for i in `cat ../filelist`
do
read1=${i}_R1.fastq.gz
read2=${read1//_R1.fastq.gz/_R2.fastq.gz}
echo $read1 $read2
echo "trim_galore --paired --three_prime_clip_R1 5 --three_prime_clip_R2 5 --cores 2 --max_n 40 --fastqc --gzip -o /data/gpfs/assoc/bch709-4/${USER}/mouse/trim $read1 $read2" | cat trim.sh - > ${i}_trim.sh
done
```
* submit the jobs
```
for i in `ls -1 *.sh`
do
sbatch $i
done
```
* check the jobs
```
squeue -u joelshin
```
### RNA-Seq Alignment
* Change to trim folder
```
cd /data/gpfs/assoc/bch709-4/joelshin/mouse/trim
```
* copy mapping slurm job file from course material to trim folder
```
cp /data/gpfs/assoc/bch709-4/Course_materials/mouse/run.sh /data/gpfs/assoc/bch709-4/${USER}/mouse/trim/mapping.sh
```
* Change configuration of the mapping slurm file
```
sed -i "s/16g/64g/g; s/\-\-cpus\-per\-task\=2/\-\-cpus\-per\-task\=4/g; s/\[NAME\]/Trim/g; s/\[youremail\]/${USER}\@unr.edu\,${USER}\@nevada.unr.edu/g" /data/gpfs/assoc/bch709-4/${USER}/mouse/trim/mapping.sh
```
* Create RNA-Seq alignment batch file
```
for i in `cat ../filelist`
do
read1=${i}_R1_val_1.fq.gz
read2=${read1//_R1_val_1.fq.gz/_R2_val_2.fq.gz}
echo $read1 $read2
echo "STAR --runMode alignReads --runThreadN 4 --outFilterMultimapNmax 100 --alignIntronMin 25 --alignIntronMax 50000 --genomeDir /data/gpfs/assoc/bch709-4/${USER}/mouse/ref --readFilesCommand gunzip -c --readFilesIn /data/gpfs/assoc/bch709-4/${USER}/mouse/trim/${read1} /data/gpfs/assoc/bch709-4/${USER}/mouse/trim/${read2} --outSAMtype BAM SortedByCoordinate --outFileNamePrefix /data/gpfs/assoc/bch709-4/${USER}/mouse/bam/${i}.bam"
done
```
* Check out bam files created. Go into the bam file directory and submit the jobs
```
cd ../
cd bam
or
cd /data/gpfs/assoc/bch709-4/joelshin/mouse/bam
for i in `ls -1 *_mapping.sh`
do
sbatch $i
done
```
### Create FeatureCounts file
* Head into your bam working directory and check out all bam files created
```
cd /data/gpfs/assoc/bch709-4/joelshin/mouse/bam
ls -1 *.bam
12WK_R6-2_Rep_1.bamAligned.sortedByCoord.out.bam
12WK_R6-2_Rep_2.bamAligned.sortedByCoord.out.bam
12WK_R6-2_Rep_3.bamAligned.sortedByCoord.out.bam
12WK_R6-2_Rep_4.bamAligned.sortedByCoord.out.bam
...
...
...
...
```
* Copy count slurm files from course material
```
cp /data/gpfs/assoc/bch709-4/Course_materials/mouse/run.sh /data/gpfs/assoc/bch709-4/${USER}/mouse/bam/count.sh
```
* Change and configure settings of the slurm file
```
sed -i "s/16g/64g/g; s/\-\-cpus\-per\-task\=2/\-\-cpus\-per\-task\=4/g; s/\[NAME\]/Count/g; s/\[youremail\]/${USER}\@unr.edu\,${USER}\@nevada.unr.edu/g" /data/gpfs/assoc/bch709-4/${USER}/mouse/bam/count.sh
```
* Change `count.sh` batch file and add this to the file
```
featureCounts -o /data/gpfs/assoc/bch709-4/joelshin/mouse/readcount/featurecount -T 4 -Q 1 -p -M -g gene_id -a /data/gpfs/assoc/bch709-4/joelshin/mouse/ref/refGene.gtf $(for i in `cat /data/gpfs/assoc/bch709-4/joelshin/mouse/filelist`; do echo ${i}.bamAligned.sortedByCoord.out.bam| tr '\n' ' ';done)
```
* Your `count.sh` file should look like this
```
#!/bin/bash
#SBATCH --job-name=FeatureCounts_Mouse_joelshin
#SBATCH --cpus-per-task=4
#SBATCH --mem=64g
#SBATCH --time=2-15:00:00
#SBATCH --mail-type=all
#SBATCH --mail-user=joelshin@unr.edu,joelshin@nevada.unr.edu
#SBATCH -o log.%x.%j.out # STDOUT & STDERR
#SBATCH -p cpu-core-0
#SBATCH -A cpu-s5-bch709-4
####SBATCH --dependency=afterok:<PREVIOUS_JOBID>
featureCounts -o /data/gpfs/assoc/bch709-4/joelshin/mouse/readcount/featurecount -T 4 -Q 1 -p -M -g gene_id -a /data/gpfs/assoc/bch709-4/joelshin/mouse/ref/refGene.gtf $(for i in `cat /data/gpfs/assoc/bch709-4/joelshin/mouse/filelist`; do echo ${i}.bamAligned.sortedByCoord.out.bam| tr '\n' ' ';done)
```
* Submit the batch file
```
jobid=$(squeue --noheader --format %i --user joelshin | tr '\n' ':'| sed 's/:$//g')
sbatch --dependency=afterany:${jobid} count.sh
```
* Check progress
```
squeue -u joelshin
```
* Submit a feature counts multiqc file
```
multiqc --filename featurecounts_report .
```
### Human portion
```
cd /data/gpfs/assoc/bch709-4/joelshin
mkdir human
mkdir /data/gpfs/assoc/bch709-4/joelshin/human/fastq
mkdir /data/gpfs/assoc/bch709-4/joelshin/human/ref
mkdir /data/gpfs/assoc/bch709-4/joelshin/human/trim
mkdir /data/gpfs/assoc/bch709-4/joelshin/human/bam
mkdir /data/gpfs/assoc/bch709-4/joelshin/human/readcount
mkdir /data/gpfs/assoc/bch709-4/joelshin/human/DEG
cd /data/gpfs/assoc/bch709-4/joelshin/ref
wget https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/genes/hg38.refGene.gtf.gz
wget https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz
gunzip hg38.fa.gz
gunzip hg38.refGene.gtf.gz
cp /data/gpfs/assoc/bch709-4/Course_materials/human/run.sh /data/gpfs/assoc/bch709-4/${USER}/human/ref/ref_build.sh
sed -i "s/16g/64g/g; s/\-\-cpus\-per\-task\=2/\-\-cpus\-per\-task\=4/g; s/\[NAME\]/ref_build/g; s/\[youremail\]/${USER}\@unr.edu\,${USER}\@nevada.unr.edu/g" /data/gpfs/assoc/bch709-4/${USER}/human/ref/ref_build.sh
nano ref_build.sh
<This is my ref_build.sh>
#!/bin/bash
#SBATCH --job-name=STAR_Human_joelshin
#SBATCH --cpus-per-task=4
#SBATCH --mem=64g
#SBATCH --time=2-15:00:00
#SBATCH --mail-type=all
#SBATCH --mail-user=joelshin4@gmail.com
#SBATCH -o log.%x.%j.out # STDOUT & STDERR
#SBATCH -p cpu-core-0
#SBATCH -A cpu-s5-bch709-4
####SBATCH --dependency=afterok:<PREVIOUS_JOBID>
STAR --runThreadN 4 --runMode genomeGenerate --genomeDir . --genomeFastaFiles hg38.fa --sjdbGTFfile hg38.refGene.gtf --sjdbOverhang 99 --genomeSAindexNbases 12
sbatch ref_build.sh
cd /data/gpfs/assoc/bch709-4/joelshin/human/
ln -s /data/gpfs/assoc/bch709-4/Course_materials/human/fastq/* /data/gpfs/assoc/bch709-4/joelshin/human/fastq
ls /data/gpfs/assoc/bch709-4/joelshin/human/fastq
cd /data/gpfs/assoc/bch709-4/joelshin/human/fastq
ls -1 *.gz | sed 's/_R.\.fastq\.gz//g' | sort -u > /data/gpfs/assoc/bch709-4/joelshin/human/filelist
cp /data/gpfs/assoc/bch709-4/Course_materials/human/run.sh /data/gpfs/assoc/bch709-4/${USER}/human/fastq/trim.sh
sed -i "s/16g/64g/g; s/\-\-cpus\-per\-task\=2/\-\-cpus\-per\-task\=4/g; s/\[NAME\]/Trim/g; s/\[youremail\]/${USER}\@unr.edu\,${USER}\@nevada.unr.edu/g" /data/gpfs/assoc/bch709-4/${USER}/human/fastq/trim.sh
for i in `cat ../filelist`
do
read1=${i}_R1.fastq.gz
read2=${read1//_R1.fastq.gz/_R2.fastq.gz}
echo "trim_galore --paired --three_prime_clip_R1 5 --three_prime_clip_R2 5 --cores 2 --max_n 40 --fastqc --gzip -o /data/gpfs/assoc/bch709-4/${USER}/human/trim $read1 $read2" | cat trim.sh - > ${i}_trim.sh
echo "$read1 $read2 trim file has been created."
done
for i in `ls -1 *_trim.sh`
do
sbatch $i
done
squeue -u joelshin
cd /data/gpfs/assoc/bch709-4/joelshin/human/trim
cp /data/gpfs/assoc/bch709-4/Course_materials/human/run.sh /data/gpfs/assoc/bch709-4/joelshin/human/trim/mapping.sh
sed -i "s/16g/64g/g; s/\-\-cpus\-per\-task\=2/\-\-cpus\-per\-task\=4/g; s/\[NAME\]/Mapping/g; s/\[youremail\]/${USER}\@unr.edu\,${USER}\@nevada.unr.edu/g" /data/gpfs/assoc/bch709-4/${USER}/human/trim/mapping.sh
for i in `cat ../filelist`
do
read1=${i}_R1_val_1.fq.gz
read2=${read1//_R1_val_1.fq.gz/_R2_val_2.fq.gz}
echo $read1 $read2
echo "STAR --runMode alignReads --runThreadN 4 --outFilterMultimapNmax 100 --alignIntronMin 25 --alignIntronMax 50000 --genomeDir /data/gpfs/assoc/bch709-4/${USER}/human/ref --outSAMtype BAM SortedByCoordinate --readFilesCommand gunzip -c --readFilesIn /data/gpfs/assoc/bch709-4/${USER}/human/trim/${read1} /data/gpfs/assoc/bch709-4/${USER}/human/trim/${read2} --outFileNamePrefix /data/gpfs/assoc/bch709-4/${USER}/human/bam/${i}.bam" | cat mapping.sh - > ${i}_mapping.sh
done
for i in `ls -1 *_mapping.sh`
do
sbatch $i
done
cd /data/gpfs/assoc/bch709-4/joelshin/human/bam
cp /data/gpfs/assoc/bch709-4/Course_materials/human/run.sh /data/gpfs/assoc/bch709-4/joelshin/human/bam/count.sh
sed -i "s/16g/64g/g; s/\-\-cpus\-per\-task\=2/\-\-cpus\-per\-task\=4/g; s/\[NAME\]/Count/g; s/\[youremail\]/${USER}\@unr.edu\,${USER}\@nevada.unr.edu/g" /data/gpfs/assoc/bch709-4/${USER}/human/bam/count.sh
cd /data/gpfs/assoc/bch709-4/${USER}/human/bam
for i in `cat /data/gpfs/assoc/bch709-4/wyim/human/filelist`
do
echo ${i}.bamAligned.sortedByCoord.out.bam | tr '\n' ' '
done
<count.sh Batch file>
#!/bin/bash
#SBATCH --job-name=Count_Human_joelshin
#SBATCH --cpus-per-task=4
#SBATCH --mem=64g
#SBATCH --time=2-15:00:00
#SBATCH --mail-type=all
#SBATCH --mail-user=joelshin4@gmail.com
#SBATCH -o log.%x.%j.out # STDOUT & STDERR
#SBATCH -p cpu-core-0
#SBATCH -A cpu-s5-bch709-4
####SBATCH --dependency=afterok:<PREVIOUS_JOBID>
featureCounts -o /data/gpfs/assoc/bch709-4/joelshin/human/readcount/featurecount -T 4 -Q 1 -p -M -g gene_id -a /data/gpfs/assoc/bch709-4/joelshin/human/ref/hg38.refGene.gtf $(for i in `cat /data/gpfs/assoc/bch709-4/joelshin/human/filelist`; do echo ${i}.bamAligned.sortedByCoord.out.bam| tr '\n' ' ';done)
jobid=$(squeue --noheader --format %i --user ${USER} | tr '\n' ':'| sed 's/:$//g')
sbatch --dependency=afterany:${jobid} count.sh
```