# Sequencing Data Analysis Process
https://chungtsai.medium.com/%E7%B2%BE%E6%BA%96%E9%86%AB%E7%99%82-variant-calling-%E7%9A%84%E6%88%B0%E6%B3%81%E5%88%86%E6%9E%90-97e77d0730c8
https://wiki.bits.vib.be/index.php/Introduction_to_Linux_for_bioinformatics#Count_all_the_variants_in_three_vcf_files

### 1. Fastqc: check sequencing quality
install at 172.22.116.52
```
#!/usr/bin/bash
INPUT_DIR=/data4/DRAGEN_upload_temp/renamed/clean/ # Revise the "INPUT_DIR" path
OUTPUT_DIR=/data4/DRAGEN_upload_temp/renamed/clean/FASTQC/ # Revise the "OUTPUT_DIR" path
FASTQ_for_find=*.fastq.gz # Revise the filename for the command find
if [ ! -d "${INPUT_DIR}" ]; then
echo "[Warning] Input directory doesn't exist !"
exit 4
else
echo "--------------------------------------------- fastqc start ! ---------------------------------------------"
fi
if [ ! -d ${OUTPUT_DIR} ]; then
mkdir -p -m 770 ${OUTPUT_DIR}
fi
echo "Input directory = ${INPUT_DIR}"
echo "Output directory = ${OUTPUT_DIR}"
FASTQ_FILES=$(find ${INPUT_DIR} -maxdepth 1 -type f -name ${FASTQ_for_find})
FASTQ_for_FASTQC=""
for FILE in ${FASTQ_FILES} ; do
FASTQ_for_FASTQC=${FASTQ_for_FASTQC}"${FILE} "
done
echo "fastq files for FastQC : ${FASTQ_for_FASTQC}"
echo "fastqc -t 8 ${FASTQ_for_FASTQC} -o ${OUTPUT_DIR}"
fastqc -t 8 ${FASTQ_for_FASTQC} -o ${OUTPUT_DIR} #QC report html file
```
### 2. Trimmomatic: trim adapter (trim & pair)
install at 172.22.116.52
```
INPUT="/data4/una/pca/"
OUTPUT="/data4/DRAGEN_upload_temp/PCa_shao"
if [ ! -d "$OUTPUT" ]; then
# Control will enter here if $DIRECTORY doesn't exist
mkdir -m 755 $OUTPUT
fi
echo Input Path = $INPUT
echo Output Path = $OUTPUT
#for infile in $INPUT/*
#do
#cd $infile
for inp in $INPUT/*R1.fastq.gz
do
echo $inp
SAMPLE=$(basename $inp _R1.fastq.gz)
echo $SAMPLE
java -jar /home/Trimmomatic-0.39/trimmomatic-0.39.jar PE -threads 8 -phred33\
$INPUT/$SAMPLE\_R1.fastq.gz $INPUT/$SAMPLE\_R2.fastq.gz\
$OUTPUT/$SAMPLE\_R1_paired.fastq.gz $OUTPUT/$SAMPLE\_R1_unpaired.fastq.gz\
$OUTPUT/$SAMPLE\_R2_paired.fastq.gz $OUTPUT/$SAMPLE\_R2_unpaired.fastq.gz\
ILLUMINACLIP:/home/Trimmomatic-0.39/adapters/WES_GG_uni.fa:2:30:10:2:True\
SLIDINGWINDOW:4:15 LEADING:3 TRAILING:3 MINLEN:36
done
#done
echo "********** Trimmomatic done **********"
```
Note: If QC of 'Per Tile Sequence Quality' fails after trimming adapters, FilterByTile (BBMap package) can further remove Low-Quality Reads caused by bubbles in flowcells without adding bias.
https://www.seqanswers.com/forum/bioinformatics/bioinformatics-aa/60180-introducing-filterbytile-remove-low-quality-reads-without-adding-bias
https://sequencing.qcfail.com/articles/position-specific-failures-of-flowcells/
https://sourceforge.net/projects/bbmap/
```
filterbytile.sh in1=r1.fq in2=r2.fq out1=filtered1.fq out2=filtered2.fq ud=0.75 qd=1 ed=1 ua=.5 qa=.5 ea=.5
```
### 3. estimate coverage (BBMap package)
install at 172.22.116.52
https://www.jianshu.com/p/cf4c64ac4610
```
INPUT="/data4/una/pca/trim3"
echo Input Path = $INPUT
for inp in $INPUT/*R1_paired.fastq.gz
do
echo $inp
SAMPLE=$(basename $inp _R1_paired.fastq.gz)
echo $SAMPLE
reformat.sh in1=$INPUT/$SAMPLE\_R1_paired.fastq.gz in2=$INPUT/$SAMPLE\_R2_paired.fastq.gz
done
```
Input: 50943516 reads 7688577992 bases
Output: 50943516 reads (100.00%) 7688577992 bases (100.00%)
Time: 65.668 seconds.
Reads Processed: 50943k 775.77k reads/sec
Bases Processed: 7688m 117.08m bases/sec
### 4. upload files to Illumina website
install at 172.22.116.71
https://developer.basespace.illumina.com/docs/content/documentation/cli/cli-examples#UploadFASTQfilesthatmatchthebiosamplename
```
list=('A19bsA129' 'A19bsA442' 'A19bsA551' 'A19bsA129' 'A19bsA389' 'A19bsA123' 'A19bsA689' 'A19bsA186' 'A19bsA993' 'A19bsA997' 'A19bsA998' 'A19bsA975' 'A19bsA953' 'A19bsA950' 'A19bsA901' 'A19bsA913' 'A20bsA213' 'A20bsA334' 'A20bsA635' 'A20bsA354' 'A20bsA302' 'A20bsA159' 'A20bsA202' 'A20bsA281')
for i in ${list[@]}
do
echo $i
file=$(find . -type f -name "$i*_R1*")
file2=$(find . -type f -name "$i*_R2*")
echo $file
mv $file $i\_S1_L004_R1_001.fastq.gz
mv $file2 $i\_S1_L004_R2_001.fastq.gz
done
$ bs upload dataset -p <Project Number> --recursive .
```
### 5. Mapping & SNV calling: Illumina Dragen germline/GATK -->VCF file get!
https://www.prismabiotech.com.tw/post/iillumina%E7%94%9F%E6%85%8B%E7%B3%BB%E4%B8%AD%E8%9E%8D%E5%90%88%E5%9F%BA%E5%9B%A0%E6%AA%A2%E6%B8%AC%E5%B7%A5%E5%85%B7%E6%95%B4%E7%90%86
https://github.com/Illumina/manta
### 6. Download vcf file of each sample from Illumina
https://weitinglin.com/2016/01/27/sambam-and-cram/
```
$ bs download project -i <ProjectID> --extension bam --extension bai --extension vcf.gz --extension vcf.gz.tbi -o <Folder>
```
### 7. Filter data depth <10 (vcftools)
```
Parameters as interpreted:
--gzvcf /data4/una/hypospadias/A19bsA551_ds.eae194c8080943d4a96c951b9073c2b7/A19bsA551.hard-filtered.vcf.gz
--recode-INFO-all
--minDP 10
--minGQ 20
--out /data4/una/filtered_hypospadias/A19bsA551
--recode
```
DP : read depth at this position for this sample (Integer)
GQ : conditional genotype quality, encoded as a phred quality −10log10p (genotype call is wrong, conditioned on the site’s being variant) (Integer)
https://samtools.github.io/hts-specs/VCFv4.2.pdf
```
vcftools --gzvcf /data3/prostate_cancer/DRAGEN_results/1050829-5B_ds.796988f2ebfa447fbd8f06068a684716/1050829-5B.hard-filtered.vcf.gz --recode-INFO-all --minDP 10 --minGQ 20 --out /home/amy/TPMI/Ori_Apt_Adv/Prostate_WES/vcf_file/DRAGEN_PC_substract_major/1050829-5B.hard-filtered_substract_major --exclude-positions /home/amy/TPMI/Ori_Apt_Adv/Prostate_WES/Major0.05_WES_pos.txt --recode
```
Using zlib version: 1.2.7
Warning: Expected at least 2 parts in FORMAT entry: ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
Warning: Expected at least 2 parts in FORMAT entry: ID=PS,Number=1,Type=Integer,Description="Physical phasing ID information, where each unique ID within a given sample (but not across samples) connen a phasing group">
Warning: Expected at least 2 parts in INFO entry: ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
Warning: Expected at least 2 parts in INFO entry: ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
Warning: Expected at least 2 parts in INFO entry: ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed">
Warning: Expected at least 2 parts in INFO entry: ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed">
After filtering, kept 1 out of 1 Individuals
Outputting VCF file...
After filtering, kept 292683 out of a possible 292683 Sites
Run Time = 11.00 seconds
### 8. Select genes
```
vcftools --gzvcf $x --recode-INFO-all --out $OUTPUTDIR/minor/$outname\_filter --bed $BED --remove-indels --minGQ 20 --minDP 10 --recode-INFO-all --recode
```
### 9. Compress vcf files to .gz
```
for i in *.vcf
do
bgzip -c -f $i > $i\.gz
tabix -f $i\.gz #index file
done
```
### 10. Merge all compressed vcf files (bcftools)
http://samtools.github.io/bcftools/bcftools.html
```
bcftools merge *.vcf.gz -Oz -o out.vcf.gz
```
### 11. Normalize & uncompress compressed VCF file (bcftools)
```
bcftools norm -Ov -m -both DRAGEN_PC_minor0.05_noAALT.vcf.gz > DRAGON_PC_vcf_norm/DRAGEN_PC_minor0.05_noALT_norm.vcf
```
### 12. VCF file: Extract GT from FORMAT column
```
bcftools annotate -x "^FORMAT/GT" DRAGEN_PC_minor0.05_noALT_cancer_DNArepair_norm.vcf > DRAGEN_PC_minor0.05_noALT_cancer_DNArepair_norm_GT_DP.vcf
```
### 13. VCF file: set_ID
```
bcftools annotate --set-id +'%CHROM\_%POS\_%REF\_%ALT' input.vcf > out_re.vcf
```
### 14. Read VCF file into DataFrame
```
import os
import io
import pandas as pd
def read_vcf(path):
with open(path, 'r') as f:
lines = [l for l in f if not l.startswith('##')]
return pd.read_csv(
io.StringIO(''.join(lines)),
dtype={'#CHROM': str, 'POS': int, 'ID': str, 'REF': str, 'ALT': str,
'QUAL': str, 'FILTER': str, 'INFO': str}, sep='\t',low_memory=False
).rename(columns={'#CHROM': 'CHROM'})
vcf_file=read_vcf(input_file)
```
### 15. Convert VCF file to avinput file
run code under 172.22.116.52
```
perl /home/2019GenomicsEpidemiologyWorkshop/annovar/convert2annovar.pl -format vcf4old file.vcf -out file.avinput --includeinfo
##the avinput contains all information, only column 1(CHR),2(START),3(END),4(REF),5(ALT),8(ID) is necessary
awk '{print $1,$2,$3,$4,$5,$8}' file.avinput > file_out.avinput
```
### 16. Annotation
ANNOVAR (https://annovar.openbioinformatics.org/en/latest/articles/VCF/)
SnpEff (http://pcingola.github.io/SnpEff/se_introduction/)
QCI (https://digitalinsights.qiagen.com/wp-content/uploads/2022/02/QCI-IT_Release-Notes_0921_WW.pdf)
annotation column (https://brb.nci.nih.gov/seqtools/colexpanno.html)
"0" reference nucleotides
"-" null nucleotide
```
CHROM:chr1 POS ID:207296325 207296325 REF:G ALT:C unknown 12.95 5 183 7.79
```
```
#!/usr/bin/perl
### Annovar ###
use Getopt::Std;
sub Usage{
print STDERR <<EOF;
Usage: annovar.pl -t <number of threads> -i <sample name> -r <reference version> -o <annotation output>
Command:
-i sample file convert to annovar format (Example: *.avinput)
-t number of threads
-r reference version (hg19 / hg38)
-o sample file annotation output
EOF
exit;
}
my %opt;
getopt("i:t:r:o:", \%opt);
my $input = $opt{i} or &Usage();
my $num_threads = $opt{t} or &Usage();
my $rf = $opt{r} or &Usage();
my $output = $opt{o} or &Usage();
my $PATH = '/home/2019GenomicsEpidemiologyWorkshop/annovar_2020_SCH/';
my $DB_PATH = '/home/2019GenomicsEpidemiologyWorkshop/humandb/';
`$PATH/table_annovar.pl --thread $num_threads $input $DB_PATH -buildver $rf -out $output -remove -protocol refGene,ensGene,cytoBand,genomicSuperDups,gwasCatalog,dbnsfp41a,dbnsfp31a_interpro,cosmic89_coding,cosmic89_noncoding,clinvar_20210501,avsnp150,esp6500siv2_all,1000g2015aug_all,1000g2015aug_afr,1000g2015aug_amr,1000g2015aug_eur,1000g2015aug_eas,1000g2015aug_sas,nci60,gnomad30_genome,gnomad211_exome,exac03,intervar_20180118 -operation g,g,r,r,r,f,f,f,f,f,f,f,f,f,f,f,f,f,f,f,f,f,f -otherinfo -nastring NA`;
print STDERR "Finished\n";
```
```
Working_DIR="/data4/cliao/Prostate_Cancer/"
INPUT_DIR=${Working_DIR}/ANNOVAR_input_file/ # Revise the "INPUT_DIR" path
#INPUT_DIR=${Working_DIR}/
if [ ! -d "${INPUT_DIR}" ]; then
echo "[Warning] Input directory doesn't exits !"
exit 4
else
echo "--------------------------------------------- ANNOVAR genetic variant annotation ---------------------------------------------"
fi
OUTPUT_DIR=${Working_DIR}/ANNOVAR_results # Revise the "OUTPUT_DIR" path
if [ ! -d ${OUTPUT_DIR} ]; then
mkdir -p -m 755 ${OUTPUT_DIR}
fi
echo "Input directory = ${INPUT_DIR}"
echo "Output directory = ${OUTPUT_DIR}"
AVINPUT_FILES=$(find ${INPUT_DIR} -type f -name '*.avinput') # Revise the filename extension
for INPUT_FILE in ${AVINPUT_FILES} ; do
echo "**** Input file = ${INPUT_FILE} --------------------------------------"
FILE_NAME=$(basename ${INPUT_FILE} .avinput)
# FILE_NAME_0=$(basename ${INPUT_FILE} .avinput)
# FILE_NAME_0=${FILE_NAME_0%.*}
# FILE_NAME=${FILE_NAME_0#*.}
echo "Input file name = ${FILE_NAME}"
echo "perl ${INPUT_DIR}/annovar_SCH.pl -t 8 -i ${INPUT_FILE} -r hg38 -o ${OUTPUT_DIR}/${FILE_NAME}_annovar"
perl ${Working_DIR}/annovar_SCH.pl -t 8 -i ${INPUT_FILE} -r hg38 -o ${OUTPUT_DIR}/${FILE_NAME}_annovar
echo "${FILE_NAME}_annovar.hg38_multianno.txt done"
done
```
https://www.nature.com/articles/nprot.2015.105
https://www.bioinformatics.babraham.ac.uk/projects/fastqc/good_sequence_short_fastqc.html
https://www.ncbi.nlm.nih.gov/projects/clinvar/ClinVarDataDictionary.pdf
https://pubmed.ncbi.nlm.nih.gov/25741868/
https://www.ncbi.nlm.nih.gov/projects/clinvar/ClinVarDataDictionary.pdf
https://hackmd.io/@sK-GgpcqTNWnutbxzOoG2g/BJADCICmS
https://www.jianshu.com/p/a6aab960d750
https://brb.nci.nih.gov/seqtools/colexpanno.html#annotableANNOVAR