# Simplified pipeline

chr11 took the longest time: 603 min. Job-submission settings: `-l h_rt=23:59:59,h_data=16G; -pe shared 2`

## 0. Inputs and outputs

Inputs: 1kg 30X unphased VCF file; YRI sample names; annotated-states BED files. Argument to specify: chr=21

```bash
echo "Step 0: Specify inputs and outputs"
# Specify the chromosome number. Different chromosomes take different amounts of time to run because of their size variation.
chr='chr21'
echo "Working on "$chr

# Specify the input directories and files
dirvcf=/u/project/klohmuel/DataRepository/NYGC_1KG/genotype/
## Specify the input files
inputvcf="20201028_CCDG_14151_B01_GRM_WGS_2020-08-05_"$chr".recalibrated_variants.vcf.gz"
inputsample='/u/home/c/cdi/project-klohmuel/noncdoingdfe/data/vcf_1kg220425/samples_YRI_2504_108.txt'
maskfile='/u/home/c/cdi/project-klohmuel/noncdoingdfe/data/vcf_1kg220425/originalfile/20160622.allChr.mask.bed'

# Specify output directories and files
## Output directories
dirout='/u/home/c/cdi/project-klohmuel/noncdoingdfe/data/vcf_1kg220425/'
dirtemp='/u/home/c/cdi/project-klohmuel/noncdoingdfe/data/vcf_1kg220425/temp/'
mkdir -p $dirtemp
## Simplify the input names to write output names
inputsample_fn=$(basename "$inputsample")
inputsample_fn=${inputsample_fn%.*}
inputsample_fn="${inputsample_fn#*_}"
### Name the output files by samples and chr.
output1=$dirtemp$inputsample_fn".1kGP."$chr".vcf.gz"
## output2 is now defined in step 1.2, after $input2_name is set
## (it was previously defined here, before $input2_name existed).
output3=$dirout'mask.bisnp.'$inputsample_fn".1kGP."$chr".vcf.gz"
output5=$dirout'csites.'$chr'.tab'
```

## 1. Get whole-chromosome biallelic unmasked VCF for dadi

```bash
# 1.1 Only keep YRI samples in the VCF file, to speed things up
echo "Step 1.1 only keep YRI samples in VCF file to speed up"
## Use bcftools to get the samples listed in the inputsample file.
echo "Load bcftools"
module load bcftools
echo "time bcftools view -S $inputsample $dirvcf$inputvcf --force-samples | gzip -c > $output1"
echo "This step takes a long time. Make sure to allocate enough time and memory. For reference, chr21 took 30-50 minutes, tested using 10G memory, but it could work with less."
time bcftools view -S $inputsample $dirvcf$inputvcf --force-samples | gzip -c > $output1

## Check whether the process finished correctly.
echo "Check if the process has finished correctly."
## The line count of output1 should be the same as, or very close to, the input.
## If the difference is large, kill the script.
### Count the number of lines in inputvcf and output1.
lines_inputvcf=$(zcat $dirvcf$inputvcf | wc -l)
lines_output1=$(zcat $output1 | wc -l)
### Calculate the absolute difference in line counts
diff_lines=$((lines_inputvcf - lines_output1))
absolute_diff_lines=${diff_lines#-}
### Check if the absolute difference is greater than 10
if [ "$absolute_diff_lines" -gt 10 ]; then
    echo "Error: errors in step 1 in getting samples. The difference in line counts between the 1kGP input VCF and the output sample VCF is more than 10 lines. Try increasing time and memory. Exiting."
    exit 1
fi
### The output file is YRI_2504_108.1kGP.chr21.vcf.gz

# 1.2 Only keep biallelic SNPs using bcftools
echo "Step 1.2 Only keep biallelic SNPs by using bcftools. This took 6-10 min for chr21."
input2=$output1
input2_name=$(basename "$input2")
output2=$dirout'poly.'$input2_name
## Remove monomorphic sites in the population
### My understanding: a site is considered monomorphic if all non-missing genotypes are 0, or all are 1.
### That is, a site with only '0' and '.' is monomorphic, and likewise one with only '1' and '.'.
echo "Remove monomorphic sites in the population."
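## A hedged illustration (my addition) of what `-c 1` (minimum non-reference
## allele count 1) does in the command below:
##   genotypes 0/0 0/0 ./.  ->  AC=0, monomorphic reference, dropped
##   genotypes 0/0 0/1 ./.  ->  AC>=1, polymorphic, kept
## Note: since -c counts non-reference alleles by default, an all-ALT site
## (only '1' and '.') still has AC>=1 and is kept; if those should also be
## dropped, something like `-c 1:minor` may be needed (worth double-checking).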
bcftools view -c 1 $input2 -Oz -o $output2

## This only keeps SNPs, not sites that are both indels and SNPs.
echo "only keep SNPs: no indels or others"
bcftools view -i 'TYPE="snp"' $output2 -Oz -o $dirtemp'temp_snp.'$input2_name
## Only keep biallelic SNPs
echo "Only keep biallelic SNPs"
bcftools view -m2 -M2 -v snps $dirtemp'temp_snp.'$input2_name -Oz -o $dirtemp'temp_bisnp.'$input2_name

# 1.3 Only keep sites that pass the mask file, using bedtools
echo "Step 1.3 Only keep sites that pass the mask file using bedtools"
echo "Load bedtools"
module load bedtools
## grep the per-chromosome mask first, to speed things up
chrmask=$chr.$(basename "$maskfile")
### NB: a plain `grep $chr` would match more than needed: grep chr1 also matches
### chr11, chr12, etc. That mistake would be harmless in this step (it only slows
### the intersect down), but anchor the pattern anyway:
### grep $chr $maskfile>$chrmask
grep "^$chr[[:space:]]" $maskfile>$chrmask
echo "Get mask file for "$chr" :"$chrmask
##
input3=$dirtemp'temp_bisnp.'$input2_name
gzip -d $input3
input3="${input3%.*}"
bedtools intersect -a $input3 -b $chrmask -header | gzip > $output3
gzip $input3
echo "Get vcf file that passes the maskfile :"$output3

# 1.4 Check if there are duplicated sites in the mask.biallelic.vcf.gz file
echo "Step 1.4 Check if there are duplicated sites in the mask.biallelic.vcf.gz file: "$output3
## Remove the .vcf.gz extension from $output3 to name the simplified position txt file.
output3_name=$(basename "$output3")
output3_namenoext="${output3_name%.*}"
output3_namenoext="${output3_namenoext%.*}"
simppos=$dirtemp'temp_pos.'$output3_namenoext'.txt'
posdup=$dirtemp'temp_dupsites.'$output3_namenoext'.txt'
bcftools query -f '%CHROM\ %POS\n' $output3 >$simppos
sort $simppos | uniq -d > $posdup
## Count the duplicates once, so step 1.5 can reuse the number after cleanup
## (the original re-read $posdup in step 1.5, after it may already have been removed).
ndup=$(wc -l < $posdup)
if [ "$ndup" = 0 ]
then
    echo "No duplicate sites in "$dirout$output3_name
    rm $simppos $posdup
else
    echo "Warning: duplicated sites in "$dirout$output3_name
    echo "Please find duplicated sites in $posdup"
    rm $simppos
fi

# 1.5 Record the number of sites at the different steps
## If there are no duplicates, record the counts and file names.
### It would be better to count the number of sites rather than the number of lines, by excluding lines starting with "#".
### The "#" lines of a vcf.gz file can be counted with: zgrep -c '^#' $file
Linputvcf=$lines_inputvcf
Lsamples=$lines_output1
f3=$output2
L3="$(zcat $output2| wc -l)"
f4='temp_snp.'$input2_name
L4="$(zcat $dirtemp$f4| wc -l)"
f5='temp_bisnp.'$input2_name
L5="$(zcat $dirtemp$f5| wc -l)"
LmaskbiSNPs="$(zcat $output3| wc -l)"
cisite_counts="$inputvcf\t$Linputvcf\t$(basename "$output1")\t$Lsamples\t$f3\t$L3\t$f4\t$L4\t$f5\t$L5\t$output3\t$LmaskbiSNPs"
if [ "$ndup" = 0 ]
then
    echo -e "$chr\t$cisite_counts\tno duplicated sites" >>$output5
else
    echo -e "$chr\t$cisite_counts\tcheck duplicated sites in $posdup" >>$output5
fi
```

## 2. Get target unmasked annotated sites

Please refer to the sections "Annotations on States" and "Get annotated States sites with controls in a window" below. In /u/home/c/cdi/noncoding/test_0622/data_bed, I have curated BED files for each annotated state that (1) pass the strict mask file and (2) have nearby quiescent controls within 50-kb windows, together with their quiescent controls. The files are smer.has_qui.smer.mask.nc_....bed

### 2.1 Find coding and non-coding

Find coding regions (this only needs to be done once): finding the protein-coding regions of hg38.

2024/01/18: I need to remove CDS instead of exons to get non-coding sequence.
I edited the script and redid this step. By the way, I updated to v45.

```bash
wget "http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/latest_release/gencode.v45.annotation.gtf.gz"
gunzip -c gencode.v45.annotation.gtf.gz | grep 'transcript_type "protein_coding"'>temp_prtr.txt
## Get exons, converting GENCODE 1-based [a,b] genome coordinates to 0-based (a,b] BED coordinates
## https://www.gencodegenes.org/pages/data_format.html
awk '($3=="exon") {printf("%s\t%s\t%s\n",$1,int($4)-1,$5);}' temp_prtr.txt>temp_exon.txt
# (fixed: the original sort had no input and merge ignored it)
sort -T . -k1,1 -k2,2n temp_exon.txt > temp_sexon.txt
bedtools merge -i temp_sexon.txt> exon_hg38_gencode_v45.bed
awk '($3=="CDS") {printf("%s\t%s\t%s\n",$1,int($4)-1,$5);}' temp_prtr.txt | sort -T . -k1,1 -k2,2n>temp_CDS.txt
bedtools merge -i temp_CDS.txt> CDS_hg38_gencode_v45.bed
```

### 2.2 Get sorted, merged, non-coding BED files of annotated sites that pass the mask file

#### 2.2.1 Remove coding regions from the input BED file

```bash
coding='/u/home/c/cdi/project-klohmuel/noncdoingdfe/data/genecode/CDS_hg38_gencode_v45.bed'
inputdir='/u/home/c/cdi/project-klohmuel/noncdoingdfe/test_0622/data_bed/psbed'
inputbedname=   # set to the BED file to process
inputbed=$inputdir'/'$inputbedname
inputbase="$(basename -- $inputbed)"
outputdir='/u/home/c/cdi/noncoding/test_0622/data_bed'
bedtools subtract -a $inputbed -b $coding > $outputdir'/nc_'$inputbase

# Or loop over all BED files in the current directory:
coding='/u/home/c/cdi/project-klohmuel/noncdoingdfe/data/genecode/CDS_hg38_gencode_v45.bed'
for i in *.bed; do
    echo "subtracting "$i
    bedtools subtract -a $i -b $coding > 'nc_'$i
done
```

#### 2.2.2 Only keep sites that pass the mask file

```bash
cd /u/home/c/cdi/noncoding/test_0622/data_bed
maskfile='/u/home/c/cdi/project-klohmuel/noncdoingdfe/data/vcf_1kg220425/originalfile/20160622.allChr.mask.bed'
for i in sortedmerged_nc*.bed
do
    # intersect
    echo "intersect "$i
    bedtools intersect -a $i -b $maskfile > 'temp.'$i
    sort -T . -k1,1 -k2,2n 'temp.'$i > 'tempsorted.'$i
    bedtools merge -i 'tempsorted.'$i >'smer.mask.'"${i#*_}"
    # count the number of sites before and after passing the mask file
    lineinput="$(awk '{sum += $3 - $2} END {print sum}' $i)"
    linepassmask="$(awk '{sum += $3 - $2} END {print sum}' 'smer.mask.'"${i#*_}")"
    echo -e "$i\t$lineinput\t$linepassmask" >>count.passmask.tab
    rm 'temp.'$i 'tempsorted.'$i
    mv $i archive
done

# phastCons and phyloP files carry per-site scores, so they cannot be merged.
# (fixed: the original counted smer.mask.* / mask."${i#*_}" files that were never created)
i=nc_chr21.phastCons470way.bed
bedtools intersect -a $i -b $maskfile > 'mask.'$i
lineinput="$(awk '{sum += $3 - $2} END {print sum}' $i)"
linepassmask="$(awk '{sum += $3 - $2} END {print sum}' 'mask.'$i)"
echo -e "$i\t$lineinput\t$linepassmask" >>count.passmask.tab
mv $i archive

i=nc_chr21.phyloP470way.bed
bedtools intersect -a $i -b $maskfile > 'mask.'$i
lineinput="$(awk '{sum += $3 - $2} END {print sum}' $i)"
linepassmask="$(awk '{sum += $3 - $2} END {print sum}' 'mask.'$i)"
echo -e "$i\t$lineinput\t$linepassmask" >>count.passmask.tab
mv $i archive
```

### 2.3 Get annotated-states sites with controls in a window

Remove overlaps between quiescent and annotated-states sites. First, check how much overlap there is between annotated-states sites and quiescent sites: is there any overlap between quiescent regions and the other annotated regions?

```bash
input_annobed=smer.mask.nc_enhancers_states_hg38.bed
input_quiebed=smer.mask.nc_quiescent_states_hg38.bed
bedtools intersect -a $input_annobed -b $input_quiebed > temp.aq_overlap.bed
```

There is a 4,064-bp overlap between the enhancer and quiescent files. How should this be handled? Remove it from both files?
I removed it from the quiescent file: see "remove overlapped regions from mask.quiescent file" in the following script.

Count the length of the overlaps between unmasked annotated regions and nc_quiescent sites. I found that the overlap between unmasked annotated regions and nc_quiescent sites is less than 0.3% (for the 'others' state) and usually less than 0.04% for most of them.

```bash
# Get unmasked_nc_annotated regions for all the annotated files listed in anno_file.txt
cd /u/home/c/cdi/noncoding/test_0622/data_bed/states_izabel
maskfile=/u/home/c/cdi/noncoding/test_0622/data_bed/rootinput/sorted.20160622.allChr.mask.bed
input_filelist='ncfiles.txt'
output_countfile=/u/home/c/cdi/noncoding/test_0622/data_bed/count.passmask.tab
while IFS="" read -r i || [ -n "$i" ]
do
    dirfilei=$i
    # dirfile=/u/home/c/cdi/project-klohmuel/noncdoingdfe/data/States_data_izabel/nc_enhancers_states_hg38.bed
    i=$(basename "$i")
    # sort the file first so that less memory is required
    echo "sort and merge "$i
    sort -T . -k1,1 -k2,2n $dirfilei > 'temp_sorted.'$i
    bedtools merge -i 'temp_sorted.'$i >'temp_mer.'$i
    sort -T . -k1,1 -k2,2n 'temp_mer.'$i > 'smer.'$i
    rm 'temp_sorted.'$i 'temp_mer.'$i
    # intersect with the mask file
    echo "intersect "$i
    bedtools intersect -a 'smer.'$i -b $maskfile -sorted > 'temp.mask'$i
    ## sort and merge the unmasked regions
    sort -T . -k1,1 -k2,2n 'temp.mask'$i > 'temp.s.mask.'$i
    bedtools merge -i 'temp.s.mask.'$i | sort -T . -k1,1 -k2,2n >'smer.mask.'$i
    rm 'smer.'$i 'temp.mask'$i 'temp.s.mask.'$i
    # count the number of sites before and after passing the mask file
    lineinput="$(awk '{sum += ($3 > $2) ? $3 - $2 : $2 - $3} END {print sum}' $dirfilei)"
    linepassmask="$(awk '{sum += ($3 > $2) ? $3 - $2 : $2 - $3} END {print sum}' 'smer.mask.'$i)"
    echo -e "$i\t$lineinput\t$linepassmask" >> $output_countfile

    # What's the overlap between the annotated file and quiescent sites?
    input_annobed='smer.mask.'$i
    input_quiebed=/u/home/c/cdi/noncoding/test_0622/data_bed/smer.mask.nc_quiescent_states_hg38.bed
    output_overlap='/u/home/c/cdi/noncoding/test_0622/data_bed/states_izabel/ovlp'."${input_annobed%.*}".$(basename "$input_quiebed")
    bedtools intersect -a $input_annobed -b $input_quiebed -sorted> $output_overlap
    ## count the length of the annotated and overlapped regions
    ### It would be better to count absolute values:
    # awk '{sum += ($3 > $2) ? $3 - $2 : $2 - $3} END {print sum}'
    l_input_annobed="$(awk '{sum += $3 - $2} END {print sum}' $input_annobed)"
    l_ovlp="$(awk '{sum += $3 - $2} END {print sum}' $output_overlap)"
    ## calculate the overlap/annotated ratio
    ratio=$(bc <<< "scale=9; $l_ovlp/$l_input_annobed")
    ## write into a file
    echo -e "$input_annobed\t$input_quiebed\t$l_input_annobed\t$l_ovlp\t$ratio" >>ovlp_quie.tab
done<$input_filelist
```

Looking at the output ovlp_quie.tab, the overlap between annotated sites and quiescent sites is only a small portion of the data. Thus, I am removing the overlapped regions from the quiescent sites. This removes removed_sites/unmasked_quiescent_sites = 186023/748570113 = 0.024% of sites. (Something is wrong here — needs checking.)

```bash
# Get a BED file of all the overlapped regions.
# First drop the quiescent-vs-itself overlap file so it isn't concatenated
# (file name fixed to match the ovlp.* files actually produced above):
rm ovlp.smer.mask.nc_quiescent_states_hg38.smer.mask.nc_quiescent_states_hg38.bed
paste -d '\n' ovlp.smer.mask* > temp.ovlp.allstates_quie.bed
## remove blank lines
sed -i '/^$/d' temp.ovlp.allstates_quie.bed
## sort and merge the ovlp.allstates_quiescent bed file
sort -T . -k1,1 -k2,2n temp.ovlp.allstates_quie.bed > temp.s.ovlp.allstates_quie.bed
bedtools merge -i temp.s.ovlp.allstates_quie.bed | sort -T . -k1,1 -k2,2n > smer.ovlp.mask.nc.allstates_quie_hg38.bed
rm temp.s.ovlp.allstates_quie.bed temp.ovlp.allstates_quie.bed
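## (optional sanity check, my addition) total overlap length in the merged
## file; this should match the 186023-bp "removed sites" figure quoted above:
awk '{sum += $3 - $2} END {print sum}' smer.ovlp.mask.nc.allstates_quie_hg38.bed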
# remove overlapped regions from mask.quiescent file
# (subtract input fixed to the smer.mask.nc_quiescent file created in 2.3;
#  the "quescient" spelling in the output name is kept, since downstream
#  steps reference the file by that name)
bedtools subtract -a smer.mask.nc_quiescent_states_hg38.bed -b smer.ovlp.mask.nc.allstates_quie_hg38.bed > temp.novlp.bed
sort -T . -k1,1 -k2,2n temp.novlp.bed > temp.s.novlp.bed
bedtools merge -i temp.s.novlp.bed > temp.mer.novlp.bed
sort -T . -k1,1 -k2,2n temp.mer.novlp.bed >smer.novlp.mask.nc.allstates_quescient_hg38.bed
# compare the total lengths before and after merging
awk '{sum += ($3 > $2) ? $3 - $2 : $2 - $3} END {print sum}' temp.novlp.bed
awk '{sum += ($3 > $2) ? $3 - $2 : $2 - $3} END {print sum}' smer.novlp.mask.nc.allstates_quescient_hg38.bed
rm temp.novlp.bed temp.s.novlp.bed temp.mer.novlp.bed
```

### 2.4 Identify states sites that have nearby quiescent sites, and get those nearby quiescent sites

```bash
cd /u/home/c/cdi/noncoding/test_0622/data_bed/states_izabel
halfwindow_size='25000'
inputfile=smer.mask.nc_states.txt
input_quiescent=smer.novlp.mask.nc.allstates_quescient_hg38.bed
output_summarytab=states_quiectrl.tab
while IFS="" read -r m || [ -n "$m" ]
do
    input_statebed=$m
    # input_statebed=smer.mask.enhancers_states_hg38.bed
    # Write a new BED file of the windows around each annotated site/region:
    # midpoint +/- halfwindow_size, with the original coordinates kept in columns 4-5
    awk -v win="$halfwindow_size" '
    {
        mid = ($2 + $3) / 2
        start = int(mid - win)
        if (start < 0) start = 0
        end = int(mid + win)
        printf("%s\t%d\t%d\t%d\t%d\n", $1, start, end,$2, $3)
    }
    ' "$input_statebed" > temp.window.$input_statebed
    # get sites that have quiescent control sites
    ## Find windows that include quiescent sites
    ### intersect -wa keeps blocks in a.bed that overlap b.bed. Here, it finds the 50,000-bp windows that overlap any quiescent site. This step took 16 sec for chr21.
    bedtools intersect -wa -a temp.window.$input_statebed -b $input_quiescent > temp.q.window.$input_statebed
    ## Keep the last two columns, which are the coordinates of the annotated-states sites. This step took 2 min for chr21.
    awk '{printf("%s\t%d\t%d\n", $1, $4, $5)}' temp.q.window.$input_statebed | sort | uniq> temp.has_qui.$input_statebed
    # get the control quiescent sites. This took 20 sec for chr21.
    bedtools intersect -a $input_quiescent -b temp.window.$input_statebed > temp.quiecent_ctrl.$input_statebed
    ## sort and merge the output BED files
    tosm_file=temp.has_qui.$input_statebed
    smer_file=smer.has_qui.$input_statebed
    sort -T . -k1,1 -k2,2n $tosm_file > s.$tosm_file
    bedtools merge -i s.$tosm_file | sort -T . -k1,1 -k2,2n >$smer_file
    count_hasquistates="$(awk '{sum += ($3 > $2) ? $3 - $2 : $2 - $3} END {print sum}' $smer_file)"
    rm s.$tosm_file
    tosm_file=temp.quiecent_ctrl.$input_statebed
    smer_file=smer.quiecent_ctrl.$input_statebed
    sort -T . -k1,1 -k2,2n $tosm_file > s.$tosm_file
    bedtools merge -i s.$tosm_file | sort -T . -k1,1 -k2,2n >$smer_file
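    ## (note, my addition) the ternary in these length sums makes the count
    ## robust even if a malformed record had start/end swapped; for well-formed
    ## BED records it is simply the interval length $3 - $2.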
    count_quie_ctrl="$(awk '{sum += ($3 > $2) ? $3 - $2 : $2 - $3} END {print sum}' $smer_file)"
    rm s.$tosm_file
    rm temp.window.$input_statebed temp.q.window.$input_statebed temp.has_qui.$input_statebed temp.quiecent_ctrl.$input_statebed
    ## calculate the control/annotated_states-sites ratio
    ratio=$(bc <<< "scale=9; $count_quie_ctrl/$count_hasquistates")
    echo -e "smer.has_qui.$input_statebed\t$count_hasquistates\tsmer.quiecent_ctrl.$input_statebed\t$count_quie_ctrl\t$ratio" >> $output_summarytab
done<$inputfile

## check
# bedtools intersect -v -a temp.window.$input_statebed -b $input_quiescent > temp.noq.window.$input_statebed
# awk -F'\t' '{ print $1, $4, $5 }' temp.noq.window.$input_statebed | sort | uniq> no_qui.$input_statebed
### The sum of sites in temp.q.window.$input_statebed and temp.noq.window.$input_statebed should match $input_statebed. The counts of sites are correct.
### awk '{sum += ($3 > $2) ? $3 - $2 : $2 - $3} END {print sum}'
```

## 3. Get VCFs for the unmasked annotated sites

### For each chromosome and each state
/u/home/c/cdi/noncoding/test_0622/script/pipefiltfs.bash

### Put together for selected chromosomes

# Annotated sites

## Filter the VCF file by phastCons and phyloP

### wig2bed

Download the [phastCons](https://genome.ucsc.edu/goldenPath/help/phastCons.html) and phyloP [wig-format](http://genome.ucsc.edu/goldenPath/help/wiggle.html#download) files to hoffman [dir](/u/home/c/cdi/project-klohmuel/noncdoingdfe/data/conserve).

Convert the wig files to BED format using a Perl [script](https://genomewiki.ucsc.edu/images/9/9d/FixStepToBedGraph_pl.txt) from UCSC. However, this script may only work for fixedStep records with step=1, so I first checked whether all the wig records are fixedStep with step=1.

```bash
# wig2bed.bash
# Download and convert an input phyloP or phastCons wig file to a BED file.
# Inputs and outputs; the arguments are chr, runmarker, and phylo.
chr=$1
echo "The input chr=$1, e.g. chr1, chr21."
runmarker=$2
echo "The input runmarker=$2, one of (phastCons, phyloP)."
phylo=$3
echo "The input phylo=$3, one of (470way, 17way)."
dirrun='/u/home/c/cdi/project-klohmuel/noncdoingdfe/test_0622/data_bed/psbed'
dirinputwig='/u/home/c/cdi/project-klohmuel/noncdoingdfe/data/conserve'
perlscript='/u/home/c/cdi/project-klohmuel/software/FixStepToBedGraph.pl'
dirout=$dirrun
cd $dirinputwig
if [ "$runmarker" = "phastCons" ];then
    url="https://hgdownload.soe.ucsc.edu/goldenPath/hg38/phastCons"$phylo"/hg38."$phylo".phastCons/"$chr".phastCons"$phylo".wigFix.gz"
elif [ "$runmarker" = "phyloP" ];then
    url="https://hgdownload.soe.ucsc.edu/goldenPath/hg38/phyloP"$phylo"/hg38."$phylo".phyloP/"$chr".phyloP"$phylo".wigFix.gz"
else
    echo "Wrong input for argument 2, runmarker. Please input either phastCons or phyloP."
fi
inputwigname=$chr"."$runmarker$phylo".wigFix.gz"
outbedname=${inputwigname%%.wigFix.gz}'.bed'
echo $inputwigname
echo $outbedname
if [ ! -e "$inputwigname" ]; then
    echo "downloading "$inputwigname
    curl -o $dirinputwig"/"$inputwigname $url
else
    echo $inputwigname" exists."
fi
cd $dirrun
runmarker=$chr$runmarker
# Are they fixedStep? Are all the steps step=1?
# Read all the header lines:
rm -f temp.$runmarker
inputwig=$dirinputwig"/"$inputwigname
zgrep "chrom" $inputwig >>temp.$runmarker
awk '{split ($0,a, " "); print a[1],a[4]}' temp.$runmarker>temp2.$runmarker
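## (note, my addition) awk '!a[$0]++' prints each distinct line once, keeping
## the first occurrence; if every wig header is fixedStep with step=1, the
## deduplicated output collapses to the single line "fixedStep step=1",
## which is exactly what the test below checks.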
if [ "$(awk '!a[$0]++' temp2.$runmarker)" = "fixedStep step=1" ]
then
    echo "fixedStep step=1"
    #### wig2bed via the Perl script
    zcat $inputwig | $perlscript > $dirout"/temp."$outbedname
else
    echo "The step in the input wig file is not fixedStep, or step!=1."
    echo "Use another method to convert the wig file to a BED file."
fi
rm temp2.$runmarker temp.$runmarker
# sort the bed file
module load bedtools
sort -T . -k1,1 -k2,2n $dirout"/temp."$outbedname > $dirout"/s."$outbedname
rm $dirout"/temp."$outbedname
```

### Curate conserved sites

```bash
# Filter the sites with the top X% (e.g. 5%) scores.
inputbed=$1
echo "The input is "$inputbed
threds=$2
outputname="top-"$threds"-per.auto.""$(basename -- $inputbed)"
echo "Keep the top "$threds" percent based on the scores in the 4th column, in "$outputname
dirrun='/u/home/c/cdi/project-klohmuel/noncdoingdfe/test_0622/data_bed/psbed'
dirout=$dirrun
cd $dirrun
# only keep autosomes
time grep -vE '^chr[XY]\b' $inputbed > "temp.auto."$inputbed
##rm $inputbed
# sort the file by score (column 4, descending)
sort -k4,4rn -k1,2 "temp.auto."$inputbed > "s4.auto."$inputbed
#rm "temp.auto."$inputbed
# Calculate the total number of lines in the file
input="s4.auto."$inputbed
total_lines=$(wc -l < $input)
# Calculate the number of lines to keep (X percent of the total)
## Change this value to your desired percentage
X=$threds
lines_to_keep=$((total_lines * X / 100))
echo "Keep the top "$lines_to_keep" lines."
# Use 'head' to extract the first 'lines_to_keep' lines
head -n "$lines_to_keep" $input > "temp.top."$outputname
# sort and merge the bed file
module load bedtools
sort -T . -k1,1 -k2,2n "temp.top."$outputname > "temp.s12.top."$outputname
#rm "temp.top."$outputname
bedtools merge -i "temp.s12.top."$outputname > "temp.mer.top."$outputname
rm "temp.s12.top."$outputname
sort -T . -k1,1 -k2,2n "temp.mer.top."$outputname > $dirout"/smer."$outputname
rm "temp.mer.top."$outputname
```

```bash
# Filter the sites within a given range of scores.
# inputbed=outbedname
# outbedname="s."$chr"."$runmarker$phylo".bed"
inputbed=$1
echo "The input is "$inputbed
# lnumerator and upnumerator determine the score range [lrange, uprange].
## The inputs are the real scores*100, because that makes it easier to write the file name.
lnumerator=$2
upnumerator=$3
lrange=$(bc <<< "scale=2; $lnumerator / 100")
uprange=$(bc <<< "scale=2; $upnumerator / 100")
outputname=$lnumerator"-"$upnumerator".""$(basename -- $inputbed)"
echo "The output is "$outputname
dirrun='/u/home/c/cdi/project-klohmuel/noncdoingdfe/test_0622/data_bed/psbed'
dirout=$dirrun
# filter sites by the scores
# (fixed: the original filtered $outbedname, a leftover from wig2bed.bash, and had unbalanced quotes)
awk -v lower="$lrange" -v upper="$uprange" -F '\t' '$4 >= lower && $4 <= upper' $dirout"/"$inputbed > $dirout"/temp."$outputname
# check the output
sort -k4,4 -k1,2 $dirout"/temp."$outputname > $dirout"/temp.s4."$outputname
# sort and merge the bed file
module load bedtools
sort -T . -k1,1 -k2,2n $dirout"/temp."$outputname > $dirout"/temp.s."$outputname
bedtools merge -i $dirout"/temp.s."$outputname > $dirout"/temp.mer."$outputname
sort -T . -k1,1 -k2,2n $dirout"/temp.mer."$outputname > $dirout"/smer."$outputname
rm $dirout"/temp.s."$outputname $dirout"/temp.mer."$outputname
rm $dirout"/temp."$outputname
#rm $dirout"/temp.s4."$outputname
```

# FitDadi

## Installation

### Download dadi from the Gutenkunst lab [Bitbucket site](https://bitbucket.org/gutenkunstlab/dadi/downloads/)

## Q&A and useful references

dadi docs hosted by the Gutenkunst lab: https://dadi.readthedocs.io/en/latest/
Q&A Google group: https://groups.google.com/g/dadi-user

## Install anaconda

### Create a conda env

```bash
mkdir dadi
conda create --prefix ./env
```

### Install dadi

Selection.py has been integrated into the dadi.DFE functions. You don't need to do anything else besides using conda to install dadi.

```bash
conda install -c conda-forge dadi
```

### Try the examples in the Gutenkunst lab tutorial

Download the [repository](https://bitbucket.org/gutenkunstlab/dadi/get/bf4eb473e7fd.zip)

```bash
conda activate ./env
cd gutenkunstlab-dadi-bf4eb473e7fd/example/fs_from_data
```

[Import data](https://dadi.readthedocs.io/en/latest/user-guide/importing-data/)

```python
import dadi
dd = dadi.Misc.make_data_dict_vcf("1KG.YRI.CEU.biallelic.synonymous.snps.withanc.strict.subset.vcf.gz", "1KG.YRI.CEU.popfile.txt")
# browse the dictionary of input data
list(dd.items())[:1]
```

## Install on hoffman2

1. Do everything in interactive mode

```bash
# You want to request enough memory for step 4 (installing packages).
# 10G is enough.
qrsh -l h_rt=3:00:00 -l h_data=10G
```

2. Install anaconda

I installed anaconda on hoffman using the Linux installer: https://www.anaconda.com/products/distribution#linux (you can find the .sh file at the end of that page). Then follow the instructions at https://docs.anaconda.com/anaconda/install/linux/

```bash
shasum -a 256 /PATH/FILENAME
bash ~/Downloads/Anaconda3-2020.05-Linux-x86_64.sh
# The base environment is not activated by default
conda config --set auto_activate_base False
```

The PREFIX=/u/home/c/cdi/anaconda3

3. Create the conda environment

```bash
conda create --prefix ./condaenv/dadi0315
```

4. Install packages in the conda environment

```bash
# Be careful to deactivate the conda base environment first.
# You will see (base) in the prompt when it's activated.
conda deactivate
# activate the environment that you want to build
conda activate ./condaenv/dadi0315
```

```bash
# Python version: it seems that scikit-allel, used in Meixi's package,
# currently doesn't support python=3.11.0 (2023/03/16)
conda install -c conda-forge python=3.10.9
```

```bash
python -V
```

Similarly, install the following packages through the conda-forge channel. For example,

```bash
conda install -c conda-forge dadi
conda install -c conda-forge dadi-cli
```

Packages required by dadi will also be installed in this step. Check that dadi has been installed:

```bash
python
```

```python
import dadi
```

5. Add neugamma to dadi.DFE.PDFs

Modify the file condaenv/dadi0116/lib/python3.12/site-packages/dadi/DFE/PDFs.py and define the function neugamma:

```python
# This lives inside PDFs.py, which already imports numpy as np and defines gamma.
def neugamma(xx, params):
    """Return Neutral + Gamma distribution.

    params = (shape, scale, pneu); pneu is the proportion of neutral mutations.
    """
    alpha, beta, pneu = params
    xx = np.atleast_1d(xx)
    out = (1-pneu)*gamma(xx, (alpha, beta))
    # Assume gamma < 1e-4 is essentially neutral
    out[np.logical_and(0 <= xx, xx < 1e-4)] += pneu/1e-4
    # Reduce xx back to a scalar if possible
    return np.squeeze(out)
```

```python
>>> import dadi
>>> import dadi.DFE
>>> dadi.DFE.PDFs.neugamma
<function neugamma at 0x7f0227e87ce0>
```

Add options to the dadi-cli script.

I installed all the following packages, in order:

```
name: dadi0315
channels:
  - conda-forge
  - defaults
dependencies:
  - python=3.10.9
  - dadi
  - ipykernel
  - biopython
  - pandas
  - scikit-allel
  - plotnine
  - mpmath
  - pyinstaller
```

6. Install Meixi's package

Download the whole repository from Meixi's github. I have the version I used stored at:

```
# 2023/03/15. I may modify the package without updating this note.
/u/home/c/cdi/noncoding/varDFE-master
```

Follow the README.txt file in Meixi's package.

```bash
# Make sure to use the pip installed by conda instead of the system pip.
$ which pip
~/condaenv/dadi0315/bin/pip
```

Navigate to where you stored Meixi's package.

```bash
cd /u/home/c/cdi/noncoding/varDFE-master
pip install -e ./
```

7. Test with the examples in dadi or Meixi's package

You can navigate to Meixi's example folder, e.g.

```
/u/home/c/cdi/noncoding/varDFE-master/example/human_dfe
```

Then start a python session and follow https://github.com/meixilin/varDFE/blob/master/example/human_dfe/human_dfe_part1.ipynb

8. Errors I got when installing dadi on hoffman

- Broken dir_yourcondaenv/lib/python3.11/site-packages/dadi/__init__.py. This fixed itself at the fourth retry. Otherwise, you can find the file in the dadi package and replace it.
- Not enough memory: 10G is enough.
- Packages previously installed in the conda base environment can be problematic, e.g. numpy. This can be solved by removing the conda files (rm -rf ./anaconda3) — and don't forget to remove ./.conda in the same directory. Or perhaps only remove the offending package from the base environment (I didn't try).
- Python version: it seems that scikit-allel, used in Meixi's package, currently doesn't support python=3.11.0.

## Make an FS with dadi from a VCF file

1. Download the data: the VCF file from 1kGP 30X and the sample list

```
[cdi@login4 1kg_220425]$ pwd
/u/home/c/cdi/project-klohmuel/noncdoingdfe/data/1kg_220425
[cdi@login4 1kg_220425]$ ls
1kGP_high_coverage_Illumina.chr21.filtered.SNV_INDEL_SV_phased_panel.vcf.gz
igsr_1000genomes_30x_on_grch38_sample.tsv
```

2. Load dadi

```bash
qrsh -l h_rt=3:00:00 -l h_data=10G
conda activate /u/home/c/cdi/condaenv/dadi0315
python
```

3. Get the FS file of the YRI population

```python
# This took about 5 min
import dadi
inputdir="/u/home/c/cdi/project-klohmuel/noncdoingdfe/data/1kg_220425"
inputvcf=inputdir+"/"+"1kGP_high_coverage_Illumina.chr21.filtered.SNV_INDEL_SV_phased_panel.vcf.gz"
inputpopfile=inputdir+"/"+"popfile_YRI.txt"
dd = dadi.Misc.make_data_dict_vcf(inputvcf, inputpopfile)
fs = dadi.Spectrum.from_data_dict(dd, ['YRI'], projections = [90], polarized = False)
outputdir="/u/home/c/cdi/noncoding/test_0622/"
fs.to_file(outputdir+"YRI.fs")
```

4. Get a dictionary of all the populations ([reference](https://dadi.readthedocs.io/en/latest/examples/basic_workflow/basic_workflow_1d_demographics/#creating-a-frequency-spectrum-fs))

```python
import dadi
inputdir="/u/home/c/cdi/project-klohmuel/noncdoingdfe/data/1kg_220425"
inputvcf=inputdir+"/"+"1kGP_high_coverage_Illumina.chr21.filtered.SNV_INDEL_SV_phased_panel.vcf.gz"
inputpopfile=inputdir+"/"+"popfile_3202.txt"
dd = dadi.Misc.make_data_dict_vcf(inputvcf, inputpopfile)

# Save the dictionary for future use.
# Pickle is used to save variables as files.
import pickle
# Make a directory for saving data dictionaries
import os
if not os.path.exists(inputdir+'/data_dictionaries'):
    os.makedirs(inputdir+'/data_dictionaries')
pick = open(inputdir+'/data_dictionaries'+'/1KG_allVCF_allpop.bpkl','wb')
pickle.dump(dd, pick, 2)

# We can open the pickled data dictionary with:
dd = pickle.load(open(inputdir+'/data_dictionaries'+'/1KG_allVCF_allpop.bpkl','rb'))
## 10GB is not enough memory to load this dictionary
```
5. Plot the SFS

```R
setwd("~/c6googledrive/projects/1.DFE of non-coding/fromhpc")
allfsfile <- Sys.glob("*.fs")

read_input_sfs_original = function(input_file) {
    ## Reads an input SFS in dadi format
    this_file = file(input_file)
    on.exit(close(this_file))
    sfs_string = readLines(this_file)[2]
    output_sfs = as.numeric(unlist(strsplit(sfs_string, ' ')))
    return(output_sfs)
}

filename_inputfs=allfsfile
Fdata_fsplot=function(filename_inputfs){
    # This function adds a column to distinguish multiple groups of SNPs by its colname.
    # It also adds the count numbers in steps of 1 according to the number of inputs.
    i=NULL; df_plotfs=NULL
    for (i in c(1:length(filename_inputfs))){
        print(paste0("Processing file",i,":",filename_inputfs[i]))
        s_fs=NULL; s_group=NULL; temp_groups=NULL; temp_counts=NULL; temp_df=NULL
        # Read the input site frequency spectrum as a vector
        s_fs=read_input_sfs_original(filename_inputfs[i])
        # Name the group of SNPs by the input file name
        s_group=sub(pattern = "(.*)\\..*$", replacement = "\\1", basename(filename_inputfs[i]))
        # Make a three-column matrix with "frequency" [the number of samples with given counts],
        ## "counts" [the number of samples with alternative alleles],
        ## and "group" [the group of SNPs included in the data]
        temp_groups=rep(s_group,length(s_fs))
        temp_counts=seq(from=1,to=length(s_fs),by=1)
        temp_df=cbind(s_fs, temp_counts, temp_groups)
        df_plotfs=rbind(df_plotfs,temp_df)
    }
    df_plotfs=data.frame(df_plotfs)
    colnames(df_plotfs)=c("frequency","counts","group")
    return(df_plotfs)
}

data_fsplot=Fdata_fsplot(filename_inputfs)
data_fsplot$counts=as.numeric(data_fsplot$counts)
data_fsplot$frequency=as.numeric(as.character(data_fsplot$frequency))
## The file names are too long; I used the 7th to 13th characters to distinguish them. Adjust as needed.
data_fsplot$group=substr(data_fsplot$group, start = 7, stop = 13)

library(ggplot2)
p <- ggplot(data = data_fsplot, aes(x=counts, y=frequency,fill=group)) +
    geom_bar(position='dodge2', stat='identity') +
    labs(x = "", fill = "") +
    ggtitle("Folded FS: 90/108")+
    scale_x_continuous(name='Allele counts',
                       # breaks=x_axis,
                       limits=c(0,46)
    ) +
    ylab('Number of Segregating Sites') +
    theme_bw() +
    theme(panel.border = element_blank(),
          panel.grid.major = element_blank(),
          panel.grid.minor = element_blank(),
          axis.line = element_line(colour = "black"))
    ## scale_fill_manual(values=c("darkslateblue", "darkslategrey", "darkturquoise"))
print(p)
```

## Filter the VCF by customized filters

### Different phastCons scores

#### Archived because it's too slow: sort the BED file by the phastCons scores (last column)

```bash
# (fixed: the original piped awk with no input and handed the file to cut)
time awk '{ print $NF,$0 }' chr21.phastCons470way.bed | sort -k1,1 -n | cut -f2- -d' ' >sorted.chr21.phastCons470way.bed
```

Cut the first N lines based on the phastCons score.

#### Get BED files with different score ranges using the R script below

```R
## Filter a BED file by column 4, which holds the scores; the BED file should not include column names.
## Input file format:
## chr21 53650 53651 0.2
## chr21 53651 53652 0.2
# inputbed="chr21.phastCons470way.bed"
# args = commandArgs(trailingOnly=TRUE)
# inputbed=toString(args[1])
# outputdir=toString(args[2])
# thredlow=as.numeric(toString(args[3]))
# thredhigh=as.numeric(toString(args[4]))
# print(paste0("./makefiltbed.R ",toString(args[1])," ",toString(args[2])," ",toString(args[3])," ",toString(args[4])))
# # [thredlow, thredhigh]
setwd("/Users/test/c6googledrive/projects/DFE_non-coding/fromhpc")
thredlows=c(0, 0.0002,0.05,0.2,0.3,0.5,0.6,0.8,0.9,0.95)
thredhighs=c(0.0002,0.05, 0.2,0.3,0.5,0.6,0.8,0.9,0.95,1)
inputbed="./chr21.phastCons470way.bed"
inputbedname=sub(pattern = "(.*)\\..*$", replacement = "\\1", basename(inputbed))
outputdir="./"
scores=read.csv(paste0(inputbed),sep="\t",header = F,stringsAsFactors=FALSE)
i=NULL
for (i in c(1:length(thredlows))){
    thredlow=thredlows[i]
    thredhigh=thredhighs[i]
    filtscores=scores[thredlow<=scores$V4 & thredhigh >=scores$V4,]
    write.table(filtscores,paste0(outputdir,"/",inputbedname,"_",thredlow,"-",thredhigh,"_",nrow(filtscores),".bed"),
                sep="\t",quote = F,row.names = F,col.names=F)
}
```

# Back to the previous step: filter the VCF and get the FS

## Filter the VCF by the unmasked & has-control states file and control.bed

```bash
```

# Calculate length and mutation input

## 1. Grab mutation rate info from Roulette

### 1.1 Download the Roulette mutation rates

http://genetics.bwh.harvard.edu/downloads/Vova/Roulette/

Files saved at: /u/home/c/cdi/project-klohmuel/noncdoingdfe/data/mutation_roulette

### 1.2 Sum up the three Roulette mutation rates for each site

#### 1.2.1 Convert Roulette rates to a diploid per-site rate

For example, on chromosome 22 we have:

```
#CHROM POS      ID REF ALT QUAL FILTER INFO
22     10510100 .  C   A   .    low    PN=TGCTA;MR=0.03;MG=0.036
22     10510100 .  C   G   .    low    PN=TGCTA;MR=0.041;MG=0.044
22     10510100 .  C   T   .    low    PN=TGCTA;MR=0.151;MG=0.14
```

The diploid per-site per-generation mutation rate at 10510100 is then (0.03+0.041+0.151)*1.015e-7 = 2.2533e-08.

Attention:
1. Roulette gives diploid mutation rates. Divide by 2 if you want the haploid rate.
2. The scale factor is 1.015e-7.

#### 1.2.2 Convert Roulette rates to diploid per-site per-generation mutation rates for the autosomes

The outputs are in /u/home/c/cdi/project-klohmuel/noncdoingdfe/test_0622/data_mut/

Filenames are, for example: MR.10_rate_v5.2_TFBS_correction_all.bed (I think I might have used bedtools for this.)

#### 1.2.3 Make a mutation rate file for each annotated state

I may have used bedtools for this too. The outputs are BED files in /u/home/c/cdi/project-klohmuel/noncdoingdfe/test_0622/data_mut

The file names are, e.g., chr20.mut.smer.quiecent_ctrl.smer.mask.nc_DNase_states_hg38.bed

The 4th column is the diploid per-site Roulette mutation rate. It needs to be multiplied by the scale factor 1.015e-7 to convert it to a diploid per-site per-generation mutation rate.

#### 1.2.4 Summary of the mutation rate in each autosome for each state

output="/u/home/c/cdi/project-klohmuel/noncdoingdfe/test_0622/script/state_mut_ppsites.tab"

The columns are:
1. BED file of mutation rates.
2. Length of the mutation-rate BED file (column 1).
3. BED file of all annotations.
4. Length of the annotation BED file (column 3).
5. Ratio of annotated sites that have mutation rate data.
6. Average Roulette rate for the given chromosome and the annotated sites with mutation rate info.

This output was obtained with the following script: /u/home/c/cdi/project-klohmuel/noncdoingdfe/test_0622/script/calmutcov.sh

```bash
# calmutcov.sh
# Computes the mutation-rate coverage and the mean Roulette rate for one
# chromosome at the regions annotated by a given state; run it once per
# autosome per state to cover the genome.
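#
# (my addition, a hedged sketch — not part of the original pipeline) One way
# the per-site summed-rate BED of step 1.2.2 could be built, collapsing the
# three per-ALT MR values at each position; the Roulette VCF name here is a
# placeholder:
#   bcftools query -f '%CHROM\t%POS\t%INFO/MR\n' 22_rate_v5.2_TFBS_correction_all.vcf.gz \
#     | awk 'BEGIN{OFS="\t"} {s[$1"\t"$2]+=$3} END{for(k in s) print k, s[k]}' \
#     | sort -k1,1 -k2,2n \
#     | awk 'BEGIN{OFS="\t"} {print "chr"$1, $2-1, $2, $3}' > MR.22_rate_v5.2_TFBS_correction_all.bed
#
# Example batch run over all autosomes for one state:
#   for c in $(seq 1 22); do ./calmutcov.sh $c nc_enhancer; done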
# Usage: ./calmutcov.sh 22 nc_enhancer
module load bedtools
# (fixed: the original had state=$1 chr=$2, contradicting the usage line and echo below)
chr=$1
state=$2
echo "./calmutcov.sh "$chr" "$state
symbchr="chr"$chr
dirstate="/u/home/c/cdi/project-klohmuel/noncdoingdfe/test_0622/data_bed/states_izabel"
dirmut="/u/home/c/cdi/project-klohmuel/noncdoingdfe/test_0622/data_mut"
dirrun="/u/home/c/cdi/project-klohmuel/noncdoingdfe/test_0622/script"
# inputs
bed_test=$dirstate"/smer.has_qui.smer.mask."$state"_states_hg38.bed"
bed_ctrl=$dirstate"/smer.quiecent_ctrl.smer.mask."$state"_states_hg38.bed"
## I could also just use the all-mutation-sites file
chr_mask_mut=$dirmut"/MR."$chr"_rate_v5.2_TFBS_correction_all.bed"
# outputs
chr_bed_test=$dirstate"/"$symbchr"."$(basename -- $bed_test)
chr_bed_ctrl=$dirstate"/"$symbchr"."$(basename -- $bed_ctrl)
chr_bed_test_mut=$dirmut"/"$symbchr".mut."$(basename -- $bed_test)
chr_bed_ctrl_mut=$dirmut"/"$symbchr".mut."$(basename -- $bed_ctrl)
out_summary=$dirrun"/state_mut_ppsites.tab"
# Get the per-chromosome state BED
grep "^$symbchr[[:space:]]" $bed_test > $chr_bed_test
grep "^$symbchr[[:space:]]" $bed_ctrl > $chr_bed_ctrl
# sites that have mutation-rate data
echo "bedtools intersect: Get annotated sites that have mutation rate data."
bedtools intersect -a $chr_mask_mut -b $chr_bed_test > $chr_bed_test_mut
bedtools intersect -a $chr_mask_mut -b $chr_bed_ctrl > $chr_bed_ctrl_mut
# Calculate the proportion of sites that have a mutation rate, for the test and ctrl sets.
echo "Calculate proportion of sites that have mutation rate data."
l_chr_bed_test="$(awk '{sum += ($3 > $2) ? $3 - $2 : $2 - $3} END {print sum}' $chr_bed_test)"
l_chr_bed_test_mut="$(awk '{sum += ($3 > $2) ? $3 - $2 : $2 - $3} END {print sum}' $chr_bed_test_mut)"
coverage=$(printf "%.5f" "$(bc -l <<< "$l_chr_bed_test_mut / $l_chr_bed_test")")
averagemut=$(awk -F'\t' '{ sum += $NF } END { if (NR > 0) printf "%.9f", sum / NR }' $chr_bed_test_mut)
#echo $l_chr_bed_test_mut $l_chr_bed_test $coverage
echo -e $(basename -- $chr_bed_test_mut)"\t"$l_chr_bed_test_mut"\t"$(basename -- $chr_bed_test)"\t"$l_chr_bed_test"\t"$coverage"\t"$averagemut>> $out_summary
l_chr_bed_ctrl="$(awk '{sum += ($3 > $2) ? $3 - $2 : $2 - $3} END {print sum}' $chr_bed_ctrl)"
l_chr_bed_ctrl_mut="$(awk '{sum += ($3 > $2) ? $3 - $2 : $2 - $3} END {print sum}' $chr_bed_ctrl_mut)"
coverage=$(printf "%.5f" "$(bc -l <<< "$l_chr_bed_ctrl_mut / $l_chr_bed_ctrl")")
averagemut=$(awk -F'\t' '{ sum += $NF } END { if (NR > 0) printf "%.9f", sum / NR }' $chr_bed_ctrl_mut)
echo -e $(basename -- $chr_bed_ctrl_mut)"\t"$l_chr_bed_ctrl_mut"\t"$(basename -- $chr_bed_ctrl)"\t"$l_chr_bed_ctrl"\t"$coverage"\t"$averagemut>> $out_summary
```

## 2. Calculate the length of mutational sites after removing missing samples in the projection

Remember to recalculate this when changing the projection number. The mutational length is calculated as Length_annotated_state*(#SNPs_after_projection/#SNPs_inputvcffile). It is smaller than the annotated length.

I used the script [wholegenome.R] to calculate the mutational length.

output=/u/home/c/cdi/project-klohmuel/noncdoingdfe/test_0622/output_fs/"chrs...sitescounts.tab" and /u/home/c/cdi/project-klohmuel/noncdoingdfe/test_0622/output_fs/subchrs/

File name pattern, e.g.: chr10.sitescounts.tab

## 3. Calculate the haploid mutation rate and the number of mutations

Summarize the data from the mutation rate data (step 1.2.4) and the mutational length data (step 2).

The output is /Users/test/c6googledrive/projects/DFE_non-coding/fromhpc/YRI_p160_roulette_mutuL.tab

The columns are (see the sketch of the two formulas after this list):
1. state: the annotated state.
2. L_mutational_test: mutational length for the test annotated state.
3. ul_test: haploid number of mutations = [per-site, per-generation, haploid mutation rate] * [L_mutational_test].
4. L_mutational_ctrl: mutational length for the quiescent control.
5. ul_ctrl: haploid number of mutations = [per-site, per-generation, haploid mutation rate] * [L_mutational_ctrl].
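A minimal numeric sketch (my addition) of the two formulas above — mutational length and haploid mutation number; all values are made-up placeholders:

```bash
L_annot=1000000     # annotated length in bp (placeholder)
snps_proj=900       # SNPs kept after projection (placeholder)
snps_vcf=1000       # SNPs in the input VCF (placeholder)
rmut_dipl=0.222     # mean unscaled Roulette rate (placeholder)
# mutational length = annotated length * fraction of SNPs kept after projection
L_mut=$(awk "BEGIN {printf \"%.2f\", $L_annot * $snps_proj / $snps_vcf}")
# haploid per-site per-generation rate = diploid Roulette rate * 1.015e-7 / 2
u_hap=$(awk "BEGIN {printf \"%.6e\", $rmut_dipl * 1.015e-7 / 2}")
# haploid number of mutations
ul=$(awk "BEGIN {printf \"%.6f\", $u_hap * $L_mut}")
echo -e "$L_mut\t$u_hap\t$ul"
```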
[2]"L_mutational_test" : mutational length for test annotated states [3]"ul_test": haploid number of mutations=[per-site,per-gen,haploid mutation rate]*[L_mutational_test] [4] "L_mutational_ctrl": mutational length for quiescent control [5]"ul_ctrl"haploid number of mutations=[per-site,per-gen,haploid mutation rate]*[L_mutational_ctrl] script: Calsum_mutationrate.R ```=R #Calsum_mutationrate.py # I two precalculated files: 1. chri.sitescounts.tab which record the number of SNP sites after projection # and 2. state_mut_ppsites.tab, which has the Roulette mutation rate for each state averaged by each chromosome. # Read in the input files input_mutsum=read.csv("/Users/test/c6googledrive/projects/DFE_non-coding/fromhpc/Rollete/state_mut_ppsites.tab",sep="\t",header=F) input_fssummary=read.csv("/Users/test/c6googledrive/projects/DFE_non-coding/fromhpc/1002_output_fs_pr160_subchrs/allsubchrs.sitescounts.tab",sep="\t",header=F) roulette_scale=1.015e-7 # Name the input file columns colnames(input_mutsum)=c("file_mut","L_mutinfo","file_annot","L_annot","cover_mutoverannot","Rmut_dipl_christatei") ## {1:bedfile of mutation rate. 2: length of bedfile1. 3:bedfile of all annotation. 4:length of bedfile2 5. Ratio of annotated sites that have mutation rate data. 6.averate of Roullete rate in given chromosom and given annotated sites with mutation rate info } colnames(input_fssummary)=c("file_dadifs","num_dadifs","num_vcf","ratio_keepsnps","L_annot","L_mutational") ## {1: The FS file after projection in dadi. Attention: this file should be updated if projection number is changed. ## 2: The number of SNPs kept after projection ## 3. The number of SNPs in the vcf file from 1KG ## 4. The ratio of kept SNPs after projection=col2/col3 ## 5. The length of annotated state in each chromosome. ## 6. The mutational length = col4*col5. For regions without SNPs, some sites will be excluded because of missing samples if these sites have SNPs.} # Add columns for chr, annotation state, and control state df=input_mutsum df$chr <- sapply(strsplit(as.character(df[,1]), "\\."), function(x) ifelse(length(x) > 1, x[1], NA)) df$annot_state <- sapply(strsplit(as.character(df[,1]), "\\."), function(x) ifelse(length(x) > 1, x[7], NA)) df$ctrl_state <- sapply(strsplit(as.character(df[,1]), "\\."), function(x) ifelse(length(x) > 1, x[4], NA)) df$lable=paste(df$chr,df$annot_state,df$ctrl_state,sep=".") print(unique(df$chr)) print(unique(df$annot_state)) print(unique(df$ctrl_state)) input_mutsum2=df[,c(1:6,10)] df=input_fssummary df$chr <- sapply(strsplit(as.character(df[,1]), "\\."), function(x) ifelse(length(x) > 1, x[10], NA)) df$annot_state <- sapply(strsplit(as.character(df[,1]), "\\."), function(x) ifelse(length(x) > 1, x[5], NA)) df$ctrl_state <- sapply(strsplit(as.character(df[,1]), "\\."), function(x) ifelse(length(x) > 1, x[2], NA)) df$lable=paste(df$chr,df$annot_state,df$ctrl_state,sep=".") input_fssummary2=df[,c(1:6,10)] # Join the two tables by chr, annotation state, and control state library(dplyr) table_join=left_join(input_fssummary2,input_mutsum2,by="lable") # Remove redundant columns and reanem the column L_annot.x to L_annot colnames(table_join) ## check if the files are consistent. ##table_join$L_annot.x==table_join$L_annot.y table_join=table_join[,c(1:10,12,13)] colnames(table_join)[5]="L_annot" # Calculate the mutation number for each chromosome and annotation state. 
## ul = hapmutrate * L_mutational
table_join$hapmutrate=table_join$Rmut_dipl_christatei*roulette_scale/2
table_join$ul=table_join$hapmutrate*table_join$L_mutational

# Split the label column into chr, annot_state, and ctrl_state
table_join$chr <- sapply(strsplit(as.character(table_join$label), "\\."), function(x) ifelse(length(x) > 1, x[1], NA))
table_join$annot_state <- sapply(strsplit(as.character(table_join$label), "\\."), function(x) ifelse(length(x) > 1, x[2], NA))
table_join$ctrl_state <- sapply(strsplit(as.character(table_join$label), "\\."), function(x) ifelse(length(x) > 1, x[3], NA))
write.table(table_join,file="/Users/test/c6googledrive/projects/DFE_non-coding/fromhpc/1002_output_fs_pr160_subchrs.mutrate.tab",sep="\t",quote=F,row.names=F,col.names=T)

# Sum up the mutations over chromosomes for each state
states=unique(table_join$annot_state)
out=NULL
for (i in c(1:length(states))){
    print(states[i])
    table_state <- table_join[which(table_join$annot_state == states[i]), ]
    table_state_test <- table_state[which(table_state$ctrl_state == "has_qui"), ]
    table_state_ctrl <- table_state[which(table_state$ctrl_state == "quiecent_ctrl"), ]
    if (nrow(table_state_test) !=22 | nrow(table_state_ctrl) !=22){
        print(paste0("Skip the state:",states[i]," because the total chromosome number in the subset is not 22."))
        print(paste0(nrow(table_state_test)," chromosomes in test sets."))
        print(paste0(nrow(table_state_ctrl)," chromosomes in ctrl sets."))
    }
    out_statei=c(states[i],sum(table_state_test$L_mutational),sum(table_state_test$ul),sum(table_state_ctrl$L_mutational),sum(table_state_ctrl$ul))
    out=rbind(out,out_statei)
}
out=data.frame(out)
colnames(out)=c("state","L_mutational_test","ul_test","L_mutational_ctrl","ul_ctrl")
write.table(out,file="/Users/test/c6googledrive/projects/DFE_non-coding/fromhpc/YRI_p160_roulette_mutuL.tab",sep="\t",quote=F,row.names=F,col.names=T)
```

# Notes on packages

# bcftools

```bash
module load htslib
bgzip py_snps_temp_bcfvcfsnps_samples_YRI_2504_108.1kGP.chr.21.vcf
tabix -p vcf temp_py_bcftools_snp.vcf.gz
# -n =1 keeps records that belong to only one file; they are annotated in the last
# column, where e.g. 01 means not in the first file but in the second.
bcftools isec -n =1 -c none py_snps_temp_bcfvcfsnps_samples_YRI_2504_108.1kGP.chr.21.vcf.gz bcftools_snps_samples_YRI_2504_108.1kGP.chr.21.vcf.gz >bcfpyunique.txt
bcftools isec -n =2 -c none py_snps_temp_bcfvcfsnps_samples_YRI_2504_108.1kGP.chr.21.vcf.gz temp_py_bcftools_snp.vcf.gz >bcfpyshare.txt
```

Viewing a VCF file with bcftools:

```
bcftools query -f '%CHROM\ %POS[ %GT]\n' -r 'chr21:44349012-44349012' test.vcf.gz
bcftools query -f '%CHROM\ %POS %REF\ %ALT[ %GT]\n'
```

## bcftools and SNPs (removing indels and SVs)

```bcftools view -v snps``` includes records that are both SNPs and indels, records that have '*' in REF or ALT, and multi-allelic sites.

```bash
bcftools view -v snps $input -Oz -o bcf_vsnps.$input
# -Oz means output as a bgzipped file; -o specifies the output name.
# Similarly, -v indels can extract indels.
```

```bcftools view -i 'TYPE="snp"'``` removes indels, removes records with '*' in REF or ALT, and keeps multi-allelic SNPs.

```bash
# another command from bcftools
bcftools view -i 'TYPE="snp"' $input -Oz -o bcf_isnps.$input
```

# [bedtools](https://bedtools.readthedocs.io/en/latest/content/tools/subtract.html)

Only use .bed files when using ```intersect``` and ```subtract```. I used a BED file and a VCF file together in the arguments and the results were wrong. What works and what doesn't:

### intersect
- `-a vcf -b bed` works
- `-a vcf.gz -b bed` works
- `-a bed -b vcf` doesn't work

### subtract
- `-a bed -b bed` works
- `-a bed -b vcf` doesn't work, e.g. `bedtools subtract -a test_mask2.bed -b test_poly.vcf`
- `-a vcf -b bed` doesn't work
- `-a vcf -b vcf` doesn't work
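Since `subtract` only behaves with BED on both sides, a workaround (my sketch, based on the vcf2bed recipe below; the file names are the placeholder ones from the failing example above) is to convert the VCF to BED first:

```bash
# rewrite the polymorphic VCF as a BED file, then subtract BED from BED
bcftools query -f '%CHROM\t%POS\t%POS\n' test_poly.vcf | awk '{print $1 "\t" $2-1 "\t" $3}' > test_poly.bed
bedtools subtract -a test_mask2.bed -b test_poly.bed > mask.not.test_poly.bed
```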
### bedtools: merge and intersect

```
dir=/u/home/c/cdi/noncoding/test_0622/
```

Load bedtools on hoffman2:

```
module load bedtools
```

Merge coordinates: [bedtools merge](https://bedtools.readthedocs.io/en/latest/content/tools/merge.html)

```
cd $dir
# merge coordinates in the "filter" file
filter=chr21.phastCons470way
# sort by coordinates
sort -k1,1 -k2n $filter".bed" > $filter"sort.bed"
bedtools merge -i $filter"sort.bed" > "merged."$filter".bed"
rm $filter"sort.bed"
```

Only keep VCF rows whose coordinates are included in the BED file: [bedtools intersect](https://bedtools.readthedocs.io/en/latest/content/tools/intersect.html)

```
dirinput=/u/home/c/cdi/project-klohmuel/noncdoingdfe/data/vcf_1kg220425/
fileinput=samples_YRI_2504_108.1kGP.chr21
# This took 14 sec
bedtools intersect -a $dirinput$fileinput".vcf.gz" -b "merged."$filter".bed" -header | gzip >$filter"."$fileinput".vcf.gz"
rm "merged."$filter".bed"
```

vcf2bed (VCF to BED):

```
# rewrite a polymorphic VCF file as a BED file
bcftools query -f '%CHROM\t%POS\t%POS\n' $input.vcf | awk '{print $1 "\t" $2-1 "\t" $3}' > $input.bed
```

# BED files

Sum the number of sites:

```
count_quie_ctrl="$(awk '{sum += ($3 > $2) ? $3 - $2 : $2 - $3} END {print sum}' $smer_file)"
sort --parallel=20 -k1,1 -k2n $inputbed > s.$inputbed
```

# [dadi](https://dadi.readthedocs.io/en/latest/user-guide/importing-data/)

# View the genome in the UCSC genome browser

## 1. Make track files

Add a description to the BED files: https://genome.ucsc.edu/goldenPath/help/customTrack.html#EXAMPLE1

```bash
prefix="nc_quescient"
extension="_states_hg38.bed"
color="255,0,0"
(echo "track name="$prefix" description="$prefix" visibility=2 color="$color && cat $prefix$extension) > "ucsct_"$prefix$extension
```

# dadi-cli

Using an input parameter file, the script [dadi-cli.sh] runs dadi-cli to infer demographic and DFE estimates.

## Submission file

```
#!/bin/bash
#$ -cwd
# error = Merged with joblog
#$ -o joblog.$JOB_ID
#$ -N loopinitial
#$ -j y
## Edit the line below as needed:
#$ -l h_rt=1:30:00,h_data=3G
## Modify the parallel environment
## and the number of cores as needed:
#$ -pe shared 10

## substitute the command to run your code
## in the two lines below:
echo '/usr/bin/time -v hostname'
/usr/bin/time -v hostname

. /u/local/Modules/default/init/modules.sh
module load anaconda3
conda activate /u/home/c/cdi/condaenv/dadi0315
cd /u/home/c/cdi/project-klohmuel/noncdoingdfe/test_0622/script

file_path="p0_dfe.txt"
line_start=49 # Replace with the desired line number to start from
# Check if the file exists
if [ -f "$file_path" ]; then
    # Start reading from the specified line and loop through the rest
    tail -n +$line_start "$file_path" | while IFS= read -r line; do
        # Perform actions with the line
        echo "Processing line: $line"
        ./dadi-cli.sh $line
    done
else
    echo "File not found: $file_path"
fi
```

### Example lines of p0_dfe.txt

It's a tab-separated file, as below.

![](https://hackmd.io/_uploads/B1FhM2-V6.png)

# Infer the demographic model and DFE with dadi-cli

## Demographic model

## Generating the cache

On the HPC, for the gamma distribution: dadi-cli_cache.sh

```bash
# After finding the best-fit demographic model, plot the demographic SFS fit and make the cache file.
# The input file is maxll_demoparam_file.txt from the plot_bestll_demo.py script, run locally on my computer.
# example of an input line:
# promoters-demo_three_epoch-4 promoters three_epoch three_epoch.nc_promoters.YRI.demog.params.5.InferDM.bestfits 1.296357124 47927.226811458386
subdir_cache=${1}
state="nc_"${2}
dir_fsfile="/u/home/c/cdi/project-klohmuel/noncdoingdfe/test_0622/output_fs"
dir_outdemog="/u/home/c/cdi/project-klohmuel/noncdoingdfe/test_0622/result_dadi/"$subdir_cache
ctrl_fsfile=$dir_fsfile"/smer.quiecent_ctrl.smer.mask."$state"_states_hg38.mask.bisnp.YRI_2504_108.1kGP.chrs_1t22.fs"
test_file=$dir_fsfile"/smer.has_qui.smer.mask."$state"_states_hg38.mask.bisnp.YRI_2504_108.1kGP.chrs_1t22.fs"
model_demog=${3}
model_sel=$model_demog"_sel"
projection_num=160
bestfit_demogfile=$dir_outdemog"/"${4}
grid="1500 1520 1540"
numcpus=20
mutation_ctrl_diploid=${5}
theta=${6}
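# (my reading of the gamma bounds computed below, added for clarity) With
# U = mutation_ctrl_diploid as the total diploid per-generation mutation input,
# theta = 2*Na*U, so Na = theta/(2*U); the population-scaled selection
# coefficient is gamma = 2*Na*s, and these bounds correspond to s in [1e-6, 0.5].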
# gamma_l = 2*0.000001*theta/(2*mutation_ctrl_diploid)
gamma_l=$(awk "BEGIN {printf \"%.10f\", 2 * 0.000001 * $theta / (2 * $mutation_ctrl_diploid)}")
gamma_h=$(awk "BEGIN {printf \"%.10f\", 2 * 0.5 * $theta / (2 * $mutation_ctrl_diploid)}")
gammabounds="$gamma_l"" ""$gamma_h"
echo $gammabounds
cachemarker=1
cachefile=$dir_outdemog"/"$model_sel"."$state"."$cachemarker".YRI.spectra.bpkl"
echo dadi-cli GenerateCache --model $model_sel --demo-popt $bestfit_demogfile --sample-size $projection_num --grids $grid --gamma-pts 300 --gamma-bounds $gammabounds --output $cachefile --cpus $numcpus
dadi-cli GenerateCache --model $model_sel --demo-popt $bestfit_demogfile --sample-size $projection_num --grids $grid --gamma-pts 300 --gamma-bounds $gammabounds --output $cachefile --cpus $numcpus
```

## DFE model

### Add a customized model to dadi-cli

The neugamma model (note: the parameter order here, pneu first, differs from the PDFs.py version above):

```python
# Assumes numpy as np and dadi's DFE module are imported in the custom-model file.
def neugamma(xx, params):
    """Return Neutral + Gamma distribution"""
    pneu, alpha, beta = params
    xx = np.atleast_1d(xx)
    out = (1-pneu)*DFE.PDFs.gamma(xx, (alpha, beta))
    # Assume gamma < 1e-4 is essentially neutral
    out[np.logical_and(0 <= xx, xx < 1e-4)] += pneu/1e-4
    # Reduce xx back to a scalar if possible
    return np.squeeze(out)
```

# Archive

Get the closest quiescent position to the other HMM states using [bedtools closest](https://bedtools.readthedocs.io/en/latest/content/tools/closest.html). Using enhancers as an example.

Specify the files:

```
state=enhancers
statebed='nc_'$state'_states_hg38.bed'
```

Sort and merge the files before using bedtools closest:

```
#for i in nc*; do sort -k1,1 -k2,2n $i > sorted_$i; done
sort -k1,1 -k2,2n $statebed > temp_$statebed
bedtools merge -i temp_$statebed>sortedmerged_$statebed
rm temp_$statebed
```

```
codingbed=/u/home/c/cdi/project-klohmuel/noncdoingdfe/data/genecode/coding_hg38_gencode_v43.bed
quibed=/u/home/c/cdi/noncoding/test_0622/data_bed/nc_quiescent_states_hg38.bed
# (fixed: the original sorts had no input file)
sort -T . -k1,1 -k2,2n $codingbed > temp_scoding.bed
bedtools merge -i temp_scoding.bed > sortedmerged_coding_hg38_gencode_v43.bed
sort -k1,1 -k2,2n $quibed > temp_squi.bed
bedtools merge -i temp_squi.bed > sortedmerged_nc_quiescent_states_hg38.bed
rm temp_scoding.bed temp_squi.bed
sortcodingbed=sortedmerged_coding_hg38_gencode_v43.bed
sortquibed=sortedmerged_nc_quiescent_states_hg38.bed
```

Distance to the closest coding and quiescent regions:

```
closestBed -a sortedmerged_$statebed -b $sortcodingbed -D ref> closest_$state'_coding.bed'
closestBed -a sortedmerged_$statebed -b $sortquibed -D ref> closest_$state'_quiescent.bed'
# find the closest coding and quiescent regions in one call
bedtools closest -a sortedmerged_$statebed -b $sortquibed $sortcodingbed -mdb each -D ref > closest_$state'_quiescent_coding.bed'
```

4. Get both the closest upstream and downstream blocks.
Keep quiescent regions within a given distance.

```bash
# (output names omitted in the original notes)
bedtools closest -a a.bed -b b.bed -D ref -iu >
bedtools closest -a a.bed -b b.bed -D ref -id >

for i in closest_quescient_states_*; do cut -f4-6 $i | sort -k1,1 -k2,2n -k3,3n -u > unique_$i; done
# Keep only the last 2 columns (the quiescent columns)
for i in unique_*; do cut -f3,4 $i; done
```

### Get qualified monomorphic sites

Remove the polymorphic VCF sites from the masked BED file.

```bash
input4=$input2
#input4=bcftools_polym_samples_YRI_2504_108.1kGP.chr.21.vcf.gz
gzip -d $input4
input4="${input4%.*}"
output4="${input4%.*}"
```

```bash
# bedtools subtract only works with -a bedfile -b bedfile, so rewrite the polymorphic VCF file as a BED file.
bcftools query -f '%CHROM\t%POS\t%POS\n' $input4 | awk '{print $1 "\t" $2-1 "\t" $3}' > temp.$output4.bed
# remove polymorphic sites from the mask file ($filt is the chromosome mask BED;
# the original referenced temp.$mid4.bed, an undefined variable)
bedtools subtract -a $filt -b temp.$output4.bed -header -f 1E-13 > mask.not.$output4.bed
# check the total length:
# the length of chr21 = polymorphic + monomorphic
echo "The number of unmasked monomorphic sites."
awk '{sum += $3 - $2} END {print sum}' mask.not.$output4.bed
echo "The number of unmasked polymorphic sites."
awk '{sum += $3 - $2} END {print sum}' temp.$output4.bed
echo "The number of total unmasked sites in the chromosome."
awk '{sum += $3 - $2} END {print sum}' $filt
# merge and sort the bed file
sort -k1,1 -k2n mask.not.$output4.bed > mask.not.$output4.sorted.bed
bedtools merge -i mask.not.$output4.sorted.bed > mask.not.$output4.merged.bed
rm mask.not.$output4.bed mask.not.$output4.sorted.bed
rm temp.$output4.bed
```

# Cheatsheet for dadi

## Read a cache file

```
import pickle
with open(cache_file, 'rb') as file:
    cache = pickle.load(file)
cache.gammas
```

# End of Archive

```bash
# Note-to-self pseudocode, cleaned up into bash: the mutational length L from
# the manual SNP count, the VCF line count, and the summed interval length.
if [ "$line_count" -ne 0 ]; then
    ratio_manual_s_vcf=$(bc <<< "scale=9; $manual_s/$line_count")
    L=$(bc <<< "scale=4; $interval_sum*$manual_s/$line_count")
else
    ratio_manual_s_vcf=0
    L=0
fi
```

```bash
# sum the site counts over the chromosomes listed in 24chr.txt
while IFS="" read -r m || [ -n "$m" ]
do
    chr=$m
    bedfile='temp.'$chr'.smer.has_qui.smer.mask.nc_acetylations_states_hg38.bed'
    echo $bedfile
    counts="$(awk '{sum += ($3 > $2) ? $3 - $2 : $2 - $3} END {print sum}' $bedfile)"
    allcounts="$((allcounts+counts))"
    echo $counts
    echo $allcounts
done<24chr.txt
```