###### tags: `survey`
{%hackmd Eg_dogKwTYGxkdzZ3O5Gkg %}
# GATK-GVCF
## Flow
```mermaid
flowchart
subgraph referenceModelForNoVariation
direction TB
S1(finalizeRegion)-->S2(filterNonPassingReads)
S2-->S3(calculateRefConfidence)
end
A1 --"not active"--> referenceModelForNoVariation
A1(("region")) --"active"--> A2(trim haplotypes/reads)
A2 --> A3(left flank)
A2 --> A5(right flank)
A3 --> referenceModelForNoVariation
A5 --> referenceModelForNoVariation
A2 --> A4(trimmed region)
A4 --> S3
```
## calculateRefConfidence
```mermaid
flowchart
subgraph makeReferenceConfidenceVariantContext
direction TB
S1(calcGenotypeLikelihoodsOfRefVsAny)-->S2(add some tags: AD, DP...)
S2-->S3(doIndelRefConfCalc)
end
A1((For each pos)) -->A2(pileup base and BQ)
A2 --"overlappingSite"-->A3("use PL from modified read-allele likelihood(*)")
A2 --"!overlappingSite"-->makeReferenceConfidenceVariantContext
```
### finalizeRegion
- trim reads base on setting (soft, hard, BQ thresholds...)
- modify pairs overlapping BQ
### filterNonPassingReads
- filtering following reads
- unclip read too short (<10)
- has mate but mate not in same contig
- RG setting
### calcGenotypeLikelihoodsOfRefVsAny

### doIndelRefConfCalc
- choose PL be the one in { PL from calcGenotypeLikelihoodsOfRefVsAny, PL_indel} with the smaller GQ.
- PL_indel can be computed base on N = #{reads that have no plausible indels based on alignment at pileup}
- For convenience, PL_indel = [0,3N,45N]
- [N, informative reads](#Informative-reads)




20221131_cohort.pptx
### PL from modified read-allele likelihood
- base on original read-allele likelihood
- add <NON_REF> allele and it's likelihood
- the value of NON_REF would be $$\text{median(likelihoods < best allele likelihood)}$$

### Informative reads
```java
private static boolean readHasNoPlausibleIdealsOfSize(final GATKRead read,
final int readStart,
final byte[] refBases,
final int refStart,
final int maxIndelSize,
final boolean useCachedResults) {
BitSet cachedResult = (BitSet) read.getTransientAttribute(INDEL_INFORMATIVE_BASES_CACHE_ATTRIBUTE_NAME);
// 計算過的會放到 tag IDL 裡面暫存
if (cachedResult == null || !useCachedResults) {
Utils.validate(readStart >= 0, "readStart must >= 0");
Utils.validate(refStart >= 0, "refStart must >= 0");
BitSet informativeBases = new BitSet(read.getLength());
if ( !(read.getLength() - readStart < maxIndelSize) && !(refBases.length - refStart < maxIndelSize) ) {
// readStart後長度 >= maxIndelSize 且 refStart後長度 >= maxIndelSize
final int secondaryReadBreakPosition = read.getLength() - maxIndelSize;
// 倒數 maxIndelSize 個 base
final Pair<byte[], byte[]> readBasesAndBaseQualities = AlignmentUtils.getBasesAndBaseQualitiesAlignedOneToOne(read);
// calls getBasesNoCopy if CIGAR is all match
// get base & BQ
// for the delection part, fill "-" into base, and fill "0" into BQ
// for the softclip & insertion part, copy base & BQ in reads
//
// For example (deletion):
// read AAAA AAA 4444 444
// ref AAAAAAAAAA
// => AAAA---AAA 4444000444
//
// For example (insertion):
// read AAAAAAAAAA 4444444444
// ref AAAA AAA
// => AAAAAAAAAA 4444444444
final byte[] readBases = readBasesAndBaseQualities.getLeft();
final byte[] readQualities = readBasesAndBaseQualities.getRight();
if (readBases.length - readStart > maxIndelSize) {
// read.len() != readBases.len() when there is a deletion
final int lastReadBaseToMarkAsIndelRelevant;
final boolean referenceWasShorter;
if (readBases.length < refBases.length - refStart + readStart + 1) {
lastReadBaseToMarkAsIndelRelevant = readBases.length - maxIndelSize;
referenceWasShorter = false;
} else {
lastReadBaseToMarkAsIndelRelevant = refBases.length - refStart + readStart - maxIndelSize + 1;
referenceWasShorter = true;
}
// 檢查 ref.len - refstart 長還是 readBase - readStart 長
// // 看要 先算多少個base
final int[] baselineMisMatchSums = calculateBaselineMMQualities(readBases, readQualities, readStart, refBases, refStart);
// for each base, accumulate all BQ of mismatch bases after(contains) this base
// for example
// AAAAAAAAAAA BQ: 91999399959
// ATAAATAAATA
// => 88777755550
for (int indelSize = 1; indelSize <= maxIndelSize; indelSize++) {
// Computing mismatches corresponding to a deletion
traverseEndOfReadForIndelMismatches(informativeBases,
readStart,
readBases,
readQualities,
lastReadBaseToMarkAsIndelRelevant,
secondaryReadBreakPosition,
refStart,
refBases,
baselineMisMatchSums,
indelSize,
false);
// Computing mismatches corresponding to an insertion
traverseEndOfReadForIndelMismatches(informativeBases,
readStart,
readBases,
readQualities,
lastReadBaseToMarkAsIndelRelevant,
secondaryReadBreakPosition,
refStart,
refBases,
baselineMisMatchSums,
indelSize,
true);
}
if ( lastReadBaseToMarkAsIndelRelevant <= secondaryReadBreakPosition) {
informativeBases.flip(0, lastReadBaseToMarkAsIndelRelevant);
// 對錯互換
if (referenceWasShorter) {
informativeBases.set(lastReadBaseToMarkAsIndelRelevant - 1, false);
}
} else {
informativeBases.flip(0, secondaryReadBreakPosition + 1);
}
}
}
cachedResult = informativeBases;
read.setTransientAttribute(INDEL_INFORMATIVE_BASES_CACHE_ATTRIBUTE_NAME, informativeBases);
}
return cachedResult.get(readStart);
}
private static void traverseEndOfReadForIndelMismatches(final BitSet informativeBases, final int readStart, final byte[] readBases,
final byte[] readQuals, final int lastReadBaseToMarkAsIndelRelevant,
final int secondaryReadBreakPosition, final int refStart, final byte[] refBases,
final int[] baselineMMSums, final int indelSize, final boolean insertion) {
final int globalMismatchCostForReadAlignedToReference = baselineMMSums[0];
// accumulate BQ > 此值就 break
int baseQualitySum = 0;
final int insertionLength = !insertion ? 0 : indelSize;
final int deletionLength = insertion ? 0 : indelSize;
final int numberOfBasesToDirectlyCompare = Math.min(readBases.length - readStart - insertionLength,
refBases.length - refStart - deletionLength);
for (int readOffset = numberOfBasesToDirectlyCompare + insertionLength - 1,
refOffset = numberOfBasesToDirectlyCompare + deletionLength - 1;
readOffset >= 0 && refOffset >= 0;
readOffset--, refOffset--) {
// 從尾巴開始向前
final byte readBase = readBases[readStart + readOffset];
final byte refBase = refBases[refStart + refOffset];
if (isMismatchAndNotAnAlignmentGap(readBase, refBase)) {
// If mismatch
baseQualitySum += readQuals[readStart + readOffset];
// accumulate BQ
if (baseQualitySum > globalMismatchCostForReadAlignedToReference) {
// 如果累積的超過最大值, 代表往前的 base 不會 support indel
break;
}
}
int siteOfRealComparisonPoint = Math.min(readOffset, refOffset);
if (readBases[readStart + siteOfRealComparisonPoint] != AlignmentUtils.GAP_CHARACTER &&
// if read[pos] != "-" (本來的 deletion 捕的值)
readStart + siteOfRealComparisonPoint < lastReadBaseToMarkAsIndelRelevant &&
readStart + siteOfRealComparisonPoint <= secondaryReadBreakPosition &&
baselineMMSums[siteOfRealComparisonPoint] >= baseQualitySum) {
// 如果baseline 的 accumulate BQ >= BQ, 則設定此點是 un-informative
informativeBases.set(readStart + siteOfRealComparisonPoint, true);
// 他這最後會 filp 一次, 所以錯就是對, 對就是錯
}
}
}
```