---
tags: GeneLab
title: GLDS-249, RR6, LoopG read-size distributions
---
# RR6, GLDS-249, LoopG metagenomics
<a href="https://i.imgur.com/OMDNulg.png"><img src="https://i.imgur.com/OMDNulg.png"></a>
Data available at [GLDS-249](https://genelab-data.ndc.nasa.gov/genelab/accession/GLDS-249/), labeled "LoopG".
---
# Code
Uses [bit package](https://github.com/AstrobioMike/bit#bioinformatics-tools-bit) at command-line to generate summary info.
```bash
# converting to fasta (qualities looked good, i think that's done at the building step)
for file in *.fastq; do echo ${file}; sed 's/^@/>/' ${file} | paste - - - - | cut -f 1,2 | tr "\t" "\n" > ${file}.fa; done
# summarizing
bit-summarize-assembly -o LoopG-read-summaries.tsv *.fa
# counting bases per sequence for all
for file in *.fa; do echo ${file}; bit-count-bases-per-seq -i ${file} -o ${file}-num-bps-per-seq.tsv; done
# combining base counts
cat *-num-bps-per-seq.tsv | cut -f 2 > all-base-counts.txt
```
```r
library(ggplot2)
library(ggpubr)
read_summary_tab <- read.table("LoopG-read-summaries.tsv", sep="\t", header=TRUE, row.names=1)
read_summary_tab <- data.frame(t(read_summary_tab))
read_summary_tab <- read_summary_tab[, c(1,2,5,6,7,10,16)]
read_summary_tab$Total.length <- read_summary_tab$Total.length / 1000000
names(read_summary_tab) <- c("Num. SLRs", "Total size (Mbps)", "Max. length (bps)", "Min. length (bps)", "N50 (bps)", "L50 (Num. SLRs)", "Num. SLRs > 5000 bps")
# re-ordering
read_summary_tab <- data.frame(read_summary_tab[, c(1,2)], read_summary_tab[, 7, drop=FALSE], read_summary_tab[, c(3:6)], check.names=FALSE)
base_counts_vec <- scan("all-base-counts.txt")
length(base_counts_vec)
# splitting to plot together with histogram more easily
read_summary_tab_6 <- read_summary_tab[, c(1:6)]
read_summary_tab_last <- read_summary_tab[, 7, drop=FALSE]
first_boxes <- ggplot(stack(read_summary_tab_6), aes(factor(ind), values)) + geom_boxplot(width=0.4) + facet_wrap(~ind, scale="free") +
theme_bw() + theme(axis.text.x = element_blank(), axis.text.y = element_text(face = "bold")) +
theme(axis.title = element_blank(), strip.text = element_text(face = "bold"), axis.ticks.x = element_blank())
last_box <- ggplot(stack(read_summary_tab_last), aes(factor(ind), values)) + geom_boxplot(width=0.4) + facet_wrap(~ind, scale="free") +
theme_bw() + theme(axis.text.x = element_blank(), axis.text.y = element_text(face = "bold")) +
theme(axis.title = element_blank(), strip.text = element_text(face = "bold"), axis.ticks.x = element_blank())
hist <- ggplot() + geom_histogram(aes(x = base_counts_vec), binwidth=100, boundary = 0, closed = "left") + theme_bw() + ylab("Num. SLRs") +
xlab("Length (bps)") + ggtitle("All lengths (N=1,107,846 SLRs)")
plot <- ggarrange(first_boxes, ggarrange(last_box, hist, widths=c(1.06,2)), ncol=1, heights=c(2,1))
annotate_figure(plot, top = text_grob("Summary distributions of SLRs (N=30 samples)", hjust=1.2))
```