---
title: "SummarisedExperiment_assignment"
author: "Zheng Xiaoqing_A0274989Y, Li Yushan_A0274759J"
date: "`r Sys.Date()`"
output: pdf_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, message = FALSE)
```
# Get packages ready
"GEOquery", "tidyverse", "SummarizedExperiment", "AnnotationHub", "Biobase" are required for this script. For your reference, the chunk is for package intalling.
```{r package intall helper,eval=FALSE}
BiocManager::install("GEOquery")
install.packages("tidyverse")
BiocManager::install("GEOquery")
BiocManager::install("SummarizedExperiment")
BiocManager::install("AnnotationHub")
BiocManager::install("Biobase")
browseVignettes("SummarizedExperiment")
```
# Create standard directories
```{r create directory}
library(here)
dir.create(here('data'),showWarnings = FALSE)
dir.create(here('src'),showWarnings = FALSE)
```
# Get the series data
```{r gain gse data from cloud}
library(Biobase)
library(tidyverse)
library(GEOquery)
gse118719 <- GEOquery::getGEO('GSE118719')[[1]]
gse118720 <- GEOquery::getGEO('GSE118720')[[1]]
for (gse_id in c('GSE118719','GSE118720')){
getGEOSuppFiles(gse_id,
makeDirectory = FALSE,
baseDir = here('data'))
}
```
# For the `GSE118719` data
## Build and clean database
```{r make GSE118719_mrna.expression.tsv into dataframe, warning=FALSE}
# GSE118719 mrna.expression into table
cts719 <- read_tsv(here('data/GSE118719_mrna.expression.tsv.gz')) |>
dplyr::select(-geneSymbol)
# sample information, platform, and other relevant details
metadata118719 <- pData(gse118719)|>
janitor::clean_names()
# clean up the _ch1 suffix
names(metadata118719) <- str_remove(names(metadata118719),"_ch1")
# replace the mrna_1 title into geo_accession (GSM3336938)
names(cts719)[-1] <- rownames(metadata118719)
```
## Get annotation from cloud
```{r prepare GRange from ah}
library(AnnotationHub)
ah <- AnnotationHub()
unique(ah$rdataclass)
query(ah,c("TxDb","hg19"))
# we choose TxDb for "Gencode v29 on hg19 coordinates"
ah['AH75170']
txdb <- ah[["AH75170"]]
txdb
# get GRanges
hg19 <- genes(txdb)
# drop the suffix of gene id
names(hg19) <- sub("\\..*", "", names(hg19))
head(names(hg19))
```
## Match gene in gse118719 with annotations
## Build SummarizedExperiment with gse118719
```{r SummarizedExperiment1}
# match gene in gse118719 with hg19
cts719 <- cts719[cts719$Geneid %in% names(hg19),]
dim(cts719)
head(cts719$Geneid)
hg19[cts719$Geneid]
gene_name_719 <- cts719$Geneid
cts719 <- dplyr::select(cts719,-Geneid)
row.names(cts719) <- gene_name_719
library(SummarizedExperiment)
se1 <- SummarizedExperiment(assays = cts719,
colData = pData(gse118719),
rowRanges = hg19[rownames(cts719)])
```
# For the `GSE118720` data
## Build and clean database
```{r make GSE118720_mrna.expression.tsv into dataframe , warning=FALSE}
# GSE118720_mrna.expression into table
cts720 <- read_tsv(here('data/GSE118720_mirna.expression.tsv.gz'))
# sample information, platform, and other relevant details
metadata118720 <- pData(gse118720) |> janitor::clean_names()
# clean up the _ch1 suffix
names(metadata118720) <- str_remove(names(metadata118720),"_ch1")
# replace the mrna_1 title into geo_accession (GSM3336938)
names(cts720)[-1] <- rownames(metadata118720)
cts720$miR_name <- str_remove(cts720$miR_name,"hsa-")
```
## Get gse118720 relevant annotations
```{r matching with ah}
unique(ah$rdataclass)
query(ah,c("hg19","mirna"))
# miRNA target association database
ah[['AH98193']]
# get GRanges
path720 <- ah[['AH98193']]
df <- read_csv(path720)
df$miR.Family <- sub("\\/.*","",df$miR.Family)
df2 <- DataFrame(df)
rowRanges <- makeGRangesFromDataFrame(df2,
seqnames.field = "miR.Family",
start.field = "UTR.start",end.field = "UTR.end")
names(rowRanges) <- df$miR.Family
```
## Build SummarizedExperiment with gse118720
```{r SummarizedExperiment2}
cts720_1 <- cts720[cts720$miR_name %in% names(rowRanges),]
dim(cts720_1)
mrna_name_720 <- cts720_1$miR_name
cts720_1 <- dplyr::select(cts720_1,-miR_name)
row.names(cts720_1) <- mrna_name_720
se2 <- SummarizedExperiment(assays = cts720_1,
colData = pData(gse118720),
rowRanges = rowRanges[rownames(cts720_1)])
```