library(tidyverse)
library(KEGGREST)
The scrapper use a list of kegg of interest, then go through the list to get the associated BRITE annotations. It create an arbitrary A to F/G categories that can be used down the line for annotation analysis.
ko.list<-c("K26400","K26400","K26441", "K26441")
The main function below.
ko.scrap<-function(ko.list,a,b,m){
ko.tmp<-list()
k=0
#1 is NA
for(i in a:b){
skip_to_next <- FALSE
tryCatch(ko.dl<-keggGet(ko.list[i]),error = function(e) { skip_to_next <<- TRUE})
if(skip_to_next) {
print(paste0("\r Skipping ",ko.list[i]))
next }
else{
cat(paste0("\r","Doing ",ko.list[i]," - ",i,"/",length(ko.list)))
k=k+1
tmp<-c(ko.dl[[1]]$BRITE)
cn<-paste0("k",k)
tt<-tibble(!!cn:=tmp)
kl<-character()
st<-tibble()
for(rn in 1:length(tmp)){
scc=nchar(str_extract(tt[rn,1],"^[:space:]*"))
if((scc+1)>nrow(st)){
st[(scc+1),1]<-1
st[(scc+1),2]<-scc
ll<-paste0("L",paste(st$...1,collapse = "."))
kl<-c(kl,ll)
} else if((scc+1)==nrow(st)){
st[(scc+1),1]<-st[(scc+1),1]+1
ll<-paste0("L",paste(st$...1,collapse = "."))
kl<-c(kl,ll)
} else if ((scc+1)<nrow(st)){
st[(scc+1),1]<-st[(scc+1),1]+1
st<-st[1:(scc+1),]
ll<-paste0("L",paste(st$...1,collapse = "."))
kl<-c(kl,ll)
}
#print(ll)
}
kl<-c("KO",kl)
tmp<-c(ko.dl[[1]]$ENTRY,tmp)
tmp<-gsub("^[ ]*","",tmp)
ko.tmp[[k]]<-tibble(kl=kl,!!cn:=tmp)
}
}
print("")
Kotbl<-reduce(ko.tmp,full_join,by=c("kl"))
write.csv(x = Kotbl,file=paste0("table",m,".csv"))
print(paste("Scrapped ", b-(a-1), " annotations"))
}
Scrapping per se. This scrit has a safeguard system that make a few temporary files to avoid overloading memory. By default it is set to 100, but can be changed.
if(length(ko.list)>100){
maxm=length(seq(1,length(ko.list),100))
for (m in 1:(maxm-1)){
a=seq(1,length(ko.list),100)[m]
b=seq(1,length(ko.list),100)[m+1]-1
ko.scrap(ko.list,a,b,m)
print(paste("Saved intermediary file table",m))
}
a=seq(1,length(ko.list),100)[length(seq(1,length(ko.list),100))]
b=length(ko.list)
ko.scrap(ko.list,a,b,maxm)
print(paste("Saved intermediary file table",maxm))
} else{
ko.scrap(ko.list,a,b,"1")
}
The tables are then reloaded and merged in one big table
table<-list.files(path = ".",pattern = "^table*")
kotbltmp<-list()
for(i in 1:length(table)){
kotbltmp[[i]]<-read_csv(table[i], skip=1) %>%
select(!c(`1`,colnames(.)[str_detect("\\.",string = colnames(.))]))
}
kotbl2<-reduce(kotbltmp, full_join)
#Make a matrix
kt<-kotbl2 %>%
pivot_longer(!KO, names_to = "Module", values_to = "name", values_drop_na = TRUE)
#Get final list
tmp<-kt %>%
mutate(ncharlen=str_count(KO,pattern = "\\."),ncharlen=LETTERS[ncharlen+1],rn=row_number()) %>%
arrange(Module, KO) %>%
mutate(set=case_when(ncharlen=="A" ~ nrow(.))) %>%
fill(set)
Create a matrix of text then fill the row by group of sets, then filter row with a "KO" word and extract it in its own column
tmp<- tmp %>% pivot_wider(names_from = ncharlen, id_cols = rn, values_from = name) %>%
left_join(tibble(rn=tmp$rn, set=tmp$set))
#Transpose to fill the end of hierachy to the end of the row (to avoid them to fill something they shouldn't)
tmp<-as.data.frame(t(tmp))
tmp<-tmp[-1,] %>% fill(names(tmp), .direction = "down")
tmp<-as.data.frame(t(tmp))
Voila
Final.Table<-tmp%>%
group_by(set) %>%
fill(A,B,C,D,E,F) %>%
ungroup() %>%
select(!c(set)) %>%
unique() %>%
rowwise() %>%
mutate(find_KO = any(str_detect(c_across(A:F), regex("K([:digit:]\\w+)", ignore_case = TRUE)), na.rm = TRUE)) %>%
filter(find_KO==TRUE) %>%
select(-find_KO) %>%
mutate(KO = unique(na.omit(str_extract(across(A:F), regex("K([:digit:]\\w+)")))))
I was trying to make a plot showing how many genes with functional annotations are present across my pipeline and connect the dots to show the connection between different boxplot and geom jitter.
Oct 10, 2023conda activate anvio-7.1anvi-script-process-genbank -i all_refs.gbff --output-gene-calls all_refs_gene_calls.tsv --output-functions all_refs_functions.tsv --output-fasta all_refs.fa --include-locus-tags-as-functionsanvi-gen-contigs-database -f all_refs.fa -o contigs.db -n Wormbiome --external-gene-calls all_refs_gene_calls.tsv --split-length -1 --num-threads 12anvi-import-functions -c contigs.db -i all_refs_functions.tsvanvi-run-hmms -c contigs.db --num-threads 12cd cleanfor genome in (ls *gbk | cut -f1 -d "."); do grep "LOCUS" "genome".gbk | tr -s " " “\t” | cut -f2 > “$genome”_contigs.tmp;for contig in (cat "genome"_contigs.tmp);do echo “genome"; done > "genome”_name.tmp;paste “genome"_contigs.tmp "genome”_name.tmp > “$genome”_for_cat.tmp;donecat *_for_cat.tmp > collection.tsv && rm *.tmpcd …mv clean/collection.tsv ./anvi-profile -c contigs.db -o profile -S profile --blank-profile --min-contig-length 0 --skip-hierarchical-clusteringanvi-import-collection collection.tsv -c contigs.db -p profile/PROFILE.db -C Wormbiome --contigs-mode
Oct 6, 2023This is always a difficult topic for me. How to handle proper statistics in my large data set analysis.
Aug 29, 2023Quick tutorial - process on metagenomic work
Aug 14, 2023or
By clicking below, you agree to our terms of service.
New to HackMD? Sign up