# Libraries
```{R}
library(tidyverse)
library(KEGGREST)
```
# Scraping
The scraper takes a list of KEGG Orthology (KO) identifiers of interest, then goes through the list to retrieve the associated BRITE annotations. It creates arbitrary A to F/G categories that can be used downstream for annotation analysis.
```{R}
# KO identifiers to look up; ko.scrap() passes them one at a time to keggGet().
# NOTE(review): each ID is listed twice here - confirm the duplicates are intentional.
ko.list<-c("K26400","K26400","K26441", "K26441")
```
The main function is defined below.
```{R}
# Turn the indentation depth of consecutive hierarchy lines into dotted
# outline labels ("L1", "L1.1", "L1.2", "L2", ...).
#
# indent: integer vector, number of leading whitespace characters of each line
#         (0 = top level).
# Returns a character vector of the same length as `indent`.
ko.level.labels <- function(indent) {
  labels <- character(length(indent))
  counters <- integer()  # counters[d] = running index of the current node at depth d
  for (rn in seq_along(indent)) {
    depth <- indent[rn] + 1L
    if (depth > length(counters)) {
      # Going deeper: every newly opened level starts counting at 1.
      counters <- c(counters, rep(1L, depth - length(counters)))
    } else {
      # Same depth or back up: advance the counter at this depth and forget
      # the counters of the deeper levels that were just closed.
      counters[depth] <- counters[depth] + 1L
      counters <- counters[seq_len(depth)]
    }
    labels[rn] <- paste0("L", paste(counters, collapse = "."))
  }
  labels
}

# Download the BRITE hierarchy of ko.list[a:b] from KEGG and save it as
# "table<m>.csv" in the working directory.
#
# ko.list: character vector of identifiers passed to keggGet().
# a, b:    first and last index of ko.list to process.
# m:       suffix used in the output file name.
#
# The CSV has one label column `kl` ("KO" for the ENTRY row, then the outline
# labels of the BRITE lines) and one column k1, k2, ... per entry that was
# retrieved. Entries that cannot be downloaded are skipped with a message.
# Nothing is written (and a warning is raised) when no entry was retrieved.
ko.scrap <- function(ko.list, a, b, m) {
  ko.tmp <- list()
  k <- 0
  for (i in a:b) {
    ko.dl <- tryCatch(keggGet(ko.list[i]), error = function(e) e)
    if (inherits(ko.dl, "error") || length(ko.dl) == 0) {
      reason <- if (inherits(ko.dl, "error")) conditionMessage(ko.dl) else "empty response"
      print(paste0("\r Skipping ", ko.list[i], " (", reason, ")"))
      next
    }
    cat(paste0("\r", "Doing ", ko.list[i], " - ", i, "/", length(ko.list)))
    k <- k + 1
    cn <- paste0("k", k)

    # An entry without a BRITE field yields zero hierarchy lines; it is kept
    # with its ENTRY row only instead of crashing the whole chunk.
    brite <- c(ko.dl[[1]]$BRITE)
    if (is.null(brite)) {
      brite <- character()
    }
    # Depth of each line = number of leading whitespace characters.
    indent <- nchar(str_extract(brite, "^[:space:]*"))

    kl <- c("KO", ko.level.labels(indent))
    values <- gsub("^[ ]*", "", c(ko.dl[[1]]$ENTRY, brite))
    ko.tmp[[k]] <- tibble(kl = kl, !!cn := values)
  }
  print("")
  if (length(ko.tmp) == 0) {
    warning("No annotation could be retrieved for entries ", a, " to ", b,
            "; table", m, ".csv was not written", call. = FALSE)
    return(invisible(NULL))
  }
  Kotbl <- reduce(ko.tmp, full_join, by = c("kl"))
  write.csv(x = Kotbl, file = paste0("table", m, ".csv"))
  # Report the entries actually scraped, not the number requested.
  print(paste("Scrapped ", k, " annotations"))
}
```
Scraping per se. This script has a safeguard: it processes the list in chunks and writes each chunk to an intermediate file to avoid overloading memory. By default the chunk size is 100 entries, but it can be changed.
```{r}
# Scrape ko.list in chunks of `chunk.size` entries; each chunk is written to
# its own intermediate file table<m>.csv by ko.scrap().
chunk.size <- 100
n.ko <- length(ko.list)
if (n.ko == 0) {
  stop("ko.list is empty: nothing to scrape", call. = FALSE)
}
# First index of every chunk; the last chunk may be shorter than chunk.size.
chunk.starts <- seq(1, n.ko, by = chunk.size)
maxm <- length(chunk.starts)
for (m in seq_len(maxm)) {
  a <- chunk.starts[m]
  b <- min(a + chunk.size - 1, n.ko)
  # A list that fits in a single chunk is handled by the same path
  # (a = 1, b = length(ko.list), file table1.csv).
  ko.scrap(ko.list, a, b, m)
  print(paste("Saved intermediary file table", m))
}
```
The tables are then reloaded and merged into one big table.
```{R}
# Reload the intermediate files written by ko.scrap() ("table<m>.csv").
# `pattern` is a regular expression, not a glob: anchor it on both ends so
# only table*.csv files are picked up.
table <- list.files(path = ".", pattern = "^table.*\\.csv$")
if (length(table) == 0) {
  stop("No intermediate table*.csv file found in the working directory",
       call. = FALSE)
}
# skip = 1 drops the CSV header, so the first data row (row name `1`, label
# `KO`, then one ENTRY id per column) becomes the column names. The `1`
# row-name column is removed, together with every column whose name contains
# a dot (presumably duplicated ids renamed by read_csv - confirm).
kotbltmp <- lapply(table, function(csv.file) {
  read_csv(csv.file, skip = 1) %>%
    select(!c(`1`, colnames(.)[str_detect("\\.", string = colnames(.))]))
})
kotbl2 <- reduce(kotbltmp, full_join)
# Long format: one row per (level label, entry) pair that has an annotation.
kt <- kotbl2 %>%
  pivot_longer(!KO, names_to = "Module", values_to = "name", values_drop_na = TRUE)
# Derive the hierarchy letter from the number of dots in the level label
# (no dot -> "A", one dot -> "B", ...), remember the current row order in
# `rn`, then sort by entry and label.
tmp <- kt %>%
  mutate(
    ncharlen = str_count(KO, pattern = "\\."),
    ncharlen = LETTERS[ncharlen + 1],
    rn = row_number()
  ) %>%
  arrange(Module, KO) %>%
  # nrow(.) is the row count of the whole table, so `set` receives the same
  # value on every level-"A" row before being filled downwards.
  mutate(set = case_when(ncharlen == "A" ~ nrow(.))) %>%
  fill(set)
```
Create a matrix of text, then fill the rows by group of sets. Finally, keep only the rows that contain a KO identifier and extract that identifier into its own column.
```{R}
# Spread the annotation text into one column per hierarchy letter (`ncharlen`),
# one row per `rn`, then re-attach `set` through `rn` (no `by` is given, so the
# join key is inferred from the shared column).
tmp<- tmp %>% pivot_wider(names_from = ncharlen, id_cols = rn, values_from = name) %>%
left_join(tibble(rn=tmp$rn, set=tmp$set))
# Transpose so that each original row becomes a column: filling "down" then
# propagates a row's annotation from its own level towards the deeper levels
# only, never towards shallower ones.
tmp<-as.data.frame(t(tmp))
# Drop the first transposed row (presumably the `rn` identifiers, since `rn` is
# the id column of the pivot - confirm) and fill every column downwards.
# names(tmp) is evaluated on the transposed table assigned on the previous line.
tmp<-tmp[-1,] %>% fill(names(tmp), .direction = "down")
# Transpose back to one row per annotation line, one column per level.
tmp<-as.data.frame(t(tmp))
```
Voila
```{R}
# Build the final annotation table: complete the hierarchy columns, drop
# duplicated rows, keep only rows mentioning a K number and store that K number
# in its own `KO` column.

# Hierarchy columns actually present (A to F at most); a shallower hierarchy
# simply has fewer of them.
level.cols <- intersect(LETTERS[1:6], names(tmp))
if (length(level.cols) == 0) {
  stop("No hierarchy column (A to F) found in `tmp`", call. = FALSE)
}
ko.pattern <- regex("K([:digit:]\\w+)")

# Inherit the missing parent levels from the rows above, within each `set`.
filled <- tmp %>%
  group_by(set) %>%
  fill(all_of(level.cols)) %>%
  ungroup() %>%
  select(!c(set)) %>%
  unique()

# K number found in each level column; the first non-missing hit from A to F
# is kept, so every row gets exactly one value (or NA when there is none).
ko.hits <- lapply(level.cols, function(col) str_extract(filled[[col]], ko.pattern))

Final.Table <- filled %>%
  mutate(KO = do.call(coalesce, ko.hits)) %>%
  filter(!is.na(KO))
```