Libraries

library(tidyverse)
library(KEGGREST)

Scrapping

The scrapper use a list of kegg of interest, then go through the list to get the associated BRITE annotations. It create an arbitrary A to F/G categories that can be used down the line for annotation analysis.

ko.list<-c("K26400","K26400","K26441", "K26441")

The main function below.

ko.scrap<-function(ko.list,a,b,m){
  ko.tmp<-list()
  k=0
  #1 is NA
  for(i in a:b){
    skip_to_next <- FALSE
    tryCatch(ko.dl<-keggGet(ko.list[i]),error = function(e) { skip_to_next <<- TRUE})
    if(skip_to_next) { 
      print(paste0("\r Skipping ",ko.list[i]))
      next }
    else{
      cat(paste0("\r","Doing ",ko.list[i]," - ",i,"/",length(ko.list)))
      k=k+1
      tmp<-c(ko.dl[[1]]$BRITE)
      cn<-paste0("k",k)
      tt<-tibble(!!cn:=tmp)
      kl<-character()
      st<-tibble()
      for(rn in 1:length(tmp)){
        scc=nchar(str_extract(tt[rn,1],"^[:space:]*"))
        if((scc+1)>nrow(st)){
          st[(scc+1),1]<-1
          st[(scc+1),2]<-scc
          ll<-paste0("L",paste(st$...1,collapse = "."))
          kl<-c(kl,ll)
        } else if((scc+1)==nrow(st)){
          st[(scc+1),1]<-st[(scc+1),1]+1
          ll<-paste0("L",paste(st$...1,collapse = "."))
          kl<-c(kl,ll)
        } else if ((scc+1)<nrow(st)){
          st[(scc+1),1]<-st[(scc+1),1]+1
          st<-st[1:(scc+1),]
          ll<-paste0("L",paste(st$...1,collapse = "."))
          kl<-c(kl,ll)
        }
        #print(ll)
      }
      kl<-c("KO",kl)
      tmp<-c(ko.dl[[1]]$ENTRY,tmp)
      tmp<-gsub("^[ ]*","",tmp)
      ko.tmp[[k]]<-tibble(kl=kl,!!cn:=tmp)
    }
  }
  print("")
  Kotbl<-reduce(ko.tmp,full_join,by=c("kl"))
  write.csv(x = Kotbl,file=paste0("table",m,".csv"))
  print(paste("Scrapped ", b-(a-1), " annotations"))
}

Scrapping per se. This scrit has a safeguard system that make a few temporary files to avoid overloading memory. By default it is set to 100, but can be changed.

if(length(ko.list)>100){ 
  maxm=length(seq(1,length(ko.list),100))
  for (m in 1:(maxm-1)){
    a=seq(1,length(ko.list),100)[m]
    b=seq(1,length(ko.list),100)[m+1]-1
    ko.scrap(ko.list,a,b,m)
    print(paste("Saved intermediary file table",m))
  }
  a=seq(1,length(ko.list),100)[length(seq(1,length(ko.list),100))]
  b=length(ko.list)
  ko.scrap(ko.list,a,b,maxm)
  print(paste("Saved intermediary file table",maxm))
} else{
  ko.scrap(ko.list,a,b,"1")
}

The tables are then reloaded and merged in one big table

table<-list.files(path = ".",pattern = "^table*")
kotbltmp<-list()
for(i in 1:length(table)){
  kotbltmp[[i]]<-read_csv(table[i], skip=1) %>%
    select(!c(`1`,colnames(.)[str_detect("\\.",string = colnames(.))]))
}
kotbl2<-reduce(kotbltmp, full_join)

#Make a matrix

kt<-kotbl2 %>% 
  pivot_longer(!KO, names_to = "Module", values_to = "name", values_drop_na = TRUE)

#Get final list

tmp<-kt %>%
  mutate(ncharlen=str_count(KO,pattern = "\\."),ncharlen=LETTERS[ncharlen+1],rn=row_number()) %>% 
  arrange(Module, KO) %>% 
  mutate(set=case_when(ncharlen=="A" ~ nrow(.))) %>%
  fill(set)

Create a matrix of text then fill the row by group of sets, then filter row with a "KO" word and extract it in its own column

tmp<- tmp %>% pivot_wider(names_from = ncharlen, id_cols = rn, values_from = name) %>%
  left_join(tibble(rn=tmp$rn, set=tmp$set))
#Transpose to fill the end of hierachy to the end of the row (to avoid them to fill something they shouldn't)
tmp<-as.data.frame(t(tmp))
tmp<-tmp[-1,] %>% fill(names(tmp), .direction = "down")
tmp<-as.data.frame(t(tmp))

Voila

Final.Table<-tmp%>%
  group_by(set) %>%
  fill(A,B,C,D,E,F) %>%
  ungroup() %>%
  select(!c(set)) %>%
  unique() %>% 
  rowwise() %>% 
  mutate(find_KO = any(str_detect(c_across(A:F), regex("K([:digit:]\\w+)", ignore_case = TRUE)), na.rm = TRUE)) %>%
  filter(find_KO==TRUE) %>% 
  select(-find_KO) %>% 
  mutate(KO = unique(na.omit(str_extract(across(A:F), regex("K([:digit:]\\w+)")))))

Libraries

Scrapping

Read more

R - Lines and geom_jitter

Anvio Pangenomic

Statistiques on large datsets

Metagenomic assembly pipeline