# Libraries

```{R}
library(tidyverse)
library(KEGGREST)
```

# Scraping

The scraper takes a list of KEGG identifiers (K numbers) of interest, then goes through the list to get the associated BRITE annotations. It creates arbitrary A to F/G categories (one letter per hierarchy level) that can be used down the line for annotation analysis.

```{R}
ko.list <- c("K26400", "K26400", "K26441", "K26441")
```

The main function is below.

```{R}
#' Build hierarchical labels for the lines of a BRITE field.
#'
#' The depth of a line is its number of leading whitespace characters plus
#' one. One counter is kept per depth: going deeper starts a new counter at 1,
#' staying at the same depth or going back up increments the counter of that
#' depth and discards the deeper ones. The label is "L" followed by the
#' counters joined with dots (e.g. "L1", "L1.1", "L1.2", "L2").
#' Assumes the indentation grows by one character per level -- TODO confirm
#' against the KEGG flat-file format.
#'
#' @param brite Character vector of BRITE lines (may be NULL or empty).
#' @return Character vector of labels, the same length as `brite`.
ko.brite.labels <- function(brite) {
  depth <- nchar(str_extract(brite, "^[:space:]*")) + 1
  labels <- character(length(brite))
  counters <- numeric()
  for (rn in seq_along(brite)) {
    d <- depth[rn]
    if (d > length(counters)) {
      # First line seen at this depth under the current parent.
      counters[d] <- 1
    } else {
      # Sibling (same depth) or return to a shallower level: advance that
      # level and forget everything below it.
      counters[d] <- counters[d] + 1
      counters <- counters[seq_len(d)]
    }
    labels[rn] <- paste0("L", paste(counters, collapse = "."))
  }
  labels
}

#' Download the BRITE annotations of a slice of `ko.list` and save them.
#'
#' For every index from `a` to `b` the entry is fetched with `keggGet()`.
#' Entries whose download fails are reported and skipped. Each retrieved entry
#' becomes a two-column tibble: `kl` (the label "KO" followed by the
#' hierarchical labels) and `k<n>` (the ENTRY id followed by the BRITE lines
#' with their leading spaces removed). All tibbles are full-joined on `kl` and
#' written to "table<m>.csv" in the working directory.
#'
#' @param ko.list Character vector of KEGG identifiers.
#' @param a,b First and last index of `ko.list` to process.
#' @param m Suffix used in the output file name.
#' @return Called for its side effects (progress output and the CSV file).
#'   Returns `NULL` invisibly, with a warning, when nothing could be fetched.
ko.scrap <- function(ko.list, a, b, m) {
  ko.tmp <- list()
  k <- 0
  for (i in a:b) {
    # A failed request yields NULL instead of flagging through `<<-`.
    ko.dl <- tryCatch(keggGet(ko.list[i]), error = function(e) NULL)
    if (length(ko.dl) == 0) {
      print(paste0("\r Skipping ", ko.list[i]))
      next
    }
    cat(paste0("\r", "Doing ", ko.list[i], " - ", i, "/", length(ko.list)))
    k <- k + 1
    brite <- ko.dl[[1]]$BRITE
    cn <- paste0("k", k)
    # An entry without BRITE field simply contributes its "KO" row.
    kl <- c("KO", ko.brite.labels(brite))
    tmp <- c(ko.dl[[1]]$ENTRY, brite)
    tmp <- gsub("^[ ]*", "", tmp)
    ko.tmp[[k]] <- tibble(kl = kl, !!cn := tmp)
  }
  print("")
  if (length(ko.tmp) == 0) {
    # reduce() cannot fold an empty list; there is nothing to write anyway.
    warning("No entry could be retrieved for table ", m, "; no file written",
            call. = FALSE)
    return(invisible(NULL))
  }
  Kotbl <- reduce(ko.tmp, full_join, by = c("kl"))
  # Row names are written on purpose: the reload step skips the header line
  # and expects the leading row-number column.
  write.csv(x = Kotbl, file = paste0("table", m, ".csv"))
  # Report the entries actually retrieved, not the number attempted.
  print(paste("Scrapped ", k, " annotations"))
}
```

Scraping per se. This script has a safeguard system that writes a few temporary files to avoid overloading memory. By default the chunk size is set to 100, but it can be changed.
```{r}
# Number of identifiers scraped per intermediary file.
chunk_size <- 100

# Query every identifier once. Duplicated ids would produce duplicated column
# headers in the CSV files; on reload those are renamed with a "...N" suffix
# and then all copies are removed by the select() below.
ko_unique <- unique(ko.list)
if (length(ko_unique) == 0) {
  stop("ko.list is empty: nothing to scrape", call. = FALSE)
}

# One pass per chunk; this also covers lists shorter than one chunk, for
# which `a` and `b` were previously never defined.
chunk_starts <- seq(1, length(ko_unique), by = chunk_size)
maxm <- length(chunk_starts)
for (m in seq_along(chunk_starts)) {
  a <- chunk_starts[m]
  b <- min(a + chunk_size - 1, length(ko_unique))
  ko.scrap(ko_unique, a, b, m)
  print(paste("Saved intermediary file table", m))
}
```

The tables are then reloaded and merged into one big table.

```{R}
# Only the CSV files written by ko.scrap ("table<m>.csv"). Files left over
# from a previous run are picked up as well -- remove them first if needed.
table <- list.files(path = ".", pattern = "^table.*\\.csv$")
if (length(table) == 0) {
  stop("No intermediary 'table*.csv' file found in the working directory",
       call. = FALSE)
}

# skip = 1 drops the CSV header so that the "KO" row (the entry ids) becomes
# the column names: the first column is then named `1` (row number) and the
# label column is named `KO`. Columns whose name contains a dot are the
# result of name repair and are dropped.
kotbltmp <- lapply(table, function(path) {
  read_csv(path, skip = 1, col_types = cols(.default = col_character())) %>%
    select(!c(`1`, contains(".")))
})
kotbl2 <- reduce(kotbltmp, full_join, by = "KO")

# Make a matrix: one row per (label, entry) pair that has an annotation.
kt <- kotbl2 %>%
  pivot_longer(!KO, names_to = "Module", values_to = "name",
               values_drop_na = TRUE)

# Get final list: the level letter is derived from the number of dots in the
# label ("L1" -> A, "L1.1" -> B, ...); every level-A row opens a new set.
tmp <- kt %>%
  mutate(
    ncharlen = LETTERS[str_count(KO, pattern = "\\.") + 1],
    rn = row_number()
  ) %>%
  arrange(Module, KO) %>%
  mutate(set = cumsum(ncharlen == "A"))
```

Create a matrix of text, then fill the rows by group of sets, then keep the rows containing a K number and extract it into its own column.

```{R}
set_lookup <- select(tmp, rn, set)
tmp <- tmp %>%
  pivot_wider(names_from = ncharlen, id_cols = rn, values_from = name) %>%
  left_join(set_lookup, by = "rn")

# Transpose to fill the end of the hierarchy to the end of the row (so that
# the later downward fill cannot put a value where it should not).
tmp <- as.data.frame(t(tmp))
# Drop the `rn` row; drop = FALSE keeps a data frame even with one column.
tmp <- tmp[-1, , drop = FALSE] %>%
  fill(everything(), .direction = "down")
tmp <- as.data.frame(t(tmp))
```

Voilà.

```{R}
# Every column except `set` is a hierarchy level (A, B, C, ...), however deep
# the retrieved hierarchies are.
level_cols <- setdiff(names(tmp), "set")

Final.Table <- tmp %>%
  group_by(set) %>%
  fill(all_of(level_cols)) %>%
  ungroup() %>%
  select(!c(set)) %>%
  distinct() %>%
  rowwise() %>%
  mutate(
    find_KO = any(
      str_detect(
        c_across(all_of(level_cols)),
        regex("K([:digit:]\\w+)", ignore_case = TRUE)
      ),
      na.rm = TRUE
    )
  ) %>%
  filter(find_KO == TRUE) %>%
  select(-find_KO) %>%
  mutate(
    KO = unique(na.omit(
      str_extract(c_across(all_of(level_cols)), regex("K([:digit:]\\w+)"))
    ))
  ) %>%
  ungroup()
```