get_update_date <- function(){
  url <- 'https://biobank.ndph.ox.ac.uk/showcase/browse.cgi?id=-2'
  html <- read_html(url)
  html %>%
    html_elements(xpath = '//div[@class="footnote"]') %>%
    html_text() %>%
    do::Replace0('Summary generated ')
}

get_ALL_category <- function(res=NULL,url=NULL){
  # res = NULL
  # url <- NULL
  if (is.null(res)) res <- data.frame()
  if (is.null(url)){
    url <- 'https://biobank.ndph.ox.ac.uk/showcase/cats.cgi?id=0'
    html <- read_html2(url)
    (df <- html %>%
        html_table() %>%
        do.call(what=rbind) %>%
        data.frame() %>%
        unique())
  }else{
    html <- read_html(url)
    (df <- html %>%
        html_elements(xpath = '//div[@class="tabbertab"]') %>%
        set::grep_and('Sub-Categor') %>%
        html_table() %>%
        do.call(what=rbind) %>%
        data.frame() %>%
        unique())
  }
  
  res <- plyr::rbind.fill(res,df)
  (ck <- grepl('\\+',df$Items))
  (dfp <- df[ck,])
  if (nrow(dfp)>0){
    for (i in 1:nrow(dfp)){
      message(nrow(dfp),'/',i)
      url <- paste0('https://biobank.ndph.ox.ac.uk/showcase/label.cgi?id=',dfp$Category.ID[i])
      res <- get_ALL_category(res,url)
    }
  }
  res[is.na(res)] <- ''
  res$Items <- do::Replace0(res$Items,'\\+')
  res <- res[order(nchar(res$Items),decreasing = T),]
  return(unique(res))
}

related_main <- function(html,res,msg=''){
  if (!'Relationship' %in% colnames(res)) res$Relationship <- NA
  allls <- ls(all.names = T,envir = .GlobalEnv)
  (tabb <- html_elements(html,xpath = '//div[@class="tabbertab"]'))
  (ck1 <- grepl('Related Data-Field',do::Replace0(tabb,'</h2>.*'),T))
  (ck2 <- !grepl('>0 Related Data-Field',do::Replace0(tabb,'</h2>.*'),T))
  (ck <- ck1 & ck2)
  if (any(ck)){
    (dr <- data.frame(html_table(tabb[ck])[[1]]))
    for (n in 1:nrow(dr)){
      cat('   获取Related Data-Field')
      ck1 <- res$Field.ID %in% dr$Field.ID[n]
      ck2 <- !is.na(res$Relationship)
      ck <- any(ck1 & ck2)
      t1 <- paste0(get_title(html)[1,1],
                   do::rep_n(' ',max(7-nchar(get_title(html)[1,1]),3)))
      t2 <- paste0(n,do::rep_n(' ',max(2-nchar(n),3)))
      cat('\n','       ',msg,': ',t1,' ',nrow(dr),'/',t2,' ',ifelse(ck,'old',''))
      if (ck) next(n)
      (urlm <- browse_url_field(dr$Field.ID[n]))
      html <- read_html2(urlm,paste0('\n休息',msg,'-'))
      
      (resm <- get_main(html))
      (resm$Relationship <- dr$Relationship[n])
      
      (ck <- grepl('Current Field',resm$Relationship,T))
      if (ck) resm$Relationship <- gsub('Current Field',paste0('Field ',resm$Field.ID),resm$Relationship,T)
      if (!ck) resm$Relationship <- paste0('Relationship to Field: ',resm$Field.ID,': ',resm$Relationship)
      
      res <- plyr::rbind.fill(res,resm)
      res <- related_main(html,res,msg)
    }
  }else{
    cat('     没有Related Data-Field')
  }
  return(res)
}

browse_url_categ <- function(id){
  paste0('https://biobank.ndph.ox.ac.uk/showcase/label.cgi?id=',id)
}
browse_url_field <- function(id){
  paste0('https://biobank.ndph.ox.ac.uk/showcase/field.cgi?id=',id)
}
get_title <- function(html){
  (fld <- html %>%
     html_elements(xpath = '//span[@class="screentitle"]') %>%
     html_text() %>%
     do::Replace0('Data-Field ','Category '))
  
  (des <- html %>%
      html_elements(xpath = '//table[@summary="Identification"]/tr') %>%
      set::grep_and('Description') %>%
      html_elements(xpath = 'td') %>%
      set::grep_not_and('Description') %>%
      html_text())
  if (length(des)==0){
    (des <- html %>%
       html_elements(xpath = '//div[@id="main"]') %>%
       html_text() %>%
       do::Replace0('.*\nDescription\n','\n'))
  }
  data.frame(Field.ID=fld,
             Description=des)
  
}
get_category <- function(html){
  (categ <- html %>%
     html_elements(xpath = '//table[@summary="Identification"]/tr//a') %>%
     html_text())
  (categ_id <- html %>%
      html_elements(xpath = '//table[@summary="Identification"]/tr//a') %>%
      do::attr_href() %>%
      do::Replace0('label.cgi\\?id='))
  if (length(categ)==0){
    (categ <- html %>%
       html_elements(xpath = '//div[@id="main"]//a[@class="basic"]') %>%
       html_text())
    (categ_id <- html %>%
        html_elements(xpath = '//div[@id="main"]//a[@class="basic"]') %>%
        do::attr_href() %>%
        do::Replace0('label.cgi\\?id='))
  }
  if (length(categ) == 0) return(NA)
  for (t in 1:length(categ)){
    iti <- data.frame(categ[t],categ_id[t])
    colnames(iti) <- sprintf(c('Category%s','Category%s_id'),t)
    if (t==1){
      Category <- iti
    }else{
      Category <- cbind(Category,iti)
    }
  }
  Category
}
get_cpn <- function(html){
  (blu <- html %>% html_elements(xpath = '//td'))
  (pn0 <- do::Replace0(html_text(blu[which(grepl('participants',blu,T) & grepl('txt_blu',blu,T)) + 1]),',',' '))
  (pn <- suppressWarnings(as.numeric(pn0)))
  (cn <- suppressWarnings(as.numeric(do::Replace0(html_text(blu[which(grepl('Item count',blu,T)) + 1]),',',' '))))
  if (length(pn) == 0 & length(cn)==0){
    NA
  }else{
    if (is.na(pn)){
      pn0
    }else{
      sprintf('%s/%s=%s',cn,pn,round(cn/pn,1))
    }
  }
}
get_costTier <- function(html){
  (blu <- html %>% html_elements(xpath = '//td'))
  (ct <- html_text(blu[which(grepl('Cost Tier',blu,T)) + 1]))
  if (length(ct)==0){
    NA
  }else{
    ct
  }
}
get_main <- function(html){
  (categ <- get_category(html))
  (cpn <- get_cpn(html))
  (CostTier <- get_costTier(html))
  (title <- get_title(html))
  df <- cbind(categ,title,cpn,CostTier)
  df[,do::NA.col.sums(df) == 0]
}
read_html2 <- function(url,msg=''){
  (html <- tryCatch(read_html(url),error=function(e) 'e'))
  for (n in 1:100){
    if (is.character(html)){
      Sys.sleep(5)*sample(c(1,2,3),1) + abs(rnorm(1,1,1))
      cat(msg,n)
      closeAllConnections()
      html <- tryCatch(read_html(url),error=function(e) 'e')
    }
  }
  if (is.character(html)) stop('没有读取成功')
  html
}

# 开始提取----------------------------
allcategory <- get_ALL_category()
dim(allcategory)
(id <- allcategory$Category.ID)
library(htmltools)
library(rvest)
# 提取文件名
i=133
i=1
i=6
for (i in 1:length(id)){
  message('\n',i,'/',length(id),' ',allcategory$Category.ID[i])
  if (i==1) res <- data.frame()
  (url <- browse_url_categ(id[i]))
  (html_i <- read_html2(url,'\n休息i'))
  (tabb <- html_i %>% html_elements(xpath = '//div[@class="tabbertab"]'))
  (ck <- grepl('data-field',do::Replace0(tabb,'</h2>.*'),T))
  if (any(ck)){
    (di <- data.frame(html_table(tabb[ck])[[1]])[,c("Field.ID","Description")])
    for (j in 1:nrow(di)){
      cat('\n','j =',j,'/',nrow(di))
      (urlj <- browse_url_field(di$Field.ID[j]))
      (html_j <- read_html2(urlj,'\n休息j-'))
      (resj <- get_main(html_j))
      (res <- plyr::rbind.fill(res,resj))
      res <- related_main(html_j,res,'m')
    }
  }else{
    (resi <- get_main(html_i))
    if (any(grepl('Category',colnames(res)))){
      res <- plyr::rbind.fill(res,resi)      
    }
  }
}
res <- unique(res)
ck <- do::left(colnames(res),4) %in% 'Category'
res <- res[,c(colnames(res)[ck],
              colnames(res)[!ck])]

res <- cbind(seq=1:nrow(res),res)
openxlsx::write.xlsx(res,'varlabel.xlsx')
data.table::fwrite(res,'varlabel.txt')









