Xml readHTMLTable对单元格中的字符数有限制吗?

Xml readHTMLTable对单元格中的字符数有限制吗?（标签：xml、r、rselenium）

我使用RSelenium在联合国条约收集网站上提交表格并保存结果。除了条约名称在我的最后一个表格中被截断之外，一切都正常。是因为readHTMLTable可以读取的字符数有限制，还是我做错了什么？

下面是一个(希望)可重复的示例:

###
###   RSelenium scraping of UN treaty collection
###

# https://cran.r-project.org/web/packages/RSelenium/vignettes/RSelenium-basics.html

rm(list=ls())

###
setwd("C:/Users/HP/Desktop/BIT/UN_treaties")

library("XML")
library("RCurl")
library("RSelenium")
library("RHTMLForms")


# Start Selenium Server --------------------------------------------------------

checkForServer()  ## check if the server is there
startServer() ## start the selenium server
remDrv <- remoteDriver() ## connect server
remDrv$open() ## connect server
Sys.sleep(5) 
#remDrv$getStatus()  ## info connection, not necessary

# Simulate browser session and fill out form -----------------------------------

## go to page
remDrv$navigate('https://treaties.un.org/Pages/UNTSOnline.aspx?id=2')

## check out what is of interest:

## additional attributes
webElem1 <- remDrv$findElement(using = 'xpath', "//*/input[@value = 'cnd2']")
webElem1$getElementAttribute("class")
webElem1$getElementAttribute("type")
webElem1$getElementAttribute("id")
webElem1$clickElement()  ## match all the elements
Sys.sleep(5) 

## results per page
webElem2 <- remDrv$findElement(using = 'css selector', "select.basicPullDn")    ## . denotes class
webElem2$getElementAttribute("class")
webElem2$getElementAttribute("type")
webElem2$getElementAttribute("id")
Sys.sleep(5) 

## results per page more in detail
webElem3 <- remDrv$findElement(using = 'xpath', "//*/select[@class = 'basicPullDn']/option[@value = '500']")
webElem3$getElementAttribute("class")
webElem3$getElementAttribute("type")
webElem3$getElementAttribute("id")
webElem3$clickElement()  ## number of pages
Sys.sleep(5) 

## element to get total count
webElem5 <- remDrv$findElement(using = 'css selector', "span.RecordCount")    ## . denotes class
webElem5$getElementAttribute("class")
webElem5$getElementAttribute("type")
webElem5$getElementAttribute("id")

## find first element of interest: text search
webElem0 <- remDrv$findElement(using = 'css selector', "input.login")    ## . denotes class
webElem0$getElementAttribute("class")
webElem0$getElementAttribute("type")
webElem0$getElementAttribute("id")
Sys.sleep(5) 

df_all <- data.frame()
###### need to run search for multiple countries
country_list <- c("Morocco", "Italy", "France")

for (i in country_list){
  Sys.sleep(5) 
  ## define keys to search
  keys <- paste(i, "Agreement promotion investment", sep=" ")

  ## search for files, one by one and save results
  webElem0$clearElement()
  webElem0$sendKeysToElement(list(keys, key = "enter"))
  Sys.sleep(20) 

  # check if the table is there
  doc<-htmlParse(remDrv$getPageSource()[[1]])  ## now parse html so that we can search it
  tables = readHTMLTable(doc)  ## extract all tables
  #names(tables)   ## names of all tables

  tableexists <- grepl("ctl00_ContentPlaceHolder1_dgSearch", names(tables))
  yes_no <- all(tableexists==F)
  yes_no

  if(yes_no==FALSE){
  ## copy table
  table <- remDrv$findElement(using = 'xpath', "//*[@id = 'ctl00_ContentPlaceHolder1_dgSearch']")
  table$getElementAttribute("class")
  table$getElementAttribute("type")
  table$getElementAttribute("id")

  ## extract table of interest
  tabledat <-readHTMLTable(doc, stringsAsFactors = F , skip.rows=c(1))[[37]]
  df_all <- rbind(tabledat, df_all)
  }else{print("caccadicane")}
}

write.csv(df_all[,-(7:ncol(df_all))], ("un_bits.csv"))  

为什么V2中的字符串会被截断?

好的，过了一段时间，我发现即使readHTMLTable命令有字符数限制，这也不是本例中文本被截断的原因。通过仔细检查HTML文件，我发现文本在HTML中就已经被截断了，而全名保存在单元格的“title”属性中。

因此，解决办法是读取每个单元格“title”属性中的文本，以获得协定的全称。下面是代码，如果有人感兴趣，其中还添加了一些其他内容。

###
###   RSelenium scraping of UN treaty collection
###

# https://cran.r-project.org/web/packages/RSelenium/vignettes/RSelenium-basics.html

rm(list=ls())

###
setwd("C:/Users/HP/Desktop/BIT/UN_treaties")

library("XML")
library("RCurl")
library("RSelenium")
library("RHTMLForms")


# Start Selenium Server --------------------------------------------------------

checkForServer()  ## check if the server is there
startServer() ## start the selenium server
remDrv <- remoteDriver() ## connect server
remDrv$open() ## connect server
Sys.sleep(5) 
#remDrv$getStatus()  ## info connection, not necessary

# Simulate browser session and fill out form -----------------------------------

## go to page
remDrv$navigate('https://treaties.un.org/Pages/UNTSOnline.aspx?id=2')

## check out what is of interest:

## additional attributes
webElem1 <- remDrv$findElement(using = 'xpath', "//*/input[@value = 'cnd2']")
webElem1$getElementAttribute("class")
webElem1$getElementAttribute("type")
webElem1$getElementAttribute("id")
webElem1$clickElement()  ## match all the elements
Sys.sleep(5) 

## results per page
webElem2 <- remDrv$findElement(using = 'css selector', "select.basicPullDn")    ## . denotes class
webElem2$getElementAttribute("class")
webElem2$getElementAttribute("type")
webElem2$getElementAttribute("id")
Sys.sleep(5) 

## results per page more in detail
webElem3 <- remDrv$findElement(using = 'xpath', "//*/select[@class = 'basicPullDn']/option[@value = '500']")
webElem3$getElementAttribute("class")
webElem3$getElementAttribute("type")
webElem3$getElementAttribute("id")
webElem3$clickElement()  ## number of pages
Sys.sleep(5) 

## element to get total count
webElem5 <- remDrv$findElement(using = 'css selector', "span.RecordCount")    ## . denotes class
webElem5$getElementAttribute("class")
webElem5$getElementAttribute("type")
webElem5$getElementAttribute("id")

## find first element of interest: text search
webElem0 <- remDrv$findElement(using = 'css selector', "input.login")    ## . denotes class
webElem0$getElementAttribute("class")
webElem0$getElementAttribute("type")
webElem0$getElementAttribute("id")
Sys.sleep(5) 

df_all <- data.frame()
###### need to run search for multiple countries
#country_list <- c("Morocco", "Italy", "Brutto porco", "France")
names <- read.csv("participants_clean.csv")
country_list <- names$names
current_search  <- length(country_list)

for (i in country_list){

  #i <- "Morocco" 
  print("-------------------------")
  print("-------------------------")
  text <- paste("Still", current_search, "searches to do... ", sep=" ")
  print(text)
  text0 <- paste("Now looking for treaties signed by...  ", i , " ----------------------->>" , sep=" ")
  print(text0)
  Sys.sleep(5) 
  ## define keys to search
  keys <- paste(i, "Agreement promotion investment", sep=" ")

  ## search for files, one by one and save results
  webElem0$clearElement()
  webElem0$sendKeysToElement(list(keys, key = "enter"))
  Sys.sleep(20) 

  # check if the table is there
  doc<-htmlParse(remDrv$getPageSource()[[1]])  ## now parse html so that we can search it
  tables = readHTMLTable(doc)  ## extract all tables
  #names(tables)   ## names of all tables

  tableexists <- grepl("ctl00_ContentPlaceHolder1_dgSearch", names(tables))
  yes_no <- all(tableexists==F)
  yes_no

  if(yes_no==FALSE){
  ## copy table
  table <- remDrv$findElement(using = 'xpath', "//*[@id = 'ctl00_ContentPlaceHolder1_dgSearch']")
  table$getElementAttribute("class")
  table$getElementAttribute("type")
  table$getElementAttribute("id")

  ## extract table of interest
  tabledat <-readHTMLTable(doc, stringsAsFactors = F )[[37]]
  treatfou <-nrow(tabledat)
  text1 <- paste("Amazing, I just found", treatfou - 1, " !!", sep=" ")
  print(text1)

  ## now need to extract the real names of the treaties: start from 2 to treatfound
  names_new <- vector(mode="character",length = treatfou)
  urls <- vector(mode="character",length = treatfou)

  for (jj in 2:treatfou) {
    cell_add <- paste("//*[@id='ctl00_ContentPlaceHolder1_dgSearch']/tbody/tr[", jj, "]/td[2]", sep="")
    cell_table <- remDrv$findElement(using = 'xpath', cell_add)
    names_new[[jj]] <- as.character(cell_table$getElementAttribute("title"))
  }

  ## now substitute in the real titles:
  names_new <- as.vector(unlist(names_new))
  tabledat$title <- names_new
  tabledat$party <- i

  ## get the link
  for (jj in 2:treatfou) {
    url_add <- paste("//*[@id='ctl00_ContentPlaceHolder1_dgSearch']/tbody/tr[", jj, "]/td[3]/a", sep="")
    url_add <- remDrv$findElement(using = 'xpath', url_add)
    gio <- unlist(url_add$getElementAttribute("href"))
    gio <- gsub("javascript:void%20window.open\\('","",gio)   ## need to excape the parenthesis with \\
    gio <- gsub("\\'.*", "", gio)  ## cancel everything after '
    urls[[jj]] <- paste0("https://treaties.un.org/Pages/",gio)
  }
  tabledat$url <-urls


  df_all <- rbind(tabledat[-(1),], df_all)
  }else{print("Too bad, there is nothing, I'll try with the next one :) " )}
  current_search <- current_search -1
}

write.csv(df_all[,-(7:10)], ("un_bits.csv"))
###
###   RSelenium scraping of UN treaty collection
###
# https://cran.r-project.org/web/packages/RSelenium/vignettes/RSelenium-basics.html
rm(list=ls())
###
setwd("C:/Users/HP/Desktop/BIT/UN_treaties")
library("XML")
library("RCurl")
library("RSelenium")
library("RHTMLForms")
# Start Selenium Server --------------------------------------------------------
checkForServer()  ## check if the server is there
startServer() ## start the selenium server
remDrv <- remoteDriver()
###
###   RSelenium scraping of UN treaty collection
###

# https://cran.r-project.org/web/packages/RSelenium/vignettes/RSelenium-basics.html

rm(list=ls())

###
setwd("C:/Users/HP/Desktop/BIT/UN_treaties")

library("XML")
library("RCurl")
library("RSelenium")
library("RHTMLForms")


# Start Selenium Server --------------------------------------------------------

checkForServer()  ## check if the server is there
startServer() ## start the selenium server
remDrv <- remoteDriver() ## connect server
remDrv$open() ## connect server
Sys.sleep(5) 
#remDrv$getStatus()  ## info connection, not necessary

# Simulate browser session and fill out form -----------------------------------

## go to page
remDrv$navigate('https://treaties.un.org/Pages/UNTSOnline.aspx?id=2')

## check out what is of interest:

## additional attributes
webElem1 <- remDrv$findElement(using = 'xpath', "//*/input[@value = 'cnd2']")
webElem1$getElementAttribute("class")
webElem1$getElementAttribute("type")
webElem1$getElementAttribute("id")
webElem1$clickElement()  ## match all the elements
Sys.sleep(5) 

## results per page
webElem2 <- remDrv$findElement(using = 'css selector', "select.basicPullDn")    ## . denotes class
webElem2$getElementAttribute("class")
webElem2$getElementAttribute("type")
webElem2$getElementAttribute("id")
Sys.sleep(5) 

## results per page more in detail
webElem3 <- remDrv$findElement(using = 'xpath', "//*/select[@class = 'basicPullDn']/option[@value = '500']")
webElem3$getElementAttribute("class")
webElem3$getElementAttribute("type")
webElem3$getElementAttribute("id")
webElem3$clickElement()  ## number of pages
Sys.sleep(5) 

## element to get total count
webElem5 <- remDrv$findElement(using = 'css selector', "span.RecordCount")    ## . denotes class
webElem5$getElementAttribute("class")
webElem5$getElementAttribute("type")
webElem5$getElementAttribute("id")

## find first element of interest: text search
webElem0 <- remDrv$findElement(using = 'css selector', "input.login")    ## . denotes class
webElem0$getElementAttribute("class")
webElem0$getElementAttribute("type")
webElem0$getElementAttribute("id")
Sys.sleep(5) 

df_all <- data.frame()
###### need to run search for multiple countries
#country_list <- c("Morocco", "Italy", "Brutto porco", "France")
names <- read.csv("participants_clean.csv")
country_list <- names$names
current_search  <- length(country_list)

for (i in country_list){

  #i <- "Morocco" 
  print("-------------------------")
  print("-------------------------")
  text <- paste("Still", current_search, "searches to do... ", sep=" ")
  print(text)
  text0 <- paste("Now looking for treaties signed by...  ", i , " ----------------------->>" , sep=" ")
  print(text0)
  Sys.sleep(5) 
  ## define keys to search
  keys <- paste(i, "Agreement promotion investment", sep=" ")

  ## search for files, one by one and save results
  webElem0$clearElement()
  webElem0$sendKeysToElement(list(keys, key = "enter"))
  Sys.sleep(20) 

  # check if the table is there
  doc<-htmlParse(remDrv$getPageSource()[[1]])  ## now parse html so that we can search it
  tables = readHTMLTable(doc)  ## extract all tables
  #names(tables)   ## names of all tables

  tableexists <- grepl("ctl00_ContentPlaceHolder1_dgSearch", names(tables))
  yes_no <- all(tableexists==F)
  yes_no

  if(yes_no==FALSE){
  ## copy table
  table <- remDrv$findElement(using = 'xpath', "//*[@id = 'ctl00_ContentPlaceHolder1_dgSearch']")
  table$getElementAttribute("class")
  table$getElementAttribute("type")
  table$getElementAttribute("id")

  ## extract table of interest
  tabledat <-readHTMLTable(doc, stringsAsFactors = F )[[37]]
  treatfou <-nrow(tabledat)
  text1 <- paste("Amazing, I just found", treatfou - 1, " !!", sep=" ")
  print(text1)

  ## now need to extract the real names of the treaties: start from 2 to treatfound
  names_new <- vector(mode="character",length = treatfou)
  urls <- vector(mode="character",length = treatfou)

  for (jj in 2:treatfou) {
    cell_add <- paste("//*[@id='ctl00_ContentPlaceHolder1_dgSearch']/tbody/tr[", jj, "]/td[2]", sep="")
    cell_table <- remDrv$findElement(using = 'xpath', cell_add)
    names_new[[jj]] <- as.character(cell_table$getElementAttribute("title"))
  }

  ## now substitute in the real titles:
  names_new <- as.vector(unlist(names_new))
  tabledat$title <- names_new
  tabledat$party <- i

  ## get the link
  for (jj in 2:treatfou) {
    url_add <- paste("//*[@id='ctl00_ContentPlaceHolder1_dgSearch']/tbody/tr[", jj, "]/td[3]/a", sep="")
    url_add <- remDrv$findElement(using = 'xpath', url_add)
    gio <- unlist(url_add$getElementAttribute("href"))
    gio <- gsub("javascript:void%20window.open\\('","",gio)   ## need to excape the parenthesis with \\
    gio <- gsub("\\'.*", "", gio)  ## cancel everything after '
    urls[[jj]] <- paste0("https://treaties.un.org/Pages/",gio)
  }
  tabledat$url <-urls


  df_all <- rbind(tabledat[-(1),], df_all)
  }else{print("Too bad, there is nothing, I'll try with the next one :) " )}
  current_search <- current_search -1
}

write.csv(df_all[,-(7:10)], ("un_bits.csv"))