XML readHTMLTable 对单元格中的字符数有限制吗?
我使用 RSelenium 在联合国条约collection网站上提交表格并保存结果。除了条约名称在我的最后一个表格中被截断之外,一切都正常。是因为 readHTMLTable 可以读取的字符数有限制,还是我做错了什么?下面是一个(希望可以)重现该问题的示例:
###
### RSelenium scraping of UN treaty collection
###
### Script 1 (original question): starts a local Selenium server, opens the
### UNTS advanced-search page, and prepares the form elements used by the
### search loop below (webElem0 is reused there).
# https://cran.r-project.org/web/packages/RSelenium/vignettes/RSelenium-basics.html
# NOTE(review): rm(list=ls()) and setwd() inside a script are discouraged;
# they wipe the caller's workspace and hard-code a machine-specific path.
rm(list=ls())
###
setwd("C:/Users/HP/Desktop/BIT/UN_treaties")
library("XML")
library("RCurl")
library("RSelenium")
library("RHTMLForms")
# Start Selenium Server --------------------------------------------------------
# NOTE(review): checkForServer()/startServer() were removed in newer RSelenium
# releases - confirm against the installed version.
checkForServer() ## check if the server is there
startServer() ## start the selenium server
remDrv <- remoteDriver() ## connect server
remDrv$open() ## open a browser session on the server
Sys.sleep(5)
#remDrv$getStatus() ## info connection, not necessary
# Simulate browser session and fill out form -----------------------------------
## go to page
remDrv$navigate('https://treaties.un.org/Pages/UNTSOnline.aspx?id=2')
## check out what is of interest:
## additional attributes
## radio button with value 'cnd2' = "match ALL the search words"
webElem1 <- remDrv$findElement(using = 'xpath', "//*/input[@value = 'cnd2']")
## NOTE(review): the getElementAttribute() calls below (here and for every
## element in this section) are interactive diagnostics only; their return
## values are discarded.
webElem1$getElementAttribute("class")
webElem1$getElementAttribute("type")
webElem1$getElementAttribute("id")
webElem1$clickElement() ## match all the elements
Sys.sleep(5)
## results per page
webElem2 <- remDrv$findElement(using = 'css selector', "select.basicPullDn") ## . denotes class
webElem2$getElementAttribute("class")
webElem2$getElementAttribute("type")
webElem2$getElementAttribute("id")
Sys.sleep(5)
## results per page more in detail
## pick the "500 results per page" option so a single page holds everything
webElem3 <- remDrv$findElement(using = 'xpath', "//*/select[@class = 'basicPullDn']/option[@value = '500']")
webElem3$getElementAttribute("class")
webElem3$getElementAttribute("type")
webElem3$getElementAttribute("id")
webElem3$clickElement() ## number of pages
Sys.sleep(5)
## element to get total count
webElem5 <- remDrv$findElement(using = 'css selector', "span.RecordCount") ## . denotes class
webElem5$getElementAttribute("class")
webElem5$getElementAttribute("type")
webElem5$getElementAttribute("id")
## find first element of interest: text search box (reused in the loop below)
webElem0 <- remDrv$findElement(using = 'css selector', "input.login") ## . denotes class
webElem0$getElementAttribute("class")
webElem0$getElementAttribute("type")
webElem0$getElementAttribute("id")
Sys.sleep(5)
# Search loop ------------------------------------------------------------------
# For each country: type the query into the search box (webElem0, found above),
# wait for the page to refresh, and stack the results grid into df_all.
df_all <- data.frame()
###### need to run search for multiple countries
country_list <- c("Morocco", "Italy", "France")
for (i in country_list) {
  Sys.sleep(5)
  ## define keys to search
  keys <- paste(i, "Agreement promotion investment", sep = " ")
  ## search for files, one by one and save results
  webElem0$clearElement()
  webElem0$sendKeysToElement(list(keys, key = "enter"))
  Sys.sleep(20)
  # check if the results table is there
  doc <- htmlParse(remDrv$getPageSource()[[1]]) ## parse html so that we can search it
  tables <- readHTMLTable(doc)                  ## extract all tables
  ## locate the results grid by its id; TRUE/FALSE spelled out (not T/F)
  is_results <- grepl("ctl00_ContentPlaceHolder1_dgSearch", names(tables))
  if (any(is_results)) {
    ## extract the table of interest BY NAME: the hard-coded position [[37]]
    ## breaks as soon as the page layout changes
    results_name <- names(tables)[is_results][1]
    tabledat <- readHTMLTable(doc, stringsAsFactors = FALSE,
                              skip.rows = c(1))[[results_name]]
    df_all <- rbind(tabledat, df_all)
  } else {
    print("caccadicane")
  }
}
## keep only the first six columns; min() guards against results that are
## already narrower than 7 columns (the original -(7:ncol(df_all)) assumed
## at least 7)
keep_cols <- seq_len(min(6, ncol(df_all)))
write.csv(df_all[, keep_cols], "un_bits.csv")
为什么 V2 中的字符串会被截断?好的,过了一段时间,我发现即使 readHTMLTable 命令确实有长度限制,那也不是本例中文本被截断的原因。通过仔细检查 HTML 文件,我发现文本在页面本身就已经被截断了,而全名保存在该元素的 "title" 属性中。因此,解决办法是读取每个 "title" 属性中的文本,以获得协定的全称。下面是代码(如果有人感兴趣的话),其中还添加了一些其他内容:
###
### RSelenium scraping of UN treaty collection
###
### Script 2 (the accepted fix): same setup as script 1 - start Selenium, open
### the UNTS search page, and locate the form controls. The truncated-title
### problem is solved in the loop below by reading each cell's "title"
### attribute instead of its visible text.
# https://cran.r-project.org/web/packages/RSelenium/vignettes/RSelenium-basics.html
# NOTE(review): rm(list=ls()) and setwd() inside a script are discouraged;
# they wipe the caller's workspace and hard-code a machine-specific path.
rm(list=ls())
###
setwd("C:/Users/HP/Desktop/BIT/UN_treaties")
library("XML")
library("RCurl")
library("RSelenium")
library("RHTMLForms")
# Start Selenium Server --------------------------------------------------------
# NOTE(review): checkForServer()/startServer() were removed in newer RSelenium
# releases - confirm against the installed version.
checkForServer() ## check if the server is there
startServer() ## start the selenium server
remDrv <- remoteDriver() ## connect server
remDrv$open() ## open a browser session on the server
Sys.sleep(5)
#remDrv$getStatus() ## info connection, not necessary
# Simulate browser session and fill out form -----------------------------------
## go to page
remDrv$navigate('https://treaties.un.org/Pages/UNTSOnline.aspx?id=2')
## check out what is of interest:
## additional attributes
## radio button with value 'cnd2' = "match ALL the search words"
webElem1 <- remDrv$findElement(using = 'xpath', "//*/input[@value = 'cnd2']")
## NOTE(review): the getElementAttribute() calls in this section are
## interactive diagnostics only; their return values are discarded.
webElem1$getElementAttribute("class")
webElem1$getElementAttribute("type")
webElem1$getElementAttribute("id")
webElem1$clickElement() ## match all the elements
Sys.sleep(5)
## results per page
webElem2 <- remDrv$findElement(using = 'css selector', "select.basicPullDn") ## . denotes class
webElem2$getElementAttribute("class")
webElem2$getElementAttribute("type")
webElem2$getElementAttribute("id")
Sys.sleep(5)
## results per page more in detail
## pick the "500 results per page" option so a single page holds everything
webElem3 <- remDrv$findElement(using = 'xpath', "//*/select[@class = 'basicPullDn']/option[@value = '500']")
webElem3$getElementAttribute("class")
webElem3$getElementAttribute("type")
webElem3$getElementAttribute("id")
webElem3$clickElement() ## number of pages
Sys.sleep(5)
## element to get total count
webElem5 <- remDrv$findElement(using = 'css selector', "span.RecordCount") ## . denotes class
webElem5$getElementAttribute("class")
webElem5$getElementAttribute("type")
webElem5$getElementAttribute("id")
## find first element of interest: text search box (reused in the loop below)
webElem0 <- remDrv$findElement(using = 'css selector', "input.login") ## . denotes class
webElem0$getElementAttribute("class")
webElem0$getElementAttribute("type")
webElem0$getElementAttribute("id")
Sys.sleep(5)
## Accumulator for the scraped rows from every country search
df_all <- data.frame()
###### need to run search for multiple countries
#country_list <- c("Morocco", "Italy", "Brutto porco", "France")
## Read the participant list. Use a dedicated variable instead of `names`,
## which shadows base::names(), and keep the country column as character.
participants <- read.csv("participants_clean.csv", stringsAsFactors = FALSE)
country_list <- participants$names
## countdown of remaining searches, printed at the top of each loop pass
current_search <- length(country_list)
# Search loop ------------------------------------------------------------------
# For each country: submit the query, read the results grid, then replace the
# truncated visible titles with the full text stored in each cell's "title"
# attribute and extract the per-treaty URL from the javascript href.
for (i in country_list) {
  #i <- "Morocco"
  print("-------------------------")
  print("-------------------------")
  text <- paste("Still", current_search, "searches to do... ", sep=" ")
  print(text)
  text0 <- paste("Now looking for treaties signed by... ", i , " ----------------------->>" , sep=" ")
  print(text0)
  Sys.sleep(5)
  ## define keys to search
  keys <- paste(i, "Agreement promotion investment", sep = " ")
  ## search for files, one by one and save results
  webElem0$clearElement()
  webElem0$sendKeysToElement(list(keys, key = "enter"))
  Sys.sleep(20)
  # check if the results table is there
  doc <- htmlParse(remDrv$getPageSource()[[1]]) ## parse html so that we can search it
  tables <- readHTMLTable(doc)                  ## extract all tables
  ## locate the results grid by its id; TRUE/FALSE spelled out (not T/F)
  is_results <- grepl("ctl00_ContentPlaceHolder1_dgSearch", names(tables))
  if (any(is_results)) {
    ## extract the table of interest BY NAME: the hard-coded position [[37]]
    ## breaks as soon as the page layout changes
    results_name <- names(tables)[is_results][1]
    tabledat <- readHTMLTable(doc, stringsAsFactors = FALSE)[[results_name]]
    treatfou <- nrow(tabledat)
    text1 <- paste("Amazing, I just found", treatfou - 1, " !!", sep=" ")
    print(text1)
    ## row 1 is the header; walk rows 2..treatfou in a single pass, pulling
    ## both the full title and the link. seq_len(treatfou)[-1] is empty when
    ## the table holds only the header (the original 2:treatfou would have
    ## produced c(2, 1) and failed).
    names_new <- vector(mode = "character", length = treatfou)
    urls <- vector(mode = "character", length = treatfou)
    for (jj in seq_len(treatfou)[-1]) {
      ## full treaty name lives in the cell's "title" attribute (the visible
      ## cell text is truncated by the page itself, not by readHTMLTable)
      cell_add <- paste0("//*[@id='ctl00_ContentPlaceHolder1_dgSearch']/tbody/tr[", jj, "]/td[2]")
      cell_table <- remDrv$findElement(using = 'xpath', cell_add)
      names_new[[jj]] <- as.character(cell_table$getElementAttribute("title"))
      ## link to the treaty record, wrapped in a javascript:void window.open()
      link_xpath <- paste0("//*[@id='ctl00_ContentPlaceHolder1_dgSearch']/tbody/tr[", jj, "]/td[3]/a")
      link_elem <- remDrv$findElement(using = 'xpath', link_xpath)
      gio <- unlist(link_elem$getElementAttribute("href"))
      gio <- gsub("javascript:void%20window.open\\('", "", gio) ## need to escape the parenthesis with \\
      gio <- gsub("\\'.*", "", gio) ## drop everything after '
      urls[[jj]] <- paste0("https://treaties.un.org/Pages/", gio)
    }
    ## substitute in the real titles and record which search produced the rows
    tabledat$title <- as.vector(unlist(names_new))
    tabledat$party <- i
    tabledat$url <- urls
    ## drop the header row before stacking onto the accumulator
    df_all <- rbind(tabledat[-(1), ], df_all)
  } else {
    print("Too bad, there is nothing, I'll try with the next one :) " )
  }
  current_search <- current_search - 1
}
## columns 7:10 are not needed in the output; keep the rest
write.csv(df_all[, -(7:10)], "un_bits.csv")
###
### 联合国条约汇编的 RSelenium 抓取
###
# https://cran.r-project.org/web/packages/RSelenium/vignettes/RSelenium-basics.html
rm(list=ls())
###
setwd("C:/Users/HP/Desktop/BIT/UN_treaties")
library("XML")
library("RCurl")
library("RSelenium")
library("RHTMLForms")
# 启动 Selenium 服务器 --------------------------------------------------------
checkForServer() ## 检查服务器是否在运行
startServer() ## 启动 selenium 服务器
remDrv <- remoteDriver() ## 连接服务器
###
### RSelenium scraping of UN treaty collection
###
### Script 3: a verbatim duplicate of script 2 (the accepted answer), kept as
### posted. Setup: start Selenium, open the UNTS search page, locate the form
### controls used by the search loop below.
# https://cran.r-project.org/web/packages/RSelenium/vignettes/RSelenium-basics.html
# NOTE(review): rm(list=ls()) and setwd() inside a script are discouraged;
# they wipe the caller's workspace and hard-code a machine-specific path.
rm(list=ls())
###
setwd("C:/Users/HP/Desktop/BIT/UN_treaties")
library("XML")
library("RCurl")
library("RSelenium")
library("RHTMLForms")
# Start Selenium Server --------------------------------------------------------
# NOTE(review): checkForServer()/startServer() were removed in newer RSelenium
# releases - confirm against the installed version.
checkForServer() ## check if the server is there
startServer() ## start the selenium server
remDrv <- remoteDriver() ## connect server
remDrv$open() ## open a browser session on the server
Sys.sleep(5)
#remDrv$getStatus() ## info connection, not necessary
# Simulate browser session and fill out form -----------------------------------
## go to page
remDrv$navigate('https://treaties.un.org/Pages/UNTSOnline.aspx?id=2')
## check out what is of interest:
## additional attributes
## radio button with value 'cnd2' = "match ALL the search words"
webElem1 <- remDrv$findElement(using = 'xpath', "//*/input[@value = 'cnd2']")
## NOTE(review): the getElementAttribute() calls in this section are
## interactive diagnostics only; their return values are discarded.
webElem1$getElementAttribute("class")
webElem1$getElementAttribute("type")
webElem1$getElementAttribute("id")
webElem1$clickElement() ## match all the elements
Sys.sleep(5)
## results per page
webElem2 <- remDrv$findElement(using = 'css selector', "select.basicPullDn") ## . denotes class
webElem2$getElementAttribute("class")
webElem2$getElementAttribute("type")
webElem2$getElementAttribute("id")
Sys.sleep(5)
## results per page more in detail
## pick the "500 results per page" option so a single page holds everything
webElem3 <- remDrv$findElement(using = 'xpath', "//*/select[@class = 'basicPullDn']/option[@value = '500']")
webElem3$getElementAttribute("class")
webElem3$getElementAttribute("type")
webElem3$getElementAttribute("id")
webElem3$clickElement() ## number of pages
Sys.sleep(5)
## element to get total count
webElem5 <- remDrv$findElement(using = 'css selector', "span.RecordCount") ## . denotes class
webElem5$getElementAttribute("class")
webElem5$getElementAttribute("type")
webElem5$getElementAttribute("id")
## find first element of interest: text search box (reused in the loop below)
webElem0 <- remDrv$findElement(using = 'css selector', "input.login") ## . denotes class
webElem0$getElementAttribute("class")
webElem0$getElementAttribute("type")
webElem0$getElementAttribute("id")
Sys.sleep(5)
## Accumulator for the scraped rows from every country search
df_all <- data.frame()
###### need to run search for multiple countries
#country_list <- c("Morocco", "Italy", "Brutto porco", "France")
## Read the participant list. Use a dedicated variable instead of `names`,
## which shadows base::names(), and keep the country column as character.
participants <- read.csv("participants_clean.csv", stringsAsFactors = FALSE)
country_list <- participants$names
## countdown of remaining searches, printed at the top of each loop pass
current_search <- length(country_list)
# Search loop: per country, submit the query, read the results grid, recover
# the full (untruncated) treaty titles from each cell's "title" attribute and
# the treaty URL from the javascript href, then stack the rows into df_all.
for (i in country_list){
#i <- "Morocco"
print("-------------------------")
print("-------------------------")
text <- paste("Still", current_search, "searches to do... ", sep=" ")
print(text)
text0 <- paste("Now looking for treaties signed by... ", i , " ----------------------->>" , sep=" ")
print(text0)
Sys.sleep(5)
## define keys to search
keys <- paste(i, "Agreement promotion investment", sep=" ")
## search for files, one by one and save results
webElem0$clearElement()
webElem0$sendKeysToElement(list(keys, key = "enter"))
Sys.sleep(20)
# check if the table is there
doc<-htmlParse(remDrv$getPageSource()[[1]]) ## now parse html so that we can search it
tables = readHTMLTable(doc) ## extract all tables
#names(tables) ## names of all tables
tableexists <- grepl("ctl00_ContentPlaceHolder1_dgSearch", names(tables))
## NOTE(review): prefer !any(tableexists) over all(tableexists == F); F is
## reassignable and the double negation is hard to read
yes_no <- all(tableexists==F)
yes_no
if(yes_no==FALSE){
## copy table
## NOTE(review): `table` shadows base::table(); the three attribute probes
## below are diagnostics whose results are discarded
table <- remDrv$findElement(using = 'xpath', "//*[@id = 'ctl00_ContentPlaceHolder1_dgSearch']")
table$getElementAttribute("class")
table$getElementAttribute("type")
table$getElementAttribute("id")
## extract table of interest
## NOTE(review): [[37]] is a fragile hard-coded position; the grid could be
## selected by name using the grepl match computed above
tabledat <-readHTMLTable(doc, stringsAsFactors = F )[[37]]
treatfou <-nrow(tabledat)
text1 <- paste("Amazing, I just found", treatfou - 1, " !!", sep=" ")
print(text1)
## now need to extract the real names of the treaties: start from 2 to treatfound
## (row 1 is the header; the page truncates the visible titles, the full text
## is in each cell's "title" attribute)
names_new <- vector(mode="character",length = treatfou)
urls <- vector(mode="character",length = treatfou)
## NOTE(review): 2:treatfou yields c(2, 1) when treatfou == 1 (header-only
## table) - a guard or seq_len(treatfou)[-1] would be safer
for (jj in 2:treatfou) {
cell_add <- paste("//*[@id='ctl00_ContentPlaceHolder1_dgSearch']/tbody/tr[", jj, "]/td[2]", sep="")
cell_table <- remDrv$findElement(using = 'xpath', cell_add)
names_new[[jj]] <- as.character(cell_table$getElementAttribute("title"))
}
## now substitute in the real titles:
names_new <- as.vector(unlist(names_new))
tabledat$title <- names_new
tabledat$party <- i
## get the link (wrapped in a javascript:void window.open() href)
for (jj in 2:treatfou) {
url_add <- paste("//*[@id='ctl00_ContentPlaceHolder1_dgSearch']/tbody/tr[", jj, "]/td[3]/a", sep="")
url_add <- remDrv$findElement(using = 'xpath', url_add)
gio <- unlist(url_add$getElementAttribute("href"))
gio <- gsub("javascript:void%20window.open\\('","",gio) ## need to escape the parenthesis with \\
gio <- gsub("\\'.*", "", gio) ## cancel everything after '
urls[[jj]] <- paste0("https://treaties.un.org/Pages/",gio)
}
tabledat$url <-urls
## drop the header row before stacking onto the accumulator
df_all <- rbind(tabledat[-(1),], df_all)
}else{print("Too bad, there is nothing, I'll try with the next one :) " )}
current_search <- current_search -1
}
## drop columns 7:10 from the output; assumes df_all has at least 10 columns
write.csv(df_all[,-(7:10)], ("un_bits.csv"))