R中的for循环与apply对比

R中的for循环与apply对比,r,apply,lapply,sapply,R,Apply,Lapply,Sapply,我编写了以下代码,每天从门户网站抓取招标信息 packages <- c('rvest', 'stringi', 'tidyverse','lubridate','dplyr') purrr::walk(packages, library, character.only = TRUE, warn.conflicts = FALSE) start_time <- proc.time() packages您的sapply函数不正确。我对您的代码进行了一些编辑,并在样本大小N=50上对其进

我编写了以下代码,每天从门户网站抓取招标信息

# Attach every package the scraper uses, then take the proc.time() snapshot
# that later progress output is measured against.
packages <- c('rvest', 'stringi', 'tidyverse','lubridate','dplyr')
invisible(
  lapply(packages, library, character.only = TRUE, warn.conflicts = FALSE)
)
start_time <- proc.time()

您的sapply函数不正确。我对您的代码进行了一些编辑,并在样本大小N=50上对其进行了测试。我们可以使用system.time()来确定完成任务所需的时间。

“for”方法:

# "for" approach: scrape listing pages 1 to 50 one after another and time the
# whole run. Uses `start_time` (a proc.time() snapshot taken earlier in the
# script) for the progress output.
system.time({
  # Let the loop run even when no accumulator has been created yet; an
  # existing `All_tenders` is still appended to, as before.
  if (!exists("All_tenders")) {
    All_tenders <- NULL
  }
  # The URL prefix is the same for every page, so build it once.
  url_bit1 <- 'https://eprocure.gov.in/mmp/latestactivetenders/page='
  # One slot per page; binding once after the loop avoids re-copying the
  # growing data frame on every iteration.
  page_results <- vector("list", 50)
  for (page_no in 1:50) {
    closeAllConnections()
    url <- paste0(url_bit1, page_no)
    # Progress line: page number and time elapsed since `start_time`.
    cat(page_no,"\t",proc.time() - start_time,"\n")
    data <- read_html(url)
    total_tenders_raw <- html_nodes(data,xpath = '//*[(@id = "table")]')
    Page_tenders <- data.frame(html_table(total_tenders_raw, header = TRUE))
    links <- html_nodes(data, xpath='//*[(@id = "table")] | //td | //a')
    links_fair <- html_attr(links,'href')
    # Keep only the hrefs that contain "tendersfullview".
    links_fair <- links_fair[grep("tendersfullview",links_fair)]
    Page_tenders <- cbind(Page_tenders,links_fair)
    page_results[[page_no]] <- Page_tenders
  }
  # on.exit() registers cleanup for an enclosing function and there is none
  # here, so the connections are closed explicitly once the loop is done.
  closeAllConnections()
  All_tenders <- do.call(rbind, c(list(All_tenders), page_results))
})

#user  system elapsed 
# 50.15   81.26  132.73
All_tenders <- NULL
url_bit1 <- 'https://eprocure.gov.in/mmp/latestactivetenders/page='

#' Scrape one page of the tender listing.
#'
#' Reads the globals `url_bit1` (URL prefix) and `start_time` (proc.time()
#' snapshot used for the progress line).
#'
#' @param datain Page number; it is appended to `url_bit1` to form the URL.
#' @return A data frame built from the node with id "table", plus a
#'   `links_fair` column holding the hrefs that contain "tendersfullview".
read_page <- function(datain){
  closeAllConnections()
  on.exit(closeAllConnections(), add = TRUE)
  url <- paste0(url_bit1, datain)
  # Progress line: page number and time elapsed since `start_time`.
  cat(datain,"\t",proc.time() - start_time,"\n")
  data <- read_html(url)
  total_tenders_raw <- html_nodes(data,xpath = '//*[(@id = "table")]')
  Page_tenders <- data.frame(html_table(total_tenders_raw, header = TRUE))
  links <- html_nodes(data, xpath='//*[(@id = "table")] | //td | //a')
  links_fair <- html_attr(links,'href')
  links_fair <- links_fair[grepl("tendersfullview", links_fair)]
  # Return this page only. The previous last line assigned to a local copy of
  # `All_tenders`, which made the result depend on whatever the global
  # `All_tenders` held at call time.
  cbind(Page_tenders, links_fair)
}

# lapply approach: call read_page() once per page number 1..50, keep the
# per-page results in a list, and time the whole run.
system.time(
  All_tenders <- lapply(seq_len(50), read_page)
)
# user  system elapsed 
# 49.84   78.97  131.16

结果表明lapply稍微快一点。

for循环和sapply的工作原理不同:

- for循环以迭代的方式进行操作:它们先对第一个元素进行计算,然后对第二个元素进行计算……
- sapply独立地(以任何顺序)对列表中的各个元素执行操作,因此结果集是独立构造的。

因此,在for循环的末尾,当您执行以下操作时:

All_tenders <- rbind(All_tenders,Page_tenders)
这一行会在每次迭代时把当前页追加到All_tenders上;但在sapply调用的函数内部,这个赋值只会修改函数的局部变量。要返回每个页面的结果并按以下方式应用它,请执行以下操作:

# Fetch pages 2..last_page and keep one result per page.
# NOTE(review): round() drops a partly filled last page (84 / 10 -> 8) if the
# site lists 10 records per page, as the division suggests; confirm whether
# ceiling() is intended.
last_page <- round(Count_of_Recs/10)
# 2:last_page would count down (2, 1) when there is only a single page.
page_numbers <- if (last_page >= 2) 2:last_page else integer(0)
# simplify = FALSE keeps the per-page results as a plain list instead of
# letting sapply() collapse them into a matrix of list cells.
All_tenders_tmp <- sapply(page_numbers, FUN=read_page, simplify = FALSE)

All_tenders_tmp

我认为在你的函数中更改All_tenders会让它变得很慢……

好吧,你必须接受这一点。Web抓取本来就不能太快,否则您可能会被服务器管理员封禁。另一种(明显更快的)方法是通过Python使用TOR,从不同IP发出多个请求,但那是另一回事了……

正如我提到的,我只是R的新手……请详细说明以下部分:system.time(All_tenders

@shikharaparashar 您可以查看下面我的回答,它应该可以工作 ;)

您可以看这里:

顺便说一句,此函数会抛出错误,因为“page_no”和“S.No.”变量未定义。

这两个变量都是数据框中数据的列。

这个对我来说仍然不起作用……出于这个原因,我倾向于选择前面的答案作为正确答案。

输入变量是datain,我已经更正了。但无论如何,这里的目的是让你了解sapply应该如何工作。
url_bit1 <- 'https://eprocure.gov.in/mmp/latestactivetenders/page='

#' Scrape one page of the tender listing.
#'
#' Reads the globals `url_bit1` (URL prefix) and `start_time` (proc.time()
#' snapshot used for the progress line).
#'
#' @param datain Page number; it is appended to `url_bit1` to form the URL.
#' @return A data frame built from the node with id "table", plus a
#'   `links_fair` column holding the hrefs that contain "tendersfullview".
read_page <- function(datain){
  closeAllConnections()
  on.exit(closeAllConnections(), add = TRUE)
  # `datain` is the value itself, not a data frame: `datain$S.No.` and the
  # bare name `S.No.` both raised errors here.
  url <- paste0(url_bit1, datain)
  cat(datain,"\t",proc.time() - start_time,"\n")
  data <- read_html(url)
  total_tenders_raw <- html_nodes(data,xpath = '//*[(@id = "table")]')
  Page_tenders <- data.frame(html_table(total_tenders_raw, header = TRUE))
  links <- html_nodes(data, xpath='//*[(@id = "table")] | //td | //a')
  links_fair <- html_attr(links,'href')
  links_fair <- links_fair[grepl("tendersfullview", links_fair)]
  # Return this page only; assigning to `All_tenders` inside the function
  # would only change a local copy.
  cbind(Page_tenders, links_fair)
}

# Pass the function itself (not the result of calling it) and iterate over
# the values, then bind the per-page data frames into one.
# NOTE(review): each S.No. value is used as a page number in the URL - confirm
# that this is the intended page index.
All_tenders <- do.call(rbind, lapply(All_tenders$S.No., read_page))
# "for" approach: scrape listing pages 1 to 50 one after another and time the
# whole run. Uses `start_time` (a proc.time() snapshot taken earlier in the
# script) for the progress output.
system.time({
  # Let the loop run even when no accumulator has been created yet; an
  # existing `All_tenders` is still appended to, as before.
  if (!exists("All_tenders")) {
    All_tenders <- NULL
  }
  # The URL prefix is the same for every page, so build it once.
  url_bit1 <- 'https://eprocure.gov.in/mmp/latestactivetenders/page='
  # One slot per page; binding once after the loop avoids re-copying the
  # growing data frame on every iteration.
  page_results <- vector("list", 50)
  for (page_no in 1:50) {
    closeAllConnections()
    url <- paste0(url_bit1, page_no)
    # Progress line: page number and time elapsed since `start_time`.
    cat(page_no,"\t",proc.time() - start_time,"\n")
    data <- read_html(url)
    total_tenders_raw <- html_nodes(data,xpath = '//*[(@id = "table")]')
    Page_tenders <- data.frame(html_table(total_tenders_raw, header = TRUE))
    links <- html_nodes(data, xpath='//*[(@id = "table")] | //td | //a')
    links_fair <- html_attr(links,'href')
    # Keep only the hrefs that contain "tendersfullview".
    links_fair <- links_fair[grep("tendersfullview",links_fair)]
    Page_tenders <- cbind(Page_tenders,links_fair)
    page_results[[page_no]] <- Page_tenders
  }
  # on.exit() registers cleanup for an enclosing function and there is none
  # here, so the connections are closed explicitly once the loop is done.
  closeAllConnections()
  All_tenders <- do.call(rbind, c(list(All_tenders), page_results))
})

#user  system elapsed 
# 50.15   81.26  132.73
All_tenders <- NULL
url_bit1 <- 'https://eprocure.gov.in/mmp/latestactivetenders/page='

#' Scrape one page of the tender listing.
#'
#' Reads the globals `url_bit1` (URL prefix) and `start_time` (proc.time()
#' snapshot used for the progress line).
#'
#' @param datain Page number; it is appended to `url_bit1` to form the URL.
#' @return A data frame built from the node with id "table", plus a
#'   `links_fair` column holding the hrefs that contain "tendersfullview".
read_page <- function(datain){
  closeAllConnections()
  on.exit(closeAllConnections(), add = TRUE)
  url <- paste0(url_bit1, datain)
  # Progress line: page number and time elapsed since `start_time`.
  cat(datain,"\t",proc.time() - start_time,"\n")
  data <- read_html(url)
  total_tenders_raw <- html_nodes(data,xpath = '//*[(@id = "table")]')
  Page_tenders <- data.frame(html_table(total_tenders_raw, header = TRUE))
  links <- html_nodes(data, xpath='//*[(@id = "table")] | //td | //a')
  links_fair <- html_attr(links,'href')
  links_fair <- links_fair[grepl("tendersfullview", links_fair)]
  # Return this page only. The previous last line assigned to a local copy of
  # `All_tenders`, which made the result depend on whatever the global
  # `All_tenders` held at call time.
  cbind(Page_tenders, links_fair)
}

# lapply approach: call read_page() once per page number 1..50, keep the
# per-page results in a list, and time the whole run.
system.time(
  All_tenders <- lapply(seq_len(50), read_page)
)
# user  system elapsed 
# 49.84   78.97  131.16
# Bind the per-page results held in the list `All_tenders` into a single
# data frame (the closing parenthesis of do.call() was missing).
All_tenders <- do.call(
  rbind,
  lapply(All_tenders, data.frame, stringsAsFactors=FALSE)
)
# NOTE(review): this repeats the accumulation step from the for-loop body;
# `Page_tenders` is not created by the lapply() call - confirm whether this
# line is meant to run here.
All_tenders <- rbind(All_tenders,Page_tenders)
url_bit1 <- 'https://eprocure.gov.in/mmp/latestactivetenders/page='

#' Scrape one page of the tender listing.
#'
#' Reads the globals `url_bit1` (URL prefix) and `start_time` (proc.time()
#' snapshot used for the progress line).
#'
#' @param datain Page number; it is appended to `url_bit1` to form the URL.
#' @return A data frame built from the node with id "table", plus a
#'   `links_fair` column holding the hrefs that contain "tendersfullview".
read_page <- function(datain){
  closeAllConnections()
  on.exit(closeAllConnections(), add = TRUE)
  url <- paste0(url_bit1, datain)
  # Log the page being fetched; the earlier `S.No.` is not defined inside
  # this function and raised an error.
  cat(datain,"\t",proc.time() - start_time,"\n")
  data <- read_html(url)
  total_tenders_raw <- html_nodes(data,xpath = '//*[(@id = "table")]')
  Page_tenders <- data.frame(html_table(total_tenders_raw, header = TRUE))
  links <- html_nodes(data, xpath='//*[(@id = "table")] | //td | //a')
  links_fair <- html_attr(links,'href')
  links_fair <- links_fair[grepl("tendersfullview", links_fair)]
  cbind(Page_tenders, links_fair)
}

# Fetch pages 2..last_page and keep one data frame per page.
# NOTE(review): round() drops a partly filled last page (84 / 10 -> 8) if the
# site lists 10 records per page, as the division suggests; confirm whether
# ceiling() is intended.
last_page <- round(Count_of_Recs/10)
# 2:last_page would count down (2, 1) when there is only a single page.
page_numbers <- if (last_page >= 2) 2:last_page else integer(0)
# simplify = FALSE keeps the per-page data frames as a plain list instead of
# letting sapply() collapse them into a matrix of list cells.
All_tenders_tmp <- sapply(page_numbers, FUN=read_page, simplify = FALSE)