
How to collect multiple URLs from XHTML


Generally speaking, I'm new to both XPath and R, so I hope my question isn't too silly.

I want to collect multiple URLs (search results) from a web page. The XPath I have so far is:

//*[(@id = "resultsList_rptSearchResults_ctl01_lnkJobTitle")]

But it only gives me result x for each of the URLs for pages 1-10.

It would be amazing if someone could point me in the right direction.

library(RCurl);library(XML)

pageNum <- 1:10
url <- paste0("http://www.totaljobs.com/JobSearch/Results.aspx?Keywords=Leadership&LTxt=&Radius=10&RateType=0&JobType1=&CompanyType=&PageNum=") 
urls <- paste0(url, pageNum) 

# download each results page and parse the HTML
allPages <- lapply(urls, function(x) getURLContent(x)[[1]])
xmlDocs <- lapply(allPages, function(x) XML::htmlParse(x))

ResultsPerPage <- 19

# Essentially this is the difference from your code
xPath <- paste0("//*[@id='resultsList_rptSearchResults_ctl", 
                ifelse(nchar(0:ResultsPerPage)==1, paste0("0", (0:ResultsPerPage)), (0:ResultsPerPage)),
               "_lnkJobTitle']")

# grab the matching <a> nodes from every parsed page and pull their href attributes
linksToArticle <- unlist(lapply(xmlDocs, function(x) XML::getNodeSet(x, xPath)))
linkUrls <- lapply(linksToArticle, function (x) XML::xmlGetAttr(x, "href")) 

#Remove all objects except for linkUrls
rm(list=ls()[!(ls()=="linkUrls")])

length(linkUrls)
print(paste0("http://www.totaljobs.com", linkUrls))
Obligatory hadleyverse version:

library(rvest)
library(httr)
library(pbapply)

base_url <- "http://www.totaljobs.com/JobSearch/Results.aspx?Keywords=Leadership&LTxt=&Radius=10&RateType=0&JobType1=&CompanyType=&PageNum=%d"

unlist(pblapply(1:10, function(i) {

  # grab the page
  pg <- html_session(sprintf(base_url, i), 
                     user_agent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.15 Safari/537.36"))

  # extract the links
  pg %>% 
    html_nodes("a[id^='resultsList_rptSearchResults'][href^='/JobSearch']") %>% 
    html_attr("href")

})) -> links
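The href values matched by that selector are site-relative (they start with /JobSearch), so, as in the first answer, you can make them absolute with:

paste0("http://www.totaljobs.com", links)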

This uses CSS selectors instead of XPath, and pblapply gives you a progress bar for free. I had to use a user_agent because the site was throttling me (403).
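For completeness, rvest can take the XPath route too: html_nodes() also accepts an xpath argument, so an equivalent of the CSS selector above (same assumptions about the id and href patterns) might look like:

pg %>%
  html_nodes(xpath = "//a[starts-with(@id, 'resultsList_rptSearchResults') and starts-with(@href, '/JobSearch')]") %>%
  html_attr("href")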

Thanks a lot! Works like a charm. Very, very nice.

Nice, I didn't know about user_agent; that's why I used RCurl. Make sure you read it carefully before putting in any more effort.
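For reference, RCurl can send a custom User-Agent as well; a rough sketch, assuming libcurl's useragent option is passed through via ... (query string shortened here purely for illustration):

library(RCurl)
page <- getURLContent(
  "http://www.totaljobs.com/JobSearch/Results.aspx?Keywords=Leadership&PageNum=1",
  # set the User-Agent header that libcurl sends with the request
  useragent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.15 Safari/537.36"
)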