Java 从一组url中提取必要的url
我复制了相同的代码并尝试执行,但它显示以下异常。请检查此项。如果没有,请提供HtmlUnit API的链接,以确保我尝试了相同的方法Java 从一组url中提取必要的url,java,arraylist,jsoup,Java,Arraylist,Jsoup,我复制了相同的代码并尝试执行,但它显示以下异常。请检查此项。如果没有,请提供HtmlUnit API的链接,以确保我尝试了相同的方法 public class Program{ public static void main(String[] args) throws FailingHttpStatusCodeException, IOException { final WebClient webClient = new WebClient(BrowserVersion.
public class Program{
public static void main(String[] args) throws FailingHttpStatusCodeException, IOException {
final WebClient webClient = new WebClient(BrowserVersion.FIREFOX_17);
WebRequest request = new WebRequest(new URL("http://www.firstpost.com/tag/crime-in-india"));
webClient.getOptions().setThrowExceptionOnScriptError(false);
webClient.setJavaScriptTimeout(10000);
webClient.getOptions().setJavaScriptEnabled(true);
webClient.setAjaxController(new NicelyResynchronizingAjaxController());
webClient.getOptions().setTimeout(10000);
HtmlPage page = webClient.getPage(request);
webClient.waitForBackgroundJavaScript(10000);
List<HtmlAnchor> anchors1 = page.getAnchors();
HtmlAnchor link2 = null;
for (HtmlAnchor anchor : anchors1) {
if (anchor.asText().indexOf("Why do we care more when NRIs go missing?") > -1) {
link2 = anchor;
break;
}
}
page = link2.click();
webClient.waitForBackgroundJavaScript(10000);
DomNodeList<DomElement> paras = page.getElementsByTagName("p");
for (DomElement el : paras.toArray(new DomElement[paras.size()])) {
System.out.println(el.asText());
}
}
}
公共类程序{
公共静态void main(字符串[]args)引发FailingHttpStatusCodeException、IOException{
final WebClient WebClient=新的WebClient(BrowserVersion.FIREFOX_17);
WebRequest=新的WebRequest(新的URL(“http://www.firstpost.com/tag/crime-in-india"));
webClient.getOptions().SetThroweExceptionOnScriptError(false);
setJavaScriptTimeout(10000);
webClient.getOptions().setJavaScriptEnabled(true);
setAjaxController(新的NicelyResynchronizingAjaxController());
webClient.getOptions().setTimeout(10000);
HtmlPage=webClient.getPage(请求);
webClient.waitForBackgroundJavaScript(10000);
List anchors1=page.getAnchors();
HtmlAnchor link2=null;
对于(HtmlAnchor锚:主播1){
if(anchor.asText().indexOf(“当NRI丢失时我们为什么更关心?”)>-1){
link2=锚;
打破
}
}
page=link2.单击();
webClient.waitForBackgroundJavaScript(10000);
DomNodeList-paras=page.getElementsByTagName(“p”);
对于(DomeElement el:paras.toArray(新的DomeElement[paras.size()])){
System.out.println(el.asText());
}
}
}
我相信它将是特定于页面的,因为所有页面都有不同的结构。您希望从多少个网站获取数据?如果仅为5或6,则可能需要分别为其中的每一个进行编码。我相信这将是特定于页面的,因为所有页面都有不同的结构。您希望从多少个网站获取数据?如果仅为5或6,则可能需要分别为其中的每一个进行编码。我相信这将是特定于页面的,因为所有页面都有不同的结构。您希望从多少个网站获取数据?如果仅为5或6,则可能需要分别为其中的每一个进行编码。我相信这将是特定于页面的,因为所有页面都有不同的结构。您希望从多少个网站获取数据?如果仅为5或6,则您可能需要分别为它们中的每一个进行编码。您可以结合使用Jsoup
和HtmlUnit
HtmlUnit
将有助于您动态地“单击”这些链接并刮取结果页面的内容。您可以累积这些锚定标记的文本以迭代所有锚定标记,Jsoup
也有助于累积那些超链接的文本,这些超链接可供HtmlUnit
以后使用,但需要2个连接
下面是一个从你的网站url中从其中一个超链接中抓取数据的代码示例,请注意,我给出了你的基本url,然后检查特定链接以获取其内容-你可以根据模式选择页面上的特定链接集
它遍历到您的链接,单击它并检索您正在查找的特定数据集
代码:
public static void main(String[] args)
throws FailingHttpStatusCodeException, IOException {
final WebClient webClient = new WebClient(BrowserVersion.FIREFOX_17);
WebRequest request = new WebRequest(new URL(
"http://www.firstpost.com/tag/crime-in-india"));
webClient.getOptions().setThrowExceptionOnScriptError(false);
webClient.setJavaScriptTimeout(10000);
webClient.getOptions().setJavaScriptEnabled(true);
webClient.setAjaxController(new NicelyResynchronizingAjaxController());
webClient.getOptions().setTimeout(10000);
HtmlPage page = webClient.getPage(request);
webClient.waitForBackgroundJavaScript(10000);
List<HtmlAnchor> anchors1 = page.getAnchors();
HtmlAnchor link2 = null;
for (HtmlAnchor anchor : anchors1) {
if (anchor.asText().indexOf(
"Why do we care more when NRIs go missing?") > -1) {
link2 = anchor;
break;
}
}
page = link2.click();
webClient.waitForBackgroundJavaScript(10000);
DomNodeList<DomElement> paras = page.getElementsByTagName("p");
for (DomElement el : paras.toArray(new DomElement[paras.size()])) {
System.out.println(el.asText());
}
}
publicstaticvoidmain(字符串[]args)
引发FailingHttpStatusCodeException,IOException{
final WebClient WebClient=新的WebClient(BrowserVersion.FIREFOX_17);
WebRequest=newwebrequest(新URL(
"http://www.firstpost.com/tag/crime-in-india"));
webClient.getOptions().SetThroweExceptionOnScriptError(false);
setJavaScriptTimeout(10000);
webClient.getOptions().setJavaScriptEnabled(true);
setAjaxController(新的NicelyResynchronizingAjaxController());
webClient.getOptions().setTimeout(10000);
HtmlPage=webClient.getPage(请求);
webClient.waitForBackgroundJavaScript(10000);
List anchors1=page.getAnchors();
HtmlAnchor link2=null;
对于(HtmlAnchor锚:主播1){
if(anchor.asText().indexOf(
“为什么NRI失踪时我们更关心?”)>-1){
link2=锚;
打破
}
}
page=link2.单击();
webClient.waitForBackgroundJavaScript(10000);
DomNodeList-paras=page.getElementsByTagName(“p”);
对于(DomeElement el:paras.toArray(新的DomeElement[paras.size()])){
System.out.println(el.asText());
}
}
打印新闻文章内容:
public static void main(String[] args)
throws FailingHttpStatusCodeException, IOException {
final WebClient webClient = new WebClient(BrowserVersion.FIREFOX_17);
WebRequest request = new WebRequest(new URL(
"http://www.firstpost.com/tag/crime-in-india"));
webClient.getOptions().setThrowExceptionOnScriptError(false);
webClient.setJavaScriptTimeout(10000);
webClient.getOptions().setJavaScriptEnabled(true);
webClient.setAjaxController(new NicelyResynchronizingAjaxController());
webClient.getOptions().setTimeout(10000);
HtmlPage page = webClient.getPage(request);
webClient.waitForBackgroundJavaScript(10000);
List<HtmlAnchor> anchors1 = page.getAnchors();
HtmlAnchor link2 = null;
for (HtmlAnchor anchor : anchors1) {
if (anchor.asText().indexOf(
"Why do we care more when NRIs go missing?") > -1) {
link2 = anchor;
break;
}
}
page = link2.click();
webClient.waitForBackgroundJavaScript(10000);
DomNodeList<DomElement> paras = page.getElementsByTagName("p");
for (DomElement el : paras.toArray(new DomElement[paras.size()])) {
System.out.println(el.asText());
}
}
您可以结合使用
Jsoup
和HtmlUnit
HtmlUnit
将有助于您动态地“单击”这些链接并刮取结果页面的内容。您可以累积这些锚定标记的文本以迭代所有锚定标记,Jsoup
也有助于累积那些超链接的文本,这些超链接可供HtmlUnit
以后使用,但需要2个连接
下面是一个从你的网站url中从其中一个超链接中抓取数据的代码示例,请注意,我给出了你的基本url,然后检查特定链接以获取其内容-你可以根据模式选择页面上的特定链接集
它遍历到您的链接,单击它并检索您正在查找的特定数据集
代码:
public static void main(String[] args)
throws FailingHttpStatusCodeException, IOException {
final WebClient webClient = new WebClient(BrowserVersion.FIREFOX_17);
WebRequest request = new WebRequest(new URL(
"http://www.firstpost.com/tag/crime-in-india"));
webClient.getOptions().setThrowExceptionOnScriptError(false);
webClient.setJavaScriptTimeout(10000);
webClient.getOptions().setJavaScriptEnabled(true);
webClient.setAjaxController(new NicelyResynchronizingAjaxController());
webClient.getOptions().setTimeout(10000);
HtmlPage page = webClient.getPage(request);
webClient.waitForBackgroundJavaScript(10000);
List<HtmlAnchor> anchors1 = page.getAnchors();
HtmlAnchor link2 = null;
for (HtmlAnchor anchor : anchors1) {
if (anchor.asText().indexOf(
"Why do we care more when NRIs go missing?") > -1) {
link2 = anchor;
break;
}
}
page = link2.click();
webClient.waitForBackgroundJavaScript(10000);
DomNodeList<DomElement> paras = page.getElementsByTagName("p");
for (DomElement el : paras.toArray(new DomElement[paras.size()])) {
System.out.println(el.asText());
}
}
publicstaticvoidmain(字符串[]args)
引发FailingHttpStatusCodeException,IOException{
final WebClient WebClient=新的WebClient(BrowserVersion.FIREFOX_17);
WebRequest=newwebrequest(新URL(
"http://www.firstpost.com/tag/crime-in-india"));
webClient.getOptions().SetThroweExceptionOnScriptError(false);
setJavaScriptTimeout(10000);
webClient.get