Java HtmlUnit网络客户端超时_Java_Multithreading_Timeout_Web Scraping_Htmlunit

Java HtmlUnit网络客户端超时

java multithreading web-scraping

Java HtmlUnit网络客户端超时,java,multithreading,timeout,web-scraping,htmlunit,Java,Multithreading,Timeout,Web Scraping,Htmlunit,在我之前关于HtmlUnit的问题中及我已经提到URL被卡住了。我还发现，由于HtmlUnit库中的一个方法（parse）没有执行出来，它被卡住了我在这方面做了进一步的工作。我编写了一段代码，以便在需要超过指定的超时秒数时退出该方法 import java.io.IOException; import java.net.MalformedURLException; import java.util.Date; import java.util.concurrent.ExecutorSer

在我之前关于HtmlUnit的问题中及

我已经提到URL被卡住了。我还发现，由于HtmlUnit库中的一个方法（parse）没有执行出来，它被卡住了

我在这方面做了进一步的工作。我编写了一段代码，以便在需要超过指定的超时秒数时退出该方法

import java.io.IOException;
import java.net.MalformedURLException;
import java.util.Date;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;

import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;

public class HandleHtmlUnitTimeout {

public static void main(String[] args) throws FailingHttpStatusCodeException, MalformedURLException, IOException, InterruptedException, TimeoutException 
    {   
        Date start = new Date();
        String url = "http://ericaweiner.com/collections/";
        doWorkWithTimeout(url, 60);
    }

public static void doWorkWithTimeout(final String url, long timeoutSecs) throws InterruptedException, TimeoutException {
    //maintains a thread for executing the doWork method
    ExecutorService executor = Executors.newFixedThreadPool(1);
    //logger.info("Starting method with "+timeoutSecs+" seconds as timeout");
    //set the executor thread working

    final Future<?> future = executor.submit(new Runnable() {
        public void run() 
            {
            try 
                {
                getPageSource(url);
                }
            catch (Exception e) 
                {
                throw new RuntimeException(e);
                }
        }
    });

    //check the outcome of the executor thread and limit the time allowed for it to complete
    try {
        future.get(timeoutSecs, TimeUnit.SECONDS);
    } catch (Exception e) {
        //ExecutionException: deliverer threw exception
        //TimeoutException: didn't complete within downloadTimeoutSecs
        //InterruptedException: the executor thread was interrupted

        //interrupts the worker thread if necessary
        future.cancel(true);

        //logger.warn("encountered problem while doing some work", e);
        throw new TimeoutException();
    }finally{ 
    executor.shutdownNow();
    }
}

public static void getPageSource(String productPageUrl)
    {
    try {
    if(productPageUrl == null)
        {
        productPageUrl = "http://ericaweiner.com/collections/";
        }   

        WebClient wb = new WebClient(BrowserVersion.FIREFOX_3_6);
        wb.getOptions().setTimeout(120000);
        wb.getOptions().setJavaScriptEnabled(true);
        wb.getOptions().setThrowExceptionOnScriptError(true);
        wb.getOptions().setThrowExceptionOnFailingStatusCode(false);
        HtmlPage page = wb.getPage(productPageUrl);
        wb.waitForBackgroundJavaScript(4000);
        wb.closeAllWindows();
} 
catch (FailingHttpStatusCodeException e) 
    {
    e.printStackTrace();
    } 
catch (MalformedURLException e) 
    {
    e.printStackTrace();
    } 
catch (IOException e) 
    {
    e.printStackTrace();
    }
    }

import java.io.IOException；
导入java.net.MalformedURLException；
导入java.util.Date；
导入java.util.concurrent.ExecutorService；
导入java.util.concurrent.Executors；
导入java.util.concurrent.Future；
导入java.util.concurrent.TimeUnit；
导入java.util.concurrent.TimeoutException；
导入com.gargoylesoftware.htmlunit.BrowserVersion；
导入com.gargoylesoftware.htmlunit.failinghttpstatuscodecoexception；
导入com.gargoylesoftware.htmlunit.WebClient；
导入com.gargoylesoftware.htmlunit.html.HtmlPage；
公共类HandleHtmlUnitTimeout{
公共静态void main（字符串[]args）引发FailingHttpStatusCodeException、MalformedURLException、IOException、InterruptedException、TimeoutException
{   
开始日期=新日期（）；
字符串url=”http://ericaweiner.com/collections/";
doWorkWithTimeout（url，60）；
}
public static void doWorkWithTimeout（最终字符串url，long-timeoutSecs）引发InterruptedException，TimeoutException{
//维护用于执行doWork方法的线程
ExecutorService executor=Executors.newFixedThreadPool（1）；
//logger.info（“以“+timeoutSecs+”秒作为超时值的启动方法”）；
//设置执行器线程工作
final Future=executor.submit（new Runnable（））{
公开募捐
{
尝试
{
getPageSource（url）；
}
捕获（例外e）
{
抛出新的运行时异常（e）；
}
}
});
//检查执行器线程的结果并限制其完成所允许的时间
试一试{
获取（timeoutSecs，TimeUnit.SECONDS）；
}捕获（例外e）{
//ExecutionException:传递程序引发异常
//TimeoutException:未在下载TimeOutSecs内完成
//InterruptedException：执行器线程被中断
//必要时中断工作线程
future.cancel（true）；
//logger.warn（“在执行某些工作时遇到问题”，e）；
抛出新的TimeoutException（）；
}最后{
执行者。关机现在（）；
}
}
公共静态无效getPageSource（字符串productPageUrl）
{
试一试{
if（productPageUrl==null）
{
productPageUrl=”http://ericaweiner.com/collections/";
}   
WebClient wb=新的WebClient（BrowserVersion.FIREFOX\u 3\u 6）；
wb.getOptions（）.setTimeout（120000）；
wb.getOptions（）.setJavaScriptEnabled（true）；
wb.getOptions（）.setThroweExceptionOnScriptError（true）；
wb.getOptions（）.SetThroweExceptionOnFailingStatusCode（false）；
HtmlPage=wb.getPage（productPageUrl）；
wb.waitForBackgroundJavaScript（4000）；
wb.关闭所有窗口（）；
} 
捕获（失败TTPStatusCodeException e）
{
e、 printStackTrace（）；
} 
捕获（格式错误）
{
e、 printStackTrace（）；
} 
捕获（IOE异常）
{
e、 printStackTrace（）；
}
}

}

此代码确实来自doWorkWithTimeout（url，60）；方法。但这并没有结束

当我尝试使用以下代码调用类似的实现时：

import java.util.Date;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;

import org.apache.log4j.Logger;


public class HandleScraperTimeOut {

private static Logger logger = Logger.getLogger(HandleScraperTimeOut .class);


public void doWork() throws InterruptedException {
    logger.info(new Date()+ "Starting worker method ");
    Thread.sleep(20000);
    logger.info(new Date()+ "Ending worker method ");
    //perform some long running task here...
}

public void doWorkWithTimeout(int timeoutSecs) {
    //maintains a thread for executing the doWork method
    ExecutorService executor = Executors.newFixedThreadPool(1);
    logger.info("Starting method with "+timeoutSecs+" seconds as timeout");
    //set the executor thread working

    final Future<?> future = executor.submit(new Runnable() {
        public void run() 
            {
            try 
                {
                doWork();
                }
            catch (Exception e) 
                {
                throw new RuntimeException(e);
                }
        }
    });

    //check the outcome of the executor thread and limit the time allowed for it to complete
    try {
        future.get(timeoutSecs, TimeUnit.SECONDS);
    } catch (Exception e) {
        //ExecutionException: deliverer threw exception
        //TimeoutException: didn't complete within downloadTimeoutSecs
        //InterruptedException: the executor thread was interrupted

        //interrupts the worker thread if necessary
        future.cancel(true);

        logger.warn("encountered problem while doing some work", e);
    }
    executor.shutdown();
}

public static void main(String a[])
    {
        HandleScraperTimeOut hcto = new HandleScraperTimeOut ();
        hcto.doWorkWithTimeout(30);

    }

import java.util.Date；
导入java.util.concurrent.ExecutorService；
导入java.util.concurrent.Executors；
导入java.util.concurrent.Future；
导入java.util.concurrent.TimeUnit；
导入org.apache.log4j.Logger；
公共类HandleScraperTimeOut{
私有静态记录器Logger=Logger.getLogger（HandleScraperTimeOut.class）；
public void doWork（）引发InterruptedException{
logger.info（新日期（）+“开始工作方法”）；
睡眠（20000）；
logger.info（新日期（）+“结束工作方法”）；
//在这里执行一些长时间运行的任务。。。
}
public void dowork with timeout（int timeoutSecs）{
//维护用于执行doWork方法的线程
ExecutorService executor=Executors.newFixedThreadPool（1）；
logger.info（“以“+timeoutSecs+”秒作为超时值的启动方法”）；
//设置执行器线程工作
final Future=executor.submit（new Runnable（））{
公开募捐
{
尝试
{
销钉（）；
}
捕获（例外e）
{
抛出新的运行时异常（e）；
}
}
});
//检查执行器线程的结果并限制其完成所允许的时间
试一试{
获取（timeoutSecs，TimeUnit.SECONDS）；
}捕获（例外e）{
//ExecutionException:传递程序引发异常
//TimeoutException:未在下载TimeOutSecs内完成
//InterruptedException：执行器线程被中断
//必要时中断工作线程
future.cancel（true）；
logger.warn（“在执行某些工作时遇到问题”，e）；
}
executor.shutdown（）；
}
公共静态void main（字符串a[]
{
HandleScraperTimeOut hcto=新的HandleScraperTimeOut（）；
hcto.doWorkWithTimeout（30）；
}

}

如果有人能看一看，告诉我是什么问题，这将是非常有帮助的

有关此问题的更多详细信息，您可以查看及

更新1 奇怪的是：未来。取消（真）；在这两种情况下都返回TRUE。我所期望的是：

对于HtmlUnit，它应该返回FALSE，因为进程仍然挂起
使用普通线程。sleep（）；它应该返回TRUE，因为进程已成功取消

更新2 它只能挂起来