Java HtmlUnit网络客户端超时
在我之前关于HtmlUnit的问题中 及 我已经提到URL被卡住了。我还发现,由于HtmlUnit库中的一个方法(parse)没有执行出来,它被卡住了 我在这方面做了进一步的工作。我编写了一段代码,以便在需要超过指定的超时秒数时退出该方法Java HtmlUnit网络客户端超时,java,multithreading,timeout,web-scraping,htmlunit,Java,Multithreading,Timeout,Web Scraping,Htmlunit,在我之前关于HtmlUnit的问题中 及 我已经提到URL被卡住了。我还发现,由于HtmlUnit库中的一个方法(parse)没有执行出来,它被卡住了 我在这方面做了进一步的工作。我编写了一段代码,以便在需要超过指定的超时秒数时退出该方法 import java.io.IOException; import java.net.MalformedURLException; import java.util.Date; import java.util.concurrent.ExecutorSer
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.Date;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
public class HandleHtmlUnitTimeout {
public static void main(String[] args) throws FailingHttpStatusCodeException, MalformedURLException, IOException, InterruptedException, TimeoutException
{
Date start = new Date();
String url = "http://ericaweiner.com/collections/";
doWorkWithTimeout(url, 60);
}
public static void doWorkWithTimeout(final String url, long timeoutSecs) throws InterruptedException, TimeoutException {
//maintains a thread for executing the doWork method
ExecutorService executor = Executors.newFixedThreadPool(1);
//logger.info("Starting method with "+timeoutSecs+" seconds as timeout");
//set the executor thread working
final Future<?> future = executor.submit(new Runnable() {
public void run()
{
try
{
getPageSource(url);
}
catch (Exception e)
{
throw new RuntimeException(e);
}
}
});
//check the outcome of the executor thread and limit the time allowed for it to complete
try {
future.get(timeoutSecs, TimeUnit.SECONDS);
} catch (Exception e) {
//ExecutionException: deliverer threw exception
//TimeoutException: didn't complete within downloadTimeoutSecs
//InterruptedException: the executor thread was interrupted
//interrupts the worker thread if necessary
future.cancel(true);
//logger.warn("encountered problem while doing some work", e);
throw new TimeoutException();
}finally{
executor.shutdownNow();
}
}
public static void getPageSource(String productPageUrl)
{
try {
if(productPageUrl == null)
{
productPageUrl = "http://ericaweiner.com/collections/";
}
WebClient wb = new WebClient(BrowserVersion.FIREFOX_3_6);
wb.getOptions().setTimeout(120000);
wb.getOptions().setJavaScriptEnabled(true);
wb.getOptions().setThrowExceptionOnScriptError(true);
wb.getOptions().setThrowExceptionOnFailingStatusCode(false);
HtmlPage page = wb.getPage(productPageUrl);
wb.waitForBackgroundJavaScript(4000);
wb.closeAllWindows();
}
catch (FailingHttpStatusCodeException e)
{
e.printStackTrace();
}
catch (MalformedURLException e)
{
e.printStackTrace();
}
catch (IOException e)
{
e.printStackTrace();
}
}
import java.io.IOException;
导入java.net.MalformedURLException;
导入java.util.Date;
导入java.util.concurrent.ExecutorService;
导入java.util.concurrent.Executors;
导入java.util.concurrent.Future;
导入java.util.concurrent.TimeUnit;
导入java.util.concurrent.TimeoutException;
导入com.gargoylesoftware.htmlunit.BrowserVersion;
导入com.gargoylesoftware.htmlunit.failinghttpstatuscodecoexception;
导入com.gargoylesoftware.htmlunit.WebClient;
导入com.gargoylesoftware.htmlunit.html.HtmlPage;
公共类HandleHtmlUnitTimeout{
公共静态void main(字符串[]args)引发FailingHttpStatusCodeException、MalformedURLException、IOException、InterruptedException、TimeoutException
{
开始日期=新日期();
字符串url=”http://ericaweiner.com/collections/";
doWorkWithTimeout(url,60);
}
public static void doWorkWithTimeout(最终字符串url,long-timeoutSecs)引发InterruptedException,TimeoutException{
//维护用于执行doWork方法的线程
ExecutorService executor=Executors.newFixedThreadPool(1);
//logger.info(“以“+timeoutSecs+”秒作为超时值的启动方法”);
//设置执行器线程工作
final Future=executor.submit(new Runnable()){
公开募捐
{
尝试
{
getPageSource(url);
}
捕获(例外e)
{
抛出新的运行时异常(e);
}
}
});
//检查执行器线程的结果并限制其完成所允许的时间
试一试{
获取(timeoutSecs,TimeUnit.SECONDS);
}捕获(例外e){
//ExecutionException:传递程序引发异常
//TimeoutException:未在下载TimeOutSecs内完成
//InterruptedException:执行器线程被中断
//必要时中断工作线程
future.cancel(true);
//logger.warn(“在执行某些工作时遇到问题”,e);
抛出新的TimeoutException();
}最后{
执行者。关机现在();
}
}
公共静态无效getPageSource(字符串productPageUrl)
{
试一试{
if(productPageUrl==null)
{
productPageUrl=”http://ericaweiner.com/collections/";
}
WebClient wb=新的WebClient(BrowserVersion.FIREFOX\u 3\u 6);
wb.getOptions().setTimeout(120000);
wb.getOptions().setJavaScriptEnabled(true);
wb.getOptions().setThroweExceptionOnScriptError(true);
wb.getOptions().SetThroweExceptionOnFailingStatusCode(false);
HtmlPage=wb.getPage(productPageUrl);
wb.waitForBackgroundJavaScript(4000);
wb.关闭所有窗口();
}
捕获(失败TTPStatusCodeException e)
{
e、 printStackTrace();
}
捕获(格式错误)
{
e、 printStackTrace();
}
捕获(IOE异常)
{
e、 printStackTrace();
}
}
}
此代码确实来自doWorkWithTimeout(url,60);方法。但这并没有结束
当我尝试使用以下代码调用类似的实现时:
import java.util.Date;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import org.apache.log4j.Logger;
public class HandleScraperTimeOut {
private static Logger logger = Logger.getLogger(HandleScraperTimeOut .class);
public void doWork() throws InterruptedException {
logger.info(new Date()+ "Starting worker method ");
Thread.sleep(20000);
logger.info(new Date()+ "Ending worker method ");
//perform some long running task here...
}
public void doWorkWithTimeout(int timeoutSecs) {
//maintains a thread for executing the doWork method
ExecutorService executor = Executors.newFixedThreadPool(1);
logger.info("Starting method with "+timeoutSecs+" seconds as timeout");
//set the executor thread working
final Future<?> future = executor.submit(new Runnable() {
public void run()
{
try
{
doWork();
}
catch (Exception e)
{
throw new RuntimeException(e);
}
}
});
//check the outcome of the executor thread and limit the time allowed for it to complete
try {
future.get(timeoutSecs, TimeUnit.SECONDS);
} catch (Exception e) {
//ExecutionException: deliverer threw exception
//TimeoutException: didn't complete within downloadTimeoutSecs
//InterruptedException: the executor thread was interrupted
//interrupts the worker thread if necessary
future.cancel(true);
logger.warn("encountered problem while doing some work", e);
}
executor.shutdown();
}
public static void main(String a[])
{
HandleScraperTimeOut hcto = new HandleScraperTimeOut ();
hcto.doWorkWithTimeout(30);
}
import java.util.Date;
导入java.util.concurrent.ExecutorService;
导入java.util.concurrent.Executors;
导入java.util.concurrent.Future;
导入java.util.concurrent.TimeUnit;
导入org.apache.log4j.Logger;
公共类HandleScraperTimeOut{
私有静态记录器Logger=Logger.getLogger(HandleScraperTimeOut.class);
public void doWork()引发InterruptedException{
logger.info(新日期()+“开始工作方法”);
睡眠(20000);
logger.info(新日期()+“结束工作方法”);
//在这里执行一些长时间运行的任务。。。
}
public void dowork with timeout(int timeoutSecs){
//维护用于执行doWork方法的线程
ExecutorService executor=Executors.newFixedThreadPool(1);
logger.info(“以“+timeoutSecs+”秒作为超时值的启动方法”);
//设置执行器线程工作
final Future=executor.submit(new Runnable()){
公开募捐
{
尝试
{
销钉();
}
捕获(例外e)
{
抛出新的运行时异常(e);
}
}
});
//检查执行器线程的结果并限制其完成所允许的时间
试一试{
获取(timeoutSecs,TimeUnit.SECONDS);
}捕获(例外e){
//ExecutionException:传递程序引发异常
//TimeoutException:未在下载TimeOutSecs内完成
//InterruptedException:执行器线程被中断
//必要时中断工作线程
future.cancel(true);
logger.warn(“在执行某些工作时遇到问题”,e);
}
executor.shutdown();
}
公共静态void main(字符串a[]
{
HandleScraperTimeOut hcto=新的HandleScraperTimeOut();
hcto.doWorkWithTimeout(30);
}
}
如果有人能看一看,告诉我是什么问题,这将是非常有帮助的
有关此问题的更多详细信息,您可以查看
及
更新1 奇怪的是:未来。取消(真);在这两种情况下都返回TRUE。 我所期望的是:
- 对于HtmlUnit,它应该返回FALSE,因为进程仍然挂起
- 使用普通线程。sleep();它应该返回TRUE,因为进程 已成功取消