Java多链接检查器爬行器-需要改进

标签：java、performance、testing、web-crawler。（问题正文与完整代码见下文；此行原为重复粘贴的摘要与被截断的代码片段。）

我有下面的工作代码(在这里和那里更改,所以当你复制和粘贴它时,你可以使用你的大脑)。我想改进它,这样它可以检测所有无效的页面,包括出售的域名。它的工作效率约为89%。如果您看到任何我可以通过使用额外的现有库或一些小调整来改进的地方,那将是非常棒的

 List<Link> all = linkService.getAllLinks();
    // NOTE(review): this list is appended to concurrently from many pool threads
    // (see CheckSite.run) — confirm the field is, or is wrapped as, a
    // thread-safe collection, e.g. Collections.synchronizedList(...).
    notValidLinks = new LinkedList();
    final ArrayBlockingQueue<Runnable> queue = new ArrayBlockingQueue<Runnable>(39867);
    int poolSize = 90;
    int maxPoolSize = 100;
    long keepAliveTime = 40;
    ThreadPoolExecutor tpe = new ThreadPoolExecutor(poolSize, maxPoolSize,
            keepAliveTime, TimeUnit.SECONDS, queue);

    // BUG FIX: the original "for (link : all)" is not valid Java — the
    // enhanced-for loop variable needs a declared type.
    for (Link link : all) {
        // CheckSite is submitted to the executor, so it only needs to be a
        // Runnable; the executor runs it on a pool thread either way.
        Runnable task = new CheckSite(link);
        tpe.execute(task);
        System.out.println("Task count:" + queue.size());
    }

    // BUG FIX: without shutdown() the 90 core pool threads stay alive and keep
    // the JVM running indefinitely after every task has finished.
    tpe.shutdown();

/**
 * Worker task that validates a single {@link Link} by issuing an HTTP GET.
 *
 * <p>A link is marked invalid when (a) the URL is malformed, (b) the
 * connection fails, (c) the response code is anything other than 200/301/302,
 * or (d) the page body contains the phrase "Related Searches" — a heuristic
 * for parked / for-sale domains. Invalid links are un-approved, saved, and
 * appended to the shared {@code notValidLinks} list.
 *
 * <p>Kept as {@code extends Thread} (rather than {@code implements Runnable})
 * because existing callers assign instances to a {@code Thread} variable
 * before handing them to the executor.
 */
class CheckSite extends Thread {
    Link link;

    CheckSite(Link link) {
        this.link = link;
    }

    @Override
    public void run() {
        boolean notValid = false;
        HttpURLConnection huc = null;
        try {
            log.info(link.getLink() + " " + link.getId());
            URL u = new URL(link.getLink());
            huc = (HttpURLConnection) u.openConnection();
            // BUG FIX: the original called the *static*
            // HttpURLConnection.setFollowRedirects(false), which mutates a
            // JVM-global flag from up to 100 threads at once. The instance
            // method affects only this connection.
            huc.setInstanceFollowRedirects(false);
            huc.setConnectTimeout(40000);
            // BUG FIX: no read timeout was set, so a server that accepts the
            // connection but never responds would hang this worker forever.
            huc.setReadTimeout(40000);
            huc.setRequestMethod("GET");
            huc.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.1.2) Gecko/20090729 Firefox/3.5.2 (.NET CLR 3.5.30729)");

            huc.connect();
            int code = huc.getResponseCode();

            if (code != HttpURLConnection.HTTP_OK
                    && code != HttpURLConnection.HTTP_MOVED_PERM
                    && code != HttpURLConnection.HTTP_MOVED_TEMP) {
                notValid = true;
                log.info("Invalid code: " + code + " - " + link.getLink());
            }
            if (code == HttpURLConnection.HTTP_MOVED_PERM) {
                log.info(link.getLink() + " Perm move");
            }
            if (code == HttpURLConnection.HTTP_MOVED_TEMP) {
                log.info(link.getLink() + " Temp move");
            }

            if (!notValid) {
                // BUG FIX: the reader (and its underlying stream) was never
                // closed — try-with-resources guarantees it is released.
                // NOTE(review): this decodes with the platform default charset;
                // consider honoring the Content-Type charset of the response.
                try (BufferedReader reader = new BufferedReader(
                        new InputStreamReader(huc.getInputStream()))) {
                    StringBuilder body = new StringBuilder();
                    String line;
                    while ((line = reader.readLine()) != null) {
                        body.append(line);
                    }
                    // Parked / for-sale domains commonly render a
                    // "Related Searches" block — treat its presence as invalid.
                    notValid = StringUtils.containsIgnoreCase(
                            Jsoup.parse(body.toString()).text(), "Related Searches");
                } catch (Exception e) {
                    log.error(e.getMessage());
                }
            }
        } catch (MalformedURLException me) {
            log.info("Malformed URL:" + link.getLink());
            notValid = true;
        } catch (IOException e) {
            log.info("Refused connection | Does not exist:" + link.getLink());
            notValid = true;
        } finally {
            // BUG FIX: disconnect() was skipped whenever an exception was
            // thrown before reaching it; always release the connection.
            if (huc != null) {
                huc.disconnect();
            }
        }
        if (notValid) {
            link.setApproved(false);
            link.setDateApproved(null);
            // NOTE(review): notValidLinks appears to be a plain LinkedList
            // shared by many pool threads — confirm it is thread-safe.
            notValidLinks.add(linkService.save(link));
        }
        log.debug("URL Finished!"); // typo fix: was "Finieshed"
    }
}
List all=linkService.getAllLinks();
notValidLinks=newlinkedlist();
最终ArrayBlockingQueue=新的ArrayBlockingQueue(39867);
int poolSize=90;
int maxPoolSize=100;
长keepAliveTime=40;
ThreadPoolExecutor tpe=新的ThreadPoolExecutor(poolSize,maxPoolSize,
keepAliveTime,TimeUnit.SECONDS,queue);
用于(链接:全部){
线程任务=新检查站点(链接);
执行(任务);
System.out.println(“任务计数:+queue.size());
}
类CheckSite扩展线程{
链接;
检查站点(链接){
this.link=link;
}
公开募捐{
布尔notValid=false;
试一试{
log.info(link.getLink()+“”+link.getId());
URL u=新URL(link.getLink());
HttpURLConnection huc=(HttpURLConnection)u.openConnection();
HttpURLConnection.setFollowRedirects(false);
设置连接超时(40000);
huc.setRequestMethod(“GET”);
setRequestProperty(“用户代理”、“Mozilla/5.0(Windows;U;Windows NT 6.0;en-US;rv:1.9.1.2)Gecko/20090729 Firefox/3.5.2(.NET CLR 3.5.30729)”;
huc.connect();
int code=huc.getResponseCode();
如果(代码!=HttpURLConnection.HTTP\u正常
&&代码!=HttpURLConnection.HTTP\u MOVED\u PERM
&&代码!=HttpURLConnection.HTTP_MOVED_TEMP){
notValid=true;
log.info(“无效代码:“+code+”-“+link.getLink()”);
}
if(code==HttpURLConnection.HTTP\u MOVED\u PERM){
log.info(link.getLink()+“Perm move”);
}
if(code==HttpURLConnection.HTTP\u MOVED\u TEMP){
log.info(link.getLink()+“临时移动”);
}
试一试{
如果(!无效){
BufferedReader=新的BufferedReader(新的InputStreamReader(huc.getInputStream());
StringBuilder StringBuilder=新的StringBuilder();
弦线;
而((line=reader.readLine())!=null){
stringBuilder.append(行);
}
notValid=StringUtils.containsIgnoreCase(Jsoup.parse(stringBuilder.toString()).text(),“相关搜索”);
}
}捕获(例外e){
log.error(例如getMessage());
}
huc.disconnect();
}catch(格式错误的durlexception me){
log.info(“格式错误的URL:+link.getLink());
notValid=true;
}捕获(IOE异常){
log.info(“拒绝的连接”不存在:“+link.getLink());
notValid=true;
}
如果(无效){
link.setApproved(假);
link.setDateApproved(空);
添加(linkService.save(link));
}
调试(“URL完成!”);
}
}
我想改进它,以便它检测所有无效的页面,包括出售的域名

我怀疑突出显示的部分是不切实际的。爬虫怎么能分辨出一个域名正在出售呢？

跟进

@Mat Banik建议寻找特定短语或检查DNS记录作为可能的解决方案

  • 检查特定短语的启发式方法将给出误报和误报

  • 在Java中,检查DNS记录很棘手。您可以在URL的主机名部分执行简单的DNS查找,并根据已知的DNS站点IP列表检查生成的IP地址。但这并不能告诉您原始主机名是否真的在出售。它可能是一个托管在同一基础设施上的真实站点。。。或者是一个不出售的域名

但我想,如果您准备接受一些误报和漏报,那么尝试筛选出要出售的域是可行的。

请了解一下布隆过滤器（Bloom filter）。它可以帮助您实现快速且节省内存的查找。Bloom filter 的问题是它会出现误报，也就是说，对于不存在的内容，它可能判断为存在。但如果 Bloom filter 判断为不存在，那就一定不存在。