Java - Multithreaded crawler with ExecutorService


I am writing a crawler in Java. I have already built a single-threaded crawler that visits a page and fetches all the links on it. Now I want to make it multithreaded, but I am running into difficulties. I started from a single seed link and crawled all the links found on that page; now I want to run an ExecutorService in which each thread takes a URL from the unvisited links and starts processing it just as the single-threaded crawler did, with several other threads doing the same. Below is the crawler class I created, which implements Runnable so it can run as a thread:

import java.net.URI;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class MyCrawler implements Runnable {
    volatile static int counter =0;
    String originaUrl, currentUrl;
    List<String> unvisitedLinks = new ArrayList<>();
    Set<String> visitedLinks = new HashSet<>();
    URI uri;
    ExecutorService executor = null;
    int pagesVisited = 0;


    public MyCrawler(String url) {
        this.originaUrl = url;
        unvisitedLinks.add(url);
         this.uri = URI.create(url);
    }

    @Override
    public void run() {
        do{
            try{
                executor = Executors.newFixedThreadPool(10);
                String url; 
                synchronized (this) {
                    url = unvisitedLinks.get(0);
                    while (unvisitedLinks.contains(url)) {
                        unvisitedLinks.remove(url);
                    }
                }
                //Visit this page and fetch all the links;
                VisitPage(url);

                visitedLinks.add(url);

                for(int i = 0; i< 10; i++){
                    synchronized (this) {
                        url = unvisitedLinks.get(i);
                        while (unvisitedLinks.contains(url)) {
                            unvisitedLinks.remove(url);
                        }
                    }
                    Runnable worker = new MyCrawler(url);
                    executor.execute(worker);
                }

                executor.shutdown();
                while(!executor.isTerminated()){ //WAIT FOR EXECUTOR TO FINISH

                }
                executor = null;
            }catch(Exception e){
                e.printStackTrace();
            }

        }while(unvisitedLinks.size() != 0);
        System.out.println("total pages visited: " + counter);
        System.out.println("TOTAL LINKS FOUND " + visitedLinks.size());

        for(String s: visitedLinks){
            System.out.println(s + "\n");
        }
    }

    private void VisitPage(String url){

        List<String> linksOnthisPage = new ArrayList<>();

        if(!visitedLinks.contains(url)){
            if(!url.contains("javascript") && !url.contains("#")){

                try{
                    Document doc = Jsoup.connect(url).timeout(0).get();
                    Elements linkTags = doc.select("a[href]");

                    for(Element e : linkTags){
                        String link = e.attr("href");
                        if(!visitedLinks.contains(link) && !link.contains("#") && !link.contains("javascript") && !link.equals(url)){
                            if(link.startsWith("http") || link.startsWith("www")){
                                if(link.contains(uri.getHost())){
                                    linksOnthisPage.add(link);
                                }else{
                                    System.out.println("SOME OTHER WEBSITE -- " + link);
                                }

                            }else if(link.startsWith("/")){
                                link = url + link.substring(1, link.length());
                                linksOnthisPage.add(link);
                            }else{
                                System.out.println("LINK IGNORED DUE TO  -- " + url);
                            }
                        }else{
                            System.out.println("LINK IGNORED -- " + url);
                        }
                    }
                    System.out.println("\n\nLinks found in \"" + url+ "\" : " + linksOnthisPage.size());
                    unvisitedLinks.addAll(linksOnthisPage);
                    System.out.println("UNVISITED LINKS NOW: " + unvisitedLinks.size());
                }catch(Exception e){
                    System.out.println("EXCEPTION -- " + url);
                    return;
                }
            }else{
                System.out.println("UNWANTED URL -- " + url);
            }
        }else{
            System.out.println("LINK VISITED -- " + url);
        }
    }

}

Also, I have probably made plenty of mistakes in this code. Please correct me wherever you can.

I think what you need to do is handle only the URL-visiting part inside the Runnable, which means the Runnable class would look like this:

import java.net.URI;
import java.util.ArrayList;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class MyCrawler implements Runnable {

    String url;  // the page this task will visit
    URI uri;

    public MyCrawler(String url) {
        this.url = url;
        this.uri = URI.create(url);
    }

    @Override
    public void run() {

        try{
            VisitPage(url);

        }catch(Exception e){
            e.printStackTrace();
        }


    }

    private void VisitPage(String url){

        List<String> linksOnthisPage = new ArrayList<>();

        if(!url.contains("javascript") && !url.contains("#")){

            try{
                Document doc = Jsoup.connect(url).timeout(0).get();
                Elements linkTags = doc.select("a[href]");

                for(Element e : linkTags){
                    String link = e.attr("href");
                    if(!link.contains("#") && !link.contains("javascript") && !link.equals(url)){
                        if(link.startsWith("http") || link.startsWith("www")){
                            if(link.contains(uri.getHost())){
                                linksOnthisPage.add(link);
                            }else{
                                System.out.println("SOME OTHER WEBSITE -- " + link);
                            }

                        }else if(link.startsWith("/")){
                            link = url + link.substring(1, link.length());
                            linksOnthisPage.add(link);
                        }else{
                            System.out.println("LINK IGNORED DUE TO  -- " + url);
                        }
                    }else{
                        System.out.println("LINK IGNORED -- " + url);
                    }
                }
                System.out.println("\n\nLinks found in \"" + url+ "\" : " + linksOnthisPage.size());

            }catch(Exception e){
                System.out.println("EXCEPTION -- " + url);
                return;
            }
        }else{
            System.out.println("UNWANTED URL -- " + url);
        }
    }

}
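
With the Runnable reduced to visiting a single page, something outside it has to manage the shared frontier of links. Below is a minimal sketch of what that coordinating side could look like, assuming a thread-safe queue of unvisited links and assuming the Runnable above is extended to push the links it finds into that queue (the names CrawlerMain, unvisited, and visited are mine, not from the code above):

import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit;

public class CrawlerMain {
    public static void main(String[] args) throws InterruptedException {
        // Shared, thread-safe structures; the tasks would need references to them.
        LinkedBlockingQueue<String> unvisited = new LinkedBlockingQueue<>();
        Set<String> visited = ConcurrentHashMap.newKeySet();

        unvisited.add("https://example.com/");
        ExecutorService executor = Executors.newFixedThreadPool(10);

        String url;
        // Poll with a timeout so the loop ends once the frontier stays empty.
        while ((url = unvisited.poll(5, TimeUnit.SECONDS)) != null) {
            // add() returns false if the url was already seen, so each page runs once.
            if (visited.add(url)) {
                executor.execute(new MyCrawler(url));
            }
        }
        executor.shutdown();
        executor.awaitTermination(1, TimeUnit.MINUTES);
    }
}

Note that the timed poll is a simplification: if one slow page takes longer than the timeout to yield new links, the loop can stop early. The counter-based termination scheme in the next answer avoids that problem.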


There are a few things we need to think about before we start:

  • How to avoid revisiting a page we have already seen
  • When to terminate the thread pool
  • How to notify the main thread that all tasks are finished, so that it can print out the site map

    To solve the first problem, we use a map to remember whether a link has already been visited.

    To solve the second, we keep a counter that is incremented by one every time a new task is submitted and decremented by one every time a task completes. When the counter reaches zero, all tasks are done.

    To solve the third, we need some synchronization mechanism between the main thread and the thread pool.

    Now we have a solution like this:

    import java.io.IOException;
    import java.util.Set;
    import java.util.concurrent.ConcurrentHashMap;
    import java.util.concurrent.CountDownLatch;
    import java.util.concurrent.ExecutorService;
    import java.util.concurrent.Executors;
    import java.util.concurrent.TimeUnit;
    import java.util.concurrent.atomic.AtomicInteger;
    import java.util.stream.Collectors;

    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.slf4j.Logger;
    import org.slf4j.LoggerFactory;

    public class Worker {
    
    public static final Logger logger = LoggerFactory.getLogger(Worker.class);
    
    private ConcurrentHashMap<String, Boolean> visited = new ConcurrentHashMap<>();
    private ConcurrentHashMap<String, Set<String>> graph = new ConcurrentHashMap<>();
    
    private final String domain;
    private final ExecutorService executorService = Executors.newFixedThreadPool(8);
    
    public final AtomicInteger counter = new AtomicInteger(0);
    private final CountDownLatch done;
    
    public Worker(String domain, CountDownLatch done) {
        this.domain = domain;
        this.done = done;
    }
    
    public void start() {
        executorService.submit(new CrawlTask(domain));
    }
    
    public ConcurrentHashMap<String, Set<String>> getGraph() {
        return graph;
    }
    
    public class CrawlTask implements Runnable {
        public final String url;
    
        public CrawlTask(String url) {
            this.url = url;
        }
    
        @Override
        public void run() {
            logger.info("remaining tasks: {}, visiting {}", counter.get(), url);
    
            Document doc;
    
            try {
                doc = Jsoup.connect(url).timeout(5000).get();
            } catch (IOException e) {
                logger.warn("URL: {}, {}", url, e.toString());
                return;
            }
    
            Set<String> links = doc.select("a").stream().map(e -> e.attr("abs:href"))
                    .filter(l -> Utils.isInDomain(l, domain))
                    .map(Utils::trimURL)
                    .collect(Collectors.toSet());
    
            graph.put(url, links);
    
            for (String link : links) {
                if (!visited.getOrDefault(link, false)) {
                    visited.put(link, true);
                    counter.getAndIncrement();
                    executorService.submit(new CrawlTask(link));
                }
            }
    
            int n = counter.getAndDecrement();
            if (n == 0) {
                executorService.shutdown();
                try {
                    executorService.awaitTermination(1, TimeUnit.SECONDS);
                } catch (InterruptedException e) {
                    e.printStackTrace();
                } finally {
                    done.countDown();
                }
            }
        }
    }}
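
    The code above relies on two helpers, Utils.isInDomain and Utils.trimURL, which the answer does not show. Here is a plausible sketch of them under my own assumptions (a same-host check, plus stripping the fragment and any trailing slash so equivalent URLs compare equal):

    import java.net.URI;

    public final class Utils {
        private Utils() {}

        // True if the link points at the same host as the seed domain.
        public static boolean isInDomain(String link, String domain) {
            try {
                String linkHost = URI.create(link).getHost();
                String domainHost = URI.create(domain).getHost();
                return linkHost != null && linkHost.equalsIgnoreCase(domainHost);
            } catch (IllegalArgumentException e) {
                return false; // malformed URL: treat it as out of domain
            }
        }

        // Drop the fragment and any trailing slash so duplicate URLs collapse.
        public static String trimURL(String url) {
            int hash = url.indexOf('#');
            if (hash >= 0) {
                url = url.substring(0, hash);
            }
            return url.endsWith("/") ? url.substring(0, url.length() - 1) : url;
        }
    }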
    
    
    And here is the main function:

    import java.util.Map;
    import java.util.Set;
    import java.util.concurrent.CountDownLatch;

    public class CounterApp {
    public static void main(String[] args) throws InterruptedException {
        CountDownLatch doneSignal = new CountDownLatch(1);
        String domain = "https://example.com";
        Worker worker = new Worker(domain, doneSignal);
        worker.start();
        doneSignal.await();
    
        Map<String, Set<String>> graph = worker.getGraph();
        graph.forEach((k, v) -> {
            System.out.println(k + ": ");
            v.forEach(l -> System.out.println("   " + l));
        });
    }}
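
    One hardening worth considering (my suggestion, not part of the answer): if the very first fetch fails, CrawlTask returns without ever decrementing the counter, so the latch is never released and an unbounded await() hangs forever. A variant of the main method with a bounded wait avoids that:

    import java.util.Map;
    import java.util.Set;
    import java.util.concurrent.CountDownLatch;
    import java.util.concurrent.TimeUnit;

    public class CounterApp {
        public static void main(String[] args) throws InterruptedException {
            CountDownLatch doneSignal = new CountDownLatch(1);
            String domain = "https://example.com";
            Worker worker = new Worker(domain, doneSignal);
            worker.start();

            // Bounded wait: bail out instead of blocking forever if the crawl stalls.
            if (!doneSignal.await(10, TimeUnit.MINUTES)) {
                System.err.println("Crawl did not finish within the time limit");
                return;
            }

            Map<String, Set<String>> graph = worker.getGraph();
            graph.forEach((k, v) -> {
                System.out.println(k + ": ");
                v.forEach(l -> System.out.println("   " + l));
            });
        }
    }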
    