Java 在我的网络爬虫中尝试运行几天时出现许多内存错误

Java 在我的网络爬虫中尝试运行几天时出现许多内存错误,java,memory,memory-management,out-of-memory,Java,Memory,Memory Management,Out Of Memory,我正在开发一个网络爬虫应用程序。当我运行程序时,我收到以下错误消息: 我在运行程序3个多小时后出现了这些错误。我试图通过将eclipse.ini设置更改为2048MB内存来分配内存,但在3小时或更短的时间后仍然会出现相同的错误。我应该连续运行程序2-3天以上,以分析结果 你能告诉我我在这里遗漏了什么来获得下面的错误吗 这些是我的课程: seeds.txt http://www.stanford.edu http://www.archive.org WebCrawler.java packa

我正在开发一个网络爬虫应用程序。当我运行程序时,我收到以下错误消息:


程序运行3个多小时后出现了这些错误。我尝试修改eclipse.ini,把可用内存增加到2048MB,但在3小时或更短的时间后仍然会出现相同的错误。我需要让程序连续运行2-3天以上,以便分析结果。

你能告诉我,我遗漏了什么才会导致下面的错误吗?

这些是我的类:

seeds.txt

http://www.stanford.edu
http://www.archive.org
WebCrawler.java

 package pkg.crawler;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.net.MalformedURLException;
import java.net.SocketTimeoutException;
import java.util.*;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.PriorityBlockingQueue;
import java.util.concurrent.TimeUnit;

import org.jsoup.HttpStatusException;
import org.jsoup.UnsupportedMimeTypeException;
import org.joda.time.DateTime;


public class WebCrawler {

public static Queue <LinkNodeLight> queue = new PriorityBlockingQueue <> (); // priority queue
public static final int n_threads = 5;                                 // amount of threads
private static Set<String> processed = new LinkedHashSet <> ();         // set of processed urls
private PrintWriter out;                                                // output file
private PrintWriter err;                                                // error file
private static Integer cntIntra = new Integer (0);                              // counters for intra- links in the queue
private static Integer cntInter = new Integer (0);                              // counters for inter- links in the queue
private static Integer dub = new Integer (0);                                   // amount of skipped urls

public static void main(String[] args) throws Exception {
    System.out.println("Running web crawler: " + new Date());

    WebCrawler webCrawler = new WebCrawler();
    webCrawler.createFiles();
    try (Scanner in = new Scanner(new File ("seeds.txt"))) {
        while (in.hasNext()) {
            webCrawler.enque(new LinkNode (in.nextLine().trim()));
        }
    } catch (IOException e) {
        e.printStackTrace();
        return;
    }
    webCrawler.processQueue();
    webCrawler.out.close();
    webCrawler.err.close();
}

public void processQueue(){
    /* run in threads */
    Runnable r = new Runnable() {
        @Override 
        public void run() {
            /* queue may be empty but process is not finished, that's why we need to check if any links are being processed */
            while (true) {
                LinkNode link = deque();
                if (link == null)
                    continue;
                link.setStartTime(new DateTime());
                boolean process = processLink(link);
                link.setEndTime(new DateTime());
                if (!process)
                    continue;
                /* print the data to the csv file */
                if (link.getStatus() != null && link.getStatus().equals(LinkNodeStatus.OK)) {
                    synchronized(out) {
                        out.println(getOutputLine(link));
                        out.flush();
                    }
                } else {
                    synchronized(err) {
                        err.println(getOutputLine(link));
                        err.flush();
                    }
                }
            }
        }
    };
    /* run n_threads threads which perform dequeue and process */
    LinkedList <Thread> threads = new LinkedList <> ();
    for (int i = 0; i < n_threads; i++) {
        threads.add(new Thread(r));
        threads.getLast().start();
    }
    for (Thread thread : threads) {
        try {
            thread.join();
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
    }
}


/* returns true if link was actually processed */
private boolean processLink(LinkNode inputLink) {
    String url = getUrlGeneralForm(inputLink);
    boolean process = true;
    synchronized (processed) {
        if (processed.contains(url)) {
            process = false;
            synchronized (dub) {dub++;}
        } else
            processed.add(url);
    }
    /* start processing only if the url have not been processed yet or not being processed */
    if (process) {
        System.out.println("Processing url " + url);
        List<LinkNodeLight> outputLinks = parseAndWieghtResults(inputLink);
        for (LinkNodeLight outputLink : outputLinks) {
            String getUrlGeneralForumOutput = getUrlGeneralForm(outputLink);
            /* add the new link to the queue only if it has not been processed yet */
            process = true;
            synchronized (processed) {
                if (processed.contains(getUrlGeneralForumOutput)) {
                    process = false;
                    synchronized (dub) {dub++;}
                }
            }
            if (process) {
                enque(outputLink);
            }
        }
        return true;
    }
    return false;
}

void enque(LinkNodeLight link){
    link.setEnqueTime(new DateTime());
    /* the add method requires implicit priority */
    synchronized (queue) {
        if (link.interLinks)
            synchronized (cntInter) {cntInter++;}
        else
            synchronized (cntIntra) {cntIntra++;}
      //queue.add(link, 100 - (int)(link.getWeight() * 100.f));
        queue.add(link);
    }
}


/**
 * Picks an element from the queue
 * @return top element from the queue or null if the queue is empty
 */
LinkNode deque(){
    /* link must be checked */
    LinkNode link = null;
    synchronized (queue) {
        link = (LinkNode) queue.poll();
        if (link != null) {
            link.setDequeTime(new DateTime());
            if (link.isInterLinks())
                synchronized (cntInter) {cntInter--;}
            else
                synchronized (cntIntra) {cntIntra--;}
        }
    }
    return link;
}

private void createFiles() {
    /* create output file */
    try {
        out = new PrintWriter(new BufferedWriter(new FileWriter("CrawledURLS.csv", false)));
        out.println(generateHeaderFile());
    } catch (IOException e) {
        System.err.println(e);
    }
    /* create error file */
    try {
        err = new PrintWriter(new BufferedWriter(new FileWriter("CrawledURLSERROR.csv", false)));
        err.println(generateHeaderFile());
    } catch (IOException e) {
        System.err.println(e);
    }
}
/**
 * formats the string so it can be valid entry in csv file
 * @param s
 * @return
 */
private static String format(String s) {
    // replace " by ""
    String ret = s.replaceAll("\"", "\"\"");
    // put string into quotes
    return "\"" + ret + "\"";
}
/**
 * Creates the line that needs to be written in the outputfile
 * @param link
 * @return
 */
public static String getOutputLine(LinkNode link){
    StringBuilder builder = new StringBuilder();
    builder.append(link.getParentLink()!=null ? format(link.getParentLink().getUrl()) : "");
    builder.append(",");
    builder.append(link.getParentLink()!=null ? link.getParentLink().getIpAdress() : "");
    builder.append(",");
    builder.append(link.getParentLink()!=null ? link.getParentLink().linkProcessingDuration() : "");
    builder.append(",");
    builder.append(format(link.getUrl()));
    builder.append(",");
    builder.append(link.getDomain());
    builder.append(",");
    builder.append(link.isInterLinks());
    builder.append(",");
    builder.append(Util.formatDate(link.getEnqueTime()));
    builder.append(",");
    builder.append(Util.formatDate(link.getDequeTime()));
    builder.append(",");
    builder.append(link.waitingInQueue());
    builder.append(",");
    builder.append(queue.size());
    /* Inter and intra links in queue */
    builder.append(",");
    builder.append(cntIntra.toString());
    builder.append(",");
    builder.append(cntInter.toString());
    builder.append(",");
    builder.append(dub);
    builder.append(",");
    builder.append(new Date ());
    /* URL size*/
    builder.append(",");
    builder.append(link.getSize());
    /* HTML file
    builder.append(",");
    builder.append(link.getFileName());*/
    /* add HTTP error */
    builder.append(",");
    if (link.getParseException() != null) {
        if (link.getParseException() instanceof HttpStatusException)
            builder.append(((HttpStatusException) link.getParseException()).getStatusCode());
        if (link.getParseException() instanceof SocketTimeoutException)
            builder.append("Time out");
        if (link.getParseException() instanceof MalformedURLException)
            builder.append("URL is not valid");
        if (link.getParseException() instanceof UnsupportedMimeTypeException)
            builder.append("Unsupported mime type: " + ((UnsupportedMimeTypeException)link.getParseException()).getMimeType());
    }
    return builder.toString();

}

/**
 * generates the Header for the file
 * @param link
 * @return
 */
private String generateHeaderFile(){
    StringBuilder builder = new StringBuilder();
    builder.append("Seed URL");
    builder.append(",");
    builder.append("Seed IP");
    builder.append(",");
    builder.append("Process Duration");
    builder.append(",");
    builder.append("Link URL");
    builder.append(",");
    builder.append("Link domain");
    builder.append(",");
    builder.append("Link IP");
    builder.append(",");
    builder.append("Enque Time");
    builder.append(",");
    builder.append("Deque Time");
    builder.append(",");
    builder.append("Waiting in the Queue");
    builder.append(",");
    builder.append("QueueSize");
    builder.append(",");
    builder.append("Intra in queue");
    builder.append(",");
    builder.append("Inter in queue");
    builder.append(",");
    builder.append("Dublications skipped");
    /* time was printed, but no header was */
    builder.append(",");
    builder.append("Time");
    /* URL size*/
    builder.append(",");
    builder.append("Size bytes");
    /* HTTP errors */
    builder.append(",");
    builder.append("HTTP error");
    return builder.toString();

}



String getUrlGeneralForm(LinkNodeLight link){
    String url = link.getUrl();
    if (url.endsWith("/")){
        url = url.substring(0, url.length() - 1);
    }
    return url;
}


private List<LinkNodeLight> parseAndWieghtResults(LinkNode inputLink) {
    List<LinkNodeLight> outputLinks = HTMLParser.parse(inputLink);
    if (inputLink.hasParseException()) {
        return outputLinks;
    } else {
        return URLWeight.weight(inputLink, outputLinks);
    }
}
}
package pkg.crawler;

import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.Writer;
import java.math.BigInteger;
import java.util.Formatter;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.concurrent.TimeUnit;
import java.util.logging.Logger;
import java.security.*;
import java.nio.file.Path;
import java.nio.file.Paths;


public class HTMLParser {

/** Timeout passed to the Jsoup connection when fetching a page: 30 seconds expressed in milliseconds. */
private static final int READ_TIMEOUT_IN_MILLISSECS = (int) TimeUnit.MILLISECONDS.convert(30, TimeUnit.SECONDS);
/** Number of pages saved so far per sanitised page title; used to number the downloaded files. */
private static HashMap <String, Integer> filecounter = new HashMap<> ();


/**
 * Fetches the page behind {@code inputLink}, stores a copy under {@code downloads/} and returns the
 * links found in its {@code a} and {@code area} elements.
 *
 * <p>Side effects on {@code inputLink}: IP address, size (length of the serialised HTML), status,
 * domain and file name are set on success; on an {@link IOException} the exception and status
 * ERROR are recorded and the links collected so far (normally none) are returned.
 *
 * @param inputLink link to fetch
 * @return links found on the page, each with {@code inputLink} as parent
 */
public static List<LinkNodeLight> parse(LinkNode inputLink){
    List<LinkNodeLight> outputLinks = new LinkedList<>();
    try {
        String url = inputLink.getUrl();
        inputLink.setIpAdress(IpFromUrl.getIp(url));
        /* The request goes to the host name. The former url.replace(host, ip) call discarded its
           result, so the resolved IP has only ever been recorded on the link, never connected to. */
        Document parsedResults =  Jsoup
                .connect(url)
                .timeout(READ_TIMEOUT_IN_MILLISSECS)
                .get();
        /* serialise the document once; it is needed for the size and for the saved copy */
        String html = parsedResults.html();
        inputLink.setSize(html.length());
        inputLink.setStatus(LinkNodeStatus.OK);
        inputLink.setDomain(URLWeight.getDomainName(url));

        String filename = nextDownloadFileName(parsedResults.title());
        inputLink.setFileName(filename);
        try (PrintWriter out = new PrintWriter(new BufferedWriter(new FileWriter(filename)))) {
            out.println("<!--" + url + "-->");
            out.print(html);
        } catch (IOException e) {
            /* saving the copy is best effort; the crawl continues without it */
            e.printStackTrace();
        }

        /* The selector text is also handed to toLinkNodeObject, which derives the attribute name
           from the part after '[' - keep both in step when changing these. */
        String[] selectors = {"a[href", "area[href"};
        for (String tag : selectors) {
            Elements tagElements = parsedResults.select(tag);
            outputLinks.addAll(toLinkNodeObject(inputLink, tagElements, tag));
        }
    } catch (IOException e) {
        inputLink.setParseException(e);
        inputLink.setStatus(LinkNodeStatus.ERROR);
    }

    return outputLinks;
}

/**
 * Builds the path of the next file to save for a page title: the title cut to 24 characters,
 * stripped of everything but word characters and whitespace, followed by a per-title sequence number.
 * The counter map is shared by all crawler threads, so it is updated under its own lock.
 */
private static String nextDownloadFileName(String title) {
    String base = title.length() > 24 ? title.substring(0, 24) : title;
    base = base.replaceAll("[^\\w\\d\\s]", "").trim();
    base = base.replaceAll("\\s+",  " ");
    int sequence;
    synchronized (filecounter) {
        Integer previous = filecounter.get(base);
        sequence = previous == null ? 1 : previous + 1;
        filecounter.put(base, sequence);
    }
    return Paths.get("downloads", base + "-" + sequence + ".html").toString();
}


/**
 * Turns the selected elements into {@code LinkNode}s that point to the absolute form of their link
 * attribute. Elements accepted by {@code isFragmentRef} and elements whose absolute URL is empty
 * are skipped.
 *
 * @param parentLink  page the elements were found on; set as parent of every created link
 * @param tagElements elements to convert
 * @param tag         selector the elements were found with; the text after {@code '['} (up to an
 *                    optional {@code ']'}) names the attribute to read, {@code href} if there is none
 * @return the created links in document order
 */
static List<LinkNode> toLinkNodeObject(LinkNode parentLink, Elements tagElements, String tag) {
    /* the attribute is the same for every element, so resolve it once */
    String attribute = "href";
    int open = tag.indexOf('[');
    if (open >= 0) {
        int close = tag.indexOf(']', open);
        attribute = tag.substring(open + 1, close >= 0 ? close : tag.length());
    }
    String absoluteRef = "abs:" + attribute;

    List<LinkNode> links = new LinkedList<>();
    for (Element element : tagElements) {
        if (isFragmentRef(element)) {
            continue;
        }
        String url = element.attr(absoluteRef);
        if (url != null && url.trim().length() > 0) {
            LinkNode link = new LinkNode(url);
            link.setTag(element.tagName());
            link.setParentLink(parentLink);
            links.add(link);
        }
    }
    return links;
}

/**
 * Tells whether the element's {@code href} should not be crawled because, ignoring surrounding
 * whitespace, it starts with {@code #} or with {@code mailto:}.
 */
static boolean isFragmentRef(Element element){
    String href = element.attr("href");
    if (href == null) {
        return false;
    }
    /* trim once so that both prefixes are tested against the same text */
    String trimmed = href.trim();
    return trimmed.startsWith("#") || trimmed.startsWith("mailto:");
}
package pkg.crawler;

import java.util.Date;

import org.joda.time.DateTime;
import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;


/** Formatting and time-difference helpers shared by the crawler classes. */
public class Util {

/** Pattern used for every timestamp printed by {@link #formatDate(DateTime)}. */
private static final DateTimeFormatter formatter = DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss:SSS");

/**
 * Renders a link as a tab separated line: URL, weight, enqueue time, dequeue time, milliseconds
 * spent in the queue and the parent URL (empty if the link has no parent).
 */
public static String linkToString(LinkNode inputLink){
    return String.format("%s\t%s\t%s\t%s\t%s\t%s",
            inputLink.getUrl(),
            inputLink.getWeight(),
            formatDate(inputLink.getEnqueTime()),
            formatDate(inputLink.getDequeTime()),
            /* dequeue time first: the helper subtracts its second argument from its first */
            differenceInMilliSeconds(inputLink.getDequeTime(), inputLink.getEnqueTime()),
            inputLink.getParentLink()==null?"":inputLink.getParentLink().getUrl()
    );
}

/**
 * Renders a failed link as a tab separated line: URL, weight, enqueue time, dequeue time, parent
 * URL (empty if the link has no parent) and the message of the parse exception. The link must carry
 * a parse exception.
 */
public static String linkToErrorString(LinkNode inputLink){
    return String.format("%s\t%s\t%s\t%s\t%s\t%s",
            inputLink.getUrl(),
            inputLink.getWeight(),
            formatDate(inputLink.getEnqueTime()),
            formatDate(inputLink.getDequeTime()),
            inputLink.getParentLink()==null?"":inputLink.getParentLink().getUrl(),
            inputLink.getParseException().getMessage()
    );
}

/** Formats a timestamp as {@code yyyy-MM-dd HH:mm:ss:SSS}. */
public static String formatDate(DateTime date){
    return formatter.print(date);
}

/** Returns {@code dequeTime - enqueTime} in milliseconds. */
public static long differenceInMilliSeconds(DateTime dequeTime, DateTime enqueTime){
    return (dequeTime.getMillis()- enqueTime.getMillis());
}

/** Returns the difference between the whole seconds of {@code dequeTime} and of {@code enqueTime}. */
public static int differenceInSeconds(Date enqueTime, Date dequeTime){
    return (int)((dequeTime.getTime()/1000) - (enqueTime.getTime()/1000));
}

/** Returns the difference between the whole minutes of {@code dequeTime} and of {@code enqueTime}. */
public static int differenceInMinutes(Date enqueTime, Date dequeTime){
    return (int)((dequeTime.getTime()/60000) - (enqueTime.getTime()/60000));
}

}
package pkg.crawler;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.regex.Pattern;

public class URLWeight {

public static List<LinkNodeLight> weight(LinkNode sourceLink, List<LinkNodeLight> links) {

    List<LinkNodeLight> interLinks = new LinkedList<>();
    List<LinkNodeLight> intraLinks = new LinkedList<>();

    for (LinkNodeLight link : links) {
        if (isIntraLink(sourceLink, link)) {
            intraLinks.add(link);
            link.setInterLinks(false);
        } else {
            interLinks.add(link);
            link.setInterLinks(true);
        }
    }



/**
 * Tells whether both links point to the same host, compared case-insensitively on the value
 * returned by {@code getHostName}.
 */
static boolean isIntraLink(LinkNodeLight sourceLink, LinkNodeLight link){
    String sourceHost = getHostName(sourceLink.getUrl());
    String targetHost = getHostName(link.getUrl());
    boolean sameHost = sourceHost.equalsIgnoreCase(targetHost);
    return sameHost;
}

/**
 * Extracts the host part of a URL: everything after the first {@code "://"} (or from the start if
 * there is none) up to the first {@code '/'}, {@code '?'} or {@code '#'}. A port, if present, is kept,
 * and so is a leading {@code www.}, because callers substitute the host inside the URL.
 *
 * @param url URL to inspect, may be {@code null}
 * @return the host part, or an empty string for {@code null}
 */
public static String getHostName(String url) {
    if (url == null) {
        return "";
    }

    int schemeEnd = url.indexOf("://");
    String rest = schemeEnd != -1 ? url.substring(schemeEnd + 3) : url;

    for (int i = 0; i < rest.length(); i++) {
        char c = rest.charAt(i);
        /* '#' ends the host as well: "http://host#top" has no path or query before the fragment */
        if (c == '/' || c == '?' || c == '#') {
            return rest.substring(0, i);
        }
    }
    return rest;
}
/**
 * Returns the last dot-separated label of the URL's host name (for {@code www.stanford.edu} that
 * is {@code edu}), or an empty string when the host consists of dots only.
 */
public static String getDomainName(String url) {
    String host = getHostName(url);
    String[] labels = host.split("\\.");
    int last = labels.length - 1;
    return last < 0 ? "" : labels[last];
}


}
package pkg.crawler;

import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

/** Hands ping tasks for links to a shared fixed-size thread pool. */
public class PingTaskManager {

/** Pool of 100 threads on which every submitted ping task runs. */
private static final ExecutorService executor = Executors.newFixedThreadPool(100);

/** Wraps the link in a {@code PingTaks} and submits it to the pool. */
public  static void ping (LinkNode e) {
    PingTaks task = new PingTaks(e);
    executor.submit(task);
}

}

/** Task that is meant to ping one link; the ping call itself is currently disabled. */
class PingTaks implements Runnable {
/** Link this task was created for. */
private final LinkNode link;

/**
 * @param link link to ping; stored so that {@link #run()} can use it once the ping is re-enabled
 */
public PingTaks( LinkNode link ) {
    this.link = link;
}

@Override
public void run() {
    /* link.ping(); */
}

}
package pkg.crawler;

/** Outcome of fetching and parsing a link. */
public enum LinkNodeStatus {
    /** The page was fetched and parsed. */
    OK,
    /** Fetching or parsing failed, or no status was recorded. */
    ERROR
}
package pkg.crawler;

import org.joda.time.DateTime;

/**
 * Minimal description of a link waiting in the crawl queue: its URL, its weight, the time it was
 * enqueued and whether it points to another host than the page it was found on. Instances order
 * themselves by descending weight.
 */
public class LinkNodeLight implements Comparable<LinkNodeLight> {
protected String url;
protected float weight;
protected DateTime enqueTime;
/** {@code true} for a link to a different host, {@code false} for a link within the same host. */
protected boolean interLinks;

public LinkNodeLight(String url) {
    this.url = url;
}

public String getUrl() {
    return url;
}

public float getWeight() {
    return weight;
}

public void setWeight(float weight) {
    this.weight = weight;
}

public DateTime getEnqueTime() {
    return enqueTime;
}

public void setEnqueTime(DateTime enqueTime) {
    this.enqueTime = enqueTime;
}

/** Returns whether this link points to a different host than its source page. */
public boolean isInterLinks() {
    return interLinks;
}

/** Marks this link as pointing to a different host ({@code true}) or to the same host ({@code false}). */
public void setInterLinks(boolean interLinks) {
    this.interLinks = interLinks;
}

/**
 * Orders links by descending weight, so the heaviest link sorts first. Uses
 * {@link Float#compare(float, float)} to stay a consistent total order even for NaN weights.
 */
@Override
public int compareTo(LinkNodeLight link) {
    return Float.compare(link.weight, this.weight);
}
}
package pkg.crawler;

import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.Socket;
import java.net.URL;
import java.net.UnknownHostException;
import java.util.Date;



import org.joda.time.DateTime;


/**
 * Link with everything recorded while it is crawled: the element and page it came from, timing,
 * fetch result and details of the downloaded page. Two links are equal when their URLs are equal.
 *
 * <p>The weight is the one inherited from {@code LinkNodeLight}; this class deliberately declares
 * no field of its own with that name, so {@link #toString()} reports the value set via
 * {@code setWeight}.
 */
public class LinkNode extends LinkNodeLight{
private String tag;
private LinkNode parentLink;
private IOException parseException = null; // stays null unless parsing failed
private DateTime dequeTime;
private DateTime startTime;
private DateTime endTime;
private LinkNodeStatus status;
private String ipAdress;
private int size;
private String filename;
private String domain;

public LinkNode(String url) {
    super(url);
}

public DateTime getStartTime() {
    return startTime;
}

public void setStartTime(DateTime startTime) {
    this.startTime = startTime;
}

public DateTime getEndTime() {
    return endTime;
}

public void setEndTime(DateTime endTime) {
    this.endTime = endTime;
}

public DateTime getDequeTime() {
    return dequeTime;
}

public void setDequeTime(DateTime dequeTime) {
    this.dequeTime = dequeTime;
}

public String getTag() {
    return tag;
}

public void setTag(String tag) {
    this.tag = tag;
}

/** Returns the link of the page this link was found on, or {@code null} if none is set. */
public LinkNode getParentLink() {
    return parentLink;
}

public void setParentLink(LinkNode parentLink) {
    this.parentLink = parentLink;
}

/** Returns the exception recorded while parsing, or {@code null} if there was none. */
public Exception getParseException() {
    return parseException;
}

public boolean hasParseException(){
    return parseException!=null;
}

public void setParseException(IOException parseException) {
    this.parseException = parseException;
}

/** Two links are equal when they are of the same class and have equal URLs. */
@Override
public boolean equals(Object o) {
    if (this == o) {
        return true;
    }
    if (o == null || getClass() != o.getClass()) {
        return false;
    }
    LinkNode other = (LinkNode) o;
    return url != null ? url.equals(other.url) : other.url == null;
}

@Override
public int hashCode() {
    return url != null ? url.hashCode() : 0;
}

/** Milliseconds between enqueue and dequeue; both times must be set. */
public long waitingInQueue(){
    return Util.differenceInMilliSeconds( dequeTime,enqueTime );
}

/** Milliseconds between start and end of processing; both times must be set. */
public long linkProcessingDuration(){
    return Util.differenceInMilliSeconds( endTime,startTime );
}

@Override
public String toString() {
    StringBuilder sb = new StringBuilder("LinkNode{");
    sb.append("url='").append(url).append('\'');
    sb.append(", score=").append(weight);
    sb.append(", enqueTime=").append(enqueTime);
    sb.append(", dequeTime=").append(dequeTime);
    sb.append(", tag=").append(tag);
    if(parentLink!=null) {
        sb.append(", parentLink=").append(parentLink.getUrl());
    }
    sb.append('}');
    return sb.toString();
}

public void setStatus(LinkNodeStatus status) {
    this.status = status;
}

/** Returns the status; a link without a recorded status is treated as, and switched to, ERROR. */
public LinkNodeStatus getStatus(){
    if (status == null) {
        status = LinkNodeStatus.ERROR;
    }
    return status;
}

public String getIpAdress() {
    return ipAdress;
}

public void setIpAdress(String ipAdress) {
    this.ipAdress = ipAdress;
}

/* methods for controlling url size */
public void setSize(int size) {
    this.size = size;
}

public int getSize() {
    return this.size;
}

public void setFileName(String filename) {
    this.filename = filename;
}

public String getFileName() {
    return this.filename;
}

public String getDomain() {
    return domain;
}

public void setDomain(String domain) {
    this.domain = domain;
}
}
URLWeight.java

 package pkg.crawler;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.net.MalformedURLException;
import java.net.SocketTimeoutException;
import java.util.*;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.PriorityBlockingQueue;
import java.util.concurrent.TimeUnit;

import org.jsoup.HttpStatusException;
import org.jsoup.UnsupportedMimeTypeException;
import org.joda.time.DateTime;


public class WebCrawler {

public static Queue <LinkNodeLight> queue = new PriorityBlockingQueue <> (); // priority queue
public static final int n_threads = 5;                                 // amount of threads
private static Set<String> processed = new LinkedHashSet <> ();         // set of processed urls
private PrintWriter out;                                                // output file
private PrintWriter err;                                                // error file
private static Integer cntIntra = new Integer (0);                              // counters for intra- links in the queue
private static Integer cntInter = new Integer (0);                              // counters for inter- links in the queue
private static Integer dub = new Integer (0);                                   // amount of skipped urls

public static void main(String[] args) throws Exception {
    System.out.println("Running web crawler: " + new Date());

    WebCrawler webCrawler = new WebCrawler();
    webCrawler.createFiles();
    try (Scanner in = new Scanner(new File ("seeds.txt"))) {
        while (in.hasNext()) {
            webCrawler.enque(new LinkNode (in.nextLine().trim()));
        }
    } catch (IOException e) {
        e.printStackTrace();
        return;
    }
    webCrawler.processQueue();
    webCrawler.out.close();
    webCrawler.err.close();
}

public void processQueue(){
    /* run in threads */
    Runnable r = new Runnable() {
        @Override 
        public void run() {
            /* queue may be empty but process is not finished, that's why we need to check if any links are being processed */
            while (true) {
                LinkNode link = deque();
                if (link == null)
                    continue;
                link.setStartTime(new DateTime());
                boolean process = processLink(link);
                link.setEndTime(new DateTime());
                if (!process)
                    continue;
                /* print the data to the csv file */
                if (link.getStatus() != null && link.getStatus().equals(LinkNodeStatus.OK)) {
                    synchronized(out) {
                        out.println(getOutputLine(link));
                        out.flush();
                    }
                } else {
                    synchronized(err) {
                        err.println(getOutputLine(link));
                        err.flush();
                    }
                }
            }
        }
    };
    /* run n_threads threads which perform dequeue and process */
    LinkedList <Thread> threads = new LinkedList <> ();
    for (int i = 0; i < n_threads; i++) {
        threads.add(new Thread(r));
        threads.getLast().start();
    }
    for (Thread thread : threads) {
        try {
            thread.join();
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
    }
}


/* returns true if link was actually processed */
private boolean processLink(LinkNode inputLink) {
    String url = getUrlGeneralForm(inputLink);
    boolean process = true;
    synchronized (processed) {
        if (processed.contains(url)) {
            process = false;
            synchronized (dub) {dub++;}
        } else
            processed.add(url);
    }
    /* start processing only if the url have not been processed yet or not being processed */
    if (process) {
        System.out.println("Processing url " + url);
        List<LinkNodeLight> outputLinks = parseAndWieghtResults(inputLink);
        for (LinkNodeLight outputLink : outputLinks) {
            String getUrlGeneralForumOutput = getUrlGeneralForm(outputLink);
            /* add the new link to the queue only if it has not been processed yet */
            process = true;
            synchronized (processed) {
                if (processed.contains(getUrlGeneralForumOutput)) {
                    process = false;
                    synchronized (dub) {dub++;}
                }
            }
            if (process) {
                enque(outputLink);
            }
        }
        return true;
    }
    return false;
}

void enque(LinkNodeLight link){
    link.setEnqueTime(new DateTime());
    /* the add method requires implicit priority */
    synchronized (queue) {
        if (link.interLinks)
            synchronized (cntInter) {cntInter++;}
        else
            synchronized (cntIntra) {cntIntra++;}
      //queue.add(link, 100 - (int)(link.getWeight() * 100.f));
        queue.add(link);
    }
}


/**
 * Picks an element from the queue
 * @return top element from the queue or null if the queue is empty
 */
LinkNode deque(){
    /* link must be checked */
    LinkNode link = null;
    synchronized (queue) {
        link = (LinkNode) queue.poll();
        if (link != null) {
            link.setDequeTime(new DateTime());
            if (link.isInterLinks())
                synchronized (cntInter) {cntInter--;}
            else
                synchronized (cntIntra) {cntIntra--;}
        }
    }
    return link;
}

private void createFiles() {
    /* create output file */
    try {
        out = new PrintWriter(new BufferedWriter(new FileWriter("CrawledURLS.csv", false)));
        out.println(generateHeaderFile());
    } catch (IOException e) {
        System.err.println(e);
    }
    /* create error file */
    try {
        err = new PrintWriter(new BufferedWriter(new FileWriter("CrawledURLSERROR.csv", false)));
        err.println(generateHeaderFile());
    } catch (IOException e) {
        System.err.println(e);
    }
}
/**
 * formats the string so it can be valid entry in csv file
 * @param s
 * @return
 */
private static String format(String s) {
    // replace " by ""
    String ret = s.replaceAll("\"", "\"\"");
    // put string into quotes
    return "\"" + ret + "\"";
}
/**
 * Creates the line that needs to be written in the outputfile
 * @param link
 * @return
 */
public static String getOutputLine(LinkNode link){
    StringBuilder builder = new StringBuilder();
    builder.append(link.getParentLink()!=null ? format(link.getParentLink().getUrl()) : "");
    builder.append(",");
    builder.append(link.getParentLink()!=null ? link.getParentLink().getIpAdress() : "");
    builder.append(",");
    builder.append(link.getParentLink()!=null ? link.getParentLink().linkProcessingDuration() : "");
    builder.append(",");
    builder.append(format(link.getUrl()));
    builder.append(",");
    builder.append(link.getDomain());
    builder.append(",");
    builder.append(link.isInterLinks());
    builder.append(",");
    builder.append(Util.formatDate(link.getEnqueTime()));
    builder.append(",");
    builder.append(Util.formatDate(link.getDequeTime()));
    builder.append(",");
    builder.append(link.waitingInQueue());
    builder.append(",");
    builder.append(queue.size());
    /* Inter and intra links in queue */
    builder.append(",");
    builder.append(cntIntra.toString());
    builder.append(",");
    builder.append(cntInter.toString());
    builder.append(",");
    builder.append(dub);
    builder.append(",");
    builder.append(new Date ());
    /* URL size*/
    builder.append(",");
    builder.append(link.getSize());
    /* HTML file
    builder.append(",");
    builder.append(link.getFileName());*/
    /* add HTTP error */
    builder.append(",");
    if (link.getParseException() != null) {
        if (link.getParseException() instanceof HttpStatusException)
            builder.append(((HttpStatusException) link.getParseException()).getStatusCode());
        if (link.getParseException() instanceof SocketTimeoutException)
            builder.append("Time out");
        if (link.getParseException() instanceof MalformedURLException)
            builder.append("URL is not valid");
        if (link.getParseException() instanceof UnsupportedMimeTypeException)
            builder.append("Unsupported mime type: " + ((UnsupportedMimeTypeException)link.getParseException()).getMimeType());
    }
    return builder.toString();

}

/**
 * Builds the header row of the CSV report files.
 *
 * @return the column titles joined by commas, without a line terminator
 */
private String generateHeaderFile(){
    final String[] titles = {
        "Seed URL",
        "Seed IP",
        "Process Duration",
        "Link URL",
        "Link domain",
        "Link IP",
        "Enque Time",
        "Deque Time",
        "Waiting in the Queue",
        "QueueSize",
        "Intra in queue",
        "Inter in queue",
        "Dublications skipped",
        /* time was printed, but no header was */
        "Time",
        /* URL size */
        "Size bytes",
        /* HTTP errors */
        "HTTP error"
    };
    StringBuilder header = new StringBuilder();
    for (int i = 0; i < titles.length; i++) {
        if (i > 0) {
            header.append(",");
        }
        header.append(titles[i]);
    }
    return header.toString();
}



/**
 * Returns the URL of the link in the form used for duplicate detection:
 * a single trailing slash, if present, is dropped.
 *
 * @param link the link whose URL is normalised
 * @return the URL without one trailing "/"
 */
String getUrlGeneralForm(LinkNodeLight link){
    final String raw = link.getUrl();
    return raw.endsWith("/") ? raw.substring(0, raw.length() - 1) : raw;
}


/**
 * Parses the page behind the link and, when parsing succeeded, passes the found
 * links through {@code URLWeight.weight}.
 *
 * @param inputLink the link to download and parse
 * @return the links returned by the parser, weighted unless a parse exception was recorded
 */
private List<LinkNodeLight> parseAndWieghtResults(LinkNode inputLink) {
    final List<LinkNodeLight> parsed = HTMLParser.parse(inputLink);
    if (!inputLink.hasParseException()) {
        return URLWeight.weight(inputLink, parsed);
    }
    // parsing failed: hand back whatever the parser produced, unweighted
    return parsed;
}
}
package pkg.crawler;

import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.Writer;
import java.math.BigInteger;
import java.util.Formatter;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.concurrent.TimeUnit;
import java.util.logging.Logger;
import java.security.*;
import java.nio.file.Path;
import java.nio.file.Paths;


public class HTMLParser {

private static final int READ_TIMEOUT_IN_MILLISSECS = (int) TimeUnit.MILLISECONDS.convert(30, TimeUnit.SECONDS);
private static HashMap <String, Integer> filecounter = new HashMap<> ();


/**
 * Downloads the page behind {@code inputLink}, stores a copy of its HTML in the
 * {@code downloads} directory and collects the links found in {@code a} and
 * {@code area} elements.
 *
 * On success the link gets its IP address, size, domain, file name and status
 * {@code OK}. When the download fails with an {@link IOException} (including a
 * malformed URL) the exception is stored on the link and the status is set to
 * {@code ERROR}. A failure to write the local copy is only printed.
 *
 * This method is called from several crawler threads at once, so the shared
 * file-name counter is only touched while holding its lock.
 *
 * @param inputLink the link to download; it is updated in place
 * @return the outgoing links found on the page, empty when the download failed
 */
public static List<LinkNodeLight> parse(LinkNode inputLink){
    List<LinkNodeLight> outputLinks = new LinkedList<>();
    try {
        inputLink.setIpAdress(IpFromUrl.getIp(inputLink.getUrl()));
        // The page is requested by host name. The previous code called
        // url.replace(host, ip) and discarded the result, so the request URL
        // never changed; that dead call is gone.
        Document parsedResults = fetch(inputLink.getUrl());
        // Serialise the document once: html() builds a new, page-sized string on every call.
        String html = parsedResults.html();
        inputLink.setSize(html.length());
        inputLink.setStatus(LinkNodeStatus.OK);
        inputLink.setDomain(URLWeight.getDomainName(inputLink.getUrl()));

        /* save the page to an html file named after its title */
        String filename = nextFileName(parsedResults.title());
        inputLink.setFileName(filename);
        File parent = new File(filename).getParentFile();
        if (parent != null) {
            parent.mkdirs(); // without the directory every write fails
        }
        // try-with-resources flushes and closes the writer
        try (PrintWriter out = new PrintWriter(new BufferedWriter(new FileWriter(filename)))) {
            out.println("<!--" + inputLink.getUrl() + "-->");
            out.print(html);
        } catch (IOException e) {
            e.printStackTrace();
        }

        // Well-formed selectors are used for the query. The bracket-less form is still
        // what is handed to toLinkNodeObject, which reads the attribute name after '['.
        outputLinks.addAll(toLinkNodeObject(inputLink, parsedResults.select("a[href]"), "a[href"));
        outputLinks.addAll(toLinkNodeObject(inputLink, parsedResults.select("area[href]"), "area[href"));
    } catch (IOException e) {
        inputLink.setParseException(e);
        inputLink.setStatus(LinkNodeStatus.ERROR);
    }

    return outputLinks;
}

/* Fetches the document; a URL rejected by Jsoup.connect is reported as an IOException. */
private static Document fetch(String url) throws IOException {
    try {
        return Jsoup
                .connect(url)
                .timeout(READ_TIMEOUT_IN_MILLISSECS)
                .get();
    } catch (IllegalArgumentException e) {
        java.net.MalformedURLException malformed = new java.net.MalformedURLException(e.getMessage());
        malformed.initCause(e);
        throw malformed;
    }
}

/* Builds "downloads/<sanitised title>-<n>.html", n counting the pages seen with that title. */
private static String nextFileName(String title) {
    String base = title;
    if (base.length() > 24) {
        base = base.substring(0, 24);
    }
    base = base.replaceAll("[^\\w\\d\\s]", "").trim();
    base = base.replaceAll("\\s+",  " ");

    int count;
    synchronized (filecounter) {
        Integer previous = filecounter.get(base);
        count = previous == null ? 1 : previous + 1;
        filecounter.put(base, count);
    }
    return Paths.get("downloads", base + "-" + count + ".html").toString();
}


/**
 * Turns the selected elements into link nodes that point back to their parent.
 *
 * The attribute to read is taken from the selector text after '[' (for example
 * "a[href" and "a[href]" both yield "href"); without a bracket "href" is used.
 * The value is read through jsoup's "abs:" prefix. Elements rejected by
 * {@code isFragmentRef} and elements with a blank URL are skipped.
 *
 * @param parentLink  the page the elements were found on
 * @param tagElements the elements to convert
 * @param tag         the selector that produced the elements
 * @return one link node per usable element, in document order
 */
static List<LinkNode> toLinkNodeObject(LinkNode parentLink, Elements tagElements, String tag) {
    // The attribute name depends only on the selector, so work it out once.
    String attribute = "href";
    int open = tag.indexOf('[');
    if (open >= 0) {
        attribute = tag.substring(open + 1);
        int close = attribute.indexOf(']');
        if (close >= 0) {
            attribute = attribute.substring(0, close);
        }
    }
    String absoluteRef = "abs:" + attribute;

    List<LinkNode> links = new LinkedList<>();
    for (Element element : tagElements) {
        if (isFragmentRef(element)) {
            continue;
        }
        String url = element.attr(absoluteRef);
        if (url != null && url.trim().length() > 0) {
            LinkNode link = new LinkNode(url);
            link.setTag(element.tagName());
            link.setParentLink(parentLink);
            links.add(link);
        }
    }
    return links;
}

/**
 * Tells whether the element's href should not be crawled because it is an in-page
 * fragment ("#...") or a mail address ("mailto:...").
 *
 * Leading whitespace is ignored for both checks and the scheme is matched
 * case-insensitively.
 *
 * @param element the element whose href attribute is inspected
 * @return true for fragment and mailto references
 */
static boolean isFragmentRef(Element element){
    String href = element.attr("href");
    if (href == null) {
        return false;
    }
    String trimmed = href.trim();
    return trimmed.startsWith("#")
            || trimmed.regionMatches(true, 0, "mailto:", 0, "mailto:".length());
}
package pkg.crawler;

import java.util.Date;

import org.joda.time.DateTime;
import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;


/**
 * Formatting and time-difference helpers for the crawler's log lines.
 */
public class Util {

/* shared formatter for all timestamps written by the crawler */
private static final DateTimeFormatter formatter = DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss:SSS");


/**
 * Renders a link as a tab-separated line: URL, weight, enqueue time, dequeue time,
 * milliseconds spent in the queue and parent URL (empty when there is no parent).
 *
 * @param inputLink the link to render
 * @return the tab-separated line
 */
public static String linkToString(LinkNode inputLink){
    return String.format("%s\t%s\t%s\t%s\t%s\t%s",
            inputLink.getUrl(),
            inputLink.getWeight(),
            formatDate(inputLink.getEnqueTime()),
            formatDate(inputLink.getDequeTime()),
            // dequeue time first: the helper subtracts its second argument from its first
            differenceInMilliSeconds(inputLink.getDequeTime(), inputLink.getEnqueTime()),
            inputLink.getParentLink()==null?"":inputLink.getParentLink().getUrl()
    );
}

/**
 * Renders a failed link as a tab-separated line: URL, weight, enqueue time, dequeue
 * time, parent URL (empty when there is no parent) and the message of the stored
 * exception (empty when no exception is stored).
 *
 * @param inputLink the link to render
 * @return the tab-separated line
 */
public static String linkToErrorString(LinkNode inputLink){
    Exception failure = inputLink.getParseException();
    return String.format("%s\t%s\t%s\t%s\t%s\t%s",
            inputLink.getUrl(),
            inputLink.getWeight(),
            formatDate(inputLink.getEnqueTime()),
            formatDate(inputLink.getDequeTime()),
            inputLink.getParentLink()==null?"":inputLink.getParentLink().getUrl(),
            failure == null ? "" : failure.getMessage()
    );
}


/**
 * Formats a timestamp with the pattern "yyyy-MM-dd HH:mm:ss:SSS".
 *
 * @param date the timestamp to format
 * @return the formatted timestamp
 */
public static String formatDate(DateTime date){
    return formatter.print(date);
}

/**
 * @param dequeTime the later instant
 * @param enqueTime the earlier instant
 * @return {@code dequeTime - enqueTime} in milliseconds
 */
public static long differenceInMilliSeconds(DateTime dequeTime, DateTime enqueTime){
    return (dequeTime.getMillis()- enqueTime.getMillis());
}

/**
 * @param enqueTime the earlier instant
 * @param dequeTime the later instant
 * @return the difference between the whole epoch seconds of both instants
 */
public static int differenceInSeconds(Date enqueTime, Date dequeTime){
    return (int)((dequeTime.getTime()/1000) - (enqueTime.getTime()/1000));
}

/**
 * @param enqueTime the earlier instant
 * @param dequeTime the later instant
 * @return the difference between the whole epoch minutes of both instants
 */
public static int differenceInMinutes(Date enqueTime, Date dequeTime){
    return (int)((dequeTime.getTime()/60000) - (enqueTime.getTime()/60000));
}

}
package pkg.crawler;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.regex.Pattern;

public class URLWeight {

/**
 * Splits the links found on a page into intra-site links (same host name as
 * {@code sourceLink}, see {@code isIntraLink}) and inter-site links, and records
 * the result on each link through {@code setInterLinks}.
 *
 * NOTE(review): the listing of this method stops after the loop. No return
 * statement and no closing brace are visible, and the two local lists are filled
 * but never read in the visible part. The remainder (presumably the actual
 * weighting) appears to be missing from this copy and must be restored before
 * the class can compile.
 *
 * @param sourceLink the page the links were found on
 * @param links      the links found on that page
 */
public static List<LinkNodeLight> weight(LinkNode sourceLink, List<LinkNodeLight> links) {

    List<LinkNodeLight> interLinks = new LinkedList<>();
    List<LinkNodeLight> intraLinks = new LinkedList<>();

    for (LinkNodeLight link : links) {
        if (isIntraLink(sourceLink, link)) {
            // same host as the source page
            intraLinks.add(link);
            link.setInterLinks(false);
        } else {
            // different host
            interLinks.add(link);
            link.setInterLinks(true);
        }
    }



/**
 * Tells whether two links point to the same host, ignoring case.
 *
 * @param sourceLink the page the link was found on
 * @param link       the link found on that page
 * @return true when both URLs have the same host name
 */
static boolean isIntraLink(LinkNodeLight sourceLink, LinkNodeLight link){
    return getHostName(sourceLink.getUrl())
            .equalsIgnoreCase(getHostName(link.getUrl()));
}

/**
 * Extracts the host part of a URL: the text after the first "://" (or from the
 * start when there is none) up to the first '/', '?' or '#'.
 *
 * A leading "www." is kept on purpose (the host is used for replacements with the
 * IP address), and a port, if present, stays part of the result.
 *
 * @param url the URL, may be null
 * @return the host name, or an empty string for a null URL
 */
public static String getHostName(String url) {
    if (url == null) {
        return "";
    }

    int schemeEnd = url.indexOf("://");
    int start = schemeEnd == -1 ? 0 : schemeEnd + 3;

    int end = url.length();
    for (int i = start; i < end; i++) {
        char c = url.charAt(i);
        // '#' ends the host as well: "http://a.com#top" must yield "a.com"
        if (c == '/' || c == '?' || c == '#') {
            end = i;
            break;
        }
    }
    return url.substring(start, end);
}
/**
 * Returns the last dot-separated label of the URL's host name (for
 * "http://www.stanford.edu/x" that is "edu").
 *
 * @param url the URL, may be null
 * @return the last label of the host name, or an empty string when there is none
 */
public static String getDomainName(String url) {
    final String[] labels = getHostName(url).split("\\.");
    return labels.length == 0 ? "" : labels[labels.length - 1];
}


}
package pkg.crawler;

import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

/**
 * Hands ping tasks to a shared pool of 100 worker threads.
 */
public class PingTaskManager {

/* one pool for the whole program, created when the class is loaded */
private static ExecutorService executor = Executors.newFixedThreadPool(100);

/**
 * Queues a ping task for the given link on the shared pool.
 *
 * @param e the link to ping
 */
public  static void ping (LinkNode e) {
    Runnable task = new PingTaks(e);
    executor.submit(task);
}

}

/**
 * Task that is meant to ping one link. The ping call itself is disabled, so
 * running the task currently does nothing.
 */
class PingTaks implements Runnable {
private final LinkNode link;

/**
 * @param link the link this task refers to
 */
public PingTaks( LinkNode link ) {
    // the constructor used to drop its argument, leaving the field null
    this.link = link;
}

@Override
public void run() {
    /* link.ping(); */
}

}
package pkg.crawler;

/**
 * Outcome of downloading and parsing a link.
 */
public enum LinkNodeStatus {
    /** the page was downloaded and parsed */
    OK,
    /** the download or parsing failed */
    ERROR;
}
package pkg.crawler;

import org.joda.time.DateTime;

/**
 * Minimal description of a link waiting in the crawl queue: its URL, its weight,
 * the time it was enqueued and whether it leads to another host.
 *
 * The natural order puts links with a higher weight first, so a priority queue
 * hands out the heaviest link next. The order looks at the weight only.
 */
public class LinkNodeLight implements Comparable<LinkNodeLight> {
protected String url;
protected float weight;
protected DateTime enqueTime;
protected boolean interLinks;

/**
 * @param url the address this link points to
 */
public LinkNodeLight(String url) {
    this.url = url;
}

public String getUrl() {
    return url;
}

public float getWeight() {
    return weight;
}

public void setWeight(float weight) {
    this.weight = weight;
}

public DateTime getEnqueTime() {
    return enqueTime;
}

public void setEnqueTime(DateTime enqueTime) {
    this.enqueTime = enqueTime;
}

/**
 * Accessor for the {@code interLinks} flag; other classes of the crawler call it,
 * but it was missing from this class.
 *
 * @return true when the link leads to a different host than the page it was found on
 */
public boolean isInterLinks() {
    return interLinks;
}

/**
 * @param interLinks true when the link leads to a different host than the page it was found on
 */
public void setInterLinks(boolean interLinks) {
    this.interLinks = interLinks;
}

/**
 * Orders links by descending weight.
 *
 * @return a negative number when this link is heavier, a positive number when it
 *         is lighter, 0 otherwise
 */
@Override
public int compareTo(LinkNodeLight link) {
    if (this.weight < link.weight) {
        return 1;
    } else if (this.weight > link.weight) {
        return -1;
    }
    return 0;
}
}
package pkg.crawler;

import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.Socket;
import java.net.URL;
import java.net.UnknownHostException;
import java.util.Date;



import org.joda.time.DateTime;


/**
 * A link together with everything recorded while it is crawled: the tag it came
 * from, the page it was found on, timing information, the download outcome and
 * details of the downloaded page.
 *
 * Two nodes are equal when their URLs are equal.
 *
 * Note on memory: every node references its parent node, so a node that is still
 * waiting in the queue keeps its whole chain of ancestors reachable.
 */
public class LinkNode extends LinkNodeLight{
public LinkNode(String url) {
    super(url);
}

private String tag;
private LinkNode parentLink;
private IOException parseException = null; // initialize parse Exception with null
// No own "weight" field: the earlier private one shadowed the inherited field that
// getWeight/setWeight use, so toString always printed 0.0.
private DateTime dequeTime;
private DateTime startTime;
private DateTime endTime;
private LinkNodeStatus status;
private String ipAdress;
private int size;
private String filename;
private String domain;

public DateTime getStartTime() {
    return startTime;
}

public void setStartTime(DateTime startTime) {
    this.startTime = startTime;
}

public DateTime getEndTime() {
    return endTime;
}

public void setEndTime(DateTime endTime) {
    this.endTime = endTime;
}

public DateTime getDequeTime() {
    return dequeTime;
}

public String getTag() {
    return tag;
}

public LinkNode getParentLink() {
    return parentLink;
}

public Exception getParseException() {
    return parseException;
}

public boolean hasParseException(){
    return parseException!=null;
}


public void setDequeTime(DateTime dequeTime) {
    this.dequeTime = dequeTime;
}

public void setTag(String tag) {
    this.tag = tag;
}

public void setParentLink(LinkNode parentLink) {
    this.parentLink = parentLink;
}

public void setParseException(IOException parseException) {
    this.parseException = parseException;
}

/** Nodes are equal when they are of the same class and have equal URLs. */
@Override
public boolean equals(Object o) {
    if (this == o) {
        return true;
    }
    if (o == null || getClass() != o.getClass()) {
        return false;
    }

    LinkNode link = (LinkNode) o;

    if (url != null ? !url.equals(link.url) : link.url != null) {
        return false;
    }

    return true;
}

@Override
public int hashCode() {
    return url != null ? url.hashCode() : 0;
}

/**
 * @return milliseconds between enqueueing and dequeueing; both times must be set
 */
public long waitingInQueue(){
    return Util.differenceInMilliSeconds( dequeTime,enqueTime );
}

/**
 * @return milliseconds between start and end of processing; both times must be set
 */
public long linkProcessingDuration(){
    return Util.differenceInMilliSeconds( endTime,startTime );
}

@Override
public String toString() {
    StringBuilder sb = new StringBuilder("LinkNode{");
    sb.append("url='").append(url).append('\'');
    sb.append(", score=").append(weight);
    sb.append(", enqueTime=").append(enqueTime);
    sb.append(", dequeTime=").append(dequeTime);
    sb.append(", tag=").append(tag);
    if(parentLink!=null) {
        sb.append(", parentLink=").append(parentLink.getUrl());
    }
    sb.append('}');
    return sb.toString();
}

public void setStatus(LinkNodeStatus status) {
    this.status = status;
}

/**
 * @return the status; a link whose status was never set counts as ERROR (and the
 *         field is set to ERROR by this call)
 */
public LinkNodeStatus getStatus(){
    if (status == null) {
        status = LinkNodeStatus.ERROR;
    }
    return status;
}

// check server link is it exist or not
/* this method gives fake errors
public LinkNodeStatus ping () {

    boolean reachable = false;
    String sanitizeUrl = url.replaceFirst("^https", "http");

    try {
        HttpURLConnection connection = (HttpURLConnection) new URL(sanitizeUrl).openConnection();
        connection.setConnectTimeout(1000);
        connection.setRequestMethod("HEAD");
        int responseCode = connection.getResponseCode();
        System.err.println(url + " " + responseCode);
        reachable = (200 <= responseCode && responseCode <= 399);
    } catch (IOException exception) {
    }
    return reachable?LinkNodeStatus.OK: LinkNodeStatus.ERROR;
}*/


public String getIpAdress() {
    return ipAdress;
}

public void setIpAdress(String ipAdress) {
    this.ipAdress = ipAdress;
}

/* methods for controlling url size */
public void setSize(int size) {
    this.size = size;
}

public int getSize() {
    return this.size;
}

public void setFileName(String filename) {
    this.filename = filename;
}

public String getFileName() {
    return this.filename;
}

public String getDomain() {
    return domain;
}

public void setDomain(String domain) {
    this.domain = domain;
}
}
LinkNodeStatus.java

 package pkg.crawler;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.net.MalformedURLException;
import java.net.SocketTimeoutException;
import java.util.*;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.PriorityBlockingQueue;
import java.util.concurrent.TimeUnit;

import org.jsoup.HttpStatusException;
import org.jsoup.UnsupportedMimeTypeException;
import org.joda.time.DateTime;


/**
 * Multi-threaded crawler: reads seed URLs from "seeds.txt", keeps the links to visit
 * in a priority queue and lets {@link #n_threads} workers download them. One CSV
 * line per processed link goes to "CrawledURLS.csv" (successful downloads) or
 * "CrawledURLSERROR.csv" (failed ones).
 *
 * Memory behaviour: a URL is put into the queue only when it has not been processed
 * and is not already waiting, so the queue holds at most one node per URL found on
 * crawled pages. The set of processed URLs and the queue of URLs still to visit are
 * not bounded otherwise and keep growing for as long as new URLs are discovered.
 */
public class WebCrawler {

public static Queue <LinkNodeLight> queue = new PriorityBlockingQueue <> (); // priority queue
public static final int n_threads = 5;                                 // amount of threads
private static final long IDLE_SLEEP_MILLIS = 100;                     // pause of a worker that found the queue empty
/* urls (general form) whose processing has started; guarded by synchronized (processed) */
private static final Set<String> processed = new HashSet <> ();
/* urls (general form) currently waiting in the queue; guarded by synchronized (processed) */
private static final Set<String> queued = new HashSet <> ();
private PrintWriter out;                                                // output file
private PrintWriter err;                                                // error file
/* Atomic counters. The previous boxed Integers were replaced by "++" on every
   update, so the locks taken on them did not protect anything. */
private static final java.util.concurrent.atomic.AtomicInteger cntIntra =
        new java.util.concurrent.atomic.AtomicInteger();                // intra- links in the queue
private static final java.util.concurrent.atomic.AtomicInteger cntInter =
        new java.util.concurrent.atomic.AtomicInteger();                // inter- links in the queue
private static final java.util.concurrent.atomic.AtomicInteger dub =
        new java.util.concurrent.atomic.AtomicInteger();                // amount of skipped urls

/**
 * Starts the crawl: creates the report files, enqueues every non-blank line of
 * "seeds.txt" and runs the workers.
 *
 * @param args not used
 * @throws Exception when a report file cannot be created
 */
public static void main(String[] args) throws Exception {
    System.out.println("Running web crawler: " + new Date());

    WebCrawler webCrawler = new WebCrawler();
    webCrawler.createFiles();
    try {
        try (Scanner in = new Scanner(new File ("seeds.txt"))) {
            while (in.hasNextLine()) {
                String seed = in.nextLine().trim();
                if (!seed.isEmpty()) {   // a blank line is not a url
                    webCrawler.enque(new LinkNode (seed));
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
            return;
        }
        webCrawler.processQueue();
    } finally {
        webCrawler.out.close();
        webCrawler.err.close();
    }
}

/**
 * Runs {@link #n_threads} workers that take links from the queue and process them,
 * and waits for them. The workers run until they are interrupted; interrupting the
 * thread that waits here interrupts the workers as well.
 */
public void processQueue(){
    /* run in threads */
    Runnable r = new Runnable() {
        @Override
        public void run() {
            /* queue may be empty but process is not finished, that's why an idle worker waits and polls again */
            while (!Thread.currentThread().isInterrupted()) {
                LinkNode link = deque();
                if (link == null) {
                    try {
                        Thread.sleep(IDLE_SLEEP_MILLIS);   // do not spin on an empty queue
                    } catch (InterruptedException e) {
                        Thread.currentThread().interrupt();
                    }
                    continue;
                }
                try {
                    crawl(link);
                } catch (RuntimeException e) {
                    /* one bad link must not end the worker */
                    System.err.println("Failed to process " + link.getUrl() + ": " + e);
                }
            }
        }
    };
    /* run n_threads threads which perform dequeue and process */
    LinkedList <Thread> threads = new LinkedList <> ();
    for (int i = 0; i < n_threads; i++) {
        threads.add(new Thread(r));
        threads.getLast().start();
    }
    for (Thread thread : threads) {
        try {
            thread.join();
        } catch (InterruptedException e) {
            for (Thread worker : threads) {
                worker.interrupt();
            }
            Thread.currentThread().interrupt();
            return;
        }
    }
}

/* processes one dequeued link and writes its line to the matching csv file */
private void crawl(LinkNode link) {
    link.setStartTime(new DateTime());
    boolean process;
    try {
        process = processLink(link);
    } finally {
        link.setEndTime(new DateTime());
    }
    if (!process) {
        return;
    }
    /* print the data to the csv file */
    PrintWriter target = link.getStatus() == LinkNodeStatus.OK ? out : err;
    String line = getOutputLine(link);
    synchronized (target) {
        target.println(line);
        target.flush();
    }
}


/* returns true if link was actually processed */
private boolean processLink(LinkNode inputLink) {
    String url = getUrlGeneralForm(inputLink);
    synchronized (processed) {
        queued.remove(url);
        if (!processed.add(url)) {
            /* the url has been processed or is being processed */
            dub.incrementAndGet();
            return false;
        }
    }
    System.out.println("Processing url " + url);
    List<LinkNodeLight> outputLinks = parseAndWieghtResults(inputLink);
    for (LinkNodeLight outputLink : outputLinks) {
        String candidate = getUrlGeneralForm(outputLink);
        /* add the new link to the queue only if it has not been processed yet and is not waiting already */
        boolean fresh;
        synchronized (processed) {
            fresh = !processed.contains(candidate) && queued.add(candidate);
        }
        if (fresh) {
            enque(outputLink);
        } else {
            dub.incrementAndGet();
        }
    }
    return true;
}

/**
 * Puts a link into the queue and stamps its enqueue time. The link is always added;
 * callers that want to skip known urls check before calling.
 *
 * @param link the link to enqueue
 */
void enque(LinkNodeLight link){
    link.setEnqueTime(new DateTime());
    synchronized (processed) {
        queued.add(getUrlGeneralForm(link));
    }
    counterFor(link).incrementAndGet();
    //queue.add(link, 100 - (int)(link.getWeight() * 100.f));
    queue.add(link);
}


/**
 * Picks an element from the queue
 * @return top element from the queue or null if the queue is empty
 */
LinkNode deque(){
    LinkNode link = (LinkNode) queue.poll();
    if (link != null) {
        link.setDequeTime(new DateTime());
        counterFor(link).decrementAndGet();
    }
    return link;
}

/* counter of queued inter- or intra- links, depending on the link's flag */
private static java.util.concurrent.atomic.AtomicInteger counterFor(LinkNodeLight link) {
    return link.interLinks ? cntInter : cntIntra;
}

/* creates both report files; fails instead of leaving a writer unset */
private void createFiles() throws IOException {
    /* create output file */
    out = openReport("CrawledURLS.csv");
    /* create error file */
    try {
        err = openReport("CrawledURLSERROR.csv");
    } catch (IOException e) {
        out.close();
        throw e;
    }
}

/* opens (and truncates) a csv file and writes the header row */
private PrintWriter openReport(String name) throws IOException {
    PrintWriter writer = new PrintWriter(new BufferedWriter(new FileWriter(name, false)));
    writer.println(generateHeaderFile());
    return writer;
}

/**
 * formats the string so it can be valid entry in csv file
 * @param s the raw value, null is treated as empty
 * @return the value in double quotes, with embedded quotes doubled
 */
private static String format(String s) {
    if (s == null) {
        return "\"\"";
    }
    // replace " by "" and put string into quotes
    return "\"" + s.replace("\"", "\"\"") + "\"";
}

/**
 * Creates the line that needs to be written in the outputfile
 * @param link the processed link
 * @return the comma separated line, without line terminator
 */
public static String getOutputLine(LinkNode link){
    LinkNode parent = link.getParentLink();
    StringBuilder builder = new StringBuilder();
    builder.append(parent != null ? format(parent.getUrl()) : "");
    builder.append(",");
    builder.append(parent != null ? parent.getIpAdress() : "");
    builder.append(",");
    /* a child can be finished before its parent's end time is set, leave the column empty then */
    if (parent != null && parent.getStartTime() != null && parent.getEndTime() != null) {
        builder.append(parent.linkProcessingDuration());
    }
    builder.append(",");
    builder.append(format(link.getUrl()));
    builder.append(",");
    builder.append(link.getDomain());
    builder.append(",");
    builder.append(link.interLinks);
    builder.append(",");
    builder.append(Util.formatDate(link.getEnqueTime()));
    builder.append(",");
    builder.append(Util.formatDate(link.getDequeTime()));
    builder.append(",");
    builder.append(link.waitingInQueue());
    builder.append(",");
    builder.append(queue.size());
    /* Inter and intra links in queue */
    builder.append(",");
    builder.append(cntIntra.get());
    builder.append(",");
    builder.append(cntInter.get());
    builder.append(",");
    builder.append(dub.get());
    builder.append(",");
    builder.append(new Date ());
    /* URL size*/
    builder.append(",");
    builder.append(link.getSize());
    /* HTML file
    builder.append(",");
    builder.append(link.getFileName());*/
    /* add HTTP error */
    builder.append(",");
    Exception failure = link.getParseException();
    if (failure != null) {
        if (failure instanceof HttpStatusException)
            builder.append(((HttpStatusException) failure).getStatusCode());
        if (failure instanceof SocketTimeoutException)
            builder.append("Time out");
        if (failure instanceof MalformedURLException)
            builder.append("URL is not valid");
        if (failure instanceof UnsupportedMimeTypeException)
            builder.append("Unsupported mime type: " + ((UnsupportedMimeTypeException) failure).getMimeType());
    }
    return builder.toString();
}

/**
 * generates the Header for the file
 *
 * NOTE(review): the sixth title is "Link IP", while the sixth value written by
 * getOutputLine is the inter-link flag. Confirm which of the two is intended.
 *
 * @return the comma separated column titles
 */
private String generateHeaderFile(){
    StringBuilder builder = new StringBuilder();
    builder.append("Seed URL");
    builder.append(",");
    builder.append("Seed IP");
    builder.append(",");
    builder.append("Process Duration");
    builder.append(",");
    builder.append("Link URL");
    builder.append(",");
    builder.append("Link domain");
    builder.append(",");
    builder.append("Link IP");
    builder.append(",");
    builder.append("Enque Time");
    builder.append(",");
    builder.append("Deque Time");
    builder.append(",");
    builder.append("Waiting in the Queue");
    builder.append(",");
    builder.append("QueueSize");
    builder.append(",");
    builder.append("Intra in queue");
    builder.append(",");
    builder.append("Inter in queue");
    builder.append(",");
    builder.append("Dublications skipped");
    /* time was printed, but no header was */
    builder.append(",");
    builder.append("Time");
    /* URL size*/
    builder.append(",");
    builder.append("Size bytes");
    /* HTTP errors */
    builder.append(",");
    builder.append("HTTP error");
    return builder.toString();
}

/**
 * @param link the link whose url is normalised
 * @return the url without one trailing "/", the form used for duplicate detection
 */
String getUrlGeneralForm(LinkNodeLight link){
    String url = link.getUrl();
    if (url.endsWith("/")){
        url = url.substring(0, url.length() - 1);
    }
    return url;
}

/* parses the page and, unless parsing failed, weights the links found on it */
private List<LinkNodeLight> parseAndWieghtResults(LinkNode inputLink) {
    List<LinkNodeLight> outputLinks = HTMLParser.parse(inputLink);
    if (inputLink.hasParseException()) {
        return outputLinks;
    } else {
        return URLWeight.weight(inputLink, outputLinks);
    }
}
}
package pkg.crawler;

import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.Writer;
import java.math.BigInteger;
import java.util.Formatter;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.concurrent.TimeUnit;
import java.util.logging.Logger;
import java.security.*;
import java.nio.file.Path;
import java.nio.file.Paths;


public class HTMLParser {

private static final int READ_TIMEOUT_IN_MILLISSECS = (int) TimeUnit.MILLISECONDS.convert(30, TimeUnit.SECONDS);
private static HashMap <String, Integer> filecounter = new HashMap<> ();


/**
 * Downloads the page behind {@code inputLink}, stores a copy of its HTML in the
 * {@code downloads} directory and collects the links found in {@code a} and
 * {@code area} elements.
 *
 * On success the link gets its IP address, size, domain, file name and status
 * {@code OK}. When the download fails with an {@link IOException} (including a
 * malformed URL) the exception is stored on the link and the status is set to
 * {@code ERROR}. A failure to write the local copy is only printed.
 *
 * This method is called from several crawler threads at once, so the shared
 * file-name counter is only touched while holding its lock.
 *
 * @param inputLink the link to download; it is updated in place
 * @return the outgoing links found on the page, empty when the download failed
 */
public static List<LinkNodeLight> parse(LinkNode inputLink){
    List<LinkNodeLight> outputLinks = new LinkedList<>();
    try {
        inputLink.setIpAdress(IpFromUrl.getIp(inputLink.getUrl()));
        // The page is requested by host name. The previous code called
        // url.replace(host, ip) and discarded the result, so the request URL
        // never changed; that dead call is gone.
        Document parsedResults = fetch(inputLink.getUrl());
        // Serialise the document once: html() builds a new, page-sized string on every call.
        String html = parsedResults.html();
        inputLink.setSize(html.length());
        inputLink.setStatus(LinkNodeStatus.OK);
        inputLink.setDomain(URLWeight.getDomainName(inputLink.getUrl()));

        /* save the page to an html file named after its title */
        String filename = nextFileName(parsedResults.title());
        inputLink.setFileName(filename);
        File parent = new File(filename).getParentFile();
        if (parent != null) {
            parent.mkdirs(); // without the directory every write fails
        }
        // try-with-resources flushes and closes the writer
        try (PrintWriter out = new PrintWriter(new BufferedWriter(new FileWriter(filename)))) {
            out.println("<!--" + inputLink.getUrl() + "-->");
            out.print(html);
        } catch (IOException e) {
            e.printStackTrace();
        }

        // Well-formed selectors are used for the query. The bracket-less form is still
        // what is handed to toLinkNodeObject, which reads the attribute name after '['.
        outputLinks.addAll(toLinkNodeObject(inputLink, parsedResults.select("a[href]"), "a[href"));
        outputLinks.addAll(toLinkNodeObject(inputLink, parsedResults.select("area[href]"), "area[href"));
    } catch (IOException e) {
        inputLink.setParseException(e);
        inputLink.setStatus(LinkNodeStatus.ERROR);
    }

    return outputLinks;
}

/* Fetches the document; a URL rejected by Jsoup.connect is reported as an IOException. */
private static Document fetch(String url) throws IOException {
    try {
        return Jsoup
                .connect(url)
                .timeout(READ_TIMEOUT_IN_MILLISSECS)
                .get();
    } catch (IllegalArgumentException e) {
        java.net.MalformedURLException malformed = new java.net.MalformedURLException(e.getMessage());
        malformed.initCause(e);
        throw malformed;
    }
}

/* Builds "downloads/<sanitised title>-<n>.html", n counting the pages seen with that title. */
private static String nextFileName(String title) {
    String base = title;
    if (base.length() > 24) {
        base = base.substring(0, 24);
    }
    base = base.replaceAll("[^\\w\\d\\s]", "").trim();
    base = base.replaceAll("\\s+",  " ");

    int count;
    synchronized (filecounter) {
        Integer previous = filecounter.get(base);
        count = previous == null ? 1 : previous + 1;
        filecounter.put(base, count);
    }
    return Paths.get("downloads", base + "-" + count + ".html").toString();
}


/**
 * Turns the selected elements into link nodes that point back to their parent.
 *
 * The attribute to read is taken from the selector text after '[' (for example
 * "a[href" and "a[href]" both yield "href"); without a bracket "href" is used.
 * The value is read through jsoup's "abs:" prefix. Elements rejected by
 * {@code isFragmentRef} and elements with a blank URL are skipped.
 *
 * @param parentLink  the page the elements were found on
 * @param tagElements the elements to convert
 * @param tag         the selector that produced the elements
 * @return one link node per usable element, in document order
 */
static List<LinkNode> toLinkNodeObject(LinkNode parentLink, Elements tagElements, String tag) {
    // The attribute name depends only on the selector, so work it out once.
    String attribute = "href";
    int open = tag.indexOf('[');
    if (open >= 0) {
        attribute = tag.substring(open + 1);
        int close = attribute.indexOf(']');
        if (close >= 0) {
            attribute = attribute.substring(0, close);
        }
    }
    String absoluteRef = "abs:" + attribute;

    List<LinkNode> links = new LinkedList<>();
    for (Element element : tagElements) {
        if (isFragmentRef(element)) {
            continue;
        }
        String url = element.attr(absoluteRef);
        if (url != null && url.trim().length() > 0) {
            LinkNode link = new LinkNode(url);
            link.setTag(element.tagName());
            link.setParentLink(parentLink);
            links.add(link);
        }
    }
    return links;
}

/**
 * Tells whether the element's href should not be crawled because it is an in-page
 * fragment ("#...") or a mail address ("mailto:...").
 *
 * Leading whitespace is ignored for both checks and the scheme is matched
 * case-insensitively.
 *
 * @param element the element whose href attribute is inspected
 * @return true for fragment and mailto references
 */
static boolean isFragmentRef(Element element){
    String href = element.attr("href");
    if (href == null) {
        return false;
    }
    String trimmed = href.trim();
    return trimmed.startsWith("#")
            || trimmed.regionMatches(true, 0, "mailto:", 0, "mailto:".length());
}
package pkg.crawler;

import java.util.Date;

import org.joda.time.DateTime;
import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;


/**
 * Formatting and time-difference helpers for the crawler's log lines.
 */
public class Util {

/* shared formatter for all timestamps written by the crawler */
private static final DateTimeFormatter formatter = DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss:SSS");


/**
 * Renders a link as a tab-separated line: URL, weight, enqueue time, dequeue time,
 * milliseconds spent in the queue and parent URL (empty when there is no parent).
 *
 * @param inputLink the link to render
 * @return the tab-separated line
 */
public static String linkToString(LinkNode inputLink){
    return String.format("%s\t%s\t%s\t%s\t%s\t%s",
            inputLink.getUrl(),
            inputLink.getWeight(),
            formatDate(inputLink.getEnqueTime()),
            formatDate(inputLink.getDequeTime()),
            // dequeue time first: the helper subtracts its second argument from its first
            differenceInMilliSeconds(inputLink.getDequeTime(), inputLink.getEnqueTime()),
            inputLink.getParentLink()==null?"":inputLink.getParentLink().getUrl()
    );
}

/**
 * Renders a failed link as a tab-separated line: URL, weight, enqueue time, dequeue
 * time, parent URL (empty when there is no parent) and the message of the stored
 * exception (empty when no exception is stored).
 *
 * @param inputLink the link to render
 * @return the tab-separated line
 */
public static String linkToErrorString(LinkNode inputLink){
    Exception failure = inputLink.getParseException();
    return String.format("%s\t%s\t%s\t%s\t%s\t%s",
            inputLink.getUrl(),
            inputLink.getWeight(),
            formatDate(inputLink.getEnqueTime()),
            formatDate(inputLink.getDequeTime()),
            inputLink.getParentLink()==null?"":inputLink.getParentLink().getUrl(),
            failure == null ? "" : failure.getMessage()
    );
}


/**
 * Formats a timestamp with the pattern "yyyy-MM-dd HH:mm:ss:SSS".
 *
 * @param date the timestamp to format
 * @return the formatted timestamp
 */
public static String formatDate(DateTime date){
    return formatter.print(date);
}

/**
 * @param dequeTime the later instant
 * @param enqueTime the earlier instant
 * @return {@code dequeTime - enqueTime} in milliseconds
 */
public static long differenceInMilliSeconds(DateTime dequeTime, DateTime enqueTime){
    return (dequeTime.getMillis()- enqueTime.getMillis());
}

/**
 * @param enqueTime the earlier instant
 * @param dequeTime the later instant
 * @return the difference between the whole epoch seconds of both instants
 */
public static int differenceInSeconds(Date enqueTime, Date dequeTime){
    return (int)((dequeTime.getTime()/1000) - (enqueTime.getTime()/1000));
}

/**
 * @param enqueTime the earlier instant
 * @param dequeTime the later instant
 * @return the difference between the whole epoch minutes of both instants
 */
public static int differenceInMinutes(Date enqueTime, Date dequeTime){
    return (int)((dequeTime.getTime()/60000) - (enqueTime.getTime()/60000));
}

}
package pkg.crawler;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.regex.Pattern;

public class URLWeight {

/**
 * Splits the links found on a page into intra-site links (same host name as
 * {@code sourceLink}, see {@code isIntraLink}) and inter-site links, and records
 * the result on each link through {@code setInterLinks}.
 *
 * NOTE(review): the listing of this method stops after the loop. No return
 * statement and no closing brace are visible, and the two local lists are filled
 * but never read in the visible part. The remainder (presumably the actual
 * weighting) appears to be missing from this copy and must be restored before
 * the class can compile.
 *
 * @param sourceLink the page the links were found on
 * @param links      the links found on that page
 */
public static List<LinkNodeLight> weight(LinkNode sourceLink, List<LinkNodeLight> links) {

    List<LinkNodeLight> interLinks = new LinkedList<>();
    List<LinkNodeLight> intraLinks = new LinkedList<>();

    for (LinkNodeLight link : links) {
        if (isIntraLink(sourceLink, link)) {
            // same host as the source page
            intraLinks.add(link);
            link.setInterLinks(false);
        } else {
            // different host
            interLinks.add(link);
            link.setInterLinks(true);
        }
    }



/**
 * Tells whether two links point at the same host, ignoring case.
 *
 * @param sourceLink the page the link was found on
 * @param link       the link to classify
 * @return true when both URLs have the same host name
 */
static boolean isIntraLink(LinkNodeLight sourceLink, LinkNodeLight link){
    String sourceHost = getHostName(sourceLink.getUrl());
    String targetHost = getHostName(link.getUrl());
    boolean sameHost = sourceHost.equalsIgnoreCase(targetHost);
    return sameHost;
}

/**
 * Extracts the host part of a URL: everything after a leading
 * {@code scheme://} up to the first {@code '/'}, {@code '?'} or {@code '#'}.
 * A port, if present, is kept, and so is a leading "www." (callers replace
 * the host with an IP address).
 *
 * @param url the URL to inspect, may be null
 * @return the host, or an empty string when {@code url} is null
 */
public static String getHostName(String url) {
    if (url == null) {
        return "";
    }

    String host = url;

    // "://" only separates the scheme when no path/query/fragment delimiter
    // precedes it; otherwise it belongs to e.g. a query value such as
    // "example.com/go?u=http://other.org" and must not be cut at.
    int schemeEnd = host.indexOf("://");
    if (schemeEnd != -1 && indexOfHostEnd(host) > schemeEnd) {
        host = host.substring(schemeEnd + 3);
    }

    int hostEnd = indexOfHostEnd(host);
    return hostEnd == -1 ? host : host.substring(0, hostEnd);
}

/* Index of the first '/', '?' or '#' in text, or -1 when there is none. */
private static int indexOfHostEnd(String text) {
    for (int i = 0; i < text.length(); i++) {
        char c = text.charAt(i);
        if (c == '/' || c == '?' || c == '#') {
            return i;
        }
    }
    return -1;
}
/**
 * Returns the last dot-separated label of the URL's host name
 * (for "www.stanford.edu" that is "edu").
 *
 * @param url the URL to inspect
 * @return the last label, or an empty string when the host yields no labels
 */
public static String getDomainName(String url) {
    String[] labels = getHostName(url).split("\\.");
    return labels.length == 0 ? "" : labels[labels.length - 1];
}


}
package pkg.crawler;

import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

/**
 * Submits ping tasks for links to a shared fixed-size pool of 100 threads.
 */
public class PingTaskManager {

/* Shared worker pool used by every ping request. */
private static ExecutorService executor = Executors.newFixedThreadPool(100);

/**
 * Schedules a ping task for the given link on the shared pool.
 *
 * @param e the link to ping
 */
public  static void ping (LinkNode e) {
    PingTaks task = new PingTaks(e);
    executor.submit(task);
}

}

/**
 * Task wrapper around a link that is to be pinged. The ping call itself is
 * currently disabled, so running the task does nothing.
 */
class PingTaks implements Runnable {
 private final LinkNode link;

/**
 * @param link the link this task refers to
 */
public PingTaks( LinkNode link ) {
    // The argument used to be dropped, leaving the field null forever.
    this.link = link;
}

@Override
public void run() {
    /* link.ping(); */
}


}
package pkg.crawler;

/**
 * Result of fetching a link.
 */
public enum LinkNodeStatus {
    /** The page was fetched. */
    OK,
    /** The fetch failed, or no status was ever recorded. */
    ERROR
}
package pkg.crawler;

import org.joda.time.DateTime;

/**
 * Lightweight queue entry for a discovered link: URL, priority weight, enqueue
 * time and an inter-site/intra-site flag. Natural ordering puts entries with a
 * larger weight first.
 */
public class LinkNodeLight implements Comparable<LinkNodeLight> {
protected String url;
protected float weight;
protected DateTime enqueTime;
protected boolean interLinks;

/**
 * @param url the URL this entry points at
 */
public LinkNodeLight(String url) {
    this.url = url;
}

public String getUrl() {
    return url;
}

public float getWeight() {
    return weight;
}

public void setWeight(float weight) {
    this.weight = weight;
}

public DateTime getEnqueTime() {
    return enqueTime;
}

public void setEnqueTime(DateTime enqueTime) {
    this.enqueTime = enqueTime;
}

/**
 * Accessor pair for the inter-site flag. URLWeight and WebCrawler call these
 * methods, but the class did not declare them.
 *
 * @return true when the link leads to a different host than its source page
 */
public boolean isInterLinks() {
    return interLinks;
}

public void setInterLinks(boolean interLinks) {
    this.interLinks = interLinks;
}

/**
 * Orders entries by descending weight so that a priority queue hands out the
 * heaviest link first.
 */
@Override
public int compareTo(LinkNodeLight link) {
    if (this.weight < link.weight) {
        return 1;
    }
    if (this.weight > link.weight) {
        return -1;
    }
    return 0;
}
}
package pkg.crawler;

import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.Socket;
import java.net.URL;
import java.net.UnknownHostException;
import java.util.Date;



import org.joda.time.DateTime;


/**
 * A link together with the bookkeeping collected while it is crawled: the
 * parent page, queue and processing timestamps, fetch status and error, the
 * resolved IP address, page size, saved file name and domain.
 * Two nodes are equal when their URLs are equal.
 */
public class LinkNode extends LinkNodeLight{
public LinkNode(String url) {
    super(url);
}

private String tag;
private LinkNode parentLink;
private IOException parseException = null; // initialize parse Exception with null
// The "private float weight" field that used to be declared here shadowed the
// inherited LinkNodeLight.weight, so toString() always reported 0 while
// getWeight()/setWeight() used the inherited field. It has been removed.
private DateTime dequeTime;
private DateTime startTime;
private DateTime endTime;
private LinkNodeStatus status;
private String ipAdress;
private int size;
private String filename;
private String domain;

public DateTime getStartTime() {
    return startTime;
}

public void setStartTime(DateTime startTime) {
    this.startTime = startTime;
}

public DateTime getEndTime() {
    return endTime;
}

public void setEndTime(DateTime endTime) {
    this.endTime = endTime;
}

public DateTime getDequeTime() {
    return dequeTime;
}

public String getTag() {
    return tag;
}

public LinkNode getParentLink() {
    return parentLink;
}

public Exception getParseException() {
    return parseException;
}

public boolean hasParseException(){
    return parseException!=null;
}


public void setDequeTime(DateTime dequeTime) {
    this.dequeTime = dequeTime;
}

public void setTag(String tag) {
    this.tag = tag;
}

public void setParentLink(LinkNode parentLink) {
    this.parentLink = parentLink;
}

public void setParseException(IOException parseException) {
    this.parseException = parseException;
}

/** Nodes are equal when they are of the same class and carry equal URLs. */
@Override
public boolean equals(Object o) {
    if (this == o) {
        return true;
    }
    if (o == null || getClass() != o.getClass()) {
        return false;
    }

    LinkNode link = (LinkNode) o;

    if (url != null ? !url.equals(link.url) : link.url != null) {
        return false;
    }

    return true;
}

@Override
public int hashCode() {
    return url != null ? url.hashCode() : 0;
}

/** @return milliseconds between enqueue and dequeue */
public long waitingInQueue(){
    return Util.differenceInMilliSeconds( dequeTime,enqueTime );
}

/** @return milliseconds between processing start and end */
public long linkProcessingDuration(){
    return Util.differenceInMilliSeconds( endTime,startTime );
}

@Override
public String toString() {
    StringBuilder sb = new StringBuilder("LinkNode{");
    sb.append("url='").append(url).append('\'');
    sb.append(", score=").append(weight);
    sb.append(", enqueTime=").append(enqueTime);
    sb.append(", dequeTime=").append(dequeTime);
    sb.append(", tag=").append(tag);
    if(parentLink!=null) {
        sb.append(", parentLink=").append(parentLink.getUrl());
    }
    sb.append('}');
    return sb.toString();
}

public void setStatus(LinkNodeStatus status) {
    this.status = status;
}

/**
 * Returns the fetch status. A status that was never set is stored and
 * returned as ERROR.
 */
public LinkNodeStatus getStatus(){
    if (status == null) {
        status = LinkNodeStatus.ERROR;
    }
    return status;
}

// check server link is it exist or not
/* this method gives fake errors
public LinkNodeStatus ping () {

    boolean reachable = false;
    String sanitizeUrl = url.replaceFirst("^https", "http");

    try {
        HttpURLConnection connection = (HttpURLConnection) new URL(sanitizeUrl).openConnection();
        connection.setConnectTimeout(1000);
        connection.setRequestMethod("HEAD");
        int responseCode = connection.getResponseCode();
        System.err.println(url + " " + responseCode);
        reachable = (200 <= responseCode && responseCode <= 399);
    } catch (IOException exception) {
    }
    return reachable?LinkNodeStatus.OK: LinkNodeStatus.ERROR;
}*/


public String getIpAdress() {
    return ipAdress;
}

public void setIpAdress(String ipAdress) {
    this.ipAdress = ipAdress;
}

/* methods for controlling url size */
public void setSize(int size) {
    this.size = size;
}

public int getSize() {
    return this.size;
}

public void setFileName(String filename) {
    this.filename = filename;
}

public String getFileName() {
    return this.filename;
}

public String getDomain() {
    return domain;
}

public void setDomain(String domain) {
    this.domain = domain;
    }
}
LinkNodeLight.java

 package pkg.crawler;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.net.MalformedURLException;
import java.net.SocketTimeoutException;
import java.util.*;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.PriorityBlockingQueue;
import java.util.concurrent.TimeUnit;

import org.jsoup.HttpStatusException;
import org.jsoup.UnsupportedMimeTypeException;
import org.joda.time.DateTime;


public class WebCrawler {

public static Queue <LinkNodeLight> queue = new PriorityBlockingQueue <> (); // priority queue
public static final int n_threads = 5;                                 // amount of threads
private static Set<String> processed = new LinkedHashSet <> ();         // set of processed urls
private PrintWriter out;                                                // output file
private PrintWriter err;                                                // error file
private static Integer cntIntra = new Integer (0);                              // counters for intra- links in the queue
private static Integer cntInter = new Integer (0);                              // counters for inter- links in the queue
private static Integer dub = new Integer (0);                                   // amount of skipped urls

public static void main(String[] args) throws Exception {
    System.out.println("Running web crawler: " + new Date());

    WebCrawler webCrawler = new WebCrawler();
    webCrawler.createFiles();
    try (Scanner in = new Scanner(new File ("seeds.txt"))) {
        while (in.hasNext()) {
            webCrawler.enque(new LinkNode (in.nextLine().trim()));
        }
    } catch (IOException e) {
        e.printStackTrace();
        return;
    }
    webCrawler.processQueue();
    webCrawler.out.close();
    webCrawler.err.close();
}

public void processQueue(){
    /* run in threads */
    Runnable r = new Runnable() {
        @Override 
        public void run() {
            /* queue may be empty but process is not finished, that's why we need to check if any links are being processed */
            while (true) {
                LinkNode link = deque();
                if (link == null)
                    continue;
                link.setStartTime(new DateTime());
                boolean process = processLink(link);
                link.setEndTime(new DateTime());
                if (!process)
                    continue;
                /* print the data to the csv file */
                if (link.getStatus() != null && link.getStatus().equals(LinkNodeStatus.OK)) {
                    synchronized(out) {
                        out.println(getOutputLine(link));
                        out.flush();
                    }
                } else {
                    synchronized(err) {
                        err.println(getOutputLine(link));
                        err.flush();
                    }
                }
            }
        }
    };
    /* run n_threads threads which perform dequeue and process */
    LinkedList <Thread> threads = new LinkedList <> ();
    for (int i = 0; i < n_threads; i++) {
        threads.add(new Thread(r));
        threads.getLast().start();
    }
    for (Thread thread : threads) {
        try {
            thread.join();
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
    }
}


/* returns true if link was actually processed */
private boolean processLink(LinkNode inputLink) {
    String url = getUrlGeneralForm(inputLink);
    boolean process = true;
    synchronized (processed) {
        if (processed.contains(url)) {
            process = false;
            synchronized (dub) {dub++;}
        } else
            processed.add(url);
    }
    /* start processing only if the url have not been processed yet or not being processed */
    if (process) {
        System.out.println("Processing url " + url);
        List<LinkNodeLight> outputLinks = parseAndWieghtResults(inputLink);
        for (LinkNodeLight outputLink : outputLinks) {
            String getUrlGeneralForumOutput = getUrlGeneralForm(outputLink);
            /* add the new link to the queue only if it has not been processed yet */
            process = true;
            synchronized (processed) {
                if (processed.contains(getUrlGeneralForumOutput)) {
                    process = false;
                    synchronized (dub) {dub++;}
                }
            }
            if (process) {
                enque(outputLink);
            }
        }
        return true;
    }
    return false;
}

void enque(LinkNodeLight link){
    link.setEnqueTime(new DateTime());
    /* the add method requires implicit priority */
    synchronized (queue) {
        if (link.interLinks)
            synchronized (cntInter) {cntInter++;}
        else
            synchronized (cntIntra) {cntIntra++;}
      //queue.add(link, 100 - (int)(link.getWeight() * 100.f));
        queue.add(link);
    }
}


/**
 * Picks an element from the queue
 * @return top element from the queue or null if the queue is empty
 */
LinkNode deque(){
    /* link must be checked */
    LinkNode link = null;
    synchronized (queue) {
        link = (LinkNode) queue.poll();
        if (link != null) {
            link.setDequeTime(new DateTime());
            if (link.isInterLinks())
                synchronized (cntInter) {cntInter--;}
            else
                synchronized (cntIntra) {cntIntra--;}
        }
    }
    return link;
}

private void createFiles() {
    /* create output file */
    try {
        out = new PrintWriter(new BufferedWriter(new FileWriter("CrawledURLS.csv", false)));
        out.println(generateHeaderFile());
    } catch (IOException e) {
        System.err.println(e);
    }
    /* create error file */
    try {
        err = new PrintWriter(new BufferedWriter(new FileWriter("CrawledURLSERROR.csv", false)));
        err.println(generateHeaderFile());
    } catch (IOException e) {
        System.err.println(e);
    }
}
/**
 * formats the string so it can be valid entry in csv file
 * @param s
 * @return
 */
private static String format(String s) {
    // replace " by ""
    String ret = s.replaceAll("\"", "\"\"");
    // put string into quotes
    return "\"" + ret + "\"";
}
/**
 * Creates the line that needs to be written in the outputfile
 * @param link
 * @return
 */
public static String getOutputLine(LinkNode link){
    StringBuilder builder = new StringBuilder();
    builder.append(link.getParentLink()!=null ? format(link.getParentLink().getUrl()) : "");
    builder.append(",");
    builder.append(link.getParentLink()!=null ? link.getParentLink().getIpAdress() : "");
    builder.append(",");
    builder.append(link.getParentLink()!=null ? link.getParentLink().linkProcessingDuration() : "");
    builder.append(",");
    builder.append(format(link.getUrl()));
    builder.append(",");
    builder.append(link.getDomain());
    builder.append(",");
    builder.append(link.isInterLinks());
    builder.append(",");
    builder.append(Util.formatDate(link.getEnqueTime()));
    builder.append(",");
    builder.append(Util.formatDate(link.getDequeTime()));
    builder.append(",");
    builder.append(link.waitingInQueue());
    builder.append(",");
    builder.append(queue.size());
    /* Inter and intra links in queue */
    builder.append(",");
    builder.append(cntIntra.toString());
    builder.append(",");
    builder.append(cntInter.toString());
    builder.append(",");
    builder.append(dub);
    builder.append(",");
    builder.append(new Date ());
    /* URL size*/
    builder.append(",");
    builder.append(link.getSize());
    /* HTML file
    builder.append(",");
    builder.append(link.getFileName());*/
    /* add HTTP error */
    builder.append(",");
    if (link.getParseException() != null) {
        if (link.getParseException() instanceof HttpStatusException)
            builder.append(((HttpStatusException) link.getParseException()).getStatusCode());
        if (link.getParseException() instanceof SocketTimeoutException)
            builder.append("Time out");
        if (link.getParseException() instanceof MalformedURLException)
            builder.append("URL is not valid");
        if (link.getParseException() instanceof UnsupportedMimeTypeException)
            builder.append("Unsupported mime type: " + ((UnsupportedMimeTypeException)link.getParseException()).getMimeType());
    }
    return builder.toString();

}

/**
 * generates the Header for the file
 * @param link
 * @return
 */
private String generateHeaderFile(){
    StringBuilder builder = new StringBuilder();
    builder.append("Seed URL");
    builder.append(",");
    builder.append("Seed IP");
    builder.append(",");
    builder.append("Process Duration");
    builder.append(",");
    builder.append("Link URL");
    builder.append(",");
    builder.append("Link domain");
    builder.append(",");
    builder.append("Link IP");
    builder.append(",");
    builder.append("Enque Time");
    builder.append(",");
    builder.append("Deque Time");
    builder.append(",");
    builder.append("Waiting in the Queue");
    builder.append(",");
    builder.append("QueueSize");
    builder.append(",");
    builder.append("Intra in queue");
    builder.append(",");
    builder.append("Inter in queue");
    builder.append(",");
    builder.append("Dublications skipped");
    /* time was printed, but no header was */
    builder.append(",");
    builder.append("Time");
    /* URL size*/
    builder.append(",");
    builder.append("Size bytes");
    /* HTTP errors */
    builder.append(",");
    builder.append("HTTP error");
    return builder.toString();

}



String getUrlGeneralForm(LinkNodeLight link){
    String url = link.getUrl();
    if (url.endsWith("/")){
        url = url.substring(0, url.length() - 1);
    }
    return url;
}


private List<LinkNodeLight> parseAndWieghtResults(LinkNode inputLink) {
    List<LinkNodeLight> outputLinks = HTMLParser.parse(inputLink);
    if (inputLink.hasParseException()) {
        return outputLinks;
    } else {
        return URLWeight.weight(inputLink, outputLinks);
    }
}
}
package pkg.crawler;

import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.Writer;
import java.math.BigInteger;
import java.util.Formatter;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.concurrent.TimeUnit;
import java.util.logging.Logger;
import java.security.*;
import java.nio.file.Path;
import java.nio.file.Paths;


public class HTMLParser {

/* Read timeout passed to jsoup for every fetch: 30 seconds expressed in milliseconds. */
private static final int READ_TIMEOUT_IN_MILLISSECS = (int) TimeUnit.MILLISECONDS.convert(30, TimeUnit.SECONDS);
/* How many pages were saved so far per sanitized page title; used to number the saved files. */
private static HashMap <String, Integer> filecounter = new HashMap<> ();


/**
 * Fetches the page behind {@code inputLink}, saves its HTML under
 * "downloads/" and returns the links found in its {@code a} and {@code area}
 * elements. IP address, size, status, domain and file name are recorded on
 * {@code inputLink}. When the fetch fails the exception is stored on the link,
 * its status becomes ERROR and the links collected so far are returned.
 *
 * @param inputLink the link to fetch
 * @return the links found on the page, possibly empty
 */
public static List<LinkNodeLight> parse(LinkNode inputLink){
    List<LinkNodeLight> outputLinks = new LinkedList<>();
    try {
        inputLink.setIpAdress(IpFromUrl.getIp(inputLink.getUrl()));
        String url = inputLink.getUrl();
        // The former "url.replace(host, ip)" call discarded its result (Strings
        // are immutable), so the page was always requested by host name; the
        // dead statement has been removed without changing that.
        Document parsedResults;
        try {
            parsedResults = Jsoup
                    .connect(url)
                    .timeout(READ_TIMEOUT_IN_MILLISSECS)
                    .get();
        } catch (IllegalArgumentException e) {
            // jsoup rejects URLs it cannot request with an unchecked exception;
            // report it like any other fetch failure instead of killing the
            // calling worker thread.
            throw new IOException("URL cannot be requested: " + url, e);
        }
        // Serialize the document once; it is needed for the size and the saved copy.
        String html = parsedResults.html();
        inputLink.setSize(html.length());
        /* IP address moved here in order to speed up the process */
        inputLink.setStatus(LinkNodeStatus.OK);
        inputLink.setDomain(URLWeight.getDomainName(inputLink.getUrl()));

        /* save the file to the html */
        String filename = nextDownloadFileName(parsedResults.title());
        inputLink.setFileName(filename);
        try (PrintWriter out = new PrintWriter(new BufferedWriter(new FileWriter(filename)))) {
            out.println("<!--" + inputLink.getUrl() + "-->");
            out.print(html);
        } catch (IOException e) {
            e.printStackTrace();
        }

        String tag;
        Elements tagElements;
        List<LinkNode> result;

        tag = "a[href";
        tagElements = parsedResults.select(tag);
        result = toLinkNodeObject(inputLink, tagElements, tag);
        outputLinks.addAll(result);

        tag = "area[href";
        tagElements = parsedResults.select(tag);
        result = toLinkNodeObject(inputLink, tagElements, tag);
        outputLinks.addAll(result);
    } catch (IOException e) {
        inputLink.setParseException(e);
        inputLink.setStatus(LinkNodeStatus.ERROR);
    }

    return outputLinks;
}

/*
 * Builds "downloads/<title>-<n>.html" from the first 24 characters of the page
 * title with everything but word characters and whitespace removed; n counts
 * the pages saved under the same title. The counter map is shared by all
 * crawler threads, so it is only touched while holding its monitor.
 */
private static String nextDownloadFileName(String title) {
    String base = title;
    if (base.length() > 24) {
        base = base.substring(0, 24);
    }
    base = base.replaceAll("[^\\w\\d\\s]", "").trim();
    base = base.replaceAll("\\s+",  " ");

    int sequence;
    synchronized (filecounter) {
        Integer previous = filecounter.get(base);
        sequence = previous == null ? 1 : previous + 1;
        filecounter.put(base, sequence);
    }
    return Paths.get("downloads", base + "-" + sequence + ".html").toString();
}


/**
 * Turns the selected elements into link nodes whose parent is
 * {@code parentLink}. Fragment and mailto references and elements without an
 * absolute URL are skipped.
 *
 * @param parentLink  the page the elements were found on
 * @param tagElements the selected elements
 * @param tag         the selector used, e.g. "a[href"; the text after '[' names
 *                    the attribute holding the URL ("href" when there is no '[')
 * @return the created nodes, possibly empty
 */
static List<LinkNode> toLinkNodeObject(LinkNode parentLink, Elements tagElements, String tag) {
    // The attribute key depends only on the selector, so build it once instead
    // of formatting it again for every element.
    int bracket = tag.indexOf("[");
    String attribute = bracket >= 0 ? tag.substring(bracket + 1) : "href";
    String absoluteRef = "abs:" + attribute;

    List<LinkNode> links = new LinkedList<>();
    for (Element element : tagElements) {
        if (isFragmentRef(element)) {
            continue;
        }

        String url = element.attr(absoluteRef);
        if (url == null || url.trim().length() == 0) {
            continue;
        }

        LinkNode link = new LinkNode(url);
        link.setTag(element.tagName());
        link.setParentLink(parentLink);
        links.add(link);
    }
    return links;
}

/**
 * Tells whether the element's href is an in-page fragment ("#...") or a mailto
 * reference, neither of which can be crawled. Leading whitespace is ignored
 * for both checks and the "mailto:" scheme is matched case-insensitively.
 *
 * @param element the element to inspect
 * @return true when the href must be skipped
 */
static boolean isFragmentRef(Element element){
    String href = element.attr("href");
    if (href == null) {
        return false;
    }
    // Previously only the '#' check was trimmed, so " mailto:..." slipped through.
    String trimmed = href.trim();
    return trimmed.startsWith("#") || trimmed.regionMatches(true, 0, "mailto:", 0, 7);
}
package pkg.crawler;

import java.util.Date;

import org.joda.time.DateTime;
import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;


/**
 * Formatting and time-difference helpers used by the crawler.
 */
public class Util {

/* Shared formatter for all timestamps written by the crawler. */
private static final DateTimeFormatter formatter = DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss:SSS");


/**
 * Renders a link as one tab-separated line: URL, weight, enqueue time, dequeue
 * time, milliseconds spent in the queue and parent URL (empty without parent).
 *
 * @param inputLink the link to describe
 * @return the tab-separated description
 */
public static String linkToString(LinkNode inputLink){
    LinkNode parent = inputLink.getParentLink();

    return String.format("%s\t%s\t%s\t%s\t%s\t%s",
            inputLink.getUrl(),
            inputLink.getWeight(),
            formatDate(inputLink.getEnqueTime()),
            formatDate(inputLink.getDequeTime()),
            // differenceInMilliSeconds expects (dequeTime, enqueTime); the
            // arguments used to be passed the other way round, which produced
            // a negative waiting time.
            differenceInMilliSeconds(inputLink.getDequeTime(), inputLink.getEnqueTime()),
            parent == null ? "" : parent.getUrl()
    );
}

/**
 * Renders a failed link as one tab-separated line: URL, weight, enqueue time,
 * dequeue time, parent URL (empty without parent) and the message of the
 * recorded parse exception (empty when none was recorded).
 *
 * @param inputLink the link to describe
 * @return the tab-separated description
 */
public static String linkToErrorString(LinkNode inputLink){
    LinkNode parent = inputLink.getParentLink();
    Exception failure = inputLink.getParseException();

    return String.format("%s\t%s\t%s\t%s\t%s\t%s",
            inputLink.getUrl(),
            inputLink.getWeight(),
            formatDate(inputLink.getEnqueTime()),
            formatDate(inputLink.getDequeTime()),
            parent == null ? "" : parent.getUrl(),
            // A link without a recorded exception used to raise a NullPointerException here.
            failure == null ? "" : failure.getMessage()
    );
}


/** Prints a timestamp with the shared formatter. */
public static String formatDate(DateTime date){
    return formatter.print(date);
}

/** @return {@code dequeTime - enqueTime} in milliseconds */
public static long differenceInMilliSeconds(DateTime dequeTime, DateTime enqueTime){
    return (dequeTime.getMillis()- enqueTime.getMillis());
}

/** @return {@code dequeTime - enqueTime} in seconds, each side truncated to whole seconds first */
public static int differenceInSeconds(Date enqueTime, Date dequeTime){
    return (int)((dequeTime.getTime()/1000) - (enqueTime.getTime()/1000));
}

/** @return {@code dequeTime - enqueTime} in minutes, each side truncated to whole minutes first */
public static int differenceInMinutes(Date enqueTime, Date dequeTime){
    return (int)((dequeTime.getTime()/60000) - (enqueTime.getTime()/60000));
}

}
package pkg.crawler;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.regex.Pattern;

public class URLWeight {

/**
 * Classifies each outgoing link as intra-site or inter-site relative to the
 * page it was found on, and records the result on the link.
 *
 * NOTE(review): the remainder of this method (whatever is done with the two
 * lists, the return statement and the closing brace) is missing from this
 * listing; restore it from the original source before compiling.
 *
 * @param sourceLink the page the links were extracted from
 * @param links      the links found on that page
 */
public static List<LinkNodeLight> weight(LinkNode sourceLink, List<LinkNodeLight> links) {

    List<LinkNodeLight> interLinks = new LinkedList<>();
    List<LinkNodeLight> intraLinks = new LinkedList<>();

    for (LinkNodeLight link : links) {
        // Same host as the source page => intra link, otherwise inter link.
        if (isIntraLink(sourceLink, link)) {
            intraLinks.add(link);
            link.setInterLinks(false);
        } else {
            interLinks.add(link);
            link.setInterLinks(true);
        }
    }



/**
 * Tells whether two links point at the same host, ignoring case.
 *
 * @param sourceLink the page the link was found on
 * @param link       the link to classify
 * @return true when both URLs have the same host name
 */
static boolean isIntraLink(LinkNodeLight sourceLink, LinkNodeLight link){
    String sourceHost = getHostName(sourceLink.getUrl());
    String targetHost = getHostName(link.getUrl());
    boolean sameHost = sourceHost.equalsIgnoreCase(targetHost);
    return sameHost;
}

/**
 * Extracts the host part of a URL: everything after a leading
 * {@code scheme://} up to the first {@code '/'}, {@code '?'} or {@code '#'}.
 * A port, if present, is kept, and so is a leading "www." (callers replace
 * the host with an IP address).
 *
 * @param url the URL to inspect, may be null
 * @return the host, or an empty string when {@code url} is null
 */
public static String getHostName(String url) {
    if (url == null) {
        return "";
    }

    String host = url;

    // "://" only separates the scheme when no path/query/fragment delimiter
    // precedes it; otherwise it belongs to e.g. a query value such as
    // "example.com/go?u=http://other.org" and must not be cut at.
    int schemeEnd = host.indexOf("://");
    if (schemeEnd != -1 && indexOfHostEnd(host) > schemeEnd) {
        host = host.substring(schemeEnd + 3);
    }

    int hostEnd = indexOfHostEnd(host);
    return hostEnd == -1 ? host : host.substring(0, hostEnd);
}

/* Index of the first '/', '?' or '#' in text, or -1 when there is none. */
private static int indexOfHostEnd(String text) {
    for (int i = 0; i < text.length(); i++) {
        char c = text.charAt(i);
        if (c == '/' || c == '?' || c == '#') {
            return i;
        }
    }
    return -1;
}
/**
 * Returns the last dot-separated label of the URL's host name
 * (for "www.stanford.edu" that is "edu").
 *
 * @param url the URL to inspect
 * @return the last label, or an empty string when the host yields no labels
 */
public static String getDomainName(String url) {
    String[] labels = getHostName(url).split("\\.");
    return labels.length == 0 ? "" : labels[labels.length - 1];
}


}
package pkg.crawler;

import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

/**
 * Submits ping tasks for links to a shared fixed-size pool of 100 threads.
 */
public class PingTaskManager {

/* Shared worker pool used by every ping request. */
private static ExecutorService executor = Executors.newFixedThreadPool(100);

/**
 * Schedules a ping task for the given link on the shared pool.
 *
 * @param e the link to ping
 */
public  static void ping (LinkNode e) {
    PingTaks task = new PingTaks(e);
    executor.submit(task);
}

}

/**
 * Task wrapper around a link that is to be pinged. The ping call itself is
 * currently disabled, so running the task does nothing.
 */
class PingTaks implements Runnable {
 private final LinkNode link;

/**
 * @param link the link this task refers to
 */
public PingTaks( LinkNode link ) {
    // The argument used to be dropped, leaving the field null forever.
    this.link = link;
}

@Override
public void run() {
    /* link.ping(); */
}


}
package pkg.crawler;

/**
 * Result of fetching a link.
 */
public enum LinkNodeStatus {
    /** The page was fetched. */
    OK,
    /** The fetch failed, or no status was ever recorded. */
    ERROR
}
package pkg.crawler;

import org.joda.time.DateTime;

/**
 * Lightweight queue entry for a discovered link: URL, priority weight, enqueue
 * time and an inter-site/intra-site flag. Natural ordering puts entries with a
 * larger weight first.
 */
public class LinkNodeLight implements Comparable<LinkNodeLight> {
protected String url;
protected float weight;
protected DateTime enqueTime;
protected boolean interLinks;

/**
 * @param url the URL this entry points at
 */
public LinkNodeLight(String url) {
    this.url = url;
}

public String getUrl() {
    return url;
}

public float getWeight() {
    return weight;
}

public void setWeight(float weight) {
    this.weight = weight;
}

public DateTime getEnqueTime() {
    return enqueTime;
}

public void setEnqueTime(DateTime enqueTime) {
    this.enqueTime = enqueTime;
}

/**
 * Accessor pair for the inter-site flag. URLWeight and WebCrawler call these
 * methods, but the class did not declare them.
 *
 * @return true when the link leads to a different host than its source page
 */
public boolean isInterLinks() {
    return interLinks;
}

public void setInterLinks(boolean interLinks) {
    this.interLinks = interLinks;
}

/**
 * Orders entries by descending weight so that a priority queue hands out the
 * heaviest link first.
 */
@Override
public int compareTo(LinkNodeLight link) {
    if (this.weight < link.weight) {
        return 1;
    }
    if (this.weight > link.weight) {
        return -1;
    }
    return 0;
}
}
package pkg.crawler;

import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.Socket;
import java.net.URL;
import java.net.UnknownHostException;
import java.util.Date;



import org.joda.time.DateTime;


/**
 * A link together with the bookkeeping collected while it is crawled: the
 * parent page, queue and processing timestamps, fetch status and error, the
 * resolved IP address, page size, saved file name and domain.
 * Two nodes are equal when their URLs are equal.
 */
public class LinkNode extends LinkNodeLight{
public LinkNode(String url) {
    super(url);
}

private String tag;
private LinkNode parentLink;
private IOException parseException = null; // initialize parse Exception with null
// The "private float weight" field that used to be declared here shadowed the
// inherited LinkNodeLight.weight, so toString() always reported 0 while
// getWeight()/setWeight() used the inherited field. It has been removed.
private DateTime dequeTime;
private DateTime startTime;
private DateTime endTime;
private LinkNodeStatus status;
private String ipAdress;
private int size;
private String filename;
private String domain;

public DateTime getStartTime() {
    return startTime;
}

public void setStartTime(DateTime startTime) {
    this.startTime = startTime;
}

public DateTime getEndTime() {
    return endTime;
}

public void setEndTime(DateTime endTime) {
    this.endTime = endTime;
}

public DateTime getDequeTime() {
    return dequeTime;
}

public String getTag() {
    return tag;
}

public LinkNode getParentLink() {
    return parentLink;
}

public Exception getParseException() {
    return parseException;
}

public boolean hasParseException(){
    return parseException!=null;
}


public void setDequeTime(DateTime dequeTime) {
    this.dequeTime = dequeTime;
}

public void setTag(String tag) {
    this.tag = tag;
}

public void setParentLink(LinkNode parentLink) {
    this.parentLink = parentLink;
}

public void setParseException(IOException parseException) {
    this.parseException = parseException;
}

/** Nodes are equal when they are of the same class and carry equal URLs. */
@Override
public boolean equals(Object o) {
    if (this == o) {
        return true;
    }
    if (o == null || getClass() != o.getClass()) {
        return false;
    }

    LinkNode link = (LinkNode) o;

    if (url != null ? !url.equals(link.url) : link.url != null) {
        return false;
    }

    return true;
}

@Override
public int hashCode() {
    return url != null ? url.hashCode() : 0;
}

/** @return milliseconds between enqueue and dequeue */
public long waitingInQueue(){
    return Util.differenceInMilliSeconds( dequeTime,enqueTime );
}

/** @return milliseconds between processing start and end */
public long linkProcessingDuration(){
    return Util.differenceInMilliSeconds( endTime,startTime );
}

@Override
public String toString() {
    StringBuilder sb = new StringBuilder("LinkNode{");
    sb.append("url='").append(url).append('\'');
    sb.append(", score=").append(weight);
    sb.append(", enqueTime=").append(enqueTime);
    sb.append(", dequeTime=").append(dequeTime);
    sb.append(", tag=").append(tag);
    if(parentLink!=null) {
        sb.append(", parentLink=").append(parentLink.getUrl());
    }
    sb.append('}');
    return sb.toString();
}

public void setStatus(LinkNodeStatus status) {
    this.status = status;
}

/**
 * Returns the fetch status. A status that was never set is stored and
 * returned as ERROR.
 */
public LinkNodeStatus getStatus(){
    if (status == null) {
        status = LinkNodeStatus.ERROR;
    }
    return status;
}

// check server link is it exist or not
/* this method gives fake errors
public LinkNodeStatus ping () {

    boolean reachable = false;
    String sanitizeUrl = url.replaceFirst("^https", "http");

    try {
        HttpURLConnection connection = (HttpURLConnection) new URL(sanitizeUrl).openConnection();
        connection.setConnectTimeout(1000);
        connection.setRequestMethod("HEAD");
        int responseCode = connection.getResponseCode();
        System.err.println(url + " " + responseCode);
        reachable = (200 <= responseCode && responseCode <= 399);
    } catch (IOException exception) {
    }
    return reachable?LinkNodeStatus.OK: LinkNodeStatus.ERROR;
}*/


public String getIpAdress() {
    return ipAdress;
}

public void setIpAdress(String ipAdress) {
    this.ipAdress = ipAdress;
}

/* methods for controlling url size */
public void setSize(int size) {
    this.size = size;
}

public int getSize() {
    return this.size;
}

public void setFileName(String filename) {
    this.filename = filename;
}

public String getFileName() {
    return this.filename;
}

public String getDomain() {
    return domain;
}

public void setDomain(String domain) {
    this.domain = domain;
    }
}
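package pkg.crawler;
import org.joda.time.DateTime;
public class LinkNodeLight implements Comparable<LinkNodeLight> {
protected String url;
protected float weight;
protected DateTime enqueTime;
protected boolean interLinks;
public String getUrl() {
    return url;
}
public float getWeight() {
    return weight;
}
public void setWeight(float weight) {
    this.weight = weight;
}
public DateTime getEnqueTime() {
    return enqueTime;
}
public LinkNodeLight(String url) {
    this.url = url;
}
public void setEnqueTime(DateTime enqueTime) {
    this.enqueTime = enqueTime;
}
@Override
public int compareTo(LinkNodeLight link) {
    if (this.weight < link.weight) return 1;
    else if (this.weight > link.weight) return -1;
    return 0;
}
}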
LinkNode.java

 package pkg.crawler;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.net.MalformedURLException;
import java.net.SocketTimeoutException;
import java.util.*;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.PriorityBlockingQueue;
import java.util.concurrent.TimeUnit;

import org.jsoup.HttpStatusException;
import org.jsoup.UnsupportedMimeTypeException;
import org.joda.time.DateTime;


public class WebCrawler {

/* Shared crawl state. The queue and the processed set are static, i.e. shared by all crawler instances and worker threads. */
public static Queue <LinkNodeLight> queue = new PriorityBlockingQueue <> (); // priority queue
public static final int n_threads = 5;                                 // amount of threads
private static Set<String> processed = new LinkedHashSet <> ();         // set of processed urls
private PrintWriter out;                                                // output file
private PrintWriter err;                                                // error file
/*
 * NOTE(review): the methods of this class synchronize on these boxed counters
 * and then apply ++/-- to them. Each update replaces the Integer object, so
 * threads end up locking different instances and the counters are effectively
 * unguarded.
 */
private static Integer cntIntra = new Integer (0);                              // counters for intra- links in the queue
private static Integer cntInter = new Integer (0);                              // counters for inter- links in the queue
private static Integer dub = new Integer (0);                                   // amount of skipped urls

public static void main(String[] args) throws Exception {
    System.out.println("Running web crawler: " + new Date());

    WebCrawler webCrawler = new WebCrawler();
    webCrawler.createFiles();
    try (Scanner in = new Scanner(new File ("seeds.txt"))) {
        while (in.hasNext()) {
            webCrawler.enque(new LinkNode (in.nextLine().trim()));
        }
    } catch (IOException e) {
        e.printStackTrace();
        return;
    }
    webCrawler.processQueue();
    webCrawler.out.close();
    webCrawler.err.close();
}

public void processQueue(){
    /* run in threads */
    Runnable r = new Runnable() {
        @Override 
        public void run() {
            /* queue may be empty but process is not finished, that's why we need to check if any links are being processed */
            while (true) {
                LinkNode link = deque();
                if (link == null)
                    continue;
                link.setStartTime(new DateTime());
                boolean process = processLink(link);
                link.setEndTime(new DateTime());
                if (!process)
                    continue;
                /* print the data to the csv file */
                if (link.getStatus() != null && link.getStatus().equals(LinkNodeStatus.OK)) {
                    synchronized(out) {
                        out.println(getOutputLine(link));
                        out.flush();
                    }
                } else {
                    synchronized(err) {
                        err.println(getOutputLine(link));
                        err.flush();
                    }
                }
            }
        }
    };
    /* run n_threads threads which perform dequeue and process */
    LinkedList <Thread> threads = new LinkedList <> ();
    for (int i = 0; i < n_threads; i++) {
        threads.add(new Thread(r));
        threads.getLast().start();
    }
    for (Thread thread : threads) {
        try {
            thread.join();
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
    }
}


/* returns true if link was actually processed */
private boolean processLink(LinkNode inputLink) {
    String url = getUrlGeneralForm(inputLink);
    boolean process = true;
    synchronized (processed) {
        if (processed.contains(url)) {
            process = false;
            synchronized (dub) {dub++;}
        } else
            processed.add(url);
    }
    /* start processing only if the url have not been processed yet or not being processed */
    if (process) {
        System.out.println("Processing url " + url);
        List<LinkNodeLight> outputLinks = parseAndWieghtResults(inputLink);
        for (LinkNodeLight outputLink : outputLinks) {
            String getUrlGeneralForumOutput = getUrlGeneralForm(outputLink);
            /* add the new link to the queue only if it has not been processed yet */
            process = true;
            synchronized (processed) {
                if (processed.contains(getUrlGeneralForumOutput)) {
                    process = false;
                    synchronized (dub) {dub++;}
                }
            }
            if (process) {
                enque(outputLink);
            }
        }
        return true;
    }
    return false;
}

void enque(LinkNodeLight link){
    link.setEnqueTime(new DateTime());
    /* the add method requires implicit priority */
    synchronized (queue) {
        if (link.interLinks)
            synchronized (cntInter) {cntInter++;}
        else
            synchronized (cntIntra) {cntIntra++;}
      //queue.add(link, 100 - (int)(link.getWeight() * 100.f));
        queue.add(link);
    }
}


/**
 * Picks an element from the queue
 * @return top element from the queue or null if the queue is empty
 */
LinkNode deque(){
    /* link must be checked */
    LinkNode link = null;
    synchronized (queue) {
        link = (LinkNode) queue.poll();
        if (link != null) {
            link.setDequeTime(new DateTime());
            if (link.isInterLinks())
                synchronized (cntInter) {cntInter--;}
            else
                synchronized (cntIntra) {cntIntra--;}
        }
    }
    return link;
}

private void createFiles() {
    /* create output file */
    try {
        out = new PrintWriter(new BufferedWriter(new FileWriter("CrawledURLS.csv", false)));
        out.println(generateHeaderFile());
    } catch (IOException e) {
        System.err.println(e);
    }
    /* create error file */
    try {
        err = new PrintWriter(new BufferedWriter(new FileWriter("CrawledURLSERROR.csv", false)));
        err.println(generateHeaderFile());
    } catch (IOException e) {
        System.err.println(e);
    }
}
/**
 * formats the string so it can be valid entry in csv file
 * @param s
 * @return
 */
private static String format(String s) {
    // replace " by ""
    String ret = s.replaceAll("\"", "\"\"");
    // put string into quotes
    return "\"" + ret + "\"";
}
/**
 * Creates the line that needs to be written in the outputfile
 * @param link
 * @return
 */
public static String getOutputLine(LinkNode link){
    StringBuilder builder = new StringBuilder();
    builder.append(link.getParentLink()!=null ? format(link.getParentLink().getUrl()) : "");
    builder.append(",");
    builder.append(link.getParentLink()!=null ? link.getParentLink().getIpAdress() : "");
    builder.append(",");
    builder.append(link.getParentLink()!=null ? link.getParentLink().linkProcessingDuration() : "");
    builder.append(",");
    builder.append(format(link.getUrl()));
    builder.append(",");
    builder.append(link.getDomain());
    builder.append(",");
    builder.append(link.isInterLinks());
    builder.append(",");
    builder.append(Util.formatDate(link.getEnqueTime()));
    builder.append(",");
    builder.append(Util.formatDate(link.getDequeTime()));
    builder.append(",");
    builder.append(link.waitingInQueue());
    builder.append(",");
    builder.append(queue.size());
    /* Inter and intra links in queue */
    builder.append(",");
    builder.append(cntIntra.toString());
    builder.append(",");
    builder.append(cntInter.toString());
    builder.append(",");
    builder.append(dub);
    builder.append(",");
    builder.append(new Date ());
    /* URL size*/
    builder.append(",");
    builder.append(link.getSize());
    /* HTML file
    builder.append(",");
    builder.append(link.getFileName());*/
    /* add HTTP error */
    builder.append(",");
    if (link.getParseException() != null) {
        if (link.getParseException() instanceof HttpStatusException)
            builder.append(((HttpStatusException) link.getParseException()).getStatusCode());
        if (link.getParseException() instanceof SocketTimeoutException)
            builder.append("Time out");
        if (link.getParseException() instanceof MalformedURLException)
            builder.append("URL is not valid");
        if (link.getParseException() instanceof UnsupportedMimeTypeException)
            builder.append("Unsupported mime type: " + ((UnsupportedMimeTypeException)link.getParseException()).getMimeType());
    }
    return builder.toString();

}

/**
 * generates the Header for the file
 * @param link
 * @return
 */
private String generateHeaderFile(){
    StringBuilder builder = new StringBuilder();
    builder.append("Seed URL");
    builder.append(",");
    builder.append("Seed IP");
    builder.append(",");
    builder.append("Process Duration");
    builder.append(",");
    builder.append("Link URL");
    builder.append(",");
    builder.append("Link domain");
    builder.append(",");
    builder.append("Link IP");
    builder.append(",");
    builder.append("Enque Time");
    builder.append(",");
    builder.append("Deque Time");
    builder.append(",");
    builder.append("Waiting in the Queue");
    builder.append(",");
    builder.append("QueueSize");
    builder.append(",");
    builder.append("Intra in queue");
    builder.append(",");
    builder.append("Inter in queue");
    builder.append(",");
    builder.append("Dublications skipped");
    /* time was printed, but no header was */
    builder.append(",");
    builder.append("Time");
    /* URL size*/
    builder.append(",");
    builder.append("Size bytes");
    /* HTTP errors */
    builder.append(",");
    builder.append("HTTP error");
    return builder.toString();

}



String getUrlGeneralForm(LinkNodeLight link){
    String url = link.getUrl();
    if (url.endsWith("/")){
        url = url.substring(0, url.length() - 1);
    }
    return url;
}


private List<LinkNodeLight> parseAndWieghtResults(LinkNode inputLink) {
    List<LinkNodeLight> outputLinks = HTMLParser.parse(inputLink);
    if (inputLink.hasParseException()) {
        return outputLinks;
    } else {
        return URLWeight.weight(inputLink, outputLinks);
    }
}
}
package pkg.crawler;

import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.Writer;
import java.math.BigInteger;
import java.util.Formatter;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.concurrent.TimeUnit;
import java.util.logging.Logger;
import java.security.*;
import java.nio.file.Path;
import java.nio.file.Paths;


public class HTMLParser {

private static final int READ_TIMEOUT_IN_MILLISSECS = (int) TimeUnit.MILLISECONDS.convert(30, TimeUnit.SECONDS);
private static HashMap <String, Integer> filecounter = new HashMap<> ();


/**
 * Downloads the page behind {@code inputLink}, saves a copy under the
 * {@code downloads} directory and collects the links found in {@code a} and
 * {@code area} elements.
 *
 * <p>On success the link gets its IP address, size, domain, file name and
 * status OK. When the download fails, the exception is stored on the link,
 * the status becomes ERROR and the returned list is empty.
 *
 * @param inputLink link to download; updated in place
 * @return links found on the page, each with {@code inputLink} as parent
 */
public static List<LinkNodeLight> parse(LinkNode inputLink){
    List<LinkNodeLight> outputLinks = new LinkedList<>();
    try {
        inputLink.setIpAdress(IpFromUrl.getIp(inputLink.getUrl()));
        String url = inputLink.getUrl();
        /* The old code called url.replace(host, ip) and dropped the result, so the
           request always went to the host name; the no-op call was removed and the
           request target is unchanged. */
        Connection connection;
        try {
            connection = Jsoup.connect(url);
        } catch (IllegalArgumentException e) {
            /* jsoup rejects an empty or unparsable url with an unchecked exception;
               report it like any other failed download instead of killing the caller's thread */
            java.net.MalformedURLException malformed =
                    new java.net.MalformedURLException(String.valueOf(e.getMessage()));
            malformed.initCause(e);
            throw malformed;
        }
        Document parsedResults = connection
                .timeout(READ_TIMEOUT_IN_MILLISSECS)
                .get();
        /* serialise the document once; it is needed for the size and for the saved copy */
        String html = parsedResults.html();
        inputLink.setSize(html.length());
        /* IP address moved here in order to speed up the process */
        inputLink.setStatus(LinkNodeStatus.OK);
        inputLink.setDomain(URLWeight.getDomainName(inputLink.getUrl()));

        /* save the page; the file name is derived from the page title */
        String filename = nextDownloadFileName(parsedResults.title());
        inputLink.setFileName(filename);
        try (PrintWriter out = new PrintWriter(new BufferedWriter(new FileWriter(filename)))) {
            out.println("<!--" + inputLink.getUrl() + "-->");
            out.print(html);
        } catch (IOException e) {
            /* saving the copy is best effort; the links are still extracted */
            e.printStackTrace();
        }

        /* NOTE(review): both selectors lack the closing ']' - confirm the jsoup version in use accepts that */
        String tag = "a[href";
        outputLinks.addAll(toLinkNodeObject(inputLink, parsedResults.select(tag), tag));

        tag = "area[href";
        outputLinks.addAll(toLinkNodeObject(inputLink, parsedResults.select(tag), tag));
    } catch (IOException e) {
        inputLink.setParseException(e);
        inputLink.setStatus(LinkNodeStatus.ERROR);
    }

    return outputLinks;
}

/* Builds "downloads/<sanitised title>-<n>.html"; n counts the pages seen with the same sanitised title. */
private static String nextDownloadFileName(String title) {
    String base = title;
    if (base.length() > 24) {
        base = base.substring(0, 24);
    }
    base = base.replaceAll("[^\\w\\d\\s]", "").trim();
    base = base.replaceAll("\\s+",  " ");

    int count;
    /* filecounter is a plain HashMap shared by all threads that call parse */
    synchronized (filecounter) {
        Integer previous = filecounter.get(base);
        count = (previous == null) ? 1 : previous + 1;
        filecounter.put(base, count);
    }
    return Paths.get("downloads", base + "-" + count + ".html").toString();
}


/**
 * Turns the selected elements into {@code LinkNode}s that point to the
 * absolute form of the attribute named in {@code tag}.
 *
 * @param parentLink  page the elements were found on; becomes the parent of every result
 * @param tagElements elements matched by the selector
 * @param tag         selector that produced the elements, e.g. {@code a[href}; the text after
 *                    {@code [} names the attribute (a trailing {@code ]} is ignored), and
 *                    {@code href} is used when the selector has no bracket
 * @return one link per element that is not a fragment/mailto reference and has a non-blank url
 */
static List<LinkNode> toLinkNodeObject(LinkNode parentLink, Elements tagElements, String tag) {
    /* the attribute key is the same for every element, so compute it once */
    String attribute = "href";
    int bracket = tag.indexOf("[");
    if (bracket != -1) {
        attribute = tag.substring(bracket + 1);
        if (attribute.endsWith("]")) {
            /* accept a well-formed selector such as a[href] as well */
            attribute = attribute.substring(0, attribute.length() - 1);
        }
    }
    String absoluteRef = "abs:" + attribute;

    List<LinkNode> links = new LinkedList<>();
    for (Element element : tagElements) {
        if (isFragmentRef(element)) {
            continue;
        }
        String url = element.attr(absoluteRef);
        if (url != null && url.trim().length() > 0) {
            LinkNode link = new LinkNode(url);
            link.setTag(element.tagName());
            link.setParentLink(parentLink);
            links.add(link);
        }
    }
    return links;
}

/**
 * Tells whether the element's {@code href} is a same-page fragment
 * ({@code #...}) or a {@code mailto:} address.
 *
 * @param element element whose {@code href} attribute is inspected
 * @return true when the reference should not be crawled
 */
static boolean isFragmentRef(Element element){
    String href = element.attr("href");
    if (href == null) {
        return false;
    }
    /* trim once so that leading whitespace is ignored for both tests, not only for '#' */
    String ref = href.trim();
    /* match the scheme regardless of case, e.g. "MAILTO:" */
    return ref.startsWith("#") || ref.regionMatches(true, 0, "mailto:", 0, "mailto:".length());
}
package pkg.crawler;

import java.util.Date;

import org.joda.time.DateTime;
import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;


/**
 * Formatting and time-difference helpers for crawler output.
 */
public class Util {

/* pattern used for every timestamp printed by formatDate */
private static final DateTimeFormatter formatter =
        DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss:SSS");

/**
 * Renders a link as one tab separated line: url, weight, enqueue time,
 * dequeue time, milliseconds spent in the queue and the parent url (empty
 * when there is no parent).
 *
 * @param inputLink link with enqueue and dequeue times set
 * @return the formatted line
 */
public static String linkToString(LinkNode inputLink){
    LinkNode parent = inputLink.getParentLink();
    return String.format("%s\t%s\t%s\t%s\t%s\t%s",
            inputLink.getUrl(),
            inputLink.getWeight(),
            formatDate(inputLink.getEnqueTime()),
            formatDate(inputLink.getDequeTime()),
            /* differenceInMilliSeconds expects (dequeTime, enqueTime); the arguments used to be
               swapped, which produced a negative waiting time */
            differenceInMilliSeconds(inputLink.getDequeTime(), inputLink.getEnqueTime()),
            parent == null ? "" : parent.getUrl()
    );
}

/**
 * Renders a failed link as one tab separated line: url, weight, enqueue
 * time, dequeue time, parent url (empty when there is no parent) and the
 * message of the stored parse exception (empty when there is none).
 *
 * @param inputLink link with enqueue and dequeue times set
 * @return the formatted line
 */
public static String linkToErrorString(LinkNode inputLink){
    LinkNode parent = inputLink.getParentLink();
    Exception failure = inputLink.getParseException();
    return String.format("%s\t%s\t%s\t%s\t%s\t%s",
            inputLink.getUrl(),
            inputLink.getWeight(),
            formatDate(inputLink.getEnqueTime()),
            formatDate(inputLink.getDequeTime()),
            parent == null ? "" : parent.getUrl(),
            /* a link without a stored exception used to cause a NullPointerException here */
            failure == null ? "" : failure.getMessage()
    );
}

/**
 * @param date instant to print
 * @return the instant formatted as {@code yyyy-MM-dd HH:mm:ss:SSS}
 */
public static String formatDate(DateTime date){
    return formatter.print(date);
}

/**
 * @param dequeTime later instant
 * @param enqueTime earlier instant
 * @return {@code dequeTime - enqueTime} in milliseconds
 */
public static long differenceInMilliSeconds(DateTime dequeTime, DateTime enqueTime){
    return dequeTime.getMillis() - enqueTime.getMillis();
}

/**
 * @return difference between the whole-second values of the two dates
 *         (each truncated to seconds before subtracting)
 */
public static int differenceInSeconds(Date enqueTime, Date dequeTime){
    long from = enqueTime.getTime() / 1000;
    long to = dequeTime.getTime() / 1000;
    return (int) (to - from);
}

/**
 * @return difference between the whole-minute values of the two dates
 *         (each truncated to minutes before subtracting)
 */
public static int differenceInMinutes(Date enqueTime, Date dequeTime){
    long from = enqueTime.getTime() / 60000;
    long to = dequeTime.getTime() / 60000;
    return (int) (to - from);
}

}
package pkg.crawler;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.regex.Pattern;

public class URLWeight {

public static List<LinkNodeLight> weight(LinkNode sourceLink, List<LinkNodeLight> links) {

    List<LinkNodeLight> interLinks = new LinkedList<>();
    List<LinkNodeLight> intraLinks = new LinkedList<>();

    for (LinkNodeLight link : links) {
        if (isIntraLink(sourceLink, link)) {
            intraLinks.add(link);
            link.setInterLinks(false);
        } else {
            interLinks.add(link);
            link.setInterLinks(true);
        }
    }



/**
 * Tells whether two links point to the same host.
 *
 * @param sourceLink link of the page the other link was found on
 * @param link       link found on that page
 * @return true when both urls have the same host name, ignoring case
 */
static boolean isIntraLink(LinkNodeLight sourceLink, LinkNodeLight link){
    return getHostName(sourceLink.getUrl())
            .equalsIgnoreCase(getHostName(link.getUrl()));
}

/**
 * Extracts the host part of a url: the text after the first {@code ://}
 * (or from the start when there is none) up to the first {@code /},
 * {@code ?} or {@code #}. A leading {@code www} is kept because callers
 * replace the host name with an IP address.
 *
 * @param url url to inspect, may be {@code null}
 * @return the host part, or an empty string for {@code null}
 */
public static String getHostName(String url) {
    if (url == null) {
        return "";
    }

    int schemeEnd = url.indexOf("://");
    int start = (schemeEnd == -1) ? 0 : schemeEnd + 3;

    int end = url.length();
    for (int i = start; i < end; i++) {
        char c = url.charAt(i);
        /* '#' starts the fragment; without this test "http://host#top" gave "host#top" */
        if (c == '/' || c == '?' || c == '#') {
            end = i;
            break;
        }
    }
    return url.substring(start, end);
}
/**
 * Returns the last dot-separated label of the url's host name
 * (for {@code http://www.stanford.edu} that is {@code edu}).
 *
 * @param url url to inspect
 * @return the last label, or an empty string when the host has no labels
 */
public static String getDomainName(String url) {
    String[] labels = getHostName(url).split("\\.");
    return labels.length > 0 ? labels[labels.length - 1] : "";
}


}
package pkg.crawler;

import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

/**
 * Hands ping tasks to a shared fixed pool of 100 threads.
 */
public class PingTaskManager {

/* shared by all callers; created when the class is loaded */
private static ExecutorService executor = Executors.newFixedThreadPool(100);

/**
 * Queues a ping task for the given link.
 *
 * @param e link to ping
 */
public  static void ping (LinkNode e) {
    Runnable task = new PingTaks(e);
    executor.submit(task);
}

}

/**
 * Task wrapping one link to be pinged. The ping call itself is currently
 * commented out, so running the task does nothing.
 */
class PingTaks implements Runnable {
private final LinkNode link;

/**
 * @param link link this task belongs to
 */
public PingTaks( LinkNode link ) {
    /* the argument used to be dropped, leaving the field null */
    this.link = link;
}

@Override
public void run() {
    /* link.ping(); */
}

}
package pkg.crawler;

/**
 * Outcome of downloading and parsing a link.
 */
public enum LinkNodeStatus {
    /** The page was downloaded and parsed. */
    OK,
    /** The download or the parsing failed. */
    ERROR
}
package pkg.crawler;

import org.joda.time.DateTime;

/**
 * Minimal queue entry for a link: url, weight, enqueue time and the
 * inter/intra flag. Natural ordering puts the highest weight first.
 */
public class LinkNodeLight implements Comparable<LinkNodeLight> {
protected String url;
protected float weight;
protected DateTime enqueTime;
/* true for a link that leaves the host of the page it was found on */
protected boolean interLinks;

/**
 * @param url url this entry points to
 */
public LinkNodeLight(String url) {
    this.url = url;
}

public String getUrl() {
    return url;
}

public float getWeight() {
    return weight;
}

public void setWeight(float weight) {
    this.weight = weight;
}

public DateTime getEnqueTime() {
    return enqueTime;
}

public void setEnqueTime(DateTime enqueTime) {
    this.enqueTime = enqueTime;
}

/**
 * Accessor used by the crawler when counting queued links; it was called
 * by other classes but missing from this class.
 *
 * @return true when the link was marked as an inter- link
 */
public boolean isInterLinks() {
    return interLinks;
}

/**
 * Mutator used when links are classified; it was called by other classes
 * but missing from this class.
 *
 * @param interLinks true for an inter- link, false for an intra- link
 */
public void setInterLinks(boolean interLinks) {
    this.interLinks = interLinks;
}

/**
 * Orders links by descending weight.
 *
 * @return 1 when this link is lighter, -1 when it is heavier, otherwise 0
 */
@Override
public int compareTo(LinkNodeLight link) {
    if (this.weight < link.weight) {
        return 1;
    }
    if (this.weight > link.weight) {
        return -1;
    }
    return 0;
}
}
package pkg.crawler;

import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.Socket;
import java.net.URL;
import java.net.UnknownHostException;
import java.util.Date;



import org.joda.time.DateTime;


public class LinkNode extends LinkNodeLight{
public LinkNode(String url) {
    super(url);
}

private String tag;
private LinkNode parentLink;
private IOException parseException = null; // initialize parse Exception with null
private float weight;
private DateTime dequeTime;
private DateTime startTime;
private DateTime endTime;
private LinkNodeStatus status;
private String ipAdress;
private int size;
private String filename;
private String domain;

public DateTime getStartTime() {
    return startTime;
}

public void setStartTime(DateTime startTime) {
    this.startTime = startTime;
}

public DateTime getEndTime() {
    return endTime;
}

public void setEndTime(DateTime endTime) {
    this.endTime = endTime;
}

public DateTime getDequeTime() {
    return dequeTime;
}

public String getTag() {
    return tag;
}

public LinkNode getParentLink() {
    return parentLink;
}

public Exception getParseException() {
    return parseException;
}

public boolean hasParseException(){
    return parseException!=null;
}


public void setDequeTime(DateTime dequeTime) {
    this.dequeTime = dequeTime;
}

public void setTag(String tag) {
    this.tag = tag;
}

public void setParentLink(LinkNode parentLink) {
    this.parentLink = parentLink;
}

public void setParseException(IOException parseException) {
    this.parseException = parseException;
}

@Override
public boolean equals(Object o) {
    if (this == o) {
        return true;
    }
    if (o == null || getClass() != o.getClass()) {
        return false;
    }

    LinkNode link = (LinkNode) o;

    if (url != null ? !url.equals(link.url) : link.url != null) {
        return false;
    }

    return true;
}

@Override
public int hashCode() {
    return url != null ? url.hashCode() : 0;
}

public long waitingInQueue(){
    return Util.differenceInMilliSeconds( dequeTime,enqueTime );
}

public long linkProcessingDuration(){
    return Util.differenceInMilliSeconds( endTime,startTime );
}

@Override
public String toString() {
    StringBuilder sb = new StringBuilder("LinkNode{");
    sb.append("url='").append(url).append('\'');
    sb.append(", score=").append(weight);
    sb.append(", enqueTime=").append(enqueTime);
    sb.append(", dequeTime=").append(dequeTime);
    sb.append(", tag=").append(tag);
    if(parentLink!=null) {
        sb.append(", parentLink=").append(parentLink.getUrl());
    }
    sb.append('}');
    return sb.toString();
}

public void setStatus(LinkNodeStatus status) {
    this.status = status;
}

public LinkNodeStatus getStatus(){
    if (status == null) {
        status = LinkNodeStatus.ERROR;
    }
    return status;
}

// check server link is it exist or not
/* this method gives fake errors
public LinkNodeStatus ping () {

    boolean reachable = false;
    String sanitizeUrl = url.replaceFirst("^https", "http");

    try {
        HttpURLConnection connection = (HttpURLConnection) new URL(sanitizeUrl).openConnection();
        connection.setConnectTimeout(1000);
        connection.setRequestMethod("HEAD");
        int responseCode = connection.getResponseCode();
        System.err.println(url + " " + responseCode);
        reachable = (200 <= responseCode && responseCode <= 399);
    } catch (IOException exception) {
    }
    return reachable?LinkNodeStatus.OK: LinkNodeStatus.ERROR;
}*/


public String getIpAdress() {
    return ipAdress;
}

public void setIpAdress(String ipAdress) {
    this.ipAdress = ipAdress;
}

/* methods for controlling url size */
public void setSize(int size) {
    this.size = size;
}

public int getSize() {
    return this.size;
}

public void setFileName(String filename) {
    this.filename = filename;
}

public String getFileName() {
    return this.filename;
}

public String getDomain() {
    return domain;
}

public void setDomain(String domain) {
    this.domain = domain;
    }
}
package-pkg.crawler;
导入java.io.IOException;
导入java.net.HttpURLConnection;
导入java.net.Socket;
导入java.net.URL;
导入java.net.UnknownHostException;
导入java.util.Date;
导入org.joda.time.DateTime;
公共类LinkNode扩展了LinkNodeLight{
公共链接节点(字符串url){
超级链接(url);
}
私有字符串标签;
私有链接节点父链接;
私有IOException parseException=null;//使用null初始化解析异常
私人浮重;
私有日期时间;
私人日期时间开始时间;
私有日期时间结束时间;
私有链接节点状态;
私服;
私有整数大小;
私有字符串文件名;
私有字符串域;
public DateTime getStartTime(){
返回起始时间;
}
公共无效设置开始时间(日期时间开始时间){
this.startTime=startTime;
}
公共日期时间getEndTime(){
返回结束时间;
}
public void setEndTime(DateTime endTime){
this.endTime=endTime;
}
公共日期时间getDequeTime(){
返回时间;
}
公共字符串getTag(){
返回标签;
}
公共链接节点getParentLink(){
返回父链接;
}
公共异常getParseException(){
返回parseException;
}
公共布尔hasParseException(){
返回parseException!=null;
}
public void setDequeTime(DateTime dequeTime){
this.dequeTime=dequeTime;
}
公共void setTag(字符串标记){
this.tag=tag;
}
public void setParentLink(LinkNode parentLink){
this.parentLink=parentLink;
}
公共void setParseException(IOException parseException){
this.parseException=parseException;
}
@凌驾
公共布尔等于(对象o){
if(this==o){
返回true;
}
如果(o==null | | getClass()!=o.getClass()){
返回false;
}
LinkNode link=(LinkNode)o;
如果(url!=null?!url.equals(link.url):link.url!=null){
返回false;
}
返回true;
}
@凌驾
公共int hashCode(){
返回url!=null?url.hashCode():0;
}
公共长等待队列(){
返回Util.differenceInMilliSeconds(dequeTime,enqueTime);
}
公共长链接处理持续时间(){
返回Util.differenceInMilliSeconds(endTime,startTime);
}
@凌驾
公共字符串toString(){
StringBuilder sb=新的StringBuilder(“链接节点{”);
sb.append(“url=”).append(url).append(“\”);
sb.追加(“,score=”)。追加(重量);
sb.append(“,enqueTime=”).append(enqueTime);
sb.append(“,dequeTime=”).append(dequeTime);
sb.append(“,tag=”).append(tag);
if(parentLink!=null){
sb.append(“,parentLink=”).append(parentLink.getUrl());
}
某人附加('}');
使某人返回字符串();
}
公共无效设置状态(链接节点状态){
这个状态=状态;
}
公共链接nodestatus getStatus(){
如果(状态==null){
状态=LinkNodeStatus.ERROR;
}
返回状态;
}
//检查服务器链接是否存在
/*这种方法会产生假错误
公共链接nodestatus ping(){
布尔可达=假;
字符串sanitizeUrl=url.replaceFirst(“^https”,“http”);
试一试{
HttpURLConnection连接=(HttpURLConnection)新URL(sanitizeUrl).openConnection();
连接。设置连接超时(1000);
连接。设置请求方法(“HEAD”);
int responseCode=connection.getResponseCode();
System.err.println(url+“”+响应代码);
可达=(200

我试图通过将eclipse.ini设置更改为2048MB的ram来分配内存,正如在本主题中回答的那样,但在3小时或更短的时间后仍然会出现相同的错误

我无法查看该主题(*),但在 eclipse.ini 中设置的是 Eclipse 自身的内存,这与爬虫程序的内存无关。

使用命令行时,需要通过
java -Xmx2G pkg.crawler.WebCrawler
启动它

从Eclipse启动时,需要将
-Xmx2G
添加到运行配置(“VM参数”而不是“程序参数”)



(*)指向已删除问题的链接;需要一些信誉才能查看。

“我试图通过将 eclipse.ini 设置更改为 2048 MB 内存来分配内存,正如本主题中回答的那样。”嗯,那个主题给出的建议确实如此。简单的答案(见上文)是增加 JVM 内存大小。这会有所帮助,但真正的问题可能是:您的网络爬行算法正在创建一个内存中的数据结构,其大小与您访问的页面数量成比例增长。