Java: using PriorityQueue.size() as a condition in my project

I am trying to add some conditions to my priority queue PQueue. I want to check whether the queue has grown to the fixed maximum size I set (MaxSizeOfPQueue), or whether a URL has stayed in PQueue longer than the fixed maximum wait time MaxWaitTime. When either of these two conditions is true, it should remove URLs from PQueue to free up some space in it.

Here is what I came up with for the two conditions:

while (!PQueue.isEmpty() && PQueue.size() >= MaxSizeOfPQueue)
but I am not sure whether I am using PQueue.size() and MaxSizeOfPQueue correctly here.
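To make the two triggers concrete, here is a minimal, self-contained sketch (every name in it, such as QueueCleanupSketch, needsCleaning, MAX_SIZE and MAX_WAIT_MS, is illustrative and not part of my class below). It stores enqueue timestamps in a PriorityQueue<Long>, so peek() yields the oldest one:

import java.util.PriorityQueue;

// Illustrative sketch only: the queue holds enqueue timestamps in milliseconds,
// so peek() returns the oldest entry. None of these names come from the class below.
public class QueueCleanupSketch {
    static final int MAX_SIZE = 1000;        // plays the role of MaxSizeOfPQueue
    static final long MAX_WAIT_MS = 600_000; // plays the role of MaxWaitTime

    static boolean needsCleaning(PriorityQueue<Long> enqueueTimes, long nowMs) {
        if (enqueueTimes.isEmpty())
            return false;
        boolean tooBig = enqueueTimes.size() >= MAX_SIZE;            // condition 1: size cap reached
        boolean tooOld = nowMs - enqueueTimes.peek() > MAX_WAIT_MS;  // condition 2: oldest entry waited too long
        return tooBig || tooOld; // clean when either condition holds
    }

    public static void main(String[] args) {
        PriorityQueue<Long> q = new PriorityQueue<>();
        q.add(System.currentTimeMillis() - 700_000); // one stale entry
        System.out.println(needsCleaning(q, System.currentTimeMillis())); // prints: true
    }
}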
Here is my class:

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.net.MalformedURLException;
import java.net.SocketTimeoutException;
import java.util.Date;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.PriorityQueue;
import java.util.Scanner;
import java.util.Set;
import org.jsoup.HttpStatusException;
import org.jsoup.UnsupportedMimeTypeException;
import org.joda.time.DateTime;


public class WebCrawler {

public static PriorityQueue <LinkNodeLight> PQueue = new PriorityQueue <> (); 
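// Note: java.util.PriorityQueue is not thread-safe on its own; the worker
// threads below guard it with synchronized (PQueue) in enque() and deque().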

public static final int NumberOfThreads = 10;                                 

private static Set<String> DuplicationLinksHub = new LinkedHashSet <> ();         
private PrintWriter outputFile;                                                
private PrintWriter errFile;   

private static Integer IntraLinkCount = new Integer (0);                 
private static Integer InterLinkCount = new Integer (0);                 
private static Integer DuplicationLinksCount = new Integer (0);     
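// Caution: ++/-- on these boxed Integer counters rebinds the fields to new
// Integer objects, so the synchronized (counter) blocks below do not lock on
// a stable object; AtomicInteger would be a safer choice.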
private static Integer MaxWaitTime = new Integer (600000); // in milliseconds
private static Integer MaxSizeOfPQueue = new Integer (1000);
private static long minDate;
public static void main(String[] args) {
    System.out.println("Running web crawler: " + new Date());
    minDate = Long.MAX_VALUE;
    WebCrawler webCrawler = new WebCrawler();
    webCrawler.createFiles();
    try (Scanner readSeedFile = new Scanner(new File ("seeds.txt"))) {
        while (readSeedFile.hasNext()) {
            webCrawler.enque(new LinkNode (readSeedFile.nextLine().trim()));
        }
    } catch (IOException e) {
        e.printStackTrace();
        return;
    }
    webCrawler.processQueue();
    webCrawler.outputFile.close();
    webCrawler.errFile.close();
}
private void cleanQueue()
{
    // Calculate the minimal enqueue date in milliseconds; if even the oldest
    // entry has not exceeded MaxWaitTime yet, skip cleaning the queue.
    if ((System.currentTimeMillis() - minDate) < MaxWaitTime)
        return;

    // Temporary queue for URLs that we are going to keep.
    PriorityQueue<LinkNodeLight> TempQueue = new PriorityQueue<>();

    LinkNodeLight tempNode;

    // Check the queue one entry at a time. Links are only dropped while
    // PQueue is NOT empty AND its size is still >= MaxSizeOfPQueue.
    while (!PQueue.isEmpty() && PQueue.size() >= MaxSizeOfPQueue)
    {
        // If the URL has been in the queue for way too long we can delete it.
        tempNode = PQueue.remove();
        long diff = System.currentTimeMillis() - tempNode.getEnqueTime().getMillis();
        System.out.println("************************************");
        System.out.println("This URL is Removed: " + tempNode);

        try (PrintWriter out = new PrintWriter(new BufferedWriter(new FileWriter("removedURLs.csv", true)))) {
            out.println(tempNode);
        } catch (IOException e) {
            e.printStackTrace(); // don't swallow the error silently
        }

        if (diff < MaxWaitTime) {
            TempQueue.add(tempNode);
        }
    }

    // Put back all the URLs that are still good:
    // remove them from TempQueue and add them to the original PQueue.
    while (!TempQueue.isEmpty())
        PQueue.add(TempQueue.remove());

    // Recompute minDate over everything left in the queue, so the early-return
    // check above stays accurate after a cleaning pass.
    minDate = Long.MAX_VALUE;
    for (LinkNodeLight n : PQueue)
        minDate = Math.min(minDate, n.getEnqueTime().getMillis());
}

 public void processQueue(){

    Runnable r = new Runnable() {
        @Override 
        public void run() {

            while (true) {
                LinkNode urlLink = deque();
                if (urlLink == null)
                    continue;

                urlLink.setStartTime(new DateTime());
                boolean process = isLinkProcessed(urlLink);
                urlLink.setEndTime(new DateTime());

                if (!process)
                    continue;

                if (urlLink.getStatus() != null && urlLink.getStatus().equals(LinkNodeStatus.OK)) {
                    synchronized(outputFile) {
                        outputFile.println(getOutputLine(urlLink));
                        outputFile.flush();
                    }
                } else {
                    synchronized(errFile) {
                        errFile.println(getOutputLine(urlLink));
                        errFile.flush();
                    }
                }
            }
        }
    };

    LinkedList <Thread> threadsList = new LinkedList <> ();
    for (int i = 0; i < NumberOfThreads; i++) {
        threadsList.add(new Thread(r));
        threadsList.getLast().start();
    }
    for (Thread thread : threadsList) {
        try {
            thread.join();
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
    }
}



private boolean isLinkProcessed(LinkNode inputLink) {
    String url = getUrlGeneralForm(inputLink);

    boolean linkProcess = true;
    synchronized (DuplicationLinksHub) {
        if (DuplicationLinksHub.contains(url)) {
            linkProcess = false;
            synchronized (DuplicationLinksCount) {
                DuplicationLinksCount++;
            }
        } else {
            DuplicationLinksHub.add(url);
        }
    }

    if (linkProcess) {
        System.out.println("Processing url " + url);
        List<LinkNodeLight> outputLinks = parseAndWeightResults(inputLink);

        for (LinkNodeLight outputLink : outputLinks) {
            String outputUrlGeneralForm = getUrlGeneralForm(outputLink);

            linkProcess = true;
            synchronized (DuplicationLinksHub) {
                if (DuplicationLinksHub.contains(outputUrlGeneralForm)) {
                    linkProcess = false;
                    synchronized (DuplicationLinksCount) {
                        DuplicationLinksCount++;
                    }
                }
            }
            if (linkProcess) {
                enque(outputLink);
            }
        }
        return true;
    }
    return false;
}


void enque(LinkNodeLight link){

    link.setEnqueTime(new DateTime());

    synchronized (PQueue) {
        PQueue.add(link);
        minDate = Math.min(minDate, link.getEnqueTime().getMillis());
        cleanQueue();
        if (link.interLinks) {
            synchronized (InterLinkCount) {
                InterLinkCount++;
            }
        } else {
            synchronized (IntraLinkCount) {
                IntraLinkCount++;
            }
        }
    }
}


LinkNode deque(){

    LinkNode link = null;
    synchronized (PQueue) {
        link = (LinkNode) PQueue.poll();
        if (link != null) {
            link.setDequeTime(new DateTime());
            if (link.isInterLinks()) {
                synchronized (InterLinkCount) {
                    InterLinkCount--;
                }
            } else {
                synchronized (IntraLinkCount) {
                    IntraLinkCount--;
                }
            }
        }
        return link;
    }
}



private void createFiles() {

    try {
        outputFile = new PrintWriter(new BufferedWriter(new FileWriter("Qsizde300_Q_Con_27.10.5dk_CrawledURLS.csv", false)));
        outputFile.println(generateHeaderFile());
    } catch (IOException e) {
        System.err.println(e);
    }

    try {
        errFile = new PrintWriter(new BufferedWriter(new FileWriter("Qsizde300_Q_Con_27.10.5dk_CrawledURLSERROR.csv", false)));
        errFile.println(generateHeaderFile());
    } catch (IOException e) {
        System.err.println(e);
    }
}


private static String format(String s) {

    String ret = s.replaceAll("\"", "\"\"");

    return "\"" + ret + "\"";
}

public static String getOutputLine(LinkNode link){
    StringBuilder builder = new StringBuilder();
    builder.append(link.getParentLink()!=null ? format(link.getParentLink().getUrl()) : "");
    builder.append(",");
    builder.append(link.getParentLink()!=null ? link.getParentLink().getIpAdress() : "");
    builder.append(",");
    builder.append(link.getParentLink()!=null ? link.getParentLink().linkProcessingDuration() : "");
    builder.append(",");
    builder.append(format(link.getUrl()));
    builder.append(",");
    builder.append(link.getDomain());
    builder.append(",");
    builder.append(link.getIpAdress());
    builder.append(",");
    builder.append(link.isInterLinks());
    builder.append(",");
    builder.append(link.getWeight());
    builder.append(",");
    builder.append(Util.formatDate(link.getEnqueTime()));
    builder.append(",");
    builder.append(Util.formatDate(link.getDequeTime()));
    builder.append(",");
    builder.append(link.waitingInQueue());
    builder.append(",");
    builder.append(PQueue.size());
    builder.append(",");
    builder.append(IntraLinkCount.toString());
    builder.append(",");
    builder.append(InterLinkCount.toString());
    builder.append(",");
    builder.append(DuplicationLinksCount);
    builder.append(",");
    builder.append(new Date ());
    builder.append(",");
    builder.append(link.getSize());
    builder.append(",");
    if (link.getParseException() != null) {
        if (link.getParseException() instanceof HttpStatusException)
            builder.append(((HttpStatusException) link.getParseException()).getStatusCode());
        if (link.getParseException() instanceof SocketTimeoutException)
            builder.append("Time out");
        if (link.getParseException() instanceof MalformedURLException)
            builder.append("URL is not valid");
        if (link.getParseException() instanceof UnsupportedMimeTypeException)
            builder.append("Unsupported mime type: " + ((UnsupportedMimeTypeException)link.getParseException()).getMimeType());
    }
    return builder.toString();

}


private String generateHeaderFile(){
    StringBuilder builder = new StringBuilder();
    builder.append("Seed URL");
    builder.append(",");
    builder.append("Seed IP");
    builder.append(",");
    builder.append("Process Duration");
    builder.append(",");
    builder.append("Link URL");
    builder.append(",");
    builder.append("Link domain");
    builder.append(",");
    builder.append("Link IP");
    builder.append(",");
    builder.append("isIntern");
    builder.append(",");
    builder.append("Weight");
    builder.append(",");
    builder.append("Enque Time");
    builder.append(",");
    builder.append("Deque Time");
    builder.append(",");
    builder.append("Waiting in the Queue");
    builder.append(",");
    builder.append("QueueSize");
    builder.append(",");
    builder.append("Intra in queue");
    builder.append(",");
    builder.append("Inter in queue");
    builder.append(",");
    builder.append("Dublications skipped");
    builder.append(",");
    builder.append("Time");
    builder.append(",");
    builder.append("Size bytes");
    builder.append(",");
    builder.append("HTTP error");
    return builder.toString();

}

String getUrlGeneralForm(LinkNodeLight link){
    String url = link.getUrl();
    if (url.endsWith("/")){
        url = url.substring(0, url.length() - 1);
    }
    return url;
}


private List<LinkNodeLight> parseAndWeightResults(LinkNode inputLink) {
    List<LinkNodeLight> outputLinks = HTMLParser.parse(inputLink);
    if (inputLink.hasParseException()) {
        return outputLinks;
    } else {
        return URLWeight.weight(inputLink, outputLinks);
    }
}
}
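As a side note, on Java 8 and later the drain-and-refill pass in cleanQueue could also be written with Collection.removeIf. The following is only a sketch under the assumption that LinkNodeLight exposes the same getEnqueTime() accessor used above; the caller would still need to hold the synchronized (PQueue) lock, since java.util.PriorityQueue is not thread-safe:

import java.util.PriorityQueue;

// Sketch only: assumes LinkNodeLight.getEnqueTime() returns a Joda-Time
// DateTime, as in the class above. dropStale is an illustrative name.
class CleanQueueAlternative {
    // Drops entries older than maxWaitMs and returns the new minimum enqueue
    // time of the survivors (Long.MAX_VALUE if the queue ends up empty).
    static long dropStale(PriorityQueue<LinkNodeLight> queue, long maxWaitMs) {
        final long now = System.currentTimeMillis();
        queue.removeIf(n -> now - n.getEnqueTime().getMillis() >= maxWaitMs);
        long newMinDate = Long.MAX_VALUE;
        for (LinkNodeLight n : queue)
            newMinDate = Math.min(newMinDate, n.getEnqueTime().getMillis());
        return newMinDate;
    }
}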