Java: using PriorityQueue.size() as a condition in a project

Tags: java, loops, io, priority-queue

I am trying to add some conditions to my priority queue PQueue. I want to check whether the queue has reached a fixed maximum size (MaxSizeOfPQueue), or whether a URL has stayed in PQueue longer than a fixed maximum wait time (MaxWaitTime). When either of these two conditions is true, the crawler should clear URLs out of PQueue to free up some space in it.

I came up with these two conditions, checking PQueue.size() against MaxSizeOfPQueue:

while(!PQueue.isEmpty() && PQueue.size() >= MaxSizeOfPQueue)

Here is my class:
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.net.MalformedURLException;
import java.net.SocketTimeoutException;
import java.util.Date;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.PriorityQueue;
import java.util.Scanner;
import java.util.Set;
import org.jsoup.HttpStatusException;
import org.jsoup.UnsupportedMimeTypeException;
import org.joda.time.DateTime;
public class WebCrawler {
public static PriorityQueue <LinkNodeLight> PQueue = new PriorityQueue <> ();
public static final int NumberOfThreads = 10;
private static Set<String> DuplicationLinksHub = new LinkedHashSet <> ();
private PrintWriter outputFile;
private PrintWriter errFile;
private static Integer IntraLinkCount = new Integer (0);
private static Integer InterLinkCount = new Integer (0);
private static Integer DuplicationLinksCount = new Integer (0);
private static Integer MaxWaitTime = new Integer (600000); // in MilliSecond
private static Integer MaxSizeOfPQueue = new Integer (1000);
private static long minDate;
public static void main(String[] args) {
System.out.println("Running web crawler: " + new Date());
minDate = Long.MAX_VALUE;
WebCrawler webCrawler = new WebCrawler();
webCrawler.createFiles();
try (Scanner readSeedFile = new Scanner(new File ("seeds.txt"))) {
while (readSeedFile.hasNext()) {
webCrawler.enque(new LinkNode (readSeedFile.nextLine().trim()));
}
} catch (IOException e) {
e.printStackTrace();
return;
}
webCrawler.processQueue();
webCrawler.outputFile.close();
webCrawler.errFile.close();
}
private void cleanQueue()
{
// minDate holds the enqueue time (in milliseconds) of the oldest entry;
// if even the oldest entry hasn't waited longer than MaxWaitTime,
// just skip cleaning the queue
if((System.currentTimeMillis()-minDate)<MaxWaitTime)
return;
//Temporary queue for URLs that we are going to keep
PriorityQueue <LinkNodeLight> TempQueue = new PriorityQueue<>();
LinkNodeLight tempNode;
//Checking the queue one entry at a time.
//It will not drop links until:
//PQueue size is greater than or equal to MaxSizeOfPQueue, AND PQueue is NOT empty
while(!PQueue.isEmpty() && PQueue.size() >= MaxSizeOfPQueue)
{
//if the URL has been in the queue for way too long we can delete it.
tempNode = PQueue.remove();
long diff = System.currentTimeMillis()-tempNode.getEnqueTime().getMillis();
System.out.println("************************************");
System.out.println("This URL is Removed: "+ tempNode);
try(PrintWriter out = new PrintWriter(new BufferedWriter(new FileWriter("removedURLs.csv", true)))) {
out.println(tempNode);
}catch (IOException e) {
e.printStackTrace(); // don't silently swallow write failures
}
if(diff < MaxWaitTime){
TempQueue.add(tempNode);
minDate = Math.min(minDate, tempNode.getEnqueTime().getMillis());
}
}
//put back all the URLs that are still good:
//drain TempQueue and add its entries back to the original PQueue
while(!TempQueue.isEmpty())
PQueue.add(TempQueue.remove());
}
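// Spawns worker threads that repeatedly dequeue a URL, process it,
// and append the result to the output or error CSV file.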
public void processQueue(){
Runnable r = new Runnable() {
@Override
public void run() {
while (true) {
LinkNode urlLink = deque();
if (urlLink == null)
continue;
urlLink.setStartTime(new DateTime());
boolean process = isLinkProcessed(urlLink);
urlLink.setEndTime(new DateTime());
if (!process)
continue;
if (urlLink.getStatus() != null && urlLink.getStatus().equals(LinkNodeStatus.OK)) {
synchronized(outputFile) {
outputFile.println(getOutputLine(urlLink));
outputFile.flush();
}
} else {
synchronized(errFile) {
errFile.println(getOutputLine(urlLink));
errFile.flush();
}
}
}
}
};
LinkedList <Thread> threadsList = new LinkedList <> ();
for (int i = 0; i < NumberOfThreads; i++) {
threadsList.add(new Thread(r));
threadsList.getLast().start();
}
for (Thread thread : threadsList) {
try {
thread.join();
} catch (InterruptedException e) {
e.printStackTrace();
}
}
}
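// Returns true if the link had not been seen before; in that case the page
// is parsed and its not-yet-seen outgoing links are enqueued.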
private boolean isLinkProcessed(LinkNode inputLink) {
String url = getUrlGeneralForm(inputLink);
boolean linkProcess = true;
synchronized (DuplicationLinksHub) {
if (DuplicationLinksHub.contains(url)) {
linkProcess = false;
synchronized (DuplicationLinksCount)
{
DuplicationLinksCount++;
}
} else
DuplicationLinksHub.add(url);
}
if (linkProcess) {
System.out.println("Processing url " + url);
List<LinkNodeLight> outputLinks = parseAndWieghtResults(inputLink);
for (LinkNodeLight outputLink : outputLinks) {
String getUrlGeneralForumOutput = getUrlGeneralForm(outputLink);
linkProcess = true;
synchronized (DuplicationLinksHub) {
if (DuplicationLinksHub.contains(getUrlGeneralForumOutput)) {
linkProcess = false;
synchronized (DuplicationLinksCount)
{
DuplicationLinksCount++;
}
}
}
if (linkProcess) {
enque(outputLink);
}
}
return true;
}
return false;
}
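// Adds a link to the shared queue under its lock, updates minDate,
// triggers a cleanup pass, and updates the inter/intra link counters.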
void enque(LinkNodeLight link){
link.setEnqueTime(new DateTime());
synchronized (PQueue) {
PQueue.add(link);
minDate = Math.min(minDate, link.getEnqueTime().getMillis());
cleanQueue();
if (link.interLinks)
synchronized (InterLinkCount) {
InterLinkCount++;
}
else
synchronized (IntraLinkCount) {
IntraLinkCount++;
}
}
}
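// Removes and returns the head of the shared queue (or null if it is empty),
// stamping its dequeue time and updating the inter/intra link counters.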
LinkNode deque(){
LinkNode link = null;
synchronized (PQueue) {
link = (LinkNode) PQueue.poll();
if (link != null) {
link.setDequeTime(new DateTime());
if (link.isInterLinks())
synchronized (InterLinkCount) {
InterLinkCount--;
}
else
synchronized (IntraLinkCount) {
IntraLinkCount--;
}
}
return link;
}
}
private void createFiles() {
try {
outputFile = new PrintWriter(new BufferedWriter(new FileWriter("Qsizde300_Q_Con_27.10.5dk_CrawledURLS.csv", false)));
outputFile.println(generateHeaderFile());
} catch (IOException e) {
System.err.println(e);
}
try {
errFile = new PrintWriter(new BufferedWriter(new FileWriter("Qsizde300_Q_Con_27.10.5dk_CrawledURLSERROR.csv", false)));
errFile.println(generateHeaderFile());
} catch (IOException e) {
System.err.println(e);
}
}
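// Escapes a value for CSV output: doubles embedded quotes and wraps the value in quotes.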
private static String format(String s) {
String ret = s.replaceAll("\"", "\"\"");
return "\"" + ret + "\"";
}
public static String getOutputLine(LinkNode link){
StringBuilder builder = new StringBuilder();
builder.append(link.getParentLink()!=null ? format(link.getParentLink().getUrl()) : "");
builder.append(",");
builder.append(link.getParentLink()!=null ? link.getParentLink().getIpAdress() : "");
builder.append(",");
builder.append(link.getParentLink()!=null ? link.getParentLink().linkProcessingDuration() : "");
builder.append(",");
builder.append(format(link.getUrl()));
builder.append(",");
builder.append(link.getDomain());
builder.append(",");
builder.append(link.getIpAdress());
builder.append(",");
builder.append(link.isInterLinks());
builder.append(",");
builder.append(link.getWeight());
builder.append(",");
builder.append(Util.formatDate(link.getEnqueTime()));
builder.append(",");
builder.append(Util.formatDate(link.getDequeTime()));
builder.append(",");
builder.append(link.waitingInQueue());
builder.append(",");
builder.append(PQueue.size());
builder.append(",");
builder.append(IntraLinkCount.toString());
builder.append(",");
builder.append(InterLinkCount.toString());
builder.append(",");
builder.append(DuplicationLinksCount);
builder.append(",");
builder.append(new Date ());
builder.append(",");
builder.append(link.getSize());
builder.append(",");
if (link.getParseException() != null) {
if (link.getParseException() instanceof HttpStatusException)
builder.append(((HttpStatusException) link.getParseException()).getStatusCode());
if (link.getParseException() instanceof SocketTimeoutException)
builder.append("Time out");
if (link.getParseException() instanceof MalformedURLException)
builder.append("URL is not valid");
if (link.getParseException() instanceof UnsupportedMimeTypeException)
builder.append("Unsupported mime type: " + ((UnsupportedMimeTypeException)link.getParseException()).getMimeType());
}
return builder.toString();
}
private String generateHeaderFile(){
StringBuilder builder = new StringBuilder();
builder.append("Seed URL");
builder.append(",");
builder.append("Seed IP");
builder.append(",");
builder.append("Process Duration");
builder.append(",");
builder.append("Link URL");
builder.append(",");
builder.append("Link domain");
builder.append(",");
builder.append("Link IP");
builder.append(",");
builder.append("isIntern");
builder.append(",");
builder.append("Weight");
builder.append(",");
builder.append("Enque Time");
builder.append(",");
builder.append("Deque Time");
builder.append(",");
builder.append("Waiting in the Queue");
builder.append(",");
builder.append("QueueSize");
builder.append(",");
builder.append("Intra in queue");
builder.append(",");
builder.append("Inter in queue");
builder.append(",");
builder.append("Dublications skipped");
builder.append(",");
builder.append("Time");
builder.append(",");
builder.append("Size bytes");
builder.append(",");
builder.append("HTTP error");
return builder.toString();
}
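// Normalizes a URL by stripping a trailing slash so duplicates compare equal.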
String getUrlGeneralForm(LinkNodeLight link){
String url = link.getUrl();
if (url.endsWith("/")){
url = url.substring(0, url.length() - 1);
}
return url;
}
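// Parses the page and weights its outgoing links; if parsing failed,
// the links are returned unweighted.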
private List<LinkNodeLight> parseAndWieghtResults(LinkNode inputLink) {
List<LinkNodeLight> outputLinks = HTMLParser.parse(inputLink);
if (inputLink.hasParseException()) {
return outputLinks;
} else {
return URLWeight.weight(inputLink, outputLinks);
}
}
}
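To make the behaviour I am after clearer, here is a minimal standalone sketch of the eviction policy, separate from my actual crawler. The Entry class is a simplified stand-in for my LinkNodeLight (it only carries a URL and its enqueue time, and orders by age rather than by the weight my real class uses), and MAX_SIZE / MAX_WAIT_MILLIS stand in for MaxSizeOfPQueue and MaxWaitTime:

import java.util.PriorityQueue;

public class EvictionSketch {

    // Simplified stand-in for LinkNodeLight: just a URL and its enqueue time.
    static class Entry implements Comparable<Entry> {
        final String url;
        final long enqueTimeMillis = System.currentTimeMillis();
        Entry(String url) { this.url = url; }
        // Oldest entry first, so stale URLs surface at the head of the queue.
        @Override public int compareTo(Entry other) {
            return Long.compare(enqueTimeMillis, other.enqueTimeMillis);
        }
    }

    static final int MAX_SIZE = 1000;            // stand-in for MaxSizeOfPQueue
    static final long MAX_WAIT_MILLIS = 600_000; // stand-in for MaxWaitTime

    static final PriorityQueue<Entry> queue = new PriorityQueue<>();

    // Clear URLs out of the queue: drop every entry that has waited longer
    // than MAX_WAIT_MILLIS, and drop further entries once the kept portion
    // reaches MAX_SIZE.
    static void clean() {
        long now = System.currentTimeMillis();
        PriorityQueue<Entry> keep = new PriorityQueue<>();
        while (!queue.isEmpty()) {
            Entry e = queue.remove();
            boolean tooOld = (now - e.enqueTimeMillis) >= MAX_WAIT_MILLIS;
            if (tooOld || keep.size() >= MAX_SIZE) {
                System.out.println("This URL is Removed: " + e.url);
            } else {
                keep.add(e); // still fresh and within the cap: keep it
            }
        }
        queue.addAll(keep);
    }

    public static void main(String[] args) {
        queue.add(new Entry("http://example.com"));
        clean();
        System.out.println("Queue size after cleaning: " + queue.size());
    }
}

This drains the queue once and keeps only entries that are both fresh and within the size cap, which is what I want my two conditions in cleanQueue() to achieve.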