Java wordcount:一个普通的实现
我用 Java 实现了一个 wordcount 程序。基本上,该程序读取一个大文件(在我的测试中,我使用了一个仅包含数字的 10GB 数据文件),并统计每个“单词”出现的次数——在本例中,单词就是一个数字(例如 23723 可能在文件中出现 243 次)。下面是我的实现。我想改进它,主要考虑性能,但也有其他一些方面希望得到指导。
标签:java, multithreading, concurrenthashmap, word-count, worker-pool
以下是我希望解决的几个问题:
目前,该程序是多线程的,并且工作正常。我的做法是把一块内存(500MB/NUM_THREADS)传给每个线程,然后由每个线程进行单词计数。这里的问题是,主线程要等所有线程都完成之后,才会向各线程传递更多数据。这不算太大的问题,但会有一段时间,一些线程空等而什么也不做。我相信某种工作线程池(worker pool)或执行器服务(ExecutorService)可以解决这个问题(我还没有学会相关的语法)。
每个单词都是 int
,因此没有内存问题。我希望能够使用某种分隔符,无论该分隔符是空格还是几个字符 public class BigCount2 {
/**
 * Counts how often each decimal number occurs in the data file, using a batch of
 * worker threads per pass over the file.
 *
 * <p>The file is consumed in batches of at most {@code num} chunks. Each chunk is handed
 * to its own {@link BigCountThread2}; the main thread waits for the whole batch before
 * reading the next one, which bounds the amount of chunk data alive at any time.
 *
 * @param args {@code args[0]} is the number of worker threads per batch (must be positive)
 * @throws IOException if the data file cannot be opened or read
 * @throws InterruptedException if the main thread is interrupted while waiting for workers
 * @throws IllegalArgumentException if the thread count is missing or not positive
 */
public static void main(String[] args) throws IOException, InterruptedException {
    if (args.length < 1) {
        throw new IllegalArgumentException("usage: BigCount2 <numThreads>");
    }
    int num = Integer.parseInt(args[0]);
    if (num <= 0) {
        throw new IllegalArgumentException("numThreads must be positive, got " + num);
    }

    String delimiterString = " ";
    ArrayList<Character> delim = new ArrayList<Character>();
    for (char c : delimiterString.toCharArray()) {
        delim.add(c);
    }

    // 512 MiB byte budget per batch, split evenly between the workers.
    int bytesToRead = 1024 * 1024 * 1024 / 2;
    // Never let the chunk size reach zero, otherwise the read loop could not advance.
    int byr = Math.max(1, bytesToRead / num);

    String filepath = "C:/Users/Daniel/Desktop/int-dataset-10g.dat";
    ConcurrentMap<Integer, Integer> wordCountMap = new ConcurrentHashMap<Integer, Integer>(25000);
    RandomAccessFile file = new RandomAccessFile(filepath, "r");
    try {
        long fileLength = file.length();
        byte[] byteArray = new byte[byr]; // scratch buffer, reused for every chunk
        Thread[] t = new Thread[num];
        long pos = 0;

        while (pos < fileLength) {
            int started = 0;
            while (started < num && pos < fileLength) {
                file.seek(pos);
                int filled = readChunk(file, byteArray);
                if (filled <= 0) {
                    // The file ended earlier than file.length() reported; stop reading.
                    pos = fileLength;
                    break;
                }

                // Unless this chunk reaches the end of the file, end it just after its last
                // separator so that no number is split between two workers.
                int usable = filled;
                if (pos + filled < fileLength) {
                    usable = lastTokenBoundary(byteArray, filled);
                }

                // Widen only the bytes that were actually read (one byte -> one char), so
                // stale buffer content is never counted and no platform charset is involved.
                char[] chunk = new char[usable];
                for (int c = 0; c < usable; c++) {
                    chunk[c] = (char) (byteArray[c] & 0xFF);
                }

                t[started] = new Thread(new BigCountThread2(started, chunk, delim, wordCountMap));
                t[started].start();
                started++;
                pos += usable;
            }

            // Wait only for the workers that were really started in this batch.
            for (int k = 0; k < started; k++) {
                t[k].join();
            }
        }
    } finally {
        file.close();
    }
}

/** Fills {@code buffer} from the file's current position; returns the number of bytes read (0 at EOF). */
private static int readChunk(RandomAccessFile file, byte[] buffer) throws IOException {
    int filled = 0;
    while (filled < buffer.length) {
        int n = file.read(buffer, filled, buffer.length - filled);
        if (n < 0) {
            break;
        }
        filled += n;
    }
    return filled;
}

/**
 * Returns the index just past the last non-digit byte within the first {@code filled} bytes,
 * or {@code filled} when those bytes are one unbroken run of ASCII digits.
 */
private static int lastTokenBoundary(byte[] buffer, int filled) {
    for (int idx = filled - 1; idx >= 0; idx--) {
        if (buffer[idx] < '0' || buffer[idx] > '9') {
            return idx + 1;
        }
    }
    return filled;
}
}
/**
 * Worker that tallies the non-negative decimal numbers found in one chunk of text into a
 * map shared with other workers.
 *
 * <p>A "word" is a maximal run of decimal digits; every other character separates words.
 * Several instances may run concurrently against the same map: each increment is applied
 * atomically, so no update is lost.
 */
class BigCountThread2 implements Runnable {
    private final ConcurrentMap<Integer, Integer> wordCountMap;
    char[] newbyte;
    // Kept for interface compatibility; run() treats every non-digit character as a separator.
    private final ArrayList<Character> delim;
    private final int threadId; // use for later

    /**
     * @param tid          identifier of this worker (stored, not otherwise used)
     * @param newbyte      characters of the chunk to scan
     * @param delim        delimiter characters (stored, not consulted by {@link #run()})
     * @param wordCountMap shared map from number to occurrence count, updated by {@link #run()}
     */
    BigCountThread2(int tid,
                    char[] newbyte,
                    ArrayList<Character> delim,
                    ConcurrentMap<Integer, Integer> wordCountMap) {
        this.delim = delim;
        threadId = tid;
        this.wordCountMap = wordCountMap;
        this.newbyte = newbyte;
    }

    /**
     * Scans the chunk once and adds one to the shared count of every number it contains.
     * Runs of separators record nothing, and a number that ends exactly at the end of the
     * chunk is still counted.
     */
    @Override
    public void run() {
        int value = 0;
        boolean inWord = false; // true once at least one digit of the current word was seen
        for (int i = 0; i < newbyte.length; i++) {
            // Character.digit yields -1 for anything that is not a decimal digit, so letters
            // and other symbols act as separators instead of corrupting the value.
            int digit = Character.digit(newbyte[i], 10);
            if (digit < 0) {
                if (inWord) {
                    increment(value);
                }
                value = 0;
                inWord = false;
            } else {
                value = (value * 10) + digit;
                inWord = true;
            }
        }
        if (inWord) {
            increment(value); // last word had no trailing separator
        }
    }

    /** Atomically adds one to the count stored for {@code word}, retrying if another thread got in first. */
    private void increment(int word) {
        for (;;) {
            Integer current = wordCountMap.putIfAbsent(word, 1);
            if (current == null) {
                return; // first occurrence
            }
            if (wordCountMap.replace(word, current, current + 1)) {
                return;
            }
        }
    }
}
公共类BigCount2{
公共静态void main(字符串[]args)引发IOException、InterruptedException{
int num,计数器;
长i,j;
字符串分隔符字符串=”;
ArrayList delim=新的ArrayList();
for(char c:delimiterString.toCharArray()){
添加(c);
}
int计数器2=0;
num=Integer.parseInt(args[0]);
int bytesToRead=1024*1024*1024/2;//500 MB,循环大小
整数余数=bytesToRead%num;
int k=0;
bytesToRead=bytesToRead-余数;
int byr=bytesToRead/num;
String filepath=“C:/Users/Daniel/Desktop/int-dataset-10g.dat”;
RandomAccessFile文件=新的RandomAccessFile(文件路径,“r”);
线程[]t=新线程[num];//线程数组
ConcurrentMap wordCountMap=新的ConcurrentHashMap(25000);
byte[]byteArray=新字节[byr];//将500mb分配给2D字节数组
新细胞;
对于(i=0;i