Java:写入“压缩”数组以提高 IO 性能?
我有一个int和float数组,每个数组的长度为2.2亿(固定)。现在,我想在内存和磁盘中存储/上传这些阵列。目前,我正在使用JavaNIO的FileChannel和MappedByteBuffer来解决这个问题。它工作正常,但将阵列存储/上传到内存或从内存上传到磁盘大约需要5秒(挂钟时间)。现在,我想让它更快 在这里,我应该提到的是,大多数数组元素都是0(接近52%) 比如: 有人能帮我吗?有没有什么好方法可以通过不存储或加载那些0来提高速度。这可以用Arrays.fill(数组,0)补偿。< P>根据分布,考虑: 游程长度编码(RLE)是一种非常简单的数据压缩形式,其中数据的游程(即,相同数据值出现在许多连续数据元素中的序列)存储为单个数据值和计数,而不是原始游程这对包含许多此类运行的数据最有用。Java 写下;压缩的;阵列以提高IO性能?,java,arrays,io,compression,Java,Arrays,Io,Compression,我有一个int和float数组,每个数组的长度为2.2亿(固定)。现在,我想在内存和磁盘中存储/上传这些阵列。目前,我正在使用JavaNIO的FileChannel和MappedByteBuffer来解决这个问题。它工作正常,但将阵列存储/上传到内存或从内存上传到磁盘大约需要5秒(挂钟时间)。现在,我想让它更快 在这里,我应该提到的是,大多数数组元素都是0(接近52%) 比如: 有人能帮我吗?有没有什么好方法可以通过不存储或加载那些0来提高速度。这可以用Arrays.fill(数组,0)补偿。<
这很简单……这一点在这里既可能是优点,也可能是缺点 ;-) 如果您愿意自己编写序列化代码,那么不必存储所有的零,而是可以存储一系列范围(带有特殊标记)来指示这些零的位置,再加上实际的非零数据。因此,示例中的数组:{0,0,6,7,1,0,0…} 可存储为:%0-1,6,7,1,%5-6。读取此数据时,如果遇到 %,则表示接下来是一个范围:读取起始和结束下标,并把这段区间填充为零。然后继续读取;如果遇到的不是 %,则表示读到的是一个实际值
在具有大量连续值序列的稀疏数组中,这将产生巨大的压缩。以下方法需要磁盘上的n/8+nz*4字节,其中n是数组的大小,nz是非零条目的数量。对于52%的零条目,您可以将存储大小减少52%-3%=49% 你可以做:
/**
 * Writes {@code array} as: its length, a bitmap holding one bit per index
 * (bit set = the element is zero), then only the non-zero elements in index order.
 *
 * Relies on the surrounding sketch's helpers {@code write(int)} and
 * {@code write(BitSet)}, which are not shown here.
 *
 * @param array the values to store
 */
void write(int[] array) {
    // The length is stored explicitly: BitSet.length() only reports
    // "highest set bit + 1", which is shorter than the array whenever the
    // array ends with non-zero values.
    write(array.length);
    BitSet zeroes = new BitSet(array.length);
    for (int i = 0; i < array.length; i++) {
        zeroes.set(i, array[i] == 0);
    }
    write(zeroes); // one bit per index
    for (int i = 0; i < array.length; i++) {
        if (array[i] != 0) {
            write(array[i]); // was array[y], an undefined name
        }
    }
}

/**
 * Reads back an array stored by {@code write(int[])}: the length, the zero
 * bitmap, then one int for every index whose bit is not set.
 *
 * Relies on the surrounding sketch's helpers {@code readInt()} and
 * {@code readBitSet()}, which are not shown here.
 *
 * @return the reconstructed array
 */
int[] read() {
    int[] array = new int[readInt()];
    BitSet zeroes = readBitSet();
    for (int i = 0; i < array.length; i++) {
        // Indices flagged in the bitmap keep the default value 0.
        if (!zeroes.get(i)) {
            array[i] = readInt();
        }
    }
    return array;
}
void写入(int[]数组){
位集零=新位集();
for(int i=0;i
编辑:您说这稍微慢一点意味着磁盘不是瓶颈。您可以通过在构建位集时写入位集来优化上述方法,这样就不必在将位集写入磁盘之前将其写入内存。此外,通过逐字写入散布在实际数据中的位集,我们只能在阵列上执行一次传递,从而减少缓存未命中:
void write(int[] array) {
writeInt(array.length);
int ni;
for (int i = 0; i < array.length; i = ni) {
ni = i + 32;
int zeroesMap = 0;
for (j = i + 31; j >= i; j--) {
zeroesMap <<= 1;
if (array[j] == 0) {
zeroesMap |= 1;
}
}
writeInt(zeroesMap);
for (j = i; j < ni; j++)
if (array[j] != 0) {
writeInt(array[j]);
}
}
}
}
/**
 * Reads an array stored as: the length, then for every group of 32 elements
 * one int bitmap (bit k set = element i+k is zero) followed by that group's
 * non-zero elements.
 *
 * Relies on the surrounding sketch's {@code readInt()} helper, which is not
 * shown here. Like the matching writer, this assumes the stored length is a
 * multiple of 32.
 *
 * @return the reconstructed array
 */
int[] read() {
    int[] array = new int[readInt()];
    for (int i = 0; i < array.length; i += 32) {
        int ni = i + 32;
        int zeroesMap = readInt();
        for (int j = i; j < ni; j++) {
            // Parentheses are required: == binds tighter than & in Java.
            if ((zeroesMap & 1) == 1) {
                // nothing to do (array[j] was initialized to 0)
            } else {
                array[j] = readInt();
            }
            zeroesMap >>= 1;
        }
    }
    return array;
}
void写入(int[]数组){
writeInt(array.length);
国际货币基金组织;
对于(int i=0;i=i;j--){
zeroesMap=1;
}
}
返回数组;
}
(前面的代码假设array.length是32的倍数。如果不是,则以您喜欢的方式写入数组的最后一个片段)
如果这也不能减少处理时间,那么压缩就不是一条出路(我认为任何通用压缩算法都不会比上述算法更快)。java中有一个标准的压缩utils:java.util.zip——它是通用库,但由于完全可用,是一个不错的解决方案。如果需要,应该研究专门的压缩和编码,我很少推荐zip作为首选 下面是一个如何通过
Deflater/Inflater 处理 zip 压缩的示例。
大多数人都知道 ZipInput/Output 流(尤其是 Gzip)。它们都存在内存到 zlib 的额外拷贝问题,而 GZip 尤其是一场彻底的灾难,因为 CRC32 会调用本机代码(调用本机代码会使优化无法进行,并带来更多性能损失)
几点重要注意事项:不要把压缩级别设得太高,否则会彻底抵消性能收益——当然,可以多做尝试,调整出 CPU 与磁盘活动之间的最佳比例
这段代码还展示了java.util.zip
的一个真正缺点——它不支持直接缓冲区。这种支持非同小可,但没有人愿意去做。直接缓冲区将节省少量内存拷贝并减少内存占用
最后一点注意:有java版本的,它优于本机impl。对压缩相当好
package t1;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.util.Random;
import java.util.zip.DataFormatException;
import java.util.zip.Deflater;
import java.util.zip.Inflater;
/**
 * Stores an {@code int[]} in a file and reads it back.
 *
 * File layout: a 4-byte element count, followed by the elements (4 bytes each,
 * in {@link ByteBuffer}'s default byte order) compressed with
 * {@link Deflater} in zlib format.
 */
public class ZInt {
    /** Upper bound, in bytes, for the I/O and compression buffers (128KB). */
    private static final int bucketSize = 1 << 17; // in real world should not be const
    /** Deflater level; feel free to experiment, higher compression (5+) is likely to be a total waste. */
    static final int zipLevel = 2;

    /**
     * Compresses {@code a} into {@code file}, replacing any previous content.
     *
     * @param a    the values to store; may be empty
     * @param file the destination file
     * @param sync when {@code true}, the file descriptor is synced before the file is closed
     * @throws IOException if the file cannot be opened, written or synced
     */
    static void write(int[] a, File file, boolean sync) throws IOException {
        // Between 8KB and 128KB, always a multiple of 4 so a bucket holds whole ints.
        byte[] bucket = new byte[Math.min(bucketSize, Math.max(1 << 13, Integer.highestOneBit(a.length >> 3)))];
        byte[] zipOut = new byte[bucket.length];

        final FileOutputStream fout = new FileOutputStream(file);
        FileChannel channel = fout.getChannel();
        try {
            ByteBuffer buf = ByteBuffer.wrap(bucket);
            // unfortunately java.util.zip doesn't support direct buffers - that would be the perfect fit
            ByteBuffer out = ByteBuffer.wrap(zipOut);
            out.putInt(a.length); // write length aka header
            if (a.length == 0) {
                doWrite(channel, out, 0);
                return;
            }

            Deflater deflater = new Deflater(zipLevel, false);
            try {
                for (int i = 0; i < a.length;) {
                    i = put(a, buf, i);
                    buf.flip();
                    // The third argument is a length, not an end index.
                    deflater.setInput(bucket, buf.position(), buf.remaining());
                    if (i == a.length) {
                        deflater.finish();
                    }
                    // The first pass appends after the 4-byte header still pending in 'out'.
                    for (int n; (n = deflater.deflate(zipOut, out.position(), out.remaining())) > 0;) {
                        doWrite(channel, out, n);
                    }
                    buf.clear();
                }
            } finally {
                deflater.end();
            }
        } finally {
            // Nested so the channel is closed even when sync() fails.
            try {
                if (sync) {
                    fout.getFD().sync();
                }
            } finally {
                channel.close();
            }
        }
    }

    /**
     * Reads an array previously stored by {@link #write(int[], File, boolean)}.
     *
     * @param file the file to read
     * @return the stored values
     * @throws IOException         if the file cannot be read, is shorter than its header,
     *                             declares a negative length, or ends before all values are decoded
     * @throws DataFormatException if the compressed data is malformed
     */
    static int[] read(File file) throws IOException, DataFormatException {
        FileChannel channel = new FileInputStream(file).getChannel();
        try {
            long size = channel.size();
            if (size < 4) {
                throw new IOException("File too short to contain a length header: " + size + " bytes");
            }
            byte[] in = new byte[(int) Math.min(bucketSize, size)];
            ByteBuffer buf = ByteBuffer.wrap(in);
            // A single read is not guaranteed to deliver the whole header.
            while (buf.position() < 4) {
                if (channel.read(buf) < 0) {
                    throw new IOException("Unexpected end of file while reading the length header");
                }
            }
            buf.flip();

            int length = buf.getInt();
            if (length < 0) {
                throw new IOException("Corrupt length header: " + length);
            }
            int[] a = new int[length];
            if (a.length == 0) {
                return a;
            }

            int i = 0;
            // Sized in long arithmetic: a.length * 4 overflows int for very large arrays.
            byte[] inflated = new byte[(int) Math.min(1L << 17, 4L * a.length)];
            ByteBuffer intBuffer = ByteBuffer.wrap(inflated);
            Inflater inflater = new Inflater(false);
            try {
                do {
                    if (!buf.hasRemaining()) {
                        buf.clear();
                        channel.read(buf);
                        buf.flip();
                    }
                    inflater.setInput(in, buf.position(), buf.remaining());
                    buf.position(buf.limit()); // simulate all read

                    for (;;) {
                        int n = inflater.inflate(inflated, intBuffer.position(), intBuffer.remaining());
                        if (n == 0) {
                            break;
                        }
                        intBuffer.position(intBuffer.position() + n).flip();
                        // need at least 4 bytes to form an int
                        for (; intBuffer.remaining() > 3 && i < a.length; i++) {
                            a[i] = intBuffer.getInt();
                        }
                        // Keep the 0-3 leftover bytes for the next round.
                        intBuffer.compact();
                    }
                } while (channel.position() < channel.size() && i < a.length);
            } finally {
                inflater.end();
            }

            if (i < a.length) {
                // Report truncated data instead of returning a partly zero-filled array.
                throw new IOException("Truncated data: decoded " + i + " of " + a.length + " ints");
            }
            return a;
        } finally {
            channel.close();
        }
    }

    /**
     * Marks {@code n} more bytes of {@code out} as produced, writes everything
     * from the start of the buffer to the channel, then clears the buffer.
     */
    private static void doWrite(FileChannel channel, ByteBuffer out, int n) throws IOException {
        out.position(out.position() + n).flip();
        while (out.hasRemaining()) {
            channel.write(out);
        }
        out.clear();
    }

    /**
     * Copies ints from {@code a}, starting at index {@code i}, into {@code buf}
     * until the buffer is full or the array is exhausted.
     *
     * @return the index of the first element that was not copied
     */
    private static int put(int[] a, ByteBuffer buf, int i) {
        while (buf.hasRemaining() && i < a.length) {
            buf.putInt(a[i++]);
        }
        return i;
    }

    /** Builds a reproducible test array in which roughly half of the entries are zero. */
    private static int[] generateRandom(int len) {
        Random r = new Random(17);
        int[] n = new int[len];
        for (int i = 0; i < len; i++) {
            n[i] = r.nextBoolean() ? 0 : r.nextInt(1 << 23); // limit bounds to have any sensible compression
        }
        return n;
    }

    /** Demo: writes a random array to {@code xxx.xxx}, reads it back and compares the two. */
    public static void main(String[] args) throws Throwable {
        File file = new File("xxx.xxx");
        int[] n = generateRandom(3000000); // {0,2,4,1,2,3};
        long start = System.nanoTime();
        write(n, file, false);
        long elapsed = System.nanoTime() - start; // elapsed will be fairer if the sync is true
        System.out.printf("File length: %d, for %d ints, ratio %.2f in %.2fms %n", file.length(), n.length, ((double)file.length())/4/n.length, java.math.BigDecimal.valueOf(elapsed, 6) );
        int[] m = read(file);
        // compared by hand because Arrays.equals doesn't report the failing position
        for (int i = 0; i < n.length; i++) {
            if (m[i] != n[i]) {
                System.err.printf("Failed at %d%n",i);
                break;
            }
        }
        System.out.printf("All done!");
    }
}
t1包;
导入java.io.File;
导入java.io.FileInputStream;
导入java.io.FileOutputStream;
导入java.io.IOException;
导入java.nio.ByteBuffer;
导入java.nio.channels.FileChannel;
导入java.util.Random;
导入java.util.zip.DataFormatException;
导入java.util.zip.Deflater;
导入java.util.zip.Inflater;
公共类ZInt{
private static final int-bucketSize=1最简单的方法是利用java.util.zip
的内置压缩。这可能不是最佳压缩,但需要几行代码。@bestsss,谢谢。但是,如何使用zip压缩数组并将其存储并加载到内存中?您能给出一个小指针吗?@arpsss(以及相关的GZip/Zip流)。我想我应该使用deflatterOutputStream,因为同时使用DefalterOutputStream和gziOutStream(不指定大小)会导致糟糕的性能。@arpsss数字的范围是多少?可能在短[]或字节[]中?(也就是说,在编码每个数字时是否存在大量“浪费空间”)是什么让您认为数据包含运行?例如,序列0,1,0,2,0,3,0,4,0,5,0,6,0,7符合OP的要求,但没有任何运行。@meriton“Dep”
package t1;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.util.Random;
import java.util.zip.DataFormatException;
import java.util.zip.Deflater;
import java.util.zip.Inflater;
/**
 * Stores an {@code int[]} in a file and reads it back.
 *
 * File layout: a 4-byte element count, followed by the elements (4 bytes each,
 * in {@link ByteBuffer}'s default byte order) compressed with
 * {@link Deflater} in zlib format.
 */
public class ZInt {
    /** Upper bound, in bytes, for the I/O and compression buffers (128KB). */
    private static final int bucketSize = 1 << 17; // in real world should not be const
    /** Deflater level; feel free to experiment, higher compression (5+) is likely to be a total waste. */
    static final int zipLevel = 2;

    /**
     * Compresses {@code a} into {@code file}, replacing any previous content.
     *
     * @param a    the values to store; may be empty
     * @param file the destination file
     * @param sync when {@code true}, the file descriptor is synced before the file is closed
     * @throws IOException if the file cannot be opened, written or synced
     */
    static void write(int[] a, File file, boolean sync) throws IOException {
        // Between 8KB and 128KB, always a multiple of 4 so a bucket holds whole ints.
        byte[] bucket = new byte[Math.min(bucketSize, Math.max(1 << 13, Integer.highestOneBit(a.length >> 3)))];
        byte[] zipOut = new byte[bucket.length];

        final FileOutputStream fout = new FileOutputStream(file);
        FileChannel channel = fout.getChannel();
        try {
            ByteBuffer buf = ByteBuffer.wrap(bucket);
            // unfortunately java.util.zip doesn't support direct buffers - that would be the perfect fit
            ByteBuffer out = ByteBuffer.wrap(zipOut);
            out.putInt(a.length); // write length aka header
            if (a.length == 0) {
                doWrite(channel, out, 0);
                return;
            }

            Deflater deflater = new Deflater(zipLevel, false);
            try {
                for (int i = 0; i < a.length;) {
                    i = put(a, buf, i);
                    buf.flip();
                    // The third argument is a length, not an end index.
                    deflater.setInput(bucket, buf.position(), buf.remaining());
                    if (i == a.length) {
                        deflater.finish();
                    }
                    // The first pass appends after the 4-byte header still pending in 'out'.
                    for (int n; (n = deflater.deflate(zipOut, out.position(), out.remaining())) > 0;) {
                        doWrite(channel, out, n);
                    }
                    buf.clear();
                }
            } finally {
                deflater.end();
            }
        } finally {
            // Nested so the channel is closed even when sync() fails.
            try {
                if (sync) {
                    fout.getFD().sync();
                }
            } finally {
                channel.close();
            }
        }
    }

    /**
     * Reads an array previously stored by {@link #write(int[], File, boolean)}.
     *
     * @param file the file to read
     * @return the stored values
     * @throws IOException         if the file cannot be read, is shorter than its header,
     *                             declares a negative length, or ends before all values are decoded
     * @throws DataFormatException if the compressed data is malformed
     */
    static int[] read(File file) throws IOException, DataFormatException {
        FileChannel channel = new FileInputStream(file).getChannel();
        try {
            long size = channel.size();
            if (size < 4) {
                throw new IOException("File too short to contain a length header: " + size + " bytes");
            }
            byte[] in = new byte[(int) Math.min(bucketSize, size)];
            ByteBuffer buf = ByteBuffer.wrap(in);
            // A single read is not guaranteed to deliver the whole header.
            while (buf.position() < 4) {
                if (channel.read(buf) < 0) {
                    throw new IOException("Unexpected end of file while reading the length header");
                }
            }
            buf.flip();

            int length = buf.getInt();
            if (length < 0) {
                throw new IOException("Corrupt length header: " + length);
            }
            int[] a = new int[length];
            if (a.length == 0) {
                return a;
            }

            int i = 0;
            // Sized in long arithmetic: a.length * 4 overflows int for very large arrays.
            byte[] inflated = new byte[(int) Math.min(1L << 17, 4L * a.length)];
            ByteBuffer intBuffer = ByteBuffer.wrap(inflated);
            Inflater inflater = new Inflater(false);
            try {
                do {
                    if (!buf.hasRemaining()) {
                        buf.clear();
                        channel.read(buf);
                        buf.flip();
                    }
                    inflater.setInput(in, buf.position(), buf.remaining());
                    buf.position(buf.limit()); // simulate all read

                    for (;;) {
                        int n = inflater.inflate(inflated, intBuffer.position(), intBuffer.remaining());
                        if (n == 0) {
                            break;
                        }
                        intBuffer.position(intBuffer.position() + n).flip();
                        // need at least 4 bytes to form an int
                        for (; intBuffer.remaining() > 3 && i < a.length; i++) {
                            a[i] = intBuffer.getInt();
                        }
                        // Keep the 0-3 leftover bytes for the next round.
                        intBuffer.compact();
                    }
                } while (channel.position() < channel.size() && i < a.length);
            } finally {
                inflater.end();
            }

            if (i < a.length) {
                // Report truncated data instead of returning a partly zero-filled array.
                throw new IOException("Truncated data: decoded " + i + " of " + a.length + " ints");
            }
            return a;
        } finally {
            channel.close();
        }
    }

    /**
     * Marks {@code n} more bytes of {@code out} as produced, writes everything
     * from the start of the buffer to the channel, then clears the buffer.
     */
    private static void doWrite(FileChannel channel, ByteBuffer out, int n) throws IOException {
        out.position(out.position() + n).flip();
        while (out.hasRemaining()) {
            channel.write(out);
        }
        out.clear();
    }

    /**
     * Copies ints from {@code a}, starting at index {@code i}, into {@code buf}
     * until the buffer is full or the array is exhausted.
     *
     * @return the index of the first element that was not copied
     */
    private static int put(int[] a, ByteBuffer buf, int i) {
        while (buf.hasRemaining() && i < a.length) {
            buf.putInt(a[i++]);
        }
        return i;
    }

    /** Builds a reproducible test array in which roughly half of the entries are zero. */
    private static int[] generateRandom(int len) {
        Random r = new Random(17);
        int[] n = new int[len];
        for (int i = 0; i < len; i++) {
            n[i] = r.nextBoolean() ? 0 : r.nextInt(1 << 23); // limit bounds to have any sensible compression
        }
        return n;
    }

    /** Demo: writes a random array to {@code xxx.xxx}, reads it back and compares the two. */
    public static void main(String[] args) throws Throwable {
        File file = new File("xxx.xxx");
        int[] n = generateRandom(3000000); // {0,2,4,1,2,3};
        long start = System.nanoTime();
        write(n, file, false);
        long elapsed = System.nanoTime() - start; // elapsed will be fairer if the sync is true
        System.out.printf("File length: %d, for %d ints, ratio %.2f in %.2fms %n", file.length(), n.length, ((double)file.length())/4/n.length, java.math.BigDecimal.valueOf(elapsed, 6) );
        int[] m = read(file);
        // compared by hand because Arrays.equals doesn't report the failing position
        for (int i = 0; i < n.length; i++) {
            if (m[i] != n[i]) {
                System.err.printf("Failed at %d%n",i);
                break;
            }
        }
        System.out.printf("All done!");
    }
}