Java 将大型gzip数据文件上载到HDFS
我有一个用例:需要把大的 Gzip 文本数据文件(约 60GB)上传到 HDFS。下面的(伪)代码上传每个 500MB 的文件大约需要 2 小时。想请教如何缩短这个时间。相关变量:int fileFetchBuffer = 500000000; int offset = 0; int bytesRead = -1;
// Reads a local (optionally gzip-compressed) file and writes its contents to
// HDFS, split into files of at most fileFetchBuffer bytes each.
//
// NOTE(review): GZIPInputStream decompresses while reading, so what is written
// to HDFS is the UNCOMPRESSED content split into fileFetchBuffer-sized files.
// If the goal is to copy the .gz bytes to HDFS verbatim, read fileStream
// directly (e.g. with IOUtils.copy) — decompressing through a single ~500MB
// buffer is a large part of the 2h runtime. TODO confirm intent with caller.
try {
    fileStream = new FileInputStream(file);
    if (fileName.endsWith(".gz")) {
        stream = new GZIPInputStream(fileStream);
        String[] fileN = fileName.split("\\.");
        System.out.println("fil 0 : " + fileN[0]);
        System.out.println("fil 1 : " + fileN[1]);
        byte[] buffer = new byte[fileFetchBuffer];
        FileSystem fs = FileSystem.get(conf);
        boolean endOfStream = false;
        // Each outer iteration fills at most one HDFS output file with up to
        // fileFetchBuffer bytes.
        while (!endOfStream) {
            int charsLeft = fileFetchBuffer;
            FSDataOutputStream dos = null;
            try {
                while (charsLeft > 0) {
                    bytesRead = stream.read(buffer, 0, charsLeft);
                    if (bytesRead < 0) {
                        endOfStream = true;
                        break;
                    }
                    offset = offset + bytesRead;
                    charsLeft = charsLeft - bytesRead;
                    if (dos == null) {
                        // Create the output file lazily so no empty file is
                        // produced when EOF falls exactly on a chunk boundary.
                        String outFileStr = Utils.getOutputFileName(
                                stagingDir,
                                fileN[0],
                                outFileNum);
                        Path outFile = new Path(outFileStr);
                        if (fs.exists(outFile)) {
                            fs.delete(outFile, false);
                        }
                        dos = fs.create(outFile);
                    }
                    dos.write(buffer, 0, bytesRead);
                }
            } finally {
                // Fixes the original's NPE/double-close: dos was flushed and
                // closed up to three times, including when it was still null
                // (EOF on the first read of a chunk).
                if (dos != null) {
                    dos.flush();
                    dos.close();
                }
            }
            if (dos != null) {
                logger.info("done writing: " + outFileNum);
                outFileNum++;
            }
        }
    } else {
        // Assume uncompressed file
        stream = fileStream;
    }
} catch(FileNotFoundException e) {
    logger.error("File not found" + e);
}
试试看{
fileStream=新文件输入流(文件);
if(fileName.endsWith(“.gz”)){
stream=新的gzip输入流(fileStream);
BufferedReader reader=新的BufferedReader(新的InputStreamReader(流));
字符串[]fileN=fileName.split(“\\”);
System.out.println(“文件0:+fileN[0]);
System.out.println(“文件1:+fileN[1]);
//info(“第一行是:”+streamBuff.readLine());
字节[]缓冲区=新字节[fileFetchBuffer];
FileSystem fs=FileSystem.get(conf);
int charsLeft=fileFetchBuffer;
while(true){
charsLeft=fileFetchBuffer;
logger.info(“charsLeft外部,而“+charsLeft”);
FSDataOutputStream dos=null;
while(charsLeft!=0){
bytesRead=stream.read(缓冲区,0,charsLeft);
如果(字节读取<0){
dos.flush();
dos.close();
打破
}
偏移量=偏移量+字节读取;
charsLeft=charsLeft-字节读;
logger.info(“记录中的偏移量:”+偏移量);
logger.info(“charsLeft:+charsLeft”);
logger.info(“记录中的bytesRead:+bytesRead”);
//prettyPrintHex(缓冲区);
字符串outFileStr=Utils.getOutputFileName(
stagingDir,
fileN[0],
外流);
if(dos==null){
路径outFile=新路径(outFileStr);
如果(fs.存在(输出文件)){
fs.删除(outFile,false);
}
dos=fs.create(输出文件);
}
写(缓冲区,0,字节读);
}
logger.info(“完成写入:+outFileNum”);
dos.flush();
dos.close();
如果(字节读取<0){
dos.flush();
dos.close();
打破
}
outFileNum++;
}//如果结束
}否则{
//假定未压缩文件
流=文件流;
}
}catch(filenotfounde异常){
logger.error(“未找到文件”+e);
}
您应该考虑使用 Apache Commons IO。
它有一个方法
IOUtils.copy( InputStream, OutputStream )
这将极大地减少复制文件所需的时间。
它有一个方法
IOUtils.copy( InputStream, OutputStream )
(提问者回复)我尝试使用缓冲输入流,但没有发现真正的区别。
我认为文件通道实现可能更有效。如果速度不够快,告诉我
package toto;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
public class Slicer {
private static final int BUFFER_SIZE = 50000;
public static void main(String[] args) {
try
{
slice( args[ 0 ], args[ 1 ], Long.parseLong( args[2]) );
}//try
catch (IOException e)
{
e.printStackTrace();
}//catch
catch( Exception ex )
{
ex.printStackTrace();
System.out.println( "Usage : toto.Slicer <big file> <chunk name radix > <chunks size>" );
}//catch
}//met
/**
* Slices a huge files in chunks.
* @param inputFileName the big file to slice.
* @param outputFileRadix the base name of slices generated by the slicer. All slices will then be numbered outputFileRadix0,outputFileRadix1,outputFileRadix2...
* @param chunkSize the size of chunks in bytes
* @return the number of slices.
*/
public static int slice( String inputFileName, String outputFileRadix, long chunkSize ) throws IOException
{
//I would had some code to pretty print the output file names
//I mean adding a couple of 0 before chunkNumber in output file name
//so that they all have same number of chars
//use java.io.File for that, estimate number of chunks, take power of 10, got number of leading 0s
//just to get some stats
long timeStart = System.currentTimeMillis();
long timeStartSlice = timeStart;
long timeEnd = 0;
//io streams and chunk counter
int chunkNumber = 0;
FileInputStream fis = null;
FileOutputStream fos = null;
try
{
//open files
fis = new FileInputStream( inputFileName );
fos = new FileOutputStream( outputFileRadix + chunkNumber );
//declare state variables
boolean finished = false;
byte[] buffer = new byte[ BUFFER_SIZE ];
int bytesRead = 0;
long bytesInChunk = 0;
while( !finished )
{
//System.out.println( "bytes to read " +(int)Math.min( BUFFER_SIZE, chunkSize - bytesInChunk ) );
bytesRead = fis.read( buffer,0, (int)Math.min( BUFFER_SIZE, chunkSize - bytesInChunk ) );
if( bytesRead == -1 )
finished = true;
else
{
fos.write( buffer, 0, bytesRead );
bytesInChunk += bytesRead;
if( bytesInChunk == chunkSize )
{
if( fos != null )
{
fos.close();
timeEnd = System.currentTimeMillis();
System.out.println( "Chunk "+chunkNumber + " has been generated in "+ (timeEnd - timeStartSlice) +" ms");
chunkNumber ++;
bytesInChunk = 0;
timeStartSlice = timeEnd;
System.out.println( "Creating slice number " + chunkNumber );
fos = new FileOutputStream( outputFileRadix + chunkNumber );
}//if
}//if
}//else
}//while
}
catch (Exception e)
{
System.out.println( "A problem occured during slicing : " );
e.printStackTrace();
}//catch
finally
{
//whatever happens close all files
System.out.println( "Closing all files.");
if( fis != null )
fis.close();
if( fos != null )
fos.close();
}//fin
timeEnd = System.currentTimeMillis();
System.out.println( "Total slicing time : " + (timeEnd - timeStart) +" ms" );
System.out.println( "Total number of slices "+ (chunkNumber +1) );
return chunkNumber+1;
}//met
}//class
package-toto;
导入java.io.FileInputStream;
导入java.io.FileOutputStream;
导入java.io.IOException;
公共类切片器{
专用静态最终整数缓冲区大小=50000;
公共静态void main(字符串[]args){
尝试
{
切片(args[0],args[1],Long.parseLong(args[2]);
}//试一试
捕获(IOE异常)
{
e、 printStackTrace();
}//抓住
捕获(例外情况除外)
{
例如printStackTrace();
System.out.println(“用法:toto.Slicer”);
}//抓住
}//遇见
/**
*将一个巨大的文件切成块。
*@param inputFileName要切片的大文件。
*@param outputFileRadix切片器生成的切片的基本名称。然后,所有切片将编号为outputFileRadix0、outputFileRadix1、outputFileRadix2。。。
*@param chunkSize块的大小(以字节为单位)
*@返回切片数。
*/
公共静态int切片(String inputFileName、String outputFileRadix、long chunkSize)引发IOException
{
//我需要一些代码来打印输出文件名
//我的意思是在输出文件名的chunkNumber之前添加两个0
//所以它们都有相同数量的字符
//使用java.io.File,估计块的数量,取10的幂,得到前导0的数量
//只是为了得到一些数据
long timeStart=System.currentTimeMillis();
long timeStartSlice=timeStart;
长时间结束=0;
//io流和块计数器
int chunkNumber=0;
FileInputStream fis=null;
FileOutputStream=null;
尝试
{
//打开文件
fis=新文件InputStream(inputFileName);
fos=新文件OutputStream(outputFileRadix+chunkNumber);
//声明状态变量
布尔完成=假;
字节[]缓冲区=新字节[缓冲区大小];
int字节读取=0;
long bytesInChunk=0;
当(!完成)
{
//System.out.println(“要读取的字节数”+(int)Math.min(缓冲区大小,chunkSize-bytesInChunk));
bytesRead=fis.read(buffer,0,(int)Math.min(buffer_SIZE,chunkSize-bytesInChunk));
如果(字节读==-1)
完成=正确;
其他的
{
fos.写入(缓冲区,0,字节读取);
bytesInChunk+=字节读取;
如果(byt)