在Java中读取HDFS和本地文件
我想读取文件路径,而不管它们是HDFS还是本地路径。目前,我使用前缀file://传递本地路径,使用前缀HDFS://传递HDFS路径,并编写如下代码在Java中读取HDFS和本地文件,java,hadoop,mapreduce,hdfs,Java,Hadoop,Mapreduce,Hdfs,我想读取文件路径,而不管它们是HDFS还是本地路径。目前,我使用前缀file://传递本地路径,使用前缀HDFS://传递HDFS路径,并编写如下代码 Configuration configuration = new Configuration(); FileSystem fileSystem = null; if (filePath.startsWith("hdfs://")) { fileSystem = FileSystem.get(configuration); } else if
Configuration configuration = new Configuration();
FileSystem fileSystem = null;
if (filePath.startsWith("hdfs://")) {
fileSystem = FileSystem.get(configuration);
} else if (filePath.startsWith("file://")) {
fileSystem = FileSystem.getLocal(configuration).getRawFileSystem();
}
从这里,我使用文件系统的API来读取文件
你能告诉我还有没有比这更好的方法吗?这有意义吗
public static void main(String[] args) throws IOException {
Configuration conf = new Configuration();
conf.addResource(new Path("/hadoop/projects/hadoop-1.0.4/conf/core-site.xml"));
conf.addResource(new Path("/hadoop/projects/hadoop-1.0.4/conf/hdfs-site.xml"));
BufferedReader br = new BufferedReader(new InputStreamReader(System.in));
System.out.println("Enter the file path...");
String filePath = br.readLine();
Path path = new Path(filePath);
FileSystem fs = path.getFileSystem(conf);
FSDataInputStream inputStream = fs.open(path);
System.out.println(inputStream.available());
fs.close();
}
如果你走这条路,你不必付支票。直接从Path获取文件系统,然后执行任意操作。您可以通过以下方式获取
文件系统
:
Configuration conf = new Configuration();
Path path = new Path(stringPath);
FileSystem fs = FileSystem.get(path.toUri(), conf);
您不需要判断路径是否以
hdfs://
或file://
开头。此API将完成此工作。请检查下面的代码段,其中列出了HDFS路径中的文件;即以hdfs://
开头的路径字符串。如果您可以提供Hadoop配置和本地路径,它还将列出本地文件系统中的文件;即以file://
开头的路径字符串
//helper method to get the list of files from the HDFS path
public static List<String> listFilesFromHDFSPath(Configuration hadoopConfiguration, String hdfsPath,
boolean recursive)
{
//resulting list of files
List<String> filePaths = new ArrayList<String>();
FileSystem fs = null;
//try-catch-finally all possible exceptions
try
{
//get path from string and then the filesystem
Path path = new Path(hdfsPath); //throws IllegalArgumentException, all others will only throw IOException
fs = path.getFileSystem(hadoopConfiguration);
//resolve hdfsPath first to check whether the path exists => either a real directory or o real file
//resolvePath() returns fully-qualified variant of the path
path = fs.resolvePath(path);
//if recursive approach is requested
if (recursive)
{
//(heap issues with recursive approach) => using a queue
Queue<Path> fileQueue = new LinkedList<Path>();
//add the obtained path to the queue
fileQueue.add(path);
//while the fileQueue is not empty
while (!fileQueue.isEmpty())
{
//get the file path from queue
Path filePath = fileQueue.remove();
//filePath refers to a file
if (fs.isFile(filePath))
{
filePaths.add(filePath.toString());
}
else //else filePath refers to a directory
{
//list paths in the directory and add to the queue
FileStatus[] fileStatuses = fs.listStatus(filePath);
for (FileStatus fileStatus : fileStatuses)
{
fileQueue.add(fileStatus.getPath());
} // for
} // else
} // while
} // if
else //non-recursive approach => no heap overhead
{
//if the given hdfsPath is actually directory
if (fs.isDirectory(path))
{
FileStatus[] fileStatuses = fs.listStatus(path);
//loop all file statuses
for (FileStatus fileStatus : fileStatuses)
{
//if the given status is a file, then update the resulting list
if (fileStatus.isFile())
filePaths.add(fileStatus.getPath().toString());
} // for
} // if
else //it is a file then
{
//return the one and only file path to the resulting list
filePaths.add(path.toString());
} // else
} // else
} // try
catch(Exception ex) //will catch all exception including IOException and IllegalArgumentException
{
ex.printStackTrace();
//if some problem occurs return an empty array list
return new ArrayList<String>();
} //
finally
{
//close filesystem; not more operations
try
{
if(fs != null)
fs.close();
} catch (IOException e)
{
e.printStackTrace();
} // catch
} // finally
//return the resulting list; list can be empty if given path is an empty directory without files and sub-directories
return filePaths;
} // listFilesFromHDFSPath
//helper method to list files from the local path in the local file system
public static List<String> listFilesFromLocalPath(String localPathString, boolean recursive)
{
//resulting list of files
List<String> localFilePaths = new ArrayList<String>();
//get the Java file instance from local path string
File localPath = new File(localPathString);
//this case is possible if the given localPathString does not exit => which means neither file nor a directory
if(!localPath.exists())
{
System.err.println("\n" + localPathString + " is neither a file nor a directory; please provide correct local path");
//return with empty list
return new ArrayList<String>();
} // if
//at this point localPath does exist in the file system => either as a directory or a file
//if recursive approach is requested
if (recursive)
{
//recursive approach => using a queue
Queue<File> fileQueue = new LinkedList<File>();
//add the file in obtained path to the queue
fileQueue.add(localPath);
//while the fileQueue is not empty
while (!fileQueue.isEmpty())
{
//get the file from queue
File file = fileQueue.remove();
//file instance refers to a file
if (file.isFile())
{
//update the list with file absolute path
localFilePaths.add(file.getAbsolutePath());
} // if
else //else file instance refers to a directory
{
//list files in the directory and add to the queue
File[] listedFiles = file.listFiles();
for (File listedFile : listedFiles)
{
fileQueue.add(listedFile);
} // for
} // else
} // while
} // if
else //non-recursive approach
{
//if the given localPathString is actually a directory
if (localPath.isDirectory())
{
File[] listedFiles = localPath.listFiles();
//loop all listed files
for (File listedFile : listedFiles)
{
//if the given listedFile is actually a file, then update the resulting list
if (listedFile.isFile())
localFilePaths.add(listedFile.getAbsolutePath());
} // for
} // if
else //it is a file then
{
//return the one and only file absolute path to the resulting list
localFilePaths.add(localPath.getAbsolutePath());
} // else
} // else
//return the resulting list; list can be empty if given path is an empty directory without files and sub-directories
return localFilePaths;
} // listFilesFromLocalPath
这项工作
package com.leerhdfs;
//import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.util.Progressable;
import java.io.*;
import java.net.URI;
import java.nio.charset.StandardCharsets;
public class ReadWriteHDFSExample {
public static void main(String[] args) throws IOException {
Path inFile = new Path(args[0]);
String destinosrc = args[1];
//InputStream in = new BufferedInputStream(new FileInputStream(localsrc));
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(URI.create(destinosrc), conf);
FSDataInputStream in = fs.open(inFile);
//Progressable ir viendo aumento 10%, 20%, 30%
OutputStream out = fs.create(new Path(destinosrc), new Progressable() {
public void progress() {
System.out.println("Leyendo y escribiendo...");
}
});
IOUtils.copyBytes(in ,out, 4096, true);
in.close();
}
}
你为什么对你目前的方法不满意?我本身并不不满意。我希望我的方法接收Path对象,我想知道是否有任何方法可以告诉我该路径是否属于本地或HDFS文件系统。我试着对路径进行一次旋转,并进行上面的比较,但没有成功。我必须访问路径上的toURI().toString()并执行此检查。我不确定是否需要为此创建新帖子。如果我应该的话,我很抱歉。或者,我的问题是,如果我有一个路径而不是字符串,那么如何查找文件路径是HDFS还是local。是否像我在第一篇文章中提到的那样,执行toURI().toString()并进行检查。或者执行一个toURI()并检查方案是否正确。谢谢……塔里克的解决方案也是这样。Path.getFileSystem将调用此文件系统。get(URI,配置)方法Hi Tariq,源本地文件是否可以位于边缘节点上?你也有mapper类的完整例子吗?虽然这段代码可以解决这个问题,但如何以及为什么解决这个问题将真正有助于提高你的文章质量,并可能导致更多的投票。请记住,你是在将来回答读者的问题,而不仅仅是现在提问的人。请在回答中添加解释,并说明适用的限制和假设。