Hadoop: accessing an HDFS file from a Spark worker node
I am working on a Spark application that needs to access and update objects stored as files in HDFS. I cannot figure out how to do this. If I create and use a FileSystem hdfs object:
boolean fileExists = hdfs.exists(new org.apache.hadoop.fs.Path(filePath));
if (fileExists) {
    JavaRDD<MyObject> modelRDD = sc.objectFile(filePath);
}
I get: java.lang.NullPointerException
This part of the code runs on the workers, so I assume it fails because it cannot access the Spark context. In that case, how can I access this HDFS file?
This HDFS file is reachable from the driver node. I could replace HDFS with Hive and store the data as a byte array in Hive, but even the HiveContext cannot be accessed from the worker nodes.
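(For illustration, a minimal sketch of one way to read such a file from inside a worker-side function without touching the driver's JavaSparkContext: build a Hadoop FileSystem inside the task. The NameNode URI and path are placeholders modeled on the code below, and it assumes the file holds a single plain Java-serialized MyObject rather than a SequenceFile written by saveAsObjectFile.)

FlatMapFunction<Iterator<Row>, MyObject> workerSideRead = new FlatMapFunction<Iterator<Row>, MyObject>() {
    @Override
    public List<MyObject> call(Iterator<Row> it) throws Exception {
        List<MyObject> out = new ArrayList<>();
        // Build the FileSystem inside the task, on the worker, instead of using the driver's sc
        org.apache.hadoop.conf.Configuration conf = new org.apache.hadoop.conf.Configuration();
        org.apache.hadoop.fs.FileSystem fs = org.apache.hadoop.fs.FileSystem.get(
                java.net.URI.create("hdfs://<mymachineurl>:9000"), conf);          // placeholder NameNode URI
        org.apache.hadoop.fs.Path path =
                new org.apache.hadoop.fs.Path("/metadata/objects/TEST");           // placeholder path
        MyObject object = null;
        if (fs.exists(path)) {
            try (java.io.ObjectInputStream in = new java.io.ObjectInputStream(fs.open(path))) {
                object = (MyObject) in.readObject();   // assumes one plain Java-serialized MyObject per file
            }
        }
        while (it.hasNext()) {
            Row row = it.next();
            // ... update 'object' from the row ...
            out.add(object);
        }
        return out;
    }
};

Passing the NameNode URI explicitly to FileSystem.get avoids depending on the Hadoop configuration files being present on the worker's classpath.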
Adding the complete code for better understanding:
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.hive.HiveContext;

public class MyProgram {

    private static JavaSparkContext sc;
    private static HiveContext hiveContext;
    private static String ObjectPersistenceDir = "/metadata/objects";
    private static org.apache.hadoop.fs.FileSystem hdfs;
    private static String NameNodeURI = "hdfs://<mymachineurl>:9000";

    // create and maintain a cache of objects for every run session
    //private static HashMap<String, MyObject> cacheObjects;

    public static void main(String... args) {
        System.out.println("Inside constructor: creating Spark context and Hive context");
        System.out.println("Starting Spark context and SQL context");

        sc = new JavaSparkContext(new SparkConf());
        hiveContext = new HiveContext(sc);

        //cacheObjects = new HashMap<>();
        //DataFrame loadedObjects = hiveContext.sql("select id, filepath from saved_objects where name = 'TEST'");
        //List<Row> rows = loadedObjects.collectAsList();
        //for (Row row : rows) {
        //    String key = (String) row.get(0);
        //    String value = (String) row.get(1);
        //    JavaRDD<MyObject> objectRDD = sc.objectFile(value);
        //    cacheObjects.put(key, objectRDD.first());
        //}

        DataFrame partitionedDF = hiveContext.sql("select * from mydata");
        String partitionColumnName = "id";
        JavaRDD<Row> partitionedRecs =
                partitionedDF.repartition(partitionedDF.col(partitionColumnName)).javaRDD();

        FlatMapFunction<Iterator<Row>, MyObject> flatMapSetup =
                new FlatMapFunction<java.util.Iterator<Row>, MyObject>() {
            List<MyObject> lm_list = new ArrayList<>();
            MyObject object = null;

            @Override
            public List<MyObject> call(java.util.Iterator<Row> it) throws Exception {
                // for every row, create a record and update the object
                while (it.hasNext()) {
                    Row row = it.next();
                    Object id = row.getAs(partitionColumnName); // assumption: the partition key is read from the row
                    if (object == null) {
                        String objectKey = "" + id;
                        //object = cacheObjects.get(objectKey);
                        String objectPath = ObjectPersistenceDir + "/" + "TEST" + "/" + id;
                        // NPE is thrown here: sc is null inside this worker-side function
                        JavaRDD<MyObject> objectRDD = sc.objectFile(objectPath);
                        object = objectRDD.collect().get(0);
                        // object not in cache means it has not been created yet
                        if (object == null) {
                            ObjectDef objectDef = new ObjectDef("TEST");
                            object = new MyObject(objectDef);
                        }
                    }
                    /*
                     * some update on object
                     */
                    String objectKey = "" + id;
                    //cacheObjects.put(objectKey, object);   // cacheObjects is commented out above
                    // Algorithm step 2.6: to save in hive, add to list
                    lm_list.add(object);
                } // while hasNext ends
                return lm_list;
            } // call(Iterator) ends
        }; // FlatMapFunction ends

        //todo_nidhi put all objects in collectedObject back to hive
        List<MyObject> collectedObject = partitionedRecs.mapPartitions(flatMapSetup).collect();
    }
}
Answer: If the data is stored on HDFS, it is accessible to both the driver and the workers. Moreover, the workers have access to the sparkContext; read the Spark documentation. I don't think you have a problem with data access. Also, you probably don't want to use objectFile but rather textFile.

Comment (asker): But when I try to access the JavaSparkContext object from inside the FlatMap function (which I assume runs somewhere on a worker node), it throws a NullPointerException.

Comment: Please give more information about the NPE and update the question with the relevant complete code.

Comment (asker): The NPE is at JavaRDD<MyObject> objectRDD = sc.objectFile(objectPath); if I try to print sc, it prints null.

Answer: You cannot use the JavaSparkContext in flatMap or foreach functions; you will end up with a NullPointerException. If you want to access extra data inside the flatMap, you need to collect it into a list outside of the map and then use that list inside the flatMap.
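(A minimal sketch of that suggestion, not code from the answer: load the object once on the driver with sc.objectFile, then hand it to the task through a broadcast variable instead of calling sc inside the closure. The path and the single-object assumption follow the question's code; someId is a hypothetical key known on the driver.)

// Driver side: sc is valid here
JavaRDD<MyObject> objectRDD = sc.objectFile(ObjectPersistenceDir + "/TEST/" + someId);   // someId: hypothetical
final org.apache.spark.broadcast.Broadcast<MyObject> broadcastObject = sc.broadcast(objectRDD.first());

FlatMapFunction<Iterator<Row>, MyObject> flatMapSetup = new FlatMapFunction<Iterator<Row>, MyObject>() {
    @Override
    public List<MyObject> call(Iterator<Row> it) throws Exception {
        List<MyObject> out = new ArrayList<>();
        MyObject object = broadcastObject.value();   // worker side: no SparkContext needed
        while (it.hasNext()) {
            Row row = it.next();
            // ... update 'object' from the row ...
            out.add(object);
        }
        return out;
    }
};

List<MyObject> collectedObject = partitionedRecs.mapPartitions(flatMapSetup).collect();

Simply referencing a collected List<MyObject> local variable from inside the closure works as well, since variables captured by the function are serialized and shipped to the workers; the broadcast variant just avoids re-sending the data with every task.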