Apache Spark:如何在 Apache Spark 2 中从 Excel 文件读取多张工作表

Apache spark 如何在ApacheSpark2中从excel文件读取多张工作表,apache-spark,apache-spark-sql,Apache Spark,Apache Spark Sql,如何通过在spark2中读取excel文件的多个工作表来创建数据框 注意:从excel文件(单个文件)读取多个工作表,而不是单个工作表 def readExcel(file: String): DataFrame = sqlContext.read .format("com.crealytics.spark.excel") .option("location", file) .option("useHeader", "true") .option("treatEmptyValuesAsNulls

如何在 Spark 2 中读取 Excel 文件的多个工作表,并据此创建 DataFrame?

注意:从excel文件(单个文件)读取多个工作表,而不是单个工作表

/** Loads an Excel file into a DataFrame through the `com.crealytics.spark.excel` data source.
  *
  * The reader is configured with a fixed set of options (see `excelOptions` below); no sheet
  * name is passed, so which sheet is read is left to the data source's default.
  *
  * @param file location of the Excel file, forwarded to the reader as the `location` option
  * @return the DataFrame produced by the reader
  */
def readExcel(file: String): DataFrame = {
  // All reader settings gathered in one place; keys and values are passed through verbatim.
  val excelOptions = Map(
    "location" -> file,
    "useHeader" -> "true",
    "treatEmptyValuesAsNulls" -> "true",
    "inferSchema" -> "true",
    "addColorColumns" -> "False"
  )

  sqlContext.read
    .format("com.crealytics.spark.excel")
    .options(excelOptions)
    .load()
}
// Load the workbook through readExcel; the argument is a placeholder for the real file path.
val data = readExcel("excel文件的路径")

// Print the rows with truncation disabled so long cell values are shown in full.
data.show(false)

public void loadMultipleExcel()引发异常{

try {

    File file = new File("your xls file path");
    FileInputStream fIP = new FileInputStream(file)
   StructType  customSchema = new StructType(new StructField[] {

            new StructField("year", DataTypes.DateType, true,Metadata.empty()),
            new StructField("make", DataTypes.DateType, true,Metadata.empty()),
            new StructField("model", DataTypes.DoubleType,true,Metadata.empty()),
            new StructField("comment", DataTypes.StringType,true,Metadata.empty()),
            new StructField("blank", DataTypes.StringType,true,Metadata.empty())});
    //Get the workbook instance for XLSX file 
    XSSFWorkbook wb = new XSSFWorkbook(fIP);
System.out.println(wb.getNumberOfSheets()); 对于(int i=0;i
XSSFSheet sheet=wb.getSheetAt(i);
System.out.print(sheet.getSheetName()+“\n”);
Dataset Dataset=spark.read().format(“com.crealytics.spark.excel”)
.option(“sheetName”,sheet.getSheetName())
.选项(“useHeader”、“true”)
.option(“推断模式”,true)
.option(“位置”、“文件路径”)
.选项(“treatEmptyValuesAsNulls”,true)
.选项(“addColorColumns”、“false”)
.加载(“excel表格名称”);
行标题=dataSet.first();
dataSet=dataSet.filter(x->x!=标题);
dataSet.printSchema();
dataSet.show();
}
}捕获(例外情况除外){
例如printStackTrace();
}
}

public void loadMultipleExcel()引发异常{
public void loadMultipleExcel()throws Exception{
    try {
        File file = new File("/PATH TO YOUR XLS FILE INCLUDING FILENAME");
        FileInputStream fIP = new FileInputStream(file)

//use customSchema when the schema is not inferred properly orelse ignore go ahead with inferSchema=true when creating dataset

        StructType  customSchema = new StructType(new StructField[] {
            new StructField("year", DataTypes.DateType, true,Metadata.empty()),
            new StructField("make", DataTypes.DateType, true,Metadata.empty()),
            new StructField("model", DataTypes.DoubleType,true,Metadata.empty()),
            new StructField("comment", DataTypes.StringType,true,Metadata.empty()),
            new StructField("blank", DataTypes.StringType,true,Metadata.empty())});



//Get the workbook instance for XLSX file 
//import org.apache.poi.xssf.usermodel.XSSFSheet;
//import org.apache.poi.xssf.usermodel.XSSFWorkbook;




            XSSFWorkbook wb = new XSSFWorkbook(fIP);
            System.out.println(wb.getNumberOfSheets());
//get the 1st sheet as dataset1
            XSSFSheet sheet1 = wb.getSheetAt(0);
            Dataset<Row> ds1 =spark.read().format("com.crealytics.spark.excel")
            .option("sheetName", sheet1.getSheetName())
            .option("useHeader", "true")
            .option("inferSchema", true)
            .option("location", "/PATH TO YOUR XLS FILE INCLUDING FILENAME")
            .option("treatEmptyValuesAsNulls", true)
            .option("addColorColumns", "false") 
            .load("NAME OF YOUR XLSM OR XLS FILE");

            Row  header1 = ds1.first() ;
            ds1 = ds1.filter(x->x != header1) ; 

//get rest of the sheets as dataset2 and union it with dataset1
        for (int i = 1; i < wb.getNumberOfSheets(); i++) {
            XSSFSheet sheet = wb.getSheetAt(i);
            System.out.print(sheet.getSheetName()+"\n");
            Dataset<Row> ds2 =spark.read().format("com.crealytics.spark.excel")
                    .option("sheetName", sheet.getSheetName())
                    .option("useHeader", "true")
                    .option("inferSchema", true)
                    .option("location", "/PATH TO YOUR XLS FILE INCLUDING FILENAME")
                    .option("treatEmptyValuesAsNulls", true)
                    .option("addColorColumns", "false") 
                    .load("NAME OF YOUR XLSM OR XLS FILE");

            Row  header = ds2.first() ;
            ds2 = ds2.filter(x->x != header) ; 
            ds1=ds1.union(ds2);
            ds2.unpersist();
            ds2=null;

    } catch (Exception  ex ) {
        ex.printStackTrace();
    }
}   
试一试{ File File=新文件(“/XLS文件的路径,包括文件名”); FileInputStream fIP=新的FileInputStream(文件) //当模式推断不正确时使用customSchema,或者在创建数据集时忽略继续推断schema=true StructType customSchema=new StructType(new StructField[]{ new StructField(“年”,DataTypes.DateType,true,Metadata.empty()), 新建StructField(“make”,DataTypes.DateType,true,Metadata.empty()), 新的StructField(“model”,DataTypes.DoubleType,true,Metadata.empty()), 新建StructField(“comment”,DataTypes.StringType,true,Metadata.empty()), 新的StructField(“blank”,DataTypes.StringType,true,Metadata.empty())}); //获取XLSX文件的工作簿实例 //导入org.apache.poi.xssf.usermodel.xssfheet; //导入org.apache.poi.xssf.usermodel.xssf工作簿; XSSF工作簿wb=新XSSF工作簿(fIP); System.out.println(wb.getNumberOfSheets()); //将第一张图纸作为数据集1获取 XSSFSheet sheet1=wb.getSheetAt(0); 数据集ds1=spark.read().format(“com.crealytics.spark.excel”) .option(“sheetName”,sheet1.getSheetName()) .选项(“useHeader”、“true”) .option(“推断模式”,true) .option(“位置”,“XLS文件的路径,包括文件名”) .选项(“treatEmptyValuesAsNulls”,true) .选项(“addColorColumns”、“false”) .load(“您的XLSM或XLS文件的名称”); 行标题1=ds1.first(); ds1=ds1.过滤器(x->x!=header1); //将其余的工作表作为dataset2,并将其与dataset1合并 对于(int i=1;ix!=标题); ds1=ds1.union(ds2); ds2.unpersist(); ds2=null; }捕获(例外情况除外){ 例如printStackTrace(); } }
请重新格式化您的源代码,并补充一些说明,解释这段代码是如何解决问题的。上面的代码片段无法运行:它会抛出 IllegalArgumentException(非法参数异常),提示应改用 “path”。改用 “path” 之后,又只会读取第一张工作表。但是,我的问题是如何使用 Spark 2.0 读取多张 Excel 工作表。此问题并不是重复问题,请再次阅读本问题并与另一个问题对照;我仍然没有得到答案,因此请重新打开它。
public void loadMultipleExcel()throws Exception{
    try {
        File file = new File("/PATH TO YOUR XLS FILE INCLUDING FILENAME");
        FileInputStream fIP = new FileInputStream(file)

//use customSchema when the schema is not inferred properly orelse ignore go ahead with inferSchema=true when creating dataset

        StructType  customSchema = new StructType(new StructField[] {
            new StructField("year", DataTypes.DateType, true,Metadata.empty()),
            new StructField("make", DataTypes.DateType, true,Metadata.empty()),
            new StructField("model", DataTypes.DoubleType,true,Metadata.empty()),
            new StructField("comment", DataTypes.StringType,true,Metadata.empty()),
            new StructField("blank", DataTypes.StringType,true,Metadata.empty())});



//Get the workbook instance for XLSX file 
//import org.apache.poi.xssf.usermodel.XSSFSheet;
//import org.apache.poi.xssf.usermodel.XSSFWorkbook;




            XSSFWorkbook wb = new XSSFWorkbook(fIP);
            System.out.println(wb.getNumberOfSheets());
//get the 1st sheet as dataset1
            XSSFSheet sheet1 = wb.getSheetAt(0);
            Dataset<Row> ds1 =spark.read().format("com.crealytics.spark.excel")
            .option("sheetName", sheet1.getSheetName())
            .option("useHeader", "true")
            .option("inferSchema", true)
            .option("location", "/PATH TO YOUR XLS FILE INCLUDING FILENAME")
            .option("treatEmptyValuesAsNulls", true)
            .option("addColorColumns", "false") 
            .load("NAME OF YOUR XLSM OR XLS FILE");

            Row  header1 = ds1.first() ;
            ds1 = ds1.filter(x->x != header1) ; 

//get rest of the sheets as dataset2 and union it with dataset1
        for (int i = 1; i < wb.getNumberOfSheets(); i++) {
            XSSFSheet sheet = wb.getSheetAt(i);
            System.out.print(sheet.getSheetName()+"\n");
            Dataset<Row> ds2 =spark.read().format("com.crealytics.spark.excel")
                    .option("sheetName", sheet.getSheetName())
                    .option("useHeader", "true")
                    .option("inferSchema", true)
                    .option("location", "/PATH TO YOUR XLS FILE INCLUDING FILENAME")
                    .option("treatEmptyValuesAsNulls", true)
                    .option("addColorColumns", "false") 
                    .load("NAME OF YOUR XLSM OR XLS FILE");

            Row  header = ds2.first() ;
            ds2 = ds2.filter(x->x != header) ; 
            ds1=ds1.union(ds2);
            ds2.unpersist();
            ds2=null;

    } catch (Exception  ex ) {
        ex.printStackTrace();
    }
}