Java CSVRecordReader and un-terminated quoted field at end of CSV line


I'm having a problem with the datasets I'm using. They are CSVs containing fake news. My problem is with the CSVRecordReader class that DataVec (Deeplearning4j) provides; I'm trying to run a Spark transform process on the data. The error I hit is the well-known "Un-terminated quoted field at end of CSV line".

Searching the Internet, everyone suggests finding the offending lines and fixing them in the CSV, but that would be very hard here because the dataset contains the body of the articles (which may be real or fake), and those articles contain a great many quotation marks, as is typical of news text.

Looking for a solution, I ended up implementing my own CSVRecordReader on top of the Univocity CSV parser library, which is very flexible and solves all the problems the stock CSVRecordReader has. But now I've run into another difficulty: the library's parser does not implement the Serializable interface, so running the transform in Apache Spark throws an exception:

org.apache.spark.SparkException: Task not serializable, caused by: java.io.NotSerializableException: com.univocity.parsers.csv.CsvParser

How can I solve my problem?

The code of my own CSVRecordReader:

package cu.desoft.cav.RecordReader;

import com.univocity.parsers.common.IterableResult;
import com.univocity.parsers.common.ParsingContext;
import com.univocity.parsers.common.ResultIterator;
import com.univocity.parsers.csv.CsvParser;
import com.univocity.parsers.csv.CsvParserSettings;
import org.datavec.api.records.Record;
import org.datavec.api.records.metadata.RecordMetaData;
import org.datavec.api.records.metadata.RecordMetaDataLine;
import org.datavec.api.records.reader.impl.LineRecordReader;
import org.datavec.api.split.FileSplit;
import org.datavec.api.split.InputSplit;
import org.datavec.api.writable.Text;
import org.datavec.api.writable.Writable;

import java.io.File;
import java.io.IOException;
import java.net.URI;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;

/**
 * @author: Acosta email: yunielacost738@gmail.com
 * created at: 11/25/2019
 */

public class UltraCSVRecordReader extends LineRecordReader {
    public static final char DEFAULT_DELIMITER = ',';
    public static final char DEFAULT_QUOTE = '"';
    public static final char DEFAULT_QUOTE_ESCAPE = '"';
    public static final char DEFAULT_CHAR_TO_ESCAPE_QUOTE_ESCAPING = '\0';
    private CsvParser csvParser;
    private CsvParserSettings settings;
    private ResultIterator<String[], ParsingContext> iterator;
    public UltraCSVRecordReader() {
        this(0, DEFAULT_DELIMITER, DEFAULT_QUOTE, DEFAULT_QUOTE_ESCAPE, DEFAULT_CHAR_TO_ESCAPE_QUOTE_ESCAPING);
    }

    /**
     * @param unknownFormat if you don't know the line endings, column delimiters and quotation characters,
     *                      set unknownFormat=true for automatic detection
     */
    public UltraCSVRecordReader(boolean unknownFormat) {
        this();
        if (unknownFormat) {
            settings = new CsvParserSettings();
            settings.detectFormatAutomatically();
            csvParser = new CsvParser(settings);
        }
    }

    public UltraCSVRecordReader(CsvParserSettings settings) {
        this.settings = settings;
        csvParser = new CsvParser(settings);
    }

    /**
     * @param skipNumLines              number of lines to skip
     * @param delimiter                 (default ,): value used to separate individual fields in the input
     * @param quote                     (default "): value used for escaping values where the field delimiter is part of
     *                                  the value (e.g. the value "a,b" is parsed as a , b).
     * @param quoteEscape               (default "): value used for escaping the quote character inside an already escaped value
     *                                  (e.g. the value " "" a,b "" " is parsed as " a , b ").
     * @param charToEscapeQuoteEscaping (default \0): value used for escaping the quote escape character, when quote and quote escape are different
     *                                  (e.g. the value [\ " a , b " \] is parsed as [\ " a , b " \], if quote = ", quoteEscape = \ and charToEscapeQuoteEscaping = \).
     */
    public UltraCSVRecordReader(long skipNumLines, char delimiter, char quote, char quoteEscape,
                                char charToEscapeQuoteEscaping) {
        settings = new CsvParserSettings();
        settings.getFormat().setDelimiter(delimiter);
        settings.getFormat().setQuote(quote);
        settings.getFormat().setQuoteEscape(quoteEscape);
        settings.getFormat().setCharToEscapeQuoteEscaping(charToEscapeQuoteEscaping);
        settings.setNumberOfRowsToSkip(skipNumLines);
        csvParser = new CsvParser(settings);
    }

    /**
     * @param skipNumLines number of lines to skip
     */
    public UltraCSVRecordReader(long skipNumLines) {
        this(skipNumLines, DEFAULT_DELIMITER, DEFAULT_QUOTE, DEFAULT_QUOTE_ESCAPE, DEFAULT_CHAR_TO_ESCAPE_QUOTE_ESCAPING);
    }

    /**
     * @param skipNumLines number of lines to skip
     * @param delimiter    (default ,): value used to separate individual fields in the input
     */
    public UltraCSVRecordReader(long skipNumLines, char delimiter) {
        this(skipNumLines, delimiter, DEFAULT_QUOTE, DEFAULT_QUOTE_ESCAPE, DEFAULT_CHAR_TO_ESCAPE_QUOTE_ESCAPING);
    }

    /**
     * @param skipNumLines number of lines to skip
     * @param delimiter    (default ,): value used to separate individual fields in the input
     * @param quote        (default "): value used for escaping values where the field delimiter is part of
     *                     the value (e.g. the value "a,b" is parsed as a , b).
     */
    public UltraCSVRecordReader(long skipNumLines, char delimiter, char quote) {
        this(skipNumLines, delimiter, quote, DEFAULT_QUOTE_ESCAPE, DEFAULT_CHAR_TO_ESCAPE_QUOTE_ESCAPING);
    }

    /**
     * @param skipNumLines number of lines to skip
     * @param delimiter    (default ,): value used to separate individual fields in the input
     * @param quote        (default "): value used for escaping values where the field delimiter is part of
     *                     the value (e.g. the value "a,b" is parsed as a , b).
     * @param quoteEscape  (default "): value used for escaping the quote character inside an already escaped value
     *                     (e.g. the value " "" a,b "" " is parsed as " a , b ").
     */
    public UltraCSVRecordReader(long skipNumLines, char delimiter, char quote, char quoteEscape) {
        this(skipNumLines, delimiter, quote, quoteEscape, DEFAULT_CHAR_TO_ESCAPE_QUOTE_ESCAPING);
    }

    @Override
    public void initialize(InputSplit split) throws IOException, InterruptedException {
        super.initialize(split);
        this.initialize(((FileSplit) split).getRootDir());
    }

    public UltraCSVRecordReader maxLengthCharactersToParser(int numberCharacters) {
        this.settings.setMaxCharsPerColumn(numberCharacters);
        this.csvParser = new CsvParser(this.settings);
        return this;
    }

    public void initialize(File file) {
        IterableResult<String[], ParsingContext> iterate = this.csvParser.iterate(file);
        iterator = iterate.iterator();
    }

    protected List<Writable> parseLine(String line) {
        String[] split = this.csvParser.parseLine(line);
        List<Writable> values = new ArrayList<>();
        for (String value : split) {
            values.add(new Text(value));
        }
        return values;
    }

    public List<List<Writable>> next(int num) {
        List<List<Writable>> ret = new ArrayList<>(Math.min(num, 10000));
        int count = 0;

        while (this.hasNext() && count++ < num) {
            ret.add(this.next());
        }
        return ret;
    }

    public List<Writable> next() {
        String[] valuesSplit = iterator.next();
        List<Writable> values = new ArrayList<>();
        try {
            for (String value : valuesSplit) {
                values.add(new Text(value));
            }
        } catch (NullPointerException ex) {
            ex.printStackTrace();
            System.out.println(values);
            System.out.println("================================");
            System.out.println(Arrays.toString(valuesSplit));
        }

        return values;
    }

    public boolean batchesSupported() {
        return true;
    }

    public boolean hasNext() {
        return iterator.hasNext();
    }

    public Record nextRecord() {
        List<Writable> next = this.next();
        URI uri = this.locations != null && this.locations.length >= 1 ? this.locations[this.splitIndex] : null;
        RecordMetaData meta = new RecordMetaDataLine(this.lineIndex - 1, uri, UltraCSVRecordReader.class);
        return new org.datavec.api.records.impl.Record(next, meta);
    }

    public Record loadFromMetaData(RecordMetaData recordMetaData) throws IOException {
        return this.loadFromMetaData(Collections.singletonList(recordMetaData)).get(0);
    }

    public List<Record> loadFromMetaData(List<RecordMetaData> recordMetaDatas) throws IOException {
        List<Record> list = super.loadFromMetaData(recordMetaDatas);

        for (Record r : list) {
            String line = r.getRecord().get(0).toString();
            r.setRecord(this.parseLine(line));
        }

        return list;
    }

    public void reset() {
        super.reset();
    }

    public CsvParser getCsvParser() {
        return csvParser;
    }
}
When I use my own CSVRecordReader with the univocity CSV parser, this is the serialization problem I get (the library does not implement Serializable); the full "Task not serializable" stack trace is included at the end of this post.

You have to fix the lines in the CSV that cause the error, or implement a record reader that is serializable.
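One way to implement the second option is to keep only Serializable state in the record reader and rebuild the univocity parser lazily on the executor. The helper below is only a sketch (the class and method names are invented for illustration, and it assumes univocity-parsers 2.x); UltraCSVRecordReader could delegate its parseLine call to something like this, or apply the same transient/lazy pattern directly to its own csvParser and settings fields.

import com.univocity.parsers.csv.CsvParser;
import com.univocity.parsers.csv.CsvParserSettings;

import java.io.Serializable;

public class SerializableCsvLineParser implements Serializable {
    private final char delimiter;
    private final char quote;
    private final char quoteEscape;
    // transient: the univocity parser is never serialized; it is rebuilt on first use
    // after the Spark closure has been deserialized on an executor
    private transient CsvParser parser;

    public SerializableCsvLineParser(char delimiter, char quote, char quoteEscape) {
        this.delimiter = delimiter;
        this.quote = quote;
        this.quoteEscape = quoteEscape;
    }

    private CsvParser parser() {
        if (parser == null) {
            CsvParserSettings settings = new CsvParserSettings();
            settings.getFormat().setDelimiter(delimiter);
            settings.getFormat().setQuote(quote);
            settings.getFormat().setQuoteEscape(quoteEscape);
            parser = new CsvParser(settings);
        }
        return parser;
    }

    public String[] parseLine(String line) {
        return parser().parseLine(line);
    }
}

With this pattern nothing from com.univocity.parsers has to survive Java serialization, so the StringToWritablesFunction closure can be shipped to the executors. Note that the Kryo settings in the transform code do not help here, because Spark always serializes closures with Java serialization.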

Based on the univocity parser's handling of unescaped quotes, you can configure it to throw an exception if any are found. Maybe try that?
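For the first option (locating the offending lines), here is a minimal sketch, assuming univocity-parsers 2.x and reusing the file path from the transform code: configure the parser to raise an error whenever it meets an unescaped quote, and the resulting TextParsingException reports where parsing failed so the CSV can be repaired.

import com.univocity.parsers.common.TextParsingException;
import com.univocity.parsers.common.UnescapedQuoteHandling;
import com.univocity.parsers.csv.CsvParser;
import com.univocity.parsers.csv.CsvParserSettings;

import java.io.File;

public class UnescapedQuoteCheck {
    public static void main(String[] args) {
        CsvParserSettings settings = new CsvParserSettings();
        // Fail fast instead of guessing when a quoted field is never closed
        settings.setUnescapedQuoteHandling(UnescapedQuoteHandling.RAISE_ERROR);
        CsvParser parser = new CsvParser(settings);
        long rows = 0;
        try {
            for (String[] row : parser.iterate(new File("data/FakeNews/fake.csv"))) {
                rows++;
            }
            System.out.println("Parsed " + rows + " rows without unescaped-quote problems");
        } catch (TextParsingException e) {
            // The exception message includes the parser's position when it failed
            System.err.println("Parsing failed after " + rows + " rows: " + e.getMessage());
        }
    }
}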

Hi, welcome to Stack Overflow. I suggest you add the full stack trace of the error; it is much more useful for understanding where the problem is.
OK, thanks, I edited the question.
I have implemented my own RecordReader using the univocity CSV parser, but no class in that library implements the Serializable interface, so when I use it with Spark it throws the second error. My CSVRecordReader works on its own, but not with Spark.
Right. So you have to modify it to work with Spark, or fix the lines in the CSV file. The link above tells you how to raise an exception for unescaped quotes; that way you can fix the file.
package cu.desoft.cav.preprocessing;

import cu.desoft.cav.RecordReader.UltraCSVRecordReader;
import org.apache.commons.io.FileUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.datavec.api.records.reader.RecordReader;
import org.datavec.api.records.reader.impl.csv.CSVRecordReader;
import org.datavec.api.transform.TransformProcess;
import org.datavec.api.transform.schema.Schema;
import org.datavec.api.writable.Writable;
import org.datavec.spark.transform.SparkTransformExecutor;
import org.datavec.spark.transform.misc.StringToWritablesFunction;
import org.datavec.spark.transform.misc.WritablesToStringFunction;

import java.io.File;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;

/**
 * author: acosta
 * email: yunielacosta738@gmail.com
 * Created on: 2/3/20
 */
public class FakeNewsTransformation {
    private final String DATSETS_PATH = "data/FakeNews/";

    public void transform(boolean useSparkLocal) {
        Schema schema = new Schema.Builder()
                .addColumnString("uuid")
                .addColumnInteger("ord_in_thread")
                .addColumnString("author")
                .addColumnString("published")
                .addColumnsString("title","text","language","crawled","site_url","country")
                .addColumnInteger("domain_rank")
                .addColumnString("thread_title")
                .addColumnsInteger("spam_score","main_img_url","replies_count","participants_count","likes","comments","shares")
                .addColumnCategorical("type", Arrays.asList("bias", "bs","conspiracy","fake","hate","junksci","satire","state"))
                .build();

        TransformProcess tp = new TransformProcess.Builder(schema)
                .removeColumns("uuid", "ord_in_thread","author","published","site_url","country","thread_title")
                .categoricalToInteger("type")
                .build();

        int numActions = tp.getActionList().size();
        for (int i = 0; i < numActions; i++) {
            System.out.println("\n\n===============================");
            System.out.println("--- Schema after step " + i +
                    " (" + tp.getActionList().get(i) + ")--");
            System.out.println(tp.getSchemaAfterStep(i));
        }

        SparkConf sparkConf = new SparkConf();
        sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
        sparkConf.set("spark.kryo.registrator", "org.nd4j.Nd4jRegistrator");
        if (useSparkLocal) {
            sparkConf.setMaster("local[*]");
        }

        sparkConf.setAppName("Fake News Spanish Corpus dataset transformation");
        JavaSparkContext sc = new JavaSparkContext(sparkConf);
        //Load our data using Spark
        JavaRDD<String> lines = sc.textFile(DATSETS_PATH + "fake.csv");
        int skipNumLines = 1;
        //We first need to parse this format. It's comma-delimited (CSV) format, so let's parse it using CSVRecordReader:
        RecordReader rr = new UltraCSVRecordReader();
//        RecordReader rr = new CSVRecordReader();
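        // Note: lines.map(...) requires Spark to serialize the StringToWritablesFunction closure,
        // including the record reader stored inside it, with Java serialization. That is where the
        // java.io.NotSerializableException for com.univocity.parsers.csv.CsvParser is thrown.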
        JavaRDD<List<Writable>> parsedInputData = lines.map(new StringToWritablesFunction(rr));

        //Now, let's execute the transforms we defined earlier:
        JavaRDD<List<Writable>> processedData = SparkTransformExecutor.execute(parsedInputData, tp);

        //For the sake of this example, let's collect the data locally and print it:
        JavaRDD<String> processedAsString = processedData.map(new WritablesToStringFunction(","));
        System.out.println("<<<<<<<<<<<<<<<PATH>>>>>>>>>>>>>");
        File dataset = new File("dataset/FakeNews");
        if (dataset.exists()) {
            try {
                FileUtils.deleteDirectory(dataset);
                System.out.println("DELETE THE DIRECTORY");
            } catch (IOException e) {
                System.out.println("The directory was not delete");
                e.printStackTrace();
            }
        }
        System.out.println(dataset.getAbsolutePath());
        System.out.println("<<<<<<<<<<<<<<<END-PATH>>>>>>>>>>>>>");
        processedAsString.saveAsTextFile("file://" + dataset.getAbsolutePath());   //To save locally
        //processedAsString.saveAsTextFile("hdfs://your/hdfs/save/path/here");   //To save to hdfs

        List<String> processedCollected = processedAsString.collect();
        List<String> inputDataCollected = lines.collect();


    }

    public static void main(String[] args) {
        new FakeNewsTransformation().transform(true);
    }
}
    java.lang.RuntimeException: java.io.IOException: Un-terminated quoted field at end of CSV line
    at org.datavec.api.records.reader.impl.csv.CSVRecordReader.parseLine(CSVRecordReader.java:183)
    at org.datavec.api.records.reader.impl.csv.CSVRecordReader.next(CSVRecordReader.java:175)
    at org.datavec.spark.transform.misc.StringToWritablesFunction.call(StringToWritablesFunction.java:41)
    at org.datavec.spark.transform.misc.StringToWritablesFunction.call(StringToWritablesFunction.java:33)
    at org.apache.spark.api.java.JavaPairRDD$$anonfun$toScalaFunction$1.apply(JavaPairRDD.scala:1040)
    at scala.collection.Iterator$$anon$11.next(Iterator.scala:409)
    at scala.collection.Iterator$$anon$11.next(Iterator.scala:409)
    at scala.collection.Iterator$$anon$11.next(Iterator.scala:409)
    at scala.collection.Iterator$$anon$11.next(Iterator.scala:409)
    at scala.collection.Iterator$$anon$11.next(Iterator.scala:409)
    at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13$$anonfun$apply$7.apply$mcV$sp(PairRDDFunctions.scala:1211)
    at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13$$anonfun$apply$7.apply(PairRDDFunctions.scala:1210)
    at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13$$anonfun$apply$7.apply(PairRDDFunctions.scala:1210)
    at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1341)
    at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13.apply(PairRDDFunctions.scala:1218)
    at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13.apply(PairRDDFunctions.scala:1197)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
    at org.apache.spark.scheduler.Task.run(Task.scala:99)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:282)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
    at java.lang.Thread.run(Thread.java:748)
Caused by: java.io.IOException: Un-terminated quoted field at end of CSV line
    at org.datavec.api.records.reader.impl.csv.SerializableCSVParser.parseLine(SerializableCSVParser.java:276)
    at org.datavec.api.records.reader.impl.csv.SerializableCSVParser.parseLine(SerializableCSVParser.java:186)
    at org.datavec.api.records.reader.impl.csv.CSVRecordReader.parseLine(CSVRecordReader.java:181)
    ... 21 more
  Exception in thread "main" org.apache.spark.SparkException: Task not serializable
    at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:298)
    at org.apache.spark.util.ClosureCleaner$.org$apache$spark$util$ClosureCleaner$$clean(ClosureCleaner.scala:288)
    at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:108)
    at org.apache.spark.SparkContext.clean(SparkContext.scala:2094)
    at org.apache.spark.rdd.RDD$$anonfun$map$1.apply(RDD.scala:370)
    at org.apache.spark.rdd.RDD$$anonfun$map$1.apply(RDD.scala:369)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
    at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
    at org.apache.spark.rdd.RDD.map(RDD.scala:369)
    at org.apache.spark.api.java.JavaRDDLike$class.map(JavaRDDLike.scala:93)
    at org.apache.spark.api.java.AbstractJavaRDDLike.map(JavaRDDLike.scala:45)
    at cu.desoft.cav.preprocessing.FakeNewsTransformation.transform(FakeNewsTransformation.java:71)
    at cu.desoft.cav.preprocessing.FakeNewsTransformation.main(FakeNewsTransformation.java:101)
    at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.lang.reflect.Method.invoke(Method.java:498)
    at com.intellij.rt.execution.application.AppMainV2.main(AppMainV2.java:131)
Caused by: java.io.NotSerializableException: com.univocity.parsers.csv.CsvParser
Serialization stack:
    - object not serializable (class: com.univocity.parsers.csv.CsvParser, value: com.univocity.parsers.csv.CsvParser@75b6dd5b)
    - field (class: cu.desoft.cav.RecordReader.UltraCSVRecordReader, name: csvParser, type: class com.univocity.parsers.csv.CsvParser)
    - object (class cu.desoft.cav.RecordReader.UltraCSVRecordReader, cu.desoft.cav.RecordReader.UltraCSVRecordReader@1fedf0a4)
    - field (class: org.datavec.spark.transform.misc.StringToWritablesFunction, name: recordReader, type: interface org.datavec.api.records.reader.RecordReader)
    - object (class org.datavec.spark.transform.misc.StringToWritablesFunction, org.datavec.spark.transform.misc.StringToWritablesFunction@465b38e6)
    - field (class: org.apache.spark.api.java.JavaPairRDD$$anonfun$toScalaFunction$1, name: fun$1, type: interface org.apache.spark.api.java.function.Function)
    - object (class org.apache.spark.api.java.JavaPairRDD$$anonfun$toScalaFunction$1, <function1>)
    at org.apache.spark.serializer.SerializationDebugger$.improveException(SerializationDebugger.scala:40)
    at org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:46)
    at org.apache.spark.serializer.JavaSerializerInstance.serialize(JavaSerializer.scala:100)
    at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:295)
    ... 18 more