Spark + Scala: NaiveBayes.train throws java.util.NoSuchElementException: next on empty iterator

Tags: scala, apache-spark, apache-spark-mllib, sentiment-analysis, naivebayes

I'm trying to do sentiment analysis on tweets using Spark MLlib. After preprocessing the data and converting it into the appropriate format, I call NaiveBayes' train method to get a model, but it fails with an exception. Here is the stack trace:

java.util.NoSuchElementException: next on empty iterator
    at scala.collection.Iterator$$anon$2.next(Iterator.scala:39)
    at scala.collection.Iterator$$anon$2.next(Iterator.scala:37)
    at scala.collection.IndexedSeqLike$Elements.next(IndexedSeqLike.scala:64)
    at scala.collection.IterableLike$class.head(IterableLike.scala:91)
    at scala.collection.mutable.ArrayOps$ofRef.scala$collection$IndexedSeqOptimized$$super$head(ArrayOps.scala:108)
    at scala.collection.IndexedSeqOptimized$class.head(IndexedSeqOptimized.scala:120)
    at scala.collection.mutable.ArrayOps$ofRef.head(ArrayOps.scala:108)
    at org.apache.spark.mllib.classification.NaiveBayes.run(NaiveBayes.scala:408)
    at org.apache.spark.mllib.classification.NaiveBayes$.train(NaiveBayes.scala:467)
    at org.jc.sparknaivebayes.main.NaiveBayesTrain$delayedInit$body.apply(NaiveBayesTrain.scala:53)
    at scala.Function0$class.apply$mcV$sp(Function0.scala:40)
    at scala.runtime.AbstractFunction0.apply$mcV$sp(AbstractFunction0.scala:12)
    at scala.App$$anonfun$main$1.apply(App.scala:71)
    at scala.App$$anonfun$main$1.apply(App.scala:71)
    at scala.collection.immutable.List.foreach(List.scala:318)
    at scala.collection.generic.TraversableForwarder$class.foreach(TraversableForwarder.scala:32)
    at scala.App$class.main(App.scala:71)
    at org.jc.sparknaivebayes.main.NaiveBayesTrain$.main(NaiveBayesTrain.scala:12)
    at org.jc.sparknaivebayes.main.NaiveBayesTrain.main(NaiveBayesTrain.scala)
    at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.lang.reflect.Method.invoke(Method.java:606)
    at org.apache.spark.deploy.yarn.ApplicationMaster$$anon$2.run(ApplicationMaster.scala:542)
Here is my main method:

    val csvFiles = args(0).split(",")
    val modelStore = args(1)
    val docs = TweetParser.parseAll(csvFiles, sc)
    val termDocs = Tokenizer.tokenizeAll(docs)

    val termDocsRdd = sc.parallelize[TermDoc](termDocs.toSeq)

    val numDocs = termDocsRdd.count()

    //val terms = termDocsRdd.flatMap(_.terms).distinct().collect().sortBy(identity)
    val terms = termDocsRdd.flatMap(_.terms).distinct().sortBy(identity)
    val termDict = new Dictionary(terms)

    //val labels = termDocsRdd.flatMap(_.labels).distinct().collect()
    val labels = termDocsRdd.flatMap(_.labels).distinct()
    val labelDict = new Dictionary(labels)

    val idfs = (termDocsRdd.flatMap(termDoc => termDoc.terms.map((termDoc.doc, _))).distinct().groupBy(_._2) collect {
      case (term, docs) if docs.size > 3 =>
        term -> (numDocs.toDouble / docs.size.toDouble)
    }).collect.toMap

    val tfidfs = termDocsRdd flatMap {
      termDoc =>
        val termPairs: Seq[(Int, Double)] = termDict.tfIdfs(termDoc.terms, idfs)
        termDoc.labels.headOption.map {
          label =>
            val labelId = labelDict.indexOf(label).toDouble
            val vector = Vectors.sparse(termDict.count.toInt, termPairs)
            LabeledPoint(labelId, vector)
        }
    }

    val model = NaiveBayes.train(tfidfs)
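
Before looking at the supporting classes, it may help to confirm what actually reaches NaiveBayes.train. A minimal diagnostic sketch (the count/require lines below are additions for illustration, not part of the original program):

// Diagnostic sketch: a .head call on an empty collected array inside NaiveBayes.run
// usually means the RDD handed to train produced no LabeledPoints at all.
val numPoints = tfidfs.count()
println(s"LabeledPoints produced: $numPoints (from $numDocs documents)")
require(numPoints > 0, "tfidfs is empty - nothing to train on")

val model = NaiveBayes.train(tfidfs)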
The Dictionary class is here:

class Dictionary(dict: RDD[String]) extends Serializable {

  //val builder = ImmutableBiMap.builder[String, Long]()
  //dict.zipWithIndex.foreach(e => builder.put(e._1, e._2))

  //val termToIndex = builder.build()
  val termToIndex = dict.zipWithIndex()

  //@transient
  //lazy val indexToTerm = termToIndex.inverse()
  lazy val indexToTerm = dict.zipWithIndex().map{
    case (k, v) => (v, k)
  } //converts from (a, 0),(b, 1),(c, 2) to (0, a),(1, b),(2, c)

  val count = termToIndex.count().toInt

  def indexOf(term: String): Int = termToIndex.lookup(term).headOption.getOrElse[Long](-1).toInt

  def valueOf(index: Int): String = indexToTerm.lookup(index).headOption.getOrElse("")

  def tfIdfs (terms: Seq[String], idfs: Map[String, Double]): Seq[(Int, Double)] = {
    val filteredTerms = terms.filter(idfs contains)
    (filteredTerms.groupBy(identity).map {
      case (term, instances) => {
        val indexOfTerm: Int = indexOf(term)
        if (indexOfTerm < 0) (-1, 0.0) else (indexOf(term), (instances.size.toDouble / filteredTerms.size.toDouble) * idfs(term))
      }
    }).filter(p => p._1.toInt  >= 0).toSeq.sortBy(_._1)
  }

  def vectorize(tfIdfs: Iterable[(Int, Double)]) = {
    Vectors.sparse(dict.count().toInt, tfIdfs.toSeq)
  }
}
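
One thing to note about this class: indexOf and valueOf go through RDD.lookup, which launches a Spark job per call and cannot be invoked from inside another RDD's transformation (the tfidfs flatMap above does exactly that via labelDict.indexOf and termDict.tfIdfs). A minimal driver-side alternative, assuming the distinct terms and labels are small enough to collect (only the lookup-related members are sketched; this is the "global view" one of the comments below refers to):

class Dictionary(dict: RDD[String]) extends Serializable {

  // Collected once on the driver; the resulting Map travels inside closures,
  // so lookups in RDD transformations become plain Map accesses.
  val termToIndex: Map[String, Int] =
    dict.distinct().collect().sorted.zipWithIndex.toMap

  lazy val indexToTerm: Map[Int, String] = termToIndex.map(_.swap)

  val count: Int = termToIndex.size

  def indexOf(term: String): Int = termToIndex.getOrElse(term, -1)

  def valueOf(index: Int): String = indexToTerm.getOrElse(index, "")
}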
The TermDoc class:

case class TermDoc(doc: String, labels: Set[String], terms: Seq[String])
I'm stuck at this step and I really need to get this working, but I'm having a lot of trouble finding useful information about it. Thanks in advance.

P.S.: This is based on the chimpler blog:

Update: new code for the CSV parser and document builder

import org.apache.spark.SparkContext

import scala.io.Source

/**
  * Created by cespedjo on 14/02/2017.
  */
object TweetParser extends Serializable{

  val headerPart = "polarity"

  val mentionRegex = """@(.)+?\s""".r

  val fullRegex = """(\d+),(.+?),(N|P|NEU|NONE)(,\w+|;\w+)*""".r

  def parseAll(csvFiles: Iterable[String], sc: SparkContext) = csvFiles flatMap(csv => parse(csv, sc))

  def parse(csvFile: String, sc: SparkContext) = {
    val csv = sc.textFile(csvFile)
    val docs = scala.collection.mutable.ArrayBuffer.empty[Document]

    csv.foreach(
      line => if (!line.contains(headerPart)) docs += buildDocument(line)
    )
    docs
    //docs.filter(!_.docId.equals("INVALID"))
  }

  def buildDocument(line: String): Document = {

    val fullRegex(id, txt, snt, opt) = line
    if (id != null && txt != null && snt != null)
      new Document(id, mentionRegex.replaceAllIn(txt, ""), Set(snt))
    else
      new Document("INVALID")
  }
}

case class Document(docId: String, body: String = "", labels: Set[String] = Set.empty)
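
A quick way to sanity-check the regex and buildDocument on a single line, outside Spark (the sample line below is a made-up illustration, not data from the original CSV):

// Hypothetical line in the "id,text,polarity" shape the pattern expects.
val sample = "12345,@someone this phone is great,P"
sample match {
  case TweetParser.fullRegex(id, txt, snt, opt) => println(s"matched: id=$id, sentiment=$snt")
  case _ => println("no match")
}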

I think the problem is that some of the documents contain no term pairs. You can't train on empty data points. Try changing your code to:

val tfidfs = termDocsRdd flatMap {
  termDoc =>
    val termPairs: Seq[(Int, Double)] = termDict.tfIdfs(termDoc.terms, idfs)
    if (termPairs.nonEmpty) {
      termDoc.labels.headOption.map {
        label =>
          val labelId = labelDict.indexOf(label).toDouble
          val vector = Vectors.sparse(termDict.count.toInt, termPairs)
          LabeledPoint(labelId, vector)
      }
    } else {
      None
    }
}
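
The same guard can also be written as a for-comprehension over the optional label, which keeps the skip-empty-documents logic in a single expression (an equivalent sketch, not the answer's original wording):

val tfidfs = termDocsRdd.flatMap { termDoc =>
  val termPairs = termDict.tfIdfs(termDoc.terms, idfs)
  // Documents with no label or no term pairs produce an empty Option,
  // which flatMap drops, so only non-empty LabeledPoints reach NaiveBayes.train.
  for {
    label <- termDoc.labels.headOption
    if termPairs.nonEmpty
  } yield LabeledPoint(labelDict.indexOf(label).toDouble,
                       Vectors.sparse(termDict.count.toInt, termPairs))
}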

I think your error comes from an empty vector in val vector = Vectors.sparse. You need to find/post all the error messages that point at the broken code in your application so you can be sure. I had a similar problem and solved it by pushing more data into the vector. By the way, you might look up the sparse vector class and post more details about your application.

Thanks for your comment Karol. I'm new to Spark and Scala, could you elaborate on your suggestion? I can't understand the "pushing more data into the vector" part, since I thought it was populated by the data already contained in the RDD, so what data is missing? By the way, I looked at the documentation for vectors and it says "local vector"... does that mean it can't be used in distributed mode? What should I use for supervised learning when running in distributed mode?

I use ML pipelines, leveraging cross-validators and parameter transformers:
https://databricks.com/blog/2015/10/20/audience-modeling-with-apache-spark-ml-pipelines.html
http://spark.apache.org/docs/latest/ml-pipeline.html
Regarding vectors: look at the data in the vector before applying the model/function to it; the error says there is no data, or that some data is missing.

@KarolSudol do you know why the code update in the question produces an empty RDD? I've tested the pattern on a few lines of the CSV file and it recognizes them, yet no documents are appended to the mutable array.

Thanks for your answer Pascal, do you know why the code update in the question produces an empty RDD? I've tested the pattern on a few lines of the CSV file and it recognizes them, yet no documents are appended to the mutable array.

You seem to have changed the code in some places (the commented-out content)... You are doing a lookup into a Dictionary that is an uncollected RDD, which is wrong, because at this point in the code you need a "global view" (i.e. you don't want to look things up in a locally built dictionary, but in a global one). The rest seems right to me, but it isn't the latest code as I understand it.

I'd like to find a proper way to do this without using collect, because I read somewhere that it's not a good option since it forces the data to be gathered on the driver and can cause errors when handling large amounts of data... Any suggestions? By the way, this behaviour shows up when executing this line: val docs = TweetParser.parseAll(csvFiles, sc). I've tested with a single file and docs.size is 0. I don't know why that happens even though the pattern matches when tested on individual lines.
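
For what it's worth, that last symptom (the regex matches but docs stays empty) is consistent with mutating a driver-side collection from inside RDD.foreach: the closure runs on the executors, which append to their own copies of the ArrayBuffer, so the driver never sees the documents. A sketch of the parser built from RDD transformations instead (the RDD return type and the per-line match are assumptions, not the original code):

import org.apache.spark.rdd.RDD

// Hedged sketch: build the documents as an RDD rather than appending to a
// driver-side ArrayBuffer from inside foreach (that mutation never reaches the driver).
def parse(csvFile: String, sc: SparkContext): RDD[Document] =
  sc.textFile(csvFile)
    .filter(line => !line.contains(headerPart))
    .flatMap {
      case fullRegex(id, txt, snt, opt) if id != null && txt != null && snt != null =>
        Some(Document(id, mentionRegex.replaceAllIn(txt, ""), Set(snt)))
      case _ =>
        None // non-matching lines are skipped instead of throwing a MatchError
    }

parseAll would then union the per-file RDDs, for example csvFiles.map(parse(_, sc)).reduce(_ union _), or the comma-separated path list could be passed straight to sc.textFile, which accepts it.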