如何在Java/Scala中将CSV文件值解析为MatrixEntry
我在Scala中有这段代码,我必须将其更改为Java:
import au.com.bytecode.opencsv.CSVParser
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.linalg.distributed.MatrixEntry
import org.apache.spark.mllib.linalg.distributed.CoordinateMatrix
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.mllib.linalg.Matrix
import org.apache.spark.mllib.linalg.SingularValueDecomposition
import org.apache.spark.mllib.linalg.Vector
import scala.collection.immutable.List
import java.io._
import java.nio.file.{Paths, Files}
import java.nio.charset.StandardCharsets
import org.apache.spark.mllib.linalg.Matrix
import org.apache.spark.mllib.linalg.Matrices
import org.apache.spark.mllib.linalg.DenseMatrix
import org.apache.spark.mllib.linalg.distributed.IndexedRowMatrix
/**
 * Writes a column-major flattened matrix to `filename` as CSV.
 *
 * A synthetic header row ("word0,word1,...") is emitted first, then one CSV
 * line per matrix row. Element (row, col) is read from
 * `matrix(numRows * col + row)` — i.e. the array is assumed column-major,
 * which matches Spark MLlib's `Matrix.toArray` layout.
 *
 * @param matrix  column-major flattened values, length >= numRows * numCols
 * @param filename destination file path (overwritten if it exists)
 * @param numCols number of columns (also the number of header fields)
 * @param numRows number of rows
 */
def exportMatrix(matrix: Array[Double], filename: String, numCols: Int, numRows: Int) = {
  val pw = new PrintWriter(filename)
  // try/finally guarantees the writer is closed even if a write fails,
  // fixing the resource leak in the original (no close on exception).
  try {
    if (numCols > 0) {
      // Header line: word0,word1,...,word{numCols-1}
      pw.println((0 until numCols).map(i => s"word$i").mkString(","))
      for (rowIndex <- 0 until numRows) {
        // Gather one logical row out of the column-major array.
        val row = (0 until numCols).map(colIndex => matrix(numRows * colIndex + rowIndex))
        pw.println(row.mkString(","))
      }
    }
    pw.flush()
  } finally {
    pw.close()
  }
}
/**
 * Collects an RDD of already-formatted CSV lines to the driver and writes
 * them to `fileName`, one line per RDD element.
 *
 * NOTE: `collect()` pulls the entire RDD into driver memory — only suitable
 * for matrices that fit on a single machine.
 *
 * @param matrix   RDD whose elements are complete CSV lines
 * @param fileName destination file path (overwritten if it exists)
 */
def exportRowMatrix(matrix: RDD[String], fileName: String) = {
  val pw = new PrintWriter(fileName)
  // try/finally guarantees the writer is closed even if collect/write fails,
  // fixing the resource leak in the original.
  try {
    matrix.collect().foreach(pw.println)
    pw.flush()
  } finally {
    pw.close()
  }
}
// Load the space-separated sparse matrix file (Matrix Market style: "row col value").
val csv = sc.textFile("hdfs://myhost/sparse.csv").cache() // original file
// Parse each line with opencsv using ' ' as the separator; one parser per
// partition (mapPartitions) to avoid re-creating CSVParser per line.
val data = csv.mapPartitions(lines => {
val parser = new CSVParser(' ')
lines.map(line => {
parser.parseLine(line)
})
}).map(line => {
// Matrix Market indices are 1-based; MatrixEntry expects 0-based, hence the -1.
MatrixEntry(line(0).toLong - 1, line(1).toLong - 1 , line(2).toInt)
}
)
// Build a distributed coordinate matrix from the entries, then convert to an
// IndexedRowMatrix so that computeSVD(..., computeU = true) is available.
val indexedRowMatrix: IndexedRowMatrix = new CoordinateMatrix(data).toIndexedRowMatrix()
/*val mat: CoordinateMatrix =
val rowMatrix: RowMatrix = mat.toRowMatrix()*/
// Truncated SVD keeping the top 100 singular values; U is materialized because
// it is exported below.
val svd: SingularValueDecomposition[IndexedRowMatrix, Matrix] = indexedRowMatrix.computeSVD(100, computeU = true)
val U: IndexedRowMatrix = svd.U // The U factor is a RowMatrix.
val S: Vector = svd.s // The singular values are stored in a local dense vector.
val V: Matrix = svd.V // The V factor is a local dense matrix.
val sArray: Array[Double] = S.toArray // done
val vArray: Array[Double] = V.toArray // done
// Format each row of U as a comma-separated line for export.
val rdd = U.rows.map( x => x.vector.toArray.mkString(","))
// S is exported as a single row; V as a numRows x numCols matrix (column-major array).
exportMatrix(sArray, "../S.csv", S.size, 1)
exportMatrix(vArray, "../V.csv", V.numCols.toInt, V.numRows.toInt)
exportRowMatrix(rdd, "../U.csv")
// D = V * diag(S): scale each column of V by the corresponding singular value.
val diag = Matrices.diag(S)
val D = new DenseMatrix(diag.numRows, diag.numCols, diag.toArray)
val multiplyResult = V.multiply(D)
val dArray = multiplyResult.toArray
exportMatrix(dArray, "../D.csv", multiplyResult.numCols, multiplyResult.numRows)
我已经做到了:
// Build the Spark context and load the space-separated sparse matrix file.
JavaSparkContext sc = new JavaSparkContext(SparkConfiguration.getSparkConfiguration());
JavaRDD<String> csv = sc.textFile("hdfs://yoda/nlp/sparse.csv").cache();
System.out.println(csv.first());

// Parse each "row col value" line into a MatrixEntry. Matrix Market indices
// are 1-based, MatrixEntry expects 0-based, hence the -1. This replaces the
// Scala mapPartitions + CSVParser(' ') — a plain split on the single-space
// separator is equivalent here (assumes no quoted fields; confirm against data).
JavaRDD<MatrixEntry> entries = csv.map(new Function<String, MatrixEntry>() {
    @Override
    public MatrixEntry call(String line) throws Exception {
        String[] fields = line.split(" ");
        return new MatrixEntry(
                Long.parseLong(fields[0]) - 1L,
                Long.parseLong(fields[1]) - 1L,
                Integer.parseInt(fields[2]));
    }
});
// CoordinateMatrix takes the Scala RDD; JavaRDD.rdd() unwraps it.
RDD<MatrixEntry> data = entries.rdd();

IndexedRowMatrix indexedRowMatrix = (new CoordinateMatrix(data)).toIndexedRowMatrix();
// Truncated SVD keeping the top 100 singular values; computeU = true so U can
// be exported. rCond = 0 disables the reciprocal-condition-number cutoff.
SingularValueDecomposition<IndexedRowMatrix, Matrix> svd = indexedRowMatrix.computeSVD(100, true, 0);
IndexedRowMatrix u = svd.U();
Vector s = svd.s();
Matrix v = svd.V();
double[] sArray = s.toArray();
double[] vArray = v.toArray();

// Render each row of U as a comma-separated line — the Java equivalent of the
// Scala `x.vector.toArray.mkString(",")`.
RDD<String> rdd = u.rows().toJavaRDD().map(new Function<IndexedRow, String>() {
    @Override
    public String call(IndexedRow row) throws Exception {
        double[] values = row.vector().toArray();
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < values.length; i++) {
            if (i > 0) {
                sb.append(',');
            }
            sb.append(values[i]);
        }
        return sb.toString();
    }
}).rdd();

// D = V * diag(S): scale each column of V by the corresponding singular value.
Matrix diag = Matrices.diag(s);
DenseMatrix d = new DenseMatrix(diag.numRows(), diag.numCols(), diag.toArray());
DenseMatrix multiplyResult = v.multiply(d);
double[] dArray = multiplyResult.toArray();
我的问题是:
如何将Matrix Market格式中的每一行解析为MatrixEntry?它应该由csv.mapPartitions完成
如何在Java中定义exportMatrix之类的函数?它与普通Java函数相同吗?