Apache spark |5.104 | 5.104 | 5.105 | 5.104 | 5.106 | 5.103 | 5.106 | 5.106 | 5.103 | 5.107 |+----------+,两者都包含列名 JavaRDD<Row> sdrDS = spark.read().fo
|5.104 | 5.104 | 5.105 | 5.104 | 5.106 | 5.103 | 5.106 | 5.106 | 5.103 | 5.107 |+----------+,两者都包含列名 JavaRDD<Row> sdrDS = spark.read().format("libsvm").jdbc(POSTGRESQL_CONNECTION_URL, POSTGRESQL_TABLE, options); StructField[] fields = {new StructField("features", new VectorUDT(), false, Metadata.empty())}; StructType schema = new StructType(fields); DataFrame dataset = sqlContext.createDataFrame(sdrDS, schema); 试试这个。模型正在尝试查找 features 列,在本例中您必须显式创建该列。Apache spark |5.104 | 5.104 | 5.105 | 5.104 | 5.106 | 5.103 | 5.106 | 5.106 | 5.103 | 5.107 |+----------+,两者都包含列名 JavaRDD<Row> sdrDS = spark.read().fo,apache-spark,apache-spark-ml,Apache Spark,Apache Spark Ml,|5.104 | 5.104 | 5.105 | 5.104 | 5.106 | 5.103 | 5.106 | 5.106 | 5.103 | 5.107 |+----------+,两者都包含列名 JavaRDD<Row> sdrDS = spark.read().format("libsvm").jdbc(POSTGRESQL_CONNECTION_URL, POSTGRESQL_TABLE, options); StructField[] fields = {new StructField("features", new VectorUDT(), false,
Exception in thread "main" java.lang.IllegalArgumentException: Field "features" does not exist.
at org.apache.spark.sql.types.StructType$$anonfun$apply$1.apply(StructType.scala:264)
at org.apache.spark.sql.types.StructType$$anonfun$apply$1.apply(StructType.scala:264)
at scala.collection.MapLike$class.getOrElse(MapLike.scala:128)
at scala.collection.AbstractMap.getOrElse(Map.scala:59)
at org.apache.spark.sql.types.StructType.apply(StructType.scala:263)
at org.apache.spark.ml.util.SchemaUtils$.checkColumnType(SchemaUtils.scala:40)
at org.apache.spark.ml.clustering.KMeansParams$class.validateAndTransformSchema(KMeans.scala:92)
at org.apache.spark.ml.clustering.KMeans.validateAndTransformSchema(KMeans.scala:253)
at org.apache.spark.ml.clustering.KMeans.transformSchema(KMeans.scala:330)
at org.apache.spark.ml.PipelineStage.transformSchema(Pipeline.scala:74)
at org.apache.spark.ml.clustering.KMeans.fit(KMeans.scala:304)
at sparkExample.spExample.ClusteringDSPOC.main(ClusteringDSPOC.java:45)
17
package sparkExample.spExample;
import java.util.Properties;
import java.util.regex.Pattern;
import org.apache.spark.SparkConf;
import org.apache.spark.SparkContext;
import org.apache.spark.ml.clustering.KMeans;
import org.apache.spark.ml.clustering.KMeansModel;
import org.apache.spark.ml.feature.VectorAssembler;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
/**
 * Proof of concept: reads {@code id} and {@code duration} from a PostgreSQL table over JDBC
 * and clusters the durations with the DataFrame-based Spark ML {@link KMeans}.
 *
 * <p>Spark ML estimators read their input from a single vector column (named
 * {@code features} here). Fitting directly on a Dataset that only has a {@code duration}
 * column fails with {@code IllegalArgumentException: Field "features" does not exist}, so
 * the vector column is assembled explicitly before fitting.
 */
public class ClusteringDSPOC {
  private static final String POSTGRESQL_DRIVER = "org.postgresql.Driver";
  // NOTE(review): credentials are hard-coded and end up inside the JDBC URL; move them to
  // external configuration before using this outside a local experiment.
  private static final String POSTGRESQL_USERNAME = "xyz";
  private static final String POSTGRESQL_PWD = "xyz";
  private static final String POSTGRESQL_CONNECTION_URL =
      "jdbc:postgresql://192.168.111.130:5432/xyzdb?user=" + POSTGRESQL_USERNAME
          + "&password=" + POSTGRESQL_PWD;
  // Sub-query used as the JDBC "table"; rows with a NULL duration are filtered out in SQL.
  private static final String POSTGRESQL_TABLE =
      "(select id, duration from abc where duration is not null ) as abc";

  /** Name of the numeric source column that is clustered. */
  private static final String INPUT_COLUMN = "duration";
  /** Name of the assembled vector column handed to KMeans. */
  private static final String FEATURES_COLUMN = "features";

  /**
   * Loads the table, builds the feature vector column, fits a 2-cluster KMeans model and
   * prints the cluster assigned to every row.
   *
   * @param args unused
   * @throws Exception if reading the data or fitting the model fails
   */
  public static void main(String[] args) throws Exception {
    // The master is configured on the session itself instead of through a separate,
    // otherwise unused static SparkContext.
    SparkSession spark = SparkSession.builder()
        .appName("JavaKMeansExample")
        .master("local[*]")
        .getOrCreate();
    try {
      // Datasource options: passing the driver class as a connection property lets Spark
      // load it wherever the JDBC connection is opened.
      Properties options = new Properties();
      options.setProperty("driver", POSTGRESQL_DRIVER);

      // jdbc(...) selects the JDBC source on its own; a preceding format("libsvm") would
      // only be misleading.
      Dataset<Row> sdrDS =
          spark.read().jdbc(POSTGRESQL_CONNECTION_URL, POSTGRESQL_TABLE, options);

      // Cast explicitly so the assembler gets a double regardless of the SQL column type.
      Dataset<Row> durationDS =
          sdrDS.selectExpr("id", "cast(duration as double) as duration");

      // Wrap the scalar column into the vector column that KMeans validates against.
      Dataset<Row> featureDS = new VectorAssembler()
          .setInputCols(new String[] {INPUT_COLUMN})
          .setOutputCol(FEATURES_COLUMN)
          .transform(durationDS);

      KMeans kmeans = new KMeans().setK(2).setSeed(1L).setFeaturesCol(FEATURES_COLUMN);
      KMeansModel model = kmeans.fit(featureDS);

      // Show which cluster each row was assigned to ("prediction" is the default output column).
      model.transform(featureDS).select("id", INPUT_COLUMN, "prediction").show();
    } finally {
      spark.stop();
    }
  }
}
package sparkExample.spExample;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Pattern;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.mllib.clustering.KMeans;
import org.apache.spark.mllib.clustering.KMeansModel;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.linalg.Vectors;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;
/**
 * Proof of concept: reads {@code id} and {@code duration} from a PostgreSQL table over JDBC,
 * clusters the durations with the RDD-based MLlib {@link KMeans} and writes
 * {@code (id, cluster)} pairs to a text output directory.
 */
public class ClusteringPOC1 {
  private static final JavaSparkContext sc =
      new JavaSparkContext(new SparkConf().setAppName("SparkJdbcDs").setMaster("local[*]"));
  private static final String POSTGRESQL_DRIVER = "org.postgresql.Driver";
  // NOTE(review): credentials are hard-coded and end up inside the JDBC URL; move them to
  // external configuration before using this outside a local experiment.
  private static final String POSTGRESQL_USERNAME = "abc";
  private static final String POSTGRESQL_PWD = "abc";
  private static final String POSTGRESQL_CONNECTION_URL =
      "jdbc:postgresql://192.168.111.130:5432/abcdb?user=" + POSTGRESQL_USERNAME
          + "&password=" + POSTGRESQL_PWD;
  private static final SQLContext sqlContext = new SQLContext(sc);

  // Row positions follow the select list of the query below ("select id, duration ...");
  // Row indices are 0-based.
  private static final int ID_COLUMN_INDEX = 0;
  private static final int DURATION_COLUMN_INDEX = 1;

  // NOTE(review): these were undefined in the original code; the values are a starting
  // point only (2 matches the DataFrame-based variant) and should be tuned for the data.
  private static final int NUM_CLUSTERS = 2;
  private static final int NUM_ITERATIONS = 20;

  /**
   * Loads the rows, trains the model, predicts a cluster per row and saves the result.
   *
   * @param args unused
   * @throws Exception if reading, training or writing fails
   */
  public static void main(String[] args) throws Exception {
    try {
      // Datasource options
      Map<String, String> options = new HashMap<>();
      options.put("driver", POSTGRESQL_DRIVER);
      options.put("url", POSTGRESQL_CONNECTION_URL);
      options.put("dbtable",
          "(select id, duration from sdr_log where duration is not null ) as sdr_log");

      // DataFrameReader replaces the removed SQLContext.load(source, options).
      Dataset<Row> sdrDF = sqlContext.read().format("jdbc").options(options).load();
      JavaRDD<Row> sdrData = sdrDF.toJavaRDD();
      // Cached because the rows are traversed twice: once for the features, once for the ids.
      sdrData.cache();

      // Going through toString() keeps the parse independent of the JDBC numeric type.
      JavaRDD<Vector> durationData = sdrData.map(row ->
          Vectors.dense(Double.parseDouble(row.get(DURATION_COLUMN_INDEX).toString())));
      durationData.cache();

      KMeansModel clusters = KMeans.train(durationData.rdd(), NUM_CLUSTERS, NUM_ITERATIONS);
      JavaRDD<Integer> clusterLabel = clusters.predict(durationData);

      // Both RDDs are element-wise maps of sdrData, so zip pairs every id with the
      // prediction computed for the same row.
      JavaRDD<Long> id = sdrData.map(row -> Long.valueOf(row.get(ID_COLUMN_INDEX).toString()));
      JavaPairRDD<Long, Integer> clusterLabelData = id.zip(clusterLabel);
      clusterLabelData.saveAsTextFile("data/mlib/kmeans_output11.txt");
    } finally {
      sc.stop();
    }
  }
}
Dataset<Row> durationDS = sdrDS.select("duration");
> DataFrame jdbcDF = sqlContext.read().format("libsvm").jdbc(POSTGRESQL_CONNECTION_URL, POSTGRESQL_TABLE, options);
> KMeansModel model = kmeans.fit(jdbcDF);