Java 为什么ApacheMahout频繁模式minnig算法只返回1个项目集?
我目前正在测试ApacheMahout并行。在实际项目中使用它之前,我从一个简单的代码开始,只是为了确保它能像我期望的那样工作 我没有找到包含代码、数据和输出的完整示例 我目前有一个编译和执行版本(参见下面的java/scala代码),但是返回的频繁模式只包含一个元组(参见下面的示例输出) 这是故意的行为吗? 我做错了什么 谢谢你的帮助 scala代码:Java 为什么ApacheMahout频繁模式minnig算法只返回1个项目集?,java,algorithm,scala,data-mining,mahout,Java,Algorithm,Scala,Data Mining,Mahout,我目前正在测试ApacheMahout并行。在实际项目中使用它之前,我从一个简单的代码开始,只是为了确保它能像我期望的那样工作 我没有找到包含代码、数据和输出的完整示例 我目前有一个编译和执行版本(参见下面的java/scala代码),但是返回的频繁模式只包含一个元组(参见下面的示例输出) 这是故意的行为吗? 我做错了什么 谢谢你的帮助 scala代码: import org.apache.mahout.fpm.pfpgrowth.fpgrowth.FPGrowth import jav
import org.apache.mahout.fpm.pfpgrowth.fpgrowth.FPGrowth
import java.util.HashSet
import org.apache.mahout.common.iterator.StringRecordIterator
import org.apache.mahout.common.iterator.FileLineIterable
import org.apache.mahout.fpm.pfpgrowth.convertors._
import org.apache.mahout.fpm.pfpgrowth.convertors.integer._
import org.apache.mahout.fpm.pfpgrowth.convertors.string._
import org.apache.hadoop.io.SequenceFile.Writer
import org.apache.mahout.fpm.pfpgrowth.convertors.StatusUpdater
import org.apache.hadoop.mapred.OutputCollector
import scala.collection.JavaConversions._
import java.util.{ List => JList }
import org.apache.mahout.common.{ Pair => JPair }
import java.lang.{ Long => JLong }
import org.apache.hadoop.io.{ Text => JText }
val minSupport = 5L
val k: Int = 50
val fps: FPGrowth[String] = new FPGrowth[String]()
val milk = "milk"
val bread = "bread"
val butter = "butter"
val bier = "bier"
val transactionStream: Iterator[JPair[JList[String], JLong]] = Iterator(
new JPair(List(milk, bread), 10L),
new JPair(List(butter), 10L),
new JPair(List(bier), 10L),
new JPair(List(milk, bread, butter), 5L),
new JPair(List(milk, bread, bier), 5L),
new JPair(List(bread), 10L)
)
val frequencies: Collection[JPair[String, JLong]] = fps.generateFList(
transactionStream, minSupport.toInt)
println("freqList :" + frequencies)
var returnableFeatures: Collection[String] = List(
milk, bread, butter, bier)
var output: OutputCollector[String, JList[JPair[JList[String], JLong]]] = (
new OutputCollector[String, JList[JPair[JList[String], JLong]]] {
def collect(x1: String,
x2: JList[JPair[JList[String], JLong]]) = {
println(x1 + ":" +
x2.map(pair => "[" + pair.getFirst.mkString(",") + "] : " +
pair.getSecond).mkString("; "))
}
}
)
val updater: StatusUpdater = new StatusUpdater {
def update(status: String) = println("updater : " + status)
}
fps.generateTopKFrequentPatterns(
transactionStream,
frequencies,
minSupport,
k,
null, //returnableFeatures
output,
updater)
java代码:
import org.apache.mahout.fpm.pfpgrowth.fpgrowth.*;
import java.io.IOException;
import java.util.*;
import org.apache.mahout.common.iterator.*;
import org.apache.mahout.fpm.pfpgrowth.convertors.*;
import org.apache.mahout.fpm.pfpgrowth.convertors.integer.*;
import org.apache.mahout.fpm.pfpgrowth.convertors.string.*;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.mahout.common.*;
import org.apache.hadoop.io.Text;
class FPGrowthDemo {
public static void main(String[] args) {
long minSupport = 1L;
int k = 50;
FPGrowth<String> fps = new FPGrowth<String>();
String milk = "milk";
String bread = "bread";
String butter = "butter";
String bier = "bier";
LinkedList<Pair<List<String>, Long>> data =
new LinkedList<Pair<List<String>, Long>>();
data.add(new Pair(Arrays.asList(milk, bread), 1L));
data.add(new Pair(Arrays.asList(butter), 1L));
data.add(new Pair(Arrays.asList(bier), 1L));
data.add(new Pair(Arrays.asList(milk, bread, butter), 1L));
data.add(new Pair(Arrays.asList(milk, bread, bier), 1L));
data.add(new Pair(Arrays.asList(milk, bread), 1L));
Iterator<Pair<List<String>, Long>> transactions = data.iterator();
Collection<Pair<String, Long>> frequencies = fps.generateFList(
transactions, (int) minSupport);
System.out.println("freqList :" + frequencies);
Collection<String> returnableFeatures =
Arrays.asList(milk, bread, butter, bier);
OutputCollector<String, List<Pair<List<String>, Long>>> output =
new OutputCollector<String, List<Pair<List<String>, Long>>>() {
@Override
public void collect(String x1,
List<Pair<List<String>, Long>> listPair)
throws IOException {
StringBuffer sb = new StringBuffer();
sb.append(x1 + ":");
for (Pair<List<String>, Long> pair : listPair) {
sb.append("[");
String sep = "";
for (String item : pair.getFirst()) {
sb.append(item + sep);
sep = ", ";
}
sb.append("]:" + pair.getSecond());
}
System.out.println(" " + sb.toString());
}
};
StatusUpdater updater = new StatusUpdater() {
public void update(String status){
System.out.println("updater :" + status);
}
};
try {
fps.generateTopKFrequentPatterns(
transactions,
frequencies,
minSupport,
k,
null, //returnableFeatures
output,
updater);
}catch (Exception e){
e.printStackTrace();
}
}
}
import org.apache.mahout.fpm.pfpgrowth.fpgrowth.*;
import java.io.IOException;
import java.util.*;
import org.apache.mahout.common.iterator.*;
import org.apache.mahout.fpm.pfpgrowth.convertors.*;
import org.apache.mahout.fpm.pfpgrowth.convertors.integer.*;
import org.apache.mahout.fpm.pfpgrowth.convertors.string.*;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.mahout.common.*;
import org.apache.hadoop.io.Text;
class FPGrowthDemo {
public static void main(String[] args) {
long minSupport = 1L;
int k = 50;
FPGrowth<String> fps = new FPGrowth<String>();
String milk = "milk";
String bread = "bread";
String butter = "butter";
String bier = "bier";
LinkedList<Pair<List<String>, Long>> data =
new LinkedList<Pair<List<String>, Long>>();
data.add(new Pair(Arrays.asList(milk, bread), 1L));
data.add(new Pair(Arrays.asList(butter), 1L));
data.add(new Pair(Arrays.asList(bier), 1L));
data.add(new Pair(Arrays.asList(milk, bread, butter), 1L));
data.add(new Pair(Arrays.asList(milk, bread, bier), 1L));
data.add(new Pair(Arrays.asList(milk, bread), 1L));
// This lines is removed...
// Iterator<Pair<List<String>, Long>> transactions = data.iterator();
Collection<Pair<String, Long>> frequencies = fps.generateFList(
data.iterator(), // use an iterator here...
(int) minSupport);
System.out.println("freqList :" + frequencies);
OutputCollector<String, List<Pair<List<String>, Long>>> output =
new OutputCollector<String, List<Pair<List<String>, Long>>>() {
@Override
public void collect(String x1,
List<Pair<List<String>, Long>> listPair)
throws IOException {
StringBuffer sb = new StringBuffer();
sb.append(x1 + ":");
for (Pair<List<String>, Long> pair : listPair) {
sb.append("[");
String sep = "";
for (String item : pair.getFirst()) {
sb.append(item + sep);
sep = ", ";
}
sb.append("]:" + pair.getSecond());
}
System.out.println(" " + sb.toString());
}
};
StatusUpdater updater = new StatusUpdater() {
public void update(String status) {
System.out.println("updater :" + status);
}
};
try {
fps.generateTopKFrequentPatterns(
// changed here (previously : transactions)
data.iterator(), // use a "fresh" iterator
frequencies,
minSupport,
k,
null,
output,
updater);
} catch (Exception e) {
e.printStackTrace();
}
}
}
代码有缺陷:首先调用事务的迭代器来计算频率,然后再由fp-growth算法调用。问题是,第二个调用将不返回任何值,因为迭代器已到达其末尾 以下是正确的java代码供参考:
import org.apache.mahout.fpm.pfpgrowth.fpgrowth.*;
import java.io.IOException;
import java.util.*;
import org.apache.mahout.common.iterator.*;
import org.apache.mahout.fpm.pfpgrowth.convertors.*;
import org.apache.mahout.fpm.pfpgrowth.convertors.integer.*;
import org.apache.mahout.fpm.pfpgrowth.convertors.string.*;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.mahout.common.*;
import org.apache.hadoop.io.Text;
class FPGrowthDemo {
public static void main(String[] args) {
long minSupport = 1L;
int k = 50;
FPGrowth<String> fps = new FPGrowth<String>();
String milk = "milk";
String bread = "bread";
String butter = "butter";
String bier = "bier";
LinkedList<Pair<List<String>, Long>> data =
new LinkedList<Pair<List<String>, Long>>();
data.add(new Pair(Arrays.asList(milk, bread), 1L));
data.add(new Pair(Arrays.asList(butter), 1L));
data.add(new Pair(Arrays.asList(bier), 1L));
data.add(new Pair(Arrays.asList(milk, bread, butter), 1L));
data.add(new Pair(Arrays.asList(milk, bread, bier), 1L));
data.add(new Pair(Arrays.asList(milk, bread), 1L));
Iterator<Pair<List<String>, Long>> transactions = data.iterator();
Collection<Pair<String, Long>> frequencies = fps.generateFList(
transactions, (int) minSupport);
System.out.println("freqList :" + frequencies);
Collection<String> returnableFeatures =
Arrays.asList(milk, bread, butter, bier);
OutputCollector<String, List<Pair<List<String>, Long>>> output =
new OutputCollector<String, List<Pair<List<String>, Long>>>() {
@Override
public void collect(String x1,
List<Pair<List<String>, Long>> listPair)
throws IOException {
StringBuffer sb = new StringBuffer();
sb.append(x1 + ":");
for (Pair<List<String>, Long> pair : listPair) {
sb.append("[");
String sep = "";
for (String item : pair.getFirst()) {
sb.append(item + sep);
sep = ", ";
}
sb.append("]:" + pair.getSecond());
}
System.out.println(" " + sb.toString());
}
};
StatusUpdater updater = new StatusUpdater() {
public void update(String status){
System.out.println("updater :" + status);
}
};
try {
fps.generateTopKFrequentPatterns(
transactions,
frequencies,
minSupport,
k,
null, //returnableFeatures
output,
updater);
}catch (Exception e){
e.printStackTrace();
}
}
}
import org.apache.mahout.fpm.pfpgrowth.fpgrowth.*;
import java.io.IOException;
import java.util.*;
import org.apache.mahout.common.iterator.*;
import org.apache.mahout.fpm.pfpgrowth.convertors.*;
import org.apache.mahout.fpm.pfpgrowth.convertors.integer.*;
import org.apache.mahout.fpm.pfpgrowth.convertors.string.*;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.mahout.common.*;
import org.apache.hadoop.io.Text;
class FPGrowthDemo {
public static void main(String[] args) {
long minSupport = 1L;
int k = 50;
FPGrowth<String> fps = new FPGrowth<String>();
String milk = "milk";
String bread = "bread";
String butter = "butter";
String bier = "bier";
LinkedList<Pair<List<String>, Long>> data =
new LinkedList<Pair<List<String>, Long>>();
data.add(new Pair(Arrays.asList(milk, bread), 1L));
data.add(new Pair(Arrays.asList(butter), 1L));
data.add(new Pair(Arrays.asList(bier), 1L));
data.add(new Pair(Arrays.asList(milk, bread, butter), 1L));
data.add(new Pair(Arrays.asList(milk, bread, bier), 1L));
data.add(new Pair(Arrays.asList(milk, bread), 1L));
// This lines is removed...
// Iterator<Pair<List<String>, Long>> transactions = data.iterator();
Collection<Pair<String, Long>> frequencies = fps.generateFList(
data.iterator(), // use an iterator here...
(int) minSupport);
System.out.println("freqList :" + frequencies);
OutputCollector<String, List<Pair<List<String>, Long>>> output =
new OutputCollector<String, List<Pair<List<String>, Long>>>() {
@Override
public void collect(String x1,
List<Pair<List<String>, Long>> listPair)
throws IOException {
StringBuffer sb = new StringBuffer();
sb.append(x1 + ":");
for (Pair<List<String>, Long> pair : listPair) {
sb.append("[");
String sep = "";
for (String item : pair.getFirst()) {
sb.append(item + sep);
sep = ", ";
}
sb.append("]:" + pair.getSecond());
}
System.out.println(" " + sb.toString());
}
};
StatusUpdater updater = new StatusUpdater() {
public void update(String status) {
System.out.println("updater :" + status);
}
};
try {
fps.generateTopKFrequentPatterns(
// changed here (previously : transactions)
data.iterator(), // use a "fresh" iterator
frequencies,
minSupport,
k,
null,
output,
updater);
} catch (Exception e) {
e.printStackTrace();
}
}
}
import org.apache.mahout.fpm.pfpgrowth.fpgrowth.*;
导入java.io.IOException;
导入java.util.*;
导入org.apache.mahout.common.iterator.*;
导入org.apache.mahout.fpm.pfpgrowth.convertors.*;
导入org.apache.mahout.fpm.pfpgrowth.convertors.integer.*;
导入org.apache.mahout.fpm.pfpgrowth.convertors.string.*;
导入org.apache.hadoop.mapred.OutputCollector;
导入org.apache.mahout.common.*;
导入org.apache.hadoop.io.Text;
类FPGrowthDemo{
公共静态void main(字符串[]args){
长分钟支持=1L;
int k=50;
FPGrowth fps=新的FPGrowth();
String milk=“milk”;
stringbread=“面包”;
String butter=“butter”;
字符串bier=“bier”;
LinkedList数据=
新建LinkedList();
添加(新的一对(数组.asList(牛奶,面包),1L));
添加(新的对(Arrays.asList(butter),1L));
添加(新对(Arrays.asList(bier),1L));
添加(新的一对(数组.asList(牛奶,面包,黄油),1L));
添加(新的一对(数组.asList(牛奶,面包,啤酒),1L));
添加(新的一对(数组.asList(牛奶,面包),1L));
//此行已删除。。。
//迭代器事务=data.Iterator();
采集频率=fps.generateFList(
data.iterator(),//在此处使用迭代器。。。
(int)minSupport);
System.out.println(“频率列表:+频率);
OutputCollector输出=
新的OutputCollector(){
@凌驾
公共void collect(字符串x1,
列表(列表对)
抛出IOException{
StringBuffer sb=新的StringBuffer();
sb.附加(x1+“:”);
for(对:列表对){
某人加上(“[”);
字符串sep=“”;
for(字符串项:pair.getFirst()){
某人追加(项目+sep);
sep=“,”;
}
sb.append(“]:”+pair.getSecond());
}
System.out.println(“+sb.toString());
}
};
StatusUpdater updater=新的StatusUpdater(){
公共无效更新(字符串状态){
System.out.println(“更新程序:+状态”);
}
};
试一试{
fps.generateTopKFrequentPatterns(
//此处更改(以前为:事务)
data.iterator(),//使用“fresh”迭代器
频率,
minSupport,
K
无效的
产出,
更新程序);
}捕获(例外e){
e、 printStackTrace();
}
}
}
代码有缺陷:首先调用事务的迭代器来计算频率,然后由fp-growth算法再次调用。问题是,第二个调用将不返回任何值,因为迭代器已到达其末尾
以下是正确的java代码供参考:
import org.apache.mahout.fpm.pfpgrowth.fpgrowth.*;
import java.io.IOException;
import java.util.*;
import org.apache.mahout.common.iterator.*;
import org.apache.mahout.fpm.pfpgrowth.convertors.*;
import org.apache.mahout.fpm.pfpgrowth.convertors.integer.*;
import org.apache.mahout.fpm.pfpgrowth.convertors.string.*;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.mahout.common.*;
import org.apache.hadoop.io.Text;
class FPGrowthDemo {
public static void main(String[] args) {
long minSupport = 1L;
int k = 50;
FPGrowth<String> fps = new FPGrowth<String>();
String milk = "milk";
String bread = "bread";
String butter = "butter";
String bier = "bier";
LinkedList<Pair<List<String>, Long>> data =
new LinkedList<Pair<List<String>, Long>>();
data.add(new Pair(Arrays.asList(milk, bread), 1L));
data.add(new Pair(Arrays.asList(butter), 1L));
data.add(new Pair(Arrays.asList(bier), 1L));
data.add(new Pair(Arrays.asList(milk, bread, butter), 1L));
data.add(new Pair(Arrays.asList(milk, bread, bier), 1L));
data.add(new Pair(Arrays.asList(milk, bread), 1L));
Iterator<Pair<List<String>, Long>> transactions = data.iterator();
Collection<Pair<String, Long>> frequencies = fps.generateFList(
transactions, (int) minSupport);
System.out.println("freqList :" + frequencies);
Collection<String> returnableFeatures =
Arrays.asList(milk, bread, butter, bier);
OutputCollector<String, List<Pair<List<String>, Long>>> output =
new OutputCollector<String, List<Pair<List<String>, Long>>>() {
@Override
public void collect(String x1,
List<Pair<List<String>, Long>> listPair)
throws IOException {
StringBuffer sb = new StringBuffer();
sb.append(x1 + ":");
for (Pair<List<String>, Long> pair : listPair) {
sb.append("[");
String sep = "";
for (String item : pair.getFirst()) {
sb.append(item + sep);
sep = ", ";
}
sb.append("]:" + pair.getSecond());
}
System.out.println(" " + sb.toString());
}
};
StatusUpdater updater = new StatusUpdater() {
public void update(String status){
System.out.println("updater :" + status);
}
};
try {
fps.generateTopKFrequentPatterns(
transactions,
frequencies,
minSupport,
k,
null, //returnableFeatures
output,
updater);
}catch (Exception e){
e.printStackTrace();
}
}
}
import org.apache.mahout.fpm.pfpgrowth.fpgrowth.*;
import java.io.IOException;
import java.util.*;
import org.apache.mahout.common.iterator.*;
import org.apache.mahout.fpm.pfpgrowth.convertors.*;
import org.apache.mahout.fpm.pfpgrowth.convertors.integer.*;
import org.apache.mahout.fpm.pfpgrowth.convertors.string.*;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.mahout.common.*;
import org.apache.hadoop.io.Text;
class FPGrowthDemo {
public static void main(String[] args) {
long minSupport = 1L;
int k = 50;
FPGrowth<String> fps = new FPGrowth<String>();
String milk = "milk";
String bread = "bread";
String butter = "butter";
String bier = "bier";
LinkedList<Pair<List<String>, Long>> data =
new LinkedList<Pair<List<String>, Long>>();
data.add(new Pair(Arrays.asList(milk, bread), 1L));
data.add(new Pair(Arrays.asList(butter), 1L));
data.add(new Pair(Arrays.asList(bier), 1L));
data.add(new Pair(Arrays.asList(milk, bread, butter), 1L));
data.add(new Pair(Arrays.asList(milk, bread, bier), 1L));
data.add(new Pair(Arrays.asList(milk, bread), 1L));
// This lines is removed...
// Iterator<Pair<List<String>, Long>> transactions = data.iterator();
Collection<Pair<String, Long>> frequencies = fps.generateFList(
data.iterator(), // use an iterator here...
(int) minSupport);
System.out.println("freqList :" + frequencies);
OutputCollector<String, List<Pair<List<String>, Long>>> output =
new OutputCollector<String, List<Pair<List<String>, Long>>>() {
@Override
public void collect(String x1,
List<Pair<List<String>, Long>> listPair)
throws IOException {
StringBuffer sb = new StringBuffer();
sb.append(x1 + ":");
for (Pair<List<String>, Long> pair : listPair) {
sb.append("[");
String sep = "";
for (String item : pair.getFirst()) {
sb.append(item + sep);
sep = ", ";
}
sb.append("]:" + pair.getSecond());
}
System.out.println(" " + sb.toString());
}
};
StatusUpdater updater = new StatusUpdater() {
public void update(String status) {
System.out.println("updater :" + status);
}
};
try {
fps.generateTopKFrequentPatterns(
// changed here (previously : transactions)
data.iterator(), // use a "fresh" iterator
frequencies,
minSupport,
k,
null,
output,
updater);
} catch (Exception e) {
e.printStackTrace();
}
}
}
import org.apache.mahout.fpm.pfpgrowth.fpgrowth.*;
导入java.io.IOException;
导入java.util.*;
导入org.apache.mahout.common.iterator.*;
导入org.apache.mahout.fpm.pfpgrowth.convertors.*;
导入org.apache.mahout.fpm.pfpgrowth.convertors.integer.*;
导入org.apache.mahout.fpm.pfpgrowth.convertors.string.*;
导入org.apache.hadoop.mapred.OutputCollector;
导入org.apache.mahout.common.*;
导入org.apache.hadoop.io.Text;
类FPGrowthDemo{
公共静态void main(字符串[]args){
长分钟支持=1L;
int k=50;
FPGrowth fps=新的FPGrowth();
String milk=“milk”;
stringbread=“面包”;
String butter=“butter”;
字符串bier=“bier”;
LinkedList数据=
新建LinkedList();
添加(新的一对(数组.asList(牛奶,面包),1L));
添加(新的对(Arrays.asList(butter),1L));
添加(新对(Arrays.asList(bier),1L));
添加(新的一对(数组.asList(牛奶,面包,黄油),1L));
添加(新的一对(数组.asList(牛奶,面包,啤酒),1L));
添加(新的一对(数组.asList(牛奶,面包),1L));
//此行已删除。。。
//迭代器事务=data.Iterator();
采集频率=fps.generateFList(
data.iterator(),//在此处使用迭代器。。。
(int)minSupport);
System.out.println(“频率列表:+频率);
OutputCollector输出=
新的OutputCollector(){
@凌驾
公共void collect(字符串x1,
列表(列表对)
抛出IOException{
StringBuffer sb=新的StringBuffer();
sb.附加(x1+“:”);
for(对:列表对){
某人加上(“[”);
字符串sep=“”;
for(字符串项:pair.getFirst()){
某人追加(项目+sep);
sep=“,”;
}
sb.append(“]:”+pair.getSecond());
}
System.out.println(“+sb.toString());
}
};
StatusUpdater updater=新的StatusUpdater(){
公共无效更新(字符串状态){
System.out.println(“更新程序:+状态”);
}
};
试一试{
fps.generateTopKFrequentPatterns(
//此处更改(以前为:事务)
data.iterator(),//使用“fresh”迭代器
频率,
minSupport,
K
无效的
产出,