Java DBSCAN群集算法工作不正常。我做错了什么?
我正试图编写DBSCAN算法来对一组点进行聚类,但结果非常糟糕。这可能是因为数据,但不仅仅如此:我得到了大小小于minPts的聚类,而这本不应该发生。(标签:java, data-mining, cluster-analysis, dbscan)
private static int[]dbScan(字符串[]点,intε,int minPts){
int集群=0;
//如果已访问该点,则访问了门店
boolean[]访问=新的boolean[points.length];
//点群集存储点已分配给的群集
int[]pointsCluster=新int[points.length];
对于(int iii=0;iiiminPts
*/
私有静态void expandCluster(字符串[]点、int种子点、哈希集邻居、,
int[]点群集、访问的布尔[]、int群集、intε、int minPts){
pointsCluster[seedPoint]=cluster;//将群集分配给种子点
//创建队列以处理邻居
队列种子=新的LinkedList();
种子。添加所有(邻居);
而(!seeds.isEmpty()){
int currentPoint=(整数)seeds.poll();
如果(!已访问[currentPoint]){
已访问[currentPoint]=true;//将邻居标记为已访问
//获取此点的邻居
HashSet currentNeighbors=ε(点,currentPoint,ε);
//如果currentPoint在邻域中有>=minPts,则将这些点添加到队列中
if(currentNeights.size()>=minPts){
种子。添加所有(当前邻居);
}
}
//如果currentPoint尚未分配群集,请将其分配给当前群集
如果(pointsCluster[currentPoint]=0)pointsCluster[currentPoint]=cluster;
}
}
/*
*返回一个哈希集,其中包含以下点的索引:
*在索引==当前点处点的ε邻域中
*/
私有静态哈希集epsiloneighbors(字符串[]点,int currentPoint,int epsilon){
HashSet邻居=新HashSet();
字符串蛋白质=点[当前点];
对于(int iii=0;iii=ε)相邻。添加(iii);
}
回归邻居;
}
当结果不好时,可能是因为数据不好(对于基于密度的聚类),或者是因为参数不好
事实上,当聚类彼此相邻时,DBSCAN可能产生小于minPts的聚类,因为相邻的聚类会互相“窃取”边界点。
不妨用ELKI之类的工具来验证你的算法输出?另外建议去阅读原始论文,而不是维基百科!
哇,你说得对。我没有想到聚类会“窃取”边界点。谢谢。所以,从表面上看,算法没有问题,对吧?
我没有详细检查。你的 epsilonNeighbors 引用了未定义的变量 jjj。还要注意,Java集合在处理基本类型时性能非常差。你也许真的应该试试ELKI,因为它非常快。
是的,jjj 应该是 currentPoint。我会去了解一下ELKI。谢谢你的帮助。
/*
 * Labels every entry of points with a cluster id and returns the labels.
 * Label meanings in the returned array: -1 = noise, 1..k = cluster id
 * (0 is only the initial "not yet assigned" value while the scan runs).
 * A point starts a new cluster when its epsilon neighborhood holds
 * at least minPts indexes; otherwise it is marked as noise.
 */
private static int[] dbScan(String[] points, int epsilon, int minPts) {
    final int total = points.length;
    // seen[i] becomes true once point i has had its neighborhood examined
    boolean[] seen = new boolean[total];
    // labels[i] is the cluster assignment for point i
    int[] labels = new int[total];
    int clusterCount = 0;

    for (int idx = 0; idx < total; idx++) {
        if (!seen[idx]) {
            seen[idx] = true;
            HashSet<Integer> nearby = epsilonNeighbors(points, idx, epsilon);
            boolean dense = nearby.size() >= minPts;
            if (dense) {
                // dense enough: open the next cluster and grow it from here
                clusterCount++;
                expandCluster(points, idx, nearby, labels, seen, clusterCount, epsilon, minPts);
            } else {
                // too few neighbors: label as noise
                labels[idx] = -1;
            }
        }
    }
    return labels;
}
/*
 * Grows the given cluster outward from seedPoint.
 *
 * seedPoint is labelled with cluster, then every index in neighbors is
 * processed breadth-first. An unvisited point is marked visited and, if its
 * own epsilon neighborhood holds >= minPts indexes, that neighborhood is
 * queued as well. Every processed point that does not yet belong to a
 * cluster (label 0, or -1 = noise as set by dbScan) is labelled with cluster.
 *
 * points        - the items being clustered
 * seedPoint     - index of the point that started this cluster
 * neighbors     - indexes in the epsilon neighborhood of seedPoint
 * pointsCluster - per-point labels; updated in place
 * visited       - per-point visited flags; updated in place
 * cluster       - id of the cluster being grown
 * epsilon       - threshold passed through to epsilonNeighbors
 * minPts        - minimum neighborhood size for a point to keep expanding
 */
private static void expandCluster(String[] points, int seedPoint, HashSet<Integer> neighbors,
        int[] pointsCluster, boolean[] visited, int cluster, int epsilon, int minPts) {
    pointsCluster[seedPoint] = cluster; // assign cluster to seed point
    // queue of indexes still to be processed, seeded with the initial neighborhood
    Queue<Integer> seeds = new LinkedList<Integer>(neighbors);
    while (!seeds.isEmpty()) {
        int currentPoint = seeds.poll();
        if (!visited[currentPoint]) {
            visited[currentPoint] = true; // mark neighbor as visited
            HashSet<Integer> currentNeighbors = epsilonNeighbors(points, currentPoint, epsilon);
            // only points with a dense neighborhood keep the expansion going
            if (currentNeighbors.size() >= minPts) {
                seeds.addAll(currentNeighbors);
            }
        }
        // Claim the point unless another cluster already owns it. A point that
        // dbScan labelled as noise (-1) earlier is reachable from this cluster,
        // so it becomes a border point instead of staying noise.
        if (pointsCluster[currentPoint] <= 0) {
            pointsCluster[currentPoint] = cluster;
        }
    }
}
/*
 * Returns a HashSet containing the indexes of all points whose similarity
 * score against the point at index currentPoint is >= epsilon.
 *
 * Every index in points is tested, including currentPoint itself; whether
 * the point ends up in its own neighborhood depends on what similarity
 * returns for identical inputs.
 * NOTE(review): similarity is defined outside this view; a higher score
 * presumably means "more alike", so epsilon acts as a minimum similarity
 * rather than a maximum distance - confirm.
 */
private static HashSet<Integer> epsilonNeighbors(String[] points, int currentPoint, int epsilon) {
    HashSet<Integer> neighbors = new HashSet<Integer>();
    String protein = points[currentPoint];
    for (int iii = 0; iii < points.length; iii++) {
        // compare against the reference point (the original indexed with an
        // undeclared variable here, which did not compile)
        int score = similarity(points[iii], protein);
        if (score >= epsilon) {
            neighbors.add(iii);
        }
    }
    return neighbors;
}