Python 将点移动到最近的未占用栅格位置_Python_Pandas_Algorithm_Geometry_Data Visualization

Python 将点移动到最近的未占用栅格位置

python pandas algorithm geometry

Python 将点移动到最近的未占用栅格位置,python,pandas,algorithm,geometry,data-visualization,Python,Pandas,Algorithm,Geometry,Data Visualization,我有一个2D空间中点的随机分布，如下所示： from sklearn import datasets import pandas as pd import numpy as np arr, labels = datasets.make_moons() arr, labels = datasets.make_blobs(n_samples=1000, centers=3) pd.DataFrame(arr, columns=['x', 'y']).plot.scatter('x', 'y', s

我有一个2D空间中点的随机分布，如下所示：

from sklearn import datasets
import pandas as pd
import numpy as np

arr, labels = datasets.make_moons()
arr, labels = datasets.make_blobs(n_samples=1000, centers=3)
pd.DataFrame(arr, columns=['x', 'y']).plot.scatter('x', 'y', s=1)

我试图将这些点分配到一个虚构的六边形网格上最近的未占用插槽，以确保这些点不重叠。我用来实现这个目标的代码生成了下面的图。一般的想法是创建十六进制箱子，然后在每个点上迭代，找到允许算法将该点分配给未占用十六进制箱子的最小半径：

from scipy.spatial.distance import euclidean

def get_bounds(arr):
  '''Given a 2D array return the y_min, y_max, x_min, x_max'''
  return [
    np.min(arr[:,1]),
    np.max(arr[:,1]),
    np.min(arr[:,0]),
    np.max(arr[:,0]),
  ]

def create_mesh(arr, h=100, w=100):
  '''arr is a 2d array; h=number of vertical divisions; w=number of horizontal divisions'''
  print(' * creating mesh with size', h, w)
  bounds = get_bounds(arr)
  # create array of valid positions
  y_vals = np.arange(bounds[0], bounds[1], (bounds[1]-bounds[0])/h)
  x_vals = np.arange(bounds[2], bounds[3], (bounds[3]-bounds[2])/w)
  # create the dense mesh
  data = np.tile(
    [[0, 1], [1, 0]],
    np.array([
      int(np.ceil(len(y_vals) / 2)),
      int(np.ceil(len(x_vals) / 2)),
    ]))
  # ensure each axis has an even number of slots
  if len(y_vals) % 2 != 0 or len(x_vals) % 2 != 0:
    data = data[0:len(y_vals), 0:len(x_vals)]
  return pd.DataFrame(data, index=y_vals, columns=x_vals)


def align_points_to_grid(arr, h=100, w=100, verbose=False):
  '''arr is a 2d array; h=number of vertical divisions; w=number of horizontal divisions'''
  h = w = len(arr)/10
  grid = create_mesh(arr, h=h, w=w)

  # fill the mesh
  print(' * filling mesh')
  df = pd.DataFrame(arr, columns=['x', 'y'])
  bounds = get_bounds(arr)
  # store the number of points slotted
  c = 0
  for site, point in df[['x', 'y']].iterrows():
    # skip points not in original points domain
    if point.y < bounds[0] or point.y > bounds[1] or \
       point.x < bounds[2] or point.x > bounds[3]:
      raise Exception('Input point is out of bounds', point.x, point.y, bounds)

    # assign this point to the nearest open slot
    r_y = (bounds[1]-bounds[0])/h
    r_x = (bounds[3]-bounds[2])/w
    slotted = False
    while not slotted:

      bottom = grid.index.searchsorted(point.y - r_y)
      top = grid.index.searchsorted(point.y + r_y, side='right')
      left = grid.columns.searchsorted(point.x - r_x)
      right = grid.columns.searchsorted(point.x + r_x, side='right')
      close_grid_points = grid.iloc[bottom:top, left:right]

      # store the position in this point's radius that minimizes distortion
      best_dist = np.inf
      grid_loc = [np.nan, np.nan]
      for x, col in close_grid_points.iterrows():
        for y, val in col.items():
          if val != 1: continue
          dist = euclidean(point, (x,y))
          if dist < best_dist:
            best_dist = dist
            grid_loc = [x,y]

      # no open slots were found so increase the search radius
      if np.isnan(grid_loc[0]):
        r_y *= 2
        r_x *= 2
      # success! report the slotted position to the user
      else:
        # assign a value other than 1 to mark this slot as filled
        grid.loc[grid_loc[0], grid_loc[1]] = 2
        df.loc[site, ['x', 'y']] = grid_loc
        slotted = True
        c += 1

        if verbose:
          print(' * completed', c, 'of', len(arr), 'assignments')
  return df


# plot sample data
df = align_points_to_grid(arr, verbose=False)
df.plot.scatter('x', 'y', s=1)

从scipy.spatial.distance导入欧几里得
def get_界限（arr）：
''给定一个2D数组，返回y_min，y_max，x_min，x_max''
返回[
np.min（arr[：，1]），
np.max（arr[：，1]），
np.min（arr[：，0]），
np.max（arr[：，0]），
]
def创建网格（arr，h=100，w=100）：
“arr”是一个二维数组；h=垂直分段的数量；w=水平分段数“”'
打印（'*创建大小为''h，w'的网格）
边界=获取边界（arr）
#创建有效位置的数组
y_vals=np.arange（界[0]，界[1]，（界[1]-界[0]）/h）
x_vals=np.arange（界[2]，界[3]，（界[3]-界[2]）/w）
#创建密集网格
数据=np.tile(
[[0, 1], [1, 0]],
np.数组([
内部（np.ceil（len（y_vals）/2）），
内部（np.ceil（len（x_vals）/2）），
]))
#确保每个轴具有偶数个插槽
如果len（y_vals）%2！=0或len（x_vals）%2！=0:
数据=数据[0:len（y值），0:len（x值）]
返回pd.DataFrame（数据，索引=y\U VAL，列=x\U VAL）
def将_点_与_网格对齐（arr，h=100，w=100，verbose=False）：
“arr”是一个二维数组；h=垂直分段的数量；w=水平分段数“”'
h=w=len（arr）/10
网格=创建网格（arr，h=h，w=w）
#填充网格
打印（“*填充网格”）
df=pd.DataFrame（arr，columns=['x'，'y']）
边界=获取边界（arr）
#存储点的数量
c=0
对于站点，指向df['x'，'y']]中的点。iterrows（）：
#跳过不在原始点域中的点
如果点.y<边界[0]或点.y>边界[1]或\
点.x<边界[2]或点.x>边界[3]：
引发异常（'输入点超出边界'，点.x，点.y，边界）
#将此点指定给最近的打开插槽
r_y=（界[1]-界[0]）/h
r_x=（界[3]-界[2]）/w
时隙=假
虽然没有开槽：
底部=grid.index.searchsorted（point.y-r_y）
top=grid.index.searchsorted（point.y+r\u y，side='right'）
左=grid.columns.searchsorted（point.x-r_x）
right=grid.columns.searchsorted（point.x+r\ux，side='right'）
close\u grid\u points=grid.iloc[底部：顶部，左侧：右侧]
#将位置存储在此点的半径内，以最大限度地减少变形
最佳距离=np.inf
网格位置=[np.nan，np.nan]
对于x，列在闭合网格点中。iterrows（）：
对于y，列项目（）中的val：
如果val！=1：继续
距离=欧几里德（点，（x，y））
如果距离<最佳距离：
最佳距离
网格位置=[x，y]
#未找到打开的插槽，因此请增大搜索半径
如果np.isnan（网格位置[0]）：
r_y*=2
r_x*=2
#成功！向用户报告开槽位置
其他：
#指定一个非1的值以将此插槽标记为已填充
grid.loc[grid\u loc[0]，grid\u loc[1]=2
df.loc[站点，['x'，'y']]=网格位置
时隙=真
c+=1
如果冗长：
打印（“*已完成”，c，'of'，len（arr），'assignments'）
返回df
#绘图样本数据
df=将点与网格对齐（arr，verbose=False）
分布图（'x'，'y'，s=1）

我对这个算法的结果很满意，但对性能不满意

在Python中有没有更快的解决这种hexbin赋值问题的方法？我觉得其他更多接触到的人可能对这个问题有很有价值的见解。任何建议都会非常有用

事实证明，将每个点分配到其当前半径内的第一个可用网格点是足够有效的

对于其他来到这里的人，为了方便起见，我的实验室将此功能打包成了一个小程序。您可以

pip安装pointgrid

，然后：

from pointgrid import align_points_to_grid
from sklearn import datasets

# create fake data
arr, labels = datasets.make_blobs(n_samples=1000, centers=5)

# get updated point positions
updated = align_points_to_grid(arr)

updated

将是一个与输入阵列形状相同的numpy阵列

arr

结果表明，将每个点分配到其当前半径内的第一个可用网格点已经足够了

对于其他来到这里的人，为了方便起见，我的实验室将此功能打包成了一个小程序。您可以

pip安装pointgrid

，然后：

from pointgrid import align_points_to_grid
from sklearn import datasets

# create fake data
arr, labels = datasets.make_blobs(n_samples=1000, centers=5)

# get updated point positions
updated = align_points_to_grid(arr)

updated

将是一个与输入数组形状相同的numpy数组

arr

您知道耗时的操作是什么吗？如果是

searchsorted

，您可以直接从图像坐标计算十六进制坐标（精确程度取决于网格）。此外，可以在查询点周围的环中迭代十六进制单元格，以避免多次搜索相同的单元格。同样，这取决于你如何组织你的网格。@NicoSchertler我一直在考虑你的评论“你可以直接从图像坐标计算十六进制坐标”，但我不知道怎么做——你能说得更详细一点吗？如果你有机会访问一个小的代码示例，如果你能发送它，我将不胜感激！我在第一个可用位置开槽，而不是在搜索半径中找到最佳位置，从而略微提高了速度，但我正在寻找更大的速度增益。。。这似乎是一个很好的起点。你要找的部分是“像素到十六进制”。你知道什么是耗时的操作吗？如果是

searchsorted

，您可以直接从图像坐标计算十六进制坐标（精确程度取决于网格）。此外，可以在查询点周围的环中迭代十六进制单元格，以避免多次搜索相同的单元格。同样，这取决于你如何组织你的网格