Python:将 GPS 数据按半正矢(haversine)距离分组,并计算平均值

Python 将gps数据按哈弗斯线距离分组,并计算平均值,python,pandas,dataframe,grouping,cluster-analysis,Python,Pandas,Dataframe,Grouping,Cluster Analysis,我有如下数据框(由于保密协议,无法从原始数据框显示): 我想对距离处的所有点进行聚类/分组,在某些步骤中,我使用了pandas功能,采用了不同的方法,但这应该可以做到 首先,根据您提供的数据集,我创建了一个helper数据框,它给我一个点I和点as列表 points = (df .assign( id = lambda x : np.arange(len(x)), point = lambda x : x.apply(lambda x :

我有如下数据框(由于保密协议,无法展示原始数据框):


我想把彼此距离不超过 400(即 `distance <= 400`)的所有点聚类/分组。我采用了不同的方法,并在某些步骤中使用了 pandas 的功能,但这应该可以做到。

首先,根据您提供的数据集,我创建了一个辅助(helper)数据框,它为每个点提供一个 id,并把点表示为 [lat, lon] 列表:

# Helper frame: each row gets a sequential ``id`` plus a ``point`` column
# holding that row's coordinates as a ``[lat, lon]`` list.
points = df.assign(
    id=lambda frame: np.arange(len(frame)),
    point=lambda frame: frame.apply(
        lambda row: [row.lat, row.lon],
        axis=1,
    ),
)
points
看起来像这样:

lat          lon         id   point
-57.213879   17.916958   0    [-57.213878681213883, 17.9169583041696]
76.392039    0.060883    1    [76.39203948037851, 0.060882542482108504]
0.124177     1.041767    2    [0.12417670682730897, 1.0417670682730924]
-64.840322   21.374279   3    [-64.8403219767877, 21.37427929614376]
-48.966303   81.336324   4    [-48.96630293735991, 81.33632377806619]

我使用嵌套列表进行聚类,但在大型数据集上效率低下:

# Group GPS points into clusters: whenever two points are no further apart
# than MAX_DISTANCE, they are put into the same cluster (a list of row
# indices inside ``cluster_list``).
#
# Requires ``df`` (with ``lat`` and ``lon`` columns), ``pairs``,
# ``haversine`` and ``find_index`` to be defined before this runs.
# NOTE(review): find_index(cluster_list, p) is presumably the position of the
# sub-list that contains p -- confirm against its definition.
# NOTE(review): when both points of a close pair already belong to clusters,
# nothing happens, so two existing clusters are never merged.  That matches
# the previous behaviour and is kept on purpose.
MAX_DISTANCE = 400  # threshold compared against the value haversine() returns

df = df.sort_values(['lat', 'lon']).reset_index(drop=True)

# After the reset the index runs 0..len(df)-1, so the last label is known
# without walking every row; an empty frame keeps the initial value 0.
max_index = max(len(df) - 1, 0)

pair_list = pairs(len(df))

# Pull the coordinates out once.  Going through df.loc[i][...] would build a
# full row Series four times for every pair.
lats = df['lat'].tolist()
lons = df['lon'].tolist()

# A dict used as an insertion-ordered set: O(1) membership tests, while still
# remembering the order in which points were first assigned to a cluster.
clustered = {}
cluster_list = []

for i, j in pair_list:
    # haversine is called with longitude first, then latitude, per point.
    distance = haversine(lons[i], lats[i], lons[j], lats[j])
    if distance <= MAX_DISTANCE:
        i_known = i in clustered
        j_known = j in clustered
        if not i_known and not j_known:
            # Neither point seen yet (this also covers the very first pair).
            cluster_list.append([i, j])
        elif i_known and not j_known:
            cluster_list[find_index(cluster_list, i)].append(j)
        elif j_known and not i_known:
            cluster_list[find_index(cluster_list, j)].append(i)
        # Record both points; keys that already exist keep their position.
        clustered.setdefault(i)
        clustered.setdefault(j)

# Every clustered point, in the order it was first assigned.
num_exists_list = list(clustered)

print('--------------------')
#print(num_exists_list)
print(cluster_list)
df = df.sort_values(['lat', 'lon'])
df = df.reset_index(drop=True)
max_index = 0
for index, row in df.iterrows():
    max_index = index
pair_list = pairs(len(list(df['lat'])))
cluster = -1
num_exists_list = []
cluster_list = []
for i, j in pair_list:
    #print(i, j)
    index = i
    #stat_nr = df.loc[i]['vwbhfnrprz']
    lat = df.loc[i]['lat']
    lon = df.loc[i]['lon']
    index_next = j
    #stat_nr_next = df.loc[j]['vwbhfnrprz']
    lat_next = df.loc[j]['lat']
    lon_next = df.loc[j]['lon']
    distance = haversine(lon,lat, lon_next, lat_next)

如果你在解释中提到“群体”,它是指两组“<=400”吗?不,谢谢!如果我想像在我编辑的帖子中那样,把所有彼此靠近的点分到一组,代码会是什么?(阅读 circle)@TheDev 我改变了答案的后半部分,这能回答你的问题吗?是的,谢谢,我自己也实现了一个,但在大型数据集(datasets)上总体来说速度很慢。@TheDev 我对它进行了一些编辑,这样你也可以得到单个集群……我同意集群的定义,但我的目的是给你一个能回答所有问题的完整答案……你是否使用它们取决于你自己。但现在你有机会了 :)
def pairs(number):
    """Return every index pair ``[i, j]`` with ``0 <= i < j < number``.

    The pairs are ordered by ``i`` first and then by ``j``, and each pair is
    a two-element list.  A ``number`` below 2 yields an empty list.
    """
    # Start j at i + 1 directly instead of scanning the whole range and
    # throwing away the half where j <= i.
    return [[i, j] for i in range(number) for j in range(i + 1, number)]

# Demo: build and show every index pair for 12 items.
pair_list = pairs(number=12)
print(pair_list)
# Earlier clustering attempt that stores clusters in a dict keyed by a running
# counter and prints a trace of every step.  Requires ``df`` (with ``lat`` and
# ``lon`` columns), ``pairs`` and ``haversine`` to be defined before it runs.
# NOTE(review): the inner ``for k`` loop always stops during its first pass
# (see the comment there), so only the first stored cluster is ever looked at
# and ``j`` is never stored after the first close pair.  This looks like an
# unfinished draft -- confirm before relying on its output.
df = df.sort_values(['lat', 'lon'])
df = df.reset_index(drop=True)
max_index = 0
# Walks every row only to leave the last index label in max_index.
for index, row in df.iterrows():
    max_index = index
    
# One entry [i, j] per pair of row positions to compare.
pair_list = pairs(len(list(df['lat'])))

# Counter used as the dict key; starts at -1 so the first close pair gets 0.
cluster = -1
cluster_dict = {}

for i, j in pair_list:
    #print(i, j)
    index = i
    #stat_nr = df.loc[i]['vwbhfnrprz']
    lat = df.loc[i]['lat']
    lon = df.loc[i]['lon']
        
    index_next = j
    #stat_nr_next = df.loc[j]['vwbhfnrprz']
    lat_next = df.loc[j]['lat']
    lon_next = df.loc[j]['lon']
    # haversine is called with longitude first, then latitude, for each point.
    distance = haversine(lon,lat, lon_next, lat_next)
    if distance <= 400:
        # The counter advances for every close pair, not once per cluster.
        cluster += 1
        print('cluster ' + str(cluster))
        if len(cluster_dict) == 0:
            # First close pair: both points start the first cluster.
            cluster_dict[cluster]=[i,j]
        elif len(cluster_dict) != 0:
            # Iterate over a copy because a key may be added inside the loop.
            for k in cluster_dict.copy():
                print(cluster_dict)
                print('k '+str(k))
                print(i,j)
                if i not in cluster_dict[k]:
                    
                    # Store i alone under the current counter value.
                    cluster_dict[cluster] = [i]
                    print(cluster_dict)
                    #cluster_dict[k].append(i)
                # The two checks below are complementary, so one of them
                # always breaks: a second key is never reached.
                if j not in cluster_dict[k]:
                    
                    #print(cluster_dict[k])
                    break
                if j in cluster_dict[k]:
                    break
        # Trace line for every close pair.
        print(cluster, index, index_next, lat, lon,  lat_next, lon_next, distance)
# Helper frame: each row gets a sequential ``id`` plus a ``point`` column
# holding that row's coordinates as a ``[lat, lon]`` list.
points = df.assign(
    id=lambda frame: np.arange(len(frame)),
    point=lambda frame: frame.apply(
        lambda row: [row.lat, row.lon],
        axis=1,
    ),
)
points
# Group GPS points into clusters: whenever two points are no further apart
# than MAX_DISTANCE, they are put into the same cluster (a list of row
# indices inside ``cluster_list``).
#
# Requires ``df`` (with ``lat`` and ``lon`` columns), ``pairs``,
# ``haversine`` and ``find_index`` to be defined before this runs.
# NOTE(review): find_index(cluster_list, p) is presumably the position of the
# sub-list that contains p -- confirm against its definition.
# NOTE(review): when both points of a close pair already belong to clusters,
# nothing happens, so two existing clusters are never merged.  That matches
# the previous behaviour and is kept on purpose.
MAX_DISTANCE = 400  # threshold compared against the value haversine() returns

df = df.sort_values(['lat', 'lon']).reset_index(drop=True)

# After the reset the index runs 0..len(df)-1, so the last label is known
# without walking every row; an empty frame keeps the initial value 0.
max_index = max(len(df) - 1, 0)

pair_list = pairs(len(df))

# Pull the coordinates out once.  Going through df.loc[i][...] would build a
# full row Series four times for every pair.
lats = df['lat'].tolist()
lons = df['lon'].tolist()

# A dict used as an insertion-ordered set: O(1) membership tests, while still
# remembering the order in which points were first assigned to a cluster.
clustered = {}
cluster_list = []

for i, j in pair_list:
    # haversine is called with longitude first, then latitude, per point.
    distance = haversine(lons[i], lats[i], lons[j], lats[j])
    if distance <= MAX_DISTANCE:
        i_known = i in clustered
        j_known = j in clustered
        if not i_known and not j_known:
            # Neither point seen yet (this also covers the very first pair).
            cluster_list.append([i, j])
        elif i_known and not j_known:
            cluster_list[find_index(cluster_list, i)].append(j)
        elif j_known and not i_known:
            cluster_list[find_index(cluster_list, j)].append(i)
        # Record both points; keys that already exist keep their position.
        clustered.setdefault(i)
        clustered.setdefault(j)

# Every clustered point, in the order it was first assigned.
num_exists_list = list(clustered)

print('--------------------')
#print(num_exists_list)
print(cluster_list)