Python 当特定列中的单元格值相同时,如何合并数据帧中的行
我有一个数据框,如下所示: df 其中id_loc是元组列表。如果存在匹配的Python 当特定列中的单元格值相同时,如何合并数据帧中的行,python,pandas,Python,Pandas,我有一个数据框,如下所示: df 其中id_loc是元组列表。如果存在匹配的(lon,lat)对,如何按id\u loc进行分组,以逗号分隔合并这两行和其他列 预期输出 KEY NAME ID_LOCATION _GEOM 0 61196,61199 name1,name3 [(u'-85.121429', u'40.88
(lon, lat) 对,如何按 id_loc 进行分组,并将这些行的其他列以逗号分隔合并?
预期输出
KEY NAME ID_LOCATION _GEOM
0 61196,61199 name1,name3 [(u'-85.121429', u'40.887726'), (-77.681931, 37.548851)] [[[lon00, lat00],[lon01, lat01],[lon20, lat20],[lon21, lat21]]]
1 61197 name2 [(u'-72.161934', u'35.725163')] [[[lon10,lat10], [lon11,lat11],...]]
我尝试了以下操作,但没有成功,并报错 TypeError: unhashable type: 'list'(列表是不可哈希的类型):
def f(x):
    """Collapse one group of rows into a single row of brace-wrapped strings.

    Args:
        x: DataFrame holding the rows of one group; it must provide the
            KEY, NAME, ID_LOCATION and _GEOM columns.

    Returns:
        A Series with those four labels. Each value is the column's entries
        joined with ", " and wrapped in "{...}".

    Raises:
        TypeError: if a column holds non-string entries, because
            ``str.join`` only accepts strings.
    """
    return pd.Series(dict(
        KEY='{%s}' % ', '.join(x['KEY']),
        NAME='{%s}' % ', '.join(x['NAME']),
        ID_LOCATION='{%s}' % ', '.join(x['ID_LOCATION']),
        _GEOM='{%s}' % ', '.join(x['_GEOM']),
    ))
# NOTE(review): the surrounding text reports this call failing with an
# "unhashable type: list" error; ID_LOCATION cells are described as lists of
# tuples, which cannot serve as groupby keys -- confirm against the real data.
df = df.groupby('ID_LOCATION').apply(f)
我认为这应该行得通
# fun to merge elements
def merge_elements(ensemble, column, frame=None):
    """Collect the de-duplicated values of ``column`` for each group of rows.

    Args:
        ensemble: Iterable of row-label groups; each group is an iterable of
            labels usable with ``frame.loc``.
        column: Name of the column to read.
        frame: DataFrame to read from. Defaults to the module-level ``df``,
            which is what the snippet originally relied on implicitly.

    Returns:
        A list of ``[values, group]`` pairs, one per group. ``values`` holds
        the column's values in first-seen order without repeats; list cells
        are flattened one level, any other cell counts as a single value.
    """
    if frame is None:
        frame = df  # fall back to the script-level DataFrame
    upper_list = []
    for group in ensemble:
        merged = []
        for label in group:
            cell = frame.loc[label, column]  # look the cell up only once
            candidates = cell if isinstance(cell, list) else [cell]
            for value in candidates:
                if value not in merged:
                    merged.append(value)
        upper_list.append([merged, group])
    return upper_list
# put results in dataframe
def put_in_df(df, piped, column):
    """Write each merged value list back into every row of its group.

    Args:
        df: DataFrame modified in place.
        piped: Iterable of ``[values, labels]`` pairs, the shape produced by
            ``merge_elements``.
        column: Name of the column to overwrite.

    The stored cell is ``str(values)``. For the NAME and _GEOM columns the
    single quotes are removed from that string.
    """
    strip_quotes = column in ("NAME", "_GEOM")
    for elem in piped:
        values, labels = elem[0], elem[1]
        # Build the text once per group instead of once per row.
        text = str(values)
        if strip_quotes:
            text = text.replace("'", "")
        for label in labels:
            df.loc[label, column] = text
# Location cell of every row, in row order.
list_of_locations = df.ID_LOCATION.tolist()

# Map each location to the positions of all rows that contain it.
# Positions are kept in one flat list per location, so three or more rows
# sharing a location form a single group rather than nested pairs.
rows = {}
for i, item in enumerate(list_of_locations):
    # A cell that is not a list is treated as a single location.
    for location in (item if isinstance(item, list) else [item]):
        positions = rows.setdefault(location, [])
        if i not in positions:
            positions.append(i)

# Only locations shared by two or more rows describe rows to merge.
ensemble = tuple(group for group in rows.values() if len(group) > 1)

# Merge every column over the groups found above and write the result back.
for column in ("ID_LOCATION", "NAME", "KEY", "_GEOM"):
    put_in_df(df, merge_elements(ensemble, column), column)

# Rows of one group are now identical; keep the first of each. The comparison
# runs on the string form of the frame and selects rows by index label.
# special thanks to: https://stackoverflow.com/questions/43855462/pandas-drop-duplicates-method-not-working?rq=1
df = df.loc[df.astype(str).drop_duplicates().index]
首先将各列内容转换为相同类型的列表(以便 sum 能把内容拼接在一起)。
然后针对 id_loc 获得行的两两组合,即找出需要合并在一起的成对行:
# Sample frame: every cell is a list so that cells can later be concatenated.
df = pd.DataFrame(
    [
        [['61196'], ['name1'], [('-88.121429', '41.887726')]],
        [['61197'], ['name2'], [('-75.161934', '38.725163')]],
        [['61199'], ['name3'], [('-88.121429', '41.887726'), ('-77.681931', '37.548851')]],
    ],
    columns=['KEY', 'NAME', 'id_loc'],
)

# Loop through all pairwise combinations of rows by position (the positions
# are needed later, so iterate over range() instead of the raw values).
id_locs = df['id_loc'].values  # hoisted: the column is not modified in the loop
to_merge = []  # list of [i, j] row-position pairs to merge together
for i, j in itertools.combinations(range(len(id_locs)), 2):
    a = id_locs[i]
    b = id_locs[j]
    # Rows sharing at least one (lon, lat) tuple must be merged.
    if not set(a).isdisjoint(b):
        to_merge.append([i, j])
现在处理有 3 行或更多行需要合并的情况,即 to_merge = [[1, 2], [2, 3]]
应该变成 to_merge = [[1, 2, 3]]
def find_intersection(m_list):
    """Merge all sets in ``m_list`` that overlap, directly or transitively.

    Args:
        m_list: List of sets. It is modified in place.

    Returns:
        The same list object, in which no two remaining sets share an
        element. An empty list is returned unchanged.
    """
    merged = True
    while merged:
        merged = False
        for i, current in enumerate(m_list):
            for j in range(i + 1, len(m_list)):
                if current & m_list[j]:
                    # Fold the later set into the earlier one, then rescan
                    # from the start: the grown set may now overlap others.
                    m_list[i] = current | m_list.pop(j)
                    merged = True
                    break
            if merged:
                break
    return m_list
# Turn every non-empty index pair into a set so overlaps can be tested.
index_sets = [set(pair) for pair in to_merge if pair]
# Collapse overlapping sets into single groups, then go back to plain lists.
to_merge = [list(group) for group in find_intersection(index_sets)]
(从中找到)
检查并汇总所有需要合并的行(并删除预合并行)
安托万·赞贝利的回答很好;作为练习,但也希望它能有所帮助,我想分享我个人对这个问题的看法。它还没有完全测试过,但应该可以工作
# fun to merge elements
def merge_elements(ensemble, column, frame=None):
    """Collect the de-duplicated values of ``column`` for each group of rows.

    Args:
        ensemble: Iterable of row-label groups; each group is an iterable of
            labels usable with ``frame.loc``.
        column: Name of the column to read.
        frame: DataFrame to read from. Defaults to the module-level ``df``,
            which is what the snippet originally relied on implicitly.

    Returns:
        A list of ``[values, group]`` pairs, one per group. ``values`` holds
        the column's values in first-seen order without repeats; list cells
        are flattened one level, any other cell counts as a single value.
    """
    if frame is None:
        frame = df  # fall back to the script-level DataFrame
    upper_list = []
    for group in ensemble:
        merged = []
        for label in group:
            cell = frame.loc[label, column]  # look the cell up only once
            candidates = cell if isinstance(cell, list) else [cell]
            for value in candidates:
                if value not in merged:
                    merged.append(value)
        upper_list.append([merged, group])
    return upper_list
# put results in dataframe
def put_in_df(df, piped, column):
    """Write each merged value list back into every row of its group.

    Args:
        df: DataFrame modified in place.
        piped: Iterable of ``[values, labels]`` pairs, the shape produced by
            ``merge_elements``.
        column: Name of the column to overwrite.

    The stored cell is ``str(values)``. For the NAME and _GEOM columns the
    single quotes are removed from that string.
    """
    strip_quotes = column in ("NAME", "_GEOM")
    for elem in piped:
        values, labels = elem[0], elem[1]
        # Build the text once per group instead of once per row.
        text = str(values)
        if strip_quotes:
            text = text.replace("'", "")
        for label in labels:
            df.loc[label, column] = text
# Location cell of every row, in row order.
list_of_locations = df.ID_LOCATION.tolist()

# Map each location to the positions of all rows that contain it.
# Positions are kept in one flat list per location, so three or more rows
# sharing a location form a single group rather than nested pairs.
rows = {}
for i, item in enumerate(list_of_locations):
    # A cell that is not a list is treated as a single location.
    for location in (item if isinstance(item, list) else [item]):
        positions = rows.setdefault(location, [])
        if i not in positions:
            positions.append(i)

# Only locations shared by two or more rows describe rows to merge.
ensemble = tuple(group for group in rows.values() if len(group) > 1)

# Merge every column over the groups found above and write the result back.
for column in ("ID_LOCATION", "NAME", "KEY", "_GEOM"):
    put_in_df(df, merge_elements(ensemble, column), column)

# Rows of one group are now identical; keep the first of each. The comparison
# runs on the string form of the frame and selects rows by index label.
# special thanks to: https://stackoverflow.com/questions/43855462/pandas-drop-duplicates-method-not-working?rq=1
df = df.loc[df.astype(str).drop_duplicates().index]
正如我在评论中所说,感谢您在列表中删除了重复项,这一点很清楚:您的
id\u loc
有两种不同的数据类型:unicode
和float
?你为什么不把它们转换成一个统一的类型?@QuangHoang你是怎么做到的?@Tokci我想那是不同的。@Atihska你的问题得到回答了吗?
# fun to merge elements
def merge_elements(ensemble, column, frame=None):
    """Collect the de-duplicated values of ``column`` for each group of rows.

    Args:
        ensemble: Iterable of row-label groups; each group is an iterable of
            labels usable with ``frame.loc``.
        column: Name of the column to read.
        frame: DataFrame to read from. Defaults to the module-level ``df``,
            which is what the snippet originally relied on implicitly.

    Returns:
        A list of ``[values, group]`` pairs, one per group. ``values`` holds
        the column's values in first-seen order without repeats; list cells
        are flattened one level, any other cell counts as a single value.
    """
    if frame is None:
        frame = df  # fall back to the script-level DataFrame
    upper_list = []
    for group in ensemble:
        merged = []
        for label in group:
            cell = frame.loc[label, column]  # look the cell up only once
            candidates = cell if isinstance(cell, list) else [cell]
            for value in candidates:
                if value not in merged:
                    merged.append(value)
        upper_list.append([merged, group])
    return upper_list
# put results in dataframe
def put_in_df(df, piped, column):
    """Write each merged value list back into every row of its group.

    Args:
        df: DataFrame modified in place.
        piped: Iterable of ``[values, labels]`` pairs, the shape produced by
            ``merge_elements``.
        column: Name of the column to overwrite.

    The stored cell is ``str(values)``. For the NAME and _GEOM columns the
    single quotes are removed from that string.
    """
    strip_quotes = column in ("NAME", "_GEOM")
    for elem in piped:
        values, labels = elem[0], elem[1]
        # Build the text once per group instead of once per row.
        text = str(values)
        if strip_quotes:
            text = text.replace("'", "")
        for label in labels:
            df.loc[label, column] = text
# Location cell of every row, in row order.
list_of_locations = df.ID_LOCATION.tolist()

# Map each location to the positions of all rows that contain it.
# Positions are kept in one flat list per location, so three or more rows
# sharing a location form a single group rather than nested pairs.
rows = {}
for i, item in enumerate(list_of_locations):
    # A cell that is not a list is treated as a single location.
    for location in (item if isinstance(item, list) else [item]):
        positions = rows.setdefault(location, [])
        if i not in positions:
            positions.append(i)

# Only locations shared by two or more rows describe rows to merge.
ensemble = tuple(group for group in rows.values() if len(group) > 1)

# Merge every column over the groups found above and write the result back.
for column in ("ID_LOCATION", "NAME", "KEY", "_GEOM"):
    put_in_df(df, merge_elements(ensemble, column), column)

# Rows of one group are now identical; keep the first of each. The comparison
# runs on the string form of the frame and selects rows by index label.
# special thanks to: https://stackoverflow.com/questions/43855462/pandas-drop-duplicates-method-not-working?rq=1
df = df.loc[df.astype(str).drop_duplicates().index]