Python 将一个数据帧中的行转换为另一个数据帧中的列的最佳方法是什么?
标签:python, python-3.x, pandas, dataframe
给定 df_people:
Name
0 Tom
1 Jerry
以及 df_colors(无标题行):
0 Red
1 Green
2 Blue
将 df_colors 中的数据添加到 df_people 的最佳方式是什么,使合并后的 df_people 如下所示:
Name Color_0 Color_1 Color_2
0 Tom Red Green Blue
1 Jerry Red Green Blue
下面是我到目前为止所做的工作,但我想知道是否有更好或更简洁的方法
# Store data for new columns in a dictionary
new_columns = {}
# The outer loop runs once per person, so each color is appended once per row
# of df_people and every new column ends up as long as df_people.
for index_people, row_people in df_people.iterrows():
    for index_colors, row_colors in df_colors.iterrows():
        key = 'Color_' + str(index_colors)
        if (key in new_columns):
            new_columns[key].append(row_colors[0])
        else:
            new_columns[key] = [row_colors[0]]
# Add dictionary data as new columns
for key, value in new_columns.items():
    df_people[key] = value
更新
谢谢大家提供答案。由于实际数据帧的大小是GBs,速度至关重要,因此我最终选择了最快的方法。以下是测试用例的代码:
# Import required modules
import pandas as pd
import timeit
# Original
def method_1():
    """Baseline from the question: build the color columns with nested ``iterrows`` loops.

    The loops are intentionally left as posted because this is the reference
    implementation the other methods are timed against.

    Returns:
        The people frame with one ``Color_<i>`` column per row of the colors
        frame. ``timeit`` ignores the return value; it is returned so the
        result can be checked.
    """
    df_people = pd.DataFrame([['Tom'], ['Jerry']], columns=['Name'])
    df_colors = pd.DataFrame([['Red'], ['Green'], ['Blue']], columns=None)
    # Store data for new columns in a dictionary
    new_columns = {}
    for index_people, row_people in df_people.iterrows():
        for index_colors, row_colors in df_colors.iterrows():
            key = 'Color_' + str(index_colors)
            if (key in new_columns):
                new_columns[key].append(row_colors[0])
            else:
                new_columns[key] = [row_colors[0]]
    # Add dictionary data as new columns
    for key, value in new_columns.items():
        df_people[key] = value
    return df_people
# YOBEN_S - https://stackoverflow.com/a/60805881/452587
def method_2():
    """Tile the colors frame once per person, transpose it, and join on the index.

    Returns:
        The people frame joined with the ``Color_<i>`` columns. ``timeit``
        ignores the return value; it is returned so the result can be checked.
    """
    df_people = pd.DataFrame([['Tom'], ['Jerry']], columns=['Name'])
    df_colors = pd.DataFrame([['Red'], ['Green'], ['Blue']], columns=None)
    # One copy of the single color column per person, placed side by side.
    _s = pd.concat([df_colors]*len(df_people), axis=1)
    # Label each copy with a person's row label so that, after the transpose,
    # the rows line up with df_people for the index-based join.
    _s.columns = df_people.index
    df_people = df_people.join(_s.T.add_prefix('Color_'))
    return df_people
# Dani Mesejo - https://stackoverflow.com/a/60805898/452587
def method_3():
    """Cross-join the people with the transposed colors through a constant mock key.

    Returns:
        The people frame with the ``Color_<i>`` columns and the mock key
        dropped. ``timeit`` ignores the return value; it is returned so the
        result can be checked.
    """
    df_people = pd.DataFrame([['Tom'], ['Jerry']], columns=['Name'])
    df_colors = pd.DataFrame([['Red'], ['Green'], ['Blue']], columns=None)
    # Create mock key
    _m1 = df_people.assign(key=1)
    # Set new column names, transpose, and create mock key
    _m2 = df_colors.set_index('Color_' + df_colors.index.astype(str)).T.assign(key=1)
    # Every row on both sides has key == 1, so the merge pairs each person
    # with the single transposed colors row.
    df_people = _m1.merge(_m2, on='key').drop('key', axis=1)
    return df_people
# Erfan - https://stackoverflow.com/a/60806018/452587
def method_4():
    """Transpose the colors into one row, stretch it over the people index, and join.

    Returns:
        The people frame joined with the ``Color_<i>`` columns. ``timeit``
        ignores the return value; it is returned so the result can be checked.
    """
    df_people = pd.DataFrame([['Tom'], ['Jerry']], columns=['Name'])
    df_colors = pd.DataFrame([['Red'], ['Green'], ['Blue']], columns=None)
    # The transpose has a single row; reindexing to the people index adds
    # all-NaN rows, which ffill then fills from that first row.
    df_colors = df_colors.T.reindex(df_people.index).ffill().add_prefix('Color_')
    df_people = df_people.join(df_colors)
    return df_people
# Time each method 10000 times, in order, and print one line per method.
benchmarks = (
    ('Method 1:', method_1),
    ('Method 2:', method_2),
    ('Method 3:', method_3),
    ('Method 4:', method_4),
)
for label, func in benchmarks:
    print(label, timeit.timeit(func, number=10000))
输出:
Method 1: 36.029883089
Method 2: 27.042384837999997
Method 3: 68.22421793800001
Method 4: 32.94155895
在我努力简化场景的过程中,不幸的是我过度简化了它。现在重新表述这个问题已经太晚了,所以我会在之后另发一个相关的问题。真实场景还涉及计算,因此我不只是需要把 df_colors 中的数据作为列添加到 df_people,还需要针对每个新增的单元格,用其所在行中的某一列执行一些计算。
更新2
我已经将示例数据帧放大(感谢jezrael),并添加了两个新方法
# Import required modules
import numpy as np
import pandas as pd
import timeit
# Original
def method_1():
    """Baseline from the question: build the color columns with nested ``iterrows`` loops.

    The loops are intentionally left as posted because this is the slow
    reference implementation the other methods are timed against.

    Returns:
        The people frame with one ``Color_<i>`` column per row of the colors
        frame. ``timeit`` ignores the return value; it is returned so the
        result can be checked.
    """
    df_people = pd.DataFrame(['Tom', 'Jerry', 'Bob', 'John', 'Bill', 'Tim', 'Harry', 'Rick'] * 1000, columns=['Name'])
    df_colors = pd.DataFrame(['Red', 'Green', 'Blue'] * 10, columns=None)
    # Store data for new columns in a dictionary
    new_columns = {}
    for index_people, row_people in df_people.iterrows():
        for index_colors, row_colors in df_colors.iterrows():
            key = 'Color_' + str(index_colors)
            if (key in new_columns):
                new_columns[key].append(row_colors[0])
            else:
                new_columns[key] = [row_colors[0]]
    # Add dictionary data as new columns
    for key, value in new_columns.items():
        df_people[key] = value
    return df_people
# YOBEN_S - https://stackoverflow.com/a/60805881/452587
def method_2():
    """Tile the colors frame once per person, transpose it, and join on the index.

    Returns:
        The people frame joined with the ``Color_<i>`` columns. ``timeit``
        ignores the return value; it is returned so the result can be checked.
    """
    df_people = pd.DataFrame(['Tom', 'Jerry', 'Bob', 'John', 'Bill', 'Tim', 'Harry', 'Rick'] * 1000, columns=['Name'])
    df_colors = pd.DataFrame(['Red', 'Green', 'Blue'] * 10, columns=None)
    # One copy of the single color column per person, placed side by side.
    _s = pd.concat([df_colors]*len(df_people), axis=1)
    # Label each copy with a person's row label so that, after the transpose,
    # the rows line up with df_people for the index-based join.
    _s.columns = df_people.index
    df_people = df_people.join(_s.T.add_prefix('Color_'))
    return df_people
# sammywemmy - https://stackoverflow.com/a/60805964/452587
def method_3():
    """Join the colors into one string, then split it back into one column per color.

    NOTE: the round trip goes through a comma-separated string, so it relies
    on no color value containing a comma.

    Returns:
        A frame with ``Name`` followed by the ``Color_<i>`` columns.
        ``timeit`` ignores the return value; it is returned so the result can
        be checked.
    """
    df_people = pd.DataFrame(['Tom', 'Jerry', 'Bob', 'John', 'Bill', 'Tim', 'Harry', 'Rick'] * 1000, columns=['Name'])
    df_colors = pd.DataFrame(['Red', 'Green', 'Blue'] * 10, columns=None)
    # Create a new column in df_people with aggregate of df_colors;
    df_people['Colors'] = df_colors[0].str.cat(sep=',')
    # Concatenate df_people['Name'] and df_people['Colors'];
    # split column, expand into a dataframe, and add prefix
    df_people = pd.concat(
        [df_people.Name, df_people.Colors.str.split(',', expand=True).add_prefix('Color_')],
        axis=1,
    )
    return df_people
# Dani Mesejo - https://stackoverflow.com/a/60805898/452587
def method_4():
    """Cross-join the people with the transposed colors through a constant mock key.

    Returns:
        The people frame with the ``Color_<i>`` columns and the mock key
        dropped. ``timeit`` ignores the return value; it is returned so the
        result can be checked.
    """
    df_people = pd.DataFrame(['Tom', 'Jerry', 'Bob', 'John', 'Bill', 'Tim', 'Harry', 'Rick'] * 1000, columns=['Name'])
    df_colors = pd.DataFrame(['Red', 'Green', 'Blue'] * 10, columns=None)
    # Create mock key
    _m1 = df_people.assign(key=1)
    # Set new column names, transpose, and create mock key
    _m2 = df_colors.set_index('Color_' + df_colors.index.astype(str)).T.assign(key=1)
    # Every row on both sides has key == 1, so the merge pairs each person
    # with the single transposed colors row.
    df_people = _m1.merge(_m2, on='key').drop('key', axis=1)
    return df_people
# Erfan - https://stackoverflow.com/a/60806018/452587
def method_5():
    """Transpose the colors into one row, stretch it over the people index, and join.

    Returns:
        The people frame joined with the ``Color_<i>`` columns. ``timeit``
        ignores the return value; it is returned so the result can be checked.
    """
    df_people = pd.DataFrame(['Tom', 'Jerry', 'Bob', 'John', 'Bill', 'Tim', 'Harry', 'Rick'] * 1000, columns=['Name'])
    df_colors = pd.DataFrame(['Red', 'Green', 'Blue'] * 10, columns=None)
    # The transpose has a single row; reindexing to the people index adds
    # all-NaN rows, which ffill then fills from that first row.
    df_colors = df_colors.T.reindex(df_people.index).ffill().add_prefix('Color_')
    df_people = df_people.join(df_colors)
    return df_people
# jezrael - https://stackoverflow.com/a/60826723/452587
def method_6():
    """Broadcast the color values to one row per person and join the result.

    Returns:
        The people frame joined with the ``Color_<i>`` columns. ``timeit``
        ignores the return value; it is returned so the result can be checked.
    """
    df_people = pd.DataFrame(['Tom', 'Jerry', 'Bob', 'John', 'Bill', 'Tim', 'Harry', 'Rick'] * 1000, columns=['Name'])
    df_colors = pd.DataFrame(['Red', 'Green', 'Blue'] * 10, columns=None)
    # Target shape is (number of people, number of colors): the color values
    # are repeated across one row per person.
    _a = np.broadcast_to(df_colors[0], (len(df_people), len(df_colors)))
    df_people = df_people.join(pd.DataFrame(_a, index=df_people.index).add_prefix('Color_'))
    return df_people
# Time each method 3 times, in order, and print one line per method.
benchmarks = (
    ('Method 1:', method_1),
    ('Method 2:', method_2),
    ('Method 3:', method_3),
    ('Method 4:', method_4),
    ('Method 5:', method_5),
    ('Method 6:', method_6),
)
for label, func in benchmarks:
    print(label, timeit.timeit(func, number=3))
输出:
Method 1: 74.512771493
Method 2: 1.0007798979999905
Method 3: 0.40823360299999933
Method 4: 0.08115736700000298
Method 5: 0.11704620100000795
Method 6: 0.04700596800000767
更新3
我已经发布了一个有关转换和计算的问题,它更准确地反映了真实数据集:
我们可以做
# One copy of df1 per row of df, placed side by side.
copies = [df1] * len(df)
s = pd.concat(copies, axis=1)
# Label each copy with a row label of df so the transposed rows align on join.
s.columns = df.index
color_rows = s.T.add_prefix('color_')
df = df.join(color_rows)
Name color_0 color_1 color_2
0 Tom Red Green Blue
1 Jerry Red Green Blue
你可以做:
import pandas as pd
# Sample input: two people and three colors (the colors frame has no header).
df1 = pd.DataFrame({'name': ['Tom', 'Jerry']})
df2 = pd.DataFrame(['Red', 'Gree', 'Blue'])

# Column labels for the transposed colors: Color_0, Color_1, ...
color_labels = 'Color_' + df2.index.astype(str)

# Give both sides the same constant key so the merge pairs every person
# with the single transposed colors row.
m1 = df1.assign(key=1)
m2 = df2.set_index(color_labels).T.assign(key=1)

merged = m1.merge(m2, on='key')
result = merged.drop(columns='key')
print(result)
输出
name Color_0 Color_1 Color_2
0 Tom Red Gree Blue
1 Jerry Red Gree Blue
另一种可能的解决办法:
# Collapse every value of df2's 'color' column into one comma-separated string
# and store it in a new 'color' column of df1 (the answer assumes df2's column
# has been given the header 'color').
joined_colors = df2['color'].str.cat(sep=',')
df1['color'] = joined_colors
# Split that string back into one column per color, prefix the column labels,
# and place the result next to df1['Name'].
color_columns = df1.color.str.split(',', expand=True).add_prefix('color_')
pd.concat([df1.Name, color_columns], axis=1)
Name color_0 color_1 color_2
0 Tom Red Green Blue
1 Jerry Red Green Blue
使用 DataFrame.reindex、DataFrame.ffill 和 DataFrame.add_prefix:
# Transpose the colors into one row, extend it to df1's index (the added rows
# are NaN), forward-fill them from the first row, then prefix the column labels.
color_row = df2.T
filled = color_row.reindex(df1.index).ffill()
df2 = filled.add_prefix('Color_')
df1 = df1.join(df2)
Name Color_0 Color_1 Color_2
0 Tom Red Green Blue
1 Jerry Red Green Blue
您可以使用:
# DataFrame.append was removed in pandas 2.0; pd.concat builds the same frame.
# Repeating the transposed colors row len(df_people) times (instead of a
# hard-coded two) also works when the number of people is not known up front.
colors = pd.concat([df_colors.T] * len(df_people), ignore_index=True).add_prefix('Color_')
pd.concat([df_people, colors], axis=1)
输出:
您可以通过以下方法提高性能:
但我认为更好的做法是在较大的数据帧上测试,例如这里使用 3000 行和 30 列,此时各方法的耗时就不同了:
# Import required modules
import pandas as pd
import timeit
# Original
def method_1():
    """Baseline from the question: build the color columns with nested ``iterrows`` loops.

    The loops are intentionally left as posted because this is the slow
    reference implementation the other methods are timed against.

    Returns:
        The people frame with one ``Color_<i>`` column per row of the colors
        frame. ``timeit`` ignores the return value; it is returned so the
        result can be checked.
    """
    df_people = pd.DataFrame(['Tom','Jerry','Bob'] * 1000, columns=['Name'])
    df_colors = pd.DataFrame(['Red','Green', 'Blue'] * 10, columns=None)
    # Store data for new columns in a dictionary
    new_columns = {}
    for index_people, row_people in df_people.iterrows():
        for index_colors, row_colors in df_colors.iterrows():
            key = 'Color_' + str(index_colors)
            if (key in new_columns):
                new_columns[key].append(row_colors[0])
            else:
                new_columns[key] = [row_colors[0]]
    # Add dictionary data as new columns
    for key, value in new_columns.items():
        df_people[key] = value
    return df_people
# YOBEN_S - https://stackoverflow.com/a/60805881/452587
def method_2():
    """Tile the colors frame once per person, transpose it, and join on the index.

    Returns:
        The people frame joined with the ``Color_<i>`` columns. ``timeit``
        ignores the return value; it is returned so the result can be checked.
    """
    df_people = pd.DataFrame(['Tom','Jerry','Bob'] * 1000, columns=['Name'])
    df_colors = pd.DataFrame(['Red','Green', 'Blue'] * 10, columns=None)
    # One copy of the single color column per person, placed side by side.
    _s = pd.concat([df_colors]*len(df_people), axis=1)
    # Label each copy with a person's row label so that, after the transpose,
    # the rows line up with df_people for the index-based join.
    _s.columns = df_people.index
    df_people = df_people.join(_s.T.add_prefix('Color_'))
    return df_people
# Dani Mesejo - https://stackoverflow.com/a/60805898/452587
def method_3():
    """Cross-join the people with the transposed colors through a constant mock key.

    Returns:
        The people frame with the ``Color_<i>`` columns and the mock key
        dropped. ``timeit`` ignores the return value; it is returned so the
        result can be checked.
    """
    df_people = pd.DataFrame(['Tom','Jerry','Bob'] * 1000, columns=['Name'])
    df_colors = pd.DataFrame(['Red','Green', 'Blue'] * 10, columns=None)
    # Create mock key
    _m1 = df_people.assign(key=1)
    # Set new column names, transpose, and create mock key
    _m2 = df_colors.set_index('Color_' + df_colors.index.astype(str)).T.assign(key=1)
    # Every row on both sides has key == 1, so the merge pairs each person
    # with the single transposed colors row.
    df_people = _m1.merge(_m2, on='key').drop('key', axis=1)
    return df_people
# Erfan - https://stackoverflow.com/a/60806018/452587
def method_4():
    """Transpose the colors into one row, stretch it over the people index, and join.

    Returns:
        The people frame joined with the ``Color_<i>`` columns. ``timeit``
        ignores the return value; it is returned so the result can be checked.
    """
    df_people = pd.DataFrame(['Tom','Jerry','Bob'] * 1000, columns=['Name'])
    df_colors = pd.DataFrame(['Red','Green', 'Blue'] * 10, columns=None)
    # The transpose has a single row; reindexing to the people index adds
    # all-NaN rows, which ffill then fills from that first row.
    df_colors = df_colors.T.reindex(df_people.index).ffill().add_prefix('Color_')
    df_people = df_people.join(df_colors)
    return df_people
def method_5():
    """Broadcast the color values to one row per person and join the result.

    Returns:
        The people frame joined with the ``Color_<i>`` columns. ``timeit``
        ignores the return value; it is returned so the result can be checked.
    """
    # This script's import header only brings in pandas and timeit, so numpy
    # is imported here to keep the function runnable on its own.
    import numpy as np

    df_people = pd.DataFrame(['Tom','Jerry','Bob'] * 1000, columns=['Name'])
    df_colors = pd.DataFrame(['Red','Green', 'Blue'] * 10, columns=None)
    # Target shape is (number of people, number of colors): the color values
    # are repeated across one row per person.
    a = np.broadcast_to(df_colors[0], (len(df_people), len(df_colors)))
    df_people = df_people.join(pd.DataFrame(a, index=df_people.index).add_prefix('Color_'))
    return df_people
谢谢你的回答。如果事先不知道 df_people 的行数怎么办?
谢谢你的回答。我遇到了这个错误:“DataFrame”对象没有属性“color”(我想它指的是 df2.color)。我认为问题在于 df_colors 没有列名,所以 df_colors['color'] 不存在。改用列索引 df_colors[0] 后就修复了。我也会把你的方法加入计时。
非常感谢!增大示例数据帧是个好主意。在你提交答案之前,我已经接受了另一个答案。我从未取消过已接受的答案——这是人们常做的事吗?
jezrael,你的答案更接近我想做的事。我尝试过使用 np.broadcast_to(),但失败了 :(。如果你能让你的解决方案适用于新问题,并且比我现在的方法更快,我一定会接受。这次我会等 48 小时而不是 24 小时。
你说得对——既然我最终选择了你的解决方案,我会更改已接受的答案。
# Import required modules
import pandas as pd
import timeit
# Original
def method_1():
    """Baseline from the question: build the color columns with nested ``iterrows`` loops.

    The loops are intentionally left as posted because this is the slow
    reference implementation the other methods are timed against.

    Returns:
        The people frame with one ``Color_<i>`` column per row of the colors
        frame. ``timeit`` ignores the return value; it is returned so the
        result can be checked.
    """
    df_people = pd.DataFrame(['Tom','Jerry','Bob'] * 1000, columns=['Name'])
    df_colors = pd.DataFrame(['Red','Green', 'Blue'] * 10, columns=None)
    # Store data for new columns in a dictionary
    new_columns = {}
    for index_people, row_people in df_people.iterrows():
        for index_colors, row_colors in df_colors.iterrows():
            key = 'Color_' + str(index_colors)
            if (key in new_columns):
                new_columns[key].append(row_colors[0])
            else:
                new_columns[key] = [row_colors[0]]
    # Add dictionary data as new columns
    for key, value in new_columns.items():
        df_people[key] = value
    return df_people
# YOBEN_S - https://stackoverflow.com/a/60805881/452587
def method_2():
    """Tile the colors frame once per person, transpose it, and join on the index.

    Returns:
        The people frame joined with the ``Color_<i>`` columns. ``timeit``
        ignores the return value; it is returned so the result can be checked.
    """
    df_people = pd.DataFrame(['Tom','Jerry','Bob'] * 1000, columns=['Name'])
    df_colors = pd.DataFrame(['Red','Green', 'Blue'] * 10, columns=None)
    # One copy of the single color column per person, placed side by side.
    _s = pd.concat([df_colors]*len(df_people), axis=1)
    # Label each copy with a person's row label so that, after the transpose,
    # the rows line up with df_people for the index-based join.
    _s.columns = df_people.index
    df_people = df_people.join(_s.T.add_prefix('Color_'))
    return df_people
# Dani Mesejo - https://stackoverflow.com/a/60805898/452587
def method_3():
    """Cross-join the people with the transposed colors through a constant mock key.

    Returns:
        The people frame with the ``Color_<i>`` columns and the mock key
        dropped. ``timeit`` ignores the return value; it is returned so the
        result can be checked.
    """
    df_people = pd.DataFrame(['Tom','Jerry','Bob'] * 1000, columns=['Name'])
    df_colors = pd.DataFrame(['Red','Green', 'Blue'] * 10, columns=None)
    # Create mock key
    _m1 = df_people.assign(key=1)
    # Set new column names, transpose, and create mock key
    _m2 = df_colors.set_index('Color_' + df_colors.index.astype(str)).T.assign(key=1)
    # Every row on both sides has key == 1, so the merge pairs each person
    # with the single transposed colors row.
    df_people = _m1.merge(_m2, on='key').drop('key', axis=1)
    return df_people
# Erfan - https://stackoverflow.com/a/60806018/452587
def method_4():
    """Transpose the colors into one row, stretch it over the people index, and join.

    Returns:
        The people frame joined with the ``Color_<i>`` columns. ``timeit``
        ignores the return value; it is returned so the result can be checked.
    """
    df_people = pd.DataFrame(['Tom','Jerry','Bob'] * 1000, columns=['Name'])
    df_colors = pd.DataFrame(['Red','Green', 'Blue'] * 10, columns=None)
    # The transpose has a single row; reindexing to the people index adds
    # all-NaN rows, which ffill then fills from that first row.
    df_colors = df_colors.T.reindex(df_people.index).ffill().add_prefix('Color_')
    df_people = df_people.join(df_colors)
    return df_people
def method_5():
    """Broadcast the color values to one row per person and join the result.

    Returns:
        The people frame joined with the ``Color_<i>`` columns. ``timeit``
        ignores the return value; it is returned so the result can be checked.
    """
    # This script's import header only brings in pandas and timeit, so numpy
    # is imported here to keep the function runnable on its own.
    import numpy as np

    df_people = pd.DataFrame(['Tom','Jerry','Bob'] * 1000, columns=['Name'])
    df_colors = pd.DataFrame(['Red','Green', 'Blue'] * 10, columns=None)
    # Target shape is (number of people, number of colors): the color values
    # are repeated across one row per person.
    a = np.broadcast_to(df_colors[0], (len(df_people), len(df_colors)))
    df_people = df_people.join(pd.DataFrame(a, index=df_people.index).add_prefix('Color_'))
    return df_people
# Time each method 3 times, in order, and print one line per method.
benchmarks = (
    ('Method 1:', method_1),
    ('Method 2:', method_2),
    ('Method 3:', method_3),
    ('Method 4:', method_4),
    ('Method 5:', method_5),
)
for label, func in benchmarks:
    print(label, timeit.timeit(func, number=3))
Method 1: 34.91457201199955
Method 2: 0.7901797180002177
Method 3: 0.05690281799979857
Method 4: 0.05774562500118918
Method 5: 0.026483284000278218