Python 如何从apply返回格式正确的数据帧?
标签:python, pandas, dataframe, multiple-columns, nan

假设我们有以下数据帧:
import pandas as pd
import numpy as np
# Candidate values for the two categorical columns of the demo frame.
years = [2005, 2006]
location = ['city', 'suburb']

# 100 synthetic rows. The columns are generated in the same order as they
# appear in the frame, so the sequence of calls into NumPy's global RNG is
# year, location, days_to_complete, cost_in_millions.
dft = pd.DataFrame(dict(
    # randint's upper bound is exclusive, so (0, 2) yields index 0 or 1.
    year=[years[np.random.randint(0, 2)] for _ in range(100)],
    location=[location[np.random.randint(0, 2)] for _ in range(100)],
    days_to_complete=np.random.randint(low=100, high=600, size=100),
    cost_in_millions=np.random.randint(low=1, high=10, size=100),
))
按年份和位置分组,然后应用如下函数:
def get_custom_summary(group):
    """Summarise one group, split around the 200-day mark.

    Args:
        group: DataFrame with ``days_to_complete`` and ``cost_in_millions``
            columns.

    Returns:
        A DataFrame with columns ``gt_200``, ``lt_200`` and ``lt_200_prop``.
        The first two columns are built from dicts with different keys
        (upper-case vs. lower-case), and ``lt_200_prop`` is a single scalar.
    """
    days = group.days_to_complete
    # Strict comparisons on both sides: a row with exactly 200 days falls
    # into neither bucket.
    over = days > 200
    under = days < 200

    long_jobs = group[over]
    short_jobs = group[under]

    # Share of the bucketed rows that finished in under 200 days.
    n_under = under.sum()
    share_under = n_under / (over.sum() + n_under)

    columns = {
        'gt_200': {
            'AVG_DAYS': long_jobs.days_to_complete.mean(),
            'AVG_COST': long_jobs.cost_in_millions.mean(),
        },
        'lt_200': {
            'avg_days': short_jobs.days_to_complete.mean(),
            'avg_cost': short_jobs.cost_in_millions.mean(),
        },
        'lt_200_prop': share_under,
    }
    return pd.DataFrame(columns)
# Apply the summary function once per (year, location) group.
result = (
    dft
    .groupby(by=['year', 'location'])
    .apply(get_custom_summary)
)
对于 `gt_200` 和 `lt_200` 列,调用 `dropna(axis=1)` 将删除用 NaN 填充的列,但 `lt_200_prop` 列仍然带着错误的列名。我如何从 `get_custom_summary` 返回一个数据帧,使其不把子列(`AVG_COST`、`AVG_DAYS`、`avg_cost`、`avg_days`)广播(如果这个词用得对的话)到各列(`gt_200`、`lt_200`)?
编辑:
期望输出:
gt_200 lt_200 lt_200_prop
AVG_COST AVG_DAYS avg_cost avg_days
year location
2005 city 4.818182 415.636364 7.250000 165.50 0.153846
suburb 5.631579 336.631579 5.166667 140.50 0.240000
2006 city 4.130435 396.913043 5.750000 150.75 0.258065
suburb 5.294118 392.823529 1.000000 128.00 0.055556
我的解决方案是:在函数 `get_custom_summary` 中让 `gt_200` 和 `lt_200` 使用相同的列名,然后再重命名,并添加最后一个自定义列名。
但此时列是多索引(MultiIndex),因此需要创建新的索引(例如使用 `MultiIndex.from_tuples`)。
更简单的解决方案是删除列:
# Summarise each (year, location) group, then move index level 2 (the row
# labels of each per-group frame) into the columns.
result = (
    dft.groupby(by=['year', 'location'])
    .apply(get_custom_summary)
    .unstack(level=2)
)
# Drop the last 3 columns, then drop every column that still contains NaN.
result = result.drop(columns=result.columns[[-1, -2, -3]]).dropna(axis='columns')
print(result)
gt_200 lt_200 lt_200_prop
AVG_COST AVG_DAYS avg_cost avg_days AVG_COST
year location
2005 city 5.238095 392.095238 5.500000 144.666667 0.222222
suburb 4.428571 427.095238 4.000000 167.666667 0.125000
2006 city 4.368421 406.789474 4.571429 150.142857 0.269231
suburb 4.000000 439.062500 4.142857 145.142857 0.304348
返回列设置为多索引的数据帧
from collections import OrderedDict
def get_multi_index(ordered_dict):
    """Build a ``pd.MultiIndex`` from a mapping of level name to labels.

    Args:
        ordered_dict: Mapping whose keys are the level names (in level
            order) and whose values are equal-length sequences holding the
            labels of that level, one per position.

    Returns:
        A ``pd.MultiIndex`` with one level per key; entry ``i`` is the tuple
        of the ``i``-th label of every level.

    Raises:
        ValueError: If the mapping is empty or the label sequences do not
            all have the same length.
    """
    # Explicit checks instead of ``assert`` so validation survives ``-O``.
    if not ordered_dict:
        raise ValueError("ordered_dict must contain at least one level")

    names = list(ordered_dict)
    arrays = [list(ordered_dict[name]) for name in names]

    lengths = {len(labels) for labels in arrays}
    if len(lengths) != 1:
        raise ValueError(
            "all levels must have the same number of labels, got lengths "
            f"{[len(labels) for labels in arrays]}"
        )

    # from_arrays pairs the labels position by position, which is what the
    # manual zip + from_tuples did, without the intermediate NumPy arrays.
    return pd.MultiIndex.from_arrays(arrays, names=names)
def get_custom_summary(group):
    """Summarise one group as a single row with two-level column labels.

    Args:
        group: DataFrame with ``days_to_complete`` and ``cost_in_millions``
            columns.

    Returns:
        A one-row DataFrame whose columns are the MultiIndex built by
        ``get_multi_index`` from the ``first``/``second`` label lists below.
    """
    days = group.days_to_complete
    # Strict comparisons on both sides: a row with exactly 200 days falls
    # into neither bucket.
    gt_mask = days > 200
    lt_mask = days < 200

    slow = group[gt_mask]
    fast = group[lt_mask]

    # Share of the bucketed rows that finished in under 200 days.
    n_fast = lt_mask.sum()
    fast_share = n_fast / (gt_mask.sum() + n_fast)

    # Position i of 'first' and 'second' together label position i of `row`.
    levels = OrderedDict([
        ('first', ['lt_200', 'lt_200', 'gt_200', 'gt_200', 'lt_200_prop']),
        ('second', ['avg_cost', 'avg_days', 'AVG_COST', 'AVG_DAYS', 'prop']),
    ])
    row = [
        fast.cost_in_millions.mean(),
        fast.days_to_complete.mean(),
        slow.cost_in_millions.mean(),
        slow.days_to_complete.mean(),
        fast_share,
    ]
    return pd.DataFrame([row], columns=get_multi_index(levels))
输出:
first lt_200 gt_200 lt_200_prop
second avg_cost avg_days AVG_COST AVG_DAYS prop
year location
2005 city 7.555556 135.444444 5.300000 363.750000 0.310345
suburb 5.000000 137.333333 5.555556 444.222222 0.250000
2006 city 6.250000 169.000000 4.714286 422.380952 0.160000
suburb 4.428571 133.142857 4.333333 445.666667 0.318182
您可以添加所需的输出吗?
@jezrael 刚刚添加了所需的输出。虽然您的解决方案在这种情况下确实有效,但如果我们对 10 列调用嵌套方式各不相同的 `get_custom_summary`,它可能会变得混乱。我确实采纳了你关于 `MultiIndex.from_tuples` 的想法,但我是在 apply 函数内部使用它,而不是在外部使用,到目前为止它似乎工作得很好。我会把我所做的写在回复中。
# Summarise each (year, location) group, then move index level 2 (the row
# labels of each per-group frame) into the columns.
result = (
    dft.groupby(by=['year', 'location'])
    .apply(get_custom_summary)
    .unstack(level=2)
)
# Drop the last 3 columns, then drop every column that still contains NaN.
result = result.drop(columns=result.columns[[-1, -2, -3]]).dropna(axis='columns')
print(result)
gt_200 lt_200 lt_200_prop
AVG_COST AVG_DAYS avg_cost avg_days AVG_COST
year location
2005 city 5.238095 392.095238 5.500000 144.666667 0.222222
suburb 4.428571 427.095238 4.000000 167.666667 0.125000
2006 city 4.368421 406.789474 4.571429 150.142857 0.269231
suburb 4.000000 439.062500 4.142857 145.142857 0.304348
from collections import OrderedDict
def get_multi_index(ordered_dict):
    """Build a ``pd.MultiIndex`` from a mapping of level name to labels.

    Args:
        ordered_dict: Mapping whose keys are the level names (in level
            order) and whose values are equal-length sequences holding the
            labels of that level, one per position.

    Returns:
        A ``pd.MultiIndex`` with one level per key; entry ``i`` is the tuple
        of the ``i``-th label of every level.

    Raises:
        ValueError: If the mapping is empty or the label sequences do not
            all have the same length.
    """
    # Explicit checks instead of ``assert`` so validation survives ``-O``.
    if not ordered_dict:
        raise ValueError("ordered_dict must contain at least one level")

    names = list(ordered_dict)
    arrays = [list(ordered_dict[name]) for name in names]

    lengths = {len(labels) for labels in arrays}
    if len(lengths) != 1:
        raise ValueError(
            "all levels must have the same number of labels, got lengths "
            f"{[len(labels) for labels in arrays]}"
        )

    # from_arrays pairs the labels position by position, which is what the
    # manual zip + from_tuples did, without the intermediate NumPy arrays.
    return pd.MultiIndex.from_arrays(arrays, names=names)
def get_custom_summary(group):
    """Summarise one group as a single row with two-level column labels.

    Args:
        group: DataFrame with ``days_to_complete`` and ``cost_in_millions``
            columns.

    Returns:
        A one-row DataFrame whose columns are the MultiIndex built by
        ``get_multi_index`` from the ``first``/``second`` label lists below.
    """
    days = group.days_to_complete
    # Strict comparisons on both sides: a row with exactly 200 days falls
    # into neither bucket.
    gt_mask = days > 200
    lt_mask = days < 200

    slow = group[gt_mask]
    fast = group[lt_mask]

    # Share of the bucketed rows that finished in under 200 days.
    n_fast = lt_mask.sum()
    fast_share = n_fast / (gt_mask.sum() + n_fast)

    # Position i of 'first' and 'second' together label position i of `row`.
    levels = OrderedDict([
        ('first', ['lt_200', 'lt_200', 'gt_200', 'gt_200', 'lt_200_prop']),
        ('second', ['avg_cost', 'avg_days', 'AVG_COST', 'AVG_DAYS', 'prop']),
    ])
    row = [
        fast.cost_in_millions.mean(),
        fast.days_to_complete.mean(),
        slow.cost_in_millions.mean(),
        slow.days_to_complete.mean(),
        fast_share,
    ]
    return pd.DataFrame([row], columns=get_multi_index(levels))
# Summarise each (year, location) group, then take the cross-section where
# index level 2 equals 0.
result = (
    dft.groupby(by=['year', 'location'])
    .apply(get_custom_summary)
    .xs(key=0, level=2)
)
print(result)
first lt_200 gt_200 lt_200_prop
second avg_cost avg_days AVG_COST AVG_DAYS prop
year location
2005 city 7.555556 135.444444 5.300000 363.750000 0.310345
suburb 5.000000 137.333333 5.555556 444.222222 0.250000
2006 city 6.250000 169.000000 4.714286 422.380952 0.160000
suburb 4.428571 133.142857 4.333333 445.666667 0.318182