Python 同名的折叠列包含不同的数据
我在处理如下结构的数据帧时遇到了困难(标签:python、pandas、dataframe、multi-index):
| Depart | Employee | Employee_card | 1 | 2 | 1 | 2 |
|:------:|:--------:|:-------------:|:--:|:--:|:--:|:--:|
| Dep_1 | Emp_1 | 101 | 97 | 16 | 38 | 86 |
| Dep_2 | Emp_2 | 102 | 7 | 10 | 3 | 58 |
| Dep_2 | Emp_3 | 103 | 15 | 96 | 8 | 36 |
| Dep_1 | Emp_4 | 104 | 41 | 12 | 40 | 49 |
| Dep_3 | Emp_5 | 105 | 75 | 88 | 60 | 26 |
| Dep_1 | Emp_6 | 106 | 37 | 51 | 33 | 31 |
| Dep_3 | Emp_7 | 107 | 64 | 90 | 13 | 34 |
不要问为什么会有名为“1”和“2”的愚蠢重复列,我的数据确实就是这样。
我想将此数据帧转换为如下结构:
| Depart | Employee | Employee_card | 1 | 2 |
|:------:|:--------:|:-------------:|:--:|:--:|
| Dep_1 | Emp_1 | 101 | 97 | 16 |
| | Emp_4 | 104 | 41 | 12 |
| | Emp_6 | 106 | 37 | 51 |
| | Emp_1 | 101 | 38 | 86 |
| | Emp_4 | 104 | 40 | 49 |
| | Emp_6 | 106 | 33 | 31 |
| Dep_2 | Emp_2 | 102 | 7 | 10 |
| | Emp_3 | 103 | 15 | 96 |
| | Emp_2 | 102 | 3 | 58 |
| | Emp_3 | 103 | 8 | 36 |
| Dep_3 | Emp_5 | 105 | 75 | 88 |
| | Emp_7 | 107 | 64 | 90 |
| | ... | ... | ...| ...|
但我不明白我怎么能做到。
我应该使用 groupby 表达式、MultiIndex,还是透视表?
首先为各列指定不同的名称,然后创建临时的 df2:
# Give every column a distinct name so the duplicated '1'/'2' pairs can be
# told apart: the first pair becomes 'A'/'B', the second pair 'C'/'D'.
df.columns = ['Depart', 'Employee', 'Employee_card', 'A', 'B', 'C', 'D']
# Identifier columns plus the second value pair. The label must be exactly
# 'Employee_card' (no trailing space), otherwise the selection raises KeyError.
df2 = df[['Depart', 'Employee', 'Employee_card', 'C', 'D']]
重命名df2列并从df中删除“C”和“D”列
# df2 holds five columns, so five names are required. 'C'/'D' are renamed to
# 'A'/'B' so that they line up with df's remaining value columns.
df2.columns = ['Depart', 'Employee', 'Employee_card', 'A', 'B']
# `del df[[...]]` does not accept a list of labels; drop the columns instead.
df = df.drop(columns=['C', 'D'])
然后将这两个数据帧拼接起来:
# Stack the two frames vertically: df2's rows follow df's rows.
df3 = pd.concat([df, df2], axis=0)
首先输入不同的列名称,然后创建临时df2
# Give every column a distinct name so the duplicated '1'/'2' pairs can be
# told apart: the first pair becomes 'A'/'B', the second pair 'C'/'D'.
df.columns = ['Depart', 'Employee', 'Employee_card', 'A', 'B', 'C', 'D']
# Identifier columns plus the second value pair. The label must be exactly
# 'Employee_card' (no trailing space), otherwise the selection raises KeyError.
df2 = df[['Depart', 'Employee', 'Employee_card', 'C', 'D']]
重命名df2列并从df中删除“C”和“D”列
# df2 holds five columns, so five names are required. 'C'/'D' are renamed to
# 'A'/'B' so that they line up with df's remaining value columns.
df2.columns = ['Depart', 'Employee', 'Employee_card', 'A', 'B']
# `del df[[...]]` does not accept a list of labels; drop the columns instead.
df = df.drop(columns=['C', 'D'])
然后将这两个数据帧拼接起来:
# Stack the two frames vertically: df2's rows follow df's rows.
df3 = pd.concat([df, df2], axis=0)
不确定性能,但您可以尝试获取唯一的列名,然后选择:
# np.unique returns the distinct column labels together with the position of
# each label's first occurrence; only the positions are needed here.
_, first_positions = np.unique(df.columns, return_index=True)
# Keep one column per distinct label, selected by position.
df_with_unique_cols = df.iloc[:, first_positions]
不确定性能,但您可以尝试获取唯一的列名,然后选择:
# np.unique returns the distinct column labels together with the position of
# each label's first occurrence; only the positions are needed here.
_, first_positions = np.unique(df.columns, return_index=True)
# Keep one column per distinct label, selected by position.
df_with_unique_cols = df.iloc[:, first_positions]
首先创建原始数据帧:
import pandas as pd
data = [
{'Depart': 'Dep_1', 'Employee': 'Emp_1', 'Employee_card': '101', '1': '97', '2': '16', '1_1': '38', '2_2': '86'},
{'Depart': 'Dep_2', 'Employee': 'Emp_2', 'Employee_card': '102', '1': '7', '2': '10', '1_1': '3', '2_2': '58'},
{'Depart': 'Dep_2', 'Employee': 'Emp_3', 'Employee_card': '103', '1': '15', '2': '96', '1_1': '8', '2_2': '36'},
{'Depart': 'Dep_1', 'Employee': 'Emp_4', 'Employee_card': '104', '1': '41', '2': '12', '1_1': '40', '2_2': '49'},
{'Depart': 'Dep_3', 'Employee': 'Emp_5', 'Employee_card': '105', '1': '75', '2': '88', '1_1': '60', '2_2': '26'},
{'Depart': 'Dep_1', 'Employee': 'Emp_6', 'Employee_card': '106', '1': '37', '2': '51', '1_1': '33', '2_2': '31'},
{'Depart': 'Dep_3', 'Employee': 'Emp_7', 'Employee_card': '107', '1': '64', '2': '90', '1_1': '13', '2_2': '34'}
]
raw = pd.DataFrame(data)
print(raw)
# 1 1_1 2 2_2 Depart Employee Employee_card
# 0 97 38 16 86 Dep_1 Emp_1 101
# 1 7 3 10 58 Dep_2 Emp_2 102
# 2 15 8 96 36 Dep_2 Emp_3 103
# 3 41 40 12 49 Dep_1 Emp_4 104
# 4 75 60 88 26 Dep_3 Emp_5 105
# 5 37 33 51 31 Dep_1 Emp_6 106
# 6 64 13 90 34 Dep_3 Emp_7 107
shared_vars = ['Depart', 'Employee', 'Employee_card']
df1 = raw.melt(id_vars=shared_vars, value_vars=['1', '1_1'], var_name='_',
value_name='1').drop('_', 1).set_index(shared_vars)
df2 = raw.melt(id_vars=shared_vars, value_vars=['2', '2_2'], var_name='_',
value_name='2').drop('_', 1).set_index(shared_vars)
df = pd.concat([df1, df2], axis=1)\
.astype({'1': int, '2': int})\ # for sorting
.sort_values(by=shared_vars + ['1', '2']) # sort all columns
print(df)
# 1 2
# Depart Employee Employee_card
# Dep_1 Emp_1 101 38 86
# 101 97 16
# Emp_4 104 40 49
# 104 41 12
# Emp_6 106 33 31
# 106 37 51
# Dep_2 Emp_2 102 3 58
# 102 7 10
# Emp_3 103 8 36
# 103 15 96
# Dep_3 Emp_5 105 60 26
# 105 75 88
# Emp_7 107 13 34
# 107 64 90
之后,您可以将结果融合并连接到新的数据帧:
import pandas as pd
data = [
{'Depart': 'Dep_1', 'Employee': 'Emp_1', 'Employee_card': '101', '1': '97', '2': '16', '1_1': '38', '2_2': '86'},
{'Depart': 'Dep_2', 'Employee': 'Emp_2', 'Employee_card': '102', '1': '7', '2': '10', '1_1': '3', '2_2': '58'},
{'Depart': 'Dep_2', 'Employee': 'Emp_3', 'Employee_card': '103', '1': '15', '2': '96', '1_1': '8', '2_2': '36'},
{'Depart': 'Dep_1', 'Employee': 'Emp_4', 'Employee_card': '104', '1': '41', '2': '12', '1_1': '40', '2_2': '49'},
{'Depart': 'Dep_3', 'Employee': 'Emp_5', 'Employee_card': '105', '1': '75', '2': '88', '1_1': '60', '2_2': '26'},
{'Depart': 'Dep_1', 'Employee': 'Emp_6', 'Employee_card': '106', '1': '37', '2': '51', '1_1': '33', '2_2': '31'},
{'Depart': 'Dep_3', 'Employee': 'Emp_7', 'Employee_card': '107', '1': '64', '2': '90', '1_1': '13', '2_2': '34'}
]
raw = pd.DataFrame(data)
print(raw)
# 1 1_1 2 2_2 Depart Employee Employee_card
# 0 97 38 16 86 Dep_1 Emp_1 101
# 1 7 3 10 58 Dep_2 Emp_2 102
# 2 15 8 96 36 Dep_2 Emp_3 103
# 3 41 40 12 49 Dep_1 Emp_4 104
# 4 75 60 88 26 Dep_3 Emp_5 105
# 5 37 33 51 31 Dep_1 Emp_6 106
# 6 64 13 90 34 Dep_3 Emp_7 107
shared_vars = ['Depart', 'Employee', 'Employee_card']
df1 = raw.melt(id_vars=shared_vars, value_vars=['1', '1_1'], var_name='_',
value_name='1').drop('_', 1).set_index(shared_vars)
df2 = raw.melt(id_vars=shared_vars, value_vars=['2', '2_2'], var_name='_',
value_name='2').drop('_', 1).set_index(shared_vars)
df = pd.concat([df1, df2], axis=1)\
.astype({'1': int, '2': int})\ # for sorting
.sort_values(by=shared_vars + ['1', '2']) # sort all columns
print(df)
# 1 2
# Depart Employee Employee_card
# Dep_1 Emp_1 101 38 86
# 101 97 16
# Emp_4 104 40 49
# 104 41 12
# Emp_6 106 33 31
# 106 37 51
# Dep_2 Emp_2 102 3 58
# 102 7 10
# Emp_3 103 8 36
# 103 15 96
# Dep_3 Emp_5 105 60 26
# 105 75 88
# Emp_7 107 13 34
# 107 64 90
首先创建原始数据帧:
import pandas as pd
data = [
{'Depart': 'Dep_1', 'Employee': 'Emp_1', 'Employee_card': '101', '1': '97', '2': '16', '1_1': '38', '2_2': '86'},
{'Depart': 'Dep_2', 'Employee': 'Emp_2', 'Employee_card': '102', '1': '7', '2': '10', '1_1': '3', '2_2': '58'},
{'Depart': 'Dep_2', 'Employee': 'Emp_3', 'Employee_card': '103', '1': '15', '2': '96', '1_1': '8', '2_2': '36'},
{'Depart': 'Dep_1', 'Employee': 'Emp_4', 'Employee_card': '104', '1': '41', '2': '12', '1_1': '40', '2_2': '49'},
{'Depart': 'Dep_3', 'Employee': 'Emp_5', 'Employee_card': '105', '1': '75', '2': '88', '1_1': '60', '2_2': '26'},
{'Depart': 'Dep_1', 'Employee': 'Emp_6', 'Employee_card': '106', '1': '37', '2': '51', '1_1': '33', '2_2': '31'},
{'Depart': 'Dep_3', 'Employee': 'Emp_7', 'Employee_card': '107', '1': '64', '2': '90', '1_1': '13', '2_2': '34'}
]
raw = pd.DataFrame(data)
print(raw)
# 1 1_1 2 2_2 Depart Employee Employee_card
# 0 97 38 16 86 Dep_1 Emp_1 101
# 1 7 3 10 58 Dep_2 Emp_2 102
# 2 15 8 96 36 Dep_2 Emp_3 103
# 3 41 40 12 49 Dep_1 Emp_4 104
# 4 75 60 88 26 Dep_3 Emp_5 105
# 5 37 33 51 31 Dep_1 Emp_6 106
# 6 64 13 90 34 Dep_3 Emp_7 107
shared_vars = ['Depart', 'Employee', 'Employee_card']
df1 = raw.melt(id_vars=shared_vars, value_vars=['1', '1_1'], var_name='_',
value_name='1').drop('_', 1).set_index(shared_vars)
df2 = raw.melt(id_vars=shared_vars, value_vars=['2', '2_2'], var_name='_',
value_name='2').drop('_', 1).set_index(shared_vars)
df = pd.concat([df1, df2], axis=1)\
.astype({'1': int, '2': int})\ # for sorting
.sort_values(by=shared_vars + ['1', '2']) # sort all columns
print(df)
# 1 2
# Depart Employee Employee_card
# Dep_1 Emp_1 101 38 86
# 101 97 16
# Emp_4 104 40 49
# 104 41 12
# Emp_6 106 33 31
# 106 37 51
# Dep_2 Emp_2 102 3 58
# 102 7 10
# Emp_3 103 8 36
# 103 15 96
# Dep_3 Emp_5 105 60 26
# 105 75 88
# Emp_7 107 13 34
# 107 64 90
之后,您可以将结果融合并连接到新的数据帧:
import pandas as pd
data = [
{'Depart': 'Dep_1', 'Employee': 'Emp_1', 'Employee_card': '101', '1': '97', '2': '16', '1_1': '38', '2_2': '86'},
{'Depart': 'Dep_2', 'Employee': 'Emp_2', 'Employee_card': '102', '1': '7', '2': '10', '1_1': '3', '2_2': '58'},
{'Depart': 'Dep_2', 'Employee': 'Emp_3', 'Employee_card': '103', '1': '15', '2': '96', '1_1': '8', '2_2': '36'},
{'Depart': 'Dep_1', 'Employee': 'Emp_4', 'Employee_card': '104', '1': '41', '2': '12', '1_1': '40', '2_2': '49'},
{'Depart': 'Dep_3', 'Employee': 'Emp_5', 'Employee_card': '105', '1': '75', '2': '88', '1_1': '60', '2_2': '26'},
{'Depart': 'Dep_1', 'Employee': 'Emp_6', 'Employee_card': '106', '1': '37', '2': '51', '1_1': '33', '2_2': '31'},
{'Depart': 'Dep_3', 'Employee': 'Emp_7', 'Employee_card': '107', '1': '64', '2': '90', '1_1': '13', '2_2': '34'}
]
raw = pd.DataFrame(data)
print(raw)
# 1 1_1 2 2_2 Depart Employee Employee_card
# 0 97 38 16 86 Dep_1 Emp_1 101
# 1 7 3 10 58 Dep_2 Emp_2 102
# 2 15 8 96 36 Dep_2 Emp_3 103
# 3 41 40 12 49 Dep_1 Emp_4 104
# 4 75 60 88 26 Dep_3 Emp_5 105
# 5 37 33 51 31 Dep_1 Emp_6 106
# 6 64 13 90 34 Dep_3 Emp_7 107
shared_vars = ['Depart', 'Employee', 'Employee_card']
df1 = raw.melt(id_vars=shared_vars, value_vars=['1', '1_1'], var_name='_',
value_name='1').drop('_', 1).set_index(shared_vars)
df2 = raw.melt(id_vars=shared_vars, value_vars=['2', '2_2'], var_name='_',
value_name='2').drop('_', 1).set_index(shared_vars)
df = pd.concat([df1, df2], axis=1)\
.astype({'1': int, '2': int})\ # for sorting
.sort_values(by=shared_vars + ['1', '2']) # sort all columns
print(df)
# 1 2
# Depart Employee Employee_card
# Dep_1 Emp_1 101 38 86
# 101 97 16
# Emp_4 104 40 49
# 104 41 12
# Emp_6 106 33 31
# 106 37 51
# Dep_2 Emp_2 102 3 58
# 102 7 10
# Emp_3 103 8 36
# 103 15 96
# Dep_3 Emp_5 105 60 26
# 105 75 88
# Emp_7 107 13 34
# 107 64 90
如果我以那种方式创建数据帧,就不会有多级索引。
在你的 `.melt` 方法调用中,`shared_vars` 是什么?
谢谢指出,我之前漏掉了这一行,没有复制粘贴过来。现在已经补上了。