Python 使用重复的标题值读取excel
我有一个 Excel 表格,想把它读入 pandas 的多级索引(MultiIndex)DataFrame。麻烦在于该工作表包含重复的标题值。读取时,pandas 会把 .x 后缀加到第二级标题的末尾,而不是第一级标题的末尾。有没有办法让 pandas 重命名顶级标题,而不是第二级标题?(标签:python、excel、pandas) Excel 文件示例: 读取脚本:
from pathlib import Path

import pandas as pd


def main():
    """Read the sample workbook with a two-row header and print the frame."""
    xl_file = Path('.') / 'pandasExample.xlsx'
    # header=[0, 1] asks pandas for a two-level column index;
    # skiprows=[0] skips the first row of the sheet.
    df = pd.read_excel(xl_file, sheet_name='Sheet1', header=[0, 1],
                       skiprows=[0])
    print(df)


if __name__ == '__main__':
    main()
输出:
Rectangle Ellipse Rectangle
Width Height a b Width.1 Height.1 Width.2 Height.2
0 10 20 1 2 20 30 40 50
期望输出:
Rectangle Ellipse Rectangle.1 Rectangle.2
Width Height a b Width Height Width Height
0 10 20 1 2 20 30 40 50
先对 DataFrame 做堆叠/取消堆叠,然后把 level 0 重新赋值为唯一的标签。这里我是手工写的标签,但你也可以通过给每两列加一个后缀来用程序生成。接着设置多级索引,再对结果进行堆叠。每个索引元组中有三个值:level 0、level 1 和 0。
# Load the sheet, skipping the first two rows and parsing at most ten rows.
df = pd.read_excel('dup_header.xls', skiprows=2, nrows=10)

# Move the columns into the index, then flatten the index back into ordinary
# columns; the code below addresses them as 'level_0' and 'level_1'.
long_form = df.stack().reset_index()

# Overwrite the first index column with hand-written unique group labels.
# The list is hard-coded for exactly eight stacked rows.
long_form['level_0'] = [
    'Rectangle1', 'Rectangle1',
    'Ellipse', 'Ellipse',
    'Rectangle2', 'Rectangle2',
    'Rectangle3', 'Rectangle3',
]

# Index by (group label, original column label), stack what is left, and
# expose the result as a one-column frame named 'value'.
relabelled = long_form.set_index(['level_0', 'level_1'])
df = relabelled.stack().to_frame()
df.columns = ['value']
print(df)
输出:
value
level_0 level_1
Rectangle1 Width 0 10
Height 0 20
Ellipse a 0 1
b 0 2
Rectangle2 width 0 20
height 0 30
Rectangle3 width.1 0 40
height.1 0 50
先对 DataFrame 做堆叠/取消堆叠,然后把 level 0 重新赋值为唯一的标签。这里我是手工写的标签,但你也可以通过给每两列加一个后缀来用程序生成。接着设置多级索引,再对结果进行堆叠。每个索引元组中有三个值:level 0、level 1 和 0。
# Load the sheet, skipping the first two rows and parsing at most ten rows.
df = pd.read_excel('dup_header.xls', skiprows=2, nrows=10)

# Move the columns into the index, then flatten the index back into ordinary
# columns; the code below addresses them as 'level_0' and 'level_1'.
long_form = df.stack().reset_index()

# Overwrite the first index column with hand-written unique group labels.
# The list is hard-coded for exactly eight stacked rows.
long_form['level_0'] = [
    'Rectangle1', 'Rectangle1',
    'Ellipse', 'Ellipse',
    'Rectangle2', 'Rectangle2',
    'Rectangle3', 'Rectangle3',
]

# Index by (group label, original column label), stack what is left, and
# expose the result as a one-column frame named 'value'.
relabelled = long_form.set_index(['level_0', 'level_1'])
df = relabelled.stack().to_frame()
df.columns = ['value']
print(df)
输出:
value
level_0 level_1
Rectangle1 Width 0 10
Height 0 20
Ellipse a 0 1
b 0 2
Rectangle2 width 0 20
height 0 30
Rectangle3 width.1 0 40
height.1 0 50
这里有一个不同的答案,可以产生问题中列出的精确的期望输出
from pathlib import Path
import pandas as pd
from typing import List
def rename_headers(headers: List[str]) -> List[str]:
    """Return *headers* with repeated labels numbered in order of appearance.

    Each header is first reduced to the text before its first ``'.'``.  The
    first occurrence of that prefix is returned bare; later occurrences get
    ``.1``, ``.2``, ... appended.

    Args:
        headers: Header labels, possibly already carrying a ``.N`` suffix.

    Returns:
        A new list of the same length with unique, sequentially numbered
        labels.  An empty input yields an empty list.
    """
    # Number of times each prefix has been seen so far.
    seen_counts = {}
    new_headers = []
    for header in headers:
        header_prefix = header.split('.')[0]
        occurrence = seen_counts.get(header_prefix, 0)
        if occurrence > 0:
            new_headers.append(f'{header_prefix}.{occurrence}')
        else:
            new_headers.append(header_prefix)
        # The original wrote to an undefined name here, raising NameError.
        seen_counts[header_prefix] = occurrence + 1
    return new_headers
def main():
    """Read ``pandasExample.xlsx`` and print it with unique top-level headers.

    The sheet is read twice: once with a single header row to collect the
    top-level labels, and once with a two-row header to load the data.  The
    column index is then rebuilt from the renamed top-level labels and the
    second-level labels with any ``.N`` suffix stripped.
    """
    xl_file = Path('.') / 'pandasExample.xlsx'

    # Read first level headers, dropping labels that start with 'Unnamed'.
    header_df = pd.read_excel(
        xl_file, sheet_name='Sheet1', header=[0], skiprows=[0], nrows=1)
    headers = [column for column in header_df.columns
               if not column.startswith('Unnamed')]

    # Generate the desired headers.
    new_headers = rename_headers(headers)

    # Read in the full dataframe.
    df = pd.read_excel(
        xl_file, sheet_name='Sheet1', header=[0, 1], skiprows=[0])

    # Map each distinct top-level label to its distinct second-level labels
    # (suffix stripped, first-seen order).  Only the count is used below.
    parameters = {}
    for header in df.columns.get_level_values(0).unique():
        stripped = [column.split('.')[0] for column in df[header].columns]
        parameters[header] = list(dict.fromkeys(stripped))

    # Repeat every renamed top-level label once per parameter of its group.
    level_0 = []
    for header in new_headers:
        level_0 += [header] * len(parameters[header.split('.')[0]])

    # Take the second-level labels straight from the column index, in column
    # order, instead of a stack/unstack/reindex round trip over the first row.
    level_1 = [label.split('.')[0]
               for label in df.columns.get_level_values(1)]

    # Rename level 0 and level 1 columns for the dataframe.
    df.columns = pd.MultiIndex.from_tuples(list(zip(level_0, level_1)))
    print(df)


if __name__ == '__main__':
    main()
输出:
Rectangle Ellipse Rectangle.1 Rectangle.2
Width Height a b Width Height Width Height
0 10 20 1 2 20 30 40 50
这里有一个不同的答案,可以产生问题中列出的精确的期望输出
from pathlib import Path
import pandas as pd
from typing import List
def rename_headers(headers: List[str]) -> List[str]:
    """Return *headers* with repeated labels numbered in order of appearance.

    Each header is first reduced to the text before its first ``'.'``.  The
    first occurrence of that prefix is returned bare; later occurrences get
    ``.1``, ``.2``, ... appended.

    Args:
        headers: Header labels, possibly already carrying a ``.N`` suffix.

    Returns:
        A new list of the same length with unique, sequentially numbered
        labels.  An empty input yields an empty list.
    """
    # Number of times each prefix has been seen so far.
    seen_counts = {}
    new_headers = []
    for header in headers:
        header_prefix = header.split('.')[0]
        occurrence = seen_counts.get(header_prefix, 0)
        if occurrence > 0:
            new_headers.append(f'{header_prefix}.{occurrence}')
        else:
            new_headers.append(header_prefix)
        # The original wrote to an undefined name here, raising NameError.
        seen_counts[header_prefix] = occurrence + 1
    return new_headers
def main():
    """Read ``pandasExample.xlsx`` and print it with unique top-level headers.

    The sheet is read twice: once with a single header row to collect the
    top-level labels, and once with a two-row header to load the data.  The
    column index is then rebuilt from the renamed top-level labels and the
    second-level labels with any ``.N`` suffix stripped.
    """
    xl_file = Path('.') / 'pandasExample.xlsx'

    # Read first level headers, dropping labels that start with 'Unnamed'.
    header_df = pd.read_excel(
        xl_file, sheet_name='Sheet1', header=[0], skiprows=[0], nrows=1)
    headers = [column for column in header_df.columns
               if not column.startswith('Unnamed')]

    # Generate the desired headers.
    new_headers = rename_headers(headers)

    # Read in the full dataframe.
    df = pd.read_excel(
        xl_file, sheet_name='Sheet1', header=[0, 1], skiprows=[0])

    # Map each distinct top-level label to its distinct second-level labels
    # (suffix stripped, first-seen order).  Only the count is used below.
    parameters = {}
    for header in df.columns.get_level_values(0).unique():
        stripped = [column.split('.')[0] for column in df[header].columns]
        parameters[header] = list(dict.fromkeys(stripped))

    # Repeat every renamed top-level label once per parameter of its group.
    level_0 = []
    for header in new_headers:
        level_0 += [header] * len(parameters[header.split('.')[0]])

    # Take the second-level labels straight from the column index, in column
    # order, instead of a stack/unstack/reindex round trip over the first row.
    level_1 = [label.split('.')[0]
               for label in df.columns.get_level_values(1)]

    # Rename level 0 and level 1 columns for the dataframe.
    df.columns = pd.MultiIndex.from_tuples(list(zip(level_0, level_1)))
    print(df)


if __name__ == '__main__':
    main()
输出:
Rectangle Ellipse Rectangle.1 Rectangle.2
Width Height a b Width Height Width Height
0 10 20 1 2 20 30 40 50
评论:这不适用于多级索引标题,如我的示例所示。报错为:`ValueError: cannot specify names when specifying a multi-index header`。 回复:跳过标题的 level 0,把它替换为唯一的标签,然后在 DataFrame 中设置多级索引,再堆叠这些值,最后把结果转换为 DataFrame(见上文),我想这是可行的。不过需要再写一些额外的代码来计算标题值。