Python: how to make code run faster when reading large Excel files with many rows


I have a lot of Excel files, fifteen of them, and each workbook contains four sheets. I only need three of those sheets, whose names are shown in the following code:

sheet_names = ['client31_KPN', 'client32_T-Mobile', 'client33_Vodafone']
These sheet names are identical across all of my Excel files.

The point here is that, for every Excel file I have, I need to concat all the sheets in sheet_names into one DataFrame and remove all special characters, spaces, and newlines from the column headers, as in the following code:

sheet_names = ['client31_KPN', 'client32_T-Mobile', 'client33_Vodafone']

def renamer(c):
    # Keep only the last space-separated token of each column header,
    # lowercased; leading/trailing whitespace (including newlines) is stripped.
    return c.strip().split(' ')[-1].lower()

shared_BM_NL_Q2_DNS_df = pd.concat([
    pd.read_excel(shared_BM_NL_Q2_DNS, sheet_name=ws, usecols=c).rename(columns=renamer)
    for ws, c in zip(sheet_names, shared_BM_NL_Q2_DNS_cols)
], ignore_index=True).reset_index(drop=True)
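For illustration, this is what renamer produces for a hypothetical header (the question does not show the real column names, so this input is an assumption):

>>> renamer('  Session DNS Resolution-Time\n')   # hypothetical header
'resolution-time'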
After that, because of Excel's row limit (a worksheet holds at most 1,048,576 rows), I have to convert all of the Excel files to CSV; that is why the CSV export at the end is needed.

Here is the full code, following the same pattern as the snippet above:

import os
import pandas as pd

shared_BM_NL_Q2_DNS = r'Shared_BM_NL_Q2_DNS.xlsx'
shared_BM_NL_Q2_HTTP_FDTT_DL = 'Shared_BM_NL_Q2_HTTP_FDTT_DL.xlsx'
shared_BM_NL_Q2_HTTP_FDTT_UL = 'Shared_BM_NL_Q2_HTTP_FDTT_UL.xlsx'
shared_BM_NL_Q2_HTTP_File_DL_excel = 'Shared_BM_NL_Q2_HTTP_File_DL.xlsx'
shared_BM_NL_Q2_HTTP_File_UL = 'Shared_BM_NL_Q2_HTTP_File_UL.xlsx'
shared_BM_NL_Q2_HTTP_Live = 'Shared_BM_NL_Q2_HTTP_Live.xlsx'
shared_BM_NL_Q2_HTTP_Static = 'Shared_BM_NL_Q2_HTTP_Static.xlsx'
shared_BM_NL_Q2_Ping_40 = 'Shared_BM_NL_Q2_Ping_40.xlsx'
shared_BM_NL_Q2_Ping_800 = 'Shared_BM_NL_Q2_Ping_800.xlsx'
shared_BM_NL_Q2_Ping_CDN = 'Shared_BM_NL_Q2_Ping_CDN.xlsx'
shared_BM_NL_Q2_Youtube = 'Shared_BM_NL_Q2_Youtube.xlsx'
shared_BM_NL_Q2_Voice_M2M_SQ_Samples = 'Shared_BM_NL_Q2_Voice_M2M_SQ_Samples.xlsx'
shared_BM_NL_Q2_Voice_M2M_Calls = 'Shared_BM_NL_Q2_Voice_M2M_Calls.xlsx'

sheet_names = ['client31_KPN', 'client32_T-Mobile', 'client33_Vodafone']
shared_BM_NL_Q2_DNS_cols = ['A:AB', 'A:AB', 'A:AB']
shared_BM_NL_Q2_HTTP_FDTT_DL_cols = ['A:DZ', 'A:DZ', 'A:DZ']
shared_BM_NL_Q2_HTTP_FDTT_UL_cols = ['A:DP', 'A:DP', 'A:DP']
shared_BM_NL_Q2_HTTP_File_DL_excel_cols = ['A:CU', 'A:CU', 'A:CU']
shared_BM_NL_Q2_HTTP_File_UL_cols = ['A:CU', 'A:CU', 'A:CU']
shared_BM_NL_Q2_HTTP_Live_cols = ['A:EP', 'A:EP', 'A:EP']
shared_BM_NL_Q2_HTTP_Static_cols = ['A:DQ', 'A:DQ', 'A:DQ']
shared_BM_NL_Q2_Ping_40_cols = ['A:AJ', 'A:AJ', 'A:AJ']
shared_BM_NL_Q2_Ping_800_cols = ['A:AJ', 'A:AJ', 'A:AJ']
shared_BM_NL_Q2_Ping_CDN_cols = ['A:AJ', 'A:AJ', 'A:AJ']
shared_BM_NL_Q2_Youtube_cols = ['A:CA', 'A:CA', 'A:CA']

def renamer(c):
    # Keep only the last space-separated token of each column header,
    # lowercased; leading/trailing whitespace (including newlines) is stripped.
    return c.strip().split(' ')[-1].lower()

shared_BM_NL_Q2_DNS_df = pd.concat([
    pd.read_excel(shared_BM_NL_Q2_DNS, sheet_name=ws, usecols=c).rename(columns=renamer)
    for ws, c in zip(sheet_names, shared_BM_NL_Q2_DNS_cols)
], ignore_index=True).reset_index(drop=True)

shared_BM_NL_Q2_HTTP_FDTT_DL_df = pd.concat([
    pd.read_excel(shared_BM_NL_Q2_HTTP_FDTT_DL, sheet_name=ws, usecols=c).rename(columns=renamer)
    for ws, c in zip(sheet_names, shared_BM_NL_Q2_HTTP_FDTT_DL_cols)
], ignore_index=True).reset_index(drop=True)

shared_BM_NL_Q2_HTTP_FDTT_UL_df = pd.concat([
    pd.read_excel(shared_BM_NL_Q2_HTTP_FDTT_UL, sheet_name=ws, usecols=c).rename(columns=renamer)
    for ws, c in zip(sheet_names, shared_BM_NL_Q2_HTTP_FDTT_UL_cols)
], ignore_index=True).reset_index(drop=True)

shared_BM_NL_Q2_HTTP_File_DL_excel_df = pd.concat([
    pd.read_excel(shared_BM_NL_Q2_HTTP_File_DL_excel, sheet_name=ws, usecols=c).rename(columns=renamer)
    for ws, c in zip(sheet_names, shared_BM_NL_Q2_HTTP_File_DL_excel_cols)
], ignore_index=True).reset_index(drop=True)

shared_BM_NL_Q2_HTTP_File_UL_df = pd.concat([
    pd.read_excel(shared_BM_NL_Q2_HTTP_File_UL, sheet_name=ws, usecols=c).rename(columns=renamer)
    for ws, c in zip(sheet_names, shared_BM_NL_Q2_HTTP_File_UL_cols)
], ignore_index=True).reset_index(drop=True)

shared_BM_NL_Q2_HTTP_Live_df = pd.concat([
    pd.read_excel(shared_BM_NL_Q2_HTTP_Live, sheet_name=ws, usecols=c).rename(columns=renamer)
    for ws, c in zip(sheet_names, shared_BM_NL_Q2_HTTP_Live_cols)
], ignore_index=True).reset_index(drop=True)

shared_BM_NL_Q2_HTTP_Static_df = pd.concat([
    pd.read_excel(shared_BM_NL_Q2_HTTP_Static, sheet_name=ws, usecols=c).rename(columns=renamer)
    for ws, c in zip(sheet_names, shared_BM_NL_Q2_HTTP_Static_cols)
], ignore_index=True).reset_index(drop=True)

shared_BM_NL_Q2_Ping_40_df = pd.concat([
    pd.read_excel(shared_BM_NL_Q2_Ping_40, sheet_name=ws, usecols=c).rename(columns=renamer)
    for ws, c in zip(sheet_names, shared_BM_NL_Q2_Ping_40_cols)
], ignore_index=True).reset_index(drop=True)

shared_BM_NL_Q2_Ping_800_df = pd.concat([
    pd.read_excel(shared_BM_NL_Q2_Ping_800, sheet_name=ws, usecols=c).rename(columns=renamer)
    for ws, c in zip(sheet_names, shared_BM_NL_Q2_Ping_800_cols)
], ignore_index=True).reset_index(drop=True)

shared_BM_NL_Q2_Ping_CDN_df = pd.concat([
    pd.read_excel(shared_BM_NL_Q2_Ping_CDN, sheet_name=ws, usecols=c).rename(columns=renamer)
    for ws, c in zip(sheet_names, shared_BM_NL_Q2_Ping_CDN_cols)
], ignore_index=True).reset_index(drop=True)

shared_BM_NL_Q2_Youtube_df = pd.concat([
    pd.read_excel(shared_BM_NL_Q2_Youtube, sheet_name=ws, usecols=c).rename(columns=renamer)
    for ws, c in zip(sheet_names, shared_BM_NL_Q2_Youtube_cols)
], ignore_index=True).reset_index(drop=True)

pd.set_option('display.max_columns', 500)
# pd.set_option('display.max_rows', 50000)
pd.set_option('display.width', 100000)
print(shared_BM_NL_Q2_DNS_df)

shared_BM_NL_Q2_DNS_df.to_csv("shared_BM_NL_Q2_DNS.csv")
shared_BM_NL_Q2_HTTP_FDTT_DL_df.to_csv("shared_BM_NL_Q2_HTTP_FDTT_DL.csv")
shared_BM_NL_Q2_HTTP_FDTT_UL_df.to_csv("shared_BM_NL_Q2_HTTP_FDTT_UL.csv")
shared_BM_NL_Q2_HTTP_File_DL_excel_df.to_csv("shared_BM_NL_Q2_HTTP_File_DL_excel.csv")
shared_BM_NL_Q2_HTTP_File_UL_df.to_csv("shared_BM_NL_Q2_HTTP_File_UL.csv")
shared_BM_NL_Q2_HTTP_Live_df.to_csv("shared_BM_NL_Q2_HTTP_Live.csv")
shared_BM_NL_Q2_HTTP_Static_df.to_csv("shared_BM_NL_Q2_HTTP_Static.csv")
shared_BM_NL_Q2_Ping_40_df.to_csv("shared_BM_NL_Q2_Ping_40.csv")
shared_BM_NL_Q2_Ping_800_df.to_csv("shared_BM_NL_Q2_Ping_800.csv")
shared_BM_NL_Q2_Ping_CDN_df.to_csv("shared_BM_NL_Q2_Ping_CDN.csv")
shared_BM_NL_Q2_Youtube_df.to_csv("shared_BM_NL_Q2_Youtube.csv")
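
The eleven blocks above differ only in the input workbook and the column range, and each workbook is opened three times, once per sheet. Here is a minimal loop-based sketch of the same job, assuming the file names, sheet names, and column ranges shown above; passing the whole sheet list to pd.read_excel makes pandas open and parse each workbook only once, which already removes a good share of the reading time:

import pandas as pd

sheet_names = ['client31_KPN', 'client32_T-Mobile', 'client33_Vodafone']

# One entry per workbook: path -> column range. All three sheets of a given
# workbook use the same range above. (The two Voice workbooks are left out
# here because the original code never reads them.)
workbooks = {
    'Shared_BM_NL_Q2_DNS.xlsx': 'A:AB',
    'Shared_BM_NL_Q2_HTTP_FDTT_DL.xlsx': 'A:DZ',
    'Shared_BM_NL_Q2_HTTP_FDTT_UL.xlsx': 'A:DP',
    'Shared_BM_NL_Q2_HTTP_File_DL.xlsx': 'A:CU',
    'Shared_BM_NL_Q2_HTTP_File_UL.xlsx': 'A:CU',
    'Shared_BM_NL_Q2_HTTP_Live.xlsx': 'A:EP',
    'Shared_BM_NL_Q2_HTTP_Static.xlsx': 'A:DQ',
    'Shared_BM_NL_Q2_Ping_40.xlsx': 'A:AJ',
    'Shared_BM_NL_Q2_Ping_800.xlsx': 'A:AJ',
    'Shared_BM_NL_Q2_Ping_CDN.xlsx': 'A:AJ',
    'Shared_BM_NL_Q2_Youtube.xlsx': 'A:CA',
}

def renamer(c):
    # Same header cleanup as in the question.
    return c.strip().split(' ')[-1].lower()

for path, cols in workbooks.items():
    # sheet_name=<list> returns {sheet name: DataFrame} and opens/parses the
    # workbook once, instead of once per sheet.
    sheets = pd.read_excel(path, sheet_name=sheet_names, usecols=cols)
    df = pd.concat(sheets.values(), ignore_index=True).rename(columns=renamer)
    # index=False skips the extra index column; drop it if you want the
    # index written out, as in the original to_csv calls.
    df.to_csv(path.replace('.xlsx', '.csv'), index=False)

This does not change what pandas does per cell, though, so for a much larger speedup the reads themselves have to be parallelized or the data moved off the xlsx format.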
This code runs fine and does exactly what I want, but it takes a long time, about 30 minutes. I think most of that time goes into reading the data with pandas, presumably because of the size of the data.

So what I need now is a better way to run this code so that the whole process is faster.


I hope someone can give me some good suggestions, and I am sorry if anything is unclear.

Dask is a big-data tool that is very similar to pandas. It has many of the most commonly used methods, but not all of them. If processing time or memory is your main concern, I highly recommend looking into that library. I have used Dask successfully on projects with 20+ million rows and 50+ columns, with a big improvement in performance/speed while using a fraction of the memory. There is a bit of a learning curve, but if you often deal with data that pandas cannot handle, it is not bad at all and well worth it.

@DavidErickson thanks for the suggestion, I will take a look at the reference you sent; I may post an answer soon :D
Thanks bro :D, check out this thread.
@DavidErickson thank you very much :D, this is a very helpful reference; it is the first time I have heard of this Dask library :)
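
To make the answer's suggestion concrete: dask.dataframe has no Excel reader, so the usual pattern is to wrap pd.read_excel in dask.delayed and let Dask run the per-sheet reads in parallel. A minimal sketch, assuming the same file and sheet names as in the question:

import dask
import pandas as pd

sheet_names = ['client31_KPN', 'client32_T-Mobile', 'client33_Vodafone']

@dask.delayed
def load_sheet(path, sheet, cols):
    # Each call only builds a lazy task; nothing is read until dask.compute().
    return pd.read_excel(path, sheet_name=sheet, usecols=cols)

tasks = [load_sheet('Shared_BM_NL_Q2_DNS.xlsx', ws, 'A:AB') for ws in sheet_names]

# scheduler='processes' sidesteps the GIL; openpyxl's parsing is mostly
# pure Python, so threads alone would not help much here.
frames = dask.compute(*tasks, scheduler='processes')
shared_BM_NL_Q2_DNS_df = pd.concat(frames, ignore_index=True)

The same task list can be extended to cover all eleven workbooks at once, so the files themselves are also read in parallel.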