Python: how to make code run faster when reading large Excel files with many rows


I have a lot of Excel files, fifteen of them, and each workbook contains four sheets. I only need three of those sheets, whose names are shown in the following code:

sheet_names = ['client31_KPN', 'client32_T-Mobile', 'client33_Vodafone']
These sheet names are identical across all of my Excel files.

The point here is that, for every Excel file I have, I need to concat all the sheets in sheet_names into one DataFrame and remove all special characters, spaces, and newlines from the column headers, as in the following code:

sheet_names = ['client31_KPN', 'client32_T-Mobile', 'client33_Vodafone']

def renamer(c):
    # Keep only the last space-separated token of each column header,
    # lowercased; leading/trailing whitespace (including newlines) is stripped.
    return c.strip().split(' ')[-1].lower()

shared_BM_NL_Q2_DNS_df = pd.concat([
    pd.read_excel(shared_BM_NL_Q2_DNS, sheet_name=ws, usecols=c).rename(columns=renamer)
    for ws, c in zip(sheet_names, shared_BM_NL_Q2_DNS_cols)
], ignore_index=True).reset_index(drop=True)
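For illustration, this is what renamer produces for a hypothetical header (the question does not show the real column names, so this input is an assumption):

>>> renamer('  Session DNS Resolution-Time\n')   # hypothetical header
'resolution-time'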
After that, because of Excel's row limit (a worksheet holds at most 1,048,576 rows), I have to convert all of the Excel files to CSV; that is why the CSV export at the end is needed.

Here is the full code, following the same pattern as the snippet above:

import os
import pandas as pd

shared_BM_NL_Q2_DNS = r'Shared_BM_NL_Q2_DNS.xlsx'
shared_BM_NL_Q2_HTTP_FDTT_DL = 'Shared_BM_NL_Q2_HTTP_FDTT_DL.xlsx'
shared_BM_NL_Q2_HTTP_FDTT_UL = 'Shared_BM_NL_Q2_HTTP_FDTT_UL.xlsx'
shared_BM_NL_Q2_HTTP_File_DL_excel = 'Shared_BM_NL_Q2_HTTP_File_DL.xlsx'
shared_BM_NL_Q2_HTTP_File_UL = 'Shared_BM_NL_Q2_HTTP_File_UL.xlsx'
shared_BM_NL_Q2_HTTP_Live = 'Shared_BM_NL_Q2_HTTP_Live.xlsx'
shared_BM_NL_Q2_HTTP_Static = 'Shared_BM_NL_Q2_HTTP_Static.xlsx'
shared_BM_NL_Q2_Ping_40 = 'Shared_BM_NL_Q2_Ping_40.xlsx'
shared_BM_NL_Q2_Ping_800 = 'Shared_BM_NL_Q2_Ping_800.xlsx'
shared_BM_NL_Q2_Ping_CDN = 'Shared_BM_NL_Q2_Ping_CDN.xlsx'
shared_BM_NL_Q2_Youtube = 'Shared_BM_NL_Q2_Youtube.xlsx'
shared_BM_NL_Q2_Voice_M2M_SQ_Samples = 'Shared_BM_NL_Q2_Voice_M2M_SQ_Samples.xlsx'
shared_BM_NL_Q2_Voice_M2M_Calls = 'Shared_BM_NL_Q2_Voice_M2M_Calls.xlsx'

sheet_names = ['client31_KPN', 'client32_T-Mobile', 'client33_Vodafone']
shared_BM_NL_Q2_DNS_cols = ['A:AB', 'A:AB', 'A:AB']
shared_BM_NL_Q2_HTTP_FDTT_DL_cols = ['A:DZ', 'A:DZ', 'A:DZ']
shared_BM_NL_Q2_HTTP_FDTT_UL_cols = ['A:DP', 'A:DP', 'A:DP']
shared_BM_NL_Q2_HTTP_File_DL_excel_cols = ['A:CU', 'A:CU', 'A:CU']
shared_BM_NL_Q2_HTTP_File_UL_cols = ['A:CU', 'A:CU', 'A:CU']
shared_BM_NL_Q2_HTTP_Live_cols = ['A:EP', 'A:EP', 'A:EP']
shared_BM_NL_Q2_HTTP_Static_cols = ['A:DQ', 'A:DQ', 'A:DQ']
shared_BM_NL_Q2_Ping_40_cols = ['A:AJ', 'A:AJ', 'A:AJ']
shared_BM_NL_Q2_Ping_800_cols = ['A:AJ', 'A:AJ', 'A:AJ']
shared_BM_NL_Q2_Ping_CDN_cols = ['A:AJ', 'A:AJ', 'A:AJ']
shared_BM_NL_Q2_Youtube_cols = ['A:CA', 'A:CA', 'A:CA']

def renamer(c):
    # Keep only the last space-separated token of each column header,
    # lowercased; leading/trailing whitespace (including newlines) is stripped.
    return c.strip().split(' ')[-1].lower()

shared_BM_NL_Q2_DNS_df = pd.concat([
    pd.read_excel(shared_BM_NL_Q2_DNS, sheet_name=ws, usecols=c).rename(columns=renamer)
    for ws, c in zip(sheet_names, shared_BM_NL_Q2_DNS_cols)
], ignore_index=True).reset_index(drop=True)

shared_BM_NL_Q2_HTTP_FDTT_DL_df = pd.concat([
    pd.read_excel(shared_BM_NL_Q2_HTTP_FDTT_DL, sheet_name=ws, usecols=c).rename(columns=renamer)
    for ws, c in zip(sheet_names, shared_BM_NL_Q2_HTTP_FDTT_DL_cols)
], ignore_index=True).reset_index(drop=True)

shared_BM_NL_Q2_HTTP_FDTT_UL_df = pd.concat([
    pd.read_excel(shared_BM_NL_Q2_HTTP_FDTT_UL, sheet_name=ws, usecols=c).rename(columns=renamer)
    for ws, c in zip(sheet_names, shared_BM_NL_Q2_HTTP_FDTT_UL_cols)
], ignore_index=True).reset_index(drop=True)

shared_BM_NL_Q2_HTTP_File_DL_excel_df = pd.concat([
    pd.read_excel(shared_BM_NL_Q2_HTTP_File_DL_excel, sheet_name=ws, usecols=c).rename(columns=renamer)
    for ws, c in zip(sheet_names, shared_BM_NL_Q2_HTTP_File_DL_excel_cols)
], ignore_index=True).reset_index(drop=True)

shared_BM_NL_Q2_HTTP_File_UL_df = pd.concat([
    pd.read_excel(shared_BM_NL_Q2_HTTP_File_UL, sheet_name=ws, usecols=c).rename(columns=renamer)
    for ws, c in zip(sheet_names, shared_BM_NL_Q2_HTTP_File_UL_cols)
], ignore_index=True).reset_index(drop=True)

shared_BM_NL_Q2_HTTP_Live_df = pd.concat([
    pd.read_excel(shared_BM_NL_Q2_HTTP_Live, sheet_name=ws, usecols=c).rename(columns=renamer)
    for ws, c in zip(sheet_names, shared_BM_NL_Q2_HTTP_Live_cols)
], ignore_index=True).reset_index(drop=True)

shared_BM_NL_Q2_HTTP_Static_df = pd.concat([
    pd.read_excel(shared_BM_NL_Q2_HTTP_Static, sheet_name=ws, usecols=c).rename(columns=renamer)
    for ws, c in zip(sheet_names, shared_BM_NL_Q2_HTTP_Static_cols)
], ignore_index=True).reset_index(drop=True)

shared_BM_NL_Q2_Ping_40_df = pd.concat([
    pd.read_excel(shared_BM_NL_Q2_Ping_40, sheet_name=ws, usecols=c).rename(columns=renamer)
    for ws, c in zip(sheet_names, shared_BM_NL_Q2_Ping_40_cols)
], ignore_index=True).reset_index(drop=True)

shared_BM_NL_Q2_Ping_800_df = pd.concat([
    pd.read_excel(shared_BM_NL_Q2_Ping_800, sheet_name=ws, usecols=c).rename(columns=renamer)
    for ws, c in zip(sheet_names, shared_BM_NL_Q2_Ping_800_cols)
], ignore_index=True).reset_index(drop=True)

shared_BM_NL_Q2_Ping_CDN_df = pd.concat([
    pd.read_excel(shared_BM_NL_Q2_Ping_CDN, sheet_name=ws, usecols=c).rename(columns=renamer)
    for ws, c in zip(sheet_names, shared_BM_NL_Q2_Ping_CDN_cols)
], ignore_index=True).reset_index(drop=True)

shared_BM_NL_Q2_Youtube_df = pd.concat([
    pd.read_excel(shared_BM_NL_Q2_Youtube, sheet_name=ws, usecols=c).rename(columns=renamer)
    for ws, c in zip(sheet_names, shared_BM_NL_Q2_Youtube_cols)
], ignore_index=True).reset_index(drop=True)

pd.set_option('display.max_columns', 500)
# pd.set_option('display.max_rows', 50000)
pd.set_option('display.width', 100000)
print(shared_BM_NL_Q2_DNS_df)

shared_BM_NL_Q2_DNS_df.to_csv("shared_BM_NL_Q2_DNS.csv")
shared_BM_NL_Q2_HTTP_FDTT_DL_df.to_csv("shared_BM_NL_Q2_HTTP_FDTT_DL.csv")
shared_BM_NL_Q2_HTTP_FDTT_UL_df.to_csv("shared_BM_NL_Q2_HTTP_FDTT_UL.csv")
shared_BM_NL_Q2_HTTP_File_DL_excel_df.to_csv("shared_BM_NL_Q2_HTTP_File_DL_excel.csv")
shared_BM_NL_Q2_HTTP_File_UL_df.to_csv("shared_BM_NL_Q2_HTTP_File_UL.csv")
shared_BM_NL_Q2_HTTP_Live_df.to_csv("shared_BM_NL_Q2_HTTP_Live.csv")
shared_BM_NL_Q2_HTTP_Static_df.to_csv("shared_BM_NL_Q2_HTTP_Static.csv")
shared_BM_NL_Q2_Ping_40_df.to_csv("shared_BM_NL_Q2_Ping_40.csv")
shared_BM_NL_Q2_Ping_800_df.to_csv("shared_BM_NL_Q2_Ping_800.csv")
shared_BM_NL_Q2_Ping_CDN_df.to_csv("shared_BM_NL_Q2_Ping_CDN.csv")
shared_BM_NL_Q2_Youtube_df.to_csv("shared_BM_NL_Q2_Youtube.csv")
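
The eleven blocks above differ only in the input workbook and the column range, and each workbook is opened three times, once per sheet. Here is a minimal loop-based sketch of the same job, assuming the file names, sheet names, and column ranges shown above; passing the whole sheet list to pd.read_excel makes pandas open and parse each workbook only once, which already removes a good share of the reading time:

import pandas as pd

sheet_names = ['client31_KPN', 'client32_T-Mobile', 'client33_Vodafone']

# One entry per workbook: path -> column range. All three sheets of a given
# workbook use the same range above. (The two Voice workbooks are left out
# here because the original code never reads them.)
workbooks = {
    'Shared_BM_NL_Q2_DNS.xlsx': 'A:AB',
    'Shared_BM_NL_Q2_HTTP_FDTT_DL.xlsx': 'A:DZ',
    'Shared_BM_NL_Q2_HTTP_FDTT_UL.xlsx': 'A:DP',
    'Shared_BM_NL_Q2_HTTP_File_DL.xlsx': 'A:CU',
    'Shared_BM_NL_Q2_HTTP_File_UL.xlsx': 'A:CU',
    'Shared_BM_NL_Q2_HTTP_Live.xlsx': 'A:EP',
    'Shared_BM_NL_Q2_HTTP_Static.xlsx': 'A:DQ',
    'Shared_BM_NL_Q2_Ping_40.xlsx': 'A:AJ',
    'Shared_BM_NL_Q2_Ping_800.xlsx': 'A:AJ',
    'Shared_BM_NL_Q2_Ping_CDN.xlsx': 'A:AJ',
    'Shared_BM_NL_Q2_Youtube.xlsx': 'A:CA',
}

def renamer(c):
    # Same header cleanup as in the question.
    return c.strip().split(' ')[-1].lower()

for path, cols in workbooks.items():
    # sheet_name=<list> returns {sheet name: DataFrame} and opens/parses the
    # workbook once, instead of once per sheet.
    sheets = pd.read_excel(path, sheet_name=sheet_names, usecols=cols)
    df = pd.concat(sheets.values(), ignore_index=True).rename(columns=renamer)
    # index=False skips the extra index column; drop it if you want the
    # index written out, as in the original to_csv calls.
    df.to_csv(path.replace('.xlsx', '.csv'), index=False)

This does not change what pandas does per cell, though, so for a much larger speedup the reads themselves have to be parallelized or the data moved off the xlsx format.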
This code runs fine and does exactly what I want, but it takes a long time, about 30 minutes. I think most of that time goes into reading the data with pandas, presumably because of the size of the data.

So what I need now is a better way to run this code so that the whole process is faster.


I hope someone can give me some good suggestions, and I am sorry if anything is unclear.

Dask is a big-data tool that is very similar to pandas. It has many of the most commonly used methods, but not all of them. If processing time or memory is your main concern, I highly recommend looking into that library. I have used Dask successfully on projects with 20+ million rows and 50+ columns, with a big improvement in performance/speed while using a fraction of the memory. There is a bit of a learning curve, but if you often deal with data that pandas cannot handle, it is not bad at all and well worth it.

@DavidErickson thanks for the suggestion, I will take a look at the reference you sent; I may post an answer soon :D
Thanks bro :D, check out this thread.
@DavidErickson thank you very much :D, this is a very helpful reference; it is the first time I have heard of this Dask library :)
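
To make the answer's suggestion concrete: dask.dataframe has no Excel reader, so the usual pattern is to wrap pd.read_excel in dask.delayed and let Dask run the per-sheet reads in parallel. A minimal sketch, assuming the same file and sheet names as in the question:

import dask
import pandas as pd

sheet_names = ['client31_KPN', 'client32_T-Mobile', 'client33_Vodafone']

@dask.delayed
def load_sheet(path, sheet, cols):
    # Each call only builds a lazy task; nothing is read until dask.compute().
    return pd.read_excel(path, sheet_name=sheet, usecols=cols)

tasks = [load_sheet('Shared_BM_NL_Q2_DNS.xlsx', ws, 'A:AB') for ws in sheet_names]

# scheduler='processes' sidesteps the GIL; openpyxl's parsing is mostly
# pure Python, so threads alone would not help much here.
frames = dask.compute(*tasks, scheduler='processes')
shared_BM_NL_Q2_DNS_df = pd.concat(frames, ignore_index=True)

The same task list can be extended to cover all eleven workbooks at once, so the files themselves are also read in parallel.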