使用Python循环浏览文件夹并提取Excel列_Python_Excel_Pandas

使用Python循环浏览文件夹并提取Excel列

python excel pandas

使用Python循环浏览文件夹并提取Excel列,python,excel,pandas,Python,Excel,Pandas,您好，我有几个excel表格中的数据，分布在不同的子文件夹中。到目前为止，我已经能够编写一个代码，提取所需的列并将其保存在字典中。代码如下： import os import pandas as pd #Path to file using os FOLDER_PATH = r'C:\Users\Sarah\Desktop\test' def listDir(dir): filenames = os.listdir(dir) for filename in filenames:

您好，我有一些数据保存在多个 Excel 表格中，这些表格分布在不同的子文件夹里。到目前为止，我已经能够编写一段代码，提取所需的列并将其保存到字典中。代码如下：

 import os
 import pandas as pd

# Root folder to inspect; a raw string keeps the backslashes of the Windows path literal.

FOLDER_PATH = r'C:\Users\Sarah\Desktop\test'

def listDir(dir):
    """Print the name and absolute path of every entry directly inside *dir*.

    The listing is not recursive: entries of sub-folders are not visited.
    Sub-folders themselves are printed just like files.

    Args:
        dir: Path of the folder to list. The parameter name shadows the
            ``dir`` builtin but is kept so existing keyword callers still work.

    Raises:
        OSError: Propagated from ``os.listdir`` (for example
            ``FileNotFoundError`` when *dir* does not exist).
    """
    for filename in os.listdir(dir):
        print('File Name:' + filename)
        # Join before abspath so the result is relative to *dir*, not to the
        # current working directory.
        print('folder Path:' + os.path.abspath(os.path.join(dir, filename)))
# Print the name and absolute path of each entry found directly under FOLDER_PATH.
listDir(FOLDER_PATH)

#Display sheets names using pandas

# Let pandas use up to 300 characters per line when printing frames.
pd.set_option('display.width', 300)

# Hard-coded locations of the monthly workbooks.
mosul_file = r'C:\Users\Sarah\Desktop\test\Months\March.xlsx'
mosul_file2 = r'C:\Users\Sarah\Desktop\test\Months\April.xlsx'
mosul_file3 = r'C:\Users\Sarah\Desktop\test\Months\May.xlsx'
mosul_file7 = r'C:\Users\Sarah\Desktop\test\Months\July.xlsx'

# Open each workbook once so that individual sheets can be parsed from it.
xl = pd.ExcelFile(mosul_file)
xl2 = pd.ExcelFile(mosul_file2)
xl3 = pd.ExcelFile(mosul_file3)
xl7 = pd.ExcelFile(mosul_file7)

# Parse one sheet per workbook, using the first three columns as the row index.
# These statements must start at column 0: a stray leading space makes Python
# reject them with "IndentationError: unexpected indent".
# NOTE(review): March takes its header from row 1 and July is read from sheet
# index 1, while the other workbooks use header row 0 of sheet 0 - confirm
# that this difference is intentional.
mosul_df = xl.parse(0, header=[1], index_col=[0, 1, 2])
mosul_df2 = xl2.parse(0, header=[0], index_col=[0, 1, 2])
mosul_df3 = xl3.parse(0, header=[0], index_col=[0, 1, 2])
mosul_df7 = xl7.parse(1, header=[0], index_col=[0, 1, 2])


#Read Excel and Select columns

# Read the first sheet of each monthly workbook, keeping only Excel columns
# C, F and G; cells holding the text 'NA' are treated as missing values.
# The keyword is ``index_col`` (the misspelling ``index_clo`` is not a
# read_excel parameter); None means no column is used as the row index.
# Each mosul_file* name is bound to the resulting DataFrame.
mosul_file = pd.read_excel(r'C:\Users\Sarah\Desktop\test\Months\March.xlsx', sheet_name=0,
                           index_col=None, na_values=['NA'], usecols="C , F ,G")
mosul_file2 = pd.read_excel(r'C:\Users\Sarah\Desktop\test\Months\April.xlsx', sheet_name=0,
                            index_col=None, na_values=['NA'], usecols="C , F , G")
mosul_file3 = pd.read_excel(r'C:\Users\Sarah\Desktop\test\Months\May.xlsx', sheet_name=0,
                            index_col=None, na_values=['NA'], usecols="C , F , G")
mosul_file7 = pd.read_excel(r'C:\Users\Sarah\Desktop\test\Months\July.xlsx', sheet_name=0,
                            index_col=None, na_values=['NA'], usecols="C, F, G")

#Remove NaN values

# Convert every column to numbers (values that cannot be parsed become NaN),
# then drop each row that contains at least one NaN.
# The two steps are chained so that dropna() works on the coerced frame;
# calling dropna() on the raw frame would throw the numeric conversion away.
# The July frame is built from mosul_file7, not from the May data.
data_mosul_df = mosul_file.apply(pd.to_numeric, errors='coerce').dropna()
data_mosul_df2 = mosul_file2.apply(pd.to_numeric, errors='coerce').dropna()
data_mosul_df3 = mosul_file3.apply(pd.to_numeric, errors='coerce').dropna()
data_mosul_df7 = mosul_file7.apply(pd.to_numeric, errors='coerce').dropna()

#Save to Dictionary

# Turn each cleaned frame into a plain dictionary via DataFrame.to_dict(),
# unpacking the four results into their respective names in one statement.
datamosul1, datamosul2, datamosul3, datamosul7 = (
    cleaned.to_dict()
    for cleaned in (data_mosul_df, data_mosul_df2, data_mosul_df3, data_mosul_df7)
)

如何使其自动化，使其在所有文件夹和子文件夹中循环？

谢谢

如果我理解正确，您希望从文件夹和子文件夹中获取所有文件名。我希望下面的代码适合您，请设置根文件夹的路径

from os import walk

# Root folder to scan, relative to the current working directory.
path = './test'

# Flatten the per-directory file lists produced by walk() into one list of
# bare file names (the directory part is not included).
my_files = [name for _root, _subdirs, names in walk(path) for name in names]

print(my_files)

谢谢。抱歉，我的问题表述得不够清楚。其实我想问的是：如何把第 2 步（使用 pandas 显示工作表名称）到第 5 步（保存到字典）这几个步骤自动化，因为现在我只是在复制/粘贴文件路径。非常感谢。可以参考这样的做法：使用 pathlib 并加上一些错误处理来读取多个工作簿和工作表。再次感谢，第一部分可以正常运行，它能打印出文件名，但循环似乎有问题。错误信息为 FileNotFoundError: [Errno 2] No such file or directory（没有此类文件或目录）：

import os
from os import walk

import pandas as pd

# Root folder to scan, relative to the current working directory.
path = './Results'

# Collect every file below `path`, including files in sub-folders.
# walk() yields bare file names, so each one is joined with the directory it
# was found in; a bare name can only be opened when the file happens to sit
# in the current working directory, which is what triggers
# "FileNotFoundError: [Errno 2] No such file or directory" later on.
my_files = []
for (dirpath, dirnames, filenames) in walk(path):
    my_files.extend(os.path.join(dirpath, filename) for filename in filenames)

print(my_files)

# One dictionary per processed workbook, in the order of my_files.
all_dicts_list = []
for file_name in my_files:
    # Read the first sheet, keeping only Excel columns C, F and G; cells
    # holding the text 'NA' are treated as missing values. The keyword is
    # ``index_col`` (``index_clo`` is not a read_excel parameter).
    mosul_file = pd.read_excel(file_name, sheet_name=0,
                               index_col=None, na_values=['NA'], usecols="C , F ,G")

    # Convert to numbers (unparseable values become NaN) and then drop rows
    # containing NaN. Chaining keeps the conversion; calling dropna() on the
    # raw frame would discard it.
    data_mosul_df = mosul_file.apply(pd.to_numeric, errors='coerce').dropna()

    # Store the cleaned frame as a plain dictionary.
    datamosul1 = data_mosul_df.to_dict()
    all_dicts_list.append(datamosul1)


#all dictionaries will be in all_dicts_list