Python 递归子文件夹搜索并返回列表中的文件_Python_List_Recursion_Os.walk

Python 递归子文件夹搜索并返回列表中的文件

python list recursion

Python 递归子文件夹搜索并返回列表中的文件,python,list,recursion,os.walk,Python,List,Recursion,Os.walk,我正在编写一个脚本，以递归方式遍历主文件夹中的子文件夹，并根据特定的文件类型生成一个列表。我对剧本有意见。其当前设置如下： for root, subFolder, files in os.walk(PATH): for item in files: if item.endswith(".txt") : fileNamePath = str(os.path.join(root,subFolder,item)) 问题是subFolder变量正在拉入

我正在编写一个脚本，以递归方式遍历主文件夹中的子文件夹，并根据特定的文件类型生成一个列表。我的脚本遇到了一个问题。其当前设置如下：

# os.walk yields one (dirpath, dirnames, filenames) tuple per directory, so
# subFolder is the list of sub-directory names, not the folder holding `item`.
for root, subFolder, files in os.walk(PATH):
    for item in files:
        if item.endswith(".txt") :
            # BUG (the subject of this question): subFolder is a list, which
            # os.path.join cannot accept as a path component. The file's
            # path is os.path.join(root, item).
            fileNamePath = str(os.path.join(root,subFolder,item))

问题是subFolder变量正在拉入子文件夹列表，而不是项文件所在的文件夹。我之前曾想过为子文件夹运行for循环，并加入路径的第一部分，但我想我会仔细检查一下，看看在此之前是否有人有任何建议。谢谢你的帮助

您应该使用 dirpath，也就是您代码中称为 root 的变量。
之所以同时提供 dirnames，是为了在存在不希望 os.walk 递归进入的文件夹时，可以对其进行修剪。
import os
# Full path of every file below PATH whose extension is exactly '.txt'.
result = [
    os.path.join(dirpath, filename)
    for dirpath, _dirnames, filenames in os.walk(PATH)
    for filename in filenames
    if os.path.splitext(filename)[1] == '.txt'
]

编辑：
在最近收到一次反对票之后，我意识到 glob 是按扩展名筛选文件的更好工具。
import os
from glob import glob
# Run a '*.txt' glob inside every directory that os.walk visits.
result = [
    match
    for dirpath, _dirnames, _filenames in os.walk(PATH)
    for match in glob(os.path.join(dirpath, '*.txt'))
]

还有一个生成器版本
from itertools import chain
# Lazy variant: the per-directory globs are only evaluated as `result` is consumed.
result = chain.from_iterable(
    glob(os.path.join(dirpath, '*.txt'))
    for dirpath, _dirnames, _filenames in os.walk('.')
)

Edit2 for Python 3.4+
from pathlib import Path
# The character classes make the extension match case-insensitively (.txt, .TXT, .Txt, ...).
result = [*Path(".").rglob("*.[tT][xX][tT]")]

在 Python 3.5 中更改：glob.glob() 支持使用“**”的递归通配。
如果要获取 my_path 下的每个 .txt 文件（递归包括子目录）：
import glob
files = glob.glob(my_path + '/**/*.txt', recursive=True)

如果您需要迭代器，可以使用 iglob 作为替代：
# '**' only descends into sub-directories when recursive=True, and the
# pattern has to name the files wanted. iglob yields the matches lazily
# instead of building the whole list first.
for file in glob.iglob(my_path + '/**/*.txt', recursive=True):
    ...  # process each matching path here

这并不是最符合Python的答案，但我将把它放在这里是为了好玩，因为这是一堂关于递归的精巧课程
def find_files(files, dirs=(), extensions=()):
    """Append to *files* every path under *dirs* whose extension is in *extensions*.

    The search is breadth-first: each call lists every path of the current
    level, then recurses once with all the entries it discovered.

    Args:
        files: List that receives the matching paths (mutated in place).
        dirs: Paths to examine; directories are listed, anything else is
            treated as a candidate file.
        extensions: Extensions to keep, including the leading dot
            (for example ``['.mid', '.txt']``). Matching is case-sensitive.

    Returns:
        None. Results are delivered through *files*.
    """
    new_dirs = []
    for d in dirs:
        try:
            new_dirs += [os.path.join(d, f) for f in os.listdir(d)]
        except OSError:
            # listdir fails for plain files (the expected case) but also for
            # directories that cannot be read; only the former are results.
            if os.path.splitext(d)[1] in extensions and not os.path.isdir(d):
                files.append(d)

    # Descend one level; stop once a level produced nothing to look into.
    if new_dirs:
        find_files(files, new_dirs, extensions)

在我的机器上有两个文件夹，root
和root2

mender@multivax ]ls -R root root2
root:
temp1 temp2

root/temp1:
temp1.1 temp1.2

root/temp1/temp1.1:
f1.mid

root/temp1/temp1.2:
f.mi  f.mid

root/temp2:
tmp.mid

root2:
dummie.txt temp3

root2/temp3:
song.mid

假设我想在这两个目录中找到所有.txt
和所有.mid
文件，那么我就可以这样做了
files = []
find_files( files, dirs=['root','root2'], extensions=['.mid','.txt'] )
print(files)

#['root2/dummie.txt',
# 'root/temp2/tmp.mid',
# 'root2/temp3/song.mid',
# 'root/temp1/temp1.1/f1.mid',
# 'root/temp1/temp1.2/f.mid']

我把上面的列表推导式改写成嵌套的 for 循环，以防其他人理解它有困难
result = [y for x in os.walk(PATH) for y in glob(os.path.join(x[0], '*.txt'))]

应等同于：
import glob
import os

result = []

# os.walk yields one (dirpath, dirnames, filenames) tuple per directory,
# so x[0] is the path of the directory currently being visited.
for x in os.walk(PATH):
    # Keep only the '*.txt' matches located directly inside that directory.
    for y in glob.glob(os.path.join(x[0], '*.txt')):
        result.append(y)

以下是 os.walk 和 glob.glob 函数的文档。
新的pathlib
库将其简化为一行：
from pathlib import Path
# rglob(pattern) is the same search as glob('**/' + pattern).
result = list(Path(PATH).rglob('*.txt'))

您还可以使用生成器版本：
from pathlib import Path
for file in Path(PATH).glob('**/*.txt'):
    pass

这将返回 Path
对象，您可以将其用于几乎任何事情，也可以通过 file.name 以字符串形式获取文件名。
recursive 参数是 Python 3.5 中新增的，因此它在 Python 2.7 上不起作用。下面是一个使用 r
字符串（原始字符串）的示例，因此您只需按原样提供路径即可（Win、Lin 等）：
import glob
import os

mypath=r"C:\Users\dj\Desktop\nba"

# Build the pattern with os.path.join so the separator matches the host OS;
# a hard-coded r'\**\*.py' suffix only works on Windows.
files = glob.glob(os.path.join(mypath, '**', '*.py'), recursive=True)
# print(files) # as list
for f in files:
    print(f) # nice looking single line per file

注意：它将列出所有文件，无论文件的深度有多深。
此函数将仅递归地将文件放入列表中
import os


def ls_files(dir):
    """Return the paths of all files found beneath *dir*, recursing into sub-directories.

    Each returned path is *dir* joined with the entry's relative location,
    so it is absolute only when *dir* itself is absolute.
    """
    files = list()
    for item in os.listdir(dir):
        abspath = os.path.join(dir, item)
        try:
            if os.path.isdir(abspath):
                # extend() avoids re-copying the accumulated list at every level.
                files.extend(ls_files(abspath))
            else:
                files.append(abspath)
        except FileNotFoundError as err:
            # A sub-directory vanished between the isdir check and its listing.
            print('invalid directory\n', 'Error:', err)
    return files
您可以通过这种方式返回绝对路径文件列表。
def list_files_recursive(path):
    """
    Return the path of every file found beneath *path*.

    Each entry is os.path.join(dirpath, filename) as produced by os.walk, so
    the entries are absolute only when *path* itself is absolute.
    """
    import os
    files = []
    # r = root, d = directories, f = files
    for r, d, f in os.walk(path):
        for file in f:
            files.append(os.path.join(r, file))
    return files


if __name__ == '__main__':
    result = list_files_recursive('/tmp')
    print(result)
如果您不介意安装一个额外的轻量级库，可以执行以下操作：
pip install plazy

用法：
import plazy

def txt_filter(x):
    """Return True when *x* ends with '.txt', otherwise False."""
    return bool(x.endswith('.txt'))
files = plazy.list_files(root='data', filter_func=txt_filter, is_include_root=True)

结果应该如下所示：
['data/a.txt', 'data/b.txt', 'data/sub_dir/c.txt']

它可以在Python2.7和Python3上工作
Github：
免责声明：我是plazy
的作者，这似乎是我能想出的最快的解决方案，比os.walk
快，比任何glob
解决方案快得多

它还将为您提供一个基本上免费的所有嵌套子文件夹的列表
您可以搜索几个不同的扩展名
通过将f.path
更改为f.name
（不要更改子文件夹！），您还可以选择返回完整路径或仅返回文件名

参数：dir:str，ext:list


函数返回两个列表：子文件夹、文件

有关详细的速度分析，请参见下文
def run_fast_scandir(dir, ext):    # dir: str, ext: list
    """Recursively collect sub-folders and files with a wanted extension.

    Args:
        dir: Directory to scan.
        ext: Extensions to keep, lower-case and including the leading dot
            (for example ``[".jpg"]``); file extensions are lower-cased
            before the comparison.

    Returns:
        A ``(subfolders, files)`` tuple of path lists covering *dir* and
        everything nested below it.
    """
    subfolders, files = [], []

    # The context manager releases the directory handle even if an entry
    # raises part-way through the scan.
    with os.scandir(dir) as entries:
        for f in entries:
            if f.is_dir():
                subfolders.append(f.path)
            elif f.is_file() and os.path.splitext(f.name)[1].lower() in ext:
                files.append(f.path)

    # Iterate over a snapshot: subfolders grows while nested results are merged in.
    for subfolder in list(subfolders):
        sf, nested_files = run_fast_scandir(subfolder, ext)
        subfolders.extend(sf)
        files.extend(nested_files)
    return subfolders, files


subfolders, files = run_fast_scandir(folder, [".jpg"])

如果需要文件大小，还可以创建一个size
列表，并添加f.stat（）.st_size
，如下所示，以显示MiB：
sizes.append(f"{f.stat().st_size/1024/1024:.0f} MiB")



速度分析
用于获取所有子文件夹和主文件夹中具有特定文件扩展名的所有文件的各种方法
TL;DR：

fast_scandir
显然胜出，速度是除os.walk之外的所有其他解决方案的两倍
os.walk
排名第二，速度稍慢
使用glob
将大大降低过程的速度
所有结果均未使用自然排序。这意味着结果将按如下方式排序：1、10、2。要获得自然排序（1、2、10），请查看



**结果:**
测试使用W7x64、Python 3.8.1和20次运行完成。439（部分嵌套）子文件夹中的16596个文件。

find_files
来自 https://stackoverflow.com/a/45646357/2441026，允许您搜索多个扩展名。

fast_scandir
是我自己编写的，它还将返回子文件夹列表。您可以给它一个要搜索的扩展名列表（我把只含一个条目的列表与简单的 if ... == ".jpg" 做了对比测试，没有显著差异）


#-*-编码：utf-8-*-
#Python 3
导入时间
导入操作系统
从glob导入glob，iglob
从pathlib导入路径
目录=r“”
行程=20
def run_os_walk（）：
a=时间。时间
对于范围内的i（运行）：
fu=[os.path.join（dp，f）表示dp，dn，os.walk（目录）中的文件名表示f，如果
os.path.splitext（f）[1]。低
sizes.append(f"{f.stat().st_size/1024/1024:.0f} MiB")

fast_scandir    took  499 ms. Found files: 16596. Found subfolders: 439
os.walk         took  589 ms. Found files: 16596
find_files      took  919 ms. Found files: 16596
glob.iglob      took  998 ms. Found files: 16596
glob.glob       took 1002 ms. Found files: 16596
pathlib.rglob   took 1041 ms. Found files: 16596
os.walk-glob    took 1043 ms. Found files: 16596

# -*- coding: utf-8 -*-
# Python 3


import time
import os
from glob import glob, iglob
from pathlib import Path


directory = r"<folder>"
RUNS = 20


def run_os_walk():
    """Time an os.walk comprehension collecting .jpg paths under `directory`.

    Repeats the scan RUNS times and prints the mean duration per run in
    milliseconds together with the number of files found.
    """
    # perf_counter_ns is monotonic and high-resolution; the wall clock read by
    # time_ns can jump if the system time is adjusted during the benchmark.
    a = time.perf_counter_ns()
    for i in range(RUNS):
        fu = [os.path.join(dp, f) for dp, dn, filenames in os.walk(directory) for f in filenames if
                  os.path.splitext(f)[1].lower() == '.jpg']
    print(f"os.walk\t\t\ttook {(time.perf_counter_ns() - a) / 1000 / 1000 / RUNS:.0f} ms. Found files: {len(fu)}")


def run_os_walk_glob():
    """Time globbing '*.jpg' in every directory that os.walk visits under `directory`.

    Repeats the scan RUNS times and prints the mean duration per run in
    milliseconds together with the number of files found.
    """
    # Monotonic performance counter rather than the adjustable wall clock.
    a = time.perf_counter_ns()
    for i in range(RUNS):
        fu = [y for x in os.walk(directory) for y in glob(os.path.join(x[0], '*.jpg'))]
    print(f"os.walk-glob\ttook {(time.perf_counter_ns() - a) / 1000 / 1000 / RUNS:.0f} ms. Found files: {len(fu)}")


def run_glob():
    """Time a recursive glob.glob('**/*.jpg') search under `directory`.

    Repeats the search RUNS times and prints the mean duration per run in
    milliseconds together with the number of files found.
    """
    # Monotonic performance counter rather than the adjustable wall clock.
    a = time.perf_counter_ns()
    for i in range(RUNS):
        fu = glob(os.path.join(directory, '**', '*.jpg'), recursive=True)
    print(f"glob.glob\t\ttook {(time.perf_counter_ns() - a) / 1000 / 1000 / RUNS:.0f} ms. Found files: {len(fu)}")


def run_iglob():
    """Time a recursive glob.iglob('**/*.jpg') search under `directory`.

    The iterator is materialised with list() so the whole search is timed.
    Repeats the search RUNS times and prints the mean duration per run in
    milliseconds together with the number of files found.
    """
    # Monotonic performance counter rather than the adjustable wall clock.
    a = time.perf_counter_ns()
    for i in range(RUNS):
        fu = list(iglob(os.path.join(directory, '**', '*.jpg'), recursive=True))
    print(f"glob.iglob\t\ttook {(time.perf_counter_ns() - a) / 1000 / 1000 / RUNS:.0f} ms. Found files: {len(fu)}")


def run_pathlib_rglob():
    """Time Path(directory).rglob('*.jpg').

    Repeats the search RUNS times and prints the mean duration per run in
    milliseconds together with the number of files found.
    """
    # Monotonic performance counter rather than the adjustable wall clock.
    a = time.perf_counter_ns()
    for i in range(RUNS):
        fu = list(Path(directory).rglob("*.jpg"))
    print(f"pathlib.rglob\ttook {(time.perf_counter_ns() - a) / 1000 / 1000 / RUNS:.0f} ms. Found files: {len(fu)}")


def find_files(files, dirs=(), extensions=()):
    """Append to *files* every path under *dirs* whose extension is in *extensions*.

    The search is breadth-first: each call lists every path of the current
    level, then recurses once with all the entries it discovered.

    Args:
        files: List that receives the matching paths (mutated in place).
        dirs: Paths to examine; directories are listed, anything else is
            treated as a candidate file.
        extensions: Lower-case extensions to keep, including the leading dot;
            file extensions are lower-cased before the comparison.

    Returns:
        None. Results are delivered through *files*.
    """
    # https://stackoverflow.com/a/45646357/2441026

    new_dirs = []
    for d in dirs:
        try:
            new_dirs += [os.path.join(d, f) for f in os.listdir(d)]
        except OSError:
            # listdir fails for plain files (the expected case) but also for
            # directories that cannot be read; only the former are results.
            if os.path.splitext(d)[1].lower() in extensions and not os.path.isdir(d):
                files.append(d)

    # Descend one level; stop once a level produced nothing to look into.
    if new_dirs:
        find_files(files, new_dirs, extensions)


def run_fast_scandir(dir, ext):    # dir: str, ext: list
    """Recursively collect sub-folders and files with a wanted extension.

    Args:
        dir: Directory to scan.
        ext: Extensions to keep, lower-case and including the leading dot
            (for example ``[".jpg"]``); file extensions are lower-cased
            before the comparison.

    Returns:
        A ``(subfolders, files)`` tuple of path lists covering *dir* and
        everything nested below it.
    """
    # https://stackoverflow.com/a/59803793/2441026

    subfolders, files = [], []

    # The context manager releases the directory handle even if an entry
    # raises part-way through the scan.
    with os.scandir(dir) as entries:
        for f in entries:
            if f.is_dir():
                subfolders.append(f.path)
            elif f.is_file() and os.path.splitext(f.name)[1].lower() in ext:
                files.append(f.path)

    # Iterate over a snapshot: subfolders grows while nested results are merged in.
    for subfolder in list(subfolders):
        sf, nested_files = run_fast_scandir(subfolder, ext)
        subfolders.extend(sf)
        files.extend(nested_files)
    return subfolders, files



if __name__ == '__main__':
    # These benchmarks time themselves and print their own result line.
    run_os_walk()
    run_os_walk_glob()
    run_glob()
    run_iglob()
    run_pathlib_rglob()


    # find_files and run_fast_scandir are plain search functions, so they are
    # timed here. perf_counter_ns is monotonic and high-resolution, unlike the
    # wall clock read by time_ns.
    a = time.perf_counter_ns()
    for i in range(RUNS):
        files = []
        find_files(files, dirs=[directory], extensions=[".jpg"])
    print(f"find_files\t\ttook {(time.perf_counter_ns() - a) / 1000 / 1000 / RUNS:.0f} ms. Found files: {len(files)}")


    a = time.perf_counter_ns()
    for i in range(RUNS):
        subf, files = run_fast_scandir(directory, [".jpg"])
    print(f"fast_scandir\ttook {(time.perf_counter_ns() - a) / 1000 / 1000 / RUNS:.0f} ms. Found files: {len(files)}. Found subfolders: {len(subf)}")

root = 'C:\\'
subFolder = ['Users', 'ProgramFiles', 'ProgramFiles (x86)', 'Windows', ...]
files = ['foo1.txt', 'foo2.txt', 'foo3.txt', ...]

root = 'C:\\Users\\'
subFolder = ['UserAccount1', 'UserAccount2', ...]
files = ['bar1.txt', 'bar2.txt', 'bar3.txt', ...]

...

import os
# root is the directory being visited, so joining it with each file name
# gives that file's path; subFolder is not needed for this.
for root, subFolder, files in os.walk(PATH):
    for item in files:
        if not item.endswith(".txt"):
            continue
        fileNamePath = os.path.join(root, item)
        print(fileNamePath)

import os
# Print the path of every '.txt' file below RECURSIVE_ROOT, one directory at a time.
for current_dir_path, current_subdirs, current_files in os.walk(RECURSIVE_ROOT):
    txt_names = (name for name in current_files if name.endswith(".txt"))
    for aFile in txt_names:
        txt_file_path = os.path.join(current_dir_path, aFile)
        print(txt_file_path)

import glob
glob.glob('//Mypath/folder/**/*',recursive = True)

glob.glob('//Mypath/folder/**/*.txt',recursive = True)