Python glob,但它是一个字符串列表,而不是文件系统
我希望能够将格式模式与字符串列表相匹配,而不是与文件系统中的实际文件相匹配。有什么方法可以做到这一点,或者将Python glob,但它是一个字符串列表,而不是文件系统,python,regex,python-2.7,glob,Python,Regex,Python 2.7,Glob,我希望能够将格式模式与字符串列表相匹配,而不是与文件系统中的实际文件相匹配。有什么方法可以做到这一点,或者将glob模式轻松转换为正则表达式吗?没关系,我找到了。我需要模块。虽然可以直接用于检查模式是否与文件名匹配,但也可以使用fnmatch.translate方法从给定的fnmatch模式生成正则表达式: >>> import fnmatch >>> fnmatch.translate('*.txt') '.*\\.txt\\Z(?ms)' 从: fnma
glob
模式轻松转换为正则表达式吗?没关系,我找到了。我需要的是fnmatch模块。虽然fnmatch.fnmatch可以直接用于检查模式是否与文件名匹配,但也可以使用fnmatch.translate
方法从给定的fnmatch
模式生成正则表达式:
>>> import fnmatch
>>> fnmatch.translate('*.txt')
'.*\\.txt\\Z(?ms)'
从:
fnmatch.translate(模式)
返回转换为正则表达式的shell样式模式
glob
模块对单个路径元素使用fnmatch模块
这意味着路径被分为目录名和文件名,如果目录名包含元字符(包含任何字符[
,*
或?
),则这些字符将递归展开
如果您有一个简单文件名字符串列表,那么只需使用fnmatch.filter()即可:
import fnmatch
matching = fnmatch.filter(filenames, pattern)
但如果它们包含完整路径,则需要做更多的工作,因为生成的正则表达式不考虑路径段(通配符不排除分隔符,也不针对跨平台路径匹配进行调整)
您可以根据这些路径构造一个简单的trie(前缀树),然后用您的模式去匹配这棵trie:
import fnmatch
import glob
import os.path
from itertools import product
# Cross-Python dictionary views on the keys
if hasattr(dict, 'viewkeys'):
# Python 2
def _viewkeys(d):
return d.viewkeys()
else:
# Python 3
def _viewkeys(d):
return d.keys()
def _in_trie(trie, path):
"""Determine if path is completely in trie"""
current = trie
for elem in path:
try:
current = current[elem]
except KeyError:
return False
return None in current
def find_matching_paths(paths, pattern):
    """Produce a list of paths that match the pattern.

    * paths is a list of strings representing filesystem paths
    * pattern is a glob pattern as supported by the fnmatch module

    The pattern is applied one path element at a time, so a wildcard never
    spans a path separator.  A path matches only if it has exactly as many
    elements as the pattern.  Returned paths are the normalised form used
    internally (``os.altsep`` replaced by ``os.sep``, drive stripped), each
    distinct path appearing once.

    """
    if os.altsep:  # normalise
        pattern = pattern.replace(os.altsep, os.sep)
    subpatterns = pattern.split(os.sep)

    # Build a trie out of path elements so shared prefixes are searched once.
    # The key None marks a node at which one of the input paths ends.
    path_trie = {}
    for path in paths:
        if os.altsep:  # normalise
            path = path.replace(os.altsep, os.sep)
        _, path = os.path.splitdrive(path)
        node = path_trie
        for elem in path.split(os.sep):
            node = node.setdefault(elem, {})
        node.setdefault(None, None)  # sentinel

    # Walk the trie level by level, keeping (elements so far, node) pairs.
    frontier = [((), path_trie)]
    for subpattern in subpatterns:
        advanced = []
        if not glob.has_magic(subpattern):
            # Plain element: it must be present verbatim at this level.
            for prefix, node in frontier:
                if subpattern in node:
                    advanced.append((prefix + (subpattern,), node[subpattern]))
        else:
            for prefix, node in frontier:
                # The None sentinel is not a name; handing it to
                # fnmatch.filter would raise TypeError.
                names = [name for name in node if name is not None]
                for name in fnmatch.filter(names, subpattern):
                    advanced.append((prefix + (name,), node[name]))
        if not advanced:
            # Nothing reachable at this level: no path can match.
            return []
        frontier = advanced

    # Only nodes carrying the sentinel correspond to complete input paths.
    return [os.sep.join(prefix) for prefix, node in frontier if None in node]
这一大段代码可以快速找到匹配项,并且支持在路径的任意位置使用glob通配符:
>>> paths = ['/foo/bar/baz', '/spam/eggs/baz', '/foo/bar/bar']
>>> find_matching_paths(paths, '/foo/bar/*')
['/foo/bar/baz', '/foo/bar/bar']
>>> find_matching_paths(paths, '/*/bar/b*')
['/foo/bar/baz', '/foo/bar/bar']
>>> find_matching_paths(paths, '/*/[be]*/b*')
['/foo/bar/baz', '/foo/bar/bar', '/spam/eggs/baz']
在Python 3.4+上,您可以直接使用PurePath.match。在Python 3.3或更早版本(包括2.x)上,可以从PyPI获取pathlib。请注意,要获得独立于平台的结果(这将取决于运行此操作的原因),您需要明确声明
PurePosixPath
或PureWindowsPath
优秀的艺术家复制;伟大的艺术家窃取。
我偷了;)
fnmatch.translate
将glob中的?
和*
分别翻译为正则表达式的.
和.*
。我对它做了调整,使这两个通配符不再匹配路径分隔符/
import re
def glob2re(pat):
    """Translate a shell PATTERN to a regular expression.

    Works like ``fnmatch.translate`` except that ``*`` and ``?`` are
    translated to ``[^/]*`` and ``[^/]`` so they never match a ``/``.
    Bracket expressions are copied through (``[!...]`` becomes ``[^...]``);
    an unterminated ``[`` is treated as a literal bracket.

    There is no way to quote meta-characters.

    Returns the regular expression as a string, intended for ``re.match``:
    it starts with the inline flags ``(?ms)`` and ends with ``\\Z`` so the
    whole input must match.
    """
    i, n = 0, len(pat)
    parts = []
    while i < n:
        c = pat[i]
        i += 1
        if c == '*':
            parts.append('[^/]*')
        elif c == '?':
            parts.append('[^/]')
        elif c == '[':
            # Find the closing bracket; a leading '!' and a ']' placed
            # first inside the brackets belong to the set itself.
            j = i
            if j < n and pat[j] == '!':
                j += 1
            if j < n and pat[j] == ']':
                j += 1
            while j < n and pat[j] != ']':
                j += 1
            if j >= n:
                # No closing bracket: the '[' is an ordinary character.
                parts.append('\\[')
            else:
                stuff = pat[i:j].replace('\\', '\\\\')
                i = j + 1
                if stuff[0] == '!':
                    stuff = '^' + stuff[1:]
                elif stuff[0] == '^':
                    # A literal leading '^' must not negate the set.
                    stuff = '\\' + stuff
                parts.append('[%s]' % stuff)
        else:
            parts.append(re.escape(c))
    # Global inline flags must come first: Python 3.11+ rejects them
    # anywhere else in the pattern.
    return '(?ms)' + ''.join(parts) + '\\Z'
本页出现的glob模式和字符串均通过了测试
# Each glob pattern maps to the candidate strings it is tried against.
pat_dict = {
    'a/b/*/f.txt': [
        'a/b/c/f.txt',
        'a/b/q/f.txt',
        'a/b/c/d/f.txt',
        'a/b/c/d/e/f.txt',
    ],
    '/foo/bar/*': ['/foo/bar/baz', '/spam/eggs/baz', '/foo/bar/bar'],
    '/*/bar/b*': ['/foo/bar/baz', '/foo/bar/bar'],
    '/*/[be]*/b*': ['/foo/bar/baz', '/foo/bar/bar'],
    '/foo*/bar': ['/foolicious/spamfantastic/bar', '/foolicious/bar'],
}

# Print every pattern with its candidates and whatever glob_filter keeps.
for pat, strings in pat_dict.items():
    print('pattern :\t{}\nstrings :\t{}'.format(pat, strings))
    print('matched :\t{}\n'.format(list(glob_filter(strings, pat))))
@Veedrac answer的扩展,可应用于字符串列表:
# Python 3.4+
from pathlib import Path
path_list = ["foo/bar.txt", "spam/bar.txt", "foo/eggs.txt"]
# Convert each string to pathlib.PosixPath / .WindowsPath, then apply PurePath.match to the list
print([p for p in path_list if Path(p).match("ba*")])  # "*ba*" also works
# output: ['foo/bar.txt', 'spam/bar.txt']
print([p for p in path_list if Path(p).match("*o/ba*")])
# output: ['foo/bar.txt']
最好使用
pathlib.PurePath()而不是pathlib.Path()
,因为这样您就不必担心底层文件系统。我想添加对递归glob模式的支持,即things/**/*.py
和相对路径匹配,因此example*.py
与folder/example_stuff.py
不匹配
我的做法如下:
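from os import path
import re
def recursive_glob_filter(files, glob):
    # Convert to regex and add start of line match
    pattern_re = '^' + fnmatch_translate(glob)
    # fnmatch does not escape path separators, so escape them
    if path.sep in pattern_re and not r'\{}'.format(path.sep) in pattern_re:
        pattern_re = pattern_re.replace('/', r'\/')
    # Replace `*` with one that ignores path separators
    sep_respecting_wildcard = '[^\{}]*'.format(path.sep)
    pattern_re = pattern_re.replace('.*', sep_respecting_wildcard)
    # And now for `**` we have `[^\/]*[^\/]*`, so replace that with `.*`
    # to match all patterns in-between
    pattern_re = pattern_re.replace(2 * sep_respecting_wildcard, '.*')
    compiled_re = re.compile(pattern_re)
    return filter(compiled_re.search, files)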
这是一个可以处理转义标点符号的glob。它不会停在路径分隔符上。我在这里发布它是因为它与问题的标题匹配
要在列表中使用:
# Wrap the translated glob in (?s:...) so wildcards can match newlines, and
# anchor with \Z so the whole string has to match; compile it once.
rex = re.compile(r'(?s:%s)\Z' % glob_to_re(glob_pattern))
matches = [name for name in names if rex.match(name) is not None]
代码如下:
import re as _re
class GlobSyntaxError(SyntaxError):
    """Error raised for a glob pattern that cannot be parsed."""
def glob_to_re(pattern):
    r"""
    Convert the glob ``pattern`` (a unicode string) to a regular expression.

    The result is returned as a string without anchors or flags; callers
    add those themselves.  ``*`` becomes ``.*`` and ``?`` becomes ``.``, so
    neither stops at ``/``: forward slashes and escaped backslashes are
    ordinary characters here, not path boundaries.

    Any of the special characters * ? [ ! - ] \ loses its special meaning
    when preceded by a backslash ('\').

    Grammar accepted (earlier alternatives win):

        pattern = item*
        item    = '*'  |  '?'  |  '[!' set ']'  |  '[' set ']'  |  literal
        set     = element element*
        element = literal '-' literal  |  literal
        literal = '\' char  |  char other than \  [  ] and sometimes -

    Brace alternation ("{a,b...}") is not supported.

    Raises GlobSyntaxError for an unfinished escape or character set, and
    for an unescaped special character where a literal is required.
    """
    # (The docstring is a raw string so its backslashes are kept as typed.)
    size = len(pattern)

    def require_char(pos, what):
        # Raise the "unfinished" error when the pattern ends at *pos*.
        if pos < size:
            return
        raise GlobSyntaxError(
            "Unfinished %s: %r, position %d." % (what, pattern, pos))

    def convert_literal(pos, what="pattern", forbidden="[]"):
        # Translate one (possibly backslash-escaped) literal character and
        # return (regex text, position after it).
        char = pattern[pos]
        if char == '\\':
            pos += 1
            require_char(pos, "backslashed literal")
        elif char in forbidden:
            raise GlobSyntaxError(
                "Unexpected %r in %s: %r, position %d."
                % (char, what, pattern, pos))
        return _re.escape(pattern[pos]), pos + 1

    def convert_set(pos):
        # Translate a bracketed character set starting at *pos* and return
        # (regex text, position after the closing bracket).
        assert pattern[pos] == '['
        pieces = ["["]
        pos += 1
        try:
            if pattern[pos] == '!':
                pieces.append('^')
                pos += 1
            while True:
                piece, pos = convert_literal(pos, "character set", forbidden="[-]")
                pieces.append(piece)
                if pattern[pos] == '-':
                    # A range: the dash must be followed by its end literal.
                    pieces.append('-')
                    pos += 1
                    require_char(pos, "character set range")
                    piece, pos = convert_literal(pos, "character set range", forbidden="[-]")
                    pieces.append(piece)
                if pattern[pos] == ']':
                    pieces.append(']')
                    return "".join(pieces), pos + 1
        except IndexError:
            # Ran off the end of the pattern inside the set.
            require_char(pos, "character set")  # Trigger "unfinished" error.

    wildcards = {'*': ".*", '?': "."}
    out = []
    pos = 0
    while pos < size:
        char = pattern[pos]
        if char in wildcards:
            piece = wildcards[char]
            pos += 1
        elif char == '[':
            piece, pos = convert_set(pos)
        else:
            piece, pos = convert_literal(pos)
        out.append(piece)
    return "".join(out)
import re as\u re
类GlobSyntaxError(SyntaxError):
通过
def全局到全局(模式):
r”“”
给定的模式(unicode字符串)返回等价的正则表达式。
任何特殊字符*?[!-]\都可以通过在前面加上
模式中的反斜杠('\')。正斜杠('/')和转义
反斜杠(“\\”)被视为普通字符,而不是边界。
这是glob_需要重新理解的语言。
规则中较早的备选方案具有优先权。
模式=项目*
项目=“*”|“?“|”[!“集”]“|”[“集”]”|文字
集合=元素*
元素=文字'-'文字|文字
literal='\'字符\[]以外的字符,有时-
glob_to_re不理解“{a,b…}”。
"""
#(注意:上面的docstring是r“…”以保留反斜杠。)
def expect_char(i,上下文):
如果i>=len(模式):
s=“未完成的%s:%r,位置%d.%(上下文,模式,i)
引发全局语法错误
def literal_to_re(i,context=“pattern”,bad=“[]”):
如果模式[i]='\\':
i+=1
预期字符(i,“反斜杠文字”)
其他:
如果模式[i]处于坏状态:
s=“意外的%r在%s中:%r,位置%d。”\
%(模式[i],上下文,模式,i)
引发全局语法错误
返回_re.escape(模式[i]),i+1
def设置为(i):
断言模式[i]='['
设置_re=“”
i+=1
尝试:
如果模式[i]='!':
设置_re+='^'
i+=1
尽管如此:
lit_re,i=literal_to_re(i,“字符集”,bad=“[-]”)
设置\u re+=点亮\u re
如果模式[i]='-':
设置_re+='-'
i+=1
预期字符(i,“字符集范围”)
# Wrap the translated glob in (?s:...) so wildcards can match newlines, and
# anchor with \Z so the whole string has to match; compile it once.
rex = re.compile(r'(?s:%s)\Z' % glob_to_re(glob_pattern))
matches = [name for name in names if rex.match(name) is not None]
import re as _re
class GlobSyntaxError(SyntaxError):
    """Error raised for a glob pattern that cannot be parsed."""
def glob_to_re(pattern):
    r"""
    Convert the glob ``pattern`` (a unicode string) to a regular expression.

    The result is returned as a string without anchors or flags; callers
    add those themselves.  ``*`` becomes ``.*`` and ``?`` becomes ``.``, so
    neither stops at ``/``: forward slashes and escaped backslashes are
    ordinary characters here, not path boundaries.

    Any of the special characters * ? [ ! - ] \ loses its special meaning
    when preceded by a backslash ('\').

    Grammar accepted (earlier alternatives win):

        pattern = item*
        item    = '*'  |  '?'  |  '[!' set ']'  |  '[' set ']'  |  literal
        set     = element element*
        element = literal '-' literal  |  literal
        literal = '\' char  |  char other than \  [  ] and sometimes -

    Brace alternation ("{a,b...}") is not supported.

    Raises GlobSyntaxError for an unfinished escape or character set, and
    for an unescaped special character where a literal is required.
    """
    # (The docstring is a raw string so its backslashes are kept as typed.)
    size = len(pattern)

    def require_char(pos, what):
        # Raise the "unfinished" error when the pattern ends at *pos*.
        if pos < size:
            return
        raise GlobSyntaxError(
            "Unfinished %s: %r, position %d." % (what, pattern, pos))

    def convert_literal(pos, what="pattern", forbidden="[]"):
        # Translate one (possibly backslash-escaped) literal character and
        # return (regex text, position after it).
        char = pattern[pos]
        if char == '\\':
            pos += 1
            require_char(pos, "backslashed literal")
        elif char in forbidden:
            raise GlobSyntaxError(
                "Unexpected %r in %s: %r, position %d."
                % (char, what, pattern, pos))
        return _re.escape(pattern[pos]), pos + 1

    def convert_set(pos):
        # Translate a bracketed character set starting at *pos* and return
        # (regex text, position after the closing bracket).
        assert pattern[pos] == '['
        pieces = ["["]
        pos += 1
        try:
            if pattern[pos] == '!':
                pieces.append('^')
                pos += 1
            while True:
                piece, pos = convert_literal(pos, "character set", forbidden="[-]")
                pieces.append(piece)
                if pattern[pos] == '-':
                    # A range: the dash must be followed by its end literal.
                    pieces.append('-')
                    pos += 1
                    require_char(pos, "character set range")
                    piece, pos = convert_literal(pos, "character set range", forbidden="[-]")
                    pieces.append(piece)
                if pattern[pos] == ']':
                    pieces.append(']')
                    return "".join(pieces), pos + 1
        except IndexError:
            # Ran off the end of the pattern inside the set.
            require_char(pos, "character set")  # Trigger "unfinished" error.

    wildcards = {'*': ".*", '?': "."}
    out = []
    pos = 0
    while pos < size:
        char = pattern[pos]
        if char in wildcards:
            piece = wildcards[char]
            pos += 1
        elif char == '[':
            piece, pos = convert_set(pos)
        else:
            piece, pos = convert_literal(pos)
        out.append(piece)
    return "".join(out)
import re
from sys import hexversion, implementation
# Support for insertion-preserving/ordered dicts became language feature in Python 3.7, but works in CPython since 3.6.
# Plain dict keeps insertion order from Python 3.7 on (and on CPython 3.6);
# anything older needs collections.OrderedDict to get the same ordering.
if hexversion < 0x03060000 or (hexversion < 0x03070000 and implementation.name != 'cpython'):
    from collections import OrderedDict as ordered_dict
else:
    ordered_dict = dict
# Maps glob tokens, spelled the way re.escape() emits them, to the regex
# fragment that replaces them.  Raw strings keep every backslash literal
# without relying on invalid escape sequences.
# Order matters: the alternation below tries tokens in this order, so longer
# tokens must precede the shorter ones they contain.
# NOTE: the keys assume re.escape() leaves ``/`` and ``!`` unescaped, which
# is the behaviour of Python 3.7 and later.
escaped_glob_tokens_to_re = ordered_dict((
    (r'/\*\*', r'(?:/.+?)*'),   # ``/**``: any number of further path segments.
    (r'\*\*/', r'(?:^.+?/)*'),  # ``**/``: leading directories; ``^`` ties it to the start of the path.
    (r'\*', r'[^/]*?'),         # ``*`` stays within one path segment (also used for a ``**`` with no adjacent ``/``).
    (r'\?', r'.'),
    (r'\[\*\]', r'\*'),         # ``[*]``: literal asterisk.
    (r'\[\?\]', r'\?'),         # ``[?]``: literal question mark.
    (r'\[!', r'[^'),            # Negated class; must precede the plain ``\[`` entry.
    (r'\[', r'['),
    (r'\]', r']'),
))

# One alternation of all tokens, each escaped so it matches itself literally
# in the re.escape()d pattern text.
escaped_glob_replacement = re.compile(
    '(%s)' % '|'.join(re.escape(token) for token in escaped_glob_tokens_to_re))


def glob_to_re(pattern):
    """Translate the glob *pattern* into a regular-expression string.

    The pattern is escaped with ``re.escape`` first; every escaped glob
    token listed in ``escaped_glob_tokens_to_re`` is then replaced by its
    regex fragment.  The result carries no anchors or flags and is meant
    to be applied to the whole path, e.g. with ``re.fullmatch``.
    """
    return escaped_glob_replacement.sub(
        lambda match: escaped_glob_tokens_to_re[match.group(0)],
        re.escape(pattern))
if __name__ == '__main__':
    # Self-test table: (should the pattern match?, path, glob pattern).
    validity_paths_globs = (
        (True, 'foo.py', 'foo.py'),
        (True, 'foo.py', 'fo[o].py'),
        (True, 'fob.py', 'fo[!o].py'),
        (True, '*foo.py', '[*]foo.py'),
        (True, 'foo.py', '**/foo.py'),
        (True, 'baz/duck/bar/bam/quack/foo.py', '**/bar/**/foo.py'),
        (True, 'bar/foo.py', '**/foo.py'),
        (True, 'bar/baz/foo.py', 'bar/**'),
        (False, 'bar/baz/foo.py', 'bar/*'),
        (False, 'bar/baz/foo.py', 'bar**/foo.py'),
        (True, 'bar/baz/foo.py', 'bar/**/foo.py'),
        (True, 'bar/baz/wut/foo.py', 'bar/**/foo.py'),
    )
    results = []
    for valid, path, glob_pat in validity_paths_globs:
        print('valid:', valid)
        print('path:', path)
        print('glob pattern:', glob_pat)
        re_pat = glob_to_re(glob_pat)
        print('RE pattern:', re_pat)
        match = re.fullmatch(re_pat, path)
        print('match:', match)
        # A case passes when matching succeeded exactly if it was expected to.
        result = (match is not None) == valid
        results.append(result)
        print('result was expected:', result)
        print('-' * 79)
    print('all results were expected:', all(results))
    print('=' * 79)