Python 解析srt字幕
我想解析srt字幕:Python 解析srt字幕,python,regex,Python,Regex,我想解析srt字幕: 1 00:00:12,815 --> 00:00:14,509 Chlapi, jak to jde s těma pracovníma světlama?. 2 00:00:14,815 --> 00:00:16,498 Trochu je zesilujeme. 3 00:00:16,934 --> 00:00:17,814 Jo, sleduj. 每一个项目都被组织
1
00:00:12,815 --> 00:00:14,509
Chlapi, jak to jde s
těma pracovníma světlama?.
2
00:00:14,815 --> 00:00:16,498
Trochu je zesilujeme.
3
00:00:16,934 --> 00:00:17,814
Jo, sleduj.
每一个项目都被组织起来。使用此正则表达式:
A:
对于代码B,我的数组中只有一个项(因为 .* 是贪婪匹配);而对于代码A,我得到的“文本”是空的(因为 .*? 是非贪婪匹配)。
该如何解决?
谢谢。以下是我用来解析SRT文件的一些代码:
from __future__ import division
import datetime
class Srt_entry(object):
    """One subtitle entry of an SRT file: index, time span and text lines.

    Attributes:
        index: Integer parsed from the first line of the entry.
        start: ``datetime.timedelta`` parsed from the left timestamp.
        end: ``datetime.timedelta`` parsed from the right timestamp.
        lines: The remaining lines (the subtitle text), kept as given except
            that a trailing empty string is dropped.
    """

    def __init__(self, lines):
        """Build an entry from the raw lines of one SRT block.

        Args:
            lines: Sequence of strings: the index line, the
                ``HH:MM:SS,mmm --> HH:MM:SS,mmm`` line, then the text lines.

        Raises:
            ValueError: If the index or a timestamp is malformed, or the
                separator between the timestamps is not ``-->``.
            IndexError: If fewer than two lines are given.
        """
        def parsetime(string):
            hours, minutes, seconds = string.split(u':')
            # SRT writes the fractional seconds after a decimal comma.
            seconds = float(seconds.replace(u',', u'.'))
            return datetime.timedelta(hours=int(hours),
                                      minutes=int(minutes),
                                      seconds=seconds)

        self.index = int(lines[0])
        start, arrow, end = lines[1].split()
        if arrow != u"-->":
            raise ValueError(u"expected '-->' between timestamps, got %r"
                             % (arrow,))
        self.start = parsetime(start)
        self.end = parsetime(end)
        self.lines = lines[2:]
        # Guard against an entry without any text lines before peeking at
        # the last one.
        if self.lines and not self.lines[-1]:
            del self.lines[-1]

    def __unicode__(self):
        """Return the entry serialised back to SRT text."""
        def delta_to_string(d):
            # Work in whole milliseconds (rounded to nearest) so that a
            # value such as 59.9996 s carries into the minutes instead of
            # being printed as "60,000".
            total_ms = ((d.days * 86400 + d.seconds) * 1000
                        + (d.microseconds + 500) // 1000)
            hours, rest = divmod(total_ms, 3600000)
            minutes, rest = divmod(rest, 60000)
            seconds, millis = divmod(rest, 1000)
            return u"%02d:%02d:%02d,%03d" % (hours, minutes, seconds, millis)

        return (u'%d\n' % self.index
                + delta_to_string(self.start)
                + u' --> '
                + delta_to_string(self.end) + u'\n'
                + u''.join(self.lines))

    if str is not bytes:
        # On Python 3 `str` is the text type, so expose the same rendering
        # through str(); Python 2 keeps using __unicode__ only.
        __str__ = __unicode__
# Read "foo.srt" and collect one Srt_entry per blank-line-separated block.
# NOTE(review): `options` is not defined in this snippet; presumably it holds
# parsed command-line options with a `decode` codec name - verify.
entries = []
entry = []
# `with` closes the file even if parsing raises part-way through.
with open("foo.srt") as srt_file:
    for line in srt_file:
        if options.decode:
            line = line.decode(options.decode)
        if line in (u'\n', u'\r\n'):
            # A blank line ends the current block. Skipping empty blocks
            # keeps consecutive blank lines from producing Srt_entry([]).
            if entry:
                entries.append(Srt_entry(entry))
                entry = []
        else:
            entry.append(line)
# The last block is not necessarily followed by a blank line.
if entry:
    entries.append(Srt_entry(entry))
为什么不使用?文本后面是一个空行,或者是文件的结尾。因此,您可以使用:
r' .... (?P<text>.*?)(\n\n|$)'
r' .... (?P<text>.*?)(\n\n|$)'
splits = [s.strip() for s in re.split(r'\n\s*\n', text) if s.strip()]
regex = re.compile(r'''(?P<index>\d+).*?(?P<start>\d{2}:\d{2}:\d{2},\d{3}) --> (?P<end>\d{2}:\d{2}:\d{2},\d{3})\s*.*?\s*(?P<text>.*)''', re.DOTALL)
for s in splits:
    r = regex.search(s)
    print r.groups()
以下是我编写的一个片段,它将SRT文件转换为字典:
import re
def srt_time_to_seconds(time):
    """Convert an SRT timestamp of the form ``HH:MM:SS,mmm`` to seconds.

    Args:
        time: Timestamp string such as ``"00:00:12,815"``.

    Returns:
        The timestamp as a float number of seconds.

    Raises:
        IndexError: If the comma or one of the ``:``-separated fields is
            missing.
        ValueError: If a field is not numeric.
    """
    fields = time.split(',')
    clock = fields[0].split(':')
    millis = fields[1]
    # One hour is 3600 seconds (1440 is the number of minutes in a day).
    return (int(clock[0]) * 3600 + int(clock[1]) * 60 + int(clock[2])
            + float(millis) / 1000)
def srt_to_dict(srtText):
    """Parse the text of an SRT file into a list of subtitle dictionaries.

    Args:
        srtText: Full contents of an SRT file as one string.

    Returns:
        A list with one dict per entry that has at least three lines. Each
        dict has the keys ``'start'`` and ``'end'`` (the values returned by
        ``srt_time_to_seconds``) and ``'text'`` (the text lines joined with
        ``'<br />'``).
    """
    subs = []
    # Normalise Windows line endings, and drop surrounding blank space so a
    # trailing newline does not add an empty line (and a dangling '<br />')
    # to the last entry.
    normalized = re.sub('\r\n', '\n', srtText).strip()
    # A separator line may contain stray whitespace, so do not require
    # exactly '\n\n'.
    for chunk in re.split(r'\n\s*\n', normalized):
        st = chunk.split('\n')
        if len(st) >= 3:
            times = st[1].split(' --> ')
            subs.append({'start': srt_time_to_seconds(times[0].strip()),
                         'end': srt_time_to_seconds(times[1].strip()),
                         'text': '<br />'.join(st[2:])
                         })
    return subs
我对Python现有的srt库感到非常失望(它们通常过于重量级,并且为了支持自定义类而不使用语言的标准类型),所以我花了大约一年的时间开发了自己的srt库。你可以在(原链接缺失)找到它。我尽量让它在类的使用上保持简单、轻量(除了核心的Subtitle类,它基本上只用来存储SRT块数据)。它可以读写SRT文件,并能将不符合规范的SRT文件转换为符合规范的SRT文件。下面是针对你的示例输入的用法示例:
>>> import srt, pprint
>>> gen = srt.parse('''\
... 1
... 00:00:12,815 --> 00:00:14,509
... Chlapi, jak to jde s
... těma pracovníma světlama?.
...
... 2
... 00:00:14,815 --> 00:00:16,498
... Trochu je zesilujeme.
...
... 3
... 00:00:16,934 --> 00:00:17,814
... Jo, sleduj.
...
... ''')
>>> pprint.pprint(list(gen))
[Subtitle(start=datetime.timedelta(0, 12, 815000), end=datetime.timedelta(0, 14, 509000), index=1, proprietary='', content='Chlapi, jak to jde s\ntěma pracovníma světlama?.'),
Subtitle(start=datetime.timedelta(0, 14, 815000), end=datetime.timedelta(0, 16, 498000), index=2, proprietary='', content='Trochu je zesilujeme.'),
Subtitle(start=datetime.timedelta(0, 16, 934000), end=datetime.timedelta(0, 17, 814000), index=3, proprietary='', content='Jo, sleduj.')]
+1,很简洁。为了兼容空行中的空白字符,可以改用 r' .... (?P<text>.*?)\n\s*\n'。我看不到很好的文档。读到这个问题时,我才想起自己以前做过这件事,但不记得是怎么做的了。结果发现我当时是用过程式的代码来处理的,而不是使用正则表达式。正则表达式的写法非常优雅。如果您感兴趣,可以在(原链接缺失;请注意,它从jaraco.util导入的“grouper”只是itertools文档中的“grouper”)中找到我用来处理SRT字幕的Python类。
r' .... (?P<text>.*?)(\n\n|$)'
# Cut the input into blank-line-separated blocks, dropping empty ones.
# NOTE(review): `text` is not defined in this snippet; presumably it holds
# the whole SRT file contents - verify.
splits = [s for s in (part.strip() for part in re.split(r'\n\s*\n', text)) if s]
regex = re.compile(r'''(?P<index>\d+).*?(?P<start>\d{2}:\d{2}:\d{2},\d{3}) --> (?P<end>\d{2}:\d{2}:\d{2},\d{3})\s*.*?\s*(?P<text>.*)''', re.DOTALL)
for s in splits:
    r = regex.search(s)
    # search() returns None for a block that is not a well-formed entry;
    # skip it instead of failing on `None.groups()`.
    if r is not None:
        # Parenthesised form prints the same tuple on Python 2 and 3.
        print(r.groups())
import re
def srt_time_to_seconds(time):
    """Convert an SRT timestamp of the form ``HH:MM:SS,mmm`` to seconds.

    Args:
        time: Timestamp string such as ``"00:00:12,815"``.

    Returns:
        The timestamp as a float number of seconds.

    Raises:
        IndexError: If the comma or one of the ``:``-separated fields is
            missing.
        ValueError: If a field is not numeric.
    """
    fields = time.split(',')
    clock = fields[0].split(':')
    millis = fields[1]
    # One hour is 3600 seconds (1440 is the number of minutes in a day).
    return (int(clock[0]) * 3600 + int(clock[1]) * 60 + int(clock[2])
            + float(millis) / 1000)
def srt_to_dict(srtText):
    """Parse the text of an SRT file into a list of subtitle dictionaries.

    Args:
        srtText: Full contents of an SRT file as one string.

    Returns:
        A list with one dict per entry that has at least three lines. Each
        dict has the keys ``'start'`` and ``'end'`` (the values returned by
        ``srt_time_to_seconds``) and ``'text'`` (the text lines joined with
        ``'<br />'``).
    """
    subs = []
    # Normalise Windows line endings, and drop surrounding blank space so a
    # trailing newline does not add an empty line (and a dangling '<br />')
    # to the last entry.
    normalized = re.sub('\r\n', '\n', srtText).strip()
    # A separator line may contain stray whitespace, so do not require
    # exactly '\n\n'.
    for chunk in re.split(r'\n\s*\n', normalized):
        st = chunk.split('\n')
        if len(st) >= 3:
            times = st[1].split(' --> ')
            subs.append({'start': srt_time_to_seconds(times[0].strip()),
                         'end': srt_time_to_seconds(times[1].strip()),
                         'text': '<br />'.join(st[2:])
                         })
    return subs
# `import srt_to_dict` would bind the module object, and calling a module
# raises TypeError; import the function itself instead.
# NOTE(review): assumes the function lives in a module named srt_to_dict -
# confirm the file name.
from srt_to_dict import srt_to_dict

# Read the whole subtitle file, then print the parsed entries.
with open('test.srt', "r") as f:
    srtText = f.read()
# Parenthesised form prints the same value on Python 2 and 3.
print(srt_to_dict(srtText))
>>> import srt, pprint
>>> gen = srt.parse('''\
... 1
... 00:00:12,815 --> 00:00:14,509
... Chlapi, jak to jde s
... těma pracovníma světlama?.
...
... 2
... 00:00:14,815 --> 00:00:16,498
... Trochu je zesilujeme.
...
... 3
... 00:00:16,934 --> 00:00:17,814
... Jo, sleduj.
...
... ''')
>>> pprint.pprint(list(gen))
[Subtitle(start=datetime.timedelta(0, 12, 815000), end=datetime.timedelta(0, 14, 509000), index=1, proprietary='', content='Chlapi, jak to jde s\ntěma pracovníma světlama?.'),
Subtitle(start=datetime.timedelta(0, 14, 815000), end=datetime.timedelta(0, 16, 498000), index=2, proprietary='', content='Trochu je zesilujeme.'),
Subtitle(start=datetime.timedelta(0, 16, 934000), end=datetime.timedelta(0, 17, 814000), index=3, proprietary='', content='Jo, sleduj.')]