使用Microsoft Azure语音转换为Python文本的字幕/说明

使用Microsoft Azure语音转换为Python文本的字幕/说明,python,azure,speech-recognition,speech-to-text,subtitle,Python,Azure,Speech Recognition,Speech To Text,Subtitle,我一直在试图找出如何使用Python中的Microsoft Azure语音识别服务制作字幕,但没有找到答案。我遵循了其他人在这里回答的关于获取单个单词的技巧,但即使将它们格式化为.srt或.vtt也似乎很复杂。 代码如下: import azure.cognitiveservices.speech as speechsdk def speech_recognize_continuous_from_file(): """performs continuou

我一直在试图找出如何使用Python中的Microsoft Azure语音识别服务制作字幕,但没有找到答案。我遵循了其他人在这里回答的关于获取单个单词的技巧,但即使将它们格式化为.srt或.vtt也似乎很复杂。 代码如下:

import azure.cognitiveservices.speech as speechsdk


def speech_recognize_continuous_from_file():
    """Run continuous speech recognition over an audio file.

    Collects the recognized display text into ``transcript`` and the
    word-level timing entries (``Word``/``Offset``/``Duration``, in 100-ns
    ticks) of the highest-confidence NBest hypothesis into ``words``,
    then dumps the transcript to ``Azure_Raw.txt``.
    """
    # BUG FIX: the original used `time` and `json` without importing them.
    import json
    import time

    # <SpeechContinuousRecognitionWithFile>
    # NOTE: replace the placeholders with a real subscription key / region.
    speech_key, service_region = "{api-key}", "{service-region}"
    speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)

    audio_filename = "{for example: video.wav}"
    audio_config = speechsdk.audio.AudioConfig(filename=audio_filename)

    speech_config.speech_recognition_language = "en-US"
    # Word-level timestamps are required to build subtitle timings.
    speech_config.request_word_level_timestamps()

    speech_config.enable_dictation()
    # OutputFormat(1) — presumably the Detailed format; the result JSON
    # then carries the NBest hypothesis list consumed below.
    speech_config.output_format = speechsdk.OutputFormat(1)

    speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)

    done = False
    transcript = []  # 'DisplayText' per final recognition result
    words = []       # word-level timing dicts of the best hypothesis

    def handle_final_result(evt):
        """Parse the detailed JSON of a final result and record text/words."""
        # BUG FIX: the original assigned to a local `results`, shadowing the
        # outer list, and had broken indentation on the confidence lines.
        parsed = json.loads(evt.result.json)
        transcript.append(parsed['DisplayText'])
        # Pick the NBest hypothesis the service is most confident about.
        confidences = [item.get('Confidence') for item in parsed['NBest']]
        best_index = confidences.index(max(confidences))
        words.extend(parsed['NBest'][best_index]['Words'])

    def stop_cb(evt):
        """callback that stops continuous recognition upon receiving an event `evt`"""
        print('CLOSING on {}'.format(evt))
        speech_recognizer.stop_continuous_recognition()
        nonlocal done
        done = True
        print("Transcript display list:\n")
        print(transcript)
        print("\nWords\n")
        print(words)
        print("\n")

    speech_recognizer.recognized.connect(handle_final_result)
    # Connect callbacks to the events fired by the speech recognizer
    speech_recognizer.recognizing.connect(lambda evt: format(evt))
    speech_recognizer.recognized.connect(lambda evt: format(evt))
    speech_recognizer.session_started.connect(lambda evt: format(evt))
    speech_recognizer.session_stopped.connect(lambda evt: format(evt))
    speech_recognizer.canceled.connect(lambda evt: format(evt))
    # stop continuous recognition on either session stopped or canceled events
    speech_recognizer.session_stopped.connect(stop_cb)
    speech_recognizer.canceled.connect(stop_cb)

    # Start continuous speech recognition and block until stop_cb fires.
    speech_recognizer.start_continuous_recognition()
    while not done:
        time.sleep(.5)

    # BUG FIX: the original wrote the never-populated `results` list,
    # producing an empty file; write the collected transcript instead.
    with open('Azure_Raw.txt', 'w') as f:
        f.write('\n'.join(transcript))


# BUG FIX: the original called an undefined sample_long_running_recognize().
speech_recognize_continuous_from_file()

所以,如果您仔细观察Azure语音服务的JSON输出,它与其他服务的输出略有不同

对于上述配置,在选择最佳匹配后,输出如下所示

[{'Duration': 3900000, 'Offset': 500000, 'Word': "what's"},
 {'Duration': 1300000, 'Offset': 4500000, 'Word': 'the'},
 {'Duration': 2900000, 'Offset': 5900000, 'Word': 'weather'},
 {'Duration': 4800000, 'Offset': 8900000, 'Word': 'like'}]
有三种输出-字、持续时间和偏移量

  • 持续时间（Duration）- 该单词发音所持续的时长，以100纳秒为单位
  • 偏移量（Offset）- 从音频开始到该单词开始处的时间，同样以100纳秒为单位
你必须利用这些信息（偏移量和持续时间）来计算字幕的时间轴

import azure.cognitiveservices.speech as speechsdk
import os
import time
import pprint
import json
import srt
import datetime

 
path = os.getcwd()  # NOTE(review): computed but never used below
# Creates an instance of a speech config with specified subscription key and service region.
# Replace with your own subscription key and region identifier from here: https://aka.ms/speech/sdkregion
speech_key, service_region = "<>", "<>"
speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)

# Creates an audio configuration that points to an audio file.
# Replace with your own audio filename.
audio_filename = "sample.wav"
audio_input = speechsdk.audio.AudioConfig(filename=audio_filename)

# Creates a recognizer with the given settings
speech_config.speech_recognition_language="en-US"
# Per-word Offset/Duration timestamps are required to build subtitle timings.
speech_config.request_word_level_timestamps()



speech_config.enable_dictation()
# OutputFormat(1) — presumably the Detailed format; the result JSON then
# includes the NBest list consumed by handle_final_result below.
speech_config.output_format = speechsdk.OutputFormat(1)

speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_input)

#result = speech_recognizer.recognize_once()
# Accumulators filled by the recognition callbacks below.
all_results = []  # plain recognized text per final result
results = []      # unused at module level (shadowed inside the callback)
transcript = []   # 'DisplayText' per final result
words = []        # word-level timing dicts of the best hypothesis


#https://docs.microsoft.com/en-us/python/api/azure-cognitiveservices-speech/azure.cognitiveservices.speech.recognitionresult?view=azure-python
def handle_final_result(evt):
    """Record one final recognition result.

    Appends the plain text to ``all_results``, the 'DisplayText' to
    ``transcript``, and extends ``words`` with the word-level timing
    entries of the highest-confidence NBest hypothesis.
    """
    # `json` is imported at module level; the original re-imported it here
    # and reused the name `results`, shadowing the module-level list.
    parsed = json.loads(evt.result.json)
    all_results.append(evt.result.text)
    transcript.append(parsed['DisplayText'])
    # Pick the hypothesis the service is most confident about.
    confidences = [item.get('Confidence') for item in parsed['NBest']]
    best_index = confidences.index(max(confidences))
    words.extend(parsed['NBest'][best_index]['Words'])



done = False  # set True by stop_cb; polled by the main loop below

def stop_cb(evt):
    """Stop continuous recognition and let the polling loop exit."""
    print('CLOSING on {}'.format(evt))
    speech_recognizer.stop_continuous_recognition()
    global done
    done= True
    
speech_recognizer.recognized.connect(handle_final_result)
#Connect callbacks to the events fired by the speech recognizer
speech_recognizer.recognizing.connect(lambda evt: print('RECOGNIZING: {}'.format(evt)))
speech_recognizer.recognized.connect(lambda evt: print('RECOGNIZED: {}'.format(evt)))
speech_recognizer.session_started.connect(lambda evt: print('SESSION STARTED: {}'.format(evt)))
speech_recognizer.session_stopped.connect(lambda evt: print('SESSION STOPPED {}'.format(evt)))
speech_recognizer.canceled.connect(lambda evt: print('CANCELED {}'.format(evt)))
# stop continuous recognition on either session stopped or canceled events
speech_recognizer.session_stopped.connect(stop_cb)
speech_recognizer.canceled.connect(stop_cb)

# Blocks here until stop_cb fires (session stopped or canceled).
speech_recognizer.start_continuous_recognition()

while not done:
    time.sleep(.5)

print("Printing all results:")
print(all_results)

# Word-level entries ('Word'/'Offset'/'Duration', times in 100-ns ticks)
# gathered by handle_final_result; consumed by the subtitle builder below.
speech_to_text_response = words

def convertduration(t):
    """Convert `t`, a tick count in 100-nanosecond units (the unit Azure
    uses for word Offset/Duration), to a ``(seconds, microseconds)`` pair
    suitable for ``datetime.timedelta(0, seconds, microseconds)``.

    BUG FIX: the original returned the sub-second part in *milliseconds*
    (``(t/10000) % 1000``) while every caller passes it to ``timedelta``
    as *microseconds*, making fractional subtitle timings 1000x too small.
    """
    # 10_000_000 ticks per second; 10 ticks per microsecond.
    seconds = t // 10000000
    microseconds = (t % 10000000) // 10
    return int(seconds), int(microseconds)


##-- Code to Create Subtitle --#

# Group recognized words into subtitles covering roughly BIN_SECONDS of
# audio each, then compose and write an .srt file.
BIN_SECONDS = 3.0  # renamed from `bin`, which shadowed the builtin
duration = 0            # ticks (100 ns) accumulated in the current bin
transcriptions = []     # srt.Subtitle objects
transcript = ""         # sentence being built for the current bin
index, prev = 0, 0
wordstartsec, wordstartmicrosec = 0, 0
for word_info in speech_to_text_response:
    # Forms the sentence until the bin size condition is met.
    transcript = transcript + " " + word_info["Word"]
    # Checks whether the elapsed duration is less than the bin size.
    if int(duration / 10000000) < BIN_SECONDS:
        wordstartsec, wordstartmicrosec = convertduration(word_info["Offset"])
        duration = duration + word_info["Offset"] - prev
        prev = word_info["Offset"]
    else:
        # Bin is full: emit a subtitle spanning BIN_SECONDS from the
        # last recorded word start, then reset the accumulators.
        index = index + 1
        transcriptions.append(srt.Subtitle(
            index,
            datetime.timedelta(0, wordstartsec, wordstartmicrosec),
            datetime.timedelta(0, wordstartsec + BIN_SECONDS, 0),
            transcript))
        duration = 0
        transcript = ""

# Flush the final (possibly partial) bin.
transcriptions.append(srt.Subtitle(
    index,
    datetime.timedelta(0, wordstartsec, wordstartmicrosec),
    datetime.timedelta(0, wordstartsec + BIN_SECONDS, 0),
    transcript))
subtitles = srt.compose(transcriptions)
with open("subtitle.srt", "w") as f:
    f.write(subtitles)
导入azure.cognitiveservices.speech作为speechsdk
导入操作系统
导入时间
导入pprint
导入json
进口srt
导入日期时间
path=os.getcwd()
#使用指定的订阅密钥和服务区域创建语音配置的实例。
#从此处替换为您自己的订阅密钥和区域标识符:https://aka.ms/speech/sdkregion
语音键,服务区域“”,“”
speechsdk.SpeechConfig(订阅=语音密钥,区域=服务区域)
#创建指向音频文件的音频配置。
#替换为您自己的音频文件名。
音频文件名=“sample.wav”
音频输入=speechsdk.audio.AudioConfig(文件名=音频文件名)
#使用给定设置创建识别器
speech\u config.speech\u recognition\u language=“en-US”
语音\配置.请求\单词\级别\时间戳()
语音配置启用听写()
speech\u config.output\u format=speechsdk.OutputFormat(1)
语音识别器=speechsdk.SpeechRecognizer(语音配置=语音配置,音频配置=音频输入)
#结果=语音识别器。识别一次()
所有结果=[]
结果=[]
成绩单=[]
单词=[]
#https://docs.microsoft.com/en-us/python/api/azure-cognitiveservices-speech/azure.cognitiveservices.speech.recognitionresult?view=azure-蟒蛇
def处理最终结果(evt):
导入json
所有_results.append(evt.result.text)
results=json.loads(evt.result.json)
附加(结果['DisplayText'])
置信度列表温度=[item.get('confidence')用于结果中的项['NBest']]
最大置信度指数=置信度列表温度指数(最大(置信度列表温度))
扩展(结果['NBest'][max_confidence_index]['words'])
完成=错误
def停止_cb(evt):
打印({}结束。格式(evt))
语音识别器。停止连续识别()
全球完成
完成=正确
语音识别器。识别。连接(处理最终结果)
#将回调连接到语音识别器触发的事件
语音识别器。识别。连接(lambda evt:print('recogniting:{}'。格式(evt)))
语音识别器.recognized.connect(lambda evt:print('recognized:{}.format(evt)))
语音识别器.session\u started.connect(lambda evt:print('session started:{}'.format(evt)))
语音识别器.session\u stopped.connect(lambda evt:print('session stopped{}.format(evt)))
语音识别器.cancelled.connect(lambda evt:print('cancelled{}.format(evt)))
#在会话已停止或已取消的事件上停止连续识别
语音识别器。会话已停止。连接(停止\u cb)
语音识别器。已取消。连接(停止\u cb)
语音识别器。启动连续语音识别()
虽然没有这样做:
时间。睡眠(.5)
打印(“打印所有结果:”)
打印(所有结果)
语音对文字的回应=单词
def转换器持续时间(t):
x=t/10000
返回整数((x/1000)),(x%1000)
##--创建字幕的代码--#
#3秒
bin=3.0
持续时间=0
转录=[]
转录本=“”
指数,上一个=0,0
wordstartsec,wordstartmicrosec=0,0
对于范围内的i(len(语音到文本响应)):
#形成句子,直到满足箱子大小条件
成绩单=成绩单+“”+演讲对文本的回应[i][“单词”]
#检查经过的持续时间是否小于存储箱大小
如果(int((持续时间/10000000))

附加输出以供参考:



希望这有帮助:)

现在这是一个很好的答案!我终于看到了它与谷歌云版本之间的相似之处。这也让我意识到我必须磨练我的JSON知识(我不知道为什么,但我有麻烦,这是最容易的事情之一)。非常感谢萨提亚!
import azure.cognitiveservices.speech as speechsdk
import os
import time
import pprint
import json
import srt
import datetime

 
path = os.getcwd()  # NOTE(review): computed but never used below
# Creates an instance of a speech config with specified subscription key and service region.
# Replace with your own subscription key and region identifier from here: https://aka.ms/speech/sdkregion
speech_key, service_region = "<>", "<>"
speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)

# Creates an audio configuration that points to an audio file.
# Replace with your own audio filename.
audio_filename = "sample.wav"
audio_input = speechsdk.audio.AudioConfig(filename=audio_filename)

# Creates a recognizer with the given settings
speech_config.speech_recognition_language="en-US"
# Per-word Offset/Duration timestamps are required to build subtitle timings.
speech_config.request_word_level_timestamps()



speech_config.enable_dictation()
# OutputFormat(1) — presumably the Detailed format; the result JSON then
# includes the NBest list consumed by handle_final_result below.
speech_config.output_format = speechsdk.OutputFormat(1)

speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_input)

#result = speech_recognizer.recognize_once()
# Accumulators filled by the recognition callbacks below.
all_results = []  # plain recognized text per final result
results = []      # unused at module level (shadowed inside the callback)
transcript = []   # 'DisplayText' per final result
words = []        # word-level timing dicts of the best hypothesis


#https://docs.microsoft.com/en-us/python/api/azure-cognitiveservices-speech/azure.cognitiveservices.speech.recognitionresult?view=azure-python
def handle_final_result(evt):
    """Record one final recognition result.

    Appends the plain text to ``all_results``, the 'DisplayText' to
    ``transcript``, and extends ``words`` with the word-level timing
    entries of the highest-confidence NBest hypothesis.
    """
    # `json` is imported at module level; the original re-imported it here
    # and reused the name `results`, shadowing the module-level list.
    parsed = json.loads(evt.result.json)
    all_results.append(evt.result.text)
    transcript.append(parsed['DisplayText'])
    # Pick the hypothesis the service is most confident about.
    confidences = [item.get('Confidence') for item in parsed['NBest']]
    best_index = confidences.index(max(confidences))
    words.extend(parsed['NBest'][best_index]['Words'])



done = False  # set True by stop_cb; polled by the main loop below

def stop_cb(evt):
    """Stop continuous recognition and let the polling loop exit."""
    print('CLOSING on {}'.format(evt))
    speech_recognizer.stop_continuous_recognition()
    global done
    done= True
    
speech_recognizer.recognized.connect(handle_final_result)
#Connect callbacks to the events fired by the speech recognizer
speech_recognizer.recognizing.connect(lambda evt: print('RECOGNIZING: {}'.format(evt)))
speech_recognizer.recognized.connect(lambda evt: print('RECOGNIZED: {}'.format(evt)))
speech_recognizer.session_started.connect(lambda evt: print('SESSION STARTED: {}'.format(evt)))
speech_recognizer.session_stopped.connect(lambda evt: print('SESSION STOPPED {}'.format(evt)))
speech_recognizer.canceled.connect(lambda evt: print('CANCELED {}'.format(evt)))
# stop continuous recognition on either session stopped or canceled events
speech_recognizer.session_stopped.connect(stop_cb)
speech_recognizer.canceled.connect(stop_cb)

# Blocks here until stop_cb fires (session stopped or canceled).
speech_recognizer.start_continuous_recognition()

while not done:
    time.sleep(.5)

print("Printing all results:")
print(all_results)

# Word-level entries ('Word'/'Offset'/'Duration', times in 100-ns ticks)
# gathered by handle_final_result; consumed by the subtitle builder below.
speech_to_text_response = words

def convertduration(t):
    """Convert `t`, a tick count in 100-nanosecond units (the unit Azure
    uses for word Offset/Duration), to a ``(seconds, microseconds)`` pair
    suitable for ``datetime.timedelta(0, seconds, microseconds)``.

    BUG FIX: the original returned the sub-second part in *milliseconds*
    (``(t/10000) % 1000``) while every caller passes it to ``timedelta``
    as *microseconds*, making fractional subtitle timings 1000x too small.
    """
    # 10_000_000 ticks per second; 10 ticks per microsecond.
    seconds = t // 10000000
    microseconds = (t % 10000000) // 10
    return int(seconds), int(microseconds)


##-- Code to Create Subtitle --#

# Group recognized words into subtitles covering roughly BIN_SECONDS of
# audio each, then compose and write an .srt file.
BIN_SECONDS = 3.0  # renamed from `bin`, which shadowed the builtin
duration = 0            # ticks (100 ns) accumulated in the current bin
transcriptions = []     # srt.Subtitle objects
transcript = ""         # sentence being built for the current bin
index, prev = 0, 0
wordstartsec, wordstartmicrosec = 0, 0
for word_info in speech_to_text_response:
    # Forms the sentence until the bin size condition is met.
    transcript = transcript + " " + word_info["Word"]
    # Checks whether the elapsed duration is less than the bin size.
    if int(duration / 10000000) < BIN_SECONDS:
        wordstartsec, wordstartmicrosec = convertduration(word_info["Offset"])
        duration = duration + word_info["Offset"] - prev
        prev = word_info["Offset"]
    else:
        # Bin is full: emit a subtitle spanning BIN_SECONDS from the
        # last recorded word start, then reset the accumulators.
        index = index + 1
        transcriptions.append(srt.Subtitle(
            index,
            datetime.timedelta(0, wordstartsec, wordstartmicrosec),
            datetime.timedelta(0, wordstartsec + BIN_SECONDS, 0),
            transcript))
        duration = 0
        transcript = ""

# Flush the final (possibly partial) bin.
transcriptions.append(srt.Subtitle(
    index,
    datetime.timedelta(0, wordstartsec, wordstartmicrosec),
    datetime.timedelta(0, wordstartsec + BIN_SECONDS, 0),
    transcript))
subtitles = srt.compose(transcriptions)
with open("subtitle.srt", "w") as f:
    f.write(subtitles)