使用Microsoft Azure语音转换为Python文本的字幕/说明

使用Microsoft Azure语音转换为Python文本的字幕/说明,python,azure,speech-recognition,speech-to-text,subtitle,Python,Azure,Speech Recognition,Speech To Text,Subtitle,我一直在试图找出如何使用Python中的Microsoft Azure语音识别服务制作字幕,但没有找到答案。我遵循了其他人在这里回答的关于获取单个单词的技巧,但即使将它们格式化为.srt或.vtt也似乎很复杂。 代码如下: import azure.cognitiveservices.speech as speechsdk def speech_recognize_continuous_from_file(): """performs continuou

我一直在试图找出如何使用Python中的Microsoft Azure语音识别服务制作字幕,但没有找到答案。我遵循了其他人在这里回答的关于获取单个单词的技巧,但即使将它们格式化为.srt或.vtt也似乎很复杂。 代码如下:

import azure.cognitiveservices.speech as speechsdk


def speech_recognize_continuous_from_file():
    """Run continuous speech recognition over an audio file.

    Collects the recognized display text into ``transcript`` and the
    word-level timing entries (``Word``/``Offset``/``Duration``, in 100-ns
    ticks) of the highest-confidence NBest hypothesis into ``words``,
    then dumps the transcript to ``Azure_Raw.txt``.
    """
    # BUG FIX: the original used `time` and `json` without importing them.
    import json
    import time

    # <SpeechContinuousRecognitionWithFile>
    # NOTE: replace the placeholders with a real subscription key / region.
    speech_key, service_region = "{api-key}", "{service-region}"
    speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)

    audio_filename = "{for example: video.wav}"
    audio_config = speechsdk.audio.AudioConfig(filename=audio_filename)

    speech_config.speech_recognition_language = "en-US"
    # Word-level timestamps are required to build subtitle timings.
    speech_config.request_word_level_timestamps()

    speech_config.enable_dictation()
    # OutputFormat(1) — presumably the Detailed format; the result JSON
    # then carries the NBest hypothesis list consumed below.
    speech_config.output_format = speechsdk.OutputFormat(1)

    speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)

    done = False
    transcript = []  # 'DisplayText' per final recognition result
    words = []       # word-level timing dicts of the best hypothesis

    def handle_final_result(evt):
        """Parse the detailed JSON of a final result and record text/words."""
        # BUG FIX: the original assigned to a local `results`, shadowing the
        # outer list, and had broken indentation on the confidence lines.
        parsed = json.loads(evt.result.json)
        transcript.append(parsed['DisplayText'])
        # Pick the NBest hypothesis the service is most confident about.
        confidences = [item.get('Confidence') for item in parsed['NBest']]
        best_index = confidences.index(max(confidences))
        words.extend(parsed['NBest'][best_index]['Words'])

    def stop_cb(evt):
        """callback that stops continuous recognition upon receiving an event `evt`"""
        print('CLOSING on {}'.format(evt))
        speech_recognizer.stop_continuous_recognition()
        nonlocal done
        done = True
        print("Transcript display list:\n")
        print(transcript)
        print("\nWords\n")
        print(words)
        print("\n")

    speech_recognizer.recognized.connect(handle_final_result)
    # Connect callbacks to the events fired by the speech recognizer
    speech_recognizer.recognizing.connect(lambda evt: format(evt))
    speech_recognizer.recognized.connect(lambda evt: format(evt))
    speech_recognizer.session_started.connect(lambda evt: format(evt))
    speech_recognizer.session_stopped.connect(lambda evt: format(evt))
    speech_recognizer.canceled.connect(lambda evt: format(evt))
    # stop continuous recognition on either session stopped or canceled events
    speech_recognizer.session_stopped.connect(stop_cb)
    speech_recognizer.canceled.connect(stop_cb)

    # Start continuous speech recognition and block until stop_cb fires.
    speech_recognizer.start_continuous_recognition()
    while not done:
        time.sleep(.5)

    # BUG FIX: the original wrote the never-populated `results` list,
    # producing an empty file; write the collected transcript instead.
    with open('Azure_Raw.txt', 'w') as f:
        f.write('\n'.join(transcript))


# BUG FIX: the original called an undefined sample_long_running_recognize().
speech_recognize_continuous_from_file()

所以,如果您仔细观察Azure语音服务的JSON输出,它与其他服务的输出略有不同

对于上述配置,在选择最佳匹配后,输出如下所示

[{'Duration': 3900000, 'Offset': 500000, 'Word': "what's"},
 {'Duration': 1300000, 'Offset': 4500000, 'Word': 'the'},
 {'Duration': 2900000, 'Offset': 5900000, 'Word': 'weather'},
 {'Duration': 4800000, 'Offset': 8900000, 'Word': 'like'}]
有三种输出-字、持续时间和偏移量

  • 持续时间（Duration）- 该单词发音所持续的时长，以100纳秒为单位
  • 偏移量（Offset）- 从音频开始到该单词开始处的时间，同样以100纳秒为单位
你必须利用这些信息（偏移量和持续时间）来计算字幕的时间轴

import azure.cognitiveservices.speech as speechsdk
import os
import time
import pprint
import json
import srt
import datetime

 
path = os.getcwd()  # NOTE(review): computed but never used below
# Creates an instance of a speech config with specified subscription key and service region.
# Replace with your own subscription key and region identifier from here: https://aka.ms/speech/sdkregion
speech_key, service_region = "<>", "<>"
speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)

# Creates an audio configuration that points to an audio file.
# Replace with your own audio filename.
audio_filename = "sample.wav"
audio_input = speechsdk.audio.AudioConfig(filename=audio_filename)

# Creates a recognizer with the given settings
speech_config.speech_recognition_language="en-US"
# Per-word Offset/Duration timestamps are required to build subtitle timings.
speech_config.request_word_level_timestamps()



speech_config.enable_dictation()
# OutputFormat(1) — presumably the Detailed format; the result JSON then
# includes the NBest list consumed by handle_final_result below.
speech_config.output_format = speechsdk.OutputFormat(1)

speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_input)

#result = speech_recognizer.recognize_once()
# Accumulators filled by the recognition callbacks below.
all_results = []  # plain recognized text per final result
results = []      # unused at module level (shadowed inside the callback)
transcript = []   # 'DisplayText' per final result
words = []        # word-level timing dicts of the best hypothesis


#https://docs.microsoft.com/en-us/python/api/azure-cognitiveservices-speech/azure.cognitiveservices.speech.recognitionresult?view=azure-python
def handle_final_result(evt):
    """Record one final recognition result.

    Appends the plain text to ``all_results``, the 'DisplayText' to
    ``transcript``, and extends ``words`` with the word-level timing
    entries of the highest-confidence NBest hypothesis.
    """
    # `json` is imported at module level; the original re-imported it here
    # and reused the name `results`, shadowing the module-level list.
    parsed = json.loads(evt.result.json)
    all_results.append(evt.result.text)
    transcript.append(parsed['DisplayText'])
    # Pick the hypothesis the service is most confident about.
    confidences = [item.get('Confidence') for item in parsed['NBest']]
    best_index = confidences.index(max(confidences))
    words.extend(parsed['NBest'][best_index]['Words'])



done = False  # set True by stop_cb; polled by the main loop below

def stop_cb(evt):
    """Stop continuous recognition and let the polling loop exit."""
    print('CLOSING on {}'.format(evt))
    speech_recognizer.stop_continuous_recognition()
    global done
    done= True
    
speech_recognizer.recognized.connect(handle_final_result)
#Connect callbacks to the events fired by the speech recognizer
speech_recognizer.recognizing.connect(lambda evt: print('RECOGNIZING: {}'.format(evt)))
speech_recognizer.recognized.connect(lambda evt: print('RECOGNIZED: {}'.format(evt)))
speech_recognizer.session_started.connect(lambda evt: print('SESSION STARTED: {}'.format(evt)))
speech_recognizer.session_stopped.connect(lambda evt: print('SESSION STOPPED {}'.format(evt)))
speech_recognizer.canceled.connect(lambda evt: print('CANCELED {}'.format(evt)))
# stop continuous recognition on either session stopped or canceled events
speech_recognizer.session_stopped.connect(stop_cb)
speech_recognizer.canceled.connect(stop_cb)

# Blocks here until stop_cb fires (session stopped or canceled).
speech_recognizer.start_continuous_recognition()

while not done:
    time.sleep(.5)

print("Printing all results:")
print(all_results)

# Word-level entries ('Word'/'Offset'/'Duration', times in 100-ns ticks)
# gathered by handle_final_result; consumed by the subtitle builder below.
speech_to_text_response = words

def convertduration(t):
    """Convert `t`, a tick count in 100-nanosecond units (the unit Azure
    uses for word Offset/Duration), to a ``(seconds, microseconds)`` pair
    suitable for ``datetime.timedelta(0, seconds, microseconds)``.

    BUG FIX: the original returned the sub-second part in *milliseconds*
    (``(t/10000) % 1000``) while every caller passes it to ``timedelta``
    as *microseconds*, making fractional subtitle timings 1000x too small.
    """
    # 10_000_000 ticks per second; 10 ticks per microsecond.
    seconds = t // 10000000
    microseconds = (t % 10000000) // 10
    return int(seconds), int(microseconds)


##-- Code to Create Subtitle --#

# Group recognized words into subtitles covering roughly BIN_SECONDS of
# audio each, then compose and write an .srt file.
BIN_SECONDS = 3.0  # renamed from `bin`, which shadowed the builtin
duration = 0            # ticks (100 ns) accumulated in the current bin
transcriptions = []     # srt.Subtitle objects
transcript = ""         # sentence being built for the current bin
index, prev = 0, 0
wordstartsec, wordstartmicrosec = 0, 0
for word_info in speech_to_text_response:
    # Forms the sentence until the bin size condition is met.
    transcript = transcript + " " + word_info["Word"]
    # Checks whether the elapsed duration is less than the bin size.
    if int(duration / 10000000) < BIN_SECONDS:
        wordstartsec, wordstartmicrosec = convertduration(word_info["Offset"])
        duration = duration + word_info["Offset"] - prev
        prev = word_info["Offset"]
    else:
        # Bin is full: emit a subtitle spanning BIN_SECONDS from the
        # last recorded word start, then reset the accumulators.
        index = index + 1
        transcriptions.append(srt.Subtitle(
            index,
            datetime.timedelta(0, wordstartsec, wordstartmicrosec),
            datetime.timedelta(0, wordstartsec + BIN_SECONDS, 0),
            transcript))
        duration = 0
        transcript = ""

# Flush the final (possibly partial) bin.
transcriptions.append(srt.Subtitle(
    index,
    datetime.timedelta(0, wordstartsec, wordstartmicrosec),
    datetime.timedelta(0, wordstartsec + BIN_SECONDS, 0),
    transcript))
subtitles = srt.compose(transcriptions)
with open("subtitle.srt", "w") as f:
    f.write(subtitles)
导入azure.cognitiveservices.speech作为speechsdk
导入操作系统
导入时间
导入pprint
导入json
进口srt
导入日期时间
path=os.getcwd()
#使用指定的订阅密钥和服务区域创建语音配置的实例。
#从此处替换为您自己的订阅密钥和区域标识符:https://aka.ms/speech/sdkregion
语音键,服务区域“”,“”
speechsdk.SpeechConfig(订阅=语音密钥,区域=服务区域)
#创建指向音频文件的音频配置。
#替换为您自己的音频文件名。
音频文件名=“sample.wav”
音频输入=speechsdk.audio.AudioConfig(文件名=音频文件名)
#使用给定设置创建识别器
speech\u config.speech\u recognition\u language=“en-US”
语音\配置.请求\单词\级别\时间戳()
语音配置启用听写()
speech\u config.output\u format=speechsdk.OutputFormat(1)
语音识别器=speechsdk.SpeechRecognizer(语音配置=语音配置,音频配置=音频输入)
#结果=语音识别器。识别一次()
所有结果=[]
结果=[]
成绩单=[]
单词=[]
#https://docs.microsoft.com/en-us/python/api/azure-cognitiveservices-speech/azure.cognitiveservices.speech.recognitionresult?view=azure-蟒蛇
def处理最终结果(evt):
导入json
所有_results.append(evt.result.text)
results=json.loads(evt.result.json)
附加(结果['DisplayText'])
置信度列表温度=[item.get('confidence')用于结果中的项['NBest']]
最大置信度指数=置信度列表温度指数(最大(置信度列表温度))
扩展(结果['NBest'][max_confidence_index]['words'])
完成=错误
def停止_cb(evt):
打印({}结束。格式(evt))
语音识别器。停止连续识别()
全球完成
完成=正确
语音识别器。识别。连接(处理最终结果)
#将回调连接到语音识别器触发的事件
语音识别器。识别。连接(lambda evt:print('recogniting:{}'。格式(evt)))
语音识别器.recognized.connect(lambda evt:print('recognized:{}.format(evt)))
语音识别器.session\u started.connect(lambda evt:print('session started:{}'.format(evt)))
语音识别器.session\u stopped.connect(lambda evt:print('session stopped{}.format(evt)))
语音识别器.cancelled.connect(lambda evt:print('cancelled{}.format(evt)))
#在会话已停止或已取消的事件上停止连续识别
语音识别器。会话已停止。连接(停止\u cb)
语音识别器。已取消。连接(停止\u cb)
语音识别器。启动连续语音识别()
虽然没有这样做:
时间。睡眠(.5)
打印(“打印所有结果:”)
打印(所有结果)
语音对文字的回应=单词
def转换器持续时间(t):
x=t/10000
返回整数((x/1000)),(x%1000)
##--创建字幕的代码--#
#3秒
bin=3.0
持续时间=0
转录=[]
转录本=“”
指数,上一个=0,0
wordstartsec,wordstartmicrosec=0,0
对于范围内的i(len(语音到文本响应)):
#形成句子,直到满足箱子大小条件
成绩单=成绩单+“”+演讲对文本的回应[i][“单词”]
#检查经过的持续时间是否小于存储箱大小
如果(int((持续时间/10000000))

附加输出以供参考:



希望这有帮助:)

现在这是一个很好的答案!我终于看到了它与谷歌云版本之间的相似之处。这也让我意识到我必须磨练我的JSON知识(我不知道为什么,但我有麻烦,这是最容易的事情之一)。非常感谢萨提亚!
import azure.cognitiveservices.speech as speechsdk
import os
import time
import pprint
import json
import srt
import datetime

 
path = os.getcwd()  # NOTE(review): computed but never used below
# Creates an instance of a speech config with specified subscription key and service region.
# Replace with your own subscription key and region identifier from here: https://aka.ms/speech/sdkregion
speech_key, service_region = "<>", "<>"
speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)

# Creates an audio configuration that points to an audio file.
# Replace with your own audio filename.
audio_filename = "sample.wav"
audio_input = speechsdk.audio.AudioConfig(filename=audio_filename)

# Creates a recognizer with the given settings
speech_config.speech_recognition_language="en-US"
# Per-word Offset/Duration timestamps are required to build subtitle timings.
speech_config.request_word_level_timestamps()



speech_config.enable_dictation()
# OutputFormat(1) — presumably the Detailed format; the result JSON then
# includes the NBest list consumed by handle_final_result below.
speech_config.output_format = speechsdk.OutputFormat(1)

speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_input)

#result = speech_recognizer.recognize_once()
# Accumulators filled by the recognition callbacks below.
all_results = []  # plain recognized text per final result
results = []      # unused at module level (shadowed inside the callback)
transcript = []   # 'DisplayText' per final result
words = []        # word-level timing dicts of the best hypothesis


#https://docs.microsoft.com/en-us/python/api/azure-cognitiveservices-speech/azure.cognitiveservices.speech.recognitionresult?view=azure-python
def handle_final_result(evt):
    """Record one final recognition result.

    Appends the plain text to ``all_results``, the 'DisplayText' to
    ``transcript``, and extends ``words`` with the word-level timing
    entries of the highest-confidence NBest hypothesis.
    """
    # `json` is imported at module level; the original re-imported it here
    # and reused the name `results`, shadowing the module-level list.
    parsed = json.loads(evt.result.json)
    all_results.append(evt.result.text)
    transcript.append(parsed['DisplayText'])
    # Pick the hypothesis the service is most confident about.
    confidences = [item.get('Confidence') for item in parsed['NBest']]
    best_index = confidences.index(max(confidences))
    words.extend(parsed['NBest'][best_index]['Words'])



done = False  # set True by stop_cb; polled by the main loop below

def stop_cb(evt):
    """Stop continuous recognition and let the polling loop exit."""
    print('CLOSING on {}'.format(evt))
    speech_recognizer.stop_continuous_recognition()
    global done
    done= True
    
speech_recognizer.recognized.connect(handle_final_result)
#Connect callbacks to the events fired by the speech recognizer
speech_recognizer.recognizing.connect(lambda evt: print('RECOGNIZING: {}'.format(evt)))
speech_recognizer.recognized.connect(lambda evt: print('RECOGNIZED: {}'.format(evt)))
speech_recognizer.session_started.connect(lambda evt: print('SESSION STARTED: {}'.format(evt)))
speech_recognizer.session_stopped.connect(lambda evt: print('SESSION STOPPED {}'.format(evt)))
speech_recognizer.canceled.connect(lambda evt: print('CANCELED {}'.format(evt)))
# stop continuous recognition on either session stopped or canceled events
speech_recognizer.session_stopped.connect(stop_cb)
speech_recognizer.canceled.connect(stop_cb)

# Blocks here until stop_cb fires (session stopped or canceled).
speech_recognizer.start_continuous_recognition()

while not done:
    time.sleep(.5)

print("Printing all results:")
print(all_results)

# Word-level entries ('Word'/'Offset'/'Duration', times in 100-ns ticks)
# gathered by handle_final_result; consumed by the subtitle builder below.
speech_to_text_response = words

def convertduration(t):
    """Convert `t`, a tick count in 100-nanosecond units (the unit Azure
    uses for word Offset/Duration), to a ``(seconds, microseconds)`` pair
    suitable for ``datetime.timedelta(0, seconds, microseconds)``.

    BUG FIX: the original returned the sub-second part in *milliseconds*
    (``(t/10000) % 1000``) while every caller passes it to ``timedelta``
    as *microseconds*, making fractional subtitle timings 1000x too small.
    """
    # 10_000_000 ticks per second; 10 ticks per microsecond.
    seconds = t // 10000000
    microseconds = (t % 10000000) // 10
    return int(seconds), int(microseconds)


##-- Code to Create Subtitle --#

# Group recognized words into subtitles covering roughly BIN_SECONDS of
# audio each, then compose and write an .srt file.
BIN_SECONDS = 3.0  # renamed from `bin`, which shadowed the builtin
duration = 0            # ticks (100 ns) accumulated in the current bin
transcriptions = []     # srt.Subtitle objects
transcript = ""         # sentence being built for the current bin
index, prev = 0, 0
wordstartsec, wordstartmicrosec = 0, 0
for word_info in speech_to_text_response:
    # Forms the sentence until the bin size condition is met.
    transcript = transcript + " " + word_info["Word"]
    # Checks whether the elapsed duration is less than the bin size.
    if int(duration / 10000000) < BIN_SECONDS:
        wordstartsec, wordstartmicrosec = convertduration(word_info["Offset"])
        duration = duration + word_info["Offset"] - prev
        prev = word_info["Offset"]
    else:
        # Bin is full: emit a subtitle spanning BIN_SECONDS from the
        # last recorded word start, then reset the accumulators.
        index = index + 1
        transcriptions.append(srt.Subtitle(
            index,
            datetime.timedelta(0, wordstartsec, wordstartmicrosec),
            datetime.timedelta(0, wordstartsec + BIN_SECONDS, 0),
            transcript))
        duration = 0
        transcript = ""

# Flush the final (possibly partial) bin.
transcriptions.append(srt.Subtitle(
    index,
    datetime.timedelta(0, wordstartsec, wordstartmicrosec),
    datetime.timedelta(0, wordstartsec + BIN_SECONDS, 0),
    transcript))
subtitles = srt.compose(transcriptions)
with open("subtitle.srt", "w") as f:
    f.write(subtitles)