在python中使用Azure语音服务读取音频文件并转换为文本,但只有第一句话被转换为文本

在python中使用Azure语音服务读取音频文件并转换为文本,但只有第一句话被转换为文本。标签:python, python-3.x, azure, speech-recognition, speech-to-text

下面是代码

import json
import os
import time

import azure.cognitiveservices.speech as speechsdk
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient

def main(filename):
    """Download *filename* from blob storage and transcribe it to text.

    Uses continuous recognition so that every utterance in the audio file
    is captured (recognize_once would stop at the first pause), and blocks
    until the recognition session has finished before printing the results.
    """
    container_name = "test-container"
    print(filename)  # BUG FIX: this line was over-indented (IndentationError).
    # NOTE(review): placeholder — supply a real connection string (ideally
    # from an environment variable, not hard-coded).
    blob_service_client = BlobServiceClient.from_connection_string("DefaultEndpoint")
    container_client = blob_service_client.get_container_client(container_name)
    blob_client = container_client.get_blob_client(filename)
    # Download the blob to a local file of the same name.
    with open(filename, "wb") as f:
        data = blob_client.download_blob()
        data.readinto(f)

    # NOTE(review): placeholder credentials — replace with your own key/region.
    speech_key, service_region = "1234567", "eastus"
    speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)

    audio_input = speechsdk.audio.AudioConfig(filename=filename)
    print("Audio Input:-", audio_input)

    # US-English dictation with word-level timestamps, detailed output.
    speech_config.speech_recognition_language = "en-US"
    speech_config.request_word_level_timestamps()
    speech_config.enable_dictation()
    speech_config.output_format = speechsdk.OutputFormat(1)

    speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_input)
    print("speech_recognizer:-", speech_recognizer)

    all_results = []
    done = False

    def handle_final_result(evt):
        # Collect the finalized text of every recognized utterance.
        all_results.append(evt.result.text)

    def stop_cb(evt):
        # Fired on session_stopped/canceled: stop recognition and release
        # the wait loop below.
        speech_recognizer.stop_continuous_recognition()
        # BUG FIX: was `global done` — `done` is a local of main(), so the
        # global declaration never updated it and the flag stayed False.
        nonlocal done
        done = True

    # Appends the recognized text to the all_results variable.
    speech_recognizer.recognized.connect(handle_final_result)
    speech_recognizer.session_stopped.connect(stop_cb)
    speech_recognizer.canceled.connect(stop_cb)

    speech_recognizer.start_continuous_recognition()

    # BUG FIX: this wait loop was commented out, so main() printed
    # all_results (still empty) before recognition had produced anything.
    # Continuous recognition runs on a background thread; block until done.
    while not done:
        time.sleep(.5)

    print("Printing all results from speech to text:")
    print(all_results)


main(filename="test.wav")
从主函数调用时出错

test.wav
Audio Input:- <azure.cognitiveservices.speech.audio.AudioConfig object at 0x00000204D72F4E88>
speech_recognizer:- <azure.cognitiveservices.speech.SpeechRecognizer object at 0x00000204D7065148>
[]
test.wav
音频输入:-
语音识别器:-
[]
预期输出(不使用主功能的输出)

test.wav
音频输入:-
语音识别器:-
将所有结果从语音打印到文本:
[嗨,'','','嗯,'','好笑','1487',“好了,好了,我想这就够了。”,'']

如果我们不使用main函数,现有的代码就可以完美地工作,但是当我使用main函数调用它时,我没有得到所需的输出。请在缺少的部分中为我们提供指导。

如本文所述,recognize_once_async()(您正在使用的方法)——此方法只会识别从检测到语音开始到下一次停顿之间的一段输入

根据我的理解,如果您使用 start_continuous_recognition(),您的要求将得到满足。该函数会启动并持续处理所有话语,直到您调用 stop_continuous_recognition() 为止

这个方法有很多事件连接到它,当语音识别过程发生时,“已识别”事件触发。您需要有一个事件处理程序来处理识别和提取文本。你可以参考这篇文章了解更多信息

以下分享一段使用 start_continuous_recognition() 将音频转换为文本的示例代码片段

import azure.cognitiveservices.speech as speechsdk
import time
import datetime

# Creates an instance of a speech config with specified subscription key and service region.
# Replace with your own subscription key and region identifier from here: https://aka.ms/speech/sdkregion
speech_key, service_region = "YOURSUBSCRIPTIONKEY", "YOURREGION"
speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)

# Creates an audio configuration that points to an audio file.
# Replace with your own audio filename.
audio_filename = "sample.wav"
audio_input = speechsdk.audio.AudioConfig(filename=audio_filename)

# Recognition settings: US-English dictation with word-level timestamps.
speech_config.speech_recognition_language = "en-US"
speech_config.request_word_level_timestamps()
speech_config.enable_dictation()
# IMPROVED: named enum member instead of the magic number OutputFormat(1).
speech_config.output_format = speechsdk.OutputFormat.Detailed

speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_input)

# Texts of all finalized utterances, in recognition order.
all_results = []


# https://docs.microsoft.com/en-us/python/api/azure-cognitiveservices-speech/azure.cognitiveservices.speech.recognitionresult?view=azure-python
def handle_final_result(evt):
    """Append the finalized text of one utterance to all_results."""
    all_results.append(evt.result.text)


done = False


def stop_cb(evt):
    """Stop recognition and release the polling loop on session end/cancel."""
    print('CLOSING on {}'.format(evt))
    speech_recognizer.stop_continuous_recognition()
    global done
    done = True


# Appends the recognized text to the all_results variable.
speech_recognizer.recognized.connect(handle_final_result)

# Connect callbacks to the events fired by the speech recognizer & display the info/status.
# Ref: https://docs.microsoft.com/en-us/python/api/azure-cognitiveservices-speech/azure.cognitiveservices.speech.eventsignal?view=azure-python
speech_recognizer.recognizing.connect(lambda evt: print('RECOGNIZING: {}'.format(evt)))
speech_recognizer.recognized.connect(lambda evt: print('RECOGNIZED: {}'.format(evt)))
speech_recognizer.session_started.connect(lambda evt: print('SESSION STARTED: {}'.format(evt)))
speech_recognizer.session_stopped.connect(lambda evt: print('SESSION STOPPED {}'.format(evt)))
speech_recognizer.canceled.connect(lambda evt: print('CANCELED {}'.format(evt)))
# Stop continuous recognition on either session-stopped or canceled events.
speech_recognizer.session_stopped.connect(stop_cb)
speech_recognizer.canceled.connect(stop_cb)

speech_recognizer.start_continuous_recognition()

# Recognition runs on a background thread; block until stop_cb fires.
while not done:
    time.sleep(.5)

print("Printing all results:")
print(all_results)
样本输出:


通过函数调用相同的函数

封装在函数中并尝试调用它

只是调整了一些,并封装在一个函数中。确保非本地访问变量“done”。 请检查并让我知道

import azure.cognitiveservices.speech as speechsdk
import time
import datetime

def speech_to_text():
    """Convert a local audio file to text with the Azure Speech SDK.

    Runs continuous recognition — so every utterance in the file is
    transcribed, not just the first phrase as with recognize_once —
    collects the finalized text of each utterance, and prints the full
    list once the session has stopped.
    """
    # Subscription credentials — substitute your own key and region:
    # https://aka.ms/speech/sdkregion
    speech_key, service_region = "<>", "<>"
    speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)

    # Audio source: a local WAV file.
    audio_filename = "whatstheweatherlike.wav"
    audio_input = speechsdk.audio.AudioConfig(filename=audio_filename)

    # US-English dictation, word-level timestamps, detailed output format.
    speech_config.speech_recognition_language = "en-US"
    speech_config.request_word_level_timestamps()
    speech_config.enable_dictation()
    speech_config.output_format = speechsdk.OutputFormat(1)

    speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_input)

    transcript = []      # finalized text of each utterance, in order
    session_over = False  # set by _on_session_end to release the wait loop

    # https://docs.microsoft.com/en-us/python/api/azure-cognitiveservices-speech/azure.cognitiveservices.speech.recognitionresult?view=azure-python
    def _collect_text(evt):
        # Keep the finalized text of one recognized utterance.
        transcript.append(evt.result.text)

    def _on_session_end(evt):
        # Session stopped or was canceled: halt recognition and rebind the
        # enclosing function's flag (nonlocal, since it lives in
        # speech_to_text's scope, not the module's).
        print('CLOSING on {}'.format(evt))
        speech_recognizer.stop_continuous_recognition()
        nonlocal session_over
        session_over = True

    # Collect each finalized utterance.
    speech_recognizer.recognized.connect(_collect_text)

    # Diagnostic callbacks showing recognizer status as events fire.
    # Ref: https://docs.microsoft.com/en-us/python/api/azure-cognitiveservices-speech/azure.cognitiveservices.speech.eventsignal?view=azure-python
    speech_recognizer.recognizing.connect(lambda evt: print('RECOGNIZING: {}'.format(evt)))
    speech_recognizer.recognized.connect(lambda evt: print('RECOGNIZED: {}'.format(evt)))
    speech_recognizer.session_started.connect(lambda evt: print('SESSION STARTED: {}'.format(evt)))
    speech_recognizer.session_stopped.connect(lambda evt: print('SESSION STOPPED {}'.format(evt)))
    speech_recognizer.canceled.connect(lambda evt: print('CANCELED {}'.format(evt)))
    # End the run on either a stopped or a canceled session.
    speech_recognizer.session_stopped.connect(_on_session_end)
    speech_recognizer.canceled.connect(_on_session_end)

    speech_recognizer.start_continuous_recognition()

    # Recognition happens on a background thread — poll until it signals done.
    while not session_over:
        time.sleep(.5)

    print("Printing all results:")
    print(transcript)


# calling the conversion through a function
speech_to_text()
导入azure.cognitiveservices.speech作为speechsdk
导入时间
导入日期时间
def speech_to_text():
#使用指定的订阅密钥和服务区域创建语音配置的实例。
#从此处替换为您自己的订阅密钥和区域标识符:https://aka.ms/speech/sdkregion
语音键,服务区域“”,“”
speechsdk.SpeechConfig(订阅=语音密钥,区域=服务区域)
#创建指向音频文件的音频配置。
#替换为您自己的音频文件名。
audio_filename=“whatstheweatherlike.wav”
音频输入=speechsdk.audio.AudioConfig(文件名=音频文件名)
#使用给定设置创建识别器
speech\u config.speech\u recognition\u language=“en-US”
语音\配置.请求\单词\级别\时间戳()
语音配置启用听写()
speech\u config.output\u format=speechsdk.OutputFormat(1)
语音识别器=speechsdk.SpeechRecognizer(语音配置=语音配置,音频配置=音频输入)
#结果=语音识别器。识别一次()
所有结果=[]
#https://docs.microsoft.com/en-us/python/api/azure-cognitiveservices-speech/azure.cognitiveservices.speech.recognitionresult?view=azure-蟒蛇
def处理最终结果(evt):
所有_results.append(evt.result.text)
完成=错误
def停止_cb(evt):
打印({}结束。格式(evt))
语音识别器。停止连续识别()
非局部完成
完成=正确
#将识别的文本追加到all_results变量。
语音识别器。识别。连接(处理最终结果)
#将回调连接到语音识别器触发的事件并显示信息/状态
#参考:https://docs.microsoft.com/en-us/python/api/azure-cognitiveservices-speech/azure.cognitiveservices.speech.eventsignal?view=azure-蟒蛇
语音识别器。识别。连接(lambda evt:print('recogniting:{}'。格式(evt)))
语音识别器.recognized.connect(lambda evt:print('recognized:{}.format(evt)))
语音识别器.session\u started.connect(lambda evt:print('session started:{}'.format(evt)))
语音识别器.session\u stopped.connect(lambda evt:print('session stopped{}.format(evt)))
语音识别器.cancelled.connect(lambda evt:print('cancelled{}.format(evt)))
#在会话已停止或已取消的事件上停止连续识别
语音识别器。会话已停止。连接(停止\u cb)
语音识别器。已取消。连接(停止\u cb)
语音识别器。启动连续语音识别()
虽然没有这样做:
时间。睡眠(.5)
打印(“打印所有结果:”)
打印(所有结果)
#通过函数调用转换
演讲稿

当我试图在主函数中添加上述代码时,它没有按预期工作。。编辑了上面的代码,但是如果我们不通过主函数调用,它工作得很好。。请导游@RaphaelTitus-我已经将代码封装在一个函数中,并尝试调用该函数。刚刚做了一些调整。请检查并让我知道。@Raphael这对您的要求有帮助吗?很抱歉耽搁了。。通过azure中的应用程序功能测试,它运行良好。。非常感谢。如果我们需要一个详细的输出,比如发音、说话人计数等,那么我们应该在这个SDK中使用哪些参数或选项。。
import azure.cognitiveservices.speech as speechsdk
import time
import datetime

def speech_to_text():
    """Transcribe a local audio file to text using continuous recognition.

    Continuous recognition captures every utterance in the file (unlike
    recognize_once, which returns only the speech up to the first pause).
    Finalized texts are accumulated in all_results and printed once the
    recognition session ends.
    """
    
    # Creates an instance of a speech config with specified subscription key and service region.
    # Replace with your own subscription key and region identifier from here: https://aka.ms/speech/sdkregion
    speech_key, service_region = "<>", "<>"
    speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)

    # Creates an audio configuration that points to an audio file.
    # Replace with your own audio filename.
    audio_filename = "whatstheweatherlike.wav"
    audio_input = speechsdk.audio.AudioConfig(filename=audio_filename)

    # Creates a recognizer with the given settings:
    # US-English dictation with word-level timestamps, detailed output format.
    speech_config.speech_recognition_language="en-US"
    speech_config.request_word_level_timestamps()
    speech_config.enable_dictation()
    speech_config.output_format = speechsdk.OutputFormat(1)

    speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_input)

    #result = speech_recognizer.recognize_once()
    all_results = []



    # Collects the finalized text of each recognized utterance.
    #https://docs.microsoft.com/en-us/python/api/azure-cognitiveservices-speech/azure.cognitiveservices.speech.recognitionresult?view=azure-python
    def handle_final_result(evt):
        all_results.append(evt.result.text) 
    
    
    done = False

    def stop_cb(evt):
        print('CLOSING on {}'.format(evt))
        speech_recognizer.stop_continuous_recognition()
        # `done` lives in speech_to_text's scope, so `nonlocal` (not
        # `global`) is required for this closure to rebind it.
        nonlocal done
        done= True

    #Appends the recognized text to the all_results variable. 
    speech_recognizer.recognized.connect(handle_final_result) 

    #Connect callbacks to the events fired by the speech recognizer & displays the info/status
    #Ref:https://docs.microsoft.com/en-us/python/api/azure-cognitiveservices-speech/azure.cognitiveservices.speech.eventsignal?view=azure-python   
    speech_recognizer.recognizing.connect(lambda evt: print('RECOGNIZING: {}'.format(evt)))
    speech_recognizer.recognized.connect(lambda evt: print('RECOGNIZED: {}'.format(evt)))
    speech_recognizer.session_started.connect(lambda evt: print('SESSION STARTED: {}'.format(evt)))
    speech_recognizer.session_stopped.connect(lambda evt: print('SESSION STOPPED {}'.format(evt)))
    speech_recognizer.canceled.connect(lambda evt: print('CANCELED {}'.format(evt)))
    # stop continuous recognition on either session stopped or canceled events
    speech_recognizer.session_stopped.connect(stop_cb)
    speech_recognizer.canceled.connect(stop_cb)

    speech_recognizer.start_continuous_recognition()

    # Recognition runs on a background thread; block until stop_cb fires.
    while not done:
        time.sleep(.5)
            
    print("Printing all results:")
    print(all_results)

#calling the conversion through a function    
speech_to_text()