使用Python和Pocketsphinx进行实时识别_Python_Cmusphinx

使用Python和Pocketsphinx进行实时识别

python

使用Python和Pocketsphinx进行实时识别,python,cmusphinx,Python,Cmusphinx,我最近一直在使用python中的pocket sphinx。我成功地得到了那份工作下面的示例用于识别记录的wav #!/usr/bin/env python import sys,os def decodeSpeech(hmmd,lmdir,dictp,wavfile): """ Decodes a speech file """ try: import pocketsphinx as ps import sp

我最近一直在 Python 中使用 pocketsphinx。我已经成功让下面这个示例运行起来，它用于识别一个已录制好的 wav 文件：

#!/usr/bin/env python

import sys,os



def decodeSpeech(hmmd,lmdir,dictp,wavfile):
    """
    Decode a recorded WAV file with pocketsphinx.

    hmmd    -- value passed to ``ps.Decoder`` as ``hmm``
    lmdir   -- value passed to ``ps.Decoder`` as ``lm``
    dictp   -- value passed to ``ps.Decoder`` as ``dict``
    wavfile -- path of the WAV file to decode

    Returns the first element of ``Decoder.get_hyp()``.

    Raises ImportError when pocketsphinx or sphinxbase cannot be imported.
    Previously the failure was only printed and the function then crashed
    on the unbound name ``ps``.
    """
    try:
        import pocketsphinx as ps
        import sphinxbase
    except ImportError:
        print("""Pocket sphinx and sphixbase is not installed
        in your system. Please install it with package manager.
        """)
        # Without the bindings nothing below can work, so surface the
        # real error instead of continuing.
        raise

    speechRec = ps.Decoder(hmm = hmmd, lm = lmdir, dict = dictp)

    # ``open`` replaces the Python 2-only ``file`` builtin; the ``with``
    # block guarantees the handle is closed even if decoding fails.
    with open(wavfile, 'rb') as wavFile:
        # Skip the first 44 bytes (presumably the canonical PCM WAV header
        # -- assumes the file has no extra RIFF chunks; TODO confirm).
        wavFile.seek(44)
        speechRec.decode_raw(wavFile)

    result = speechRec.get_hyp()
    return result[0]



if __name__ == "__main__":
    # Script entry point: decode one recorded WAV file and print the
    # result between two separator lines. All paths below are specific
    # to the original author's machine.
    wavfile = "/home/jaganadhg/Desktop/Docs_New/kgisl/sa1.wav"
    dictd = "/home/jaganadhg/Desktop/Docs_New/kgisl/model/lm/wsj/wlist5o.dic"
    lmd = "/home/jaganadhg/Desktop/Docs_New/kgisl/model/lm/wsj/wlist5o.3e-7.vp.tg.lm.DMP"
    hmdir = "/home/jaganadhg/Desktop/Docs_New/kgisl/model/hmm/wsj1"

    recognised = decodeSpeech(hmdir,lmd,dictd,wavfile)

    banner = "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%"
    # Single-argument print(...) behaves the same as the print statement
    # under Python 2.
    for line in (banner, recognised, banner):
        print(line)

问题是：如何通过麦克风进行实时语音识别？我希望在一个带有 if 语句的 while 循环中，一旦从麦克风识别出预设的单词，就可以调用某个函数。

用于实时识别的代码如下所示：

您也可以通过 GStreamer 的 Python 绑定来使用 pocketsphinx，不妨查看一下并试一试。Pocketsphinx 现在已经是一个 GStreamer 插件。

这是我在网上找到的代码，我修改了一些地方，以便真正识别出指定的单词，但识别效果“非常糟糕而且很慢”。希望有人能帮我彻底改进它。它是在 Ubuntu 16.04 LTS 上编写的。我对编程了解不多，期待大家的帮助。

    # -*- encoding: utf-8 -*-
    #!/usr/bin/env python

    from pocketsphinx.pocketsphinx import *
    from sphinxbase.sphinxbase import *

    import os
    import pyaudio
    import wave
    import audioop
    from collections import deque
    import time
    import math;import Mic

    """
    Written by Sophie Li, 2016
    http://blog.justsophie.com/python-speech-to-text-with-pocketsphinx/
    """

    class SpeechDetector:
        """Record phrases from the microphone and decode them with pocketsphinx.

        ``run()`` reads fixed-size chunks from a PyAudio input stream. While
        any chunk in a sliding window is louder than ``THRESHOLD`` the audio
        is buffered; once the window is quiet again the phrase is written to
        a temporary WAV file, decoded, and the file is removed. Listening
        stops when the decoded words contain both "no" and "one".

        Relies on the file-level imports: ``pyaudio``, ``audioop``, ``wave``,
        ``Decoder`` (pocketsphinx) and the project module ``Mic``.
        """

        def __init__(self, model_dir="/home/l/Desktop/pocketsphinx/model/en-us"):
            """Set the audio parameters and build the pocketsphinx decoder.

            model_dir -- directory containing the ``en-us`` acoustic model
                         folder, ``en-us.lm.bin`` and ``cmudict-en-us.dict``.
                         The default is the location the script originally
                         hard-coded, so ``SpeechDetector()`` behaves as before.
            """
            # Microphone stream config.
            self.CHUNK = 1024  # size passed to stream.read() on every iteration
            self.FORMAT = pyaudio.paInt16
            self.CHANNELS = 1
            self.RATE = 16000

            # Silence limit in seconds: once no chunk in a window of this
            # length exceeds THRESHOLD, the recording finishes and is decoded.
            self.SILENCE_LIMIT = 1

            # Seconds of audio kept from *before* noise was detected and
            # prepended to the phrase, so its beginning is not chopped off.
            self.PREV_AUDIO = 0.5

            self.THRESHOLD = 4500  # replaced by setup_mic() when run() starts
            self.num_phrases = -1

            # Create a decoder with the given model. The file names are joined
            # as *relative* components: joining an absolute path would make
            # os.path.join() discard model_dir entirely.
            config = Decoder.default_config()
            config.set_string('-hmm', os.path.join(model_dir, 'en-us', ''))
            config.set_string('-lm', os.path.join(model_dir, 'en-us.lm.bin'))
            config.set_string('-dict', os.path.join(model_dir, 'cmudict-en-us.dict'))
            # NOTE(review): both '-lm' and '-keyphrase' are configured; which
            # search mode pocketsphinx ends up using should be confirmed.
            config.set_string('-keyphrase', 'no one')
            config.set_float('-kws_threshold', 1e+20)

            # Decoder object used for all phrases.
            self.decoder = Decoder(config)

        def _intensity(self, chunk):
            """Return the loudness measure used for threshold comparisons."""
            # NOTE(review): width 4 is passed although FORMAT is 16-bit; the
            # THRESHOLD constants are tuned against this value, so it is kept
            # unchanged -- confirm before "correcting" it to 2.
            return math.sqrt(abs(audioop.avg(chunk, 4)))

        def _window_len(self, seconds):
            """Return how many whole chunks cover `seconds` of audio, as an int."""
            # deque(maxlen=...) needs an integer; floor division keeps the
            # Python 2 result (15 chunks per second for 16000/1024).
            return int(seconds * (self.RATE // self.CHUNK))

        def setup_mic(self, num_samples=50):
            """Calibrate ``self.THRESHOLD`` from the current microphone level.

            Reads `num_samples` chunks, averages the loudest 20% of their
            intensities and sets THRESHOLD to 3500 when that average is below
            3000, otherwise to the average plus 100.
            """
            p = pyaudio.PyAudio()
            try:
                stream = p.open(format=self.FORMAT,
                                channels=self.CHANNELS,
                                rate=self.RATE,
                                input=True,
                                frames_per_buffer=self.CHUNK)
                try:
                    values = [self._intensity(stream.read(self.CHUNK))
                              for _ in range(num_samples)]
                finally:
                    stream.close()
            finally:
                p.terminate()

            # At least one sample is averaged, so num_samples < 5 no longer
            # divides by zero.
            top_n = max(1, int(num_samples * 0.2))
            loudest = sorted(values, reverse=True)[:top_n]
            r = sum(loudest) / top_n

            if r < 3000:
                self.THRESHOLD = 3500
            else:
                self.THRESHOLD = r + 100

        def save_speech(self, data, p):
            """Write recorded chunks to a WAV file and return its filename.

            data -- iterable of byte strings as returned by stream.read()
            p    -- object providing ``get_sample_size(format)`` (the PyAudio
                    instance in run())

            The name is ``output_<unix seconds>.wav`` in the current
            directory, so two phrases saved within the same second share a
            filename.
            """
            filename = 'output_' + str(int(time.time())) + '.wav'
            wf = wave.open(filename, 'wb')
            try:
                # Use the same parameters the stream was opened with instead
                # of repeating the literals 1 / paInt16 / 16000.
                wf.setnchannels(self.CHANNELS)
                wf.setsampwidth(p.get_sample_size(self.FORMAT))
                wf.setframerate(self.RATE)
                wf.writeframes(b''.join(data))
            finally:
                wf.close()
            return filename

        def decode_phrase(self, wav_file):
            """Feed `wav_file` to the decoder and return the recognised words.

            The file is read in 1024-byte pieces, from byte 0, and passed to
            ``process_raw``. Returns the list of ``seg.word`` values from
            ``self.decoder.seg()``.
            """
            self.decoder.start_utt()
            with open(wav_file, "rb") as stream:
                for buf in iter(lambda: stream.read(1024), b''):
                    self.decoder.process_raw(buf, False, False)
            self.decoder.end_utt()
            return [seg.word for seg in self.decoder.seg()]

        def _listen(self, stream, p):
            """Record and decode phrases from `stream` until the hot word is heard."""
            silence_len = self._window_len(self.SILENCE_LIMIT)
            prev_len = self._window_len(self.PREV_AUDIO)

            audio2send = []
            slid_win = deque(maxlen=silence_len)
            # Audio from just before noise was detected, prepended to the phrase.
            prev_audio = deque(maxlen=prev_len)
            started = False

            while True:
                cur_data = stream.read(self.CHUNK)
                slid_win.append(self._intensity(cur_data))

                if any(x > self.THRESHOLD for x in slid_win):
                    if not started:
                        print("Bắt đầu ghi âm")
                        started = True
                    audio2send.append(cur_data)

                elif started:
                    print("Hoàn thành ghi âm")
                    filename = self.save_speech(list(prev_audio) + audio2send, p)
                    try:
                        r = self.decode_phrase(filename)
                    finally:
                        # Remove the temp audio file even when decoding fails.
                        os.remove(filename)
                    # Same text the Python 2 statement `print "RESULT: ", r` wrote.
                    print("RESULT:  %s" % (r,))
                    # Hot word "no one": stop once both words were recognised.
                    if "one" in r and "no" in r:
                        Mic.playaudiofromAudio().play("/home/l/Desktop/PROJECT/Audio/beep_hi.wav")
                        return
                    # Reset all state for the next phrase.
                    started = False
                    slid_win = deque(maxlen=silence_len)
                    prev_audio = deque(maxlen=prev_len)
                    audio2send = []
                    print("Chế độ nghe ...")

                else:
                    prev_audio.append(cur_data)

        def run(self):
            """
            Listens to Microphone, extracts phrases from it and calls pocketsphinx
            to decode the sound. Returns after the hot word "no one" is heard.
            """
            self.setup_mic()

            p = pyaudio.PyAudio()
            try:
                stream = p.open(format=self.FORMAT,
                                channels=self.CHANNELS,
                                rate=self.RATE,
                                input=True,
                                frames_per_buffer=self.CHUNK)
                try:
                    self._listen(stream, p)
                    print("* Hoàn thành nghe")
                finally:
                    # Previously unreachable after `while True`; the stream is
                    # now released on the hot-word return and on errors.
                    stream.close()
            finally:
                p.terminate()

#-*-编码：utf-8-*-
#!/usr/bin/env python
从pocketsphinx.pocketsphinx导入*
从sphinxbase.sphinxbase导入*
导入操作系统
导入pyaudio
输入波
导入音频操作
从集合导入deque
导入时间
导入数学；进口话筒
"""
作者：Sophie Li，2016
http://blog.justsophie.com/python-speech-to-text-with-pocketsphinx/
"""
课堂演讲检测器：
定义初始化（自）：
#麦克风流配置。
self.CHUNK=1024#每次从麦克风读取的字节块
self.FORMAT=pyaudio.paInt16
self.CHANNELS=1
自费率=16000
self.SILENCE_LIMIT=1#以秒为单位的SILENCE LIMIT。最大秒数，其中
#只有沉默被记录下来。当这段时间过去的时候
#录制完成，文件被解码
self.PREV_AUDIO=0.5#前一个音频（以秒为单位）前置。当噪音
#检测到之前录制的音频有多少
#预先准备好的。这有助于防止切碎开始
#这句话的意思。
self.THRESHOLD=4500
self.num_短语=-1
#这些将需要根据pocketsphinx文件夹所在的位置进行修改
MODELDIR=“/home/l/Desktop/pocketsphinx/model/en-us”
#创建具有特定模型的解码器
config=Decoder.default\u config（）
config.set_字符串（'-hmm'，os.path.join（MODELDIR，'/home/l/Desktop/pocketsphinx/model/en-us/en-us/'））
config.set_字符串（'-lm'，os.path.join（MODELDIR，'/home/l/Desktop/pocketsphinx/model/en-us/en-us.lm.bin'））
config.set_字符串（'-dict'，os.path.join（MODELDIR，'/home/l/Desktop/pocketsphinx/model/en-us/cmudit en-us.dict'））
config.set_字符串（'-keyphase'，noone'）
配置设置浮点数（'-kws_阈值'，1e+20）
#用于流数据的Creaders解码器对象。
self.decoder=解码器（配置）
def设置麦克风（自身，样本数=50）：
“”获取麦克风声音的平均音频强度。您可以使用它来获取
你说话和/或沉默时的平均强度
是记录的最大强度中.2的平均值。
"""
#打印“从麦克风获取强度值”
p=pyaudio.pyaudio（）
流=p.open（格式=self.format，
通道=自通道，
速率=自速率，
输入=真，
帧（每个缓冲区=self.CHUNK）
value=[math.sqrt（abs（audioop.avg（stream.read（self.CHUNK），4）））
对于范围内的x（数量样本）]
值=已排序（值，反向=真）
r=总和（值[：int（num_samples*0.2）]）/int（num_samples*0.2）
#打印“已完成”
#打印“平均音频强度为”，r
stream.close（）
p、 终止（）
如果r<3000：
自阈值=3500
其他：
自阈值=r+100
def save_语音（自我、数据、p）：
"""
将麦克风数据保存到临时WAV文件。返回保存的麦克风数据的文件名
文件
"""
文件名='output\'+str（int（time.time（）））
#将数据写入WAV文件
数据=“”。连接（数据）
wf=wave.open（文件名+'.wav'，wb'）
wf.设置通道（1）
wf.setsampwidth（p.get_样本大小（pyaudio.paInt16））
setframerate（16000）#如何将此值设为函数参数？
wf.writeframes（数据）
wf.close（）
返回文件名+'.wav'
def解码_短语（自身、wav_文件）：
self.decoder.start_utt（）
流=打开（wav_文件，“rb”）
尽管如此：
buf=流读取（1024）
如果buf：
self.decoder.process_raw（buf，False，False）
其他：
打破
self.decoder.end_utt（）
单词=[]
[self.decoder.seg（）中seg的words.append（seg.word）]
回话
def运行（自）：
"""
听麦克风，从中提取短语并呼叫Pocketsphenx
解码声音
"""
self.setup_mic（）
#明流
p=pyaudio.pyaudio（）
流=p.open（格式=self.format，
通道=自通道，
速率=自速率，
输入=真，
帧（每个缓冲区=self.CHUNK）
audio2send=[]
cur_data=''当前音频数据块
rel=self.RATE/self.CHUNK
滑动\u win=deque（maxlen=self.silen\u LIMIT*rel）
#在检测到噪音前0.5秒开始预编音频
prev_audio=deque（maxlen=self.prev_audio
    # -*- encoding: utf-8 -*-
    #!/usr/bin/env python

    from pocketsphinx.pocketsphinx import *
    from sphinxbase.sphinxbase import *

    import os
    import pyaudio
    import wave
    import audioop
    from collections import deque
    import time
    import math;import Mic

    """
    Written by Sophie Li, 2016
    http://blog.justsophie.com/python-speech-to-text-with-pocketsphinx/
    """

    class SpeechDetector:
        """Record phrases from the microphone and decode them with pocketsphinx.

        ``run()`` reads fixed-size chunks from a PyAudio input stream. While
        any chunk in a sliding window is louder than ``THRESHOLD`` the audio
        is buffered; once the window is quiet again the phrase is written to
        a temporary WAV file, decoded, and the file is removed. Listening
        stops when the decoded words contain both "no" and "one".

        Relies on the file-level imports: ``pyaudio``, ``audioop``, ``wave``,
        ``Decoder`` (pocketsphinx) and the project module ``Mic``.
        """

        def __init__(self, model_dir="/home/l/Desktop/pocketsphinx/model/en-us"):
            """Set the audio parameters and build the pocketsphinx decoder.

            model_dir -- directory containing the ``en-us`` acoustic model
                         folder, ``en-us.lm.bin`` and ``cmudict-en-us.dict``.
                         The default is the location the script originally
                         hard-coded, so ``SpeechDetector()`` behaves as before.
            """
            # Microphone stream config.
            self.CHUNK = 1024  # size passed to stream.read() on every iteration
            self.FORMAT = pyaudio.paInt16
            self.CHANNELS = 1
            self.RATE = 16000

            # Silence limit in seconds: once no chunk in a window of this
            # length exceeds THRESHOLD, the recording finishes and is decoded.
            self.SILENCE_LIMIT = 1

            # Seconds of audio kept from *before* noise was detected and
            # prepended to the phrase, so its beginning is not chopped off.
            self.PREV_AUDIO = 0.5

            self.THRESHOLD = 4500  # replaced by setup_mic() when run() starts
            self.num_phrases = -1

            # Create a decoder with the given model. The file names are joined
            # as *relative* components: joining an absolute path would make
            # os.path.join() discard model_dir entirely.
            config = Decoder.default_config()
            config.set_string('-hmm', os.path.join(model_dir, 'en-us', ''))
            config.set_string('-lm', os.path.join(model_dir, 'en-us.lm.bin'))
            config.set_string('-dict', os.path.join(model_dir, 'cmudict-en-us.dict'))
            # NOTE(review): both '-lm' and '-keyphrase' are configured; which
            # search mode pocketsphinx ends up using should be confirmed.
            config.set_string('-keyphrase', 'no one')
            config.set_float('-kws_threshold', 1e+20)

            # Decoder object used for all phrases.
            self.decoder = Decoder(config)

        def _intensity(self, chunk):
            """Return the loudness measure used for threshold comparisons."""
            # NOTE(review): width 4 is passed although FORMAT is 16-bit; the
            # THRESHOLD constants are tuned against this value, so it is kept
            # unchanged -- confirm before "correcting" it to 2.
            return math.sqrt(abs(audioop.avg(chunk, 4)))

        def _window_len(self, seconds):
            """Return how many whole chunks cover `seconds` of audio, as an int."""
            # deque(maxlen=...) needs an integer; floor division keeps the
            # Python 2 result (15 chunks per second for 16000/1024).
            return int(seconds * (self.RATE // self.CHUNK))

        def setup_mic(self, num_samples=50):
            """Calibrate ``self.THRESHOLD`` from the current microphone level.

            Reads `num_samples` chunks, averages the loudest 20% of their
            intensities and sets THRESHOLD to 3500 when that average is below
            3000, otherwise to the average plus 100.
            """
            p = pyaudio.PyAudio()
            try:
                stream = p.open(format=self.FORMAT,
                                channels=self.CHANNELS,
                                rate=self.RATE,
                                input=True,
                                frames_per_buffer=self.CHUNK)
                try:
                    values = [self._intensity(stream.read(self.CHUNK))
                              for _ in range(num_samples)]
                finally:
                    stream.close()
            finally:
                p.terminate()

            # At least one sample is averaged, so num_samples < 5 no longer
            # divides by zero.
            top_n = max(1, int(num_samples * 0.2))
            loudest = sorted(values, reverse=True)[:top_n]
            r = sum(loudest) / top_n

            if r < 3000:
                self.THRESHOLD = 3500
            else:
                self.THRESHOLD = r + 100

        def save_speech(self, data, p):
            """Write recorded chunks to a WAV file and return its filename.

            data -- iterable of byte strings as returned by stream.read()
            p    -- object providing ``get_sample_size(format)`` (the PyAudio
                    instance in run())

            The name is ``output_<unix seconds>.wav`` in the current
            directory, so two phrases saved within the same second share a
            filename.
            """
            filename = 'output_' + str(int(time.time())) + '.wav'
            wf = wave.open(filename, 'wb')
            try:
                # Use the same parameters the stream was opened with instead
                # of repeating the literals 1 / paInt16 / 16000.
                wf.setnchannels(self.CHANNELS)
                wf.setsampwidth(p.get_sample_size(self.FORMAT))
                wf.setframerate(self.RATE)
                wf.writeframes(b''.join(data))
            finally:
                wf.close()
            return filename

        def decode_phrase(self, wav_file):
            """Feed `wav_file` to the decoder and return the recognised words.

            The file is read in 1024-byte pieces, from byte 0, and passed to
            ``process_raw``. Returns the list of ``seg.word`` values from
            ``self.decoder.seg()``.
            """
            self.decoder.start_utt()
            with open(wav_file, "rb") as stream:
                for buf in iter(lambda: stream.read(1024), b''):
                    self.decoder.process_raw(buf, False, False)
            self.decoder.end_utt()
            return [seg.word for seg in self.decoder.seg()]

        def _listen(self, stream, p):
            """Record and decode phrases from `stream` until the hot word is heard."""
            silence_len = self._window_len(self.SILENCE_LIMIT)
            prev_len = self._window_len(self.PREV_AUDIO)

            audio2send = []
            slid_win = deque(maxlen=silence_len)
            # Audio from just before noise was detected, prepended to the phrase.
            prev_audio = deque(maxlen=prev_len)
            started = False

            while True:
                cur_data = stream.read(self.CHUNK)
                slid_win.append(self._intensity(cur_data))

                if any(x > self.THRESHOLD for x in slid_win):
                    if not started:
                        print("Bắt đầu ghi âm")
                        started = True
                    audio2send.append(cur_data)

                elif started:
                    print("Hoàn thành ghi âm")
                    filename = self.save_speech(list(prev_audio) + audio2send, p)
                    try:
                        r = self.decode_phrase(filename)
                    finally:
                        # Remove the temp audio file even when decoding fails.
                        os.remove(filename)
                    # Same text the Python 2 statement `print "RESULT: ", r` wrote.
                    print("RESULT:  %s" % (r,))
                    # Hot word "no one": stop once both words were recognised.
                    if "one" in r and "no" in r:
                        Mic.playaudiofromAudio().play("/home/l/Desktop/PROJECT/Audio/beep_hi.wav")
                        return
                    # Reset all state for the next phrase.
                    started = False
                    slid_win = deque(maxlen=silence_len)
                    prev_audio = deque(maxlen=prev_len)
                    audio2send = []
                    print("Chế độ nghe ...")

                else:
                    prev_audio.append(cur_data)

        def run(self):
            """
            Listens to Microphone, extracts phrases from it and calls pocketsphinx
            to decode the sound. Returns after the hot word "no one" is heard.
            """
            self.setup_mic()

            p = pyaudio.PyAudio()
            try:
                stream = p.open(format=self.FORMAT,
                                channels=self.CHANNELS,
                                rate=self.RATE,
                                input=True,
                                frames_per_buffer=self.CHUNK)
                try:
                    self._listen(stream, p)
                    print("* Hoàn thành nghe")
                finally:
                    # Previously unreachable after `while True`; the stream is
                    # now released on the hot-word return and on errors.
                    stream.close()
            finally:
                p.terminate()