Python PyAudio-将stream.read转换为int以获得振幅

Python PyAudio-将stream.read转换为int以获得振幅,python,python-2.7,stream,pyaudio,Python,Python 2.7,Stream,Pyaudio,我试图录制音频,同时打印录制信号的振幅。所以我将所有数据保存在stream.read中。但是当我试图打印它们时,我有一个字节字符串,没有整数。我想知道如何转换这些符号以获得振幅 这是我的代码: import pyaudio import wave CHUNK = 1024 FORMAT = pyaudio.paInt16 CHANNELS = 1 RATE = 44100 RECORD_SECONDS = 5 WAVE_OUTPUT_FILENAME = "output.wav" p

我试图录制音频,同时打印录制信号的振幅。所以我将所有数据保存在stream.read中。但是当我试图打印它们时,我有一个字节字符串,没有整数。我想知道如何转换这些符号以获得振幅

这是我的代码:

import pyaudio
import wave

# Capture settings: CHUNK frames per read, mono 16-bit samples, for
# RECORD_SECONDS seconds.
CHUNK = 1024
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 44100
RECORD_SECONDS = 5
WAVE_OUTPUT_FILENAME = "output.wav"

p = pyaudio.PyAudio()
stream = p.open(
    format=FORMAT,
    channels=CHANNELS,
    rate=RATE,
    input=True,
    frames_per_buffer=CHUNK,
)

print("* recording")

# Number of CHUNK-sized reads that fit into RECORD_SECONDS.
read_count = int(RATE / CHUNK * RECORD_SECONDS)
# Each element is the raw buffer returned by one read
# (2 bytes, i.e. 16 bits, per sample per channel).
frames = [stream.read(CHUNK) for _ in range(read_count)]

print("* done recording")

stream.stop_stream()
stream.close()
p.terminate()

# Dump every captured buffer exactly as it was read (still raw bytes).
for data in frames:
    print(data)
这就是我得到的:

������������������������������������������������������������������

��������� ���������������

%�� ��(��)����,����.����%����#�� �� �� �����������������������


PyAudio正在以字符串中的字节形式为您提供二进制编码的音频帧。有关如何打印帧的可读表示形式,请参阅此问题的答案:


您当然可以从以下代码中获得启发:

#!/usr/bin/python

# open a microphone in pyAudio and listen for taps

import pyaudio
import struct
import math

# Starting value of TapTester.tap_threshold; a block whose RMS amplitude
# (as returned by get_rms) exceeds the threshold is treated as noisy.
INITIAL_TAP_THRESHOLD = 0.010
# Sample format passed to PyAudio when the stream is opened.
FORMAT = pyaudio.paInt16 
# Factor each 16-bit sample is multiplied by in get_rms (1/32768).
SHORT_NORMALIZE = (1.0/32768.0)
# Channel count passed to PyAudio when the stream is opened.
CHANNELS = 2
# Sample rate passed to PyAudio when the stream is opened.
RATE = 44100  
# Duration of one analysed block; multiplied by RATE below to get a frame
# count (presumably seconds, given RATE is in frames per second).
INPUT_BLOCK_TIME = 0.05
# Number of frames requested from the stream on every read.
INPUT_FRAMES_PER_BLOCK = int(RATE*INPUT_BLOCK_TIME)
# if we get this many noisy blocks in a row, increase the threshold
OVERSENSITIVE = 15.0/INPUT_BLOCK_TIME                    
# if we get this many quiet blocks in a row, decrease the threshold
UNDERSENSITIVE = 120.0/INPUT_BLOCK_TIME 
# if the noise was longer than this many blocks, it's not a 'tap'
MAX_TAP_BLOCKS = 0.15/INPUT_BLOCK_TIME

def get_rms( block ):
    """Return the RMS amplitude of a block of raw 16-bit samples.

    RMS amplitude is the square root of the mean of the squared samples.
    Every sample is first multiplied by SHORT_NORMALIZE, so the result
    lies between 0.0 and 1.0.

    Args:
        block: Raw byte string holding native-endian signed 16-bit samples
            (two bytes per sample).

    Returns:
        The RMS amplitude as a float; 0.0 when the block holds no samples.

    Raises:
        struct.error: If the length of ``block`` is not a multiple of two.
    """
    # One short per two bytes. Floor division keeps the count an integer
    # on Python 3 as well as Python 2.
    count = len(block) // 2
    if count == 0:
        # Nothing to average; avoid dividing by zero below.
        return 0.0

    fmt = "%dh" % count
    shorts = struct.unpack( fmt, block )

    sum_squares = 0.0
    for sample in shorts:
        # sample is a signed short in +/- 32768; normalize it to 1.0
        n = sample * SHORT_NORMALIZE
        sum_squares += n*n

    return math.sqrt( sum_squares / count )

class TapTester(object):
    """Listen on a microphone and report short bursts of noise as taps.

    Every call to listen() reads one block from the input stream, measures
    its RMS amplitude with get_rms() and compares it with an adaptive
    threshold. A run of 1 to MAX_TAP_BLOCKS noisy blocks that is followed
    by a quiet block is reported through tapDetected().
    """

    def __init__(self):
        """Create the PyAudio instance, open the stream and reset counters."""
        self.pa = pyaudio.PyAudio()
        self.stream = self.open_mic_stream()
        self.tap_threshold = INITIAL_TAP_THRESHOLD
        # Start above MAX_TAP_BLOCKS so that the very first quiet block is
        # not reported as the end of a tap.
        self.noisycount = MAX_TAP_BLOCKS+1
        self.quietcount = 0
        self.errorcount = 0

    def stop(self):
        """Stop and close the input stream, then release PyAudio."""
        self.stream.stop_stream()
        self.stream.close()
        self.pa.terminate()

    def find_input_device(self):
        """Return the index of the first device that looks like a microphone.

        Every device name is printed. The first device whose lower-cased
        name contains "mic" or "input" wins.

        Returns:
            The device index, or None when no name matched, in which case
            the caller falls back to the default input device.
        """
        for i in range( self.pa.get_device_count() ):
            devinfo = self.pa.get_device_info_by_index(i)
            print( "Device %d: %s"%(i,devinfo["name"]) )

            for keyword in ["mic","input"]:
                if keyword in devinfo["name"].lower():
                    print( "Found an input: device %d - %s"%        (i,devinfo["name"]) )
                    return i

        # Reaching this point means no device name matched.
        print( "No preferred input found; using default input device." )
        return None

    def open_mic_stream( self ):
        """Open and return an input stream on the preferred input device."""
        device_index = self.find_input_device()

        stream = self.pa.open(   format = FORMAT,
                                 channels = CHANNELS,
                                 rate = RATE,
                                 input = True,
                                 input_device_index = device_index,
                                 frames_per_buffer = INPUT_FRAMES_PER_BLOCK)

        return stream

    def tapDetected(self):
        """Report a detected tap by printing a message."""
        # Parenthesised form prints the same text on Python 2 and Python 3.
        print("Tap!")

    def listen(self):
        """Read one block and update the tap-detection state.

        A read error is counted and printed, and the block is skipped.
        Otherwise the block is classified as noisy or quiet by comparing
        its RMS amplitude with the current threshold.
        """
        try:
            block = self.stream.read(INPUT_FRAMES_PER_BLOCK)
        except IOError as e:
            self.errorcount += 1
            print( "(%d) Error recording: %s"%(self.errorcount,e) )
            self.noisycount = 1
            return

        amplitude = get_rms( block )
        if amplitude > self.tap_threshold:
            # noisy block
            self.quietcount = 0
            self.noisycount += 1
            if self.noisycount > OVERSENSITIVE:
                # turn down the sensitivity
                self.tap_threshold *= 1.1
        else:
            # quiet block: a short preceding run of noise counts as a tap
            if 1 <= self.noisycount <= MAX_TAP_BLOCKS:
                self.tapDetected()
            self.noisycount = 0
            self.quietcount += 1
            if self.quietcount > UNDERSENSITIVE:
                # turn up the sensitivity
                self.tap_threshold *= 0.9

if __name__ == "__main__":
    # Listen to 1000 blocks, then release the audio stream even if a
    # read raised an unexpected error or the user interrupted the loop.
    tt = TapTester()
    try:
        for i in range(1000):
            tt.listen()
    finally:
        tt.stop()
!/usr/bin/python
#在pyAudio中打开麦克风并聆听敲击声
导入pyaudio
导入结构
输入数学
初始_-TAP_阈值=0.010
格式=pyaudio.paInt16
SHORT_NORMALIZE=(1.0/32768.0)
通道=2
费率=44100
输入\块\时间=0.05
每个块的输入帧=int(速率*输入块时间)
#如果我们在一行中获得如此多的噪声块,请增加阈值
过敏=15.0/输入\块\时间
#如果我们连续获得这么多安静的块,请降低阈值
欠灵敏=120.0/输入\块\时间
#如果噪音超过这么多街区,那就不是“敲击声”
最大点击块=0.15/输入块时间
def get_rms(块):
#均方根振幅定义为
#振幅平方随时间的平均值。
#所以我们需要把这个字节串转换成
#一个由16位样本组成的字符串。。。
#我们每个人少一个
#字符串中有两个字符。
计数=长度(块)/2
format=“%dh”%(计数)
shorts=结构解包(格式、块)
#在块上迭代。
平方和=0.0
对于短裤样品:
#样本为+/-32768中的有符号短路。
#将其标准化为1.0
n=样本*短\标准化
平方和+=n*n
返回数学.sqrt(平方和/计数)
类别测试仪(对象):
定义初始化(自):
self.pa=pyaudio.pyaudio()
self.stream=self.open\u mic\u stream()
self.tap\u threshold=初始\u tap\u threshold
self.noisycount=最大抽头块数+1
self.quietcount=0
self.errorcount=0
def停止(自):
self.stream.close()
def查找输入设备(自身):
设备索引=无
对于范围内的i(self.pa.get_device_count()):
devinfo=self.pa.get\u设备\u信息\u索引(i)
打印(“设备%d:%s%”(i,设备信息[“名称]))
对于[“麦克风”、“输入”]中的关键字:
devinfo[“name”]中的if关键字。lower()
打印(“找到输入:设备%d-%s”%(i,设备信息[“名称”]))
设备索引=i
返回设备索引
如果设备索引==无:
打印(“未找到首选输入;使用默认输入设备。”)
返回设备索引
def open_mic_流(自):
设备索引=self.find\u输入设备()
stream=self.pa.open(format=format,
频道=频道,
比率=比率,
输入=真,
输入设备索引=设备索引,
每个缓冲区的帧数=每个块的输入帧数)
回流
检测到def TAP(自身):
打印“点击!”
def监听(self):
尝试:
block=self.stream.read(每个块输入帧)
除IOError外,e:
#该死。
self.errorcount+=1
打印((%d)错误记录:%s“%(self.errorcount,e))
self.noisycount=1
返回
振幅=获取均方根值(块)
如果振幅>自拍阈值:
#噪声块
self.quietcount=0
自噪声计数+=1
如果self.noisycount>过敏感:
#把灵敏度调低
self.tap_阈值*=1.1
其他:
#安静的街区。

我认为你可以这样做:

import struct

data = stream.read(CHUNK)
# stream.read() returns raw bytes, so iterating over it directly yields
# single bytes (or 1-character strings on Python 2), not amplitudes.
# Decode the buffer into signed 16-bit integers first.
# NOTE(review): assumes the stream was opened with pyaudio.paInt16, as in
# the question's code; use another struct format character otherwise.
samples = struct.unpack("%dh" % (len(data) // 2), data)
for each in samples:
    print(each)

我猜这个问题很老了,我一直在寻找其他答案,但在我的项目中,我使用了类似的东西

#Lets assume the constants are defined somewhere

import struct
import pyaudio
import numpy as np

# Capture-only stream: one channel of signed 16-bit samples at 44100 Hz.
stream_options = dict(
    format=pyaudio.paInt16,
    channels=1,
    rate=44100,
    input=True,
    output=False,
    frames_per_buffer=1024,
)
self.input = pyaudio.PyAudio().open(**stream_options)

# Read self.CHUNK frames as one raw buffer.
raw_block = self.input.read(self.CHUNK)
# 'h' is the struct code for a signed 16-bit integer; the count prefix
# asks for self.CHUNK of them in one call.
unpack_format = str(self.CHUNK) + 'h'
wf_data = np.array(struct.unpack(unpack_format, raw_block))
paInt16和“h”对应。您可以在此处找出与pyaudio格式匹配的字母。

归功于:

处理音频时,您可能需要信号缓冲区的RMS(均方根)值。我相信它可以更好地“查看”音频信号的整体功率

Python标准库有一个名为audioop的模块,该模块有一个名为rms的函数

import pyaudio
import time
import audioop

def get_rms(chunk=8, width=2, channels=1, rate=44100):
    """Yield the RMS value of successive chunks read from an input stream.

    This is a generator that never finishes on its own: it keeps reading
    from the input stream for as long as the caller keeps iterating.

    Args:
        chunk: Number of frames read from the stream per yielded value.
        width: Sample width in bytes. It selects the PyAudio sample format
            and is passed to ``audioop.rms`` so the bytes are interpreted
            with the same width they were recorded with.
        channels: Number of input channels.
        rate: Sample rate passed to PyAudio.

    Yields:
        The value returned by ``audioop.rms`` for each chunk.

    NOTE(review): ``audioop`` was deprecated in Python 3.11 and removed in
    Python 3.13; on newer interpreters a replacement is needed.
    """
    p = pyaudio.PyAudio()
    # Stays None if open() fails, so the cleanup below can tell whether
    # there is a stream to close.
    stream = None

    try:
        stream = p.open(format=p.get_format_from_width(width),
                        channels=channels,
                        rate=rate,
                        input=True,
                        output=False,
                        frames_per_buffer=chunk)
        # wait a second to allow the stream to be setup
        time.sleep(1)
        while True:
            # read the data
            data = stream.read(chunk, exception_on_overflow=False)
            yield audioop.rms(data, width)
    finally:
        # Close the stream before terminating the PyAudio instance.
        if stream is not None:
            stream.stop_stream()
            stream.close()
        p.terminate()
您可以像这样使用函数

# get_rms() is a generator; every iteration produces the next RMS reading.
for rms in get_rms():
    print(rms)

谢谢你的回答。我刚刚添加了一行“decoded=numpy.fromstring(data,'Float32');”在我的for循环中,但结果不是结论。我得到了一个非常小的数字列表,如:3.67348991e-40 6.42851276e-40 3.67355998e-40 6.42868091e-40 2.75502285e-40 1.10201895e-39 nan 4.59204105e-40 1.19389508e-39 1.37756747e-39您需要为数据使用正确的格式。请尝试
decoded = numpy.fromstring(data, dtype=numpy.int16)
。我建议
numpy.int16
,因为您似乎已将流定义为由16位整数样本组成。如果您想尝试不同的样本格式,以下是numpy支持的样本格式列表:感谢您的回答。我刚刚添加了类get_rms,并将值保存在列表中,一切都很好。I g
# Print each RMS reading as the get_rms() generator produces it.
for rms in get_rms():
    print(rms)