如何提高 CMU Sphinx 的识别准确度?
我想用 PocketSphinx 做一些语音到文字的转换。我已经安装了 sphinxbase 和 pocketsphinx,并下载了声学模型、语言模型和字典。然后,我用如下代码进行了测试:如何提高 CMU Sphinx 的识别准确度?,c,speech-recognition,speech-to-text,alsa,C,Speech Recognition,Speech To Text,Alsa,我想用 PocketSphinx 做一些语音到文字的转换。我已经安装了 sphinxbase 和 pocketsphinx,并下载了声学模型、语言模型和字典。然后,我用如下代码进行了测试: #include <pocketsphinx/pocketsphinx.h> #include <stdio.h> #include <stdlib.h> #include "debug.h" int main(int argc, char *argv[]) { ps_decoder_t *ps; cm
#include <pocketsphinx/pocketsphinx.h>
#include <stdio.h>
#include <stdlib.h>
#include "debug.h"
/*
 * Decode the raw audio file "test.raw" with PocketSphinx and print the
 * recognized hypothesis on standard output.
 *
 * Every step that fails is reported through DBG and terminates the
 * process with exit status 1; on success the function returns 0.
 * argc and argv are not used.
 */
int main(int argc, char *argv[])
{
    cmd_ln_t *settings;
    ps_decoder_t *decoder;
    FILE *audio;
    char const *text;
    char const *utt_name;
    int32 best_score;

    /* Decoder configuration: acoustic model, language model and dictionary. */
    settings = cmd_ln_init(NULL, ps_args(), TRUE,
                           "-hmm", "/home/madper/speech/hub4opensrc.cd_continuous_8gau",
                           "-lm", "/home/madper/speech/language_model.arpaformat.DMP",
                           "-dict", "/home/madper/speech/cmudict/cmudict/sphinxdict/cmudict_SPHINX_40",
                           NULL);
    if (!settings) {
        DBG (("cmd_ln_init() failed.\n"));
        exit(1);
    }

    /* Create the decoder from the configuration. */
    decoder = ps_init(settings);
    if (!decoder) {
        DBG (("ps_init() failed.\n"));
        exit(1);
    }

    /* Open the recording in binary mode. */
    audio = fopen("test.raw", "rb");
    if (!audio) {
        DBG (("fopen() failed.\n"));
        exit(1);
    }

    /* Run recognition over the file; a negative result signals failure. */
    if (ps_decode_raw(decoder, audio, "test", -1) < 0) {
        DBG (("ps_decode_raw() error!\n"));
        exit(1);
    }

    /* Fetch the hypothesis string together with its score and utterance id. */
    text = ps_get_hyp(decoder, &best_score, &utt_name);
    if (!text) {
        DBG (("ps_get_hyp() failed!\n"));
        exit(1);
    }

    printf("Recognized: %s\n", text);

    fclose(audio);
    ps_free(decoder);
    return 0;
}
#define ALSA_PCM_NEW_HW_PARAMS_API
#include <alsa/asoundlib.h>
int main() {
long loops;
int rc;
int size;
snd_pcm_t *handle;
snd_pcm_hw_params_t *params;
unsigned int val;
int dir;
snd_pcm_uframes_t frames;
char *buffer;
/* Open PCM device for recording (capture). */
rc = snd_pcm_open(&handle, "default",
SND_PCM_STREAM_CAPTURE, 0);
if (rc < 0) {
fprintf(stderr,
"unable to open pcm device: %s\n",
snd_strerror(rc));
exit(1);
}
/* Allocate a hardware parameters object. */
snd_pcm_hw_params_alloca(¶ms);
/* Fill it in with default values. */
snd_pcm_hw_params_any(handle, params);
/* Set the desired hardware parameters. */
/* Interleaved mode */
snd_pcm_hw_params_set_access(handle, params,
SND_PCM_ACCESS_RW_INTERLEAVED);
/* Signed 16-bit little-endian format */
snd_pcm_hw_params_set_format(handle, params,
SND_PCM_FORMAT_S16_LE);
/* Two channels (stereo) */
snd_pcm_hw_params_set_channels(handle, params, 1);
/* 44100 bits/second sampling rate (CD quality) */
val = 16000;
snd_pcm_hw_params_set_rate_near(handle, params,
&val, &dir);
/* Set period size to 32 frames. */
frames = 16;
snd_pcm_hw_params_set_period_size_near(handle,
params, &frames, &dir);
/* Write the parameters to the driver */
rc = snd_pcm_hw_params(handle, params);
if (rc < 0) {
fprintf(stderr,
"unable to set hw parameters: %s\n",
snd_strerror(rc));
exit(1);
}
/* Use a buffer large enough to hold one period */
snd_pcm_hw_params_get_period_size(params,
&frames, &dir);
size = frames * 2; /* 2 bytes/sample, 2 channels */
buffer = (char *) malloc(size);
/* We want to loop for 5 seconds */
snd_pcm_hw_params_get_period_time(params,
&val, &dir);
loops = 2000000 / val;
while (loops > 0) {
loops--;
rc = snd_pcm_readi(handle, buffer, frames);
if (rc == -EPIPE) {
/* EPIPE means overrun */
fprintf(stderr, "overrun occurred\n");
snd_pcm_prepare(handle);
} else if (rc < 0) {
fprintf(stderr,
"error from read: %s\n",
snd_strerror(rc));
} else if (rc != (int)frames) {
fprintf(stderr, "short read, read %d frames\n", rc);
}
rc = write(1, buffer, size);
if (rc != size)
fprintf(stderr,
"short write: wrote %d bytes\n", rc);
}
snd_pcm_drain(handle);
snd_pcm_close(handle);
free(buffer);
return 0;
}
于是,我录制了一个文件,然后对该文件进行语音识别测试。但是准确度非常差。比如我说“你好”
或者“回家”,
识别结果却是“酒店”
或者“MHM-MHM”
等等。那么这些代码有什么问题?
我已经阅读了文档。我是否应该通过声学模型自适应来提高准确性?
我把立体声换成了单声道,但声音变得很奇怪,我听不清自己说了什么。这是怎么回事?这是原始文件。如果您查看文档中的第一个问答(Q&A),您会注意到该库使用的是单声道数据,
而你是用立体声录音的。我已经把它改成了单声道数据,但是准确性仍然很差。你可以提供想要识别的音频文件,以便获得更详细的帮助。通常你需要编写更好的语法,或者构造更好的语言模型,来适应你要识别的文本。@NikolayShmyrev,我上传了我的语音文件。你的音频识别得很好,我得到的结果是“家”。唯一的问题是录音电平太高,你需要把它降低。我用 PocketSphinx 0.7 测试了这个文件:使用 "-lm", MODELDIR "/lm/en_US/hub4.5000.DMP" 时,结果是“clinton”;切换到 "-lm", MODELDIR "/lm/en_US/wsj0vp.5000.DMP" 时,结果是“总统”。我想知道这是否是语音识别技术的下一步,比如自然智能?