JavaScript: Using Google Node.js Speech-to-Text with audio from the client's microphone


I am trying to use Google's Speech-to-Text Node.js library () and stream audio to it from the client's microphone using navigator.mediaDevices.getUserMedia.

I am able to pipe the microphone into the Node.js streaming recognition with sox, and that works fine. I am also able to stream the audio from the client and pipe it to the speaker on the server side. However, when I try to pipe it into streamingRecognize, it does not recognize any words.
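
For reference, the setup that works locally is essentially the official microphone sample: node-record-lpcm16 (which shells out to sox) piped straight into the recognize stream. Roughly like this (option names follow the newer node-record-lpcm16 API and may differ by version):

// Works: local microphone piped directly into streamingRecognize via node-record-lpcm16 / sox.
// Option values are illustrative.
const recorder = require('node-record-lpcm16');

recorder
  .record({
    sampleRateHertz: 16000,
    threshold: 0,
    recordProgram: 'rec', // sox's "rec"; 'arecord' also works on Linux
  })
  .stream()
  .on('error', console.error)
  .pipe(recognizeStream); // the same streamingRecognize stream created below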

Server side

var io = require("socket.io")(server); // attach socket.io to the existing HTTP server
const speech = require('@google-cloud/speech');
const speechClient = new speech.SpeechClient(); // client used by streamingRecognize below

const request = {
    config: {
        encoding: encoding,                 // e.g. 'LINEAR16'
        sampleRateHertz: sampleRateHertz,   // e.g. 16000
        languageCode: languageCode,         // e.g. 'en-US'
    },
    interimResults: true,
    //singleUtterance: false
};
let recognizeStream = speechClient
    .streamingRecognize(request)
    .on('data', data => {
        console.log(data);
        process.stdout.write(
            data.results[0] && data.results[0].alternatives[0] ?
            `Transcription: ${data.results[0].alternatives[0].transcript}\n` :
            `\n\nReached transcription time limit, press Ctrl+C\n`
        )
    });
io.on("connection", function (client) {
    client.on("userSpeaking", function (data) {
        if (recognizeStream !== null) {
            recognizeStream.write(new Uint8Array(data));
        }
    });
});
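
Here `server` is an HTTP server created elsewhere in my code; a minimal sketch of that surrounding setup (names and port assumed, not the exact code) would be:

// Assumed surrounding setup: a plain Node HTTP server for socket.io to attach to.
const http = require('http');
const server = http.createServer();
server.listen(3000);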
Client side

// Convert Float32 samples from the Web Audio API ([-1, 1]) to 16-bit PCM
function convertFloat32ToInt16(buffer) {
    let l = buffer.length;
    let buf = new Int16Array(l);
    while (l--) {
        buf[l] = Math.min(1, buffer[l]) * 0x7FFF;
    }
    return buf.buffer;
}
AudioContext = window.AudioContext || window.webkitAudioContext;
context = new AudioContext();
// bufferSize is defined elsewhere on the page; 1 input channel, 1 output channel
processor = context.createScriptProcessor(bufferSize, 1, 1);
processor.connect(context.destination);
context.resume();

// Take the first (left) channel, convert it to 16-bit PCM and send it over the socket
function microphoneProcess(e) {
  var left = e.inputBuffer.getChannelData(0);
  var left16 = convertFloat32ToInt16(left);
  socket.emit('userSpeaking', left16);
}
// Note: navigator.mediaDevices.getUserMedia() takes a single constraints object;
// the second object passed below is not part of that API and is ignored by the browser.
navigator.mediaDevices
  .getUserMedia({
    video: false,
    audio: true
  }, {
    type: 'audio',
    sampleRate: 16000,
    desiredSampleRate: 16000,
    audioBitsPerSecond: 16000,
    mimeType: 'audio/webm;codecs=opus'
  })
  .then((stream) => {
    globalStream = stream;
    input = context.createMediaStreamSource(stream);
    input.connect(processor);

    processor.onaudioprocess = function (e) {
      microphoneProcess(e);
    };
  })
  .catch(console.error);
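
The client snippet also relies on a socket.io connection and a buffer size that are set up earlier on the page; roughly like this (values are assumptions):

// Assumed client-side setup, defined before the code above:
// <script src="/socket.io/socket.io.js"></script> is loaded on the page
var socket = io();            // socket used in microphoneProcess()
var bufferSize = 2048;        // passed to createScriptProcessor()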

I don't think 'streaming.requests' works with GC Speech on Node yet... This sample works on Node, going from microphone input to an audio blob; you can take the input from the microphone on the client and post it to the server. But that only runs once the audio input is complete. It is not a streaming interface, and AFAIK that still requires a native app built with support for the gRPC and protobuf stack.

@RobertRowntree It works for me when the microphone input is read directly in the Node.js code. I tried it with node-record-lpcm16 and it works (). But if I try to send the audio from the client, it does not work. I wonder whether this has to do with me using mono audio instead of stereo.

GC Speech is mono only, but you seem to be taking the 'left' channel from a stereo signal. I don't know. I also record Opus mono on a JS client, save the recording and use GCS to recognize the audio... FWIW I post the Opus blob to my 'app.js' and use the Cloud Speech client from there. But I just convert the Opus binary to a base64 string before sending it to Google. It works, but it's clunky.

@RobertRowntree Could you share some code?
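
For reference, the non-streaming approach described in the last comments (record an Opus blob on the client, POST it to the server, and send it to the Speech API as a base64 string) might look roughly like this on the server; the encoding and option values are assumptions:

// Rough sketch of the blob + base64 approach described above, using recognize()
// instead of streamingRecognize(). Assumes the client POSTs a recorded Opus blob.
const speech = require('@google-cloud/speech');
const client = new speech.SpeechClient();

async function transcribeBlob(audioBuffer) {
  const [response] = await client.recognize({
    config: {
      encoding: 'WEBM_OPUS',    // assumed to match MediaRecorder's 'audio/webm;codecs=opus'
      sampleRateHertz: 48000,   // Opus in WebM is typically 48 kHz
      languageCode: 'en-US',
    },
    audio: { content: audioBuffer.toString('base64') }, // the base64 conversion mentioned above
  });
  return response.results
    .map(r => r.alternatives[0].transcript)
    .join('\n');
}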