ds1379551 2017-10-08 08:40

Go app server: speech response lasts only 3-4 seconds

I want to build a speech-to-text application with the Google Speech engine and a Go HTTP server. Everything works as expected except for one issue, and I can't find where I'm going wrong. The problem: when I start speaking, the Google Speech engine responds for only 3-4 seconds; after that I have to start again, or wait until the one-minute limit expires and then start again. I'm a beginner with Go and I've already spent two days just debugging this. Please help me out.

Thanks in advance.
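
To make the flow easier to follow, here is a stripped-down sketch of the streaming setup (not my real code, which is posted in full below): the first request on the gRPC stream carries only the recognition config, every later request carries a chunk of raw audio from the browser, and a goroutine reads the results. The encoding, sample rate, and language here are just example values mirroring what my code passes in:

package main

import (
    "context"
    "log"

    speech "cloud.google.com/go/speech/apiv1"
    speechpb "google.golang.org/genproto/googleapis/cloud/speech/v1"
)

func main() {
    ctx := context.Background()
    client, err := speech.NewClient(ctx)
    if err != nil {
        log.Fatal(err)
    }
    stream, err := client.StreamingRecognize(ctx)
    if err != nil {
        log.Fatal(err)
    }
    // First request: configuration only, no audio (same fields as InitAudio below).
    if err := stream.Send(&speechpb.StreamingRecognizeRequest{
        StreamingRequest: &speechpb.StreamingRecognizeRequest_StreamingConfig{
            StreamingConfig: &speechpb.StreamingRecognitionConfig{
                Config: &speechpb.RecognitionConfig{
                    Encoding:        speechpb.RecognitionConfig_LINEAR16,
                    SampleRateHertz: 48000,
                    LanguageCode:    "en-US", // example language code
                },
                InterimResults:  true,
                SingleUtterance: true,
            },
        },
    }); err != nil {
        log.Fatal(err)
    }
    // Later requests: chunks of 16-bit PCM; in the real code these come from the WebSocket.
    chunk := make([]byte, 4096) // placeholder buffer for illustration only
    if err := stream.Send(&speechpb.StreamingRecognizeRequest{
        StreamingRequest: &speechpb.StreamingRecognizeRequest_AudioContent{
            AudioContent: chunk,
        },
    }); err != nil {
        log.Printf("could not send audio: %v", err)
    }
    // A separate goroutine then calls stream.Recv() in a loop and forwards
    // each transcript back over the WebSocket until io.EOF.
}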

Go source: main.go and the vrecognize package

package main

import (
    "fmt"
    "log"
    "net/http"
    "strings"
    "vrecognize"

    gmux "github.com/gorilla/mux"
    "github.com/gorilla/websocket"
    speechpb "google.golang.org/genproto/googleapis/cloud/speech/v1"
)

var upgrader = websocket.Upgrader{} // use default options
var voiceRecognize = new(vrecognize.VoiceRecognize)
var voiceStream vrecognize.VoiceStream

func main() {
    mux := gmux.NewRouter().StrictSlash(true)

    mux.HandleFunc("/echo", echo)
    mux.PathPrefix("/").Handler(http.FileServer(http.Dir("./public")))
    // Serve the WebSocket endpoint and static files on :8080
    log.Fatal(http.ListenAndServe(":8080", mux))
}

// echo upgrades the HTTP connection to a WebSocket and dispatches
// text commands and binary audio messages.
func echo(w http.ResponseWriter, r *http.Request) {
    conn, err := upgrader.Upgrade(w, r, nil)
    if err != nil {
        log.Print("upgrade:", err)
        return
    }
    defer conn.Close()

    voiceRecognize := voiceRecognize.NewRecongnize(1, 48000, "en_US")
    var stream speechpb.Speech_StreamingRecognizeClient

    var msgStr string
    for {
        mt, message, err := conn.ReadMessage()
        if err != nil {
            log.Println("read:", err)
            return
        }
        // Message received as text
        if mt == websocket.TextMessage {
            // Convert bytes to string
            msgStr = string(message)
            // Check whether the received message is an echo command
            isEcho := strings.HasPrefix(msgStr, "echo")
            // If so, strip the "echo" prefix
            if isEcho {
                msgStr = msgStr[4:]
            }
            // "start" initializes the audio stream
            if msgStr == "start" {
                stream = voiceStream.InitAudio(voiceRecognize.Encoding, voiceRecognize.SampleRateHertz, voiceRecognize.LanguageCode)
                if stream == nil {
                    fmt.Println("initAudio failed!!!")
                    conn.WriteMessage(websocket.TextMessage, []byte("speechInitFail"))
                    break
                }
                // This goroutine listens for the results
                go voiceStream.GetResults(&stream, conn, voiceRecognize)
            } else if msgStr == "stop" { // "stop" closes the stream connection
                if err := stream.CloseSend(); err != nil {
                    log.Printf("Could not close stream: %v", err)
                    break
                }
            } else if isEcho { // echo the message back
                log.Printf("recv: %s", msgStr)
                err = conn.WriteMessage(mt, []byte(msgStr))
                if err != nil {
                    log.Println("write: ", err)
                    break
                }
            } else {
                fmt.Println("no handling for: ", string(message))
            }
        } else if mt == websocket.BinaryMessage {
            // Forward the audio bytes to the Speech API
            voiceStream.SendData(&stream, &message)
        }
    }
}
---------------------------
package vrecognize

import (
    "io"
    "log"

    "cloud.google.com/go/speech/apiv1"
    "github.com/gorilla/websocket"
    "golang.org/x/net/context"
    speechpb "google.golang.org/genproto/googleapis/cloud/speech/v1"
)

// VoiceStream : voice-related configuration
type VoiceStream struct {
}

// GetResults : reads stream results whenever they become available
func (vs VoiceStream) GetResults(stream *speechpb.Speech_StreamingRecognizeClient, conn *websocket.Conn, vr *VoiceRecognize) {
    defer func() {
        if r := recover(); r != nil {
            log.Println("In getResults got panic: ", r)
            *stream = vs.InitAudio(vr.Encoding, vr.SampleRateHertz, vr.LanguageCode)
        }
    }()
    for {
        resp, err := (*stream).Recv()
        if err == io.EOF {
            log.Printf("Receiver EOF: %v", err)
            conn.WriteMessage(websocket.TextMessage, []byte("MinuteDone"))
        }
        if err != nil {
            log.Printf("Cannot stream results: %v", err)
            conn.WriteMessage(websocket.TextMessage, []byte("VoiceInterrupted"))
        }
        if err := resp.Error; err != nil {
            log.Printf("Could not recognize: %v", err)
            conn.WriteMessage(websocket.TextMessage, []byte("NotRecognize"))
        }

        for _, result := range resp.Results {
            for _, altr := range result.GetAlternatives() {
                msg := altr.GetTranscript()
                log.Printf("Result: %+v\n", msg)
                conn.WriteMessage(websocket.TextMessage, []byte(msg))
            }
        }
        //log.Printf("Loop last...")
    }
}

// InitAudio initializes the audio stream and starts listening for incoming audio
func (vs VoiceStream) InitAudio(audioEnco speechpb.RecognitionConfig_AudioEncoding, sampleRate int32, lang string) speechpb.Speech_StreamingRecognizeClient {
    //log.Printf("Speech Init start...")
    ctx := context.Background()
    client, err := speech.NewClient(ctx)
    if err != nil {
        log.Fatal(err)
        return nil
    }
    stream, err2 := client.StreamingRecognize(ctx)
    if err2 != nil {
        log.Fatal(err2)
        return nil
    }
    // The first request on the stream carries the configuration
    err = stream.Send(&speechpb.StreamingRecognizeRequest{
        StreamingRequest: &speechpb.StreamingRecognizeRequest_StreamingConfig{
            StreamingConfig: &speechpb.StreamingRecognitionConfig{
                Config: &speechpb.RecognitionConfig{
                    Encoding:        audioEnco,
                    SampleRateHertz: sampleRate,
                    LanguageCode:    lang,
                },
                InterimResults:  true,
                SingleUtterance: true,
            },
        },
    })
    if err != nil {
        log.Fatal(err)
        return nil
    }
    //log.Printf("Speech Init finished...")
    return stream
}

// SendData sends the audio data to the Speech API
func (vs VoiceStream) SendData(stream *speechpb.Speech_StreamingRecognizeClient, message *[]byte) {
    //log.Printf("SendData Called ...")
    if err := (*stream).Send(&speechpb.StreamingRecognizeRequest{
        StreamingRequest: &speechpb.StreamingRecognizeRequest_AudioContent{
            AudioContent: *message,
        },
    }); err != nil {
        log.Printf("Could not send audio: %v", err)
    }
}

// VoiceRecognize holds the recognition configuration
type VoiceRecognize struct {
    Encoding        speechpb.RecognitionConfig_AudioEncoding
    SampleRateHertz int32
    LanguageCode    string
}

// NewRecongnize : builds a VoiceRecognize from the given parameters
func (vr VoiceRecognize) NewRecongnize(audioencoding, HertzRate int32, language string) *VoiceRecognize {
    var Encoding speechpb.RecognitionConfig_AudioEncoding
    switch audioencoding {
    case 0:
        // Not specified. Will return an error [google.rpc.Code.INVALID_ARGUMENT].
        Encoding = speechpb.RecognitionConfig_ENCODING_UNSPECIFIED
    case 1:
        // Uncompressed 16-bit signed little-endian samples (Linear PCM).
        Encoding = speechpb.RecognitionConfig_LINEAR16
    case 2:
        // [`FLAC`](https://xiph.org/flac/documentation.html) (Free Lossless Audio
        // Codec) is the recommended encoding because it is
        // lossless--therefore recognition is not compromised--and
        // requires only about half the bandwidth of `LINEAR16`.
        Encoding = speechpb.RecognitionConfig_FLAC
    case 3:
        // 8-bit samples that compand 14-bit audio samples using G.711 PCMU/mu-law.
        Encoding = speechpb.RecognitionConfig_MULAW
    case 4:
        // Adaptive Multi-Rate Narrowband codec. `sample_rate_hertz` must be 8000.
        Encoding = speechpb.RecognitionConfig_AMR
    case 5:
        // Adaptive Multi-Rate Wideband codec. `sample_rate_hertz` must be 16000.
        Encoding = speechpb.RecognitionConfig_AMR_WB
    case 6:
        // Opus encoded audio frames in Ogg container
        // ([OggOpus](https://wiki.xiph.org/OggOpus)).
        // `sample_rate_hertz` must be 16000.
        Encoding = speechpb.RecognitionConfig_OGG_OPUS
    default:
        // Uncompressed 16-bit signed little-endian samples (Linear PCM).
        Encoding = speechpb.RecognitionConfig_LINEAR16
    }
    return &VoiceRecognize{Encoding, HertzRate, language}
}

And index.html code:

 <!DOCTYPE html>
  <html>
   <head>
    <meta charset="UTF-8">
    <title>Golang WebSocket</title>
  </head>
  <body>
  <form>
   <input id="message" type="text" value="What are you doing">
   <input onclick="wsConnect();" id="connectBtn" value="Connect" 
     type="button"/>
   <input onclick="wsSendMessage();" id="echoBtn" value="Echo" type="button"/>
<br/>
<input onclick="startAudio();" id="startBtn" value="Start Audio" type="button"/>
 </form>
 <br/>
 <h2>Log</h2>
 <pre id="log"></pre>
 <h2>Server Response</h2>
 <pre id="serResp"></pre>
 <script type="text/javascript">
   var webSocket = null;
   var audioStream = null;
   var context = null;
   function __log(e, data) {
    if(e.type != "error"){
        log.innerHTML += "\n" + e + " " + (data || '');
    }
  }
  function serverResponse(e, data){
    serResp.innerHTML = "\n" + e + " " + (data || '');
  }
  function wsConnect() {
    webSocket = new WebSocket("ws://"+window.location.host+"/echo");

    var message = document.getElementById("message");
    webSocket.onopen = function (message) {
        wsOpen(message);
    };
    webSocket.onmessage = function (message) {
        wsGetMessage(message);
    };
    webSocket.onclose = function (message) {
        wsClose(message);
    };
    webSocket.onerror = function (message) {
        wsError(message);
    };
 }

 function wsError(message) {
    console.log(message);
    __log(message);
 }

 function wsOpen(message) {
    __log("Connected ...");
    if(message.type === "open"){
        document.getElementById("connectBtn").disabled = true;
        document.getElementById("startBtn").disabled = false;
        document.getElementById("echoBtn").disabled = false;
        if(document.getElementById("stopBtn") != undefined){
            document.getElementById("stopBtn").disabled = false;   
        }
    }
 }

 function wsSendMessage() {
    __log("From Browser: "+ message.value);
    webSocket.send("echo"+message.value);
    document.getElementById("message").value = ""; 
 }

 function wsCloseConnection() {
    webSocket.close();
 }

 function wsGetMessage(message) {
    console.log("Server: "+ message.data);
    msg = message.data;
    if(msg == "VoiceInterrupted" || msg == "MinuteDone" || msg == "NotRecognize" || msg == "speechInitFail"){
        stopAudio();
        startAudio();
    }else{
        serverResponse("Server: " + message.data);
    }   
 }

function wsClose(message) {
    __log("Disconnect ... ");
    if(message.type === "close"){
    document.getElementById("connectBtn").disabled = false;
    document.getElementById("startBtn").disabled = true;
    document.getElementById("echoBtn").disabled = true;
    }
 }

 function wserror(message) {
    __log("Error ..."+ message);
 }

 function sendStart() {
    webSocket.send("start")
 }

 function startAudio() {
    var session = {
        audio: true,
        video: false
    };
    sendStart();
    var isMediaReady = true;
    navigator.mediaDevices.getUserMedia(session).then(function (stream) {
        initializeRecorder(stream);
    }).catch(function (err) {
        onError(err);
        document.getElementById("startBtn").disabled = false;
    });
    document.getElementById("startBtn").disabled = true;
    setTimeout(stopAudio,55*1000);
 }

 function onError(err) {
    __log("Error while calling getUserMedia:" + err);
 }

 function initializeRecorder(stream) {
    __log("initializeRecorder called...");
    audioStream = stream;
    var audioContext = window.AudioContext;
    context = new audioContext();
    var audioInput = context.createMediaStreamSource(stream);
    var bufferSize = 1*4*1024;
    // create a javascript node
    var recorder = context.createScriptProcessor(bufferSize, 1, 1);
    // specify the processing function
    recorder.onaudioprocess = recorderProcess;
    // connect stream to our recorder
    audioInput.connect(recorder);
    // connect our recorder to the previous destination
    recorder.connect(context.destination);
 }

 function convertFloat32ToInt16(buffer) {
    l = buffer.length;
    buf = new Int16Array(l);
    while (l--) {
        buf[l] = Math.min(1, buffer[l]) * 0x7FFF;
    }
    return buf.buffer;
 }

 function recorderProcess(e) {
    var left = e.inputBuffer.getChannelData(0);
    webSocket.send(convertFloat32ToInt16(left));
 }

 function stopAudio() {
    __log("stop audio called...");
    document.getElementById("startBtn").disabled = false;
    context.close();
    context = null;
    webSocket.send("stop")
    var audioTrack = audioStream.getAudioTracks();
    var i = 0;

    for (i = 0; i < audioTrack.length; i++) {
        var track = audioTrack[i];
        track.stop();
        audioStream.removeTrack(track);
    }
    audioStream = null;
 }
 </script>
 </body>
 </html>