Search code examples
c#asp.netspeech-to-textibm-watson

Speech to Text with IBM Watson in C# for long audio (more than 100 MB)


I am writing C# code to transcribe audio files larger than 100 MB, but the service does not accept them. I need a program that can send more than 100 MB of audio from C#.
In this code I am using a WebSocket, but how can I send the audio as a stream?

// Opens a WebSocket to the Watson speech-to-text "recognize" endpoint, wires up
// the open/message/error/close handlers, connects, and then pushes the contents
// of a WAV file to the service.
// NOTE(review): Notifier, NotificationMessage and WebSocket are not declared in
// this snippet — presumably they come from the WebSocket library in use; confirm.
public static void CallWatson()
{
    using (var nf = new Notifier())
    // NOTE(review): the token has been blanked out of the URL on the next line,
    // which leaves the string literal incorrectly terminated; a real token must
    // be placed between the quotes before this compiles.
    using (var ws = new WebSocket("wss://stream.watsonplatform.net/speech-to-text/api/v1/recognize?watson-token=""))
    {
        // JSON "start" action; it is sent as soon as the socket reports it is open.
        string startActionjson = "{\"action\": \"start\", \"content-type\": \"audio/wav\", \"continuous\" : true, \"interim_results\": true}";
        ws.OnOpen += (sender, e) => ws.Send(startActionjson);

        // Set the WebSocket events.
        // `result` is declared here but never read or written again in this method.
        string result = string.Empty;

        ws.OnMessage += Ws_OnMessage;


        // Errors are forwarded to the notifier with the error message as the body.
        ws.OnError += (sender, e) =>
          nf.Notify(
            new NotificationMessage
            {
                Summary = "WebSocket Error",
                Body = e.Message,
                Icon = "notification-message-im"
            });

        // Close events are forwarded to the notifier with the close code and reason.
        ws.OnClose += (sender, e) =>
          nf.Notify(
            new NotificationMessage
            {
                Summary = String.Format("WebSocket Close ({0})", e.Code),
                Body = e.Reason,
                Icon = "notification-message-im"
            });

        ws.Connect();

        // Earlier attempt (disabled): send the start action, then the whole file,
        // then close the socket from the completion callback.
        //ws.SendAsync(startActionjson, b =>
        //{
        //    if (b == true)
        //    {
        //        //send the audio as binary
        //        string filePath = "E:\\test33.wav";
        //        byte[] bytes = System.IO.File.ReadAllBytes(filePath);

        //        ws.SendAsync(bytes, b1 =>
        //        {
        //            if (b1)
        //                ws.Close();
        //        });

        //        // result+=result+ws.
        //    }
        //});
        // Connect to the server asynchronously.
        //ws.ConnectAsync ();

        //Console.WriteLine("\nType 'exit' to exit.\n");
        // The entire file is loaded into memory in one call.
        string filePath = "E:\\Test3.wav";
        byte[] bytes = System.IO.File.ReadAllBytes(filePath);
        // This loop has no exit condition: every iteration sleeps three seconds and
        // then queues the complete byte array again. The completion callback closes
        // the socket as soon as a send reports success.
        while (true)
        {
            Thread.Sleep(3000);

            ws.SendAsync(bytes, b1 =>
            {
                if (b1)
                    ws.Close();
            });

        }

        // Earlier chunking attempt (disabled): steps through the array in
        // 1,000,000-byte strides but copies only 128 bytes into each buffer.
        //for (int i = 0; i < bytes.Length; i += 1000000)
        //{
        //    Thread.Sleep(1000);
        //    byte[] buffer = new byte[1000000];
        //    Buffer.BlockCopy(bytes, i, buffer, 0, 128);
        //  //  ws.Send(buffer);
        //    ws.SendAsync(buffer, b1 =>
        //    {
        //        if (b1)
        //            ws.Close();
        //    });
        //}
    }
}

/// <summary>
/// Message handler for the socket: reads the text of the incoming message into
/// a local variable and does nothing else with it.
/// </summary>
/// <param name="sender">Object that raised the event (unused).</param>
/// <param name="e">Event data carrying the received message.</param>
private static void Ws_OnMessage(object sender, MessageEventArgs e)
{
    // The payload is only captured here; it is not stored, parsed or displayed.
    var payload = e.Data;
}

Solution

  • Per the documentation, there is a 100 MB limit regardless of input method, so you will have to split your audio files into chunks that are smaller than 100 MB.

    To stream the audio, instead of calling System.IO.File.ReadAllBytes(filePath); and iterating over the result, I think you'll want to create a FileStream.

    Also, you shouldn't immediately close the websocket once you've reached the end of file - that may prevent you from receiving all of the results. Instead, send the string {"action": "stop"} and wait until you receive a response of {"state": "listening"} which indicates that it has completed processing your audio and sent all text back.

    Update: I got a hold of a windows machine, installed visual studio, and put together a working sample. I never did figure out what WebSocket API/Library you were using, but this just uses the built-in stuff that I could find documentation for on microsoft.com, so it should hopefully work for you.

    I tested it with a couple of different .ogg and .wav files and confirmed that I get multiple interim and final results as expected.

    using System;
    using System.Net.WebSockets;
    using System.Net;
    using System.Runtime.Serialization.Json;
    using System.Threading;
    using System.Threading.Tasks;
    using System.Text;
    using System.IO;
    using System.Runtime.Serialization;
    
    
    // Perform streaming transcription of an audio file using the IBM Watson Speech to Text service over a websocket
    // http://www.ibm.com/smarterplanet/us/en/ibmwatson/developercloud/speech-to-text.html
    // https://msdn.microsoft.com/en-us/library/system.net.websockets.clientwebsocket%28v=vs.110%29.aspx
    namespace WatsonSTTWebsocketExample
    {

        /// <summary>
        /// Console sample that streams an audio file to the IBM Watson Speech to Text
        /// service over a WebSocket and prints every JSON message the service returns.
        /// </summary>
        class Program
        {

            /// <summary>
            /// Runs one transcription and then waits for a line of input before exiting.
            /// </summary>
            static void Main(string[] args)
            {
                Transcribe();
                Console.WriteLine("Press any key to exit");
                Console.ReadLine();
            }

            // http://www.ibm.com/smarterplanet/us/en/ibmwatson/developercloud/doc/getting_started/gs-credentials.shtml
            static String username = "<username>";
            static String password = "<password>";

            // Audio file that is streamed to the service.
            static String file = @"c:\audio.wav";

            static Uri url = new Uri("wss://stream.watsonplatform.net/speech-to-text/api/v1/recognize");

            // Text frame that starts a recognition request.
            static ArraySegment<byte> openingMessage = new ArraySegment<byte>( Encoding.UTF8.GetBytes(
                "{\"action\": \"start\", \"content-type\": \"audio/wav\", \"continuous\" : true, \"interim_results\": true}"
            ));

            // Text frame that tells the service no more audio will follow.
            static ArraySegment<byte> closingMessage = new ArraySegment<byte>(Encoding.UTF8.GetBytes(
                "{\"action\": \"stop\"}"
            ));

            // Size of the chunks read from the file and of the socket receive buffer.
            const int BufferSize = 1024;

            // Largest reassembled message that is accepted before the connection is closed.
            const int MaxMessageSize = 1024 * 1024;


            /// <summary>
            /// Connects to the service, sends the opening message, streams the audio
            /// while printing results, and finally closes the WebSocket.
            /// </summary>
            static void Transcribe()
            {
                // The socket owns network resources, so dispose it even when a step throws.
                using (var ws = new ClientWebSocket())
                {
                    ws.Options.Credentials = new NetworkCredential(username, password);
                    ws.ConnectAsync(url, CancellationToken.None).Wait();

                    // send opening message and wait for initial delimiter
                    Task.WaitAll(ws.SendAsync(openingMessage, WebSocketMessageType.Text, true, CancellationToken.None), HandleResults(ws));

                    // send all audio and then a closing message; simultaneously print all results until delimiter is received
                    Task.WaitAll(SendAudio(ws), HandleResults(ws));

                    // close down the websocket, unless the close handshake was already
                    // completed (HandleResults closes the socket itself on oversized messages)
                    if (ws.State == WebSocketState.Open || ws.State == WebSocketState.CloseReceived)
                    {
                        ws.CloseAsync(WebSocketCloseStatus.NormalClosure, "Close", CancellationToken.None).Wait();
                    }
                }
            }

            /// <summary>
            /// Streams the audio file to the service in binary frames and then sends
            /// the closing ("stop") message.
            /// </summary>
            /// <param name="ws">An open WebSocket on which the opening message was already sent.</param>
            static async Task SendAudio(ClientWebSocket ws)
            {

                using (FileStream fs = File.OpenRead(file))
                {
                    byte[] chunk = new byte[BufferSize];
                    int bytesRead;
                    while ((bytesRead = await fs.ReadAsync(chunk, 0, chunk.Length)) > 0)
                    {
                        // Send only the bytes that were actually read: the final read is
                        // usually shorter than the buffer, and the rest of the buffer still
                        // holds data from the previous chunk.
                        await ws.SendAsync(new ArraySegment<byte>(chunk, 0, bytesRead), WebSocketMessageType.Binary, true, CancellationToken.None);
                    }
                    await ws.SendAsync(closingMessage, WebSocketMessageType.Text, true, CancellationToken.None);
                }
            }

            /// <summary>
            /// Prints received messages until the connection closes or a delimiter
            /// message is received.
            /// </summary>
            /// <param name="ws">The WebSocket to read from.</param>
            static async Task HandleResults(ClientWebSocket ws)
            {
                var buffer = new byte[BufferSize];
                var segment = new ArraySegment<byte>(buffer);

                while (true)
                {
                    string message;

                    // A single message may arrive in several fragments and may be larger
                    // than the receive buffer, so collect the fragments before decoding.
                    using (var assembled = new MemoryStream())
                    {
                        WebSocketReceiveResult result;
                        do
                        {
                            result = await ws.ReceiveAsync(segment, CancellationToken.None);

                            if (result.MessageType == WebSocketMessageType.Close)
                            {
                                return;
                            }

                            assembled.Write(buffer, 0, result.Count);

                            if (assembled.Length > MaxMessageSize)
                            {
                                await ws.CloseAsync(WebSocketCloseStatus.InvalidPayloadData, "That's too long", CancellationToken.None);
                                return;
                            }
                        }
                        while (!result.EndOfMessage);

                        // Decode only once the message is complete so that multi-byte
                        // UTF-8 sequences split across fragments are handled correctly.
                        message = Encoding.UTF8.GetString(assembled.ToArray());
                    }

                    // you'll probably want to parse the JSON into a useful object here,
                    // see ServiceState and IsDelimeter for a light-weight example of that.
                    Console.WriteLine(message);

                    if (IsDelimeter(message))
                    {
                        return;
                    }
                }
            }


            // the watson service sends a {"state": "listening"} message at both the beginning and the *end* of the results
            // this checks for that
            [DataContract]
            internal class ServiceState
            {
                [DataMember]
                public string state = "";
            }

            /// <summary>
            /// Reports whether a JSON message is the {"state": "listening"} delimiter.
            /// </summary>
            /// <param name="json">One complete JSON message received from the service.</param>
            /// <returns>true when the message has a "state" member equal to "listening".</returns>
            static bool IsDelimeter(String json)
            {
                using (MemoryStream stream = new MemoryStream(Encoding.UTF8.GetBytes(json)))
                {
                    DataContractJsonSerializer ser = new DataContractJsonSerializer(typeof(ServiceState));
                    ServiceState obj = (ServiceState)ser.ReadObject(stream);

                    // The deserialized object can be null, so guard before reading the field.
                    return obj != null && obj.state == "listening";
                }
            }

        }
    }