Search code examples
javascript node.js azure audio text-to-speech

How to convert raw data to audio in javascript


I'm using the Bing Text to Speech API from Azure Cognitive Services.

// HTTP headers sent with the text-to-speech synthesis request.
post_option.headers = {
        'content-type' : 'application/ssml+xml',
        // Content-Length must be the body size in BYTES. String#length counts
        // UTF-16 code units, which under-reports any SSML that contains
        // non-ASCII characters and would truncate the request body.
        'Content-Length' : Buffer.byteLength(post_data),
        // Audio format requested from the service (per the format name:
        // 8 kHz, 8-bit, mono, mu-law in a RIFF container).
        'X-Microsoft-OutputFormat' : 'riff-8khz-8bit-mono-mulaw',
        'Authorization': 'Bearer ' + OxfordAccessToken.access_token,
        // NOTE(review): both IDs are sent empty here; presumably they should
        // carry application/client identifiers -- confirm against the API docs.
        'X-Search-AppId': '',
        'X-Search-ClientID': '',
        "User-Agent": "TTSNodeJS"
    };

// Issues the synthesis request and collects the audio response as raw bytes.
// NOTE(review): the request body still has to be written and post_req.end()
// called for the request to be sent; that code is not visible in this snippet.
var post_req = https.request(post_option, function(res){
    // Keep every chunk as a Buffer. Appending chunks to a string
    // (`_data += buffer`) implicitly decodes the bytes as UTF-8, which
    // corrupts binary audio and is heard as noise in the saved file.
    var chunks = [];
    res.on('data', function(chunk){
        //get the wave
        chunks.push(chunk);
    });

    // end callback
    res.on('end', function(){
        // _data is a Buffer holding the complete, unmodified audio payload;
        // write it to disk as-is (no text encoding involved).
        var _data = Buffer.concat(chunks);
        console.log('wave data.length: ' + _data.length);
    });
});

// Registered on the request itself rather than inside the response callback,
// so failures that happen before any response arrives (DNS, TLS, connection
// reset) are reported instead of being thrown as unhandled 'error' events.
post_req.on('error', function(e) {
    console.log('problem with request: ' + e.message);
});

I receive the raw audio data as a string and want to save it as an .mp3 or .mp4 file. I was able to save the raw data as a .wav file using the following code.

var fs = require('fs')
// Write the received audio payload to ./audio.wav and report the outcome.
fs.writeFile('./audio.wav', data, 'binary', function onWritten(writeError) {
    if (writeError) {
        // Saving failed: log the error object and stop.
        console.log(writeError);
        return;
    }
    console.log("File saved");
});

However, the resulting audio file is full of noise and cannot be used. When a 16-bit output format is used, the audio file contains even more noise.

I need help saving the output data to an .mp3/.mp4 audio file without noise. Please suggest a way to proceed.


Solution

  • I used the request module to implement this functionality, and the audio file it created was clear, without the noise you mentioned. Here is my test code snippet:

            // SSML template; placeholders are, in order: voice language,
            // voice gender, voice name, and the text to be spoken.
            var SsmlTemplate = "<speak version='1.0' xml:lang='en-us'><voice xml:lang='%s' xml:gender='%s' name='%s'>%s</voice></speak>";
            var post_data = util.format(SsmlTemplate, 'en-US', 'Female', 'Microsoft Server Speech Text to Speech Voice (en-US, ZiraRUS)', 'This is a demo to call microsoft text to speach service in javascript.');
            console.log('\n\ntts post_data: ' + post_data + '\n');

            var post_option = {
                url: "https://speech.platform.bing.com/synthesize",
                method: 'POST',
                body :post_data
            };
            post_option.headers = {
                'content-type': 'application/ssml+xml',
                // Request MP3 so the payload matches the .mp3 file written
                // below. The previous 'riff-16khz-16bit-mono-pcm' value yields
                // WAV/PCM data, which would be a mislabelled file under an
                // .mp3 name.
                'X-Microsoft-OutputFormat': 'audio-16khz-128kbitrate-mono-mp3',
                'Authorization': 'Bearer ' + OxfordAccessToken.access_token,
                // NOTE(review): hard-coded sample identifiers; presumably these
                // should be replaced with your own values -- confirm.
                'X-Search-AppId': '07D3234E49CE426DAA29772419F436CA',
                'X-Search-ClientID': '1ECFAE91408841A480F00935DC390960',
                "User-Agent": "TTSNodeJS"
            };

            // Stream the response straight to disk. The bytes are never
            // converted to a string, so the audio cannot be corrupted by a
            // text decoding step. post_req ends up referencing the file write
            // stream, because pipe() returns its destination.
            var post_req = request.post(post_option)
                .on('error', function(err) {
                    // Network-level failure of the HTTP request.
                    console.log('problem with request: ' + err.message);
                })
                .on('response', function(response) {
                    console.log(response.statusCode) // 200
                    console.log(response.headers)
                })
                .pipe(fs.createWriteStream('audio.mp3',{defaultEncoding:'binary'}))
                .on('error', function(err) {
                    // Failure while writing the audio file.
                    console.log('problem writing audio file: ' + err.message);
                });
    

    If this does not help, could you please share your test template and details of your local environment? If you have any further concerns, please feel free to let me know.