Tags: java, audio, speech-recognition, microphone, vosk

Use the microphone in Java for speech recognition with VOSK


I am trying to add real-time speech recognition to my Java project (preferably offline). After some googling and trying other solutions, I settled on VOSK for speech recognition. The main problem I am encountering, however, is that VOSK has very little documentation and ships with only one Java example, which extracts text from a prerecorded WAV file, shown below.

public static void main(String[] argv) throws IOException, UnsupportedAudioFileException {
        LibVosk.setLogLevel(LogLevel.DEBUG);

        try (Model model = new Model("src\\main\\resources\\model");
                    InputStream ais = AudioSystem.getAudioInputStream(new BufferedInputStream(new FileInputStream("src\\main\\resources\\python_example_test.wav")));
                    Recognizer recognizer = new Recognizer(model, 16000)) {

            int nbytes;
            byte[] b = new byte[4096];
            while ((nbytes = ais.read(b)) >= 0) {
                System.out.println(nbytes);
                if (recognizer.acceptWaveForm(b, nbytes)) {
                    System.out.println(recognizer.getResult());
                } else {
                    System.out.println(recognizer.getPartialResult());
                }
            }

            System.out.println(recognizer.getFinalResult());
        }
    }

I attempted to convert this into something that would accept microphone audio, shown below:

public static void main(String[] args) {
        LibVosk.setLogLevel(LogLevel.DEBUG);
        AudioFormat format = new AudioFormat(8000.0f, 16, 1, true, true);
        TargetDataLine microphone;
        SourceDataLine speakers;

        try (Model model = new Model("src\\main\\resources\\model");
                Recognizer recognizer = new Recognizer(model, 16000)) {
            try {
                microphone = AudioSystem.getTargetDataLine(format);

                DataLine.Info info = new DataLine.Info(TargetDataLine.class, format);
                microphone = (TargetDataLine) AudioSystem.getLine(info);
                microphone.open(format);
                microphone.start();
                
                ByteArrayOutputStream out = new ByteArrayOutputStream();
                int numBytesRead;
                int CHUNK_SIZE = 1024;
                int bytesRead = 0;
                
                DataLine.Info dataLineInfo = new DataLine.Info(SourceDataLine.class, format);
                speakers = (SourceDataLine) AudioSystem.getLine(dataLineInfo);
                speakers.open(format);
                speakers.start();
                byte[] b = new byte[4096];

                while (bytesRead <= 100000) {
                    numBytesRead = microphone.read(b, 0, CHUNK_SIZE);
                    bytesRead += numBytesRead;
                    
                    out.write(b, 0, numBytesRead); 

                    speakers.write(b, 0, numBytesRead);

                    if (recognizer.acceptWaveForm(b, numBytesRead)) {
                        System.out.println(recognizer.getResult());
                    } else {
                        System.out.println(recognizer.getPartialResult());
                    }
                }
                System.out.println(recognizer.getFinalResult());
                speakers.drain();
                speakers.close();
                microphone.close();
            } catch (Exception e) {
                e.printStackTrace();
            }

        }

    }

This appears to capture microphone data correctly (it also plays back through the speakers), but VOSK shows no input and keeps printing empty results. What am I doing wrong? Is what I am attempting even possible? Should I look for a different speech recognition library?


Solution

  • This code works correctly for me; you can use it:

        public static void main(String[] args) {
            LibVosk.setLogLevel(LogLevel.DEBUG);

            // 16-bit signed PCM, 2 channels, little-endian
            AudioFormat format = new AudioFormat(AudioFormat.Encoding.PCM_SIGNED, 60000, 16, 2, 4, 44100, false);
            DataLine.Info info = new DataLine.Info(TargetDataLine.class, format);
            TargetDataLine microphone;
            SourceDataLine speakers;

            try (Model model = new Model("model");
                 Recognizer recognizer = new Recognizer(model, 120000)) {
                try {
                    // open and start the microphone line
                    microphone = (TargetDataLine) AudioSystem.getLine(info);
                    microphone.open(format);
                    microphone.start();

                    ByteArrayOutputStream out = new ByteArrayOutputStream();
                    int numBytesRead;
                    int CHUNK_SIZE = 1024;
                    int bytesRead = 0;

                    // open and start the speaker line so the captured audio is echoed back
                    DataLine.Info dataLineInfo = new DataLine.Info(SourceDataLine.class, format);
                    speakers = (SourceDataLine) AudioSystem.getLine(dataLineInfo);
                    speakers.open(format);
                    speakers.start();
                    byte[] b = new byte[4096];

                    // stop after roughly 100 MB of captured audio
                    while (bytesRead <= 100000000) {
                        numBytesRead = microphone.read(b, 0, CHUNK_SIZE);
                        bytesRead += numBytesRead;

                        out.write(b, 0, numBytesRead);
                        speakers.write(b, 0, numBytesRead);

                        // feed the captured chunk to VOSK
                        if (recognizer.acceptWaveForm(b, numBytesRead)) {
                            System.out.println(recognizer.getResult());
                        } else {
                            System.out.println(recognizer.getPartialResult());
                        }
                    }
                    System.out.println(recognizer.getFinalResult());

                    speakers.drain();
                    speakers.close();
                    microphone.close();
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        }
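
  • For comparison: the most likely reason the question's version kept printing empty results is a sample-rate mismatch. The microphone line is opened at 8 kHz (new AudioFormat(8000.0f, ...)) while the Recognizer is constructed with 16000, so VOSK interprets the audio at the wrong rate. Below is a minimal sketch of the same capture loop using the 16 kHz, 16-bit, mono format that most VOSK models expect; the "model" path is a placeholder for the directory of an unpacked VOSK model, and the 10-second stop condition is only there to keep the example finite.

        import javax.sound.sampled.AudioFormat;
        import javax.sound.sampled.AudioSystem;
        import javax.sound.sampled.DataLine;
        import javax.sound.sampled.TargetDataLine;

        import org.vosk.LibVosk;
        import org.vosk.LogLevel;
        import org.vosk.Model;
        import org.vosk.Recognizer;

        public class MicrophoneRecognition {
            public static void main(String[] args) throws Exception {
                LibVosk.setLogLevel(LogLevel.DEBUG);

                // 16 kHz, 16-bit, mono, signed, little-endian: the format most VOSK models are trained on
                AudioFormat format = new AudioFormat(16000.0f, 16, 1, true, false);

                // "model" is a placeholder for the directory of an unpacked VOSK model
                try (Model model = new Model("model");
                     // the recognizer's sample rate must match the format the line is opened with
                     Recognizer recognizer = new Recognizer(model, 16000.0f)) {

                    DataLine.Info info = new DataLine.Info(TargetDataLine.class, format);
                    TargetDataLine microphone = (TargetDataLine) AudioSystem.getLine(info);
                    microphone.open(format);
                    microphone.start();

                    byte[] buffer = new byte[4096];
                    long bytesCaptured = 0;
                    // 16000 samples/s * 2 bytes/sample * 10 s, an arbitrary stop condition for the example
                    long bytesToCapture = 16000L * 2 * 10;

                    while (bytesCaptured < bytesToCapture) {
                        int n = microphone.read(buffer, 0, buffer.length);
                        bytesCaptured += n;

                        // feed each captured chunk to VOSK
                        if (recognizer.acceptWaveForm(buffer, n)) {
                            System.out.println(recognizer.getResult());
                        } else {
                            System.out.println(recognizer.getPartialResult());
                        }
                    }

                    System.out.println(recognizer.getFinalResult());
                    microphone.stop();
                    microphone.close();
                }
            }
        }

    If your sound hardware does not offer a 16 kHz line directly, you may need to open the line in a format it does support and resample before feeding the bytes to the recognizer.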