Ошибки типов при попытке использовать loopback-захват звука для распознавания речи - PullRequest
5 голосов
/ 14 октября 2019

Система Microsoft Speech имеет хороший пример кода, но у меня возникает проблема при добавлении loopback-захвата, чтобы распознавать то, что воспроизводит сама система, а не то, что поступает с микрофона. Например, чтобы получить текстовую расшифровку видео, не проигрывая его через динамики. Похоже, NAudio — это библиотека, которая такое умеет, но при передаче её потока в аудиовход распознавателя я получаю ошибки типов:

using System;
using System.Speech.Recognition;
using NAudio.Wave;
using NAudio.CoreAudioApi.Interfaces;

using NAudio.CoreAudioApi;
using System.IO;
using System.Speech.AudioFormat;

namespace SpeechRecognitionApp
{
    class Program
    {
        /// <summary>
        /// Captures system (loopback) audio with NAudio and feeds it to an
        /// in-process System.Speech dictation recognizer.
        /// </summary>
        static void Main(string[] args)
        {
            // Create an in-process speech recognizer for the en-US locale.
            using (
            SpeechRecognitionEngine recognizer =
              new SpeechRecognitionEngine(
                new System.Globalization.CultureInfo("en-US")))
            // WasapiLoopbackCapture is IDisposable: release the WASAPI client on exit.
            using (WasapiLoopbackCapture capture = new WasapiLoopbackCapture())
            {
                // Create and load a dictation grammar.
                recognizer.LoadGrammar(new DictationGrammar());

                // Add a handler for the speech recognized event.
                recognizer.SpeechRecognized +=
                  new EventHandler<SpeechRecognizedEventArgs>(recognizer_SpeechRecognized);

                // Configure input to the speech recognizer from the loopback capture.
                Stream captureStream = new System.IO.MemoryStream();
                capture.DataAvailable += (s, a) =>
                {
                    // NOTE(review): writing advances the single shared MemoryStream
                    // position, so the recognizer reading the same stream races with
                    // this writer; a blocking pipe-style stream is needed — TODO confirm.
                    captureStream.Write(a.Buffer, 0, a.BytesRecorded);
                    captureStream.Flush();
                };
                capture.StartRecording();
                Console.WriteLine(capture.WaveFormat.AverageBytesPerSecond);
                Console.WriteLine(capture.WaveFormat.BitsPerSample);

                // BUG FIX: the first SpeechAudioFormatInfo argument is samples per
                // second, not average bytes per second — the original passed
                // AverageBytesPerSecond (384000) here.
                // NOTE(review): the capture reports 32 bits per sample (see output
                // above), presumably IEEE float, while this declares 16-bit PCM;
                // the stream likely needs resampling before recognition works — TODO confirm.
                recognizer.SetInputToAudioStream(captureStream, new SpeechAudioFormatInfo(
                    capture.WaveFormat.SampleRate, AudioBitsPerSample.Sixteen, AudioChannel.Stereo));

                // Start asynchronous, continuous speech recognition.
                recognizer.RecognizeAsync(RecognizeMode.Multiple);

                // Keep the console window open.
                while (true)
                {
                    Console.ReadLine();
                }
            }
        }

        // Handle the SpeechRecognized event: print each recognized phrase.
        static void recognizer_SpeechRecognized(object sender, SpeechRecognizedEventArgs e)
        {
            Console.WriteLine("Recognized text: " + e.Result.Text);
        }
    }
}

Обновление: как видно из пересмотренного кода ниже, теперь он по крайней мере компилируется, но не распознаёт никакой речи — ни системной, ни внешней. На самом деле он выводит:

384000
32

Поскольку в AudioBitsPerSample нет значения «тридцать два», возможно, я вообще не могу использовать этот класс NAudio для захвата системного звука?

Обновление 2: следующий вариант, основанный на другом ответе, работает несколько лучше, но речь почти не распознаётся — подозреваю, что аудио подаётся замедленным или ускоренным.

using System;
using System.Speech.Recognition;
using NAudio.Wave;
using NAudio.CoreAudioApi.Interfaces;

using NAudio.CoreAudioApi;
using System.IO;
using System.Speech.AudioFormat;

namespace SpeechRecognitionApp
{

    class FakeStreamer : Stream
    {
        /// <summary>
        /// A non-seekable Stream adapter whose Read blocks (by looping) until the
        /// requested number of bytes has been read from the wrapped stream, so it
        /// can stand in for a live audio pipe in front of the speech recognizer.
        /// </summary>

        // Set to true to make a pending Read give up and return early.
        public bool bExit = false;
        Stream stream;
        Stream client;

        public FakeStreamer(Stream client)
        {
            this.client = client;
            this.stream = client;
        }

        public override bool CanRead
        {
            get { return stream.CanRead; }
        }

        // Never seekable: the stream simulates a forward-only pipe.
        public override bool CanSeek
        {
            get { return false; }
        }

        public override bool CanWrite
        {
            get { return stream.CanWrite; }
        }

        // Length is unknown for a pipe-like stream.
        public override long Length
        {
            get { return -1L; }
        }

        // Position is meaningless here; the setter is deliberately a no-op.
        public override long Position
        {
            get { return 0L; }
            set { }
        }

        // Seeking is not supported; always report position 0.
        public override long Seek(long offset, SeekOrigin origin)
        {
            return 0L;
        }

        public override void SetLength(long value)
        {
            stream.SetLength(value);
        }

        /// <summary>
        /// Blocking read: keeps reading until <paramref name="count"/> bytes have
        /// been consumed or <see cref="bExit"/> is set.
        /// NOTE(review): always reports <paramref name="count"/> bytes read, even
        /// when bExit cut the loop short, and busy-spins while the underlying
        /// stream has no data — acceptable for this demo, but callers beware.
        /// </summary>
        public override int Read(byte[] buffer, int offset, int count)
        {
            int len = 0, c = count;
            while (c > 0 && !bExit)
            {
                // BUG FIX: reset len each iteration. Previously, if stream.Read
                // threw, len kept the previous iteration's value and the stale
                // byte count was re-applied to offset/c, corrupting the read.
                len = 0;
                try
                {
                    len = stream.Read(buffer, offset, c);
                }
                catch (Exception)
                {
                    Console.WriteLine("ouch");
                }
                offset += len;
                c -= len;
            }
            return count;
        }

        public override void Write(byte[] buffer, int offset, int count)
        {
            stream.Write(buffer, offset, count);
        }

        public override void Close()
        {
            stream.Close();
            base.Close();
        }

        public override void Flush()
        {
            stream.Flush();
        }
    }
    class Program
    {
        /// <summary>
        /// Captures system (loopback) audio with NAudio, buffers it through a
        /// blocking FakeStreamer, and feeds it to a System.Speech dictation
        /// recognizer.
        /// </summary>
        static void Main(string[] args)
        {
            // Create an in-process speech recognizer for the en-US locale.
            using (
            SpeechRecognitionEngine recognizer =
              new SpeechRecognitionEngine(
                new System.Globalization.CultureInfo("en-US")))
            // WasapiLoopbackCapture is IDisposable: release the WASAPI client on exit.
            using (WasapiLoopbackCapture capture = new WasapiLoopbackCapture())
            {
                // Create and load a dictation grammar.
                recognizer.LoadGrammar(new DictationGrammar());

                // Add a handler for the speech recognized event.
                recognizer.SpeechRecognized +=
                  new EventHandler<SpeechRecognizedEventArgs>(recognizer_SpeechRecognized);

                // Buffer loopback audio through a blocking stream so the
                // recognizer's reads wait for data instead of hitting EOF.
                Stream captureStream = new System.IO.MemoryStream();
                Stream buffStream = new FakeStreamer(captureStream);
                capture.DataAvailable += (s, a) =>
                {
                    captureStream.Write(a.Buffer, 0, a.BytesRecorded);
                };
                capture.StartRecording();
                Console.WriteLine(capture.WaveFormat.AverageBytesPerSecond);
                Console.WriteLine(capture.WaveFormat.BitsPerSample);

                // BUG FIX: the first SpeechAudioFormatInfo argument is samples per
                // second, not bytes per second — the original passed
                // AverageBytesPerSecond / 4 here.
                // NOTE(review): the capture delivers 32 bits per sample (see output
                // above) while this declares 8-bit samples, so audio will still be
                // misinterpreted (hence slowed/sped-up playback) until the stream
                // is converted to PCM — TODO confirm.
                recognizer.SetInputToAudioStream(buffStream, new SpeechAudioFormatInfo(
                    capture.WaveFormat.SampleRate, AudioBitsPerSample.Eight, AudioChannel.Stereo));

                // Start asynchronous, continuous speech recognition.
                recognizer.RecognizeAsync(RecognizeMode.Multiple);

                // Keep the console window open.
                while (true)
                {
                    Console.ReadLine();
                }
            }
        }

        // Handle the SpeechRecognized event: print each recognized phrase.
        static void recognizer_SpeechRecognized(object sender, SpeechRecognizedEventArgs e)
        {
            Console.WriteLine("Recognized text: " + e.Result.Text);
        }
    }
}

Обновление 3: попытка перекодировать аудиопоток в формат, подходящий для распознавания речи. К сожалению, до записи перекодированного звука выполнение так и не доходит, как видно из кода ниже...

using System;
using System.Speech.Recognition;
using NAudio.Wave;
using NAudio.CoreAudioApi.Interfaces;

using NAudio.CoreAudioApi;
using System.IO;
using System.Speech.AudioFormat;

namespace SpeechRecognitionApp
{

    class FakeStreamer : Stream
    {
        /// <summary>
        /// A non-seekable Stream adapter whose Read blocks (by looping) until the
        /// requested number of bytes has been read from the wrapped stream, so it
        /// can stand in for a live audio pipe in front of the speech recognizer.
        /// </summary>

        // Set to true to make a pending Read give up and return early.
        public bool bExit = false;
        Stream stream;
        Stream client;

        public FakeStreamer(Stream client)
        {
            this.client = client;
            this.stream = client;
        }

        public override bool CanRead
        {
            get { return stream.CanRead; }
        }

        // Never seekable: the stream simulates a forward-only pipe.
        public override bool CanSeek
        {
            get { return false; }
        }

        public override bool CanWrite
        {
            get { return stream.CanWrite; }
        }

        // Length is unknown for a pipe-like stream.
        public override long Length
        {
            get { return -1L; }
        }

        // Position is meaningless here; the setter is deliberately a no-op.
        public override long Position
        {
            get { return 0L; }
            set { }
        }

        // Seeking is not supported; always report position 0.
        public override long Seek(long offset, SeekOrigin origin)
        {
            return 0L;
        }

        public override void SetLength(long value)
        {
            stream.SetLength(value);
        }

        /// <summary>
        /// Blocking read: keeps reading until <paramref name="count"/> bytes have
        /// been consumed or <see cref="bExit"/> is set.
        /// NOTE(review): always reports <paramref name="count"/> bytes read, even
        /// when bExit cut the loop short, and busy-spins while the underlying
        /// stream has no data — acceptable for this demo, but callers beware.
        /// </summary>
        public override int Read(byte[] buffer, int offset, int count)
        {
            int len = 0, c = count;
            while (c > 0 && !bExit)
            {
                // BUG FIX: reset len each iteration. Previously, if stream.Read
                // threw, len kept the previous iteration's value and the stale
                // byte count was re-applied to offset/c, corrupting the read.
                len = 0;
                try
                {
                    len = stream.Read(buffer, offset, c);
                }
                catch (Exception)
                {
                    Console.WriteLine("ouch");
                }
                offset += len;
                c -= len;
            }
            return count;
        }

        public override void Write(byte[] buffer, int offset, int count)
        {
            stream.Write(buffer, offset, count);
        }

        public override void Close()
        {
            stream.Close();
            base.Close();
        }

        public override void Flush()
        {
            stream.Flush();
        }
    }
    class Program
    {
        /// <summary>
        /// Captures system (loopback) audio with NAudio, resamples it to
        /// 8 kHz 16-bit mono PCM via MediaFoundationResampler, and feeds the
        /// converted audio to a System.Speech dictation recognizer.
        /// </summary>
        static void Main(string[] args)
        {
            // Create an in-process speech recognizer for the en-US locale.
            using (
            SpeechRecognitionEngine recognizer =
              new SpeechRecognitionEngine(
                new System.Globalization.CultureInfo("en-US")))
            // WasapiLoopbackCapture is IDisposable: release the WASAPI client on exit.
            using (WasapiLoopbackCapture capture = new WasapiLoopbackCapture())
            {
                // Create and load a dictation grammar.
                recognizer.LoadGrammar(new DictationGrammar());

                // Add a handler for the speech recognized event.
                recognizer.SpeechRecognized +=
                  new EventHandler<SpeechRecognizedEventArgs>(recognizer_SpeechRecognized);

                // Raw loopback audio lands in captureStream.
                Stream captureStream = new System.IO.MemoryStream();
                // BUG FIX: feed the resampler through a blocking FakeStreamer.
                // Reading the MemoryStream directly returned 0 bytes (its position
                // is left at the end after every write), so resampler.Read produced
                // nothing and the pump loop below never executed.
                Stream captureBuffStream = new FakeStreamer(captureStream);
                capture.DataAvailable += (s, a) =>
                {
                    // NOTE(review): the FakeStreamer reads and this writer share one
                    // MemoryStream position, so reads and writes still race; a true
                    // pipe/ring buffer would be safer — TODO confirm.
                    captureStream.Write(a.Buffer, 0, a.BytesRecorded);
                };
                Console.WriteLine(capture.WaveFormat.AverageBytesPerSecond);
                Console.WriteLine(capture.WaveFormat.BitsPerSample);

                // Target format System.Speech can consume: 8 kHz, 16-bit, mono PCM.
                var newFormat = new WaveFormat(8000, 16, 1);
                var resampler = new MediaFoundationResampler(
                    new NAudio.Wave.RawSourceWaveStream(captureBuffStream, capture.WaveFormat),
                    newFormat);
                resampler.ResamplerQuality = 60;

                // Converted audio is pumped into captureConvertStream, which the
                // recognizer drains through a second blocking FakeStreamer.
                Stream captureConvertStream = new System.IO.MemoryStream();
                Stream buffStream = new FakeStreamer(captureConvertStream);

                recognizer.SetInputToAudioStream(buffStream, new SpeechAudioFormatInfo(
                    8000, AudioBitsPerSample.Sixteen, AudioChannel.Mono));

                // Start asynchronous, continuous speech recognition.
                recognizer.RecognizeAsync(RecognizeMode.Multiple);

                capture.StartRecording();

                // Pump resampled audio to the recognizer's input stream.
                var arr = new byte[128];
                int read;
                while ((read = resampler.Read(arr, 0, arr.Length)) > 0)
                {
                    // BUG FIX: forward only the bytes actually produced; the
                    // original wrote arr.Length every time, appending stale
                    // buffer bytes whenever a read came up short.
                    captureConvertStream.Write(arr, 0, read);
                }

                // Keep the console window open.
                while (true)
                {
                    Console.ReadLine();
                }
            }
        }

        // Handle the SpeechRecognized event: print each recognized phrase.
        static void recognizer_SpeechRecognized(object sender, SpeechRecognizedEventArgs e)
        {
            Console.WriteLine("Recognized text: " + e.Result.Text);
        }
    }
}
...