Question

Я использую Google Speech-To-Text API в своём приложении на Qt/C++.

Документация Google по C++ полезна, но лишь до некоторой степени.

В моем коде ниже, если я раскомментирую

std::this_thread::sleep_for(std::chrono::seconds(1));

Распознавание речи работает, но неправильно — оно пропускает некоторые слова. А без этой строки оно не работает вообще. Я думаю, это происходит потому, что цикл while в MicrophoneThreadMain() конфликтует с циклом while в start_speech_to_text(). Но я не уверен.

Я хочу, чтобы эти две функции выполнялись одновременно, без прерываний и без задержек. Я пытался использовать QThread, сигналы и слоты, но не смог заставить это работать.

speech_to_text.cpp

#include "speechtotext.h"

using google::cloud::speech::v1::StreamingRecognitionConfig;
using google::cloud::speech::v1::RecognitionConfig;
using google::cloud::speech::v1::Speech;
using google::cloud::speech::v1::StreamingRecognizeRequest;
using google::cloud::speech::v1::StreamingRecognizeResponse;

// Constructs the object; does nothing beyond forwarding `parent` to QObject.
SpeechToText::SpeechToText(QObject *parent)
    : QObject(parent)
{
}

void SpeechToText::initialize()
{
    QAudioFormat qtFormat;

    // Get default audio input device
    QAudioDeviceInfo qtInfo = QAudioDeviceInfo::defaultInputDevice();

    // Set the audio format settings
    qtFormat.setCodec("audio/pcm");
    qtFormat.setByteOrder(QAudioFormat::Endian::LittleEndian);
    qtFormat.setChannelCount(1);
    qtFormat.setSampleRate(16000);
    qtFormat.setSampleSize(16);
    qtFormat.setSampleType(QAudioFormat::SignedInt);

    // Check whether the format is supported
    if (!qtInfo.isFormatSupported(qtFormat)) {
        qWarning() << "Default format is not supported";
        exit(3);
    }

    // Instantiate QAudioInput with the settings
    audioInput = new QAudioInput(qtFormat);

    // Start receiving data from audio input
    ioDevice = audioInput->start();

    emit finished_initializing();
}

// Body of the microphone thread: endlessly reads from ioDevice and writes the
// audio to the recognition stream.
//
// NOTE: audioDataBuffer is only ever appended to in this loop, so every request
// carries the whole buffer accumulated so far. The loop has no exit condition.
//
// streamer: the open bidirectional recognition stream to write requests to.
void SpeechToText::MicrophoneThreadMain(grpc::ClientReaderWriterInterface<StreamingRecognizeRequest,
                                      StreamingRecognizeResponse> *streamer)
{
    StreamingRecognizeRequest audioRequest;

    for (;;)
    {
        // Take whatever the device currently has and add it to the buffer.
        audioDataBuffer.append(ioDevice->readAll());
        const std::size_t bytesToSend = audioDataBuffer.size();

        // Put the buffer contents into the request and send it.
        audioRequest.set_audio_content(audioDataBuffer.data(), bytesToSend);
        std::cout << "Sending " << bytesToSend / 1024 << "k bytes." << std::endl;
        streamer->Write(audioRequest);
        //std::this_thread::sleep_for(std::chrono::seconds(1));
    }
}

void SpeechToText::start_speech_to_text()
{
    StreamingRecognizeRequest request;

    auto *streaming_config   = request.mutable_streaming_config();
    RecognitionConfig *recognition_config = new RecognitionConfig();

    recognition_config->set_language_code("en-US");
    recognition_config->set_sample_rate_hertz(16000);
    recognition_config->set_encoding(RecognitionConfig::LINEAR16);
    streaming_config->set_allocated_config(recognition_config);

    // Create a Speech Stub connected to the speech service.
    auto creds = grpc::GoogleDefaultCredentials();
    auto channel = grpc::CreateChannel("speech.googleapis.com", creds);
    std::unique_ptr<Speech::Stub> speech(Speech::NewStub(channel));

    // Begin a stream.
    grpc::ClientContext context;
    auto streamer = speech->StreamingRecognize(&context);

    // Write the first request, containing the config only.
    streaming_config->set_interim_results(true);
    streamer->Write(request);

    // The microphone thread writes the audio content.
    std::thread microphone_thread(&SpeechToText::MicrophoneThreadMain, this, streamer.get());

    // Read responses.
    StreamingRecognizeResponse response;
    while (streamer->Read(&response)) // Returns false when no more to read.
    {
        // Dump the transcript of all the results.
        for (int r = 0; r < response.results_size(); ++r)
        {
            auto result = response.results(r);
            std::cout << "Result stability: " << result.stability() << std::endl;
            for (int a = 0; a < result.alternatives_size(); ++a)
            {
                auto alternative = result.alternatives(a);
                std::cout << alternative.confidence() << "\t"
                        << alternative.transcript() << std::endl;
            }
        }
    }

    grpc::Status status = streamer->Finish();
    microphone_thread.join();
    if (!status.ok()) {
      // Report the RPC failure.
      qDebug() << "error RPC";
      std::cerr << status.error_message() << std::endl;
    }
}

speech_to_text.h

#ifndef SPEECHTOTEXT_H
#define SPEECHTOTEXT_H

#include <QObject>
#include <QDebug>
#include <QThread>

#include <thread>
#include <chrono>
#include <fstream>
#include <iostream>
#include <iterator>
#include <string>
#include <functional>

#include <QtMultimedia>
#include <QtMultimedia/QAudioInput>
#include <QAudioDeviceInfo>
#include <QAudioFormat>
#include <QIODevice>
#include <QtConcurrent>
#include <QMutex>

#include <grpc++/grpc++.h>
#include "google/cloud/speech/v1/cloud_speech.grpc.pb.h"

using google::cloud::speech::v1::StreamingRecognitionConfig;
using google::cloud::speech::v1::RecognitionConfig;
using google::cloud::speech::v1::Speech;
using google::cloud::speech::v1::StreamingRecognizeRequest;
using google::cloud::speech::v1::StreamingRecognizeResponse;

// Captures audio from the default input device and streams it to the Google
// Cloud Speech streaming recognition RPC.
class SpeechToText : public QObject
{
    Q_OBJECT
public:
    explicit SpeechToText(QObject *parent = nullptr);

signals:
    // Emitted at the end of initialize().
    void finished_initializing();
    // Carries a transcript as a QString.
    void finished_speech_to_text(QString);

public slots:
    // Sets up and starts audio capture.
    void initialize();
    // Runs one blocking streaming-recognition session.
    void start_speech_to_text();

private:
    // Thread body that writes captured audio to the given stream.
    void MicrophoneThreadMain(grpc::ClientReaderWriterInterface<StreamingRecognizeRequest,
                                          StreamingRecognizeResponse> *);

    // Both pointers are assigned in initialize(); they start out null so that
    // use before initialization is detectable instead of reading garbage.
    QAudioInput *audioInput = nullptr;
    QIODevice *ioDevice = nullptr;
    // Audio bytes read from ioDevice.
    QByteArray audioDataBuffer;
};

#endif // SPEECHTOTEXT_H

Есть идеи, как это решить?

anon_user_13 · Answer 1 · 25 июля 2018

Выкладываю здесь решение моей проблемы. Спасибо @allquixotic за всю полезную информацию.

в mainwindow.cpp

void MainWindow::setUpMicrophoneRecorder()
{
    microphone_thread = new QThread(this);
    microphone_recorder_engine.moveToThread(microphone_thread);

    connect(microphone_thread, SIGNAL(started()), &microphone_recorder_engine, SLOT(start_listen()));
    connect(&microphone_recorder_engine, &MicrophoneRecorder::microphone_data_raw,
            this, [this] (const QByteArray &data) {
        this->speech_to_text_engine.listen(data);
    });

    microphone_thread->start();
}

void MainWindow::setUpSpeechToTextEngine()
{
    speech_to_text_thread = new QThread(this);
    speech_to_text_engine.moveToThread(speech_to_text_thread);

    connect(speech_to_text_thread, SIGNAL(started()), &speech_to_text_engine, SLOT(initialize()));
    connect(&speech_to_text_engine, SIGNAL(finished_speech_to_text(QString)), this, SLOT(process_user_input(QString)));

    speech_to_text_thread->start();
}

microphonerecorder.h

#ifndef MICROPHONERECORDER_H
#define MICROPHONERECORDER_H

#include <QObject>
#include <QByteArray>
#include <QDebug>
#include <QtMultimedia>
#include <QtMultimedia/QAudioInput>
#include <QAudioDeviceInfo>
#include <QAudioFormat>
#include <QIODevice>

// Captures raw PCM audio from the default input device and republishes each
// chunk through the microphone_data_raw() signal.
class MicrophoneRecorder : public QObject
{
    Q_OBJECT
public:
    explicit MicrophoneRecorder(QObject *parent = nullptr);

signals:
    // Emitted with every chunk of audio read from the input device.
    void microphone_data_raw(const QByteArray &);

public slots:
    // Opens the audio input and starts forwarding its data.
    void start_listen();

private slots:
    // Re-emits the given audio chunk as microphone_data_raw().
    void listen(const QByteArray &);

private:
    // Both pointers are assigned in start_listen(); they start out null so that
    // use before start_listen() is detectable instead of reading garbage.
    QAudioInput *audioInput = nullptr;
    QIODevice *ioDevice = nullptr;
    QByteArray audioDataBuffer;
};

#endif // MICROPHONERECORDER_H

microphonerecorder.cpp

#include "microphonerecorder.h"

// Constructs the recorder; does nothing beyond forwarding `parent` to QObject.
MicrophoneRecorder::MicrophoneRecorder(QObject *parent)
    : QObject(parent)
{
}

void MicrophoneRecorder::listen(const QByteArray &audioData)
{
    emit microphone_data_raw(audioData);
}

void MicrophoneRecorder::start_listen()
{
    QAudioFormat qtFormat;

    // Get default audio input device
    QAudioDeviceInfo qtInfo = QAudioDeviceInfo::defaultInputDevice();

    // Set the audio format settings
    qtFormat.setCodec("audio/pcm");
    qtFormat.setByteOrder(QAudioFormat::Endian::LittleEndian);
    qtFormat.setChannelCount(1);
    qtFormat.setSampleRate(16000);
    qtFormat.setSampleSize(16);
    qtFormat.setSampleType(QAudioFormat::SignedInt);

    // Check whether the format is supported
    if (!qtInfo.isFormatSupported(qtFormat)) {
        qWarning() << "Default format is not supported";
        exit(3);
    }

    // Instantiate QAudioInput with the settings
    audioInput = new QAudioInput(qtFormat);

    // Start receiving data from audio input
    ioDevice = audioInput->start();

    // Listen to the received data for wake words
    QObject::connect(ioDevice, &QIODevice::readyRead, [=] {
        listen(ioDevice->readAll());
    });
}

speechtotext.h

#ifndef SPEECHTOTEXT_H
#define SPEECHTOTEXT_H

#include <QObject>
#include <QDebug>
#include <QThread>
#include <QDateTime>

#include <thread>
#include <chrono>
#include <string>

#include <QtMultimedia>
#include <QtMultimedia/QAudioInput>
#include <QAudioDeviceInfo>
#include <QAudioFormat>
#include <QIODevice>
#include <QtConcurrent>
#include <QMutex>

#include <grpc++/grpc++.h>
#include "google/cloud/speech/v1/cloud_speech.grpc.pb.h"

using google::cloud::speech::v1::StreamingRecognitionConfig;
using google::cloud::speech::v1::RecognitionConfig;
using google::cloud::speech::v1::Speech;
using google::cloud::speech::v1::StreamingRecognizeRequest;
using google::cloud::speech::v1::StreamingRecognizeResponse;

// Streams externally supplied audio to the Google Cloud Speech streaming
// recognition RPC and reports final transcripts through signals.
class SpeechToText : public QObject
{
    Q_OBJECT
public:
    explicit SpeechToText(QObject *parent = nullptr);

signals:
    // Emitted by initialize().
    void finished_initializing();
    // Emitted when start_speech_to_text() begins a session.
    void in_speech_to_text();
    // Emitted by the microphone thread when it stops sending audio.
    void out_of_speech_to_text();
    // Emitted with the transcript of each final recognition result.
    void finished_speech_to_text(QString);

public slots:
    void initialize();
    // Appends a chunk of raw audio to the internal buffer.
    void listen(const QByteArray &);
    // Runs one blocking streaming-recognition session.
    void start_speech_to_text();

private:
    // Thread body that writes buffered audio to the given stream.
    void MicrophoneThreadMain(grpc::ClientReaderWriterInterface<StreamingRecognizeRequest,
                                          StreamingRecognizeResponse> *);
    // Thread body that reads recognition responses from the given stream.
    void StreamerThread(grpc::ClientReaderWriterInterface<StreamingRecognizeRequest,
                                          StreamingRecognizeResponse> *);

    // Audio received through listen() and not yet sent.
    QByteArray audioDataBuffer;
    // time(0) value taken when the current session started; zero until the
    // first session so it never holds an indeterminate value.
    int m_start_time = 0;
};

#endif // SPEECHTOTEXT_H

speechtotext.cpp

#include "speechtotext.h"

using google::cloud::speech::v1::StreamingRecognitionConfig;
using google::cloud::speech::v1::RecognitionConfig;
using google::cloud::speech::v1::Speech;
using google::cloud::speech::v1::StreamingRecognizeRequest;
using google::cloud::speech::v1::StreamingRecognizeResponse;

// Constructs the object; does nothing beyond forwarding `parent` to QObject.
SpeechToText::SpeechToText(QObject *parent)
    : QObject(parent)
{
}

void SpeechToText::initialize()
{
    emit finished_initializing();
}

void SpeechToText::MicrophoneThreadMain(grpc::ClientReaderWriterInterface<StreamingRecognizeRequest,
                                        StreamingRecognizeResponse> *streamer)
{
    StreamingRecognizeRequest request;
    std::size_t size_read;
    while (time(0) - m_start_time <= TIME_RECOGNITION)
    {
        int chunk_size = 64 * 1024;
        if (audioDataBuffer.size() >= chunk_size)
        {
            QByteArray bytes_read = QByteArray(audioDataBuffer);
            size_read = std::size_t(bytes_read.size());

            // And write the chunk to the stream.
            request.set_audio_content(&bytes_read.data()[0], size_read);

            bool ok = streamer->Write(request);
            /*if (ok)
            {
                std::cout << "Sending " << size_read / 1024 << "k bytes." << std::endl;
            }*/

            audioDataBuffer.clear();
            audioDataBuffer.resize(0);
        }
        std::this_thread::sleep_for(std::chrono::milliseconds(50));
    }

    qDebug() << "Out of speech recognition: " << end_date;

    emit out_of_speech_to_text();

    streamer->WritesDone();
}

void SpeechToText::StreamerThread(grpc::ClientReaderWriterInterface<StreamingRecognizeRequest,
                                      StreamingRecognizeResponse> *streamer)
{
    // Read responses.
    StreamingRecognizeResponse response;

    while (time(0) - m_start_time <= TIME_RECOGNITION)
    {
        if(streamer->Read(&response)) // Returns false when no more to read.
        {
            // Dump the transcript of all the results.
            if (response.results_size() > 0)
            {
                auto result = response.results(0);
                if (result.alternatives_size() > 0)
                {
                    auto alternative = result.alternatives(0);
                    auto transcript = QString::fromStdString(alternative.transcript());
                    if (result.is_final())
                    {
                        qDebug() << "Speech recognition: " << transcript;

                        emit finished_speech_to_text(transcript);
                    }
                }
            }
        }
    }
}

void SpeechToText::listen(const QByteArray &audioData)
{
    audioDataBuffer.append(audioData);
}

void SpeechToText::start_speech_to_text()
{
    qDebug() << "in start_speech_to_text: " << start_date;

    emit in_speech_to_text();

    m_start_time = time(0);
    audioDataBuffer.clear();
    audioDataBuffer.resize(0);

    StreamingRecognizeRequest request;

    auto *streaming_config   = request.mutable_streaming_config();
    RecognitionConfig *recognition_config = new RecognitionConfig();

    recognition_config->set_language_code("en-US");
    recognition_config->set_sample_rate_hertz(16000);
    recognition_config->set_encoding(RecognitionConfig::LINEAR16);
    streaming_config->set_allocated_config(recognition_config);

    // Create a Speech Stub connected to the speech service.
    auto creds = grpc::GoogleDefaultCredentials();
    auto channel = grpc::CreateChannel("speech.googleapis.com", creds);
    std::unique_ptr<Speech::Stub> speech(Speech::NewStub(channel));

    // Begin a stream.
    grpc::ClientContext context;
    auto streamer = speech->StreamingRecognize(&context);

    // Write the first request, containing the config only.
    streaming_config->set_interim_results(true);
    streamer->Write(request);

    // The microphone thread writes the audio content.
    std::thread microphone_thread(&SpeechToText::MicrophoneThreadMain, this, streamer.get());
    std::thread streamer_thread(&SpeechToText::StreamerThread, this, streamer.get());

    microphone_thread.join();
    streamer_thread.join();
}

allquixotic · Answer 2 · 02 июля 2018

Вам действительно стоит следовать примеру Google и отправлять только по 64 КБ за раз.
Вам следует вызывать WritesDone() у streamer, когда вы собираетесь отправить запрос на сервер Google.
Похоже, вы никогда не очищаете свой QByteArray, поэтому с каждым последующим вызовом append данные в нём только накапливаются. Поскольку вы передаёте указатель на первый элемент базового массива, на каждой итерации цикла вы отправляете в streamer все аудиоданные, захваченные к этому моменту. Я предлагаю вложенный цикл, который повторно вызывает QIODevice::read(char *data, qint64 maxSize), пока в QByteArray не наберётся ровно 64 КБ. Вам нужно будет обработать возвращаемое значение -1, означающее конец потока, и уменьшать maxSize в зависимости от того, сколько данных ещё не хватает, чтобы заполнить массив до 64 КБ. Запросы к API Google со слишком малым объёмом данных (например, всего несколько байт, как, судя по всему, происходит в вашем текущем цикле) могут привести к ограничению частоты запросов или к перегрузке исходящего канала вашего интернет-соединения из-за высокого отношения накладных расходов протокола к полезным данным. Кроме того, вероятно, проще использовать обычный массив фиксированного размера (64 КБ) в стиле C вместо QByteArray: его не нужно изменять в размере, а QByteArray::clear(), насколько мне известно, может приводить к выделению памяти (что плохо сказывается на производительности). Чтобы не отправлять старые данные повторно при неполной записи (например, когда поток микрофона закрывается до того, как буфер в 64 КБ заполнен), следует также вызывать memset(array, 0, sizeof array); после каждого вызова ClientReaderWriterInterface::WritesDone().
Если сеть не успевает за данными, поступающими с микрофона, вы можете столкнуться с переполнением в QAudioInput, когда у него заканчивается локальный буфер для хранения аудио. Более крупный буфер делает это менее вероятным, но также ухудшает отзывчивость. Можно просто буферизовать все данные, поступающие из QAudioInput, в неограниченный QByteArray и считывать из него по 64 КБ за раз (для этого его можно обернуть в QBuffer, и весь ваш код в MicrophoneThreadMain(), работающий с QIODevice, останется совместимым). Думаю, что в проектах вроде вашего пользователь обычно предпочтёт худшую отзывчивость необходимости повторять сказанное из-за переполнения, вызванного сетью. Но, вероятно, существует порог — возможно, около 5 секунд, — после которого буферизованные данные «устаревают»: пользователь может снова заговорить в микрофон, и тогда, как только узкое место в исходящем канале освободится, несколько событий распознавания речи произойдут в быстрой последовательности, что даст странный эффект.

Google Speech Recognition не работает из-за сталкивающихся потоков Qt C ++

Пожалуйста, войдите или зарегистрируйтесь чтобы ответить на этот вопрос.

Ответы [ 2 ]

Пожалуйста, войдите или зарегистрируйтесь что бы добавить комментарий.

Пожалуйста, войдите или зарегистрируйтесь что бы добавить комментарий.

Google Speech Recognition не работает из-за сталкивающихся потоков Qt C ++

Пожалуйста, войдите или зарегистрируйтесь чтобы ответить на этот вопрос.

Ответы [ 2 ]

Пожалуйста, войдите или зарегистрируйтесь что бы добавить комментарий.

Пожалуйста, войдите или зарегистрируйтесь что бы добавить комментарий.

Похожие темы