I am trying to modify the streaming example (speech to text, code below) so that it takes its input from a pre-recorded audio clip inside IBM's demo streaming script. Specifically, how can I use a pre-recorded clip from an AudioSource instead of the microphone input?
I tried to do this by setting the starting clip (_recording) to the AudioSource's clip inside the RecordingHandler IEnumerator, but what happens is that the clip simply plays back and the transcript text never changes.
Has anyone done this in Unity? I know the speech-to-text demo site has an option to use pre-recorded audio clips (https://speech-to-text-demo.ng.bluemix.net/). To reproduce this you need a Unity account, an IBM account, and the IBM Unity SDK installed from GitHub (https://github.com/watson-developer-cloud/unity-sdk).
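Concretely, the change I tried inside RecordingHandler looks roughly like this (just a sketch; Clipper is the AudioSource field in the script below and already has the clip assigned in the Inspector):

    // instead of starting the microphone:
    // _recording = Microphone.Start(_microphoneID, true, _recordingBufferSize, _recordingHZ);
    _recording = Clipper.clip;   // point the recording buffer at the pre-recorded clip
    Clipper.Play();              // the clip plays back, but ResultsField never updates

The full script I am working from is below.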
#pragma warning disable 0649
using UnityEngine;
using System.Collections;
using System.Collections.Generic;
using UnityEngine.UI;
using IBM.Watson.SpeechToText.V1;
using IBM.Cloud.SDK;
using IBM.Cloud.SDK.Authentication;
using IBM.Cloud.SDK.Authentication.Iam;
using IBM.Cloud.SDK.Utilities;
using IBM.Cloud.SDK.DataTypes;

namespace IBM.Watsson.Examples
{
    public class ExampleStreaming : MonoBehaviour
    {
        #region PLEASE SET THESE VARIABLES IN THE INSPECTOR
        [Space(10)]
        [Tooltip("The service URL (optional). This defaults to \"https://stream.watsonplatform.net/speech-to-text/api\"")]
        [SerializeField]
        private string _serviceUrl;
        [Tooltip("Text field to display the results of streaming.")]
        public Text ResultsField;
        [Header("IAM Authentication")]
        [Tooltip("The IAM apikey.")]
        [SerializeField]
        private string _iamApikey;
        [Header("Parameters")]
        // https://www.ibm.com/watson/developercloud/speech-to-text/api/v1/curl.html?curl#get-model
        [Tooltip("The Model to use. This defaults to en-US_BroadbandModel")]
        [SerializeField]
        private string _recognizeModel;
        #endregion

        private int _recordingRoutine = 0;
        private string _microphoneID = null;
        [SerializeField] AudioClip _recording = null;
        private int _recordingBufferSize = 1;
        private int _recordingHZ = 22050;
        [SerializeField] AudioSource Clipper = null;
        private SpeechToTextService _service;
        void Start()
        {
            LogSystem.InstallDefaultReactors();
            Runnable.Run(CreateService());
        }

        private IEnumerator CreateService()
        {
            if (string.IsNullOrEmpty(_iamApikey))
            {
                throw new IBMException("Please provide IAM ApiKey for the service.");
            }

            IamAuthenticator authenticator = new IamAuthenticator(apikey: _iamApikey);

            // Wait for token data
            while (!authenticator.CanAuthenticate())
                yield return null;

            _service = new SpeechToTextService(authenticator);
            if (!string.IsNullOrEmpty(_serviceUrl))
            {
                _service.SetServiceUrl(_serviceUrl);
            }
            _service.StreamMultipart = true;

            Active = true;
            StartRecording();
        }
        public bool Active
        {
            get { return _service.IsListening; }
            set
            {
                if (value && !_service.IsListening)
                {
                    _service.RecognizeModel = (string.IsNullOrEmpty(_recognizeModel) ? "en-US_BroadbandModel" : _recognizeModel);
                    _service.DetectSilence = true;
                    _service.EnableWordConfidence = true;
                    _service.EnableTimestamps = true;
                    _service.SilenceThreshold = 0.01f;
                    _service.MaxAlternatives = 1;
                    _service.EnableInterimResults = true;
                    _service.OnError = OnError;
                    _service.InactivityTimeout = -1;
                    _service.ProfanityFilter = false;
                    _service.SmartFormatting = true;
                    _service.SpeakerLabels = false;
                    _service.WordAlternativesThreshold = null;
                    _service.EndOfPhraseSilenceTime = null;
                    _service.StartListening(OnRecognize, OnRecognizeSpeaker);
                }
                else if (!value && _service.IsListening)
                {
                    _service.StopListening();
                }
            }
        }
        private void StartRecording()
        {
            if (_recordingRoutine == 0)
            {
                UnityObjectUtil.StartDestroyQueue();
                _recordingRoutine = Runnable.Run(RecordingHandler());
            }
        }

        private void StopRecording()
        {
            if (_recordingRoutine != 0)
            {
                Microphone.End(_microphoneID);
                Runnable.Stop(_recordingRoutine);
                _recordingRoutine = 0;
            }
        }

        private void OnError(string error)
        {
            Active = false;
            Log.Debug("ExampleStreaming.OnError()", "Error! {0}", error);
        }
        private IEnumerator RecordingHandler()
        {
            Log.Debug("ExampleStreaming.RecordingHandler()", "devices: {0}", Microphone.devices);
            _recording = Microphone.Start(_microphoneID, true, _recordingBufferSize, _recordingHZ);
            yield return null;      // let _recordingRoutine get set..

            if (_recording == null)
            {
                StopRecording();
                yield break;
            }

            bool bFirstBlock = true;
            int midPoint = _recording.samples / 2;
            float[] samples = null;

            while (_recordingRoutine != 0 && _recording != null)
            {
                int writePos = Microphone.GetPosition(_microphoneID);
                if (writePos > _recording.samples || !Microphone.IsRecording(_microphoneID))
                {
                    Log.Error("ExampleStreaming.RecordingHandler()", "Microphone disconnected.");
                    StopRecording();
                    yield break;
                }

                if ((bFirstBlock && writePos >= midPoint)
                    || (!bFirstBlock && writePos < midPoint))
                {
                    // front block is recorded, make a RecordClip and pass it onto our callback.
                    samples = new float[midPoint];
                    _recording.GetData(samples, bFirstBlock ? 0 : midPoint);

                    AudioData record = new AudioData();
                    record.MaxLevel = Mathf.Max(Mathf.Abs(Mathf.Min(samples)), Mathf.Max(samples));
                    record.Clip = AudioClip.Create("Recording", midPoint, _recording.channels, _recordingHZ, false);
                    record.Clip.SetData(samples, 0);

                    _service.OnListen(record);
                    bFirstBlock = !bFirstBlock;
                }
                else
                {
                    // calculate the number of samples remaining until we ready for a block of audio,
                    // and wait that amount of time it will take to record.
                    int remaining = bFirstBlock ? (midPoint - writePos) : (_recording.samples - writePos);
                    float timeRemaining = (float)remaining / (float)_recordingHZ;
                    yield return new WaitForSeconds(timeRemaining);
                }
            }

            yield break;
        }
        private void OnRecognize(SpeechRecognitionEvent result)
        {
            if (result != null && result.results.Length > 0)
            {
                foreach (var res in result.results)
                {
                    foreach (var alt in res.alternatives)
                    {
                        string text = string.Format("{0} ({1}, {2:0.00})\n", alt.transcript, res.final ? "Final" : "Interim", alt.confidence);
                        Log.Debug("ExampleStreaming.OnRecognize()", text);
                        ResultsField.text = text;
                    }

                    if (res.keywords_result != null && res.keywords_result.keyword != null)
                    {
                        foreach (var keyword in res.keywords_result.keyword)
                        {
                            Log.Debug("ExampleStreaming.OnRecognize()", "keyword: {0}, confidence: {1}, start time: {2}, end time: {3}", keyword.normalized_text, keyword.confidence, keyword.start_time, keyword.end_time);
                        }
                    }

                    if (res.word_alternatives != null)
                    {
                        foreach (var wordAlternative in res.word_alternatives)
                        {
                            Log.Debug("ExampleStreaming.OnRecognize()", "Word alternatives found. Start time: {0} | EndTime: {1}", wordAlternative.start_time, wordAlternative.end_time);
                            foreach (var alternative in wordAlternative.alternatives)
                                Log.Debug("ExampleStreaming.OnRecognize()", "\t word: {0} | confidence: {1}", alternative.word, alternative.confidence);
                        }
                    }
                }
            }
        }
        private void OnRecognizeSpeaker(SpeakerRecognitionEvent result)
        {
            if (result != null)
            {
                foreach (SpeakerLabelsResult labelResult in result.speaker_labels)
                {
                    Log.Debug("ExampleStreaming.OnRecognizeSpeaker()", string.Format("speaker result: {0} | confidence: {3} | from: {1} | to: {2}", labelResult.speaker, labelResult.from, labelResult.to, labelResult.confidence));
                }
            }
        }
    }
}
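For context, RecordingHandler already shows how audio reaches the service: it copies raw samples out of the microphone buffer, wraps them in an AudioData, and hands them to _service.OnListen. Below is a sketch of doing the same thing with a pre-recorded clip; the StreamClipToService name is my own, it assumes a mono clip, and I have not confirmed that this is the intended way to feed file audio into this SDK:

    private IEnumerator StreamClipToService(AudioClip clip)
    {
        // Assumes a mono clip; copy all of its samples out once.
        float[] all = new float[clip.samples];
        clip.GetData(all, 0);

        int chunkSize = clip.frequency / 2;   // roughly half a second per block, like the mic loop
        for (int offset = 0; offset < all.Length; offset += chunkSize)
        {
            int count = Mathf.Min(chunkSize, all.Length - offset);
            float[] samples = new float[count];
            System.Array.Copy(all, offset, samples, 0, count);

            // Wrap the block the same way RecordingHandler does before calling OnListen.
            AudioData record = new AudioData();
            record.MaxLevel = Mathf.Max(Mathf.Abs(Mathf.Min(samples)), Mathf.Max(samples));
            record.Clip = AudioClip.Create("Chunk", count, 1, clip.frequency, false);
            record.Clip.SetData(samples, 0);
            _service.OnListen(record);

            // Pace the upload roughly in real time so the service sees a live stream.
            yield return new WaitForSeconds((float)count / clip.frequency);
        }
    }

The idea would be to call Runnable.Run(StreamClipToService(Clipper.clip)) after Active = true instead of StartRecording(), but so far I have only gotten transcripts through the microphone path. Is this the right direction, or does the SDK expose a proper way to recognize a pre-recorded clip?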