Я попытался запустить пример кода из:
https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/how-to-async-conversation-transcription
с реализацией вспомогательного класса из:
https://github.com/Azure-Samples/cognitive-services-speech-sdk/blob/master/samples/java/jre/console/src/com/microsoft/cognitiveservices/speech/samples/console/WavStream.java
с некоторыми небольшими изменениями, чтобы он мог читать wav-файлы, не ограниченные форматом «16 кГц, 16 бит, один канал», и при запуске я получаю следующее:
SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder".
SLF4J: Defaulting to no-operation (NOP) logger implementation
SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details.
Conversation transcriber canceled:SessionId:b2496d2c13424b3ba3138f2c8ce0893f ResultId:258796dbc69d491786f3ccdd8ec708d6 CancellationReason:Error CancellationErrorCode:ConnectionFailure Error details:<Connection failed (no connection to the remote host). Internal error: 1. Error details: 11001. Please check network connection, firewall setting, and the region name used to create speech factory. SessionId: b2496d2c13424b3ba3138f2c8ce0893f
Conversation transcriber stopped:SessionId: b2496d2c13424b3ba3138f2c8ce0893f.
Я уверен, что ключ API и регион указаны правильно и работают, а с подключением к Интернету никаких проблем нет.
Вот код:
Main.java:
package speechsdk.quickstart;
import com.azure.core.util.polling.PollerFlux;
import com.azure.core.util.polling.SyncPoller;
import com.microsoft.cognitiveservices.speech.*;
import com.microsoft.cognitiveservices.speech.audio.*;
import com.microsoft.cognitiveservices.speech.remoteconversation.RemoteConversationTranscriptionClient;
import com.microsoft.cognitiveservices.speech.remoteconversation.RemoteConversationTranscriptionOperation;
import com.microsoft.cognitiveservices.speech.remoteconversation.RemoteConversationTranscriptionResult;
import com.microsoft.cognitiveservices.speech.transcription.Conversation;
import com.microsoft.cognitiveservices.speech.transcription.ConversationTranscriber;
import com.microsoft.cognitiveservices.speech.transcription.ConversationTranscriptionResult;
import javax.sound.sampled.AudioFileFormat;
import java.io.FileInputStream;
import java.util.UUID;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Future;
/**
 * Sample: asynchronous conversation transcription with the Speech SDK for Java.
 *
 * <p>Streams a WAV file to the service through a {@code ConversationTranscriber}, then polls the
 * remote transcription operation for the same conversation id and prints every returned result.
 */
public class Main {
    /**
     * Runs the sample end to end. Any exception is printed and the process exits with status 1.
     *
     * @param args Arguments are ignored in this sample.
     */
    public static void main(String[] args) {
        try {
            // Create the speech config object.
            // NOTE(review): "APIKEY" / "SERVICEREGION" are placeholders. A ConnectionFailure at
            // start-up presumably means the host derived from the region could not be reached;
            // confirm the region actually offers conversation transcription.
            SpeechConfig speechConfig = SpeechConfig.fromSubscription("APIKEY", "SERVICEREGION");
            speechConfig.setProperty("ConversationTranscriptionInRoomAndOnline", "true");

            // Set the property for asynchronous transcription
            speechConfig.setServiceProperty("transcriptionMode", "Async", ServicePropertyChannel.UriQueryParameter);
            // Set the property for real-time plus asynchronous transcription
            //speechConfig.setServiceProperty("transcriptionMode", "RealTimeAndAsync", ServicePropertyChannel.UriQueryParameter);

            // Pick a conversation id that is a GUID.
            String conversationId = UUID.randomUUID().toString();

            // Create a Conversation
            Conversation conversation = new Conversation(speechConfig, conversationId);

            // Create an audio stream from a wav file. WavStream parses the header, so the stream
            // format below is taken from the file instead of being hard-coded.
            WavStream wavStream = new WavStream(new FileInputStream("sample.wav"));
            AudioStreamFormat audioStreamFormat = AudioStreamFormat.getWaveFormatPCM(
                    wavStream.getSamplespersecond(),
                    (short) wavStream.getBitspersample(),
                    (short) wavStream.getChannel());

            // Create an input stream
            AudioInputStream audioStream = AudioInputStream.createPullStream(wavStream, audioStreamFormat);

            // Create a conversation transcriber
            ConversationTranscriber transcriber = new ConversationTranscriber(AudioConfig.fromStreamInput(audioStream));

            // Join the conversation and wait for the join to complete before going on, so that
            // transcription is never started on a transcriber that has not joined yet.
            transcriber.joinConversationAsync(conversation).get();

            // Add the event listeners for the realtime events
            transcriber.transcribed.addEventListener((o, e) -> {
                System.out.println("Conversation transcriber Recognized:" + e.toString());
            });
            transcriber.canceled.addEventListener((o, e) -> {
                System.out.println("Conversation transcriber canceled:" + e.toString());
                stopTranscribing(transcriber);
            });
            transcriber.sessionStopped.addEventListener((o, e) -> {
                System.out.println("Conversation transcriber stopped:" + e.toString());
                stopTranscribing(transcriber);
            });

            // Start the transcription and wait until the start request has completed.
            transcriber.startTranscribingAsync().get();

            // Create a remote Conversation Transcription client
            RemoteConversationTranscriptionClient client = new RemoteConversationTranscriptionClient(speechConfig);

            // Get the PollerFlux for the remote operation
            PollerFlux<RemoteConversationTranscriptionOperation, RemoteConversationTranscriptionResult> remoteTranscriptionOperation =
                    client.getTranscriptionOperation(conversationId);

            // Subscribe to PollerFlux to get the remote operation status
            remoteTranscriptionOperation.subscribe(pollResponse -> {
                System.out.println("Poll response status : " + pollResponse.getStatus());
                System.out.println("Poll response status : " + pollResponse.getValue().getServiceStatus());
            });

            // Obtain the blocking operation using getSyncPoller
            SyncPoller<RemoteConversationTranscriptionOperation, RemoteConversationTranscriptionResult> blockingOperation =
                    remoteTranscriptionOperation.getSyncPoller();

            // Wait for the operation to finish
            blockingOperation.waitForCompletion();

            // Get the final result response
            RemoteConversationTranscriptionResult resultResponse = blockingOperation.getFinalResult();

            // Print the result
            if (resultResponse != null && resultResponse.getConversationTranscriptionResults() != null) {
                for (ConversationTranscriptionResult result : resultResponse.getConversationTranscriptionResults()) {
                    System.out.println(result.getProperties().getProperty(PropertyId.SpeechServiceResponse_JsonResult.name()));
                    System.out.println(result.getProperties().getProperty(PropertyId.SpeechServiceResponse_JsonResult));
                    System.out.println(result.getOffset());
                    System.out.println(result.getDuration());
                    System.out.println(result.getUserId());
                    System.out.println(result.getReason());
                    System.out.println(result.getResultId());
                    System.out.println(result.getText());
                    System.out.println(result.toString());
                }
            }

            System.out.println("Operation finished");
        } catch (Exception ex) {
            // No assert(false) here: with -ea it would throw AssertionError and the explicit
            // exit below would never run.
            ex.printStackTrace();
            System.exit(1);
        }
    }

    /**
     * Requests the transcriber to stop and waits for the request to complete.
     * Failures are printed rather than rethrown because this runs inside SDK event callbacks.
     *
     * @param transcriber the transcriber to stop
     */
    private static void stopTranscribing(ConversationTranscriber transcriber) {
        try {
            transcriber.stopTranscribingAsync().get();
        } catch (InterruptedException ex) {
            // Restore the interrupt flag so the calling thread can still observe the interruption.
            Thread.currentThread().interrupt();
            ex.printStackTrace();
        } catch (ExecutionException ex) {
            ex.printStackTrace();
        }
    }
}
Helper.java:
package speechsdk.quickstart;
import com.microsoft.cognitiveservices.speech.audio.PullAudioInputStreamCallback;
import com.microsoft.cognitiveservices.speech.internal.AudioConfig;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import org.slf4j.*;
/**
 * Pull-stream callback that serves the sample data of a RIFF/WAVE file.
 *
 * <p>The constructor parses the WAV header, records the sample rate, bit depth and channel count
 * found in the {@code fmt } chunk, and leaves the wrapped stream positioned at the first byte of
 * the {@code data} chunk payload. Only format tag 1 (PCM) is accepted.
 */
class WavStream extends PullAudioInputStreamCallback {
    /** Format tag value accepted by {@link #parseWavHeader(InputStream)}. */
    private static final int PCM_FORMAT_TAG = 1;
    /** Number of bytes in the fixed part of the {@code fmt } chunk. */
    private static final int BASE_FORMAT_SIZE = 16;

    private final InputStream stream;
    private long samplespersecond;
    private int bitspersample;
    private int channel;

    /**
     * Wraps a WAV stream and consumes its header.
     *
     * @param wavStream stream positioned at the start of a RIFF/WAVE file
     * @throws IllegalArgumentException if the header cannot be read or is not an accepted WAV
     *         header; the underlying failure is kept as the cause
     */
    public WavStream(InputStream wavStream) {
        try {
            this.stream = parseWavHeader(wavStream);
        } catch (Exception ex) {
            // Keep the original exception as the cause so the real failure is not lost.
            throw new IllegalArgumentException(ex.getMessage(), ex);
        }
    }

    /** @return samples per second read from the {@code fmt } chunk */
    public long getSamplespersecond() {
        return samplespersecond;
    }

    /** @return bits per sample read from the {@code fmt } chunk */
    public int getBitspersample() {
        return bitspersample;
    }

    /** @return channel count read from the {@code fmt } chunk */
    public int getChannel() {
        return channel;
    }

    /**
     * Fills {@code dataBuffer} with the next bytes of the stream.
     *
     * @param dataBuffer buffer to fill
     * @return number of bytes read; 0 at end of stream or when reading failed
     */
    @Override
    public int read(byte[] dataBuffer) {
        int bytesRead = 0;
        try {
            bytesRead = this.stream.read(dataBuffer, 0, dataBuffer.length);
        } catch (Exception ex) {
            // Deliberately broad: a failure is reported and turned into "no data" instead of
            // being thrown back to whoever invoked the callback.
            System.out.println("Read " + ex);
        }
        // InputStream.read reports end of stream as -1; this callback reports it as 0.
        return Math.max(0, bytesRead);
    }

    /** Closes the wrapped stream; a failure while closing is ignored. */
    @Override
    public void close() {
        try {
            this.stream.close();
        } catch (IOException ignored) {
            // Best effort: nothing useful can be done if closing fails.
        }
    }

    /**
     * Reads the RIFF/WAVE header and positions {@code reader} at the start of the audio samples.
     *
     * <p>The {@code fmt } chunk must directly follow the {@code WAVE} tag. Any chunks between
     * {@code fmt } and {@code data} are skipped using their declared size.
     *
     * @param reader stream positioned at the start of a RIFF/WAVE file
     * @return the same stream, positioned at the first byte of the {@code data} chunk payload
     * @throws IOException if the stream fails or ends in the middle of a field or skipped chunk
     * @throws IllegalArgumentException if an expected tag is missing, the format chunk is shorter
     *         than 16 bytes, the format is not PCM, or no {@code data} chunk is found
     */
    public InputStream parseWavHeader(InputStream reader) throws IOException {
        byte[] tag = new byte[4];

        throwIfFalse(readTag(reader, tag) && hasTag(tag, "RIFF"), "RIFF");
        readUInt32(reader); // RIFF chunk size, not needed

        throwIfFalse(readTag(reader, tag) && hasTag(tag, "WAVE"), "WAVE");
        throwIfFalse(readTag(reader, tag) && hasTag(tag, "fmt "), "fmt ");

        long formatSize = readUInt32(reader);
        throwIfFalse(formatSize >= BASE_FORMAT_SIZE, "formatSize");

        int formatTag = readUInt16(reader);
        int channels = readUInt16(reader);
        long samplesPerSec = readUInt32(reader);
        readUInt32(reader); // average bytes per second, not needed
        readUInt16(reader); // block align, not needed
        int bitsPerSample = readUInt16(reader);

        throwIfFalse(formatTag == PCM_FORMAT_TAG, "PCM");
        channel = channels;
        samplespersecond = samplesPerSec;
        bitspersample = bitsPerSample;

        // Skip whatever follows the 16 fixed bytes of the format chunk, plus the pad byte that
        // RIFF adds after an odd-sized chunk.
        skipFully(reader, (formatSize - BASE_FORMAT_SIZE) + (formatSize & 1));

        // Walk chunk by chunk until "data": each chunk is a 4-byte tag and a 4-byte size followed
        // by that many payload bytes, so unknown chunks are skipped by their declared size.
        while (true) {
            throwIfFalse(readTag(reader, tag), "data");
            long chunkSize = readUInt32(reader);
            if (hasTag(tag, "data")) {
                return reader;
            }
            skipFully(reader, chunkSize + (chunkSize & 1));
        }
    }

    /** Fills {@code tag} completely; returns false if the stream ends first. */
    private static boolean readTag(InputStream in, byte[] tag) throws IOException {
        int filled = 0;
        while (filled < tag.length) {
            int n = in.read(tag, filled, tag.length - filled);
            if (n < 0) {
                return false;
            }
            filled += n;
        }
        return true;
    }

    /** Returns true when {@code tag} holds exactly the ASCII characters of {@code expected}. */
    private static boolean hasTag(byte[] tag, String expected) {
        if (tag.length != expected.length()) {
            return false;
        }
        for (int i = 0; i < tag.length; i++) {
            if (tag[i] != expected.charAt(i)) {
                return false;
            }
        }
        return true;
    }

    /** Reads an unsigned 32-bit little-endian value. */
    private static long readUInt32(InputStream in) throws IOException {
        return readLittleEndian(in, 4);
    }

    /** Reads an unsigned 16-bit little-endian value. */
    private static int readUInt16(InputStream in) throws IOException {
        return (int) readLittleEndian(in, 2);
    }

    /** Reads {@code byteCount} bytes as an unsigned little-endian number; fails on end of stream. */
    private static long readLittleEndian(InputStream in, int byteCount) throws IOException {
        long value = 0;
        for (int i = 0; i < byteCount; i++) {
            int b = in.read();
            if (b < 0) {
                throw new IOException("Unexpected end of WAV stream");
            }
            // Widen before shifting so a set high bit in the last byte is not sign-extended.
            value |= ((long) b) << (i * 8);
        }
        return value;
    }

    /** Skips exactly {@code count} bytes; fails if the stream ends first. */
    private static void skipFully(InputStream in, long count) throws IOException {
        long remaining = count;
        while (remaining > 0) {
            long skipped = in.skip(remaining);
            if (skipped > 0) {
                remaining -= skipped;
            } else if (in.read() >= 0) {
                // skip() may legitimately skip nothing; fall back to consuming one byte.
                remaining--;
            } else {
                throw new IOException("Unexpected end of WAV stream while skipping a chunk");
            }
        }
    }

    /** Throws {@link IllegalArgumentException} carrying {@code message} when the check fails. */
    private static void throwIfFalse(boolean condition, String message) {
        if (!condition) {
            throw new IllegalArgumentException(message);
        }
    }
}