Являются ли SpeakProgressEventArgs SpeechSynthesizer неточными? - PullRequest
6 голосов
/ 12 ноября 2009

При использовании класса System.Speech.Synthesis.SpeechSynthesizer в .NET 3.5 свойство AudioPosition объекта SpeakProgressEventArgs, по-видимому, возвращает неточные значения.

Приведённый ниже код выдаёт следующий вывод:

Код:

using System;
using System.Speech.Synthesis;
using System.Threading;

namespace SpeechTest
{
    /// <summary>
    /// Console sample that synthesizes a fixed sentence into "Test.wav" and prints
    /// the data of every SpeakProgress event raised during synthesis.
    /// </summary>
    class Program
    {
        // Set by the SpeakCompleted handler so that Main can block until the
        // asynchronous speak operation has finished.
        static readonly ManualResetEvent speechDoneEvent = new ManualResetEvent(false);

        /// <summary>
        /// Wires up the progress/completion handlers, starts asynchronous synthesis
        /// to a wave file and waits for the completion signal.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            // SpeechSynthesizer implements IDisposable; the using block guarantees
            // its resources are released once synthesis has completed instead of
            // lingering until process exit.
            using (SpeechSynthesizer synthesizer = new SpeechSynthesizer())
            {
                synthesizer.SpeakProgress += new EventHandler<SpeakProgressEventArgs>(synthesizer_SpeakProgress);

                synthesizer.SpeakCompleted += new EventHandler<SpeakCompletedEventArgs>(synthesizer_SpeakCompleted);

                synthesizer.SetOutputToWaveFile("Test.wav");

                synthesizer.SpeakAsync("This holiday season, support the music you love by shopping at Made in Washington, online and at one of five local stores. Made in Washington chocolates, bountiful gift baskets and ornaments are the perfect holiday gifts for family, friends and co-workers.");

                // SpeakAsync returns immediately; wait here so the synthesizer is
                // not disposed before SpeakCompleted has been raised.
                speechDoneEvent.WaitOne();
            }
        }

        /// <summary>
        /// Signals Main that the asynchronous speak operation has completed.
        /// </summary>
        static void synthesizer_SpeakCompleted(object sender, SpeakCompletedEventArgs e)
        {
            speechDoneEvent.Set();
        }

        /// <summary>
        /// Prints the audio position, character position, character count and text
        /// carried by each SpeakProgress event.
        /// </summary>
        static void synthesizer_SpeakProgress(object sender, SpeakProgressEventArgs e)
        {
            Console.WriteLine("SpeakProgress: AudioPosition=" + e.AudioPosition + ",\tCharacterPosition=" + e.CharacterPosition + ",\tCharacterCount=" + e.CharacterCount + ",\tText=" + e.Text);
        }
    }
}

Вывод:

SpeakProgress: AudioPosition=00:00:00.0043750,  CharacterPosition=0,    CharacterCount=4,       Text=This
SpeakProgress: AudioPosition=00:00:00.2925625,  CharacterPosition=5,    CharacterCount=7,       Text=holiday
SpeakProgress: AudioPosition=00:00:00.9086250,  CharacterPosition=13,   CharacterCount=6,       Text=season
SpeakProgress: AudioPosition=00:00:01.9421250,  CharacterPosition=21,   CharacterCount=7,       Text=support
SpeakProgress: AudioPosition=00:00:02.5621250,  CharacterPosition=29,   CharacterCount=3,       Text=the
SpeakProgress: AudioPosition=00:00:02.6760625,  CharacterPosition=33,   CharacterCount=5,       Text=music
SpeakProgress: AudioPosition=00:00:03.2648125,  CharacterPosition=39,   CharacterCount=3,       Text=you
SpeakProgress: AudioPosition=00:00:03.5199375,  CharacterPosition=43,   CharacterCount=4,       Text=love
SpeakProgress: AudioPosition=00:00:03.8435625,  CharacterPosition=48,   CharacterCount=2,       Text=by
SpeakProgress: AudioPosition=00:00:04.0701875,  CharacterPosition=51,   CharacterCount=8,       Text=shopping
SpeakProgress: AudioPosition=00:00:04.6840625,  CharacterPosition=60,   CharacterCount=2,       Text=at
SpeakProgress: AudioPosition=00:00:04.8036250,  CharacterPosition=63,   CharacterCount=4,       Text=Made
SpeakProgress: AudioPosition=00:00:05.0698125,  CharacterPosition=68,   CharacterCount=2,       Text=in
SpeakProgress: AudioPosition=00:00:05.2521250,  CharacterPosition=71,   CharacterCount=10,      Text=Washington
SpeakProgress: AudioPosition=00:00:06.2961875,  CharacterPosition=83,   CharacterCount=6,       Text=online
SpeakProgress: AudioPosition=00:00:07.0540625,  CharacterPosition=90,   CharacterCount=3,       Text=and
SpeakProgress: AudioPosition=00:00:07.3331250,  CharacterPosition=94,   CharacterCount=2,       Text=at
SpeakProgress: AudioPosition=00:00:07.6818750,  CharacterPosition=97,   CharacterCount=3,       Text=one
SpeakProgress: AudioPosition=00:00:08.0598750,  CharacterPosition=101,  CharacterCount=2,       Text=of
SpeakProgress: AudioPosition=00:00:08.2163750,  CharacterPosition=104,  CharacterCount=4,       Text=five
SpeakProgress: AudioPosition=00:00:08.5971875,  CharacterPosition=109,  CharacterCount=5,       Text=local
SpeakProgress: AudioPosition=00:00:09.0243750,  CharacterPosition=115,  CharacterCount=6,       Text=stores
SpeakProgress: AudioPosition=00:00:10.5325625,  CharacterPosition=123,  CharacterCount=4,       Text=Made
SpeakProgress: AudioPosition=00:00:10.7700625,  CharacterPosition=128,  CharacterCount=2,       Text=in
SpeakProgress: AudioPosition=00:00:10.9377500,  CharacterPosition=131,  CharacterCount=10,      Text=Washington
SpeakProgress: AudioPosition=00:00:11.6708125,  CharacterPosition=142,  CharacterCount=10,      Text=chocolates
SpeakProgress: AudioPosition=00:00:12.9798750,  CharacterPosition=154,  CharacterCount=9,       Text=bountiful
SpeakProgress: AudioPosition=00:00:13.6303125,  CharacterPosition=164,  CharacterCount=4,       Text=gift
SpeakProgress: AudioPosition=00:00:14.0959375,  CharacterPosition=169,  CharacterCount=7,       Text=baskets
SpeakProgress: AudioPosition=00:00:14.7848125,  CharacterPosition=177,  CharacterCount=3,       Text=and
SpeakProgress: AudioPosition=00:00:15.0507500,  CharacterPosition=181,  CharacterCount=9,       Text=ornaments
SpeakProgress: AudioPosition=00:00:15.7195000,  CharacterPosition=191,  CharacterCount=3,       Text=are
SpeakProgress: AudioPosition=00:00:15.9872500,  CharacterPosition=195,  CharacterCount=3,       Text=the
SpeakProgress: AudioPosition=00:00:16.1488750,  CharacterPosition=199,  CharacterCount=7,       Text=perfect
SpeakProgress: AudioPosition=00:00:16.7275000,  CharacterPosition=207,  CharacterCount=7,       Text=holiday
SpeakProgress: AudioPosition=00:00:17.3336875,  CharacterPosition=215,  CharacterCount=5,       Text=gifts
SpeakProgress: AudioPosition=00:00:17.9813125,  CharacterPosition=221,  CharacterCount=3,       Text=for
SpeakProgress: AudioPosition=00:00:18.2216875,  CharacterPosition=225,  CharacterCount=6,       Text=family
SpeakProgress: AudioPosition=00:00:19.0973750,  CharacterPosition=233,  CharacterCount=7,       Text=friends
SpeakProgress: AudioPosition=00:00:19.7726250,  CharacterPosition=241,  CharacterCount=3,       Text=and
SpeakProgress: AudioPosition=00:00:19.9655625,  CharacterPosition=245,  CharacterCount=10,      Text=co-workers
SpeakProgress: AudioPosition=00:00:20.2518750,  CharacterPosition=245,  CharacterCount=10,      Text=co-workers

Однако продолжительность создаваемого WAV-файла составляет 15,69 секунды. То же самое происходит при выводе в поток или в null.

Документация для свойства гласит, что это свойство "объект TimeSpan, который представляет временную позицию события в потоке аудиовывода".

Должно ли это быть точное время, указывающее время начала или окончания слова в выходном файле, или я неправильно его понимаю?

1 Ответ

1 голос
/ 10 декабря 2015

AudioPosition зависит от выбранного голоса синтезатора речи. Для некоторых голосов Microsoft, таких как Anna, Zira, David и Hazel, насколько я слышал, поддерживаемый аудиоформат — PCM 16000 Гц. Поэтому следующее решение может исправить позицию аудио:

// Explicitly request 16 kHz, 16-bit, mono PCM output instead of the default
// wave-file format. The derived values follow from the PCM layout:
// blockAlign = bytes per sample frame, averageBytesPerSecond = rate * blockAlign.
const int samplesPerSecond = 16000;
const int bitsPerSample = 16;
const int channelCount = 1;
const int blockAlign = channelCount * bitsPerSample / 8;          // 2
const int averageBytesPerSecond = samplesPerSecond * blockAlign;  // 32000

// EncodingFormat is fully qualified as well, so the snippet compiles without
// an additional "using System.Speech.AudioFormat;" directive.
var format =
new System.Speech.AudioFormat.SpeechAudioFormatInfo(
    System.Speech.AudioFormat.EncodingFormat.Pcm,
    samplesPerSecond, bitsPerSample, channelCount,
    averageBytesPerSecond, blockAlign, null);
synthesizer.SetOutputToWaveFile("Test.wav", format);

Если вы заметили, частота дискретизации SetOutputToWaveFile по умолчанию равна 22050, а отношение правильного времени (15.69) ко времени, указанному AudioPosition (20.25), составляет около 0.77. Если вы умножите это соотношение на 22050, вы получите около 16000, что является правильной частотой дискретизации.

...