Uwp SpeechSynthesizer / MediaPlayer утечка памяти - PullRequest
0 голосов
/ 12 ноября 2018

У нас есть UWP-приложение, которое использует голоса Microsoft, чтобы произносить текст и выделять его по мере произнесения. Я заметил, что потребление памяти приложением растёт с каждым произнесённым фрагментом текста, и в конечном итоге памяти перестаёт хватать. Неважно, какой голос используется и какой текст произносится.

Чтобы выделять текст, я подписываюсь на события коллекции TimedMetadataTracks объекта MediaPlaybackItem. Когда текст заканчивается, я отписываюсь от каждого события и вызываю Dispose() для MediaPlaybackItem.Source. Профилировщик памяти Visual Studio не показывает утечек в управляемой памяти, поэтому я подозреваю, что что-то не освобождается в неуправляемой памяти.

Правка: я отметил это в комментарии в коде, но напишу и здесь — если не подписываться на события TimedMetadataTrack, утечка исчезает. Я также могу воспроизвести это с помощью примера приложения Windows (синтез текста с границами).

Я упускаю что-то, что нужно освободить (Dispose), или это ошибка в SpeechSynthesizer / MediaPlayer?

using System;
using System.Diagnostics;
using Windows.Media.Core;
using Windows.Media.Playback;
using Windows.Media.SpeechSynthesis;

namespace WindowsTts
{
    /// <summary>
    /// Speaks text with a platform voice through a <see cref="MediaPlayer"/> and reports
    /// start, word-boundary and end events to the <see cref="SpeechDelegate"/> supplied
    /// with each <see cref="Speak"/> call.
    /// </summary>
    public class UwpNativeVoice : IDisposable
    {
        /// <summary>Id/label of the timed metadata track whose cues are reported as word events.</summary>
        private const string SpeechWordIdentifier = "SpeechWord";

        /// <summary>Guards reads and writes of <see cref="_activeSpeech"/>.</summary>
        private readonly object _activeSpeechLock = new object();
        private SpeechSynthesizer _synthesizer;
        private MediaPlayer _mediaPlayer;

        /// <summary>Callback of the utterance currently being spoken; null when idle.</summary>
        private SpeechCallback _activeSpeech;

        /// <summary>Creates the synthesizer (with word-boundary metadata enabled) and the player.</summary>
        /// <param name="platformInfo">Voice assigned to the synthesizer.</param>
        public UwpNativeVoice(VoiceInformation platformInfo)
        {
            _synthesizer = new SpeechSynthesizer();
            _synthesizer.Options.IncludeWordBoundaryMetadata = true;
            _synthesizer.Voice = platformInfo;

            _mediaPlayer = new MediaPlayer
            {
                RealTimePlayback = true,
                AutoPlay = false,
                Volume = 1.0f
            };
            _mediaPlayer.MediaOpened += OnMediaPlayerMediaOpened;
            _mediaPlayer.MediaEnded += OnMediaPlayerMediaEnded;
        }

        /// <summary>
        /// Detaches all event handlers and disposes the player, its current source and the
        /// synthesizer. Calling it more than once is a no-op.
        /// </summary>
        public void Dispose()
        {
            // Take the player out of the field first so a repeated Dispose() (or a late
            // Halt()) sees null instead of dereferencing a disposed object.
            var player = _mediaPlayer;
            _mediaPlayer = null;
            if (player != null)
            {
                player.MediaOpened -= OnMediaPlayerMediaOpened;
                player.MediaEnded -= OnMediaPlayerMediaEnded;

                // Also unsubscribes CueEntered from the item's tracks before disposing its source.
                DestroyMediaPlaybackItem(player.Source as MediaPlaybackItem);
                player.Source = null;
                player.Dispose();
            }

            _synthesizer?.Dispose();
            _synthesizer = null;
        }

        /// <summary>
        /// Synthesizes <paramref name="text"/> and starts playing it, halting any utterance
        /// that is currently active. Progress is reported through <paramref name="speechDelegate"/>.
        /// </summary>
        /// <param name="text">Text to speak; null or empty only fires Start and End.</param>
        /// <param name="speechDelegate">Receives the events for this utterance; may be null.</param>
        /// <remarks>
        /// The method is fire-and-forget (<c>async void</c>), so failures cannot be thrown to
        /// the caller; synthesis and playback-setup failures are logged and reported as an
        /// End event instead.
        /// </remarks>
        public async void Speak(string text, SpeechDelegate speechDelegate)
        {
            if (string.IsNullOrEmpty(text))
            {
                // no-op; just fire events and bail
                speechDelegate?.Invoke(text, ReadTextEvent.Start);
                speechDelegate?.Invoke(text, ReadTextEvent.End);
                return;
            }

            // Halt() returns immediately when nothing is speaking; otherwise it stops the
            // current utterance and fires its End event before we start anew.
            Halt();

            // get synth stream, including the word-boundary metadata enabled in the constructor
            SpeechSynthesisStream synthStream;
            try
            {
                synthStream = await _synthesizer.SynthesizeTextToStreamAsync(text);
            }
            catch (Exception e)
            {
                // Covers synthesis failures and a Speak() after Dispose(). Same reporting
                // as the playback-setup failure path below: log, then signal End.
                Debug.WriteLine(e);
                speechDelegate?.Invoke(text, ReadTextEvent.End);
                return;
            }

            SpeechCallback superseded;
            lock (_activeSpeechLock)
            {
                // Non-null only when another Speak() installed its utterance while this
                // call was awaiting synthesis.
                superseded = _activeSpeech;
                _activeSpeech = new SpeechCallback(text, speechDelegate);

                try
                {
                    var previousItem = _mediaPlayer.Source as MediaPlaybackItem;

                    var source = MediaSource.CreateFromStream(synthStream, synthStream.ContentType);
                    var playbackItem = new MediaPlaybackItem(source);

                    // NOTE: the original author reports that skipping this subscription
                    // makes the observed memory growth disappear.
                    ConfigPlaybackEvents(playbackItem);
                    _mediaPlayer.Source = playbackItem;

                    // Release the item we just replaced (if any) once the player no longer
                    // references it, so its handlers and source do not linger.
                    DestroyMediaPlaybackItem(previousItem);

                    _mediaPlayer.Play();
                }
                catch (Exception e)
                {
                    Debug.WriteLine(e);
                    _activeSpeech?.Invoke(ReadTextEvent.End);
                    _activeSpeech = null;
                }
            }

            // The superseded utterance will never reach MediaEnded; tell its listener it is over.
            superseded?.Invoke(ReadTextEvent.End);
        }

        /// <summary>
        /// Stops the active utterance, releases its playback item and fires its End event.
        /// </summary>
        /// <returns>Always <c>true</c>.</returns>
        public bool Halt()
        {
            lock (_activeSpeechLock)
            {
                if (_activeSpeech == null)
                {
                    return true;
                }
            }

            // The player is null after Dispose(); the pending callback is still completed below.
            var player = _mediaPlayer;
            if (player != null)
            {
                player.Pause();
                DestroyMediaPlaybackItem(player.Source as MediaPlaybackItem);
                player.Source = null;
            }

            SpeechCallback callback;
            lock (_activeSpeechLock)
            {
                callback = _activeSpeech;
                _activeSpeech = null;
            }
            // Invoked outside the lock so listener code never runs while the lock is held.
            callback?.Invoke(ReadTextEvent.End);

            return true;
        }

        /// <summary>Reports Start for the active utterance when the player has opened its media.</summary>
        private void OnMediaPlayerMediaOpened(MediaPlayer sender, object args)
        {
            FireReadTextEvent(ReadTextEvent.Start);
        }

        /// <summary>Translates a speech cue into a word event for the active utterance.</summary>
        private void OnTimedMetadataTrackEntered(TimedMetadataTrack track, MediaCueEventArgs args)
        {
            if (track.TimedMetadataKind == TimedMetadataKind.Speech && args.Cue is SpeechCue speechCue)
            {
                // Missing positions default to 0 / -1, which yields a zero-length word at offset 0.
                var startIdx = speechCue.StartPositionInInput ?? 0;
                var endIdx = speechCue.EndPositionInInput ?? -1;
                FireReadTextEvent(ReadTextEvent.WordEvent(startIdx, (endIdx - startIdx) + 1));
            }
        }

        /// <summary>Fires End for the active utterance and releases the finished playback item.</summary>
        private void OnMediaPlayerMediaEnded(MediaPlayer sender, object args)
        {
            SpeechCallback callback;
            lock (_activeSpeechLock)
            {
                callback = _activeSpeech;
                _activeSpeech = null;
            }
            callback?.Invoke(ReadTextEvent.End);

            DestroyMediaPlaybackItem(sender.Source as MediaPlaybackItem);
            sender.Source = null;
        }

        /// <summary>Invokes the active utterance's callback, if any, with <paramref name="evt"/>.</summary>
        private void FireReadTextEvent(ReadTextEvent evt)
        {
            SpeechCallback callback;
            lock (_activeSpeechLock)
            {
                callback = _activeSpeech;
            }
            callback?.Invoke(evt);
        }

        /// <summary>Registers cue callbacks on the tracks the item exposes at this moment.</summary>
        private void ConfigPlaybackEvents(MediaPlaybackItem playbackItem)
        {
            // see: https://docs.microsoft.com/en-us/uwp/api/windows.media.core.timedmetadatatrack

            // iterate through existing tracks, registering callbacks for them
            for (int i = 0; i < playbackItem.TimedMetadataTracks.Count; i++)
            {
                RegisterAction(playbackItem, i);
            }
        }

        /// <summary>
        /// Subscribes to the track at <paramref name="idx"/> when its id or label is
        /// "SpeechWord", and switches that track to application-presented mode.
        /// </summary>
        private void RegisterAction(MediaPlaybackItem item, int idx)
        {
            TimedMetadataTrack track = item.TimedMetadataTracks[idx];
            if (track.Id.Equals(SpeechWordIdentifier, StringComparison.Ordinal) || track.Label.Equals(SpeechWordIdentifier, StringComparison.Ordinal))
            {
                track.CueEntered += OnTimedMetadataTrackEntered;
                item.TimedMetadataTracks.SetPresentationMode((uint)idx, TimedMetadataTrackPresentationMode.ApplicationPresented);
            }
        }

        /// <summary>
        /// Unsubscribes the cue handler from every track of <paramref name="item"/> and
        /// disposes the item's source. A null item is ignored.
        /// </summary>
        private void DestroyMediaPlaybackItem(MediaPlaybackItem item)
        {
            if (item == null)
            {
                return;
            }

            foreach (var track in item.TimedMetadataTracks)
            {
                track.CueEntered -= OnTimedMetadataTrackEntered;
            }

            item.Source?.Dispose();
        }
    }
}




namespace WindowsTts
{
    /// <summary>Defines a trigger that caused the broadcasting of a ReadTextEvent.</summary>
    public enum ReadTextTrigger
    {
        /// <summary>Trigger carried by <see cref="ReadTextEvent.Start"/>.</summary>
        Start,

        /// <summary>Trigger carried by events built with <see cref="ReadTextEvent.BookmarkEvent"/>.</summary>
        Bookmark,

        /// <summary>Trigger carried by events built with <see cref="ReadTextEvent.WordEvent"/>.</summary>
        Word,

        /// <summary>Trigger carried by <see cref="ReadTextEvent.End"/>.</summary>
        End,
    }

    /// <summary>A ReadTextEvent encompasses the relevant information from the tts world and is passed to the api user as part of a ReadTextInfo's EventAction data. </summary>
    public class ReadTextEvent
    {
        /// <summary>Value stored in <see cref="TextOffset"/> / <see cref="TextLength"/> by the factories that take no position.</summary>
        private const int NoPosition = -1;

        /// <summary>
        /// Shared Start event. The same instance is returned on every access and its
        /// properties are settable, so callers should treat it as read-only.
        /// </summary>
        public static ReadTextEvent Start { get; } = Create(ReadTextTrigger.Start, null, NoPosition, NoPosition);

        /// <summary>
        /// Shared End event. The same instance is returned on every access and its
        /// properties are settable, so callers should treat it as read-only.
        /// </summary>
        public static ReadTextEvent End { get; } = Create(ReadTextTrigger.End, null, NoPosition, NoPosition);

        /// <summary>What caused this event.</summary>
        public ReadTextTrigger Trigger { get; set; }

        /// <summary>Bookmark name; null unless set (only <see cref="BookmarkEvent"/> sets it here).</summary>
        public string BookmarkName { get; set; }

        /// <summary>Text offset; -1 unless set (only <see cref="WordEvent"/> sets it here).</summary>
        public int TextOffset { get; set; }

        /// <summary>Text length; -1 unless set (only <see cref="WordEvent"/> sets it here).</summary>
        public int TextLength { get; set; }

        /// <summary>Utility methods to pre-initialize some fields of this object.</summary>
        /// <param name="src">Event whose four properties are copied into a new instance.</param>
        /// <returns>A new event with the same property values as <paramref name="src"/>.</returns>
        /// <exception cref="System.ArgumentNullException"><paramref name="src"/> is null.</exception>
        public static ReadTextEvent Factory(ReadTextEvent src)
        {
            // Fail with a descriptive argument error instead of a bare NullReferenceException.
            if (src == null)
            {
                throw new System.ArgumentNullException(nameof(src));
            }

            return Create(src.Trigger, src.BookmarkName, src.TextOffset, src.TextLength);
        }

        /// <summary>Creates a Bookmark event carrying <paramref name="bookmark"/>; offset and length are -1.</summary>
        public static ReadTextEvent BookmarkEvent(string bookmark)
        {
            return Create(ReadTextTrigger.Bookmark, bookmark, NoPosition, NoPosition);
        }

        /// <summary>Creates a Word event with the given offset and length; the bookmark name is null.</summary>
        public static ReadTextEvent WordEvent(int textOffset, int textLength)
        {
            return Create(ReadTextTrigger.Word, null, textOffset, textLength);
        }

        /// <summary>Single place where instances are built, so every factory fills all four properties.</summary>
        private static ReadTextEvent Create(ReadTextTrigger trigger, string bookmarkName, int textOffset, int textLength)
        {
            return new ReadTextEvent()
            {
                Trigger = trigger,
                BookmarkName = bookmarkName,
                TextOffset = textOffset,
                TextLength = textLength,
            };
        }

        // Private: instances are obtained through the static members above.
        private ReadTextEvent()
        {
        }
    }

    /// <summary>
    /// A SpeechDelegate is passed to the ITtsVoice.Speak() method, so that the caller may receive progress info as the text is being spoken.
    /// </summary>
    /// <param name="speechText">The text the event refers to.</param>
    /// <param name="readTextEvent">The event being reported for that text.</param>
    public delegate void SpeechDelegate(string speechText, ReadTextEvent readTextEvent);

    /// <summary>
    /// This class encapsulates everything necessary to invoke a SpeechDelegate.
    /// A SpeechCallback instance may be created each time a new string is enqueued for speaking,
    /// and then invoked multiple times throughout the process, with an updated ReadTextEvent.
    /// </summary>
    public class SpeechCallback
    {
        private readonly SpeechDelegate _speechDelegate;

        /// <summary>Binds <paramref name="text"/> to <paramref name="speechDelegate"/> (which may be null).</summary>
        public SpeechCallback(string text, SpeechDelegate speechDelegate)
        {
            Text = text;
            _speechDelegate = speechDelegate;
        }

        /// <summary>The text passed as the first delegate argument on every invocation.</summary>
        public string Text { get; }

        /// <summary>Calls the delegate with <see cref="Text"/> and <paramref name="readTextEvent"/>; does nothing when the delegate is null.</summary>
        public void Invoke(ReadTextEvent readTextEvent) => _speechDelegate?.Invoke(Text, readTextEvent);
    }
}

namespace WindowsTts
{
    // NOTE(review): the declarations in this namespace block also appear, with the same
    // names, in an earlier WindowsTts namespace block of this file; one copy likely has
    // to be removed for the file to compile as a single unit.

    /// <summary>Defines a trigger that caused the broadcasting of a ReadTextEvent.</summary>
    public enum ReadTextTrigger
    {
        /// <summary>Trigger carried by <see cref="ReadTextEvent.Start"/>.</summary>
        Start,

        /// <summary>Trigger carried by events built with <see cref="ReadTextEvent.BookmarkEvent"/>.</summary>
        Bookmark,

        /// <summary>Trigger carried by events built with <see cref="ReadTextEvent.WordEvent"/>.</summary>
        Word,

        /// <summary>Trigger carried by <see cref="ReadTextEvent.End"/>.</summary>
        End,
    }

    /// <summary>A ReadTextEvent encompasses the relevant information from the tts world and is passed to the api user as part of a ReadTextInfo's EventAction data. </summary>
    public class ReadTextEvent
    {
        /// <summary>Value stored in <see cref="TextOffset"/> / <see cref="TextLength"/> by the factories that take no position.</summary>
        private const int NoPosition = -1;

        /// <summary>
        /// Shared Start event. The same instance is returned on every access and its
        /// properties are settable, so callers should treat it as read-only.
        /// </summary>
        public static ReadTextEvent Start { get; } = Create(ReadTextTrigger.Start, null, NoPosition, NoPosition);

        /// <summary>
        /// Shared End event. The same instance is returned on every access and its
        /// properties are settable, so callers should treat it as read-only.
        /// </summary>
        public static ReadTextEvent End { get; } = Create(ReadTextTrigger.End, null, NoPosition, NoPosition);

        /// <summary>What caused this event.</summary>
        public ReadTextTrigger Trigger { get; set; }

        /// <summary>Bookmark name; null unless set (only <see cref="BookmarkEvent"/> sets it here).</summary>
        public string BookmarkName { get; set; }

        /// <summary>Text offset; -1 unless set (only <see cref="WordEvent"/> sets it here).</summary>
        public int TextOffset { get; set; }

        /// <summary>Text length; -1 unless set (only <see cref="WordEvent"/> sets it here).</summary>
        public int TextLength { get; set; }

        /// <summary>Utility methods to pre-initialize some fields of this object.</summary>
        /// <param name="src">Event whose four properties are copied into a new instance.</param>
        /// <returns>A new event with the same property values as <paramref name="src"/>.</returns>
        /// <exception cref="System.ArgumentNullException"><paramref name="src"/> is null.</exception>
        public static ReadTextEvent Factory(ReadTextEvent src)
        {
            // Fail with a descriptive argument error instead of a bare NullReferenceException.
            if (src == null)
            {
                throw new System.ArgumentNullException(nameof(src));
            }

            return Create(src.Trigger, src.BookmarkName, src.TextOffset, src.TextLength);
        }

        /// <summary>Creates a Bookmark event carrying <paramref name="bookmark"/>; offset and length are -1.</summary>
        public static ReadTextEvent BookmarkEvent(string bookmark)
        {
            return Create(ReadTextTrigger.Bookmark, bookmark, NoPosition, NoPosition);
        }

        /// <summary>Creates a Word event with the given offset and length; the bookmark name is null.</summary>
        public static ReadTextEvent WordEvent(int textOffset, int textLength)
        {
            return Create(ReadTextTrigger.Word, null, textOffset, textLength);
        }

        /// <summary>Single place where instances are built, so every factory fills all four properties.</summary>
        private static ReadTextEvent Create(ReadTextTrigger trigger, string bookmarkName, int textOffset, int textLength)
        {
            return new ReadTextEvent()
            {
                Trigger = trigger,
                BookmarkName = bookmarkName,
                TextOffset = textOffset,
                TextLength = textLength,
            };
        }

        // Private: instances are obtained through the static members above.
        private ReadTextEvent()
        {
        }
    }

    /// <summary>
    /// A SpeechDelegate is passed to the ITtsVoice.Speak() method, so that the caller may receive progress info as the text is being spoken.
    /// </summary>
    /// <param name="speechText">The text the event refers to.</param>
    /// <param name="readTextEvent">The event being reported for that text.</param>
    public delegate void SpeechDelegate(string speechText, ReadTextEvent readTextEvent);

    /// <summary>
    /// This class encapsulates everything necessary to invoke a SpeechDelegate.
    /// A SpeechCallback instance may be created each time a new string is enqueued for speaking,
    /// and then invoked multiple times throughout the process, with an updated ReadTextEvent.
    /// </summary>
    public class SpeechCallback
    {
        private readonly SpeechDelegate _speechDelegate;

        /// <summary>Binds <paramref name="text"/> to <paramref name="speechDelegate"/> (which may be null).</summary>
        public SpeechCallback(string text, SpeechDelegate speechDelegate)
        {
            Text = text;
            _speechDelegate = speechDelegate;
        }

        /// <summary>The text passed as the first delegate argument on every invocation.</summary>
        public string Text { get; }

        /// <summary>Calls the delegate with <see cref="Text"/> and <paramref name="readTextEvent"/>; does nothing when the delegate is null.</summary>
        public void Invoke(ReadTextEvent readTextEvent) => _speechDelegate?.Invoke(Text, readTextEvent);
    }
}
...