unit TaggedAudio;

{
  // ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////

  Tagged audio consists of a chain of audio chunks that are labeled with some extra information
  to indicate where in the spoken text they originated from. The tags' audio can be interpolated over
  the full chain, triggering callbacks to signal the generating text element to an interested party.

  This was made for voice generated audio to keep track of where audio came from, so a speech
  module can implement Sentence, Word and Phoneme change outputs which then in turn can be used to
  modify the generated audio.

  This can be made to work with SAPI or eSpeak, both do provide the needed callback mechanism, but
  this implementation was tested on eSpeak only .. SAPI may need some mods.

  All tags must contain audio, and they can have multiple callback generating attributes.

  The idea is to render the speech 'offline' while generating tagged audio, the tagged audio can then
  be used at runtime.

  for instance:

  +------ this is the first tag, containing audio and four attributes
  |
  |  [textstart]  - start    tag, can cause a callback to be called - indicates the start of the audio
  |  [sentence]   - sentence tag, can cause a callback to be called - indicates the start of a sentence
  |  [word]       - word     tag, can cause a callback to be called - indicates the start of a word
  |  [phoneme]    - phoneme  tag, can cause a callback to be called - indicates the start of a phoneme
  |  [audio]      - audio to be played with interpolation, all audio buffers are glued together to one
  +------

  +------ a next tag
  |  [phoneme]    - etc
  |  [mark]       - mark     from <mark value="text"/>, can cause a callback
  |  [usermark]   - usermark from <mark value="user1=1.27"/> :: user<digits>=<value>
  |  [audio]
  +------

  +------ a next tag
  |  [phoneme]
  |  [audio]     <---+--- should be able to interpolate between two audio tags
  +------            |    which is possible as all tags must contain audio and
                     |    tags are connected by a next-tag link. Only forward
  +------ a next tag |    look ups are needed, and provided.
  |  [word]          |
  |  [phoneme]       |
  |  [audio]     <---+
  +------

  +------ a next tag
  |  [phoneme]
  |  [silence]    - that is a phoneme value of '_' or '_:'
  |  [audio]
  +------

  +------ the last tag, or the pre-first as it is circular really
  |  [textdone]   - end      tag, can cause a callback to be called - indicates the end of the audio
  |  [audio]      - probably just one sample
  +------


  // ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////

   COPYRIGHT 2019 Blue Hell / Jan Punter

  This program is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License version 2 as
  published by the Free Software Foundation;

  This program is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with this program; if not, write to the Free Software
  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

  For all listed email addresses :

    _dot. to be substituted by a dot      '.'
    2@t2  to be substituted by an at sign '@'


  Blue Hell is a trade mark owned by

    Jan Punter
    https://www.bluehell.nl/
    jan2@t2mail_dot_bluehell_dot_nl
}



interface

uses

  System.SysUtils, System.Classes, System.Math,

  ESpeakInterface, KnobsUtils, KnobsConversions;


type

  // Callback signatures used by TTaggedAudio. Except for TOnRenderingDone they are invoked from
  // TTaggedAudio.HandleCallback when interpolation moves onto a tag that carries the matching
  // tag type; aSender is the TTaggedAudio instance.
  //
  //   TOnSentence / TOnWord : anId is the id stored in the tag
  //   TOnPhoneme / TOnMark  : the string is the info text stored in the tag
  //   TOnUserMark           : anId and aValue are the id and signal value stored in the tag
  //   TOnRenderingDone      : invoked at the end of TTaggedAudio.FinalizeCreation
  TOnTextStart     = procedure( const aSender: TObject                                ) of object;
  TOnSentence      = procedure( const aSender: TObject; anId: Integer                 ) of object;
  TOnWord          = procedure( const aSender: TObject; anId: Integer                 ) of object;
  TOnPhoneme       = procedure( const aSender: TObject; const aPhoneme: string        ) of object;
  TOnSilence       = procedure( const aSender: TObject                                ) of object;
  TOnMark          = procedure( const aSender: TObject; const aMark   : string        ) of object;
  TOnUserMark      = procedure( const aSender: TObject; anId: Integer; aValue: TSignal) of object;
  TOnTextDone      = procedure( const aSender: TObject                                ) of object;
  TOnRenderingDone = procedure( const aSender: TObject)                                 of object;


  // The kinds of information a tag can carry. A single tag can carry several of them at once,
  // see TAudioTagTypes.
  TAudioTagType =
  (
    attTextStart ,   // Start of text
    attSentence  ,   // Sentence element
    attWord      ,   // Word element
    attPhoneme   ,   // Phoneme element
    attSilence   ,   // Phoneme element '_' or '_:' seen, interpreted as silence
    attTextDone  ,   // All handled
    attMark      ,   // Text mark <mark value="the value"/>
    attUserMark      // Text mark <mark value="user[digits]=[value]"/>
  );

  // The set of tag types attached to one tag element.
  TAudioTagTypes = set of TAudioTagType;


  // One link of the tagged audio chain: a set of tag types plus the audio that belongs to it.
  //
  // Note that FId is the only id field; it is written for sentence, word and user mark types alike,
  // and FInfo is likewise shared by the phoneme and mark types. When several of those types are
  // combined on one tag, the value written last is the one that is kept.
  TTagElement = class
  private
    FTagTypes      : TAudioTagTypes;      // The types of this tag
    FId            : Integer;             // An ID, -1 means no Id.
    FSignal        : TSignal;             // A float value type for attUserMark
    FInfo          : string;              // An info, '' means no info
    FStartSample   : Integer;             // Element's first sample index in the chain
    FSampleCount   : Integer;             // Number of samples in element
    FSampleRate    : TSignal;             // Sample rate of the element
    FSampleRateRec : TSignal;             // 1 / FSampleRate
    FNext          : TTagElement;         // Pointer to next element
    FAudioData     : TSignalArray;        // Appended audio data
  private
    function    GetDuration : TSignal;
  public
    // Creates a tag with a single tag type and no audio; aSampleRate must be > 0 (asserted).
    constructor Create( aTagType: TAudioTagType; aSampleRate: TSignal; aStartSample: Integer; anId: Integer; const anInfo: string; aValue: TSignal);
    // Adds one more tag type to the set of types of this tag.
    procedure   AddTagType( aTagType: TAudioTagType);
    // Appends anAudioData to the audio already held by this tag.
    procedure   AddAudio( const anAudioData: TSignalArray);
    // Linear interpolation at the tag-local fractional sample position anIndex.
    function    InterpolateL( const anIndex: TSignal): TSignal;
    // All-pass interpolation at tag-local position anIndex; aLastOutput holds the previous output
    // on entry and receives the new output on exit.
    function    InterpolateA( const anIndex: TSignal; var aLastOutput: TSignal): TSignal;
  public
    property    TagTypes    : TAudioTagTypes read FTagTypes;
    property    StartSample : Integer        read FStartSample;
    property    SampleRate  : TSignal        read FSampleRate;
    property    Duration    : TSignal        read GetDuration;    // SampleCount * ( 1 / SampleRate)
    property    SampleCount : Integer        read FSampleCount;
  end;


  // Dynamic array of tag elements, used by TTaggedAudio to hold the tags it owns.
  TTagElementArray = array of TTagElement;


  // A circular chain of TTagElement objects, built from the events of a TESpeakSpeaker and played
  // back through InterpolateL / InterpolateA, which fire the On... callbacks whenever the tag being
  // read changes.
  TTaggedAudio = class
  private
    FTags             : TTagElementArray;      // The tags are owned by this object
    FTagHead          : TTagElement;           // First tag of the chain
    FTagTail          : TTagElement;           // Last tag of the chain, its next link points back to the head
    FSampleRate       : TSignal;               // Sample rate handed to newly created tags
    FSampleRateRec    : TSignal;               // 1 / FSampleRate
    FPrevInterpolated : TSignal;               // Previous output of InterpolateA
    FPrevCallback     : TTagElement;           // Tag for which callbacks were fired last
    FOnTextStart      : TOnTextStart;
    FOnSentence       : TOnSentence;
    FOnWord           : TOnWord;
    FOnPhoneme        : TOnPhoneme;
    FOnSilence        : TOnSilence;
    FOnMark           : TOnMark;
    FOnUserMark       : TOnUserMark;
    FOnTextDone       : TOnTextDone;
    FOnRenderingDone  : TOnRenderingDone;
  private
    // Event handlers that ConnectToSpeaker installs on a TESpeakSpeaker; all of them return 0.
    function    HandleESpeakerListTerminated( const aSender: TObject                                                   ): Integer;
    function    HandleESpeakerWord          ( const aSender: TObject; aNumber: LongInt                                 ): Integer;
    function    HandleESpeakerSentence      ( const aSender: TObject; aNumber: LongInt                                 ): Integer;
    function    HandleESpeakerMark          ( const aSender: TObject; const aName: string                              ): Integer;
    function    HandleESpeakerPlay          ( const aSender: TObject; const aName: string                              ): Integer;
    function    HandleESpeakerEnd           ( const aSender: TObject                                                   ): Integer;
    function    HandleESpeakerMsgTerminated ( const aSender: TObject                                                   ): Integer;
    function    HandleESpeakerPhoneme       ( const aSender: TObject; const aPhoneme: string                           ): Integer;
    function    HandleESpeakerSampleRate    ( const aSender: TObject; aRate: LongInt                                   ): Integer;
    function    HandleESpeakerWaveData      ( const aSender: TObject; const aData: PESpeakSamples; aNumSamples: LongInt): Integer;
    function    HandleESpeakerCompleted     ( const aSender: TObject                                                   ): Integer;
  private
    function    GetTagCount    : Integer;
    function    GetSampleCount : Integer;
    function    GetDuration    : TSignal;
    // Appends aTag to the chain and takes ownership of it.
    procedure   AddTag( const aTag: TTagElement);
    // Finds the tag holding aSampleIndex; aSampleIndex is changed into a tag-local index.
    function    FindTag( var aSampleIndex: TSignal): TTagElement;
    // Fires the callbacks of aTag when it differs from the tag handled last.
    procedure   HandleCallback( const aTag: TTagElement);
  private
    // Chain building helpers, used by the speaker event handlers.
    procedure   AddAudio( const anAudioData: TSignalArray);
    procedure   AddStartTag;
    procedure   AddSentenceTag( anId: Integer);
    procedure   AddWordTag    ( anId: Integer);
    procedure   AddPhonemeTag ( const aPhoneme: string);
    procedure   AddSilenceTag;
    procedure   AddMarkTag    ( const aMark: string);
    procedure   AddUserMarkTag( anId: Integer; aValue: TSignal);
    procedure   FinalizeCreation;
  public
    constructor Create( aSampleRate: TSignal);
    destructor  Destroy;                                                                                       override;
    // Frees all tags.
    procedure   Clear;
    function    InterpolateL( const anIndex: TSignal): TSignal;   // linear   interpolation
    function    InterpolateA( const anIndex: TSignal): TSignal;   // all-pass interpolation
    // Copies the audio of all tags, in order, into aWave.
    procedure   RenderToWave( var aWave: TSignalArray);
    // Installs / removes the HandleESpeaker... handlers on aSpeaker.
    procedure   ConnectToSpeaker     ( aSpeaker: TESpeakSpeaker; const aDoneHandler: TOnRenderingDone);
    procedure   DisconnectFromSpeaker( aSpeaker: TESpeakSpeaker);
  public
    property    TagCount        : Integer          read GetTagCount;
    property    SampleRate      : TSignal          read FSampleRate;
    property    SampleCount     : Integer          read GetSampleCount;
    property    Duration        : TSignal          read GetDuration;
    property    OnTextStart     : TOnTextStart     read FOnTextStart     write FOnTextStart;
    property    OnSentence      : TOnSentence      read FOnSentence      write FOnSentence;
    property    OnWord          : TOnWord          read FOnWord          write FOnWord;
    property    OnPhoneme       : TOnPhoneme       read FOnPhoneme       write FOnPhoneme;
    property    OnSilence       : TOnSilence       read FOnSilence       write FOnSilence;
    property    OnMark          : TOnMark          read FOnMark          write FOnMark;
    property    OnUserMark      : TOnUserMark      read FOnUserMark      write FOnUserMark;
    property    OnTextDone      : TOnTextDone      read FOnTextDone      write FOnTextDone;
  end;


implementation


{ ========
  TTagElement = class
  private
    FTagTypes      : TAudioTagTypes;      // The types of this tag
    FId            : Integer;             // An ID, -1 means no Id.
    FSignal        : TSignal;             // A float value type for attUserMark
    FInfo          : string;              // An info, '' means no info
    FStartSample   : Integer;             // Element's first sample index in the chain
    FSampleCount   : Integer;             // Number of samples in element
    FSampleRate    : TSignal;             // Sample rate of the element
    FSampleRateRec : TSignal;             // 1 / FSampleRate
    FNext          : TTagElement;         // Pointer to next element
    FAudioData     : TSignalArray;        // Appended audio data
  public
    property    TagTypes    : TAudioTagTypes read FTagTypes;
    property    StartSample : Integer        read FStartSample;
    property    SampleRate  : TSignal        read FSampleRate;
    property    Duration    : TSignal        read GetDuration;
    property    SampleCount : Integer        read FSampleCount;
  private
}

    function    TTagElement.GetDuration : TSignal;
    // Length of this tag's audio: the number of samples times the reciprocal of the sample rate.
    begin
      Result := FSampleCount * FSampleRateRec;
    end;


//  public

    constructor TTagElement.Create( aTagType: TAudioTagType; aSampleRate: TSignal; aStartSample: Integer; anId: Integer; const anInfo: string; aValue: TSignal);
    // Sets up a tag that carries one tag type and, as yet, no audio and no successor.
    //
    //   aTagType     : initial (single) tag type
    //   aSampleRate  : sample rate for the tag, must be > 0 (asserted)
    //   aStartSample : index of the tag's first sample in the chain
    //   anId         : id for the tag
    //   anInfo       : info text for the tag
    //   aValue       : signal value for the tag
    begin
      Assert( aSampleRate > 0, 'A valid sample rate value is needed for TTagElement.Create');

      // Tag identity
      FTagTypes := [ aTagType];
      FId       := anId;
      FInfo     := anInfo;
      FSignal   := aValue;

      // Timing
      FSampleRate    := aSampleRate;
      FSampleRateRec := 1.0 / FSampleRate;
      FStartSample   := aStartSample;

      // No audio and no link yet
      FAudioData   := nil;
      FSampleCount := 0;
      FNext        := nil;
    end;


    procedure   TTagElement.AddTagType( aTagType: TAudioTagType);
    // Adds aTagType to the set of types of this tag; adding a type that is present already changes nothing.
    begin
      Include( FTagTypes, aTagType);
    end;


    procedure   TTagElement.AddAudio( const anAudioData: TSignalArray);
    // Appends a copy of anAudioData to the audio held by this tag. Empty input is not expected (asserted).
    var
      anOldCount : Integer;
      anAddCount : Integer;
    begin
      anAddCount := Length( anAudioData);
      Assert( anAddCount > 0, 'Should not add NULL audio to tags');

      anOldCount := FSampleCount;
      SetLength( FAudioData, anOldCount + anAddCount);                                   // Make room at the end
      Move( anAudioData[ 0], FAudioData[ anOldCount], anAddCount * SizeOf( TSignal));    // and copy the new samples there
      FSampleCount := anOldCount + anAddCount;
    end;


    function    TTagElement.InterpolateL( const anIndex: TSignal): TSignal;
    // Linear interpolation between the sample at Trunc( anIndex) and the one following it. When the
    // following sample lies beyond this tag, the first sample of the next tag is used instead.
    // anIndex is a tag-local sample position.
    var
      aLeft       : Integer;
      aRight      : Integer;
      aFraction   : TSignal;
      aRightValue : TSignal;
    begin
      aLeft     := Trunc( anIndex);
      aRight    := aLeft + 1;
      aFraction := anIndex - aLeft;

      if   aRight < SampleCount
      then aRightValue := FAudioData[ aRight]
      else aRightValue := FNext.FAudioData[ 0];       // Crossing into the next tag

      Result := Normalize( FAudioData[ aLeft] + aFraction * ( aRightValue - FAudioData[ aLeft]));
    end;


    function    TTagElement.InterpolateA( const anIndex: TSignal; var aLastOutput: TSignal): TSignal;
    // All-pass interpolation at the tag-local sample position anIndex.
    //
    //   anIndex     : fractional sample position, relative to the first sample of this tag
    //   aLastOutput : on entry the previous output value, on exit the new output value
    //   Result      : the new output value
    //
    // Both samples that are needed are fetched relative to the start of this tag, following the
    // next-tag links where the position lies beyond this tag's audio. This keeps the two samples
    // adjacent at tag boundaries: previously, once the first sample had moved on to the next tag,
    // the second one could still be taken from this tag.
    var
      p  : Integer;
      d  : TSignal;
      a  : TSignal;
      Sp : TSignal;
      Sq : TSignal;

      function SampleAt( anOffset: Integer): TSignal;
      // Sample at anOffset samples past the first sample of this tag, walking the chain forward as
      // needed. Returns 0.0 for a negative offset, for a broken chain or when the chain holds no audio.
      var
        aTag       : TTagElement;
        aLapOffset : Integer;
      begin
        Result := 0.0;

        if   anOffset < 0
        then Exit;

        aTag       := Self;
        aLapOffset := anOffset;

        while Assigned( aTag) and ( anOffset >= aTag.FSampleCount)
        do begin
          anOffset := anOffset - aTag.FSampleCount;
          aTag     := aTag.FNext;

          if aTag = Self                    // Went around the circular chain once
          then begin
            if   anOffset = aLapOffset      // without passing any audio, so give up
            then Exit;

            aLapOffset := anOffset;
          end;
        end;

        if   Assigned( aTag)
        then Result := aTag.FAudioData[ anOffset];
      end;

    begin
      p := Trunc( anIndex);
      d := anIndex - p;

      if d < 0.5                 // best 0.5 <= d < 1.5
      then begin
        Inc( p);
        d := d + 1;
      end;

      a  := ( 1 - d) / ( 1 + d);
      Sp := SampleAt( p    );
      Sq := SampleAt( p + 1);

      aLastOutput := Normalize( Sp + a * ( Sq - aLastOutput));
      Result      := aLastOutput;
    end;


{ ========
  TTaggedAudio = class
  private
    FTags             : TTagElementArray;      // The tags are owned by this object
    FTagHead          : TTagElement;
    FTagTail          : TTagElement;
    FSampleRate       : TSignal;
    FSampleRateRec    : TSignal;
    FPrevInterpolated : TSignal;
    FPrevCallback     : TTagElement;
    FOnTextStart      : TOnTextStart;
    FOnSentence       : TOnSentence;
    FOnWord           : TOnWord;
    FOnPhoneme        : TOnPhoneme;
    FOnSilence        : TOnSilence;
    FOnMark           : TOnMark;
    FOnUserMark       : TOnUserMark;
    FOnTextDone       : TOnTextDone;
    FOnRenderingDone  : TOnRenderingDone;
  public
    property    TagCount    : Integer      read GetTagCount;
    property    SampleCount : Integer      read GetSampleCount;
    property    Duration    : TSignal      read GetDuration;
    property    OnTextStart : TOnTextStart read FOnTextStart write FOnTextStart;
    property    OnSentence  : TOnSentence  read FOnSentence  write FOnSentence;
    property    OnWord      : TOnWord      read FOnWord      write FOnWord;
    property    OnPhoneme   : TOnPhoneme   read FOnPhoneme   write FOnPhoneme;
    property    OnSilence   : TOnSilence   read FOnSilence   write FOnSilence;
    property    OnMark      : TOnMark      read FOnMark      write FOnMark;
    property    OnUserMark  : TOnUserMark  read FOnUserMark  write FOnUserMark;
    property    OnTextDone  : TOnTextDone  read FOnTextDone  write FOnTextDone;
  private
}

    function    TTaggedAudio.HandleESpeakerListTerminated( const aSender: TObject): Integer;
    // Handler for the speaker's OnListTerminated event. Nothing is done with the event, 0 is returned.
    begin
      Result := 0;
    end;


    function    TTaggedAudio.HandleESpeakerWord( const aSender: TObject; aNumber: LongInt): Integer;
    // Handler for the speaker's OnWord event: records a word tag with aNumber as its id. Returns 0.
    begin
      Result := 0;
      AddWordTag( aNumber);
    end;


    function    TTaggedAudio.HandleESpeakerSentence( const aSender: TObject; aNumber: LongInt): Integer;
    // Handler for the speaker's OnSentence event: records a sentence tag with aNumber as its id. Returns 0.
    begin
      Result := 0;
      AddSentenceTag( aNumber);
    end;


    function    TTaggedAudio.HandleESpeakerMark( const aSender: TObject; const aName: string): Integer;
    // Handler for the speaker's OnMark event. Always returns 0.
    //
    // A mark named 'user<digits>=<value>' ('user' in any letter case, a non-negative id and a value
    // that parses as a float) is recorded as a user mark with that id and value. Every other mark
    // is recorded as a plain mark carrying the full name.
    //
    // The value is parsed with '.' as the decimal separator first, so that 'user1=1.27' works
    // regardless of the regional settings in effect. When that fails the current regional settings
    // are tried as well, so values that parsed before still do.
    var
      IsUserTag : Boolean;
      aValues   : TStringList;
      anId      : Integer;
      aValue    : TSignal;
      aFormat   : TFormatSettings;
    begin
      Result    := 0;
      IsUserTag := False;
      anId      := -1;
      aValue    := NaN;

      try
        aFormat                  := TFormatSettings.Create;
        aFormat.DecimalSeparator := '.';
        aValues                  := Explode( aName, '=');

        try
          if   Assigned( aValues)
          and  ( aValues.Count = 2)                                   // Exactly one '=' wanted
          and  SameText( 'user', Copy( aValues[ 0], 1, 4))
          then begin
            anId := StrToIntDef( Copy( aValues[ 0], 5, Length( aValues[ 0])), -1);

            if anId >= 0
            then begin
              aValue := StrToFloatDef( aValues[ 1], NaN, aFormat);

              if   IsNan( aValue)
              then aValue := StrToFloatDef( aValues[ 1], NaN);        // Fall back to the regional settings

              IsUserTag := not IsNan( aValue);
            end;
          end;
        finally
          aValues.DisposeOf;
        end;
      except
        IsUserTag := False;                                           // Parsing is best effort, treat it as a plain mark
      end;

      if   IsUserTag
      then AddUserMarkTag( anId, aValue)
      else AddMarkTag( aName);
    end;


    function    TTaggedAudio.HandleESpeakerPlay( const aSender: TObject; const aName: string): Integer;
    // Handler for the speaker's OnPlay event. Nothing is done with the event, 0 is returned.
    begin
      Result := 0;
    end;


    function    TTaggedAudio.HandleESpeakerEnd( const aSender: TObject): Integer;
    // Handler for the speaker's OnEnd event. Nothing is done with the event, 0 is returned.
    begin
      Result := 0;
    end;


    function    TTaggedAudio.HandleESpeakerMsgTerminated( const aSender: TObject): Integer;
    // Handler for the speaker's OnMsgTerminated event. Nothing is done with the event, 0 is returned.
    begin
      Result := 0;
    end;


    function    TTaggedAudio.HandleESpeakerPhoneme( const aSender: TObject; const aPhoneme: string): Integer;
    // Handler for the speaker's OnPhoneme event. The pause phonemes '_' and '_:' are recorded as
    // silence tags, every other phoneme as a phoneme tag. Returns 0.
    var
      IsPause : Boolean;
    begin
      Result  := 0;
      IsPause := ( aPhoneme = '_') or ( aPhoneme = '_:');    // Shorter pause or short pause

      if   IsPause
      then AddSilenceTag
      else AddPhonemeTag( aPhoneme);
    end;


    function    TTaggedAudio.HandleESpeakerSampleRate( const aSender: TObject; aRate: LongInt): Integer;
    // Handler for the speaker's OnSampleRate event: takes over the reported sample rate and its
    // reciprocal. Returns 0.
    //
    // A rate that is not positive is ignored and the current rate is kept. It can not be used to
    // compute the reciprocal, and tag creation asserts on a sample rate > 0.
    begin
      Result := 0;

      if aRate > 0
      then begin
        FSampleRate    := aRate;
        FSampleRateRec := 1 / FSampleRate;
      end;
    end;


    function    TTaggedAudio.HandleESpeakerWaveData( const aSender: TObject; const aData: PESpeakSamples; aNumSamples: LongInt): Integer;
    // Handler for the speaker's OnWaveData event: scales the aNumSamples samples in aData by
    // 1 / 32768 and appends them to the audio of the last tag. Returns 0.
    //
    // Calls without data (aData = nil or aNumSamples <= 0) are ignored up front, as adding empty
    // audio to a tag is not allowed.
    var
      i              : Integer;
      LocalVoiceData : TSignalArray;
    begin
      Result := 0;

      if   ( aData = nil)
      or   ( aNumSamples <= 0)
      then Exit;

      try
        SetLength( LocalVoiceData, aNumSamples);

        for i := 0 to aNumSamples - 1
        do LocalVoiceData[ i] := aData[ i] / 32768.0;

        AddAudio( LocalVoiceData);
      except
        // Deliberately swallowed - presumably to keep exceptions from getting into the speaker's
        // callback mechanism; the audio of this call is lost in that case.
      end;
    end;


    function    TTaggedAudio.HandleESpeakerCompleted( const aSender: TObject): Integer;
    // Handler for the speaker's OnCompleted event: finishes the chain that was built. Returns 0.
    begin
      Result := 0;
      FinalizeCreation;
    end;


//  private

    function    TTaggedAudio.GetTagCount: Integer;
    // Number of tags owned by this object.
    begin
      Result := High( FTags) + 1;
    end;


    function    TTaggedAudio.GetSampleCount : Integer;
    // Total number of samples in the chain: the position just past the last sample of the tail tag,
    // or 0 when there is no tail tag.
    begin
      Result := 0;

      if   Assigned( FTagTail)
      then Result := FTagTail.StartSample + FTagTail.SampleCount;
    end;


    function    TTaggedAudio.GetDuration : TSignal;
    // Length of the whole chain: the total sample count times the reciprocal of the sample rate.
    begin
      Result := GetSampleCount * FSampleRateRec;
    end;


    procedure   TTaggedAudio.AddTag( const aTag: TTagElement);
    // Appends aTag to the end of the chain, keeps the chain circular (the new tail links back to
    // the head) and stores the tag in the list of owned tags. A nil tag is ignored.
    var
      anOldCount : Integer;
    begin
      if   not Assigned( aTag)
      then Exit;

      if   not Assigned( FTagHead)          // First tag ever, it is the head
      then FTagHead := aTag;

      if   not Assigned( FTagTail)          // and the tail as well
      then FTagTail := aTag;

      aTag.FNext     := FTagHead;           // New tail points back to the head
      FTagTail.FNext := aTag;               // Old tail points to the new tail
      FTagTail       := aTag;

      anOldCount := Length( FTags);
      SetLength( FTags, anOldCount + 1);
      FTags[ anOldCount] := aTag;
    end;


    function    TTaggedAudio.FindTag( var aSampleIndex: TSignal): TTagElement;
    // Finds the tag that holds the sample at chain position aSampleIndex.
    //
    //   aSampleIndex : on entry a fractional sample position in the chain, on exit that position
    //                  relative to the first sample of the tag that was found
    //   Result       : the tag found, or nil when there are no tags or when the chain has no audio
    //
    // The chain is circular, so positions outside [ 0, SampleCount) are wrapped around first. The
    // wrap is applied to aSampleIndex itself, as the tag-local index is derived from it.
    //
    // todo : maybe FinalizeCreation() could build a lookup system to speed up this method.
    var
      p      : Integer;
      anEnd  : Integer;
      aTotal : Integer;
    begin
      Result := nil;
      aTotal := SampleCount;

      if   Assigned( FTagTail)
      and  Assigned( FTagHead)
      and  ( aTotal > 0)                              // No audio, no tag to be found - and no wrapping possible
      then begin
        if   ( aSampleIndex <  0     )
        or   ( aSampleIndex >= aTotal)
        then aSampleIndex := aSampleIndex - aTotal * Floor( aSampleIndex / aTotal);

        if   ( aSampleIndex <  0     )                // Guard against rounding in the wrap above
        or   ( aSampleIndex >= aTotal)
        then aSampleIndex := 0;

        p      := Trunc( aSampleIndex);
        Result := FTagHead;
        anEnd  := Result.StartSample + Result.SampleCount;

        while ( anEnd <= p) and ( Result <> FTagTail) // Never walk past the tail, the chain is circular
        do begin
          aSampleIndex := aSampleIndex - Result.SampleCount;
          Result       := Result.FNext;
          anEnd        := Result.StartSample + Result.SampleCount;
        end;
      end;
    end;


    procedure   TTaggedAudio.HandleCallback( const aTag: TTagElement);
    // Fires the assigned callbacks for all tag types carried by aTag, but only when aTag is not the
    // tag that was handled by the previous call. aTag is remembered before any callback is called.
    begin
      if   aTag = FPrevCallback           // Still in the same tag, nothing to report
      then Exit;

      FPrevCallback := aTag;

      if   ( attTextStart in aTag.FTagTypes) and Assigned( FOnTextStart)
      then FOnTextStart( Self);

      if   ( attSentence in aTag.FTagTypes) and Assigned( FOnSentence)
      then FOnSentence( Self, aTag.FId);

      if   ( attWord in aTag.FTagTypes) and Assigned( FOnWord)
      then FOnWord( Self, aTag.FId);

      if   ( attPhoneme in aTag.FTagTypes) and Assigned( FOnPhoneme)
      then FOnPhoneme( Self, aTag.FInfo);

      if   ( attSilence in aTag.FTagTypes) and Assigned( FOnSilence)
      then FOnSilence( Self);

      if   ( attTextDone in aTag.FTagTypes) and Assigned( FOnTextDone)
      then FOnTextDone( Self);

      if   ( attMark in aTag.FTagTypes) and Assigned( FOnMark)
      then FOnMark( Self, aTag.FInfo);

      if   ( attUserMark in aTag.FTagTypes) and Assigned( FOnUserMark)
      then FOnUserMark( Self, aTag.FId, aTag.FSignal);
    end;


//  private

    procedure   TTaggedAudio.AddAudio( const anAudioData: TSignalArray);
    // Appends anAudioData to the last tag of the chain. When there are no tags yet a start tag is
    // made first to hold the audio.
    begin
      if   FTagTail = nil
      then AddStartTag;

      FTagTail.AddAudio( anAudioData);
    end;


    procedure   TTaggedAudio.AddStartTag;
    // Marks the start of the text: an empty chain gets a new start tag at sample 0, otherwise the
    // existing head tag gets the start type added.
    begin
      if   not Assigned( FTagHead)
      then AddTag( TTagElement.Create( attTextStart, FSampleRate, 0, 0, '', 0.0))
      else FTagHead.AddTagType( attTextStart);
    end;


    procedure   TTaggedAudio.AddSentenceTag( anId: Integer);
    // Records the start of a sentence with id anId at the current end of the audio.
    //
    // When the last tag has no audio yet the sentence type is combined into that tag; note that
    // anId then replaces the id held by that tag. Otherwise a new tag is appended.
    begin
      if   Assigned( FTagTail)
      and  ( FTagTail.SampleCount = 0)
      then begin
        FTagTail.FId := anId;
        FTagTail.AddTagType( attSentence);
        Exit;
      end;

      // SampleCount is 0 for an empty chain, so it is the proper start position in all cases
      AddTag( TTagElement.Create( attSentence, FSampleRate, SampleCount, anId, '', 0.0));
    end;


    procedure   TTaggedAudio.AddWordTag( anId: Integer);
    // Records the start of a word with id anId at the current end of the audio.
    //
    // When the last tag has no audio yet the word type is combined into that tag; note that anId
    // then replaces the id held by that tag. Otherwise a new tag is appended.
    begin
      if   Assigned( FTagTail)
      and  ( FTagTail.SampleCount = 0)
      then begin
        FTagTail.FId := anId;
        FTagTail.AddTagType( attWord);
        Exit;
      end;

      // SampleCount is 0 for an empty chain, so it is the proper start position in all cases
      AddTag( TTagElement.Create( attWord, FSampleRate, SampleCount, anId, '', 0.0));
    end;


    procedure   TTaggedAudio.AddPhonemeTag( const aPhoneme: string);
    // Records the start of phoneme aPhoneme at the current end of the audio.
    //
    // When the last tag has no audio yet the phoneme type is combined into that tag; note that
    // aPhoneme then replaces the info text held by that tag. Otherwise a new tag is appended.
    begin
      if   Assigned( FTagTail)
      and  ( FTagTail.SampleCount = 0)
      then begin
        FTagTail.FInfo := aPhoneme;
        FTagTail.AddTagType( attPhoneme);
        Exit;
      end;

      // SampleCount is 0 for an empty chain, so it is the proper start position in all cases
      AddTag( TTagElement.Create( attPhoneme, FSampleRate, SampleCount, -1, aPhoneme, 0.0));
    end;


    procedure   TTaggedAudio.AddSilenceTag;
    // Records the start of a silence at the current end of the audio.
    //
    // When the last tag has no audio yet the silence type is combined into that tag, otherwise a
    // new tag is appended.
    begin
      if   Assigned( FTagTail)
      and  ( FTagTail.SampleCount = 0)
      then begin
        FTagTail.AddTagType( attSilence);
        Exit;
      end;

      // SampleCount is 0 for an empty chain, so it is the proper start position in all cases
      AddTag( TTagElement.Create( attSilence, FSampleRate, SampleCount, -1, '', 0.0));
    end;


    procedure   TTaggedAudio.AddMarkTag( const aMark: string);
    // Records text mark aMark at the current end of the audio.
    //
    // When the last tag has no audio yet the mark type is combined into that tag; note that aMark
    // then replaces the info text held by that tag. Otherwise a new tag is appended.
    begin
      if   Assigned( FTagTail)
      and  ( FTagTail.SampleCount = 0)
      then begin
        FTagTail.FInfo := aMark;
        FTagTail.AddTagType( attMark);
        Exit;
      end;

      // SampleCount is 0 for an empty chain, so it is the proper start position in all cases
      AddTag( TTagElement.Create( attMark, FSampleRate, SampleCount, -1, aMark, 0.0));
    end;


    procedure   TTaggedAudio.AddUserMarkTag( anId: Integer; aValue: TSignal);
    // Records a user mark with id anId and value aValue at the current end of the audio.
    //
    // When the last tag has no audio yet the user mark type is combined into that tag; note that
    // anId and aValue then replace the id and signal value held by that tag. Otherwise a new tag
    // is appended.
    begin
      if   Assigned( FTagTail)
      and  ( FTagTail.SampleCount = 0)
      then begin
        FTagTail.FSignal := aValue;
        FTagTail.FId     := anId;
        FTagTail.AddTagType( attUserMark);
        Exit;
      end;

      // SampleCount is 0 for an empty chain, so it is the proper start position in all cases
      AddTag( TTagElement.Create( attUserMark, FSampleRate, SampleCount, anId, '', aValue));
    end;


    procedure   TTaggedAudio.FinalizeCreation;
    // Finishes the chain once rendering is done:
    //
    //   - when the chain has audio, the tail gets the text done type, and when the tail itself has
    //     no audio it gets one sample of silence, to keep the invariant that all tags have audio;
    //   - in DEBUG builds the consistency of the tag list is asserted;
    //   - the rendering done handler, when present, is called.
    //
    // The DEBUG check is a post-condition and runs after the tail was completed. Run before it, it
    // would reject a chain that ends in a tag without audio - which is a state the fix up right
    // above it exists for.
    //
    // Could build indices or something here to speed up the current FindTag() method.
    {$IFDEF DEBUG}
    var
      i : Integer;
    {$ENDIF}
    var
      aDummyAudio : TSignalArray;
    begin
      if SampleCount > 0                       // Means we should have a valid tail
      then begin
        FTagTail.AddTagType( attTextDone);

        if FTagTail.SampleCount = 0            // Keep the invariant that all tags  must have audio
        then begin
          SetLength( aDummyAudio, 1);
          aDummyAudio[ 0] := 0.0;
          FTagTail.AddAudio( aDummyAudio);
        end;
      end;

    {$IFDEF DEBUG}                             // @ debug time, check post-condition on tag list
      if   TagCount > 0                        // Tag count of 0 is okay and consistent and all fine
      then begin
        Assert( FTagHead = FTags[ 0           ], 'Tag head should point to first element');
        Assert( FTagTail = FTags[ TagCount - 1], 'Tag tail should point to last  element');

        for i := 0 to TagCount - 1
        do begin
          Assert( FTags[ i].SampleCount > 0, 'Every tag should have audio');

          if   i = TagCount - 1
          then Assert( FTags[ i].FNext = FTags[ 0]    , 'Last element shpuld point to first element')
          else Assert( FTags[ i].FNext = FTags[ i + 1], 'All other elements should point to the next element');
        end;
      end;
    {$ENDIF}

      if   Assigned( FOnRenderingDone)
      then FOnRenderingDone( Self);
    end;


//  public

    constructor TTaggedAudio.Create( aSampleRate: TSignal);
    // Creates an empty chain for audio at sample rate aSampleRate, which must be > 0 (asserted).
    begin
      Assert( aSampleRate > 0, 'Must have a proper sample rate for TTaggedAudio.Create');

      // Timing
      FSampleRate    := aSampleRate;
      FSampleRateRec := 1.0 / FSampleRate;

      // No tags yet
      FTags    := nil;
      FTagHead := nil;
      FTagTail := nil;
    end;


    destructor  TTaggedAudio.Destroy; // override;
    // Frees the owned tags before the object itself goes.
    begin
      Clear;
      inherited Destroy;
    end;


    procedure   TTaggedAudio.Clear;
    // Frees all tags and brings the object back to the empty state.
    //
    // The head, tail and last-callback references point into the tags being freed, so they are
    // reset as well. Otherwise SampleCount, FindTag and the Add... methods would work on freed
    // objects after a Clear. The all-pass interpolation state is reset too, as it belongs to the
    // audio that is gone.
    var
      i : Integer;
    begin
      for i := 0 to TagCount - 1
      do FreeAndNil( FTags[ i]);

      SetLength( FTags, 0);

      FTagHead          := nil;
      FTagTail          := nil;
      FPrevCallback     := nil;
      FPrevInterpolated := 0.0;
    end;


    function    TTaggedAudio.InterpolateL( const anIndex: TSignal): TSignal;
    // Linearly interpolated sample value at chain position anIndex. Fires the callbacks of the tag
    // being read when that tag differs from the one read before. Returns 0.0 when no tag is found.
    var
      aTag        : TTagElement;
      aLocalIndex : TSignal;
    begin
      Result      := 0.0;
      aLocalIndex := anIndex;
      aTag        := FindTag( aLocalIndex);     // Turns aLocalIndex into a tag-local index

      if   not Assigned( aTag)
      then Exit;

      HandleCallback( aTag);
      Result := aTag.InterpolateL( aLocalIndex);
    end;


    function    TTaggedAudio.InterpolateA( const anIndex: TSignal): TSignal;
    // All-pass interpolated sample value at chain position anIndex; the previous output is kept in
    // this object between calls. Fires the callbacks of the tag being read when that tag differs
    // from the one read before. Returns 0.0 when no tag is found.
    var
      aTag        : TTagElement;
      aLocalIndex : TSignal;
    begin
      Result      := 0.0;
      aLocalIndex := anIndex;
      aTag        := FindTag( aLocalIndex);     // Turns aLocalIndex into a tag-local index

      if   not Assigned( aTag)
      then Exit;

      HandleCallback( aTag);
      Result := aTag.InterpolateA( aLocalIndex, FPrevInterpolated);
    end;


    procedure   TTaggedAudio.RenderToWave( var aWave: TSignalArray);
    // Copies the audio of all tags, in chain order, into aWave; aWave is resized to SampleCount.
    //
    // Tags without audio are skipped, as there is no first sample to take the address of, and the
    // amount copied per tag is limited to the room left in aWave so the copy can never run past
    // its end.
    var
      p      : Integer;
      i      : Integer;
      aCount : Integer;
    begin
      SetLength( aWave, SampleCount);
      p := 0;

      for i := 0 to TagCount - 1
      do begin
        aCount := FTags[ i].SampleCount;

        if   aCount > Length( aWave) - p
        then aCount := Length( aWave) - p;

        if aCount > 0
        then begin
          Move( FTags[ i].FAudioData[ 0], aWave[ p], aCount * SizeOf( TSignal));
          p := p + aCount;
        end;
      end;
    end;


    procedure   TTaggedAudio.ConnectToSpeaker( aSpeaker: TESpeakSpeaker; const aDoneHandler: TOnRenderingDone);
    // Installs the handlers of this object on all events of aSpeaker and remembers aDoneHandler as
    // the rendering done handler. Does nothing at all when aSpeaker is nil.
    begin
      if   not Assigned( aSpeaker)
      then Exit;

      FOnRenderingDone := aDoneHandler;

      aSpeaker.OnListTerminated := HandleESpeakerListTerminated;
      aSpeaker.OnWord := HandleESpeakerWord;
      aSpeaker.OnSentence := HandleESpeakerSentence;
      aSpeaker.OnMark := HandleESpeakerMark;
      aSpeaker.OnPlay := HandleESpeakerPlay;
      aSpeaker.OnEnd := HandleESpeakerEnd;
      aSpeaker.OnMsgTerminated := HandleESpeakerMsgTerminated;
      aSpeaker.OnPhoneme := HandleESpeakerPhoneme;
      aSpeaker.OnSampleRate := HandleESpeakerSampleRate;
      aSpeaker.OnWaveData := HandleESpeakerWaveData;
      aSpeaker.OnCompleted := HandleESpeakerCompleted;
    end;


    procedure   TTaggedAudio.DisconnectFromSpeaker( aSpeaker: TESpeakSpeaker);
    // Clears all event handlers of aSpeaker and forgets the rendering done handler. Does nothing at
    // all when aSpeaker is nil.
    begin
      if   not Assigned( aSpeaker)
      then Exit;

      FOnRenderingDone := nil;

      aSpeaker.OnListTerminated := nil;
      aSpeaker.OnWord := nil;
      aSpeaker.OnSentence := nil;
      aSpeaker.OnMark := nil;
      aSpeaker.OnPlay := nil;
      aSpeaker.OnEnd := nil;
      aSpeaker.OnMsgTerminated := nil;
      aSpeaker.OnPhoneme := nil;
      aSpeaker.OnSampleRate := nil;
      aSpeaker.OnWaveData := nil;
      aSpeaker.OnCompleted := nil;
    end;


end.
