unit tss;

{
  A Delphi implementation of the 'Tiny Speech Synth'

  Original copyright message as found at http://www.pouet.nt/prod.php?which=50530 is :

    +-------------------------------------------------------------------------------------------------------------+
    | Type       : C++ subroutine                                                                                 |
    | Created by : Stepanov Andrey, 2008 ( ICQ: 129179794, e-mail: andrewstepanov@mail.ru )                       |
    |                                                                                                             |
    |                                                                                                             |
    | Description                                                                                                 |
    | ===========                                                                                                 |
    | This is a simple speech synth subroutine, based on formant synthesis theory.                                |
    | Speech is synthesized by passing source excitation signal through set of formant one-pole filters.          |
    | Excitation signal is a sawtooth or noise (depending on sound type), although you can try other signals.     |
    |                                                                                                             |
    |                                                                                                             |
    | Some explanation on design                                                                                  |
    | ==========================                                                                                  |
    | It was surprisingly hard to find any information on formant synthesis. Of course, there is a lot of         |
    | information on the Internet, but the biggest part of it is just a general words of theory,                  |
    | nothing practically usable. The biggest problem was to find concrete formant frequencies.                   |
    | After reading a lot of articles I found that I knew nothing new, compared to my knowledge before reading... |
    | So I did frequency analysis by myself. I recorded my own voice and analyzed it's FFT. Analysis is VERY      |
    | approximate and incomplete, so it causes bad speech quality.                                                |
    |                                                                                                             |
    |                                                                                                             |
    | Terms of use                                                                                                |
    | ============                                                                                                |
    | This program is free for use, and you can use it for any purposes you want, as long as you specify          |
    | my name in you program source code and description.                                                         |
    +-------------------------------------------------------------------------------------------------------------+

  2016-04-22: Modified for Delphi by Blue Hell

  This is more or less a direct translation of the original code, only thin I've changed is the oscillator
  frequency, it is 110 Hz instead of 120 Hz here.

  The terms of use remain the same as for the original code.
}

interface

uses

  Math, KnobsUtils;


  procedure TssSimpleSpeak ( aSampleRate : TSignal; aData: string; var aWave: TSignalArray);


implementation


const

  PI_2 = 2 * Pi;


type

  T1bit  = 0 ..  1;
  T2bits = 0 ..  3;
  T4bits = 0 .. 15;

  TPhoneme = record
    p       : Char;
    f       : array[ 0 .. 2] of Byte;   // 50 Hz  units
    w       : array[ 0 .. 2] of Byte;   // 10 Hz  units
    len     : T2bits;                   // 1/15 s units
    amp     : T4bits;                   // 16 levels [0,1]
    osc     : T1bit;                    // Whether the saw or noise is uswd
    plosive : T1bit;                    // Whether this starts with a plosive
  end;

  TShape = TPhoneme;

var

  SampleRate    : TSignal;
  SampleRateRec : TSignal;
  Phonemes      : array[ 0 .. 26] of TPhoneme =
  (
    ( p: ' '; f: (  1,  0,  0); w: (  10,   0,  0); len: 1; amp:  0; osc: 0; plosive: 0 ), // Silence for 1/15 s

    ( p: 'o'; f: ( 12, 15,  0); w: (  10,  10,  0); len: 3; amp:  6; osc: 0; plosive: 0 ),
    ( p: 'i'; f: (  5, 56,  0); w: (  10,  10,  0); len: 3; amp:  3; osc: 0; plosive: 0 ),
    ( p: 'j'; f: (  5, 56,  0); w: (  10,  10,  0); len: 1; amp:  3; osc: 0; plosive: 0 ),
    ( p: 'u'; f: (  5, 14,  0); w: (  10,  10,  0); len: 3; amp:  3; osc: 0; plosive: 0 ),
    ( p: 'a'; f: ( 18, 30,  0); w: (  10,  10,  0); len: 3; amp: 15; osc: 0; plosive: 0 ),
    ( p: 'e'; f: ( 14, 50,  0); w: (  10,  10,  0); len: 3; amp: 15; osc: 0; plosive: 0 ),
    ( p: 'E'; f: ( 20, 40,  0); w: (  10,  10,  0); len: 3; amp: 12; osc: 0; plosive: 0 ),
    ( p: 'w'; f: (  3, 14,  0); w: (  10,  10,  0); len: 3; amp:  1; osc: 0; plosive: 0 ),
    ( p: 'v'; f: (  2, 20,  0); w: (  20,  10,  0); len: 3; amp:  3; osc: 0; plosive: 0 ),

    ( p: 'T'; f: (  2, 20,  0); w: (  40,   1,  0); len: 3; amp:  5; osc: 0; plosive: 0 ),
    ( p: 'z'; f: (  5, 28, 80); w: (  10,   5, 10); len: 3; amp:  3; osc: 0; plosive: 0 ),
    ( p: 'Z'; f: (  4, 30, 60); w: (  50,   1,  5); len: 3; amp:  5; osc: 0; plosive: 0 ),
    ( p: 'b'; f: (  4,  0,  0); w: (  10,   0,  0); len: 1; amp:  2; osc: 0; plosive: 0 ),
    ( p: 'd'; f: (  4, 40, 80); w: (  10,  10, 10); len: 1; amp:  2; osc: 0; plosive: 0 ),
    ( p: 'm'; f: (  4, 20,  0); w: (  10,  10,  0); len: 3; amp:  2; osc: 0; plosive: 0 ),
    ( p: 'n'; f: (  4, 40,  0); w: (  10,  10,  0); len: 3; amp:  2; osc: 0; plosive: 0 ),
    ( p: 'r'; f: (  3, 10, 20); w: (  30,   8,  1); len: 3; amp:  3; osc: 0; plosive: 0 ),
    ( p: 'l'; f: (  8, 20,  0); w: (  10,  10,  0); len: 3; amp:  5; osc: 0; plosive: 0 ),
    ( p: 'g'; f: (  2, 10, 26); w: (  15,   5,  2); len: 2; amp:  1; osc: 0; plosive: 0 ),

    ( p: 'f'; f: (  8, 20, 34); w: (  10,  10, 10); len: 3; amp:  4; osc: 1; plosive: 0 ),
    ( p: 'h'; f: ( 22, 26, 32); w: (  30,  10, 30); len: 1; amp: 10; osc: 1; plosive: 0 ),
    ( p: 's'; f: ( 80, 110, 0); w: (  80,  40,  0); len: 3; amp:  5; osc: 1; plosive: 0 ),
    ( p: 'S'; f: ( 20, 30,  0); w: ( 100, 100,  0); len: 3; amp: 10; osc: 1; plosive: 0 ),

    ( p: 'p'; f: (  4, 10, 20); w: (   5,  10, 10); len: 1; amp:  2; osc: 1; plosive: 1 ),
    ( p: 't'; f: (  4, 20, 40); w: (  10,  20,  5); len: 1; amp:  3; osc: 1; plosive: 1 ),
    ( p: 'k'; f: ( 20, 80,  0); w: (  10,  10,  0); len: 1; amp:  3; osc: 1; plosive: 1 )
  );


  function SearchPhoneme( aPhonemeChar: Char; var aPhonemeData: TPhoneme): Boolean;
  var
    i : Integer;
  begin
    FillChar( aPhonemeData, SizeOf( aPhonemeData), 0);
    Result := False;

    for i := Low( Phonemes) to High( Phonemes)
    do begin
      if aPhonemeChar = Phonemes[ i].p
      then begin
        aPhonemeData := Phonemes[ i];
        Result       := True;
        Break;
      end;
    end;
  end;


  function Sawtooth( x: TSignal ): TSignal;
  begin
    Result := 0.5 - (( x - Floor ( x / PI_2 ) * PI_2 ) / PI_2);
  end;


  function CutLevel( x, lvl: TSignal): TSignal;
  begin
    if x > lvl
    then Result := lvl
    else if x < - lvl
    then Result := - lvl
    else Result := x;
  end;


  procedure AppendData( var anArray: TSignalArray; var aPos: Integer; aData: TSignal);
  var
    p : Integer;
  begin
    if Length( anArray) <= aPos
    then begin
      p := Length( anArray);
      SetLength( anArray, aPos + 1);

      while p < Length( anArray) - 2
      do begin
        anArray[ p] := 0;
        Inc( p);
      end;

      anArray[ Length( anArray) - 1] := aData;
    end
    else anArray[ aPos] := anArray[ aPos] + aData;

    Inc( aPos);
  end;


  procedure TssSimpleSpeak ( aSampleRate : TSignal; aData: string; var aWave: TSignalArray);
  var
    i       : Integer;
    p       : TPhoneme;
    v       : T4Bits;
    sl      : Integer;
    f       : Integer;
    ff      : Integer;
    Freq    : TSignal;
    Buf1Res : TSignal;
    Buf2Res : TSignal;
    q       : TSignal;
    x       : TSignal;
    xp      : TSignal;
    s       : Integer;
    b       : Integer;
    buf     : Integer;
  begin
    SampleRate    := aSampleRate;
    SampleRateRec := 1 / aSampleRate;
    SetLength( aWave, 0);
    v   := 0;
    buf := 0;

    for i := Low( aData) to High( aData)
    do begin
      if SearchPhoneme( aData[ i], p)
      then v := p.amp;

      sl := p.len * ( Round( SampleRate) div 15);

      for f := 0 to 3
      do begin
        ff := p.f[ f];

        Freq := ff * 50.0 * SampleRateRec;

        if ff <> 0
        then begin
          Buf1Res := 0;
          Buf2Res := 0;
          q       := 1.0 - p.w[ f] * 10.0 * Pi * SampleRateRec;
          b       := buf;
          xp      := 0;

          for s := 0 to sl - 1
          do begin
            if p.osc = 1
            then x := Random - 0.5
            else begin
              x  := Sawtooth( s * ( 110.0 * PI_2 * SampleRateRec));
              xp := 0;
            end;

            // Apply formant filter
            x       := x + 2.0 * Cos( PI_2 * Freq) * Buf1Res * q - Buf2Res * q * q;
            Buf2Res := Buf1Res;
            Buf1Res := x;
            x       := 0.75 * xp + x * v;
            xp      := x;

            // Anticlick function
            x := x * CutLevel( sin( Pi * s / sl ) * 5.0, 1.0);
            AppendData( aWave, b, x / ( 256 * 256));
          end;
        end;
      end;

      // Overlap neighbour phonemes
      buf := buf + (( 3 * sl) div 4);

      if p.plosive <> 0
      then buf := buf + ( sl and $ffffff);
    end;
  end;


end.

