function speech=tts_concantenate_milos(speech_corpus,unit_sequence)

% speech=tts_concantenate_milos(speech_corpus,unit_sequence)
%
% returns speech samples corresponding to the concatenation of diphones 
% obtained from the list of phonemes in unit_sequence, referring to 
% entries in speech_corpus. Diphone boundaries are moved to the nearest
% pitch marks, and the junction between two successive diphones is
% smoothed by overlap-adding one Hann-windowed pitch period taken after
% the end of the previous diphone with one taken at the start of the
% current diphone.
%
% Unit_sequence is a simple vector of indices (rows of speech_corpus).
% An empty unit_sequence returns an empty result.
% Speech_corpus is a cell array of phoneme data.
% Each row contains :
%     1 : a string of characters (features): 
%           1: the name of the current phoneme
%           2: the name of the left phoneme
%           3: the name of the right phoneme, 
%           4: the part-of-speech (pos) of the current word (using one character per pos)
%           5: the index of the current prosodic phrase (within the current sentence, from 1 to max 9)
%           6: the number of prosodic phrases on the right (until the end of the sentence, from 1 to max 9)
%           7: the index of the current word (within the current prosodic phrase, from 1 to max 9)
%           8: the number of words on the right (until the end of the current prosodic phrase, from 1 to max 9)
%     2: the index of the sentence containing the phoneme (related wav file names are given by this index)
%     3: the start sample for the current phoneme in the related wav file
%     4: the end sample for the current phoneme in the related wav file
%
% Files read (relative to the current directory), for sentence index K :
%     ./wav/K.wav : the speech signal
%     ./pm/K.pm   : pitch marks, read with the format '%f %*d', i.e. the
%                   first column is the pitch mark time in seconds and the
%                   second (integer) column is ignored.
%  
% The phoneme-diphone correspondence is as follows : each phoneme in the
% list induces the synthesizer to extract a diphone from the middle of the
% phoneme to the middle of the next phoneme in speech_corpus. When there is
% no next phoneme in the same sentence (last row of the corpus, or next row
% belonging to another wav file), the diphone stops at the end of the
% current phoneme.
%
% Example : 
%    genglish_load_corpus;
%    speech_corpus=corpus_to_speech_corpus(genglish_corpus);
%    out=tts_concantenate_milos(speech_corpus, 1:25);
%    sound(out,16000);
% outputs the first sentence of the Genglish corpus.
%
% Project: TTSBOX, a corpus-based speech synthesizer for Genglish
%
% Copyright (c) 2004 Faculte Polytechnique de Mons-Thierry Dutoit 
%
% This program is free software; you can redistribute it and/or modify
% it under the terms of the GNU General Public License as published by
% the Free Software Foundation

n_units=length(unit_sequence);
n_rows=size(speech_corpus,1);

% Segments are collected here and joined once at the end, instead of
% growing the output vector at every iteration.
pieces=cell(2*n_units,1);
n_pieces=0;

loaded_sentence=[];     % sentence whose wav / pitch marks are currently in memory
artf_pitch_start=[];    % pitch period following the end of the previous diphone
y=[];
pm_samples=[];
n_pm=0;

for i=1:n_units
    row=unit_sequence(i);
    sentence=speech_corpus{row,2};

    % (re)load the signal and its pitch marks only when the sentence changes
    if ~isequal(sentence,loaded_sentence)
        [y,Fs]=read_wav_file(strcat('./wav/',num2str(sentence),'.wav'));
        y=y(:,1);   % use the first channel only, so that segments are column vectors
        f_pm=strcat('./pm/',num2str(sentence),'.pm');
        pm=textread(f_pm,'%f %*d'); % pitch marks in sec
        if isempty(pm)
            error('tts_concantenate_milos: no pitch marks found in %s',f_pm);
        end;
        % pitch marks in samples; round is used everywhere (never fix) so
        % that consecutive segments are exactly contiguous, and positions
        % are kept inside the signal.
        pm_samples=min(max(round(pm*Fs),0),size(y,1));
        n_pm=length(pm_samples);
        loaded_sentence=sentence;
    end;

    middle_of_current_phoneme=round((speech_corpus{row,4}+speech_corpus{row,3})/2);
    next_row=row+1;
    if (next_row<=n_rows && isequal(speech_corpus{next_row,2},sentence))
        middle_of_next_phoneme=round((speech_corpus{next_row,4}+speech_corpus{next_row,3})/2);
    else
        % no next phoneme in this wav file : stop at the end of the current one
        middle_of_next_phoneme=speech_corpus{row,4};
    end;

    % find the nearest pitch marks and move the diphone's boundaries to them
    [unused,index_pm_start]=min(abs(pm_samples-middle_of_current_phoneme));
    [unused,index_pm_stop]=min(abs(pm_samples-middle_of_next_phoneme));
    y_start=pm_samples(index_pm_start);
    y_stop=pm_samples(index_pm_stop);

    % end of the pitch period following each boundary; the index is clamped
    % so that a boundary on the last pitch mark yields an empty period
    % instead of an out-of-range access.
    period_end_start=pm_samples(min(index_pm_start+1,n_pm));
    period_end_stop=pm_samples(min(index_pm_stop+1,n_pm));

    if (i > 1)  % nothing to smooth before the first diphone
        artf_pitch_stop=y(y_start+1:period_end_start);
        length_start=length(artf_pitch_start);
        length_stop=length(artf_pitch_stop);

        % apply the Hann window : falling half on the previous period,
        % rising half on the current one (empty periods are left untouched)
        if (length_start > 0)
            w1=hann(length_start*2);
            artf_pitch_start=artf_pitch_start .* w1(length_start+1:length(w1));
        end;
        if (length_stop > 0)
            w2=hann(length_stop*2);
            artf_pitch_stop=artf_pitch_stop .* w2(1:length_stop);
        end;

        % ! important: take max of two periods as a length for the new one
        new_pitch=max(length_start,length_stop);
        if (new_pitch > length_start)    % add zeros at the end if necessary
            artf_pitch_start=vertcat(artf_pitch_start,zeros(new_pitch-length_start,1));
        end;
        if (new_pitch > length_stop)     % add zeros at the beginning if necessary
            artf_pitch_stop=vertcat(zeros(new_pitch-length_stop,1),artf_pitch_stop);
        end;

        % overlap and add; the smoothed period goes before the segment
        n_pieces=n_pieces+1;
        pieces{n_pieces}=artf_pitch_start+artf_pitch_stop;
    end;

    % body of the diphone : from the second pitch period up to its end
    n_pieces=n_pieces+1;
    pieces{n_pieces}=y(period_end_start+1:y_stop);

    % keep the period following this diphone for the next junction
    artf_pitch_start=y(y_stop+1:period_end_stop);

%     Legacy (disabled) concatenation code, kept for reference :
%     % very rudimentary concatenation point optimization in case of non-consecutive units
%     if (i>1 && unit_sequence(i)~=unit_sequence(i-1)+1 && middle_of_current_phoneme-100>0)
%         y_subwave=y(middle_of_current_phoneme-100:middle_of_current_phoneme+100);
%         [tmp,optimal_pos]=max(xcorr(y_subwave,speech(length(speech)-99:length(speech))));
%         middle_of_current_phoneme=middle_of_current_phoneme+optimal_pos-100;  
%     end;
%     % very rudimentary smoothing in case of non-consecutive units : fade-in/out
%     if (i>1 && unit_sequence(i)~=unit_sequence(i-1)+1 && middle_of_current_phoneme-100>0) 
%         y_subwave=y(middle_of_current_phoneme-100:middle_of_current_phoneme-1);
%         fadein_factor=(0.99:-0.01:0.00)';
%         fadeout_factor=(0.00:0.01:0.99)';
%         speech(length(speech)-99:length(speech))=speech(length(speech)-99:length(speech)).*fadeout_factor+y_subwave.*fadein_factor;
%     end;
%     
%     % concatenation itself
%     y_subwave=y(middle_of_current_phoneme:middle_of_next_phoneme);
%     speech=[speech;y_subwave];
end;

speech=[];
if (n_pieces > 0)
    speech=vertcat(pieces{1:n_pieces});
end;


function [y,Fs]=read_wav_file(file)
% Reads a wav file, using audioread when it is available and falling back
% to wavread on older releases where audioread does not exist.
if (exist('audioread','file')~=0 || exist('audioread','builtin')~=0)
    [y,Fs]=audioread(file);
else
    [y,Fs]=wavread(file);
end;