/* txdreams - pitch epoch locator based on SEDREAMS */

/*
% For more information, please read:
%
% T. Drugman, M. Thomas, J. Gudnason, P. Naylor, T. Dutoit,
% "Detection of Glottal Closure Instants from Speech Signals: a Quantitative Review",
% IEEE Transactions on Audio, Speech and Language Processing, Accepted for publication.
%
% and
%
% T.Drugman, T.Dutoit, "Glottal Closure and Opening Instant Detection from Speech Signals",
% Interspeech09, Brighton, U.K, 2009
%
% Please refer to these works in your publication if you use this code.
%
% Code written by Thomas Drugman in TCTS Lab, University of Mons, Belgium.
%
% Copyright (C) 2000-2011 Thomas Drugman - TCTS Lab
*/

/* Mark Huckvale - University College London */

/* version 1.0 - June 2013 */

/* program name and version: PROGNAME appears in the -I banner, the usage
   message and the history strings of the items written; PROGVERS appears
   in the -I banner */
#define PROGNAME "txdreams"
#define PROGVERS "1.0"
/* global copy of the program name; it is not referenced again in this file,
   presumably it is read by the SFS library -- TODO confirm */
char *progname=PROGNAME;

/* EBUG is left undefined so that the "#ifdef EBUG" sections below (which
   save the intermediate signals as extra items) are compiled out */
#undef EBUG

/*-------------------------------------------------------------------------*/
/**MAN
.TH TXDREAMS SFS1 UCL
.SH NAME
txdreams -- pitch epoch detector based on the SEDREAMS algorithm
.SH SYNOPSIS
.B txdreams
(-I) (-i item) (-f fxmean) file
.SH DESCRIPTION
.I txdreams
is a program to find the locations of larynx excitation points
in the voiced regions of a speech signal.  The output is saved as
a TX item.
.SS REFERENCES
The algorithm is described in
.PP
T.Drugman, T.Dutoit, "Glottal Closure and Opening Instant Detection from Speech Signals",
Interspeech09, Brighton, U.K, 2009
.PP
T. Drugman, M. Thomas, J. Gudnason, P. Naylor, T. Dutoit,
"Detection of Glottal Closure Instants from Speech Signals: a Quantitative Review",
IEEE Transactions on Audio, Speech and Language Processing, Accepted for publication.
.PP
SEDREAMS_GCIDetection.m in the GLOAT MATLAB toolbox.
.SH OPTIONS
.TP 11
.B -I
Identify program name and version number.
.TP 11
.BI -i item
Select input item number.
.TP 11
.BI -f fxmean
Specify the mean Fx in Hertz. Default: estimated from the signal (125Hz if no estimate can be made).
.SH INPUT ITEMS
.IP SPEECH
Any speech signal
.SH OUTPUT ITEMS
.IP TX
Larynx excitation points.
.SH HISTORY
.SH VERSION/AUTHOR
.IP SFS
Mark Huckvale
.SH SEE ALSO
HQtx(SFS1), vtx(SFS1), pp(SFS1), txanal(SFS1)
*/
/*--------------------------------------------------------------------------*/

#include "SFSCONFG.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <math.h>
#include "sfs.h"
#include "filter.h"
#include "elliptic.h"
#include "complex.h"
#include "lsys.h"
#include "lsp.h"
#define MAXCOEFF	50	/* max # LPC coefficients */

#ifndef M_PI
#define M_PI		3.14159265358979323846
#endif

/* item data */
/* All of the sample buffers below are allocated in main() with one entry
   per input sample (spitem.numframes), except sp which is filled in by
   getitem(). */
char		filename[SFSMAXFILENAME]; /* SFS data file name */
struct item_header	spitem;		/* input speech item */
short			*sp;		/* input speech samples as loaded by getitem() */
float			*fsp;		/* working speech: pre-emphasised for the LPC stage, */
					/* then overwritten with the high-pass filtered speech */
struct item_header	rsitem;		/* residual (header only used when EBUG is defined) */
float			*rsp;		/* overlap-added LPC residual, scaled by its peak magnitude */
struct item_header	msitem;		/* mean signal (header only used when EBUG is defined) */
float			*msp;		/* windowed-mean of fsp after high-pass filtering */
struct item_header	pvitem;		/* probability of voicing (header only used when EBUG is defined) */
float			*pv;		/* voicing score from voiceprob(), one value per sample */
struct item_header	txitem;		/* output TX item header */

/* one candidate excitation point per maximum of the mean signal that lies
   in a voiced region (pv > 0.5) */
struct gci_rec {
	int	minpos;		/* sample index of the last mean-signal minimum seen before maxpos */
	int	maxpos;		/* sample index of the mean-signal maximum */
	int	gcipos;		/* chosen excitation sample index; 0 = none / removed */
	double	resamp;		/* magnitude of the residual at gcipos */
} *gcitab;
int gcicnt;			/* number of entries used in gcitab */

double	fxmean=-1;		/* mean Fx in Hz from -f; negative = estimate from the signal */
double	maxenergy=0;		/* largest frame energy in dB, set in main(), read by voiceprob() */


/*==========================================================================*/

/*
 * zeroc - rate of upward zero crossings in a buffer
 *
 * sp	samples to examine
 * len	number of samples in sp
 *
 * Counts every sample that is >= 0 while the sample before it is < 0,
 * then divides the count by the buffer duration, taken as
 * len * spitem.frameduration (crossings per second if frameduration is
 * in seconds -- confirm against the SFS header definition).
 */
float zeroc(float *sp,int len)
{
	int		idx;
	int		crossings=0;
	float	prev=sp[0];

	for (idx=1;idx<len;idx++) {
		float	cur=sp[idx];

		if ((prev<0)&&(cur>=0)) crossings++;
		prev=cur;
	}

	return((float)(crossings/(len*spitem.frameduration)));
}

/* get a voicing probability */
float voiceprob(float *nsp,int len)
{
	float	*s1,*s2;
	float	mean,sum,sumsq,val;
	int		num;
	float	z,e,v,r;
	float	zp,ep,rp;
	int		i;

	/* get energy in unfiltered signal */
	sum=(float)0.0;
	num=len;
	s1 = nsp;
	for (i=0;i<num;i++,s1++) sum += (float)*s1;
	mean = sum/len;
	s1 = nsp;
	sumsq=(float)0.0;
	for (i=0;i<num;i++,s1++) {
		val = (float)(*s1-mean);
		sumsq += val * val;
	}
	e = sumsq/len;

	/* get first reflection coefficient for normal signal */
	sumsq=(float)0.0;
	num = len-1;
	s1 = nsp;
	s2 = nsp+1;
	for (i=0;i<num;i++,s1++,s2++) {
		sumsq += (float)(*s1-mean) * (float)(*s2-mean);
	}
	r = (sumsq/len)/e;
	rp = (float)(1.0/(1+exp(-(r-0.6)/0.2)));

	/* get zero crossing rate */
	z=zeroc(nsp,len);
	zp = (float)(1.0/(1+exp((z-1200)/200)));

	/* get energy */
	ep = (float)(1.0/(1+exp(-((10.0*log10(e))-maxenergy+30)/5)));

	/* combine scores */
	v = zp * ep * rp;

#ifdef EBUG
//printf("%.3f\t%.3f\t%.3f\t%.3f\n",zp,ep,rp,v);
#endif

	return(v);
}

/*
 * quantnoise - random noise at quantisation level
 *
 * Takes the next rand() value modulo 10000 and maps it linearly so that
 * the result is a multiple of 0.0001 in the range -0.5 <= x < 0.5.
 */
double quantnoise()
{
	return (((rand()%10000)-5000)/10000.0);
}

/*
 * cmpdouble - three-way comparison of two doubles for qsort()
 *
 * e1,e2 point at the doubles to compare.
 * Returns -1, 0 or 1 as *e1 is less than, equal to or greater than *e2.
 */
int cmpdouble(const void *e1,const void *e2)
{
	const double	*lhs=(const double *)e1;
	const double	*rhs=(const double *)e2;

	return((*lhs > *rhs) - (*lhs < *rhs));
}

/*
 * cmpfloat - three-way comparison of two floats for qsort()
 *
 * e1,e2 point at the floats to compare.
 * Returns -1, 0 or 1 as *e1 is less than, equal to or greater than *e2.
 */
int cmpfloat(const void *e1,const void *e2)
{
	const float	*lhs=(const float *)e1;
	const float	*rhs=(const float *)e2;

	return((*lhs > *rhs) - (*lhs < *rhs));
}

/* blackman window */
void blackman(double *w,int len)
{
	double omega=2*M_PI/(len-1);
	int	i;
	for (i=0;i<len;i++) w[i]=0.42659 - 0.49656*cos(i*omega) + 0.076849*cos(2*i*omega);
}

/* estimate mean fundamental frequency */
double fxestimate(float *sig,int cnt,double srate)
{
	int		wsize,acmin,acmax,maxidx;
	int		i,j,k,n,fcnt;
	double	sumx2,sumy2,sumxy,maxval,rval;
	double	*ftab;
	float	*sp;

	wsize = (int)(srate/30);
	acmin = (int)(srate/500);
	acmax = (int)(srate/50);

	ftab=(double *)calloc(1+cnt/wsize,sizeof(double));
	fcnt=0;

	for (i=0;i<cnt-wsize;i+=wsize) {
		sp=sig+i;
		maxval=0.9;
		maxidx=0;
		for (j=acmin;j<=acmax;j++) {
			n=wsize-j;
			sumx2=0;
			sumy2=0;
			sumxy=0;
			for (k=0;k<n;k++) {
				sumx2 += sp[k]*sp[k];
				sumy2 += sp[k+j]*sp[k+j];
				sumxy += sp[k]*sp[k+j];
			}
			rval=(sumxy/n)/(sqrt(sumx2/n)*sqrt(sumy2/n));
			if (rval > maxval) {
				maxval = rval;
				maxidx = j;
			}
		}
		if (maxval > 0.9) {
			ftab[fcnt++]=srate/maxidx;
//			printf("%d (%g),",(int)(srate/maxidx),maxval);
		}
	}

	if (fcnt > 0) {
		qsort(ftab,fcnt,sizeof(float),cmpfloat);

//		printf("\nmedian=%g\n",ftab[fcnt/2]);

		if (fcnt & 1)
			return(ftab[fcnt/2]);
		else
			return(0.5*ftab[fcnt/2]+0.5*ftab[fcnt/2+1]);
	}
	else
		return(125.0);

}

/* main program */
void main(int argc,char *argv[])
{
	/* option decoding */
	extern int	optind;		/* option index */
	extern char	*optarg;	/* option argument ptr */
	int		errflg = 0;	/* option error flag */
	int		c;		/* option switch */
	int32		it;		/* item selection */
	char		*ty;		/* item sub type */
	/* file variables */
	int32		ipitem=SP_TYPE;
	char		*iptype="0";
	int			srate;
	int			i,j,scnt,ofid,forder;
	FILTER		*hpfilt,*ehpfilt;
	int			wisize,stsize,twinlen,txmin;
	int			ncoeff,nframe,lasttx,txval,minidx,maxidx;
	int			wmin,widx,wdist;
	double		*wsp,*win,*frsp,*twin;
	double		val,sum,sumsq,gciratio;
	double		pc[MAXCOEFF];
	double		omega,vp,rmax,maxres;
	double		s1,s2,factor;
	double		*stab;

	/* decode switches */
	while ( (c = getopt(argc,argv,"Ii:f:")) != EOF ) switch (c) {
		case 'I' :	/* Identify */
			fprintf(stderr,"%s: Pitch epoch location by SEDREAMS V%s\n",PROGNAME,PROGVERS);
			exit(0);
			break;
		case 'i' :	/* specific item */
			if (itspec(optarg,&it,&ty) == 0) {
				if (it == SP_TYPE) {
					ipitem = it;
					iptype = ty;
				}
				else
					error("unsuitable item specifier %s",optarg);
			}
			else
				error("illegal item specifier %s",optarg);
			break;
		case 'f' :	/* mean fx */
			fxmean=atof(optarg);
			break;
		case '?' :	/* unknown */
			errflg++;
	}
	if (errflg || (argc<2))
		error("usage: %s (-I) (-i item) (-f fxmean) file",PROGNAME);

	/* get filename */
	if (optind < argc)
		strcpy(filename,sfsfile(argv[optind]));
	else
		error("no database file specified",NULL);

	/* get speech signal */
	getitem(filename,ipitem,iptype,&spitem,&sp);
	srate = (int)(0.5+1.0/spitem.frameduration);

	/* analysis parameters */
	wisize = (int)(0.5 + 0.03/spitem.frameduration);
	stsize = (int)(0.5 + 0.01/spitem.frameduration);
	ncoeff = (int)(2 + 0.001/spitem.frameduration);
	nframe = 1+(spitem.numframes-wisize)/stsize;

	/* get float buffers */
	fsp = (float *)calloc(spitem.numframes,sizeof(float));
	rsp = (float *)calloc(spitem.numframes,sizeof(float));
	msp = (float *)calloc(spitem.numframes,sizeof(float));
	pv = (float *)calloc(spitem.numframes,sizeof(float));

	/* get window */
	wsp = (double *)calloc(wisize,sizeof(double));
	win = (double *)calloc(wisize,sizeof(double));
	frsp = (double *)calloc(wisize,sizeof(double));
	omega = 2.0*M_PI/(wisize-1);
	for (i=0;i<wisize;i++)
		win[i] = (0.54 - 0.46*cos(i*omega));

	/* pre-emphasis */
	for (i=1;i<spitem.numframes;i++)
		fsp[i] = (float)(sp[i]-0.95*sp[i-1]);

	/* do LPC analysis to get residual */
	for (i=0;(i+wisize)<spitem.numframes;i+=stsize) {

		/* get speech */
		for (j=0;j<wisize;j++)
			wsp[j] = fsp[i+j] + 0.01*quantnoise();

		/* remove mean & window */
		sum=0;
		for (j=0;j<wisize;j++) sum += wsp[j];
		sum /= wisize;
		for (j=0;j<wisize;j++)
			wsp[j] = (wsp[j]-sum)*win[j];

		/* perform LPC analysis on this window */
		lsp_auto_lpc(wsp,wisize,pc,ncoeff);

		/* calculate residual */
		lsp_residual_lpc(wsp,wisize,pc,ncoeff,frsp);

		/* get sumsq of signal & residual */
		s1=0;
		for (j=0;j<wisize;j++) s1+=wsp[j]*wsp[j];
		s2=0;
		for (j=0;j<wisize;j++) s2+=frsp[j]*frsp[j];
		factor=sqrt(s1/s2);

		/* overlap add residual */
		for (j=0;j<wisize;j++)
			rsp[i+j] += (float)(frsp[j]*factor);

	}

	/* normalise residual */
	rmax=0;
	for (i=0;i<spitem.numframes;i++) {
		if (rsp[i] > rmax) rmax=rsp[i];
		if (rsp[i] < -rmax) rmax=-rsp[i];
	}
	for (i=0;i<spitem.numframes;i++) rsp[i] /= (float)rmax;

#ifdef EBUG
	sfsheader(&rsitem,TR_TYPE,1,1,4,spitem.frameduration,spitem.offset,1,0,0);
	sprintf(rsitem.history,"%s(%d.%02d;residual)",PROGNAME,spitem.datatype,spitem.subtype);
	putitem(filename,&rsitem,spitem.numframes,rsp);
#endif

	/* create a high-pass filter */
	hpfilt = filter_design(FILTER_HIGH_PASS,8,50,srate/2,srate);

	/* forward pass filter original speech */
	for (i=0;i<spitem.numframes;i++)
		fsp[i] = filter_sample(hpfilt,(float)sp[i]);

	/* backward pass */
	filter_clear(hpfilt);
	for (i=spitem.numframes-1;i>=0;i--)
		fsp[i] = filter_sample(hpfilt,fsp[i]);

	/* if no fxmean given, estimate one */
	if (fxmean < 0) {
		fxmean = (int)(0.5+fxestimate(fsp,spitem.numframes,srate));
		fprintf(stderr,"Fx mean estimated at %.0fHz\n",fxmean);
	}

	/* calculate mean-based signal */
	twinlen=1+2*(int)(0.8*srate/fxmean);
//	fprintf(stderr,"twinlen=%d\n",twinlen);

	twin=(double *)calloc(twinlen+1,sizeof(double));
	blackman(twin,twinlen);
	for (i=0;i<spitem.numframes-twinlen-1;i++) {
		sum=0;
		for (j=0;j<twinlen;j++) sum += fsp[i+j]*twin[j];
		msp[i+twinlen/2] = (float)(sum/twinlen);
	}

	/* create elliptic filter */
	forder=filter_elliptic_order(FILTER_HIGH_PASS,50.0,30.0,3.0,60.0,(double)srate);
	ehpfilt=filter_elliptic_design(FILTER_HIGH_PASS,forder,50.0,3.0,60.0,(double)srate);

#if 0
	printf("Elliptic filter order=%d\n",forder);

	printf("Coefficients:\n  Numerator\t  Denominator\n");
	for (i=0;i<=forder;i++)
		printf("%13.8f\t%13.8f\n",ehpfilt->section[0].acoeff[i],ehpfilt->section[0].bcoeff[i]);

	printf("Impulse response:\n");
	printf("%g,",filter_sample(ehpfilt,1.0));
	for (i=0;i<99;i++) printf("%g,",filter_sample(ehpfilt,0.0));
#endif

	/* high-pass envelope at 50Hz forward pass */
	filter_clear(ehpfilt);
	for (i=0;i<spitem.numframes;i++)
		msp[i] = filter_sample(ehpfilt,msp[i]);

	/* backward pass */
	filter_clear(ehpfilt);
	for (i=spitem.numframes-1;i>=0;i--)
		msp[i] = filter_sample(ehpfilt,msp[i]);

#ifdef EBUG
	sfsheader(&msitem,TR_TYPE,1,1,4,spitem.frameduration,spitem.offset,1,0,0);
	sprintf(msitem.history,"%s(%d.%02d;meansignal)",PROGNAME,spitem.datatype,spitem.subtype);
	putitem(filename,&msitem,spitem.numframes,msp);
#endif


	/* get voiced regions */

	/* get max energy */
	maxenergy=0;
	for (i=0;(i+wisize) < spitem.numframes;i+=wisize) {

		/* calculate mean */
		for (j=0,sum=0;j<wisize;j++) sum += (float)(fsp[i+j]);
		sum /= wisize;

		/* remove mean and calculate energy */
		for (j=0,sumsq=0;j<wisize;j++) {
			val = (float)fsp[i+j] - sum;
			sumsq += val * val;
		}
		if (sumsq > maxenergy) maxenergy=sumsq;

	}
	maxenergy = 10.0 * log10(maxenergy/wisize);

	/* find voiced regions */
	for (i=0;(i+wisize)<=spitem.numframes;i+=stsize) {
		vp = voiceprob(fsp+i,wisize);
		for (j=0;j<wisize;j++) pv[i+j]=(float)vp;
	}

#ifdef EBUG
	sfsheader(&pvitem,TR_TYPE,1,1,4,spitem.frameduration,spitem.offset,1,0,0);
	sprintf(pvitem.history,"%s(%d.%02d;voiceprob)",PROGNAME,spitem.datatype,spitem.subtype);
	putitem(filename,&pvitem,spitem.numframes,pv);
#endif

	/* record the time of the minimum/maximum pairs */
	gcicnt=0;
	for (i=1;i<spitem.numframes-1;i++) if (pv[i]>0.5) {
		if ((msp[i-1]<msp[i])&&(msp[i]>msp[i+1]))
			gcicnt++;
	}
	gcitab = (struct gci_rec *)calloc(gcicnt+1,sizeof(struct gci_rec));
	gcicnt=0;
	for (i=1;i<spitem.numframes-1;i++) {
		if ((pv[i]>0.5)&&(msp[i-1]<msp[i])&&(msp[i]>msp[i+1]))
			gcitab[gcicnt++].maxpos = i;
		if ((msp[i-1]>msp[i])&&(msp[i]<msp[i+1]))
			gcitab[gcicnt].minpos = i;
	}

	/* associate each major residual spike with one min-max window */
	scnt=0;
	for (i=1;i<spitem.numframes-1;i++) if ((pv[i]>0.5)&&(rsp[i]>0.4)) {
		wmin=100000;
		widx=0;
		for (j=0;j<gcicnt;j++) {
			wdist=abs(i-gcitab[j].minpos);
			if (wdist < wmin) {
				wmin=wdist;
				widx=j;
			}
		}
		gcitab[widx].gcipos=i;
		scnt++;
	}

	if (scnt > 0) {
		/* find median position of GCI w.r.t min and max */
		stab=(double *)calloc(scnt,sizeof(double));
		scnt=0;
		for (i=0;i<gcicnt;i++) {
			if (gcitab[i].gcipos!=0) {
				val = (double)(gcitab[i].gcipos - gcitab[i].minpos)/(double)(gcitab[i].maxpos-gcitab[i].minpos);
//				fprintf(stderr,"%d %d %d %g\n",gcitab[i].minpos,gcitab[i].gcipos,gcitab[i].maxpos,val);
				stab[scnt++] = val;
			}
		}
		qsort(stab,scnt,sizeof(double),cmpdouble);
		if (scnt & 1)
			gciratio=stab[scnt/2];
		else
			gciratio=0.5*stab[scnt/2]+0.5*stab[scnt/2+1];
	}
	else
		gciratio=0.5;
//	fprintf(stderr,"scnt=%d gciratio=%g\n",scnt,gciratio);

	/* find the GCI for real */
	for (i=0;i<gcicnt;i++) {
		minidx=(int)(gcitab[i].minpos + (gciratio-0.25)*(gcitab[i].maxpos-gcitab[i].minpos));
		maxidx=(int)(1+gcitab[i].minpos + (gciratio+0.35)*(gcitab[i].maxpos-gcitab[i].minpos));
		if (minidx < 0) minidx=0;
		if (maxidx > spitem.numframes) maxidx=spitem.numframes;
		maxres=0;
		for (j=minidx;j<maxidx;j++) {
			if (rsp[j]>maxres) { maxres=rsp[j]; gcitab[i].gcipos=j; gcitab[i].resamp=maxres; };
			if (rsp[j]<-maxres) { maxres=-rsp[j]; gcitab[i].gcipos=j; gcitab[i].resamp=maxres; };
		}
//		fprintf(stderr,"%d %d %d\n",gcitab[i].minpos,gcitab[i].gcipos,gcitab[i].maxpos);
	}

	/* remove spurious peaks */
	txmin=srate/500;
	for (i=0;i<gcicnt-1;i++) {
		for (j=1;(i+j)<gcicnt;j++) {
			if ((gcitab[i+j].gcipos-gcitab[i].gcipos)<txmin) {
				if (gcitab[i+j].resamp > gcitab[i].resamp)
					gcitab[i].gcipos=0;
				else
					gcitab[i+1].gcipos=0;
			}
			else
				break;
		}
	}

	/* create output Tx item */
	sfsheader(&txitem,TX_TYPE,0,4,1,spitem.frameduration,spitem.offset,0,0,1);
	sprintf(txitem.history,"%s(%d.%02d,fxmean=%g)",
			PROGNAME,spitem.datatype,spitem.subtype,fxmean);

	if ((ofid=sfschannel(filename,&txitem))<0)
		error("could not open output channel to '%s'",filename);

	/* save Tx */
	lasttx=0;
	for (i=0;i<gcicnt;i++) if (gcitab[i].gcipos) {
		txval = gcitab[i].gcipos - lasttx;
		sfswrite(ofid,1,&txval);
		lasttx = gcitab[i].gcipos;
	}

	/* update */
	if (!sfsupdate(filename))
		error("update error on '%s'",filename);

	/* that's all folks */
	exit(0);
}

