/* vqstat - calculate summary voice quality statistics */

/* M.A.Huckvale - University College London */

/* version 1.0 - July 2010 */

#define PROGNAME "vqstat"
#define PROGVERS "1.0"
char	*progname=PROGNAME;

/*--------------------------------------------------------------------------*/
/**MAN
.TH VQSTAT SFS1 UCL SFS
.SH NAME
vqstat -- calculate summary statistics of voice quality
.SH SYNOPSIS
.B vqstat
(-I) (-i item) (-l lolimit) (-h hilimit) (-s start) (-e end) (-q|-D) file
.SH DESCRIPTION
.I vqstat
is a program to calculate a set of standard statistics of voice quality from
a speech recording marked with a set of pitch epoch markers (Tx). The Tx data
can come from a Laryngographic analysis or from an acoustic method such as txanal.
.PP
Statistics are as follows:
.TP 11
.B MeanF0
Mean fundamental frequency (Hz).
.TP 11
.B SDF0
Standard deviation of fundamental frequency (Hz).
.TP 11
.B RAP
Relative Average Perturbation. Mean perturbation measured for each cycle by comparing
duration of cycle to mean duration of a window of three cycles centered on the cycle (%).
.TP 11
.B PPQ
Pitch Perturbation Quotient. Mean perturbation measured for each cycle by comparing
duration of cycle to mean duration of a window of five cycles centered on the cycle (%).
.TP 11
.B APQ
Amplitude Perturbation Quotient. Mean perturbation measured for each cycle by comparing
amplitude of speech in cycle to mean amplitude of a window of eleven cycles centered on the cycle (%).
.TP 11
.B HNR
Harmonic to Noise Ratio. Calculated from the autocorrelation function (dB).
.TP 11
.B H1H2
Average ratio of energy in first harmonic compared to second harmonic (dB).
.TP 11
.B SPI
Soft Phonation Index. Ratio of energy in low frequency region (70-1600Hz) to
energy at high frequency (1600-4500Hz) (dB).
.PP
Analysis is only performed over voiced and non-silent regions of the signal. The voicing decision is
based on comparisons to a low-frequency and high frequency limit, which can be modified by program options.
.PP
.I Options
and their meanings are:
.TP 11
.B -I
Identify program and version number.
.TP 11
.BI -i item
Select input item.
.TP 11
.BI -l lolimit
Set low-frequency limit for voicing decision. Default 50Hz.
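.TP 11
.BI -h hilimit
Set high-frequency limit for voicing decision. Default 500Hz.
.TP 11
.BI -s start
Ignore pitch cycles that begin before this time (s). Default is the start of the file.
.TP 11
.BI -e end
Ignore pitch cycles that finish after this time (s). Default is the end of the speech item.
.TP 11
.B -q
Quiet mode. Report the results as a single comma-separated line.
.TP 11
.B -D
Debug mode. Dump the table of pitch cycles instead of the summary report.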
.SH INPUT ITEMS
.IP SP.xx 11
Speech waveform used for shimmer measurement.
.IP TX.xx 11
Any excitation period data item.
.SH VERSION/AUTHOR
.IP 1.0 11
Mark Huckvale
*/
/*--------------------------------------------------------------------------*/

/* standard definitions */
#include "SFSCONFG.h"
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <math.h>
#include "sfs.h"		/* database structures */
#include "filter.h"

/* input file variables */
char		filename[SFSMAXFILENAME]; /* sfs file name */
struct item_header	spitem;	/* item header for speech data */
short	*sp;				/* speech samples; loaded by getitem() in main() */
struct item_header txitem;	/* item header for tx data */
int		*tx;				/* Tx periods; loaded by getitem() in main(), scaled by txitem.frameduration in inittx() */

/* global parameters */
int		doquiet=0;			/* set by -q: report as one comma-separated line */
int		dodebug=0;			/* set by -D: dump the Tx table instead of the report */
double	stime=-1;			/* set by -s: analysis start time (s); negative means not given */
double	etime=-1;			/* set by -e: analysis end time (s); negative means not given */
int		domid=0;			/* NOTE(review): never read or written elsewhere in this file */
double	lolimit=50;			/* set by -l: low-frequency voicing limit (Hz) */
double	hilimit=500;		/* set by -h: high-frequency voicing limit (Hz) */
int		txlolimit;			/* shortest interval that is voiced */
int		txhilimit;			/* longest interval that is voiced */

/* tx structure: one record per pitch cycle, filled in by inittx() */
struct tx_rec {
	int		tx;					/* integer period, copied from tx[] */
	int		ok,ok3,ok5,ok11;	/* period within voicing limits: this cycle alone / every cycle of the centred 3, 5, 11 cycle block */
	double	T;					/* period (s) */
	double	t1,t2;				/* absolute start & end time */
	int		sp1,sp2;			/* indexes into waveform for t1 and t2 */
	double	f0;					/* local estimate of F0 (1/T) */
	double	f0p;				/* best estimate of F0 across 5 cycles; starts as f0, refined by calcHNR() */
	double	h1,h2;				/* amplitudes of first and second harmonic (dB), set by calcH1H2() */
} *txtab;						/* table of cycles, allocated by inittx() */
int txcnt;						/* number of entries in txtab */

/* VQ parameters: results computed in main() and printed in the report */
double	Dur;		/* total duration of voiced cycles (s) */
double	MeanF0;		/* mean fundamental frequency (Hz) */
double	SDF0;		/* standard deviation of fundamental frequency (Hz) */
double	RAP;		/* jitter, 3-cycle window (%) */
double	PPQ;		/* jitter, 5-cycle window (%) */
double	APQ;		/* shimmer, 11-cycle window (%) */
double	HNR;		/* harmonic to noise ratio (dB) */
double	SPI;		/* soft phonation index (dB) */
double	H1H2;		/* first to second harmonic ratio (dB) */

/*
 * inittx - build the per-cycle table txtab[] from the loaded Tx item.
 *
 * Reads the globals tx[], txitem, spitem, txlolimit and txhilimit; writes
 * the globals txtab and txcnt.  For every Tx period the routine records its
 * duration, start and end time, local F0, and the matching sample indexes
 * into the speech waveform.  A cycle is flagged "ok" when its integer
 * period lies in [txlolimit,txhilimit]; ok3/ok5/ok11 are set when every
 * cycle of the centred 3/5/11-cycle window is ok.  Cycles too close to
 * either end of the table to own a full window keep the zero flag left
 * by calloc().
 *
 * Terminates the program if the table cannot be allocated.
 */
void inittx()
{
	int	i,j,txsum=0;
	int	ok;
	int	sp1,sp2;

	/* an empty Tx item gives an empty table rather than a zero-size calloc */
	txtab = NULL;
	txcnt = 0;
	if (txitem.numframes <= 0) return;

	txtab = (struct tx_rec *)calloc(txitem.numframes,sizeof(struct tx_rec));
	if (txtab == NULL) {
		fprintf(stderr,"vqstat: could not allocate memory for Tx table\n");
		exit(1);
	}
	txcnt = txitem.numframes;

	for (i=0;i<txcnt;i++) {
		txtab[i].tx = tx[i];
		txtab[i].T = tx[i]*txitem.frameduration;
		txtab[i].f0 = txtab[i].f0p = 1.0/(tx[i]*txitem.frameduration);
		txtab[i].ok = ((txlolimit<=tx[i])&&(tx[i]<=txhilimit));
		/* txsum is the running total of periods, i.e. the cycle start in Tx units */
		txtab[i].t1 = txitem.offset + txsum*txitem.frameduration;
		txsum += tx[i];
		txtab[i].t2 = txitem.offset + txsum*txitem.frameduration;
		sp1 = (int)(0.5+(txtab[i].t1-spitem.offset)/spitem.frameduration);
		sp2 = (int)(0.5+(txtab[i].t2-spitem.offset)/spitem.frameduration);
		/* keep both indexes inside the waveform: a Tx item may extend */
		/* beyond the end of the speech item or start before it */
		if (sp1 >= spitem.numframes) sp1=spitem.numframes-1;
		if (sp2 >= spitem.numframes) sp2=spitem.numframes-1;
		if (sp1 < 0) sp1=0;
		if (sp2 < 0) sp2=0;
		txtab[i].sp1 = sp1;
		txtab[i].sp2 = sp2;
	}

	/* 3-cycle blocks */
	for (i=1;i<txcnt-1;i++) {
		ok=1;
		for (j=-1;j<=1;j++) ok &= txtab[i+j].ok;
		txtab[i].ok3 = ok;
	}

	/* 5-cycle blocks */
	for (i=2;i<txcnt-2;i++) {
		ok=1;
		for (j=-2;j<=2;j++) ok &= txtab[i+j].ok;
		txtab[i].ok5 = ok;
	}

	/* 11-cycle blocks */
	for (i=5;i<txcnt-5;i++) {
		ok=1;
		for (j=-5;j<=5;j++) ok &= txtab[i+j].ok;
		txtab[i].ok11 = ok;
	}

}

/*
 * cleartx - exclude cycles that fall outside the analysis interval.
 *
 * Any cycle that starts before stime or finishes after etime has all four
 * of its validity flags (ok, ok3, ok5, ok11) reset to zero.  Flags of the
 * remaining cycles are left as they were.
 */
void cleartx(double stime,double etime)
{
	int	k=0;

	while (k<txcnt) {
		if ((txtab[k].t1 < stime)||(txtab[k].t2 > etime)) {
			txtab[k].ok11 = 0;
			txtab[k].ok5 = 0;
			txtab[k].ok3 = 0;
			txtab[k].ok = 0;
		}
		k++;
	}
}

/*
 * dumptx - print the whole Tx table to standard output.
 *
 * Writes one header line followed by one comma-separated line per cycle,
 * with the fields in the order named in the header.
 */
void dumptx()
{
	int	row=0;

	printf("IDX,TX,OK1,OK3,OK5,OK11,T,T1,T2,SP1,SP2,F0,F0P,H1,H2\n");
	while (row<txcnt) {
		printf("%d,%d,%d,%d,%d,%d,%g,%g,%g,%d,%d,%g,%g,%g,%g\n",
			row,
			txtab[row].tx,
			txtab[row].ok,txtab[row].ok3,txtab[row].ok5,txtab[row].ok11,
			txtab[row].T,txtab[row].t1,txtab[row].t2,
			txtab[row].sp1,txtab[row].sp2,
			txtab[row].f0,txtab[row].f0p,
			txtab[row].h1,txtab[row].h2);
		row++;
	}
}


/*
 * calcDur - total voiced duration.
 *
 * Returns the sum of the periods (s) of every cycle flagged ok.
 */
double calcDur()
{
	double	total=0;
	int	k=0;

	while (k<txcnt) {
		if (txtab[k].ok) total += txtab[k].T;
		k++;
	}

	return(total);
}

/* calculate MeanF0 */
double calcMeanF0()
{
	double	sum=0;
	int		cnt=0;
	int	i;

	for (i=0;i<txcnt;i++) if (txtab[i].ok) {
		/* calculate mean freq as total count divided by total time */
		sum += txtab[i].T;
		cnt++;
	}

	if (cnt==0)
		return(0);
	else
		return(cnt/sum);
}

/* calculate SDF0 */
double calcSDF0(double MeanF0)
{
	double	sum=0;
	int		cnt=0;
	int	i;

	for (i=0;i<txcnt;i++) if (txtab[i].ok) {
		/* weight each chunk of variance by duration in time */
		sum += (txtab[i].f0-MeanF0)*(txtab[i].f0-MeanF0)*txtab[i].tx;
		cnt += txtab[i].tx;
	}

	if (cnt==0)
		return(0);
	else {
		return(sqrt(sum/cnt));
	}
}

/* calculate RAP */
double calcRAP()
{
	int		i,j;
	double	rap=0;
	int		cnt=0;
	double	sum=0;

	for (i=0;i<txcnt;i++) if (txtab[i].ok3) {
		/* calculate RAP per period */
		for (sum=0,j=-1;j<=1;j++) sum += txtab[i+j].T;
		sum /= 3;
		rap += fabs(txtab[i].T-sum)/sum;
		cnt++;
	}

	if (cnt==0)
		return(0);
	else {
		return(100*(rap/cnt));
	}
}

/* calculate PPQ */
double calcPPQ()
{
	int		i,j;
	double	ppq=0;
	int		cnt=0;
	double	sum=0;

	for (i=0;i<txcnt;i++) if (txtab[i].ok5) {
		/* calculate PPQ per period */
		for (sum=0,j=-2;j<=2;j++) sum += txtab[i+j].T;
		sum /= 5;
		ppq += fabs(txtab[i].T-sum)/sum;
		cnt++;
	}

	if (cnt==0)
		return(0);
	else {
		return(100*(ppq/cnt));
	}
}

/*
 * peak2peak - amplitude of the speech signal within one cycle.
 *
 * tidx is an index into txtab.  The samples sp[sp1..sp2-1] of that cycle
 * are scanned for their extremes.  The minimum starts at 0 and the maximum
 * at 1, so the returned range always spans 0..1 at least and is never
 * smaller than 1.
 */
int	peak2peak(int tidx)
{
	int	lowest=0,highest=1;
	int	stop=txtab[tidx].sp2;
	int	k;

	for (k=txtab[tidx].sp1;k<stop;k++) {
		int	v=sp[k];
		lowest = (v < lowest) ? v : lowest;
		highest = (v > highest) ? v : highest;
	}
	return (highest-lowest);
}

/* calculate APQ */
double calcAPQ()
{
	int		i,j;
	double	apq=0;
	int		cnt=0;
	double	sum=0;

	for (i=0;i<txcnt;i++) if (txtab[i].ok11) {
		/* calculate APQ per period */
		for (sum=0,j=-5;j<=5;j++) sum += peak2peak(i+j);
		sum /= 11;
		apq += fabs(peak2peak(i)-sum)/sum;
		cnt++;
	}

	if (cnt==0)
		return(0);
	else {
		return(100*(apq/cnt));
	}
}

/*
 * autocorrel - height of the normalised autocorrelation peak near a lag.
 *
 * sp1,sp2  first and one-past-last sample index of the analysis block in
 *          the global waveform sp[]
 * fx       in: frequency (Hz) whose period gives the expected peak lag;
 *          out: frequency of the interpolated peak
 *
 * The normalised autocorrelation is evaluated for lags between half and
 * one and a half times the expected period (at least 5, at most MAXLAG).
 * Starting from the expected lag the nearest local maximum is found by
 * hill climbing and refined by fitting a parabola through the maximum and
 * its two neighbours.
 *
 * Returns the interpolated peak value.  When the block is empty or *fx is
 * not positive, returns 0 and leaves *fx unchanged.
 */
double autocorrel(int sp1,int sp2,double *fx)
{
	enum { MAXLAG=9999 };			/* largest lag evaluated */
	double acoeff[MAXLAG+2];		/* one spare slot so that lag+1 is always readable */
	int			period;
	int			l1,l2;
	int			i,j;
	int			len,num;
	double		sum,sumsq1,sumsq2,norm;
	short		*s1,*s2;
	int			p1,p2,p3;
	double		a,b,c,p,curve;

	len = sp2-sp1;
	if ((len <= 0)||!(*fx > 0)) return(0);

	period = (int)((1.0/(*fx))/spitem.frameduration);
	l1 = period/2;
	if (l1 < 5) l1=5;		// avoid zero delay
	l2 = 3*period/2;
	if (l2 > MAXLAG) l2=MAXLAG;
	if (l2 < l1) l2=l1;		// very short periods: keep the lag range non-empty

	/* zero autocorrelation vector, including the slot above the last lag */
	for (i=0;i<=l2+1;i++) acoeff[i]=0.0;

	/* for zero delay */
	sum=0.0;
	s1 = sp+sp1;
	for (j=0;j<len;j++,s1++) sum += (double)*s1 * (double)*s1;
	acoeff[0] = sum/len;

	/* for each delay in expected freq. range */
	for (i=l1;i<=l2;i++) {
		num=len-i;
		if (num <= 0) break;	/* block too short for this and all longer lags */
		sumsq1=sumsq2=sum=0.0;
		s1 = sp + sp1;
		s2 = sp + sp1 + i;
		for (j=0;j<num;j++,s1++,s2++) {
			sumsq1 += (double)*s1 * (double)*s1;
			sumsq2 += (double)*s2 * (double)*s2;
			sum += (double)*s1 * (double)*s2;
		}
		norm = sqrt(sumsq1)*sqrt(sumsq2)/num;
		/* an all-zero segment has no defined correlation: leave it at 0 */
		if (norm > 0) acoeff[i] = (sum/num)/norm;
	}

	/* find max around peak using quadratic interpolation */
	p2=(int)(0.5+(1.0/(*fx))/spitem.frameduration);
	if (p2 < l1) p2=l1;		/* start the search inside the evaluated range */
	if (p2 > l2) p2=l2;
	while ((p2>l1)&&(acoeff[p2-1]>acoeff[p2])) p2--;
	while ((p2<l2)&&(acoeff[p2+1]>acoeff[p2])) p2++;
	p1=p2-1;
	p3=p2+1;
	a=acoeff[p1];
	b=acoeff[p2];
	c=acoeff[p3];
	/* three collinear points have no vertex: keep the integer lag */
	curve=a-2*b+c;
	p=(curve != 0) ? ((a-c)/curve)/2 : 0.0;
	*fx = 1.0/((p2+p)*spitem.frameduration);
	return(b-(a-c)*p/4);
}

/*
 * calcHNR - Harmonic to Noise Ratio (dB).
 *
 * For each cycle flagged ok5 the mean F0 of the five-cycle window centred
 * on it seeds an autocorrelation peak search over the waveform of that
 * window.  The refined frequency is stored in the cycle's f0p field and
 * the peak heights are averaged.  The mean peak height r, limited to
 * [0.000001,0.999999], is converted with 10*log10(r/(1-r)).
 * Returns 0 when no cycle is flagged ok5.
 */
double calcHNR()
{
	int		i,j;
	double	f;
	double	ac,sum=0;
	int		cnt=0;

	for (i=0;i<txcnt;i++) if (txtab[i].ok5) {
		/* initial F0 estimate: mean over the five cycles of the block */
		f=0;
		for (j=-2;j<=2;j++) f += txtab[i+j].f0;
		f /= 5;
		/* compute autocorrelation over this block, and find peak */
		ac = autocorrel(txtab[i-2].sp1,txtab[i+2].sp2,&f);
		txtab[i].f0p = f;
		sum += ac;
		cnt++;
	}

	if (cnt==0)
		return(0);
	else {
		ac = sum/cnt;
		/* keep the ratio finite at both extremes */
		if (ac < 0.000001) ac=0.000001;
		if (ac > 0.999999) ac=0.999999;
		return(10.0*log10(ac/(1-ac)));
	}
}

/*
 * avchan - mean of a section of a channel buffer.
 *
 * Averages buf[s] .. buf[e-1].  No check is made for an empty range: when
 * e <= s the result is the quotient 0.0/0.
 */
double avchan(float *buf,int s,int e)
{
	double	total=0;
	int		n=0;
	int	k=s;

	while (k<e) {
		total += buf[k];
		n++;
		k++;
	}

	return(total/n);
}

/* calculate soft phonation index */
double calcSPI()
{
	int		i;
	int		cnt=0;
	double	sum=0;

	FILTER	*lfilt,*lsfilt;
	FILTER	*hfilt,*hsfilt;
	float	*lchan;
	float	*hchan;

	lfilt=filter_design(FILTER_BAND_PASS,4,70.0,1600.0,1.0/spitem.frameduration);
	hfilt=filter_design(FILTER_BAND_PASS,4,1600.0,4500.0,1.0/spitem.frameduration);
	lsfilt=filter_design(FILTER_LOW_PASS,4,50.0,50.0,1.0/spitem.frameduration);
	hsfilt=filter_design(FILTER_LOW_PASS,4,50.0,50.0,1.0/spitem.frameduration);
	lchan=(float *)calloc(spitem.numframes,sizeof(float));
	hchan=(float *)calloc(spitem.numframes,sizeof(float));

	for (i=0;i<spitem.numframes;i++) {
		lchan[i] = filter_sample(lsfilt,fabs(filter_sample(lfilt,(double)sp[i])));
		hchan[i] = filter_sample(hsfilt,fabs(filter_sample(hfilt,(double)sp[i])));
	}

	for (i=0;i<txcnt;i++) if (txtab[i].ok3) {
		sum += avchan(lchan,txtab[i-1].sp1,txtab[i+1].sp2)/avchan(hchan,txtab[i].sp1,txtab[i+1].sp2);
		cnt++;
	}

	free(lchan);
	free(hchan);
	filter_free(lfilt);
	filter_free(hfilt);

	if (cnt==0)
		return(0);
	else
		return(20*log10(sum/cnt));
}

/* goertzel filter */
double goertzel(short *buf,int len,double f,double fs)
{
	double	omega;
	double	coeff;
	double	s,sp1=0,sp2=0;
	double	w,wsum=0;
	int		i;

	omega = 2*M_PI/(len-1);
	coeff = 2*cos(2*M_PI*f/fs);

	for (i=0;i<len;i++) {
		w = 0.54-0.46*cos(i*omega);
		wsum += w;
		s = buf[i]*w + coeff*sp1 - sp2;
		sp2 = sp1;
		sp1 = s;
	}
	s = sp2*sp2 + sp1*sp1 - coeff*sp1*sp2;
	wsum /= len;
	return(2*sqrt(s)/(len*wsum));
}

/* calculate HNR */
double calcH1H2()
{
	int		i;
	double	amp1,amp2,sum=0;
	int		cnt=0;

	for (i=0;i<txcnt;i++) if (txtab[i].ok5) {
		/* find energy at h1 and h2 given f0 */
		amp1 = goertzel(sp+txtab[i-2].sp1,txtab[i+2].sp2-txtab[i-2].sp1,txtab[i].f0p,1.0/spitem.frameduration);
		amp2 = goertzel(sp+txtab[i-2].sp1,txtab[i+2].sp2-txtab[i-2].sp1,2*txtab[i].f0p,1.0/spitem.frameduration);
		txtab[i].h1 = 20*log10(amp1);
		txtab[i].h2 = 20*log10(amp2);
		sum += (txtab[i].h1-txtab[i].h2);
		cnt++;
	}

	if (cnt==0)
		return(0);
	else {
		return(sum/cnt);
	}
}

/* main program */
void main(int argc,char **argv)
{
	/* option decoding */
	extern int	optind;		/* option index */
	extern char	*optarg;	/* option argument ptr */
	int			errflg = 0;	/* option error flag */
	int			c;		/* option switch */
	int			it;		/* item type selection */
	char		*ty;		/* item match selection */
	char		*sptype="0";
	char		*txtype="0";

	/* decode switches */
	while ( (c = getopt(argc,argv,"Ii:l:h:qs:e:mD")) != EOF ) switch (c) {
		case 'I' :	/* Identify */
			fprintf(stderr,"%s: Voice Quality statistics V%s\n",PROGNAME,PROGVERS);
			exit(0);
			break;
		case 'i' :	/* specific item */
			if (itspec(optarg,&it,&ty) == 0) {
				if (it == SP_TYPE)
					sptype=ty;
				else if (it == TX_TYPE)
					txtype=ty;
				else
					error("unsuitable item specifier %s",optarg);
			}
			else
				error("illegal item specifier %s",optarg);
			break;
		case 'l':
			lolimit=atof(optarg);
			break;
		case 'h':
			hilimit=atof(optarg);
			break;
		case 's':
			stime=atof(optarg);
			break;
		case 'e':
			etime=atof(optarg);
			break;
		case 'q':
			doquiet=1;
			break;
		case 'D':
			dodebug=1;
			break;
		case '?' :	/* unknown */
			errflg++;
	}
	if (errflg || (argc<2))
		error("usage: %s (-I) (-i item) (-l lolimit) (-h hilimit) (-s start) (-e end) (-q|-D) file",PROGNAME);

	/* get filename */
	if (optind < argc)
		strcpy(filename,sfsfile(argv[optind]));
	else
		error("no file specified",NULL);

	/* load Sp & Tx data */
	getitem(filename,TX_TYPE,txtype,&txitem,&tx);
	getitem(filename,SP_TYPE,sptype,&spitem,&sp);

	txlolimit = (int)((1.0/hilimit)/txitem.frameduration);
	txhilimit = (int)(0.5+(1.0/lolimit)/txitem.frameduration);

	/* initialise table */
	inittx();

	/* clear out region not required */
	if ((stime>=0)||(etime>=0)) {
		if (stime < 0) stime=0;
		if (etime < 0) etime = spitem.offset+spitem.numframes*spitem.frameduration;
		cleartx(stime,etime);
	}

	/* calculate the parameters */
	Dur   = calcDur();
	MeanF0= calcMeanF0();
	SDF0  = calcSDF0(MeanF0);
	RAP   = calcRAP();
	PPQ   = calcPPQ();
	APQ   = calcAPQ();
	HNR   = calcHNR();
	SPI   = calcSPI();
	H1H2  = calcH1H2();

	/* produce report */
	if (dodebug)
		dumptx();
	else if (doquiet) {
		printf("\"%s\",%d.%02d,%d.%02d,%g,%g,%g,%g,%g,%g,%g,%g,%g\n",
			filename,
			spitem.datatype,spitem.subtype,
			txitem.datatype,txitem.subtype,
			Dur,
			MeanF0,
			SDF0,
			RAP,
			PPQ,
			APQ,
			HNR,
			SPI,
			H1H2);
	}
	else {
		printf("File                            : %s\n",filename);
		printf("Speech Item                     : %d.%02d\n",spitem.datatype,spitem.subtype);
		printf("Tx Item                         : %d.%02d\n",txitem.datatype,txitem.subtype);
		printf("Voiced Duration (s)             : %g\n",Dur);
		printf("Mean F0 (Hz)                    : %g\n",MeanF0);
		printf("Std-Dev F0 (Hz)                 : %g\n",SDF0);
		printf("Jitter (RAP) (%%)                : %g\n",RAP);
		printf("Jitter (PPQ) (%%)                : %g\n",PPQ);
		printf("Shimmer (APQ) (%%)               : %g\n",APQ);
		printf("Harmonic/Noise Ratio (HNR) (dB) : %g\n",HNR);
		printf("Harmonics Ratio (H1H2) (dB)     : %g\n",H1H2);
		printf("Soft Phonation Index (SPI) (dB) : %g\n",SPI);
	}

	/* that's all folks */
	if (tx) free(tx);
	if (sp) free(sp);
	if (txtab) free(txtab);
	exit(0);
}


