/* endpoint -- endpoint a single utterance in a speech signal */

/* M.A. Huckvale - University College London */

/* version 1.0 - June 1996 */

/* program identity: the name and version printed by the -I option */
#define PROGNAME "endpoint"
#define PROGVERS "1.0"
/* program name as a global string; not read in this file - presumably
   picked up by the SFS library for its messages (TODO confirm) */
char *progname=PROGNAME;

/*-------------------------------------------------------------------------*/
/**MAN
.TH ENDPOINT UCL1 UCL SFS
.SH NAME
endpoint - endpoint an utterance in speech waveform
.SH SYNOPSIS
.B endpoint
(-I) (-i item) file
.SH DESCRIPTION
.I endpoint
is a program to automatically annotate the start and end of an utterance
supplied in a speech waveform.  The output is 2 annotations, marking the beginning 
and end of the utterance.  The program is based on the endpoint C++ class
developed by Bruce Lowerre.
.SH OPTIONS
.TP 11
.B -I
Identify program name and version number.
.TP 11
.BI -i item
Select input item number.
.SH INPUT ITEMS
.IP SP 11
Speech pressure waveform.
.SH OUTPUT ITEMS
.IP AN 11
Endpoint annotations: {start,stop}.
.SH VERSION/AUTHOR
.IP 1.0 11
Mark Huckvale
.SH BUGS
*/
/*--------------------------------------------------------------------------*/

#include "SFSCONFG.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <math.h>
#include "sfs.h"

/* definitions */
/* truth value used for the detector's flags */
typedef enum { FALSE, TRUE } BOOLEAN;
/* states of the endpoint detector in getendpoint():
     NOSILENCE - start of data, background level being measured
     INSILENCE - background known, waiting for a signal onset
     START     - onset seen, waiting for it to be confirmed
     INSIGNAL  - inside the utterance
     END       - level has dropped, waiting to confirm the end */
typedef enum { NOSILENCE, INSILENCE, START, INSIGNAL, END } EPSTATE;
/* per-frame result returned by getendpoint() */
typedef enum { EP_NONE, EP_RESET, EP_SILENCE, EP_SIGNAL,
		EP_MAYBEEND, EP_ENDOFUTT, EP_NOTEND, EP_NOSTARTSILENCE } EPTAG;
/* printable names of the EPTAG values, indexed by the tag itself (used
   for the -D debug listing); must stay in the same order as EPTAG */
char *tagname[8]={
	"EP_NONE",
	"EP_RESET",
	"EP_SILENCE",
	"EP_SIGNAL",
	"EP_MAYBEEND",
	"EP_ENDOFUTT",
	"EP_NOTEND",
	"EP_NOSTARTSILENCE" };

/* control */
/* Default settings.  The time values are presumably in seconds (TODO
   confirm): initparams() multiplies the window and step times by the
   sample rate to get sample counts, and divides the other times by the
   step time to get counts of analysis frames. */
#define DEFSTEPTIME	0.010	/* window step time */
#define DEFWINTIME	0.016	/* window size */
#define DEFENDTIME	0.700	/* end-of-utterance time */
#define DEFMINTIME	0.100	/* minimum utterance time */
#define DEFZCTHRESH	600	/* zero-cross threshold (Hz) */
#define DEFBEGFACT	40.0	/* begin factor: added to noise level to give begin threshold */
#define DEFENDFACT	80.0	/* end factor: peak level is divided by this to give end threshold */
#define DEFENERGYFACT	200.0	/* energy factor: noise level times this confirms a start */
#define DEFSTARTSIL	2000.0	/* start silence: noise estimate must be below this at the start */
#define DEFTRIGFACT	3.0	/* trigger factor: noise estimate times this is the zero-cross trigger level */
#define DEFDPNOISE	6	/* num dp noise: size of the frame level history lastdpnoise[] */
#define DEFMINFRCLENG	0.050	/* min fric length */
#define DEFMAXPAUSE	0.150	/* max pause length */
#define DEFSTARTBLIP	0.150	/* start blip length */
#define DEFENDBLIP	0.020	/* end blip length */
#define DEFMINVOICE	0.060	/* min voice length */
#define DEFMINRISE	0.050	/* min rise length */
/* timing values read by initparams(); held in variables rather than
   used directly from the macros */
double	steptime = DEFSTEPTIME;
double	wintime = DEFWINTIME;
double	endtime = DEFENDTIME;
double	mintime = DEFMINTIME;

/* parameters */
/* working values set up by initparams() and read by the detector */
int	samprate;	/* sampling rate, truncated to an integer */
int	windowsize;	/* analysis window length in samples */
int	stepsize;	/* window step in samples */
int	maxipause;	/* low frames in END that confirm the end of utterance */
int	minuttlng;	/* minimum utterance length in frames */
int	zcthresh;	/* zero-cross count above which a frame counts as signal */
double	begfact;	/* added to noise to give begthresh */
double	endfact;	/* maxpeak is divided by this to give endthresh */
double	energyfact;	/* noise is multiplied by this to give mnbe */
double	minstartsilence; /* dpnoise must be below this in the first frames */
double	triggerfact;	/* dpnoise is multiplied by this to give the zero-cross trigger */
int	numdpnoise;	/* number of entries used in lastdpnoise[] */
int	minfriclng;	/* high zero-cross frames that confirm a start */
int	maxpause;	/* low frames tolerated in START before a false start */
int	startblip;	/* low frames tolerated after a tentative onset */
int	endblip;	/* high frames tolerated in END before signal resumes */
int	minvoicelng;	/* minimum number of above-threshold frames in an utterance */
int	minrise;	/* onset frames needed before entering START */

/* endpoint state */
/* all of this is reset by initparams() and updated frame by frame by
   zcpeakpick() and getendpoint() */
EPSTATE	epstate;	/* current state of the detector */
double	ave;		/* running average of the frame level */
double	noise;		/* background noise level */
double	begthresh;	/* level a frame must exceed to count as onset */
double	energy;		/* level above noise accumulated during an onset */
double	maxpeak;	/* largest frame level seen so far */
double	endthresh;	/* level a frame must exceed to stay in signal */
double	mnbe;		/* accumulated energy needed to confirm a start */
double	peakreturn;	/* RMS level of the current frame */
double	dpnoise;	/* noise estimate used for the zero-cross trigger */
int	scnt;		/* count of start-up frames, then of consecutive low frames */
int	avescnt;	/* number of quiet start-up frames counted for ave */
int	vcnt;		/* frames counted as part of the candidate utterance */
int	evcnt;		/* high frames seen while in END */
int	voicecount;	/* frames whose level exceeded the threshold */
int	bscnt;		/* low frames seen after a tentative onset */
int	zccnt;		/* frames whose zero-cross count exceeded zcthresh */
int	startframe;	/* sample position of utterance start (set by main) */
int	endframe;	/* sample position of utterance end (set by main) */
int	ncount;		/* number of valid entries in lastdpnoise[] */
int	zc;		/* zero-cross count of the current frame */
BOOLEAN	startsilenceok;	/* a quiet frame was seen at the start */
BOOLEAN	low;		/* trigger state for zero-cross counting, kept across frames */
double	lastdpnoise[DEFDPNOISE];	/* recent frame levels, newest at index 0 */

/* global data */
char		filename[SFSMAXFILENAME]; /* SFS data file name */
struct item_header	spitem;		/* input speech data */
short			*sp;		/* speech buffer */
struct item_header	anitem;		/* output annotations */
struct an_rec		*an;		/* annotation record */
int			debug=0;	/* incremented by -D: list each change of frame tag */

/* initialise endpoint parameters and reset the detector state */
/*   srate - sampling rate of the speech data (main passes the reciprocal
             of the speech item's frame duration) */
void	initparams(double srate)
{
	int	i;

	/* analysis geometry in samples */
	samprate   = (int)srate;
	windowsize = (int)(0.5+wintime*srate);
	stepsize   = (int)(0.5+steptime*srate);

	/* durations converted to a count of analysis frames */
	maxipause  = (int)(0.5+endtime/steptime);
	minuttlng  = (int)(0.5+mintime/steptime);

	/* The zero-cross threshold is a rate in Hz; zcpeakpick() counts
	   crossings over one analysis window, so the equivalent count is
	   rate x window duration.  (Dividing by the step time, as was done
	   before, gave 60000 - a count no window can reach - which
	   silently disabled every zero-cross test.) */
	zcthresh   = (int)(0.5+DEFZCTHRESH*wintime);

	/* level factors are used as they stand */
	begfact    = DEFBEGFACT;
	endfact    = DEFENDFACT;
	energyfact = DEFENERGYFACT;
	minstartsilence = DEFSTARTSIL;
	numdpnoise = DEFDPNOISE;
	triggerfact = DEFTRIGFACT;

	/* more durations converted to frames */
	minfriclng = (int)(0.5+DEFMINFRCLENG/steptime);
	maxpause   = (int)(0.5+DEFMAXPAUSE/steptime);
	startblip  = (int)(0.5+DEFSTARTBLIP/steptime);
	endblip    = (int)(0.5+DEFENDBLIP/steptime);
	minvoicelng = (int)(0.5+DEFMINVOICE/steptime);
	minrise     = (int)(0.5+DEFMINRISE/steptime);

	/* clear state */
	for (i=0;i<numdpnoise;i++)
		lastdpnoise[i]=0;

	/* initialise state */
	epstate = NOSILENCE;
	noise = 0.0;
	ave = 0.0;
	begthresh = 0.0;
	endthresh = 0.0;
	energy = 0.0;
	maxpeak = 0.0;
	scnt = 0;
	vcnt = 0;
	evcnt = 0;
	voicecount = 0;
	zccnt = 0;
	bscnt = 0;
	startframe = 0;
	endframe = 0;
	avescnt = 0;
	startsilenceok = FALSE;
	ncount = 0;
	low = TRUE;	/* zero-cross trigger starts in the low state */
}

/* start the noise record: the newest frame level is duplicated into the
   second history slot and becomes the noise estimate */
void setnoise ()
{
	lastdpnoise[1] = lastdpnoise[0];
	dpnoise = lastdpnoise[1];
	ncount = 2;
}

/* update the noise record: age the history by one slot and set the noise
   estimate to the mean of the ncount most recent frame levels */
void averagenoise ()
{
	double	total = 0.0;
	int	slot = ncount - 1;

	/* oldest first: add each old level, then move its neighbour up */
	while (slot > 0) {
		total += lastdpnoise[slot];
		lastdpnoise[slot] = lastdpnoise[slot - 1];
		slot--;
	}
	/* slot 0 already holds the current frame (stored by zcpeakpick) */
	total += lastdpnoise[0];
	dpnoise = total / ncount;

	/* let the history grow until it is full */
	if (ncount < numdpnoise)
		ncount++;
}


/* get the zero cross count and average energy */
void zcpeakpick(samples)
short	*samples;
{
	int	i;
	double	sum,trigger;
	short	*smp;

	for (sum = 0.0, i = 0, smp = samples; i < windowsize; i++, smp++)
        	sum += *smp * *smp;
    	peakreturn = (sqrt (sum / windowsize));
	lastdpnoise[0] = peakreturn;

	if (ncount == 0)
        	dpnoise = peakreturn;		/* initial value */
    	trigger = dpnoise * triggerfact;	/* schmidt trigger band */

    	for (i = 0, zc = 0, smp = samples; i < windowsize; i++, smp++) {
		if (low) {
			if (*smp > trigger) {
	                	zc++;
				low = FALSE;
			}
		}
		else {
			if (*smp < -trigger) {
				zc++;
				low = TRUE;
			}
		}
	}
}

/* adopt a new background noise level and derive the thresholds from it */
static void newnoisefloor(double level)
{
	noise = level;
	begthresh = noise + begfact;
	endthresh = begthresh;
	mnbe = noise * energyfact;
}

/* endpoint algorithm: advance the detector by one analysis frame */
/*   samples - the windowsize samples of the current frame */
/*   returns - EP_SILENCE         frame is background */
/*             EP_SIGNAL          frame is, or may be, part of an utterance */
/*             EP_RESET           false start or too-short utterance: */
/*                                frames tagged as signal so far are void */
/*             EP_MAYBEEND        first low frame after signal */
/*             EP_NOTEND          signal resumed, last EP_MAYBEEND was wrong */
/*             EP_ENDOFUTT        end of utterance confirmed */
/*             EP_NOSTARTSILENCE  no quiet frame found at the start */
/*             EP_NONE            unrecognised state */
EPTAG getendpoint(short *samples)
{
	double	tmp;	/* double, so the threshold is not rounded to float */

	/* get zc count and peak energy */
	zcpeakpick (samples);

	/* the end threshold follows the loudest frame seen so far */
	if (peakreturn > maxpeak) {
		maxpeak = peakreturn;
		if ((tmp = maxpeak / endfact) > endthresh)
			endthresh = tmp;
	}

	switch (epstate) {
	case NOSILENCE:		/* start, get background silence */
		if (++scnt <= 3) {	/* average 3 frame's worth */
			if (scnt == 1)
				setnoise();
			else
				averagenoise();

			/* Only quiet frames go into the average, each one
			   exactly once and each one counted in avescnt.
			   (Previously every frame was also added a second
			   time without being counted, inflating the initial
			   noise level more than twofold.) */
			if (dpnoise < minstartsilence) {
				startsilenceok = TRUE;
				ave += peakreturn;
				avescnt++;
			}
			return (EP_SILENCE);
		}
		if (!startsilenceok) return (EP_NOSTARTSILENCE);
		ave /= avescnt;
		newnoisefloor(ave);
		epstate = INSILENCE;
		return (EP_SILENCE);

	case INSILENCE:
		ave = ((3.0 * ave) + peakreturn) / 4.0;
		if ((peakreturn > begthresh) || (zc > zcthresh)) {
			/* looks like start of signal */
			energy += peakreturn - noise;
			if (zc > zcthresh)
				zccnt++;
			if (peakreturn > begthresh)
				voicecount++;
			if (++vcnt > minrise) {
				scnt = 0;
				epstate = START;	/* definitely start of signal */
			}
			return (EP_SIGNAL);
		}
		else {
			/* still in silence */
			energy = 0.0;
			if (ave < noise)
				newnoisefloor(ave);	/* lower noise level */
			if (vcnt > 0) {
				/* previous frame was signal */
				if ((++bscnt > startblip) || (zccnt == vcnt)) {
					/* Oops, no longer in the signal.  The begin
					   threshold is noise + begfact here as
					   everywhere else (this path used to
					   multiply, leaving the detector far less
					   sensitive after any blip). */
					newnoisefloor(ave);
					vcnt = 0;
					zccnt = 0;
					bscnt = 0;
					voicecount = 0;
					startframe = 0;
					return (EP_RESET); /* not in the signal, ignore previous */
				}
				return (EP_SIGNAL);
			}
			zccnt = 0;
			return (EP_SILENCE);
		}

	case START:
		if ((peakreturn > begthresh) || (zc > zcthresh)) {
			/* possible start of signal */
			energy += peakreturn - noise;
			if (zc > zcthresh)
				zccnt++;
			if (peakreturn > begthresh)
				voicecount++;
			/* low frames skipped over count as signal too */
			vcnt += scnt + 1;
			scnt = 0;
			if ((energy > mnbe) || (zccnt > minfriclng))
				epstate = INSIGNAL;
			return (EP_SIGNAL);
		}
		else if (++scnt > maxpause) {
			/* signal went low again, false start */
			vcnt = zccnt = voicecount = 0;
			energy = 0.0;
			epstate = INSILENCE;
			ave = ((3.0 * ave) + peakreturn) / 4.0;
			if (ave < noise + begfact)
				newnoisefloor(ave);	/* lower noise level */
			return (EP_RESET);
		}
		else
			return (EP_SIGNAL);

	case INSIGNAL:
		if ((peakreturn > endthresh) || (zc > zcthresh)) {
			/* still in signal */
			if (peakreturn > endthresh)
				voicecount++;
			vcnt++;
			scnt = 0;
			return (EP_SIGNAL);
		}
		else {
			/* below end threshold, may be end */
			scnt++;
			epstate = END;
			return (EP_MAYBEEND);
		}

	case END:
		if ((peakreturn > endthresh) || (zc > zcthresh)) {
			/* signal went up again, may not be end */
			if (peakreturn > endthresh)
				voicecount++;
			if (++evcnt > endblip) {
				/* back in signal again */
				vcnt += scnt + 1;
				evcnt = 0;
				scnt = 0;
				epstate = INSIGNAL;
				return (EP_NOTEND);
			}
			else
				return (EP_SIGNAL);
		}
		else if (++scnt > maxipause) {
			/* silence exceeds inter-word pause */
			if ((vcnt > minuttlng) && (voicecount > minvoicelng))
				/* end of utterance */
				return (EP_ENDOFUTT);
			else {
				/* signal is too short */
				scnt = vcnt = voicecount = 0;
				epstate = INSILENCE;
				/* false utterance, keep looking */
				return (EP_RESET);
			}
		}
		else {
			/* may be an inter-word pause */
			if (peakreturn == 0)
				/* zero filler frame */
				return (EP_ENDOFUTT);
			evcnt = 0;
			/* assume still in signal */
			return (EP_SIGNAL);
		}
	}
	return(EP_NONE);
}

/* main program */
/* Decodes the options, locates the speech item in the SFS file, passes it
   frame by frame to getendpoint() and writes a "start" and a "stop"
   annotation for the utterance found.  Exits through error() if no
   utterance is found or on any file error. */
int main(int argc, char *argv[])
{
	/* option decoding */
	extern int	optind;		/* option index */
	extern char	*optarg;	/* option argument ptr */
	int		errflg = 0;	/* option error flag */
	int		c;		/* option switch */
	int		it;		/* item selection */
	char		*ty;		/* item sub type */
	/* file variables */
	char		*sptype="0";
	int		fid;		/* input file descriptor */
	int		ofid;		/* output channel */
	int		i;		/* sample position of current frame */
	int		nosilence = 0;	/* missing start silence already reported */
	EPTAG		eptag,lasttag;

	/* decode switches */
	while ( (c = getopt(argc,argv,"Ii:D")) != EOF ) switch (c) {
		case 'I' :	/* Identify */
			fprintf(stderr,"%s: Endpoint an utterance V%s\n",PROGNAME,PROGVERS);
			exit(0);
			break;
		case 'i' :	/* specific item */
			if (itspec(optarg,&it,&ty) == 0) {
				if (it == SP_TYPE)
					sptype = ty;
				else
					error("unsuitable item specifier %s",optarg);
			}
			else
				error("illegal item specifier %s",optarg);
			break;
		case 'D':	/* debug: list changes of frame tag */
			debug++;
			break;
		case '?' :	/* unknown */
			errflg++;
			break;
	}
	if (errflg || (argc<2))
		error("usage: %s (-I) (-i item) file",PROGNAME);

	/* get filename */
	if (optind < argc)
		strcpy(filename,sfsfile(argv[optind]));
	else
		error("no SFS file specified",NULL);

	/* open file and locate speech */
	if ((fid=sfsopen(filename,"w",NULL)) < 0)
		error("access error on '%s'",filename);
	if (!sfsitem(fid,SP_TYPE,sptype,&spitem))
		error("cannot find input SP item in '%s'",filename);

	/* initialise end pointer */
	initparams(1.0/spitem.frameduration);
	/* a step of zero samples would make the frame loop run for ever */
	if ((windowsize < 1) || (stepsize < 1))
		error("sampling rate too low in '%s'",filename);

	/* create output item header */
	sfsheader(&anitem,AN_TYPE,-1,1,-1,
			spitem.frameduration,spitem.offset,
			0,0,1);
	sprintf(anitem.history,"%s(%d.%02d)",
		PROGNAME,
		spitem.datatype,spitem.subtype);

	/* open output channel */
	if ((ofid = sfschannel(filename,&anitem)) < 0)
		error("unable to open output channel to '%s'",filename);

	/* get input buffer */
	if ((sp = (short *)sfsbuffer(&spitem,windowsize)) == NULL)
		error("could not get memory buffer",NULL);
	if ((an = (struct an_rec *)sfsbuffer(&anitem,1)) == NULL)
		error("could not get memory buffer",NULL);

	/* loop through speech data */
	lasttag = EP_NONE;
	for (i=0;sfsread(fid,i,windowsize,sp)==windowsize;i+=stepsize) {
		eptag = getendpoint(sp);
		if (debug && (eptag!=lasttag)) {
			printf("%6d. %s\n",i,tagname[eptag]);
			lasttag = eptag;
		}
		switch (eptag) {
		case EP_RESET:
			/* false start or too-short utterance: any end */
			/* candidate belongs to the discarded signal, so */
			/* drop it or it would precede the next start */
			endframe = 0;
			/* fall through */
		case EP_SILENCE:
			/* ignore frame */
			startframe = i+stepsize;
			break;
		case EP_MAYBEEND:
			/* might be end of utterance */
			endframe = i;
			break;
		case EP_SIGNAL:
			/* a real data frame */
			break;
		case EP_NOTEND:
			/* last end was incorrect */
			endframe = -1;
			/* treat as a signal frame */
			break;
		case EP_ENDOFUTT:
			/* confirmation of end */
			goto done;
		case EP_NOSTARTSILENCE:
			/* missing silence at start: the detector repeats */
			/* this tag on every frame, so report it once only */
			if (!nosilence) {
				fprintf(stderr,"Missing silence at start\n");
				nosilence = 1;
			}
			break;
		case EP_NONE:
			/* keep gcc happy */
			break;
		}
	}
done:
	/* need an end candidate that lies after the start */
	if ((endframe <= 0) || (endframe <= startframe))
		error("No utterance found in '%s'",filename);

	/* write start marker */
	an->posn = startframe;
	an->size = endframe - startframe;
	strcpy(an->label,"start");
	if (sfswrite(ofid,1,an) != 1)
		error("write error on output file",NULL);

	/* write stop marker */
	an->posn = endframe;
	an->size = spitem.numframes - endframe;
	strcpy(an->label,"stop");
	if (sfswrite(ofid,1,an) != 1)
		error("write error on output file",NULL);

	/* close output file and update */
	if (!sfsupdate(filename))
		error("update error on %s",filename);

	/* that's all folks */
	exit(0);
	return(0);
}
