/* npoint -- endpoint N utterances in a speech signal */

/* M.A. Huckvale - March 1990 */

/* version 1.0 */

/* version 1.1 - April 1990
	- use log energy
*/
/* version 1.2 - May 1990
	- switch to change window size
*/
/* version 1.3 - July 1996
	- 'word' annotation options for compatibility
	  with divide, wordplay and wordchop
*/

/* program identification */
#define PROGNAME "npoint"	/* name used in the usage text, -I banner and item histories */
#define PROGVERS "1.3"		/* version string printed by the -I option */
/* not referenced elsewhere in this file; presumably read by the SFS library
   (e.g. for error reports) -- TODO confirm */
char *progname=PROGNAME;

/*-------------------------------------------------------------------------*/
/**MAN
.TH NPOINT UCL1 UCL SFS
.SH NAME
npoint - endpoint N utterances in a single speech waveform
.SH SYNOPSIS
.B npoint
(-i item) (-n numutter) (-r mark_space_ratio%) (-b backofftime) (-w windowtime)
(-W wordlist) (-l labelstem) file
.SH DESCRIPTION
.I npoint
is a program to automatically annotate the endpoints of multiple
utterances in a speech waveform.  The input is a speech signal
containing one or more utterances (the number specified on command line)
separated by silence.  The output is 2N annotations, marking the beginning 
and end of each utterance.  The program uses a dynamic programming
procedure to find exactly N utterances and N+1 silences.
.SH OPTIONS
.TP 11
.B -I
Identify program name and version number.
.TP 11
.BI -i item
Select input item number.
.TP 11
.BI -n numutter
Specify number of utterances. Default 1.
.TP 11
.BI -r mark_space
Specify the mark-to-space ratio for speech signals to silence.  This is
expressed as the percentage (range 0-100) of the signal that is speech.
Thus for 2 second utterances separated typically by 3 seconds of silence
(one utterance every 5 seconds), specify a mark-space ratio of 40.
Default 50.
.TP 11
.BI -b backofftime
Specify the time in seconds you want the markers 'backed-off' from
the located start and stop points.  The start markers are moved earlier
by this time, the stop markers are moved later.  No check is performed to
see if this causes starts to overlap previous stops.  Default 0.1s.
.TP 11
.BI -w windowtime
Specify the size of each analysis window in seconds.  
Annotations are positioned to
multiples of this size.  The maximum size of speech file that can be 
processed is limited by the square of the number of analysis windows
required to cover the input.  So for an input signal of 30 seconds, a window
of 0.05 seconds will require 600x600 = 360kbytes of memory. To analyse
long speech signals use a larger analysis window.  Default 0.05 seconds.
.TP 11
.BI -W wordlist
Specify a file containing a list of the N words to be found.  These are
then used as the basis for the annotations for the start points.  The
stop points are annotated with '/'.
.TP 11
.BI -l labelstem
Specify a different stem for the start point annotation.  The default is
'start'.  When this mode is selected, the stop points are labelled with '/'.
.SH INPUT ITEMS
.IP SP 11
Speech pressure waveform.
.SH OUTPUT ITEMS
.IP AN 11
Endpoint annotations: {startN,stopN}.
.SH HISTORY
.IP utterances 11
number of utterances.
.IP markspace 11
mark-to-space ratio
.IP windowtime 11
analysis window size (s).
.IP backoff 11
back-off time (s)
.IP type 11
set to 'endpoints'.
.SH VERSION/AUTHOR
.IP 1.3 11
Mark Huckvale
.SH BUGS
*/
/*--------------------------------------------------------------------------*/

#include "SFSCONFG.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <limits.h>
#include <unistd.h>
#include <math.h>
#include "sfs.h"

/* control */
#define DEFFRAMEDUR	0.05		/* size of input frame (s) */
double	framedur = DEFFRAMEDUR;		/* analysis window size (s); overridden by -w */

/* global data */
char		filename[SFSMAXFILENAME]; /* SFS data file name */
struct item_header	spitem;		/* input speech data */
short			*sp;		/* speech buffer */
struct item_header	anitem;		/* output annotations */
struct an_rec		*an;		/* annotation record */
float			*spen;		/* speech energy */
float			*uten;		/* artificial utterance energy */
int			numf;		/* number of matched frames */
int			numutter=1;	/* number of utterances to find (-n) */
int			msratio=50;	/* mark/space ratio in percent (-r) */
double			backofftime=0.1; /* marker back-off time in seconds (-b) */
int			debug=0;	/* incremented by -D; non-zero saves contours as tracks */
int			dowords=0;	/* word labelling mode */
FILE			*wp;		/* word list */
char			*startstem="start"; /* label stem for start markers (-l) */
char			*stopstem="stop"; /* label stem for stop markers; "/" selects a bare "/" label */

/* alignment buffers */
#define	DIAG 	1			/* DP path values */
#define UP 	2
#define LEFT	3
unsigned char	*direc;			/* DP directions */
float		*cudist1,*cudist2;	/* DP distance matrix lines */
float		*dist1,*dist2;		/* DP distance matrix lines */
double		*reg;			/* DP time registration path */
int		*align;			/* DP frame alignment path */
					/* (align is not referenced anywhere in this file) */

/* write a float contour into the SFS file as a new track item
 *
 * buf	- contour values
 * len	- number of values in buf
 *
 * Each call stamps the item history with a running track number,
 * starting at 1, so successive debug tracks can be told apart.
 */
void savetrack(buf,len)
float	*buf;
int	len;
{
	static int		trackno=1;	/* number for the next track written */
	struct item_header	tritem;
	int			thistrack;

	/* claim a track number for this item */
	thistrack = trackno;
	trackno = thistrack + 1;

	/* header is built from the analysis window size and the speech item's offset */
	sfsheader(&tritem,TR_TYPE,1,4,1,framedur,spitem.offset,1,0,0);
	sprintf(tritem.history,"%s(%d.%02d;track=%d)",
		PROGNAME,spitem.datatype,spitem.subtype,thistrack);

	putitem(filename,&tritem,len,buf);
}

/* build the speech contour */
int getspeechcontour(fid,low,avg,high)
int		fid;
float		*low;
float		*avg;
float		*high;
{
	register int	i,j;
	register short	*p,lasp=0;
	float		d,sum;
	int		buflen,numbuf=0;

	/* get buffer length */
	buflen = (int)(0.5+framedur/spitem.frameduration);

	/* get speech buffer */
	if ((sp=(short *)sfsbuffer(&spitem,buflen))==NULL)
		error("could not get input speech buffer");

	/* get contour buffer */
	if ((spen=(float *)calloc(spitem.numframes/buflen,sizeof(float)))==NULL)
		error("could not get speech energy buffer");

	/* process speech */
	*low=1E10;
	*avg=0.0;
	*high=0.0;
	lasp= *sp;
	for (i=0;sfsread(fid,i*buflen,buflen,sp)==buflen;i++) {
		sum=1.0;
		p = sp;
		for (j=0;j<buflen;j++) {
/*			v = (float) *p; */
			d = (float) (*p - lasp);
			sum += /* v*v + */ d*d;
			lasp = *p++;
		}
		sum = log(sum);
		if (sum < *low) *low = sum;
		if (sum > *high) *high = sum;
		*avg += sum;
		spen[i] = sum;
		numbuf++;
		if (((i%10)==9) && ttytest()) {
			printf("Calculating Energy: Frame %d/%d\r",i+1,spitem.numframes/buflen);
			fflush(stdout);
		}
	}
	if (ttytest())
		printf("Calculating Energy: Frame %d/%d\n",i,spitem.numframes/buflen);

	*avg /= numbuf;
	*high = (*high + *avg * 2)/3;

	if (debug)
		savetrack(spen,numbuf);

	return(numbuf);
}

/* generate artificial waveform */
void getuttercontour(numbuf,numutter,ratio,low,avg,high)
int		numbuf;
int		numutter;
int		ratio;
float		low;
float		avg;
float		high;
{
	int	silsize,uttsize;
	int	i,j,encnt=0;
	float	*en;

	/* get utterance contour buffer */
	if ((uten=(float *)calloc(numbuf+1,sizeof(float)))==NULL)
		error("could not get utterance energy buffer");
	en=uten;
	silsize = (numbuf*(100-ratio))/(100*numutter);
	uttsize = (numbuf*ratio)/(100*numutter);
	if (silsize < 2)
		error("silences too small");
	if (uttsize < 6)
		error("utterances too small");

	/* get annotation buffer */
	if ((an=(struct an_rec *)sfsbuffer(&anitem,2*numutter))==NULL)
		error("could not get annotation buffer");

	/* put in start silence */
	for (i=0;i<silsize/2;i++) {
		*en++ = low;
		encnt++;
	}

	/* put in N utterances */
	for (i=0;i<numutter;i++) {
		*en++ = (2*low+avg)/3;
		encnt++;
		/* save annotation to start */
		an[2*i].posn = (int)(((en-uten)*framedur)/anitem.frameduration);
		an[2*i].size = (int)((uttsize*framedur)/anitem.frameduration);
		if (dowords)
			fscanf(wp,"%s",an[2*i].label);
		else
			sprintf(an[2*i].label,"%s%d",startstem,i+1);
		*en++ = (low+2*avg)/3;
		encnt++;
		*en++ = avg;
		encnt++;
		/* put in utterance */
		for (j=3;j<(uttsize-3);j++) {
			*en++ = high;
			encnt++;
		}
		*en++ = avg;
		encnt++;
		*en++ = (low+2*avg)/3;
		encnt++;
		/* save annotation to stop */
		an[2*i+1].posn = (int)(((en-uten)*framedur)/anitem.frameduration);
		an[2*i+1].size = (int)((silsize*framedur)/anitem.frameduration);
		if (dowords || (strcmp(stopstem,"/")==0))
			strcpy(an[2*i+1].label,"/");
		else
			sprintf(an[2*i+1].label,"%s%d",stopstem,i+1);
		*en++ = (2*low+avg)/3;
		encnt++;
		if (i!=(numutter-1))
			for (j=0;j<silsize;j++) {
				*en++ = low;
				encnt++;
			}
	}

	/* put in final silence */
	while (encnt < numbuf) {
		*en++ = low;
		encnt++;
	}

}

/* cost given to predecessors that do not exist (outside the DP table) */
#define EDGE	1E10
/* smallest of three values */
#define MINOF(x,y,z) (((x) < (y)) ? (((x) < (z)) ? (x) : (z)) : (((y) < (z)) ? (y) : (z)))
/* 1, 2 or 3 according to whether x, y or z is the smallest (ties favour the later one) */
#define MINOFIDX(x,y,z) (((x) < (y)) ? (((x) < (z)) ? 1 : 3) : (((y) < (z)) ? 2 : 3))

/* Dynamic Programming match
 *
 * Aligns the speech contour spen[] (one table row per frame) against the
 * artificial contour uten[] (one table column per frame).  Only two rows
 * of local and accumulated distances are kept; the chosen predecessor of
 * every cell is written to direc[], row after row, for timereg() to trace.
 *
 * refnum	- number of columns (frames of uten)
 * newnum	- number of rows (frames of spen)
 *
 * On entry cudist1/cudist2/dist1/dist2 must each hold refnum floats and
 * direc must hold refnum*newnum bytes.  The line buffers are swapped after
 * every row, so the global pointers may be exchanged on return.
 */
void dpmatch(refnum,newnum)
int		refnum;		/* # horizontal frames */
int		newnum;		/* # vertical frames */
{
	register int	row,col;
	register float	diag,up,left;
	unsigned char	*dptr=direc;
	float		*swap;
	float		metric();

	/* nothing lies above the first row */
	for (col=0;col<refnum;col++)
		cudist1[col]=EDGE;

	/* fill the table one row at a time */
	for (row=0;row<newnum;row++) {
		for (col=0;col<refnum;col++) {
			if (col > 0) {
				/* ordinary cell: predecessors to the upper-left and left */
				diag=cudist1[col-1] + dist1[col-1];
				left=cudist2[col-1] + dist2[col-1];
			}
			else {
				/* first column: only the origin has a free start */
				diag=(row==0) ? 0.0 : EDGE;
				left=EDGE;
			}
			up = cudist1[col] + dist1[col];

			dist2[col] = metric(spen[row],uten[col]);
			cudist2[col] = dist2[col] + MINOF(diag,up,left);
			*dptr++ = MINOFIDX(diag,up,left);
		}

		/* the row just filled becomes the previous row */
		swap=cudist1;
		cudist1=cudist2;
		cudist2=swap;
		swap=dist1;
		dist1=dist2;
		dist2=swap;

		/* progress report every tenth row */
		if (((row%10)==9) && ttytest()) {
			printf("Locating Utterances: Frame %d/%d\r",row+1,newnum);
			fflush(stdout);
		}
	}
	if (ttytest())
		printf("Locating Utterances: Frame %d/%d\n",row,newnum);

}

/* Dynamic Programming time registration path
 *
 * Walks the direction table direc[] backwards from its last cell and
 * records, for each column, the time of the row the path was in when it
 * left that column.
 *
 * refnum	- number of columns in the table
 * newnum	- number of rows in the table
 * reg		- receives the path: entries 0..refnum-1 get row*framedur as
 *		  the path passes through, entry refnum gets numf*framedur
 *
 * The walk stops on reaching the first cell of the table without storing
 * an entry for it, so reg[0] keeps whatever value the caller put there.
 */
void timereg(refnum,newnum,reg)
int		refnum;
int		newnum;
double		*reg;
{
	unsigned char	*cell;
	register int	col;

	/* start from the bottom-right corner */
	cell = direc + refnum*newnum - 1;
	col = refnum - 1;
	reg[refnum] = numf*framedur;

	/* follow stored directions back towards the origin */
	while ((cell > direc) && (col >= 0)) {
		int	dir = *cell;

		if ((dir==DIAG) || (dir==LEFT)) {
			/* leaving this column: note the current row's time */
			reg[col--] = ((cell-direc) / refnum)*framedur;
			cell -= (dir==DIAG) ? (refnum + 1) : 1;
		}
		else if (dir==UP) {
			/* same column, previous row */
			cell -= refnum;
		}
	}
}

/* map a time on the artificial contour to a time in the speech signal
 *
 * t		- time to transfer (s)
 * reg		- registration path; entries are spaced framedur apart
 * refnum	- length of the registration path
 *
 * Returns the linear interpolation between the two path entries that
 * bracket t, or reg[refnum] when t falls in or beyond the last frame.
 */
double		newtime(t,reg,refnum)
double		t;		/* time to xfer */
double		*reg;		/* registration path */
int		refnum;		/* length of registration path */
{
	int	lo = (int)(t/framedur);	/* path entry at or before t */
	int	hi = lo + 1;		/* path entry after t */
	double	frac;

	/* beyond the last interpolable pair: use the end marker */
	if (hi >= refnum)
		return(reg[refnum]);

	/* fraction of the way from entry lo to entry hi */
	frac = (t - lo*framedur)/framedur;

	return((1.0-frac)*reg[lo] + frac*reg[hi]);
}

/* distance metric for energy
 *
 * a,b	- the two energy values to compare
 *
 * Returns the absolute difference |a-b|.  fabs() is used rather than
 * squaring and taking the root, so there is no intermediate square that
 * could overflow or underflow.
 */
float metric(a,b)
double	a,b;
{
	return( fabs(a-b) );
}

/* main program */
void main(argc,argv)
int	argc;
char	*argv[];
{
	/* option decoding */
	extern int	optind;		/* option index */
	extern char	*optarg;	/* option argument ptr */
	int		errflg = 0;	/* option error flag */
	int		c;		/* option switch */
	int		it;		/* item selection */
	char		*ty;		/* item sub type */
	/* file variables */
	char		*sptype="0";
	int		fid;		/* input file descriptor */
	int		ofid;		/* output channel */
	float		low,avg,high;	/* speech energies */
	int		i,j;
	double		postime,newtime();
	int		backofflen,lastposn,thisposn;

	/* decode switches */
	while ( (c = getopt(argc,argv,"Ii:n:r:b:w:DW:l:")) != EOF ) switch (c) {
		case 'I' :	/* Identify */
			fprintf(stderr,"%s: Endpoint file of N utterances V%s\n",PROGNAME,PROGVERS);
			exit(0);
			break;
		case 'i' :	/* specific item */
			if (itspec(optarg,&it,&ty) == 0) {
				if (it == SP_TYPE)
					sptype = ty;
				else
					error("unsuitable item specifier %s",optarg);
			}
			else
				error("illegal item specifier %s",optarg);
			break;
		case 'n' :	/* number of utterances */
			numutter = atoi(optarg);
			if (numutter <= 0)
				error("bad utterance number '%s'",optarg);
			break;
		case 'r' :	/* mark/space ratio */
			msratio = atoi(optarg);
			if ((msratio < 0) || (msratio > 100))
				error("bad mark/space ratio '%s'",optarg);
			break;
		case 'b' :	/* back off time */
			backofftime=atof(optarg);
			break;
		case 'w' :	/* window size */
			framedur = atof(optarg);
			if ((framedur < 0.001) || (framedur > 1))
				error("specify windowtime in seconds");
			break;
		case 'W':	/* word list */
			if ((wp=fopen(optarg,"r"))==NULL)
				error("could not open '%s'",optarg);
			dowords=1;
			break;
		case 'l':	/* label stem */
			startstem = optarg;
			stopstem = "/";
			break;
		case 'D':
			debug++;
			break;
		case '?' :	/* unknown */
			errflg++;
	}
	if (errflg || (argc<2))
		error("usage: %s (-I) (-i item) (-n numutter) (-r markspace) (-b backoff) (-w windowtime) (-W wordlist) (-l labelstem) file",PROGNAME);

	/* get filename */
	if (optind < argc)
		strcpy(filename,sfsfile(argv[optind]));
	else
		error("no SFS file specified",NULL);

	/* open file and locate speech */
	if ((fid=sfsopen(filename,"w",NULL)) < 0)
		error("access error on '%s'",filename);
	if (!sfsitem(fid,SP_TYPE,sptype,&spitem))
		error("cannot find input SP item in '%s'",filename);

	/* create output item header */
	sfsheader(&anitem,AN_TYPE,-1,1,-1,
			spitem.frameduration,spitem.offset,
			0,0,1);
	sprintf(anitem.history,"%s(%d.%02d;utterances=%d,markspace=%d,windowtime=%g,backoff=%g,type=endpoints)",
		PROGNAME,
		spitem.datatype,spitem.subtype,
		numutter,msratio,framedur,backofftime);

	/* get energy contour */
	numf = getspeechcontour(fid,&low,&avg,&high);

	/* get artificial contour and annotations */
	getuttercontour(numf,numutter,msratio,low,avg,high);

	/* get buffers */
	dist1=(float *)calloc(numf,sizeof(float));
	dist2=(float *)calloc(numf,sizeof(float));
	cudist1=(float *)calloc(numf,sizeof(float));
	cudist2=(float *)calloc(numf,sizeof(float));
	direc=(unsigned char *)malloc(numf * numf);
	if ((dist1==NULL) || (dist2==NULL) || (cudist1==NULL) || 
			(cudist2==NULL) || (direc==NULL))
		error("unable to get sufficient memory for buffers",NULL);

	/* do DP matching */
	dpmatch(numf,numf);

	/* free work buffers */
	free(dist1);
	free(dist2);
	free(cudist1);
	free(cudist2);

	/* calculate time registration path */
	reg = (double *)calloc(numf+1,sizeof(double));
	timereg(numf,numf,reg);

	if (debug) {
		/* save re-aligned artificial utterance */
		for (i=0;i<numf;i++)
			spen[i]=0.0;
		for (i=0;i<numf;i++) {
			postime = i*framedur;
			j=0;
			while ((j<numf) && (postime > reg[j])) j++;
			if (j<numf)
				spen[i] = uten[j];
			else
				spen[i] = uten[numf-1];
		}
		savetrack(spen,numf);
	}
	
	/* open output channel */
	if ((ofid = sfschannel(filename,&anitem)) < 0)
		error("unable to open output channel to '%s'",filename);	

	/* transfer annotations */
	backofflen = (int)(backofftime/anitem.frameduration);
	lastposn=0;
	for (i=0;i<2*numutter;i++) {

		/* calculate new position and length */
		postime = an[i].posn*anitem.frameduration;
		if (i%2) {
			/* stop markers */
			thisposn = (int)(0.5 + newtime(postime,reg,numf)/anitem.frameduration)
				+ backofflen;
			if (thisposn > spitem.numframes)
				thisposn = spitem.numframes-1;
		}
		else {
			/* start markers */
			thisposn = (int)(0.5 + newtime(postime,reg,numf)/anitem.frameduration)
				- backofflen;
			if (thisposn < 0) thisposn=0;
		}

		/* write previous annotation */
		if (i) {
			/* set up new position */
			an[i-1].posn = lastposn;
			an[i-1].size = thisposn - lastposn;

			/* put annotation */
			if (sfswrite(ofid,1,&an[i-1]) != 1)
				error("write error on output file ",NULL);
		}
		lastposn = thisposn;
	}
	/* write last annotation */
	if (i) {
		/* set up new position */
		an[i-1].posn = lastposn;
		an[i-1].size = spitem.numframes-lastposn;

		/* put annotation */
		if (sfswrite(ofid,1,&an[i-1]) != 1)
			error("write error on output file ",NULL);
	}

	/* close output file and update */
	if (!sfsupdate(filename))
		error("update error on %s",filename);

	/* that's all folks */
	if (wp) fclose(wp);
	exit(0);
}

