/* spectran -- speech waveforms to spectral coefficients by fourier transforms */

/* M.A.Huckvale - February 1987 */

/* version 1.1
	- SFS version
	- windowing bug fixed 
*/
/* version 1.2	- April 1988
	- window size options changed
*/
/* version 1.3 - December 1990
	- change windowing of signal
*/
/* version 1.4 - April 1994
	- allow change to number of values in spectrum returned
*/
/* version 1.5 - May 1995
	- allow change to window overlap & step properly
*/

#define PROGNAME "spectran"
#define PROGVERS "1.5"
char	*progname=PROGNAME;

/*--------------------------------------------------------------------------*/
/**MAN
.TH SPECTRAN 1 UCL SPAR
.SH NAME
spectran - transform speech waveforms to spectral data by fourier transform
.SH SYNOPSIS
.B spectran
(-I) [(-b bandwidth)|(-w window) (-s stepsize|-o overlap)] (-n numpixel) (-t offset) (-c) (-p) (-i item) file
.SH DESCRIPTION
.I spectran
performs a fourier transform on selected windows of a speech signal to
produce a set of spectral coefficients suitable for spectrographic 
examination.  The windows may be of fixed length with a selected overlap or selected stepsize, 
or may be positioned by means of a TX item (in which case the unvoiced periods
are analysed according to the fixed length parameters).  A Hamming 
window is used for all analyses.  Pre-emphasis may be de-selected.  This program does not use the
array processor.
.PP
.I Options
and their meanings are:
.TP 11
.B -I
Identify program and version number.
.TP 11
.BI -i item
Select input item number.
.TP 11
.BI -b bandwidth
Select analysis bandwidth in Hz. (default 300Hz).  This can be used as an alternative to specifying
window size directly.
.TP 11
.BI -w window
Select analysis window size in milliseconds.  Default is calculated from analysis bandwidth.
.TP 11
.BI -s stepsize
Select analysis window step size in milliseconds.  Default is calculated from analysis bandwidth.
.TP 11
.BI -o overlap
Select analysis window overlap in milliseconds. Default is calculated from analysis bandwidth.
.TP 11
.BI -n numpixel
Specify number of energies returned in spectrum.  Default 128.
Must be power of 2 between 32 and 16384.
.TP 11
.BI -t offset
Select excitation synchronous analysis with supplied offset in 
micro-seconds between tx markers and start point of analysis window.  
Negative is earlier, positive later, default is 0.
.TP 11
.B -c
Use cepstral filtering to smooth spectrum (cut off at 2ms).
.TP 11
.B -p
Do not pre-emphasize speech data.
.SH INPUT ITEMS
.IP 1.xx 11
Any speech waveform.
.IP 3.xx 11
(Optional) Excitation markers.
.SH OUTPUT ITEMS
.IP 11 11
Spectral coefficients.
.SH VERSION/AUTHOR
1.5 - Mark Huckvale (from 
.I flfft
and others by Lynn Whitaker & David Pearce)
*/
/*--------------------------------------------------------------------------*/

/* standard definitions */
#include "SFSCONFG.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <math.h>
#include "sfs.h"
#include "fft.h"

/* larger of two values; being a macro, each argument may be evaluated twice */
#define MAX(x,y) (((x)>(y))?(x):(y))

/* manifest constants */
#define DEFNCOEFF 128	/* default # spectral coefficients per frame */
int NCOEFF=DEFNCOEFF;	/* # coefficients actually produced; changed by the -n option */
#define CEPCUT 0.002	/* cepstral smoothing cut-off as a time in seconds (2ms = 1/500Hz) */
#define DEFBAND 300	/* default analysis bandwidth in Hz */

/* global data */
/* times below are held in seconds: main() converts the millisecond and
   microsecond command-line values when decoding the options */
struct item_header	spitem;		/* speech item header */
short			*sp;		/* speech buffer holding one analysis window */
struct item_header	txitem;		/* tx item header */
int			*tx;		/* tx buffer: period lengths in units of txitem.frameduration */
double			txoffset = 0.0;	/* analysis window offset from tx data (s), set by -t */
struct item_header	coitem;		/* coefficients item header */
int			lxsync = 0;	/* excitation synchronous = false; set by -t or by selecting a tx item */
double			band = DEFBAND;	/* analysis bandwidth (Hz), set by -b */
double			wisize = -1;	/* analysis window size (s); -1 = not specified */
double			stsize = -1;	/* analysis window step size (s); -1 = not specified */
double			ovsize = -1;	/* analysis window overlap (s); -1 = not specified */
float			preemp = 0.95;	/* pre-emphasis factor; -p sets it to 0.0 */
int			cepst = 0;	/* cepstral smoothing requested (-c) */
int			premp = 1;	/* pre-emphasis in use; cleared by -p, recorded in item history */

/* window position record */
/* one record per analysis window; filled in by initwm() or main() and
   consumed by fftsub() */
struct wm_rec {
	int	posn;	/* window location in speech samples */
	int	size;	/* window size in speech samples */
	int	flag;	/* voicing flag: 1 = voiced tx period, 0 = unvoiced/fixed window */
};				/* window marker structure */
struct wm_rec		*wmb;	/* window marker buffer */

/* calculate number of analysis windows required */
/*
 * Counts the windows that initwm() will generate from the tx data, using
 * the same rules: every tx period yields one window, and a period of 20ms
 * or more (treated as unvoiced) is first cut into step-sized windows for
 * as long as at least two steps remain.  The step is (wisize - ovsize),
 * taken from the global window settings.
 *
 * Side effect: a final tx period shorter than one step is merged into the
 * preceding period and txitem->numframes is reduced by one.  The merge is
 * done BEFORE counting, because the enlarged period may need more windows
 * than it did on its own; counting first could under-report the number of
 * windows that initwm() later writes into the marker buffer.
 *
 * Returns the number of windows, or 0 when there is no tx data or when
 * the step rounds to zero tx ticks (such a step could never advance
 * through an unvoiced period).  The caller treats a count <= 0 as a
 * window specification error.
 */
int calcwmsize(txitem,tx)
struct item_header	*txitem;	/* input tx data */
int			*tx;		/* tx buffer */
{
	int	size=0;		/* number of windows found */
	int	ms20;		/* number of tx ticks in 20ms (one 50Hz period) */
	int	wlen;		/* number of tx ticks in one window step */
	int	i,txval;

	/* calculate 20ms and window step in tx ticks */
	ms20 = (int)(0.5 + 0.02/txitem->frameduration);
	wlen = (int)(0.5 + (wisize - ovsize)/txitem->frameduration);

	/* nothing sensible can be counted without data or without a step */
	if ((txitem->numframes <= 0) || (wlen <= 0))
		return(0);

	/* fold a too-short last period into its predecessor (if it has one) */
	if ((txitem->numframes > 1) && (tx[txitem->numframes-1] < wlen)) {
		tx[txitem->numframes-2] += tx[txitem->numframes-1];
		txitem->numframes--;
	}

	/* investigate # windows required for each period */
	for (i=0;i<txitem->numframes;i++) {
		txval = tx[i];
		if (txval >= ms20) {
			/* unvoiced: one window per step while two steps remain */
			while (txval >= 2*wlen) {
				txval -= wlen;
				size++;
			}
		}
		/* the period itself, or what is left of it */
		size++;
	}

	return(size);
}

/* initialise window marker buffer */
/*
 * Fills wm[] with one record per analysis window derived from the tx
 * periods.  Time is accumulated in seconds starting at 'offset' and is
 * converted to speech samples with 'spdur'.
 *
 *  - a tx period shorter than 20ms is marked voiced (flag=1) and becomes
 *    a single window spanning the period;
 *  - a longer period is marked unvoiced (flag=0): while at least two
 *    steps remain it is cut into windows one step (wisize - ovsize)
 *    apart, and the remainder becomes a final window.
 *
 * Window positions before time zero are clamped to sample 0.  The last
 * window written is stretched so that it ends at sample 'nframes'.
 *
 * The buffer must have room for every window produced; calcwmsize()
 * applies the same rules to count them.  Nothing is written when the tx
 * item holds no frames.
 */
void initwm(txitem,tx,wm,spdur,offset,nframes)
struct item_header	*txitem;	/* tx item header */
int			*tx;		/* tx data */
struct wm_rec		*wm;		/* window marker buffer */
double			spdur;		/* speech signal sample duration */
double			offset;		/* tx offset from speech */
int			nframes;	/* # speech samples to cover */
{
	int	ms20;		/* number of tx ticks in 20ms (one 50Hz period) */
	int	wlen;		/* number of tx ticks in one window step */
	int	i,txval;
	double	totime;		/* start time of current window (s) */
	double	txtime;		/* end time of current window (s) */

	/* no periods means no windows; returning here also keeps the final
	   fix-up from stepping back before the start of the buffer */
	if (txitem->numframes <= 0)
		return;

	/* calculate 20ms and window step in tx ticks */
	ms20 = (int)(0.5 + 0.02/txitem->frameduration);
	wlen = (int)(0.5 + (wisize - ovsize)/txitem->frameduration);

	/* lay out the windows for each tx period */
	totime = offset;	/* location of zeroth tx w.r.t speech */
	for (i=0;i<txitem->numframes;i++) {
		txval = tx[i];
		if (txval >= ms20) {	/* unvoiced section */
			/* a step of zero ticks would never shorten txval, so
			   subdivision is only attempted with a positive step */
			while ((wlen > 0) && (txval >= 2*wlen)) {
				/* initialise one step-spaced window marker */
				txval -= wlen;
				txtime = totime + wlen*txitem->frameduration;
				wm->posn = (int)((totime > 0.0) ? 0.5 + (totime/spdur) : 0.0);
				/* NOTE(review): the window length here is taken from
				   the overlap (ovsize), whereas the fixed-window path
				   in main() uses wisize - confirm this is intended */
				wm->size = (int)(0.5 + ovsize/spdur);
				wm->flag = 0;
				wm++;
				totime = txtime;
			}
			wm->flag = 0;
		}
		else			/* voiced section */
			wm->flag = 1;
		/* initialise window marker data from (the rest of) the tx period */
		txtime = totime + txval*txitem->frameduration;
		wm->posn = (int)((totime > 0.0) ? 0.5 + (totime/spdur) : 0.0);
		wm->size = (int)(0.5 + txtime/spdur - wm->posn);
		wm++;
		totime = txtime;
	}

	/* fix-up last window to match waveform length */
	wm--;
	wm->size = nframes - wm->posn;
}

/* static fft arrays */
/*
 * Transforms are limited to FFTMAXLEN points.  The buffers are a little
 * longer than that: the in-place transform data occupies wlen+2 floats,
 * and the cepstral smoothing code in fftsub() clears entries up to index
 * wlen+5, so the slack keeps those writes inside the arrays when wlen is
 * at its maximum.
 */
#define FFTMAXLEN	32768		/* longest transform, in points */
#define FFTBUFLEN	(FFTMAXLEN+8)	/* transform data plus working slack */
float	fftbuf[FFTBUFLEN];
float	cepstbuf[FFTBUFLEN];

/* perform fft on speech waveform */
/*
 * Analyses one window of speech and fills in one coefficient record.
 *
 * The window is pre-emphasised (factor 'preemp'), zero padded to a power
 * of two that is at least 2*NCOEFF points, Hamming weighted and
 * transformed with REALFFT.  The resulting energies are reduced to NCOEFF
 * values in dB (10*log10, floored at 1e-6), averaging neighbouring bins
 * when the transform is longer than 2*NCOEFF points.  If the global
 * 'cepst' is set the log spectrum is first smoothed by discarding
 * cepstral terms beyond CEPCUT seconds.
 *
 * A window longer than FFTMAXLEN samples is truncated: a message is
 * printed and wm->size is overwritten with FFTMAXLEN.
 *
 * REALFFT comes from fft.h; it is assumed to work in place on wlen real
 * points given a length argument of wlen/2.
 */
void fftsub(sp,wm,coeff)
short			*sp;	/* speech waveform with sp[0]=wm->posn */
struct wm_rec		*wm;	/* window marker */
struct co_rec		*coeff;	/* returned coefficients */
{
	int	i,j,order,wlen,rep;
	float	omega,val,*xp;

	/* find power of two required: start from 2*NCOEFF points ... */
	wlen=NCOEFF*2;
	for (order=0,i=1;i<wlen;order++,i*=2) /* loop */;

	/* ... and double until the window fits; stop once past the limit so
	   that wlen cannot overflow for absurdly long windows */
	while ((order <= 15) && (((wm->size-1) >> order) > 0)) {
		order++;
		wlen *= 2;
	}
	if (order > 15) {
		fprintf(stderr,"analysis window truncated to 32768 points\n");
		wlen=FFTMAXLEN;
		wm->size=FFTMAXLEN;
	}

	/* put pre-emphasised speech data in floating point buffer; the first
	   sample has no predecessor and is zeroed rather than copied (copying
	   it was suspected of causing clicks) */
	xp = fftbuf;
	*xp++ = 0.0;
	sp++;
	for (i=1;i<wm->size;i++,sp++) *xp++ = (float)*sp - preemp * (float)*(sp-1);
	/* zero pad, including the two extra floats used by the transform */
	for (;i<=wlen+1;i++) *xp++ = 0.0;

	/* Hamming window; a window of fewer than two samples has no span to
	   divide by, so the phase step is simply left at zero */
	omega = (wm->size > 1) ? 8.0 * atan(1.0) / (wm->size - 1) : 0.0;
	xp = fftbuf;
	for (i=0;i<wm->size;i++,xp++)
		*xp *= (0.54 - (0.46 * cos(i * omega)));

	/* perform FFT */
	REALFFT(fftbuf,wlen/2,FORWARD);

	/* perform cepstral analysis if required */
	if (cepst) {
		/* build a symmetric log power spectrum centred on index
		   wlen/2-1, which receives the first (DC) value */
		cepstbuf[wlen/2-1] = 10.0*log10(MAX(fftbuf[0]*fftbuf[0],1e-6));
		xp = fftbuf+2;
		for (i=1;i<wlen/2;i++) {
			val = (*xp * *xp);
			xp++;
			val += (*xp * *xp);
			xp++;
			cepstbuf[wlen/2-i-1] = cepstbuf[wlen/2+i-1] = 10.0*log10(MAX(val,1e-6));
		}
		cepstbuf[wlen-1]=0.0;
		cepstbuf[wlen]=0.0;
		cepstbuf[wlen+1]=0.0;
		cepstbuf[wlen+2]=0.0;

		/* perform real-to-complex fft */
		REALFFT(cepstbuf,wlen/2,FORWARD);

		/* zero every term after index j, the number of speech samples
		   in CEPCUT seconds; this clears floats up to index wlen+5 */
		j = (int)(CEPCUT/spitem.frameduration);
		xp = &cepstbuf[2*j+2];
		for (i=j;i<=(wlen/2+1);i++) {
			*xp++ = 0.0;
			*xp++ = 0.0;
		}

		/* do inverse fft and rescale */
		REALFFT(cepstbuf,wlen/2,REVERSE);
		for (i=0;i<=wlen+2;i++)
			cepstbuf[i] /= wlen/2;

		/* copy smoothed energies into coefficient structure, starting
		   from the centre of the symmetric spectrum */
		xp = cepstbuf + wlen/2 - 1;
		rep = wlen/NCOEFF;	/* floats of transform per coefficient */
		if (rep==1) for (i=0;i<NCOEFF;i+=2) {
			/* one value shared by two coefficients; not reached while
			   wlen starts at 2*NCOEFF, kept for completeness */
			coeff->data[i] = *xp++;
			coeff->data[i+1] = coeff->data[i];
		}
		else if (rep==2) for (i=0;i<NCOEFF;i++) {
			coeff->data[i] = *xp++;
		}
		else for (i=0;i<NCOEFF;i++) {
			/* average rep/2 neighbouring dB values */
			val = 0.0;
			for (j=0;j<rep;j+=2) val += *xp++;
			coeff->data[i] = 2.0 * val / rep;
		}
	}
	else {
		/* copy raw energies into coefficient structure, starting after
		   the first (DC) pair */
		xp = fftbuf+2;
		rep = wlen/NCOEFF;	/* floats of transform per coefficient */
		if (rep==1) for (i=0;i<NCOEFF;i+=2) {
			/* one bin shared by two coefficients; not reached while
			   wlen starts at 2*NCOEFF, kept for completeness */
			val = *xp * *xp;
			xp++;
			val += *xp * *xp;
			xp++;
			coeff->data[i]=10.0*log10(MAX(val,1e-6));
			coeff->data[i+1]=coeff->data[i];
		}
		else if (rep==2) for (i=0;i<NCOEFF;i++) {
			val = *xp * *xp;
			xp++;
			val += *xp * *xp;
			xp++;
			coeff->data[i]=10.0*log10(MAX(val,1e-6));
		}
		else for (i=0;i<NCOEFF;i++) {
			/* mean energy of rep/2 neighbouring bins */
			val = 0.0;
			for (j=0;j<rep;j++,xp++) val += *xp * *xp;
			val = 2.0*val/rep;
			coeff->data[i]=10.0*log10(MAX(val,1e-6));
		}
	}

	/* copy over window specs */
	coeff->posn = wm->posn;
	coeff->size = wm->size;
	coeff->flag = wm->flag;
	coeff->mix  = (float)wm->flag;
	/* gain from the window length; a length below one sample is treated
	   as one so that the logarithm stays finite */
	coeff->gain = 20.0*log10((double)MAX(wm->size,1));
}

/* main program */
/*
 * Decodes the command line, works out the analysis windows (either at a
 * fixed step through the speech item or positioned from a tx item), runs
 * fftsub() on each window and writes the resulting coefficient item back
 * to the file.
 *
 * Window size, step and overlap are given in milliseconds and the tx
 * offset in microseconds; all are converted to seconds here.
 *
 * error() is the SFS error reporter; the code relies on it not returning.
 * Returns 0 on success.
 */
int main(argc,argv)
int	argc;
char	*argv[];
{
	/* option decoding */
	extern int	optind;		/* option index */
	extern char	*optarg;	/* option argument ptr */
	int		errflg = 0;	/* option error flag */
	int		c;		/* option switch */
	int		it;		/* item selections */
	char		*ty;
	char		*sptype="0";	/* default sub-type = last */
	char		*txtype="0";	/* default sub-type = last */
	/* file variables */
	char		filename[SFSMAXFILENAME];	/* dbase file name */
	int		fid,ofid;
	/* data variables */
	int		wmsize;		/* no. of analysis windows */
	int		wmax;		/* largest window */
	struct co_rec	*coeff;		/* standard coefficient record */
	double		totime;		/* window time (s) */
	int		i;

	/* decode switches (getopt signals the end of the options with -1) */
	while ( (c = getopt(argc,argv,"Ii:b:w:s:o:t:cpn:")) != -1 ) switch (c) {
		case 'I' :	/* Identify */
			fprintf(stderr,"%s: Spectral analysis by DFT V%s\n",PROGNAME,PROGVERS);
			exit(0);
			break;
		case 'i' :	/* specific item */
			if (itspec(optarg,&it,&ty) == 0) {
				if (it == SP_TYPE)
					sptype = ty;
				else if (it == TX_TYPE) {
					/* choosing a tx item selects excitation
					   synchronous analysis */
					txtype = ty;
					lxsync++;
				}
				else
					error("unsuitable item specifier %s",optarg);
			}
			else
				error("illegal item specifier %s",optarg);
			break;
		case 'b' :	/* analysis bandwidth */
			band = atof(optarg);
			break;
		case 'w' :	/* window size */
			wisize = (atof(optarg))/1E3;
			break;
		case 's' :	/* window stepsize */
			stsize = (atof(optarg))/1E3;
			break;
		case 'o' :	/* window overlap */
			ovsize = (atof(optarg))/1E3;
			break;
		case 't' :	/* tx offset */
			txoffset = (atof(optarg))/1E6;
			lxsync++;
			break;
		case 'c' :	/* cepstral smoothing required */
			cepst++;
			break;
		case 'p' :	/* do not pre-emphasize */
			premp--;
			preemp = 0.0;
			break;
		case 'n' :	/* number of values in spectrum */
			NCOEFF = atoi(optarg);
			if ((NCOEFF < 32) || (NCOEFF > 16384))
				error("number of coefficients out of range");
			/* the transform length is derived from this value and
			   must be a power of two */
			if ((NCOEFF & (NCOEFF-1)) != 0)
				error("number of coefficients must be a power of 2",NULL);
			break;
		case '?' :	/* unknown */
			errflg++;
			break;
	}
	if (errflg || (argc<2))
		error("usage: %s (-I) (-i item) (-b bandwidth) (-w window[ms]) (-s stepsize[ms]|-o overlap[ms]) (-n numpixel) (-t offset[us]) (-c) (-p) file",PROGNAME);

	/* get filename */
	if (optind < argc)
		strcpy(filename,sfsfile(argv[optind]));
	else
		error("no database file specified",NULL);

	/* check file ok for writing */
	if ((fid=sfsopen(filename,"w",NULL)) < 0)
		error("access error on %s",filename);

	/* load original tx */
	if (lxsync) {
		getitem(filename,TX_TYPE,txtype,&txitem,(void **)&tx);
		/* the window calculations read tx[] directly, so do not carry
		   on if no buffer was delivered */
		if (tx == NULL)
			error("unable to find input tx item in %s",filename);
	}

	/* locate speech data */
	if (!sfsitem(fid,SP_TYPE,sptype,&spitem))
		error("unable to find input speech item in %s",filename);

	/* calculate analysis window sizes */
	if (wisize > 0) {
		/* user has specified window size */
		if (stsize > 0) {
			/* user has specified step size */
			ovsize = wisize - stsize;
		}
		else if (ovsize < 0) {
			/* user has not specified overlap */
			/* NOTE(review): an overlap of exactly zero is rejected by
			   the check below, so -w without -s or -o always ends in
			   a specification error - confirm the intended default */
			ovsize=0;
		}
	}
	else {
		/* estimate from bandwidth, which must be usable as a divisor */
		if (band <= 0)
			error("analysis bandwidth must be greater than zero",NULL);
		wisize = 1.6/band;
		if (stsize > 0) ovsize = wisize - stsize;
		else if (ovsize < 0) ovsize = wisize - 0.26/band;
	}
	if ((ovsize <= 0) || (ovsize >= wisize))
		error("analysis window specification error",NULL);

	/* calculate/estimate size of window marker buffer */
	if (lxsync)
		wmsize = calcwmsize(&txitem,tx);
	else {
		wmsize = (int)(1.0 + (spitem.numframes*spitem.frameduration)/
				(wisize - ovsize));
	}
	if (wmsize <= 0)
		error("analysis window specification error",NULL);

	/* allocate window marker buffer (one spare entry) */
	wmb = (struct wm_rec *) calloc(wmsize+1,sizeof(struct wm_rec));
	if (wmb == NULL)
		error("could not get memory for window markers",NULL);

	/* initialise window marker buffer */
	if (lxsync)		/* using tx markers */
		initwm(&txitem,tx,wmb,spitem.frameduration,txoffset,spitem.numframes);
	else {			/* using fixed specs */
		wmb[0].posn = 0;
		wmb[0].size = (int)(0.5 + wisize/spitem.frameduration);
		wmb[0].flag = 0;
		totime = (wisize - ovsize);
		/* add windows one step apart while a whole window still fits;
		   wmsize is recounted here as the number actually laid out */
		for (wmsize=1;(int)((totime + wisize)/spitem.frameduration) <= spitem.numframes;wmsize++) {
			wmb[wmsize].posn = (int)(0.5 + totime/spitem.frameduration);
			wmb[wmsize].size = (int)(0.5 + wisize/spitem.frameduration);
			wmb[wmsize].flag = 0;
			totime += (wisize - ovsize);
		}
		/* last window runs to the end of the signal */
		wmb[wmsize-1].size = spitem.numframes - wmb[wmsize-1].posn;
	}

	/* find largest analysis window */
	for (wmax=0,i=0;i<wmsize;i++) wmax = (wmax > wmb[i].size) ? wmax : wmb[i].size;

	/* create new item header for spectral coefficients */
	if (lxsync) {
		sfsheader(&coitem,CO_TYPE,-1,4,NCOEFF+5,
			spitem.frameduration,spitem.offset+txoffset,0,0,1);
		sprintf(coitem.history,"%s(%d.%02d,%d.%02d;size=%d,window=%d,overlap=%d,offset=%d%s%s)",
			PROGNAME,spitem.datatype,spitem.subtype,
			txitem.datatype,txitem.subtype,
			NCOEFF,
			(int)(wisize*1000),
			(int)(ovsize*1000),
			(int)(txoffset*1E6),
			(cepst) ? ",smoothed" : "",
			(premp) ? ",preemph" : "");
	}
	else {
		sfsheader(&coitem,CO_TYPE,-1,4,NCOEFF+5,
			spitem.frameduration,spitem.offset+wisize/2,
			(int)(wisize/spitem.frameduration),
			(int)(ovsize/spitem.frameduration),0);
		sprintf(coitem.history,"%s(%d.%02d;size=%d,window=%d,overlap=%d%s%s)",
			PROGNAME,spitem.datatype,spitem.subtype,
			NCOEFF,
			(int)(wisize*1000),
			(int)(ovsize*1000),
			(cepst) ? ",smoothed" : "",
			(premp) ? ",preemph" : "");
	}
	sprintf(coitem.params,"minf=%d,maxf=%d",0,(int)(0.5/spitem.frameduration));

	/* open output channel */
	if ((ofid=sfschannel(filename,&coitem))<0)
		error("could not open temporary file",NULL);

	/* get data buffers: one coefficient record, and speech for the
	   largest window (a null result is taken to mean failure) */
	coeff = (struct co_rec *)sfsbuffer(&coitem,1);
	sp = (short *)sfsbuffer(&spitem,wmax);
	if ((coeff == NULL) || (sp == NULL))
		error("could not get memory for buffers",NULL);

	/* process speech frame by frame */
	for (i=0;i<wmsize;i++) {

		/* get speech */
		if (sfsread(fid,wmb[i].posn,wmb[i].size,sp) == 0)
			error("read error on input file",NULL);

		/* perform FFT analysis on this window */
		fftsub(sp,&wmb[i],coeff);

		/* write coefficients to file */
		if (sfswrite(ofid,1,coeff) != 1)
			error("write error on output file",NULL);

		/* print progress */
		if (((i%100)==99) && ttytest()) {
			printf("\rFrame %d/%d",i+1,wmsize);
			fflush(stdout);
		}
	}
	if (ttytest()) {
		printf("\rFrame %d/%d\n",i,wmsize);
		fflush(stdout);
	}

	/* update: a zero result from sfsupdate is reported as failure */
	if (!sfsupdate(filename))
		error("update error on %s",filename);

	/* that's all folks ... */
	return(0);
}

