/* repros -- change speaking rate and/or pitch (advanced version) */

/* M.A.Huckvale - University College London */

/* version 1.0 - August 1998
	- from repitch */

/* version 2.0 - February 2006
	- internal rewrite
*/

/* IAG is the compile-time debug switch: the diagnostic printf() blocks in
   this file are wrapped in #ifdef IAG.  It is forced off here. */
#undef IAG

/* program name (used in the -I banner, the usage message and the output
   item history) and version (used in the -I banner) */
#define PROGNAME "repros"
#define PROGVERS "2.0"
/* NOTE(review): progname is not referenced anywhere else in this file;
   presumably it is picked up by the SFS library for its messages - confirm */
char *progname=PROGNAME;

/*-------------------------------------------------------------------------*/
/**MAN
.TH REPROS 1 SFS UCL
.SH NAME
repros - change speaking rate and/or pitch of speech (advanced)
.SH SYNOPSIS
.B repros
(-I) (-i item) (-m MBROLAcontrolfile) (-v) (-j) (-O txoffset) (-P txpercent) file
.SH DESCRIPTION
.I repros
is a program to modify the pitch and duration of an utterance.
It uses the PSOLA algorithm and requires a set of pitch-epoch
annotations.  These can be generated from a Laryngograph signal
using Lx->Tx conversion.
.PP
Control data for the change in prosody is specified by a text
file in MBROLA format.  This file consists of a series of
lines, one per annotated segment in the target file.  Each line
contains: segment label, new segment duration in ms, new fx
contour specified as series of pairs: % position through segment,
fx value in Hz. The segment label '_' matches initial or final
silence in the file.
.SH OPTIONS
.TP 11
.B -I
Identify program name and version number.
.TP 11
.BI -i item
Select input item number.
.TP 11
.BI -m MBROLAcontrolfile
Specify MBROLA format control file for prosody change.
.TP 11
.BI -O txoffset
Shift TX with respect to speech (0.001s is typical value).  Default 0.
.TP 11
.BI -P txpercent
Reduce pitch windows to this %age of available time.  Default 100.
.TP 11
.B -j
Add pitch jitter to hide artifacts.
.TP 11
.B -v
Verbose mode.
.SH INPUT ITEMS
.IP SP 11
Speech item
.IP TX 11
Pitch epochs.
.IP AN 11
Segment labels.
.SH OUTPUT ITEMS
.IP SP 11
Prosody changed speech
.SH HISTORY
.IP MBROLA=
MBROLA format control file.
.SH VERSION/AUTHOR
.IP 2.0
Mark Huckvale
.SH SEE ALSO
respeed, repitch
.SH BUGS
*/
/*--------------------------------------------------------------------------*/

/* include files */
#include "SFSCONFG.h"
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <math.h>
#include <string.h>
#include <malloc.h>
#include <ctype.h>
#include "sfs.h"
/* generic helpers; both evaluate their arguments more than once, so do not
   pass expressions with side effects.  Neither is referenced in this file. */
#define MAX(x,y) (((x)>(y))?(x):(y))
#define ABS(x) (((x)>0)?(x):-(x))

/* global defines (times are in seconds) */
#define MINFX		50.0		/* below this is unvoiced (not referenced in this file) */
#define MAXVOICE	0.020		/* epoch interval of 20ms or longer = unvoiced; also the length of the start/end ramp */
#define MAXSTEP		0.016		/* max window step = 16ms; window half-length used for unvoiced speech */
#define MAXJITTER	0.0003		/* maximum pitch jitter (not referenced in this file) */

/* window flags: bits OR-ed into window_rec.flags */
#define WINFLAG_VOICED_LAST 		1	/* previous epoch interval voiced, and sample within MAXVOICE of last epoch */
#define WINFLAG_VOICED_THIS 		2	/* epoch interval containing the sample is voiced */
#define WINFLAG_VOICED_NEXT 		4	/* following epoch interval voiced, and sample within MAXVOICE of next epoch */

/* window record: main() keeps one of these per input speech sample (wtab[]),
   describing the pair of pitch epochs that bracket that sample */
struct window_rec {
	int	lastepoch;	/* sample number of last epoch (at or before this sample) */
	int	nextepoch;	/* sample number of next epoch (after this sample) */
	int	flags;		/* OR of WINFLAG_VOICED_LAST/THIS/NEXT */
};

/* segment record: one per annotated segment (stab[]), holding both the
   original timing taken from the annotations and the new timing taken from
   the control file.  All times are in seconds. */
struct segment_rec {
	char	*label;		/* segment name (heap copy made with strsave) */
	double	ostime;		/* original start time */
	double	odur;		/* original duration */
	double	nstime;		/* new start time (set by loadmbrola) */
	double	ndur;		/* new duration (initialised to odur, replaced by loadmbrola) */
};

/* input data */
struct item_header	spitem;		/* input speech */
short	*isp;					/* input speech samples (spitem.numframes of them) */
struct item_header	txitem;		/* input pitch epochs */
int		*tx;					/* intervals between successive epochs, in units of txitem.frameduration */
struct item_header	anitem;		/* input segment labels */
struct an_rec		*an;		/* annotation records (posn, size, label) */
short	*ofx;					/* output fx contour, one value per 10ms of output */
int		ofxcount;				/* number of entries in ofx[] */
struct window_rec	*wtab;		/* table of windows, indexed by input sample number */
int		wcount;					/* one more than the highest wtab[] index filled in */
struct segment_rec	*stab;		/* table of segments */
int		scount;					/* number of entries used in stab[] */

/* output data */
double	totdur;					/* total of the control file durations, in seconds */
int		ocount;					/* number of output samples */
struct item_header	opitem;		/* output speech */
short	*osp;					/* output samples as written to file */
float	*rosp;					/* real output signal */
float	*wosp;					/* real window sum */
int		*omap;					/* map output times to input times */

/* global data */
char	mfilename[SFSMAXFILENAME]; /* control file name */
int		verbose=0;	/* dump debugging info (set by -v; not otherwise read in this file) */
int		jitter=0;	/* set by -v's sibling -j; only recorded in the output history in this file */
int		njitter;	/* not referenced in this file */
double	txoffset=0;	/* -O: seconds added to the time of the first epoch */
double	txpercent=1.0;	/* -P value divided by 100; only recorded in the output history in this file */
int		donormwindow=1;	/* cleared by -W: when set, overlapadd accumulates window weights for normalisation */

/* make a heap copy of a string; reports "out of memory" through error()
   if the allocation fails */
char *strsave(char *str)
{
	size_t	nbytes;
	char	*copy;

	nbytes = strlen(str)+1;		/* room for the terminating NUL */
	copy = malloc(nbytes);
	if (!copy)
		error("out of memory");
	memcpy(copy,str,nbytes);
	return(copy);
}

/* test whether a label is one of the two silence markers "_" or "/";
   returns 1 if so, 0 otherwise */
int issilence(char *label)
{
	if (strcmp(label,"_")==0)
		return(1);
	if (strcmp(label,"/")==0)
		return(1);
	return(0);
}

/* compare two segment labels: any two silence labels count as equal
   (result 0); otherwise the result is that of strcmp() */
int complabel(char *lab1,char *lab2)
{
	int	bothsilent;

	bothsilent = issilence(lab1) && issilence(lab2);
	return(bothsilent ? 0 : strcmp(lab1,lab2));
}

/*
 * load MBROLA control file
 *
 * Each non-comment line is: label, duration in ms, then optional pairs of
 * (% position through segment, fx in Hz).  Lines starting with ';' and
 * blank lines are ignored.
 *
 * Reads the global segment table stab[0..scount-1] and sets:
 *	totdur		sum of all control file durations (seconds)
 *	ofx/ofxcount	target fx contour sampled every 10ms, linearly
 *			interpolated between the supplied points (all entries
 *			are -1 if the file supplies no fx values at all)
 *	stab[].nstime	new start time of every segment
 *	stab[].ndur	new duration of every segment named in the file
 *
 * A silence in the segment table that has no line in the control file is
 * skipped over and keeps its original duration.  Any other disagreement
 * between the labels is reported through error(), which is relied on not
 * to return (as elsewhere in this file).
 */
void loadmbrola(char *fname)
{
	static const char	delim[] = " \t\r\n";	/* token separators */
	FILE	*ip;
	char	line[1024];
	char	label[128];
	char	*p;
	int	dur;
	int	nfield;
	int	i,j,idx;
	double	t,pc;
	int	fx;
	int	f1,f2;

	if ((ip=fopen(fname,"r"))==NULL)
		error("could not find '%s'",fname);

	/* pass 1: find total duration */
	totdur=0;
	while (fgets(line,sizeof(line),ip)) {
		if (line[0]==';') continue;
		dur=0;
		/* field width keeps an over-long label inside label[] */
		nfield = sscanf(line,"%127s %d",label,&dur);
		if (nfield < 1) continue;	/* blank line */
		if (nfield < 2)
			error("missing duration for segment '%s' in '%s'",label,fname);
		if (dur==0)
			error("zero length segment in '%s'",fname);
		if (dur<0)
			error("negative length segment in '%s'",fname);
		totdur += dur/1000.0;
	}
#ifdef IAG
	printf("totdur=%g\n",totdur);
#endif
	rewind(ip);

	/* allocate buffer for output Fx: 10ms steps, with 0.5s spare */
	ofxcount = (int)(0.5+(0.5+totdur)/0.01);
	if ((ofx=(short *)calloc(ofxcount,sizeof(short)))==NULL)
		error("out of memory");

	/* pass 2: run through control lines, relating to segments on file */
	if (scount>0) stab[0].nstime=0.0;
	i=0;
	while (fgets(line,sizeof(line),ip)) {
		if (line[0]==';') continue;
		p = strtok(line,delim);
		if (p==NULL) continue;		/* blank line */

		if ((i>0)&&(i<scount))
			stab[i].nstime = stab[i-1].nstime+stab[i-1].ndur;

		/* a silence on file that is absent from the control file is stepped over */
		if ((i<scount) && (complabel(p,stab[i].label)!=0) && issilence(stab[i].label)) {
			i++;
			if (i<scount)
				stab[i].nstime = stab[i-1].nstime+stab[i-1].ndur;
		}

		/* labels must now agree, and there must be a segment left to match */
		if ((i>=scount) || (complabel(p,stab[i].label)!=0)) {
			fprintf(stderr,"Line %d. Target:%s Control-File:%s\n",i,(i<scount)?stab[i].label:"(none)",p);
			error("segment mismatch between control file and target file");
		}

		/* new duration */
		p = strtok(NULL,delim);
		if (p==NULL)
			error("missing duration for segment '%s' in '%s'",stab[i].label,fname);
		stab[i].ndur = atoi(p)/1000.0;

		/* store FX values: pairs of (% position, fx) */
		p = strtok(NULL,delim);
		while (p && isdigit((unsigned char)*p)) {
			pc = atoi(p)/100.0;
			p = strtok(NULL,delim);
			if (p && isdigit((unsigned char)*p)) {
				fx = atoi(p);
				t = stab[i].nstime + pc*stab[i].ndur;
				idx = (int)(t/0.01);
				if ((0 <= idx) && (idx < ofxcount))
					ofx[idx]=fx;
			}
			p = strtok(NULL,delim);
		}
		i++;
	}
	fclose(ip);

	/* segments left unmatched follow on after the last matched one, rather
	   than being left with a start time of zero */
	for (;i<scount;i++)
		if (i>0) stab[i].nstime = stab[i-1].nstime+stab[i-1].ndur;

	/* now join up Fx values with linear interpolation; entries before the
	   first point take its value, entries after the last point take its value */
	idx=0;
	f1=-1;
	for (i=0;i<ofxcount;i++) {
		f2 = ofx[i];
		if (f2 > 0) {
			if (f1<0) f1=f2;
			for (j=idx;j<i;j++)
				ofx[j] = f1 + (j-idx)*(f2-f1)/(i-idx);
			f1 = f2;
			idx = i;
		}
	}
	for (j=idx;j<ofxcount;j++) ofx[j] = f1;
}

/* scratch buffer holding the window built by overlapadd */
float	wbuf[409600];

/*
 * overlap-add one windowed stretch of input into the output accumulators
 *
 *	tpos	input sample at the centre of the window
 *	ltime	length in samples of the rising half (before the centre)
 *	rtime	length in samples of the falling half (from the centre on)
 *	opos	current output position
 *	offset	step from opos to the output position of the window centre
 *
 * The window is a raised cosine whose two halves may differ in length.
 * Output samples outside 0..ocount-1 are skipped; input samples outside
 * the signal contribute nothing to rosp[] but the window weight is still
 * recorded in wosp[].  Returns the new output position, opos+offset.
 */
int	overlapadd(int tpos,int ltime,int rtime,int opos,int offset)
{
	int	centre = opos+offset;
	int	inbase = tpos-ltime;		/* input sample under wbuf[0] */
	int	outbase = centre-ltime;		/* output sample under wbuf[0] */
	int	wlen = ltime+rtime;
	double 	step;
	int	k,in,out;

	/* rising half */
	step = 2.0*M_PI/(2*ltime+1);
	for (k=0;k<ltime;k++)
		wbuf[k] = (float)(0.5 - 0.5*cos((k+1)*step));

	/* falling half */
	step = 2.0*M_PI/(2*rtime+1);
	for (k=0;k<rtime;k++)
		wbuf[ltime+k] = (float)(0.5 - 0.5*cos((k+1+rtime)*step));

	for (k=0;k<wlen;k++) {
		out = outbase+k;
		if ((out<0)||(out>=ocount))
			continue;
		in = inbase+k;
		if ((in>=0)&&(in<spitem.numframes))
			rosp[out] += isp[in]*wbuf[k];
		if (donormwindow)
			wosp[out] += wbuf[k];
		else
			wosp[out] = 1.0;
	}

	return(centre);
}

/*
 * re-synthesize: convert the accumulated signal to 16-bit samples
 *
 *	obuf	output samples (slen of them)
 *	sbuf	accumulated windowed signal
 *	wbuf	accumulated window weight for each sample
 *	slen	number of samples
 *
 * Each output sample is sbuf/wbuf truncated towards zero.  A sample with
 * zero weight is reported on stderr and set to 0.  Results outside the
 * range of a short are clipped to -32768..32767: converting an
 * out-of-range float to short is undefined, and the range can be exceeded
 * when the weights are not a true window sum.
 */
void outsynth(short *obuf,float *sbuf,float *wbuf,int slen)
{
	int	i;
	float	v;

	for (i=0;i<slen;i++,obuf++,sbuf++,wbuf++) {
		if (*wbuf==0.0) {
			fprintf(stderr,"zero window\n");
			*obuf = 0;
		}
		else {
			v = *sbuf / *wbuf;
			if (v >= 32767.0f)
				*obuf = 32767;
			else if (v > -32768.0f)
				*obuf = (short)v;
			else
				*obuf = -32768;	/* too negative (or not a number) */
		}
	}
}

/* choose a window time for speech with no usable pitch period.  Despite
   the name the result is fixed: MAXSTEP converted to a whole number of
   speech samples. */
int 	randomwin()
{
	double	nsamp = MAXSTEP/spitem.frameduration;

	return((int)nsamp);
}

/* main program */
void main(argc,argv)
int	argc;
char	*argv[];
{
	/* option decoding */
	extern int	optind;		/* option index */
	extern char	*optarg;	/* option argument ptr */
	int		errflg = 0;		/* option error flag */
	int		c;				/* option switch */
	int		it;				/* item selection */
	char	*ty;			/* item sub type */
	char	*sptype="0";
	char	*txtype="0";
	char	*antype="0";
	/* file variables */
	char	filename[SFSMAXFILENAME]; /* SFS data file name */
	int		fid;		/* input file descriptor */
	int		ofid;
	int		cursamp;
	int		framelen,offset;
	struct window_rec *wptr;
	double	t,t0,t1,t2;
	int		i,j,it0,it1,it2;
	int		lvoice,tvoice,nvoice;
	int		ipos,lpos,tpos,npos;
	int		ltime,rtime;

	/* decode switches */
	while ( (c = getopt(argc,argv,"Ii:m:vjO:P:W")) != EOF ) switch (c) {
		case 'I' :	/* Identify */
			fprintf(stderr,"%s: Change pitch and duration V%s\n",PROGNAME,PROGVERS);
			exit(0);
			break;
		case 'i' :	/* specific item */
			if (itspec(optarg,&it,&ty) == 0) {
				if (it == SP_TYPE)
					sptype=ty;
				else if (it == TX_TYPE)
					txtype=ty;
				else if (it == AN_TYPE)
					antype=ty;
				else
					error("unsuitable item specifier %s",optarg);
			}
			else
				error("illegal item specifier %s",optarg);
			break;
		case 'm' :	/* MBROLA control file */
			strcpy(mfilename,optarg);
			break;
		case 'j' :	/* jitter */
			jitter++;
			break;
		case 'v' :	/* verbose */
			verbose++;
			break;
		case 'O' :	/* tx offset */
			txoffset = atof(optarg);
			if ((txoffset < -0.01)||(txoffset > 0.01))
				error("tx offset too large");
			break;
		case 'P' :	/* txpercent */
			txpercent = atof(optarg)/100.0;
			break;
		case 'W' :	/* turn off window normalisation */
			donormwindow=0;
			break;
		case '?' :	/* unknown */
			errflg++;
	}
	if (errflg || (argc<2))
		error("usage: %s (-I) (-i item) (-m control_file) (-j) (-v) (-O txoffset) (-P txpercent) (-W) file",PROGNAME);

	/* get filename */
	if (optind < argc)
		strcpy(filename,sfsfile(argv[optind]));
	else
		error("no database file specified",NULL);

	if (mfilename[0]==0)
		error("no control file specified");

	/* open file */
	if ((fid=sfsopen(filename,"w",NULL))<0)
		error("access error on '%s'",filename);

	/* locate and read Tx */
	if (!sfsitem(fid,TX_TYPE,txtype,&txitem))
		error("unable to find input TX item in '%s'",filename);
	if ((tx = (int *)sfsbuffer(&txitem,txitem.numframes+1))==NULL)
		error("could not get memory");
	if (sfsread(fid,0,txitem.numframes,tx)!=txitem.numframes)
		error("read error on input");

	/* locate and read annotations */
	if (!sfsitem(fid,AN_TYPE,antype,&anitem))
		error("unable to find input AN item in '%s'",filename);
	if ((an = (struct an_rec *)sfsbuffer(&anitem,anitem.numframes))==NULL)
		error("could not get memory");
	if (sfsread(fid,0,anitem.numframes,an)!=anitem.numframes)
		error("read error on input");
	for (i=1;i<anitem.numframes;i++) {
		an[i-1].size = an[i].posn-an[i-1].posn;
	}

	/* locate and read input speech */
	if (!sfsitem(fid,SP_TYPE,sptype,&spitem))
		error("unable to find input SP item in '%s'",filename);
	if ((isp = (short *)sfsbuffer(&spitem,spitem.numframes))==NULL)
		error("could not get memory");
	if (sfsread(fid,0,spitem.numframes,isp)!=spitem.numframes)
		error("read error on input");
	an[anitem.numframes-1].size = (int)(spitem.numframes*spitem.frameduration/anitem.frameduration)-an[anitem.numframes-1].posn;

	/* apply a ramp at start and end of speech */
	j=(int)(MAXVOICE/spitem.frameduration);
	for (i=0;i<j;i++) {
		isp[i]=(isp[i]*i)/j;
		isp[spitem.numframes-i-1]=(isp[spitem.numframes-i-1]*i)/j;
	}

	/* check for zero length tx */
	if (txitem.numframes==0) {
		tx[0] = (int)((spitem.numframes*spitem.frameduration)/txitem.frameduration);
		txitem.numframes=1;
	}

	/* load annotations into segment table */
	scount = anitem.numframes+2;
	if ((stab=(struct segment_rec *)calloc(scount,sizeof(struct segment_rec)))==NULL)
		error("out of memory");
	if (!issilence(an[0].label)) {
		stab[0].label = strsave("_");
		stab[0].ostime = 0.0;
		stab[0].ndur = stab[0].odur = an[0].posn*anitem.frameduration;
		scount=1;
	}
	else
		scount=0;
	for (i=0;i<anitem.numframes;i++) {
		stab[scount].label = strsave(an[i].label);
		stab[scount].ostime = an[i].posn*anitem.frameduration;
		stab[scount].ndur = stab[scount].odur = an[i].size*anitem.frameduration;
		scount++;
	}
	if (!issilence(an[anitem.numframes-1].label)) {
		stab[scount].label = strsave("_");
		stab[scount].ostime = stab[scount-1].ostime+stab[scount-1].odur;
		stab[scount].ndur = stab[scount].odur = 0.0;
		scount++;
	}

	/* load control file, matching against annotations */
	loadmbrola(mfilename);

#ifdef IAG
printf("Segment table:\n");
for (i=0;i<scount;i++)
	printf("%2d. %4s %10g %10g %10g %10g\n",
		i,stab[i].label,stab[i].ostime,stab[i].odur,stab[i].nstime,stab[i].ndur);
#endif
#ifdef IAG
printf("Fx contour:\n");
for (i=0;i<ofxcount;i++) {
	printf("%3d ",ofx[i]);
	if ((i%10)==9) printf("\n");
}
printf("\n");
#endif

	/* make output header */
	sfsheader(&opitem,SP_TYPE,0,2,1,spitem.frameduration,spitem.offset,1,0,0);
	sprintf(opitem.history,"%s(%d.%02d,%d.%02d,%d.%02d;MBROLA=%s%s,txoffset=%g,txpercent=%g)",
			PROGNAME,
			spitem.datatype,spitem.subtype,
			txitem.datatype,txitem.subtype,
			anitem.datatype,anitem.subtype,
			mfilename,(jitter)?",jitter":"",txoffset,100*txpercent);
	ocount=(int)(0.5+totdur/spitem.frameduration);

	/* get output channel */
	if ((ofid=sfschannel(filename,&opitem))<0)
		error("could not open output channel to '%s'",filename);

	/* allocate memory */
	if ((wtab=(struct window_rec *)calloc(spitem.numframes,sizeof(struct window_rec)))==NULL)
		error("could not get memory buffer");
	if ((osp=(short *)sfsbuffer(&opitem,ocount))==NULL)
		error("could not get memory buffer");
	if ((rosp=(float *)calloc(ocount,sizeof(float)))==NULL)
		error("could not get memory buffer");
	if ((wosp=(float *)calloc(ocount,sizeof(float)))==NULL)
		error("could not get memory buffer");
	if ((omap=(int *)calloc(ocount,sizeof(int)))==NULL)
		error("could not get memory buffer");

	/* initialise window table */
	t0=0;
	t1=txoffset+tx[0]*txitem.frameduration;
	lvoice=tvoice=nvoice=0;
	for (i=1;i<=txitem.numframes;i++) {
		if (i==txitem.numframes)
			t2 = spitem.numframes*spitem.frameduration;
		else
			t2 = t1 + tx[i]*txitem.frameduration;
		tvoice=((t1-t0)<MAXVOICE) ? 1 : 0;
		if (t0==0) tvoice=0;
		nvoice=((t2-t1)<MAXVOICE) ? 1 : 0;
		it0=(int)(0.5+t0/spitem.frameduration);
		it1=(int)(0.5+t1/spitem.frameduration);
#ifdef IAG
	printf("Tx%04d it0=%d it1=%d lv=%d tv=%d nv=%d\n",i,it0,it1,lvoice,tvoice,nvoice);
#endif

		for (j=it0;(j<it1)&&(j<spitem.numframes);j++) {
			wtab[j].lastepoch = it0;
			wtab[j].nextepoch = it1;
			wtab[j].flags=0;
			if (lvoice && ((j-it0)*spitem.frameduration<MAXVOICE))
				wtab[j].flags |= WINFLAG_VOICED_LAST;
			if (tvoice)
				wtab[j].flags |= WINFLAG_VOICED_THIS;
			if (nvoice && ((it1-j)*spitem.frameduration<MAXVOICE))
				wtab[j].flags |= WINFLAG_VOICED_NEXT;
			wcount=j+1;
		}
		t0 = t1;
		t1 = t2;
		lvoice=tvoice;
	}

	/* align output with input */
	for (i=0;i<scount;i++) {
		it0=(int)(0.5+stab[i].nstime/spitem.frameduration);
		it1=(int)(0.5+(stab[i].nstime+stab[i].ndur)/spitem.frameduration);
#ifdef IAG
		printf("Stab%02d it0=%d it1=%d\n",i,it0,it1);
#endif
		for (j=it0;(j<=it1)&&(j<ocount);j++) {
			t=stab[i].ostime+(((j*spitem.frameduration)-stab[i].nstime)/stab[i].ndur)*stab[i].odur;
			omap[j]=(int)(0.5+t/spitem.frameduration);
			if (omap[j]<0) omap[j]=0;
			if (omap[j]>=spitem.numframes) omap[j]=spitem.numframes-1;
		}
	}

	/* processing loop */
	cursamp=0;
	while (cursamp < ocount) {
		/* find matching time in input signal */
		ipos = omap[cursamp];
		wptr = &wtab[ipos];
		if ((ipos - wptr->lastepoch) < (wptr->nextepoch - ipos)) {
			if ((wptr->lastepoch-1) < 0)
				lpos=0;
			else
				lpos = wtab[wptr->lastepoch-1].lastepoch;
			tpos = wptr->lastepoch;
			npos = wptr->nextepoch;
		}
		else {
			lpos = wptr->lastepoch;
			tpos = wptr->nextepoch;
			if (wptr->nextepoch >= wcount)
				npos=0;
			else
				npos = wtab[wptr->nextepoch].nextepoch;
		}
		if (npos==0) npos=spitem.numframes-1+(tpos-lpos);
#ifdef IAG
printf("ipos=%d lpos=%d tpos=%d npos=%d\n",ipos,lpos,tpos,npos);
#endif
		if (wptr->flags & WINFLAG_VOICED_THIS) {
			/* simple voiced */
			if (wptr->flags & WINFLAG_VOICED_LAST)
				ltime=tpos-lpos;
			else
				ltime=randomwin();
			if (wptr->flags & WINFLAG_VOICED_NEXT)
				rtime=npos-tpos;
			else
				rtime=randomwin();
			offset = (int)(1.0/(ofx[(int)(cursamp*spitem.frameduration/0.01)]*spitem.frameduration));
		}
		else if (!(wptr->flags & WINFLAG_VOICED_THIS)&&(wptr->flags & WINFLAG_VOICED_LAST)) {
			/* end of voiced section */
			ltime=tpos-lpos;
			rtime=ltime;
			offset = (int)(1.0/(ofx[(int)(cursamp*spitem.frameduration/0.01)]*spitem.frameduration));
		}
		else if (!(wptr->flags & WINFLAG_VOICED_THIS)&&(wptr->flags & WINFLAG_VOICED_NEXT)) {
			/* start of voiced section */
			rtime=npos-tpos;
			ltime=rtime;
			offset = (int)(1.0/(ofx[(int)(cursamp*spitem.frameduration/0.01)]*spitem.frameduration));
		}
		else {
			/* simple unvoiced */
			ltime=rtime=randomwin();
			offset = ltime;
			tpos=ipos;
		}
		if (cursamp==0)	{
//			printf("cursamp=%d tpos=%d ltime=%d rtime=%d offset=%d\n",0,tpos,0,offset,0);
			cursamp = overlapadd(tpos,0,offset,0,0);
//			printf("cursamp now=%d\n",cursamp);
		}
#ifdef IAG
	printf("cursamp=%d tpos=%d ltime=%d rtime=%d offset=%d\n",cursamp,tpos,ltime,rtime,offset);
#endif
		cursamp = overlapadd(tpos,ltime,rtime,cursamp,offset);

	}

	/* create output waveform */
	outsynth(osp,rosp,wosp,ocount);
	if (sfswrite(ofid,ocount,osp)!=ocount)
		error("output write error on '%s'",filename);

	/* that's all folks */
	if (!sfsupdate(filename))
		error("update error on '%s'",filename);
	exit(0);
}
