/* antrans -- transcribe orthographic annotations to phonetic symbols */

/* M.A.Huckvale - May 2004 */

/* version 1.0 */

/* program name and version: used for the -I banner, the usage message */
/* and the history string written to the output item */
#define PROGNAME "antrans"
#define PROGVERS "1.0"
/* global copy of the program name; not referenced elsewhere in this file */
/* NOTE(review): presumably read by the SFS library (e.g. error()) - confirm */
char *progname=PROGNAME;

/*-------------------------------------------------------------------*/
/**MAN
.TH ANTRANS 1 UCL SFS
.SH NAME
antrans - phonetically transcribe orthographic annotations
.SH SYNOPSIS
.B antrans
(-I) (-i item) (-x exceptions.txt) (-m missing.lst) (-s) (-A|-J) file
.SH DESCRIPTION
.I antrans
uses an inbuilt English pronunciation dictionary to transcribe
orthographic annotations into phonetic symbols.
The operation only changes the annotation label text; it
does not change the position of any annotation.
.PP
By default phonetic annotation is output in SAMPA format, but options
are available to output in ARPABET or JSRU symbols.
.PP
Unknown words are not transcribed, and may be reported to a
file for processing by hand.  You may then build an exceptions
dictionary file and include this in processing.
.PP
.I
antrans
assumes that the annotations describe chunks of the signal
separated by pauses (as generated by "npoint -a" for example).
To this end, it adds a pseudo "silence" symbol "/" at the start
and end of each chunk, and also converts any chunk that is
only annotated with "/" to the SAMPA pause symbol "...".
.PP
.I Options:
.TP 11
.B -I
Identify the program name and version.
.TP 11
.BI -i item
Select input item number.
.TP 11
.BI -x exceptions.txt
A text file of pronunciation exceptions.  These are used in
preference to inbuilt dictionary.  The format of this file
is <word><TAB><pronunciation><NEWLINE> where pronunciation is done
using SAMPA symbols, e.g.
.nf
Amsterdam	%{mst@"d{m
.fi
.TP 11
.BI -m missing.lst
This option causes the program to generate a list of those words
missing from the dictionary into the supplied file.
.TP 11
.B -s
Include stress markings (where available) in output transcription.
.TP 11
.B -A
Output symbols in ARPABET format, as used in the BEEP dictionary.
.TP 11
.B -J
Output symbols in JSRU format.
.SH VERSION/AUTHOR
1.0 - Mark Huckvale
*/
/*---------------------------------------------------------------*/

/* include files */
#include "SFSCONFG.h"
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <ctype.h>
#include <malloc.h>
#include <math.h>
#include "sfs.h"
#include "c:\src\prosynth\pron\prondict.h"

/* global data */
struct item_header	anitem;		/* input annotation item */
/* annotation records read from the input item; main() replaces each */
/* label with its transcription and writes this same buffer out */
struct an_rec		*an;
struct item_header	opitem;		/* output annotation item */
/* second annotation buffer allocated in main(); not otherwise used in */
/* this file */
struct an_rec		*oan;
/* name of the SFS data file being processed */
char	filename[SFSMAXFILENAME];
/* exceptions dictionary file (-x); empty string when not supplied */
char	xfilename[SFSMAXFILENAME];
/* missing-words list file (-m); empty string when not supplied */
char	mfilename[SFSMAXFILENAME];
/* output symbol set: -A sets doarpa, -J sets dojsru; both zero = SAMPA */
int		doarpa=0;
int		dojsru=0;
/* non-zero (-s): getsym() keeps stress marks instead of dropping them */
int		dostress=0;

/* list of input words */
/* distinct words from all labels, kept in strcmp() order by strtable() */
/* so that strfind() can binary-search it; wcnt is the number of entries */
char	**wtab;
int		wcnt;

/* list of matching pronunciations */
/* ptab[i] is the pronunciation for wtab[i], or NULL when none was found */
char	**ptab;

/* list of exceptions */
/* loaded and sorted by word in loadexcept(), searched by findexcept(); */
/* xcnt is the number of valid entries */
struct except_rec {
	char	*word;
	char	*pron;
} *xtab;
int	xcnt;

/*============================== prondict access =============================*/
/* pair table filled by bfinit(): code c is a literal when left[c]==c, */
/* otherwise it expands to left[c] followed by right[c]; stack[] holds */
/* bytes pushed by bfnext() that are waiting to be expanded or returned */
static unsigned char left[256],right[256],stack[256];
/* bounds of the compressed data following the pair table, and the */
/* current read position within it */
static unsigned char *bfstart,*bfend,*bfpos;
/* bfsize = number of compressed bytes; bfstack = entries in stack[] */
static int bfsize,bfstack;

/*
 * Prepare the decompressor for a compressed buffer.
 *
 * buf  - compressed data, beginning with the pair table
 * size - total number of bytes in buf
 *
 * The pair table is a sequence of runs.  Each run starts with a count
 * byte: a value above 127 first skips (count-127) codes, which stay
 * literal, and is then treated as a count of zero.  (count+1) entries
 * follow, each a "left" byte and, when that differs from the code being
 * defined, a "right" byte.  Reading stops once code 256 is reached.
 * The bytes after the table become the stream read by bfnext().
 */
static void bfinit(unsigned char *buf,int size)
{
	unsigned char	*rd = buf;
	int	code = 0;
	int	run, k;

	/* every code starts out standing for itself */
	for (k = 0; k < 256; k++)
		left[k] = k;

	/* read pair table */
	while (1) {
		run = *rd++;
		if (run > 127) {
			/* skip a range of literal codes */
			code += run - 127;
			run = 0;
		}
		if (code == 256)
			break;

		for (k = 0; k <= run; k++) {
			left[code] = *rd++;
			/* a literal entry has no right-hand byte */
			if ((int)(left[code]) != code)
				right[code] = *rd++;
			code++;
		}
		if (code == 256)
			break;
	}

	/* whatever follows the table is the compressed stream */
	bfstart = rd;
	bfsize = size - (rd - buf);
	bfend = bfstart + bfsize;
	bfpos = bfstart;
	bfstack = 0;
}

/*
 * Return the next decompressed byte, or -1 when nothing is left.
 *
 * A code c with left[c]==c is a literal and is returned directly.
 * Any other code is replaced by its pair: right[c] is pushed first so
 * that left[c] is on top of the stack and is therefore expanded and
 * returned first.
 *
 * The loop runs while bytes are pending on the stack OR compressed input
 * remains.  Testing only the input position would drop the bytes still
 * on the stack when the last input byte is a pair code.
 */
static int bfnext(void)
{
	int	c;

	/* decompress */
	while (bfstack || (bfpos < bfend)) {
		if (bfstack)
			c = stack[--bfstack];
		else
			c = *bfpos++;

		if (c==(int)(left[c]))
			return(c);

		/* pair code: queue both halves for expansion */
		stack[bfstack++] = right[c];
		stack[bfstack++] = left[c];
	}

	return(-1);
}

/*
 * Return a heap-allocated copy of str.  The caller owns the copy and
 * releases it with free().
 *
 * Allocation failure is fatal: a message is printed and the program
 * exits, so callers never receive NULL.  (Previously a failed malloc()
 * was passed straight to strcpy().)
 */
static char *strsave(char *str)
{
	size_t	len=strlen(str)+1;	/* include the terminator */
	char	*ptr=malloc(len);

	if (ptr==NULL) {
		fprintf(stderr,"out of memory\n");
		exit(1);
	}
	memcpy(ptr,str,len);
	return(ptr);
}

/*
 * Test whether annotation word w1 matches dictionary word w2.
 *
 * w1 is compared up to its end or to a '#' (which introduces a
 * pronunciation-number suffix and is not part of the word).  Characters
 * match if they are equal, or if the lower-cased w1 character equals
 * the w2 character; w2 is never case-folded.  Both words must end
 * together.
 *
 * Returns 1 on a match, 0 otherwise.
 */
static int wordmatch(char *w1,char *w2)
{
	while (*w1 && (*w1!='#')) {
		if (!*w2) return(0);
		if (*w1!=*w2) {
			/* tolower() is only defined for unsigned char values and
			   EOF, so plain (possibly negative) chars must be cast */
			if (tolower((unsigned char)*w1)!=(unsigned char)*w2)
				return(0);
		}
		w1++;
		w2++;
	}
	if (*w2) return(0);
	return(1);
}

/*
 * Look up words in the built-in pronunciation dictionary.
 *
 * words - num words to look up; a word may carry a "#n" suffix (n=1..9)
 *         selecting the n-th pronunciation of a word with several
 * num   - number of entries in words[] and prons[]
 * prons - receives, per word, a heap-allocated pronunciation string or
 *         NULL when the word is not in the dictionary
 * flags - unused
 *
 * The dictionary is read as a stream of entries, each a word followed
 * by its pronunciation.  Every field starts with a byte (<=32) giving
 * the number of leading characters kept from the previous field of the
 * same kind, followed by the new characters (>32).  The whole
 * dictionary is always scanned so that every pronunciation of a word is
 * collected; alternatives are joined with TABs and reduced to one at
 * the end.
 *
 * A word equal to "/" with no dictionary entry is given "/".
 *
 * Returns num.
 */
int	lookup(char **words,int num,char **prons,int flags)
{
	int	c,i,k;
	char	lword[1024];
	char	lphon[1024];
	char	lbuf[1024];
	char	*p,*sel;
	int	n;
	int	more=num;

	(void)flags;

	for (i=0;i<num;i++) prons[i]=NULL;

	strcpy(lword,"");
	strcpy(lphon,"");

	bfinit(prondict,PRONDICTLEN);
	c=bfnext();
	/* a negative value is never a valid prefix length */
	while (more && (c!=ENDOFDICT) && (c>=0)) {
		/* word: keep c characters of the previous word, append the rest */
		i = c;
		while (((c=bfnext())!=ENDOFDICT)&&(c>32))
			if (i < (int)sizeof(lword)-1) lword[i++] = c;
		lword[i]='\0';

		/* stream ended inside an entry: nothing usable follows */
		if (c<0) break;

		/* pronunciation: same scheme relative to the previous one */
		i = c;
		while (((c=bfnext())!=ENDOFDICT)&&(c>32))
			if (i < (int)sizeof(lphon)-1) lphon[i++] = c;
		lphon[i]='\0';

		for (i=0;i<num;i++) {
			if (wordmatch(words[i],lword)) {
				if (prons[i]) {
					/* further pronunciation of the same word: append
					   as a TAB-separated alternative, but only while
					   it still fits the scratch buffer */
					if (strlen(prons[i])+1+strlen(lphon) < sizeof(lbuf)) {
						sprintf(lbuf,"%s\t%s",prons[i],lphon);
						free(prons[i]);
						prons[i]=strsave(lbuf);
					}
				}
				else {
					prons[i]=strsave(lphon);
				}
			}
		}
	}

	/* reduce each list of alternatives to a single pronunciation */
	for (i=0;i<num;i++) if (prons[i]) {
		/* default is the first; "#n" selects the n-th.  An absent or
		   out-of-range number also means the first, so alternatives
		   are never left joined together */
		n = 1;
		if ((p=strrchr(words[i],'#'))!=NULL) {
			n = atoi(p+1);
			if ((n<1)||(n>9)) n = 1;
		}

		/* walk to the n-th alternative, settling for the last one
		   available when there are fewer than n */
		sel = strtok(prons[i],"\t");
		for (k=1;sel && (k<n);k++) {
			p = strtok(NULL,"\t");
			if (p==NULL) break;
			sel = p;
		}

		/* sel points inside prons[i]; move it to the front in place */
		if (sel && (sel!=prons[i]))
			memmove(prons[i],sel,strlen(sel)+1);
	}

	/* the silence marker stands for itself */
	for (i=0;i<num;i++)
		if ((prons[i]==NULL)&&(strcmp(words[i],"/")==0))
			prons[i]=strsave("/");

	return(num);
}

/*============================================================================*/

/*
 * Binary search for string s in the table t of num strings, which must
 * be in ascending strcmp() order.
 *
 * Returns the index of the matching entry, or -1 if s is not present
 * (including when the table is empty).
 */
int strfind(char *s,char **t,int num)
{
	int	lo = 0;
	int	hi = num - 1;

	while (lo <= hi) {
		int	mid = (lo + hi) / 2;
		int	cmp = strcmp(s,t[mid]);

		if (cmp == 0)
			return(mid);
		if (cmp > 0)
			lo = mid + 1;
		else
			hi = mid - 1;
	}
	return(-1);
}

/*
 * Add string s to the sorted table t unless it is already there.
 *
 * t holds *num strings in ascending strcmp() order.  A new string is
 * stored as a private copy at its sorted position and *num is
 * incremented.  The caller must provide room for the extra entry.
 */
void strtable(char *s,char **t,int *num)
{
	int	slot;

	/* nothing to do when the string is already in the table */
	if (strfind(s,t,*num) >= 0)
		return;

	/* open a gap by shifting larger entries up one place */
	for (slot = *num; (slot > 0) && (strcmp(s,t[slot-1]) < 0); slot--)
		t[slot] = t[slot-1];

	t[slot] = strsave(s);
	*num += 1;
}

/*
 * qsort() comparison function for exception records: orders two
 * struct except_rec entries by strcmp() of their word fields.
 */
int compexcept(const void *ep1, const void *ep2)
{
	const struct except_rec *lhs = ep1;
	const struct except_rec *rhs = ep2;

	return(strcmp(lhs->word,rhs->word));
}

/*
 * Load the exceptions dictionary from a text file into xtab/xcnt.
 *
 * Each usable line has the form <word><TAB><pronunciation>; lines
 * lacking either field are ignored.  The file is read twice: once to
 * count lines so the table can be sized, then again to fill it.  The
 * table is finally sorted by word so findexcept() can binary-search it,
 * and the number of entries loaded is reported on stderr.
 *
 * Failure to open the file or to allocate the table is reported
 * through error().
 */
void loadexcept(char *filename)
{
	FILE	*ip;
	char	line[1024];
	char	*w,*p;
	int	nline,nread;

	if ((ip=fopen(filename,"r"))==NULL)
		error("could not open '%s'",filename);

	/* first pass: count lines to size the table */
	nline=0;
	while (fgets(line,sizeof(line),ip)) nline++;
	rewind(ip);

	/* ask for at least one record: calloc(0,...) may return NULL, which
	   would be indistinguishable from a real failure */
	xtab = (struct except_rec *)calloc((nline>0)?nline:1,sizeof(struct except_rec));
	if (xtab==NULL)
		error("out of memory loading '%s'",filename);

	/* second pass: never read more lines than were counted, in case
	   the file has grown in the meantime */
	xcnt=0;
	for (nread=0;(nread<nline) && fgets(line,sizeof(line),ip);nread++) {
		w=strtok(line,"\t\r\n");
		p=strtok(NULL,"\r\n");
		if (w && *w && p && *p) {
			xtab[xcnt].word = strsave(w);
			xtab[xcnt].pron = strsave(p);
			xcnt++;
		}
	}
	fclose(ip);

	qsort(xtab,xcnt,sizeof(struct except_rec),compexcept);

	fprintf(stderr,"%d exceptions loaded from %s\n",xcnt,filename);
}

/*
 * Binary search for word in the sorted exceptions table xtab.
 *
 * The comparison is an exact strcmp() on the word field.
 * Returns the index of the matching record, or -1 if there is none.
 */
int findexcept(char *word)
{
	int	lo = 0;
	int	hi = xcnt - 1;

	while (lo <= hi) {
		int	mid = (lo + hi) / 2;
		int	cmp = strcmp(word,xtab[mid].word);

		if (cmp == 0)
			return(mid);
		if (cmp < 0)
			hi = mid - 1;
		else
			lo = mid + 1;
	}
	return(-1);
}

/*
 * Extract the next SAMPA symbol from a transcription string.
 *
 * src - remaining transcription text
 * dst - receives the symbol as a NUL-terminated string of one or two
 *       characters; at least three bytes must be available
 *
 * Characters that cannot start a symbol are skipped.  Stress marks
 * (' , " %) count as symbols only when dostress is set; otherwise they
 * are skipped too.  A second character is taken when the pair forms a
 * recognised two-character symbol.
 *
 * Non-standard input is then normalised: a bare A, i, u, O or 3 gets
 * the length mark ':', and the stress marks ' and , become " and %.
 *
 * Returns the position in src after the symbol, or NULL when no more
 * symbols remain.
 */
char * getsym(char *src,char *dst)
{
	/* recognised two-character symbols */
	static const char *digraphs[] = {
		"aI","aU","eI","e@","@U","OI","U@","I@",
		"tS","dZ",
		"A:","i:","u:","O:","3:"
	};
	size_t	k;

	dst[0]='\0';
	dst[1]='\0';
	dst[2]='\0';

	/* skip forward to a character allowed to start a symbol */
	for (;;) {
		if (!*src)
			return(NULL);
		if (strchr("abdefghijklmnprstuvwzADINOQRSTUVZ3{@/?x",*src))
			break;
		if (dostress && strchr("',\"%",*src))
			break;
		src++;
	}
	dst[0] = *src++;

	/* check for true digraphs */
	for (k=0;k<sizeof(digraphs)/sizeof(digraphs[0]);k++) {
		if ((dst[0]==digraphs[k][0]) && (*src==digraphs[k][1])) {
			dst[1] = *src++;
			break;
		}
	}

	/* patch up bad SAMPA.  The length mark is added only when no second
	   character was taken; testing for "not ':'" instead would turn
	   the diphthong "OI" into "O:" */
	if ((dst[1]=='\0') && strchr("AiuO3",dst[0])) dst[1]=':';
	else if (dst[0]=='\'') dst[0]='"';
	else if (dst[0]==',') dst[0]='%';

	return(src);
}

/*
 * Convert a SAMPA symbol to its ARPABET equivalent.
 *
 * sym - NUL-terminated SAMPA symbol
 *
 * Returns a pointer to a constant ARPABET string, or sym itself when
 * the symbol has no mapping and is passed through unchanged.
 */
char *arpasym(char *sym)
{
	/* Scanned in order.  An entry applies when its first character
	   matches and its second character is either 0 (any) or equal to
	   the symbol's second character, so specific pairs come before
	   the catch-all for the same first character. */
	static const struct {
		char		first;
		char		second;
		const char	*out;
	} map[] = {
		{ 'a', 'I', "ay" }, { 'a', 'U', "aw" },
		{ 'd', 'Z', "jh" },
		{ 'e', 'I', "ey" }, { 'e', '@', "ea" }, { 'e', 0, "eh" },
		{ 'h', 0, "hh" },
		{ 'i', 0, "iy" },
		{ 'j', 0, "y" },
		{ 't', 'S', "ch" },
		{ 'u', 0, "uw" },
		{ 'A', 0, "aa" },
		{ 'D', 0, "dh" },
		{ 'I', '@', "ia" }, { 'I', 0, "ih" },
		{ 'N', 0, "ng" },
		{ 'O', 'I', "oy" }, { 'O', 0, "ao" },
		{ 'Q', 0, "oh" },
		{ 'S', 0, "sh" },
		{ 'T', 0, "th" },
		{ 'U', '@', "ua" }, { 'U', 0, "uh" },
		{ 'V', 0, "ah" },
		{ 'Z', 0, "zh" },
		{ '3', 0, "er" },
		{ '{', 0, "ae" },
		{ '@', 'U', "ow" }, { '@', 0, "ax" },
		{ '/', 0, "sil" },
		{ '.', 0, "sil" }
	};
	size_t	k;

	for (k=0;k<sizeof(map)/sizeof(map[0]);k++) {
		if (map[k].first!=sym[0])
			continue;
		/* sym[0] is not NUL here, so sym[1] may be examined */
		if ((map[k].second==0) || (map[k].second==sym[1]))
			return((char *)map[k].out);
	}

	/* no mapping: pass the symbol through */
	return(sym);
}


/*
 * Convert a SAMPA symbol to its JSRU equivalent.
 *
 * sym - NUL-terminated SAMPA symbol
 *
 * Returns a pointer to a constant JSRU string, or sym itself when the
 * symbol has no mapping and is passed through unchanged.
 */
char *jsrusym(char *sym)
{
	/* Scanned in order.  An entry applies when its first character
	   matches and its second character is either 0 (any) or equal to
	   the symbol's second character, so specific pairs come before
	   the catch-all for the same first character. */
	static const struct {
		char		first;
		char		second;
		const char	*out;
	} map[] = {
		{ 'a', 'I', "ie" }, { 'a', 'U', "ou" },
		{ 'd', 'Z', "j" },
		{ 'e', 'I', "ai" }, { 'e', '@', "ei" }, { 'e', 0, "e" },
		{ 'h', 0, "h" },
		{ 'i', 0, "ee" },
		{ 'j', 0, "y" },
		{ 't', 'S', "ch" },
		{ 'u', 0, "uu" },
		{ 'A', 0, "ar" },
		{ 'D', 0, "dh" },
		{ 'I', '@', "ia" }, { 'I', 0, "i" },
		{ 'N', 0, "ng" },
		{ 'O', 'I', "oi" }, { 'O', 0, "aw" },
		{ 'Q', 0, "o" },
		{ 'S', 0, "sh" },
		{ 'T', 0, "th" },
		{ 'U', '@', "ur" }, { 'U', 0, "oo" },
		{ 'V', 0, "u" },
		{ 'Z', 0, "zh" },
		{ '3', 0, "er" },
		{ '{', 0, "aa" },
		{ '@', 'U', "oa" }, { '@', 0, "a" },
		{ '/', 0, "q" },
		{ '?', 0, "gx" },
		{ '%', 0, "'" },
		{ '.', 0, "q" }
	};
	size_t	k;

	for (k=0;k<sizeof(map)/sizeof(map[0]);k++) {
		if (map[k].first!=sym[0])
			continue;
		/* sym[0] is not NUL here, so sym[1] may be examined */
		if ((map[k].second==0) || (map[k].second==sym[1]))
			return((char *)map[k].out);
	}

	/* no mapping: pass the symbol through */
	return(sym);
}


/*
 * Append word src to the string in dst, preceded by a single space
 * when dst is not empty.  dst must be large enough for the result; no
 * length checking is done here.
 */
void addword(char *dst,char *src)
{
	char	*tail = dst + strlen(dst);

	/* separator only between words, never at the very start */
	if (tail != dst)
		*tail++ = ' ';
	strcpy(tail,src);
}

/*
 * Append the pronunciation src to the string in dst, one symbol at a
 * time with a space before each symbol when dst is not empty.
 *
 * Symbols are taken from src by getsym() and written as ARPABET when
 * doarpa is set, as JSRU when dojsru is set, and unchanged otherwise.
 * dst must be large enough for the result; no length checking is done
 * here.
 */
void addpron(char *dst,char *src)
{
	char	sym[256];
	char	*rest;
	char	*out;

	for (rest=getsym(src,sym);rest!=NULL;rest=getsym(rest,sym)) {
		/* choose the output form of this symbol */
		if (doarpa)
			out = arpasym(sym);
		else if (dojsru)
			out = jsrusym(sym);
		else
			out = sym;

		if (*dst) strcat(dst," ");
		strcat(dst,out);
	}
}

/*
 * Resolve linking-R markers in a space-separated symbol string, in
 * place.
 *
 * Each 'R' is a potential linking r.  It becomes 'r' when the next
 * symbol starts with a vowel character, and is otherwise removed.  The
 * vowel test covers the first characters of SAMPA vowels (including
 * '@', 'Q' and 'V', as in "@", "Q", "V") and the lower-case vowel
 * letters that begin vowel symbols in the ARPABET and JSRU output.
 *
 * Afterwards runs of white space are reduced to one character and
 * trailing white space is removed.
 *
 * NOTE(review): when stress marks are output, a stress mark standing
 * between 'R' and the vowel makes the 'R' count as non-linking.
 */
void rprocess(char *str)
{
	int	i,j;

	for (i=0;str[i];i++) {
		if (str[i]=='R') {
			/* the explicit NUL test is needed because strchr() also
			   matches the terminator of the vowel list */
			if ((str[i+1]==' ') && (str[i+2]!='\0') &&
			    strchr("aeiouAEIOU3{@QV",str[i+2]))
				str[i]='r';
			else
				str[i]=' ';
		}
	}

	/* squeeze out the gaps left by removed markers; isspace() needs an
	   unsigned char value */
	for (i=0,j=0;str[i];i++) {
		if ((i>0) && isspace((unsigned char)str[i]) &&
		    (isspace((unsigned char)str[i+1]) || (str[i+1]=='\0')))
			continue;
		str[j++] = str[i];
	}
	str[j]='\0';
}

/*
 * main program
 *
 * Steps:
 *  1. decode the command line;
 *  2. read the selected annotation item from the SFS file;
 *  3. build a sorted table of the distinct words in all labels;
 *  4. look the words up in the built-in dictionary, then let entries
 *     from the exceptions file (-x) override them;
 *  5. report how many words have no pronunciation and optionally list
 *     them (-m);
 *  6. rewrite each label as "/ <symbols> /" (a label holding only "/"
 *     becomes "..."), leaving unknown words as they are;
 *  7. save the result as a new annotation item.
 *
 * Copies into fixed-size buffers are length-checked, and growth of the
 * transcription buffer is bounded before each word is added.
 */
int main(int argc,char *argv[])
{
	/* option decoding */
	extern int	optind;		/* option index */
	extern char	*optarg;	/* option argument ptr */
	int		errflg = 0;	/* option error flag */
	int		c;		/* option switch */

	/* processing variables */
	int		fid;
	FILE		*op;
	int32		it;
	char		*ty;
	char		*antype="0";
	int			i,idx;
	int			mcnt=0;
	char		buff[256];
	char		obuff[2048];
	char		*p;
	char		*src;		/* text about to be added to obuff */
	int			known;		/* current word has a pronunciation */

	/* decode switches */
	while ( (c = getopt(argc,argv,"Ii:x:m:sAJ")) != EOF )
		switch (c) {
		case 'I' :	/* Identify */
			fprintf(stderr,"%s: Annotation transcription V%s\n",PROGNAME,PROGVERS);
			exit(0);
			break;
		case 'i' :	/* item number */
			if (itspec(optarg,&it,&ty)==0) {
				if (it==AN_TYPE)
					antype=ty;
				else
					error("bad item specification",NULL);
			}
			else
				error("illegal item specification",NULL);
			break;
		case 'x' :	/* exceptions dictionary */
			if (strlen(optarg)>=sizeof(xfilename))
				error("exceptions filename too long '%s'",optarg);
			strcpy(xfilename,optarg);
			break;
		case 'm' :	/* missing list */
			if (strlen(optarg)>=sizeof(mfilename))
				error("missing list filename too long '%s'",optarg);
			strcpy(mfilename,optarg);
			break;
		case 's':	/* do stress marks */
			dostress=1;
			break;
		case 'A' :	/* ARPABET output */
			doarpa=1;
			dojsru=0;
			break;
		case 'J' :	/* JSRU output */
			doarpa=0;
			dojsru=1;
			break;
		case '?' :	/* unknown */
			errflg++;
	}
	/* check command line */
	if (errflg || (argc<2))
		error("usage: %s (-I) (-i item) (-x exceptions.txt) (-m missing.lst) (-s) (-A|-J) file",PROGNAME);

	/* get data filename */
	if (optind < argc)
		strcpy(filename,sfsfile(argv[optind]));
	else
		error("no data file specified",NULL);

	/* open data file */
	if ((fid=sfsopen(filename,"w",NULL)) < 0)
		error("access error on '%s'",filename);

	/* find input annotation item */
	if (!sfsitem(fid,AN_TYPE,antype,&anitem))
		error("cannot find input AN item in '%s'",filename);

	/* get annotation buffers */
	/* NOTE(review): why length is cleared before the second sfsbuffer()
	   call is not evident from this file */
	an=(struct an_rec *)sfsbuffer(&anitem,anitem.numframes);
	anitem.length=0;
	oan=(struct an_rec *)sfsbuffer(&anitem,anitem.numframes);

	/* load annotations */
	sfsread(fid,0,anitem.numframes,an);
	sfsclose(fid);

	/* count number of words (an upper bound on the distinct words) */
	wcnt=0;
	for (i=0;i<anitem.numframes;i++) {
		/* bounded copy: strtok() needs a scratch copy it can modify */
		strncpy(buff,an[i].label,sizeof(buff)-1);
		buff[sizeof(buff)-1]='\0';
		p = strtok(buff," \t\r\n");
		while (p && *p) {
			wcnt++;
			p = strtok(NULL," \t\r\n");
		}
	}

	/* allocate some buffers */
	wtab = (char **)calloc(wcnt,sizeof(char *));

	/* load the word strings */
	wcnt=0;
	for (i=0;i<anitem.numframes;i++) {
		strncpy(buff,an[i].label,sizeof(buff)-1);
		buff[sizeof(buff)-1]='\0';
		p = strtok(buff," \t\r\n");
		while (p && *p) {
			strtable(p,wtab,&wcnt);
			p = strtok(NULL," \t\r\n");
		}
	}
	fprintf(stderr,"%d different words found in %s item %d.%02d\n",
		wcnt,filename,anitem.datatype,anitem.subtype);

	/* allocate a pronunciation buffer */
	ptab = (char **)calloc(wcnt,sizeof(char *));

	/* look up the words */
	lookup(wtab,wcnt,ptab,0);

	/* check against exceptions */
	if (xfilename[0]) loadexcept(xfilename);
	if (xcnt>0) {
		for (i=0;i<wcnt;i++) {
			if ((idx=findexcept(wtab[i]))>=0) {
				if (ptab[i]) free(ptab[i]);
				ptab[i]=strsave(xtab[idx].pron);
			}
		}
	}

	/* report number missing */
	mcnt=0;
	for (i=0;i<wcnt;i++) if (ptab[i]==NULL) mcnt++;
	if (mcnt>0) fprintf(stderr,"%d words missing from dictionary\n",mcnt);

	/* output missing */
	if (mfilename[0]) {
		if ((op=fopen(mfilename,"w"))==NULL)
			error("could not open '%s'",mfilename);
		for (i=0;i<wcnt;i++)
			if (ptab[i]==NULL)
				fprintf(op,"%s\n",wtab[i]);
		fclose(op);
	}

	/* do transcription */
	for (i=0;i<anitem.numframes;i++) {
		strncpy(buff,an[i].label,sizeof(buff)-1);
		buff[sizeof(buff)-1]='\0';
		p = strtok(buff," \t\r\n");
		strcpy(obuff,"/");
		/* sentinel: if this byte is overwritten the transcription has
		   grown past the maximum label length */
		obuff[MAXANLABEL-1]='\0';
		while (p && *p) {
			idx=strfind(p,wtab,wcnt);
			known = ((idx>=0) && (ptab[idx]!=NULL));
			src = known ? ptab[idx] : p;

			/* worst case each source character yields a symbol of
			   three characters plus a separating space; the margin
			   leaves room for the closing " /" */
			if (strlen(obuff)+4*strlen(src)+4 >= sizeof(obuff)) {
				fprintf(stderr,"warning: transcription too long, rest of label dropped: %s\n",
					an[i].label);
				break;
			}

			if (known)
				addpron(obuff,src);
			else
				addword(obuff,p);
			p = strtok(NULL," \t\r\n");
		}
		/* process linking-r */
		if (strchr(obuff,'R')) rprocess(obuff);

		/* add trailing silence */
		if (strcmp(obuff,"/ /")==0)
			strcpy(obuff,"...");
		else if (strcmp(obuff,"/")!=0)
			strcat(obuff," /");

		/* check for length & save */
		if (obuff[MAXANLABEL-1]!='\0') {
			fprintf(stderr,"warning: transcription truncated: %s => %s\n",
				an[i].label,obuff);
		}
		obuff[MAXANLABEL-1]='\0';
		an[i].label = strsave(obuff);
	}

	/* create output header */
	sfsheader(&opitem,AN_TYPE,-1,1,-1,anitem.frameduration,anitem.offset,0,0,1);
	if (xfilename[0])
		sprintf(opitem.history,"%s(%d.%02d;type=phonetic,stress=%d,except=%s%s%s)",
			PROGNAME,anitem.datatype,anitem.subtype,dostress,xfilename,
			(doarpa)?",format=arpa":"",
			(dojsru)?",format=jsru":"");
	else
		sprintf(opitem.history,"%s(%d.%02d;type=phonetic,stress=%d%s%s)",
			PROGNAME,anitem.datatype,anitem.subtype,
			dostress,
			(doarpa)?",format=arpa":"",
			(dojsru)?",format=jsru":"");

	/* save output */
	putitem(filename,&opitem,anitem.numframes,an);

	/* that's all folks */
	if (!sfsupdate(filename))
		error("update error on '%s'",filename);
	exit(0);
	return(0);	/* not reached */
}
