/* ancomp -- compare two sets of annotations */

/* M.A.Huckvale - University College London */

/* version 1.0 - August 1999 */

/* version 1.1 - April 2000
	- add option to save raw matches
	- add option to save histogram
*/
/* version 2.0 - June 2004
	- divide into 3 modes:
		-a = alignment accuracy
		-f = frame labelling
		-l = labelling accuracy
*/

/* program name and version: printed by the -I switch, and the name is
   also inserted into the usage message */
#define PROGNAME "ancomp"
#define PROGVERS "2.0"
/* NOTE(review): progname is not referenced anywhere else in this file;
   presumably it is read by the SFS library -- confirm */
char *progname=PROGNAME;

/*-------------------------------------------------------------------------*/
/**MAN
.TH ANCOMP SFS1 UCL
.SH NAME
ancomp -- compare two sets of annotations
.SH SYNOPSIS
.B ancomp
(-a|-f|-l) (-r anitem) (-t anitem) (-m matchfile) (-h histfile) (-s samprate) file
.SH DESCRIPTION
.I ancomp
is a program to compare two sets of annotations.  It has three modes of operation
controlled by command-line switches.  In
.I alignment accuracy
mode (-a) the program measures the alignment accuracy of a set of
automatically aligned annotations against a reference set.
Statistics of mean alignment error and a breakdown of
accuracy per annotation label are provided.  It is assumed
that the labels themselves are correct.
In
.I frame labelling
mode (-f) the program samples the two annotation sets at a given frame
rate and counts how many times each possible pair of labels occurs.
A confusion matrix showing the mappings is produced.
In
.I labelling accuracy
mode (-l) the program only analyses the sequence of labels and not their
position.  The program performs a dynamic programming assignment of test labels
to reference labels to collect statistics on substitutions, insertions and
deletions.  A confusion matrix and labelling accuracy statistics are
produced.
.SH OPTIONS
.TP 11
.B -I
Identify program name and version number.
.TP 11
.BI -i item
Select input item number.
.TP 11
.BI -r anitem
Select reference annotation item.  Default first.
.TP 11
.BI -t anitem
Select test annotation item.  Default last.
.TP 11
.B -a
Select alignment accuracy mode (default).
.TP 11
.B -f
Select frame labelling mode.
.TP 11
.B -l
Select labelling accuracy mode.
.TP 11
.BI -m matchfile
Store raw information about matches into the given file rather than
producing a summary.  For frame labelling and
labelling accuracy mode this is a list of input-output label pairs suitable
for processing by the conmat(SFS1) confusion matrix printing program.  If
matchfile is "-" then the information is sent to the standard output.
.TP 11
.BI -h histfile
Store histogram of distances into given file.
.TP 11
.BI -s samprate
Specify sampling rate for frame labelling measurement.  Default:100.
.SH INPUT ITEMS
.IP AN
Reference annotation
.IP AN
Test annotation
.SH VERSION/AUTHOR
.IP 2.0
Mark Huckvale
*/
/*--------------------------------------------------------------------------*/

#include "SFSCONFG.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <malloc.h>
#include <ctype.h>
#include <math.h>
#include "sfs.h"

/* global data */
/* reference annotation item: header and the records loaded from the file */
struct item_header refitem;
struct an_rec *refan;
/* test annotation item: header and the records loaded from the file */
struct item_header tstitem;
struct an_rec *tstan;
/* operating mode flags: the -a, -f and -l switches set exactly one of them */
int		doalign=1;
int		doframe=0;
int		dolabel=0;
double	fdur = 0.01;	/* frame duration: step between samples in frame mode, 1/samprate when -s is given */

/* linked list of annotation label types */
/* one entry per distinct reference label, used in alignment mode */
struct antab_rec {
	char	*label;		/* label text (private copy made by strsave) */
	int	cnt;		/* number of distances accumulated */
	double	sum;		/* sum of absolute distances */
	double	sumsq;		/* sum of squared distances */
	double	min;		/* smallest signed distance recorded */
	double	max;		/* largest signed distance recorded */
	struct antab_rec *next;	/* next entry, NULL at end of list */
} *root;

/* totals over all labels, accumulated by adddistance() */
double	totsum;
double	totsumsq;
int	totcnt;
/* histogram of signed distances: 41 bins of width 0.005, bin i is
   reported against the value (i-20)*0.005 */
int	mshist[41];
/* -m option: file name for raw match output, and flag that it was given */
char	matchfilename[SFSMAXFILENAME];
int	savematch;
/* -h option: file name for histogram output, and flag that it was given */
char	histfilename[SFSMAXFILENAME];
int	savehist;

/* input & output dictionaries */
/* sorted arrays of distinct labels with their entry counts; in label mode
   only idict is filled and it holds both reference and test labels */
char	**idict;
int		idcnt;
char	**odict;
int		odcnt;
/* confusion counts indexed [input dictionary index][output dictionary index] */
int		**cmat;

/* DP matching */
/* dictionary index of every reference (rrec) and test (trec) annotation */
int	*rrec;
int	*trec;
/* DP grid indexed [reference annotation][test annotation] */
struct dist_rec {
	int	score;	/* cumulative mismatch score of best path to this cell */
	int	direc;	/* predecessor of this cell: 0=left, 1=up, 2=diagonal */
} **dist;

/*
 * strsave -- save a string in dynamic memory.
 *
 * str:  NUL-terminated string to copy.
 *
 * Returns a pointer to a private copy obtained from malloc().  If the
 * allocation fails a message is written to stderr and the program exits
 * with status 1, so the result is never NULL.
 */
char *strsave(char *str)
{
	size_t	len = strlen(str)+1;	/* include the terminating NUL */
	char	*ptr = malloc(len);

	if (ptr==NULL) {
		fprintf(stderr,"out of memory\n");
		exit(1);
	}
	memcpy(ptr,str,len);
	return(ptr);
}

/*
 * findan -- look up a label in the annotation table.
 *
 * Walks the linked list that starts at root and returns the first entry
 * whose label compares equal to str, or NULL when no entry matches.
 */
struct antab_rec *findan(char *str)
{
	struct antab_rec *entry;

	for (entry=root; entry!=NULL; entry=entry->next) {
		if (!strcmp(str,entry->label))
			return(entry);
	}
	return(NULL);
}

/*
 * addlabel -- add a label to the annotation table (if new).
 *
 * str:  label text; a private copy is stored in the table.
 *
 * Does nothing when the label is already present.  Otherwise a zeroed
 * entry (count and sums all 0) is linked in at the head of the list.
 * If memory for the entry cannot be obtained a message is written to
 * stderr and the program exits with status 1.
 */
void addlabel(char *str)
{
	struct antab_rec *p;

	if (findan(str)!=NULL) return;

	p = calloc(1,sizeof(struct antab_rec));
	if (p==NULL) {
		fprintf(stderr,"out of memory\n");
		exit(1);
	}
	p->label = strsave(str);
	p->next = root;
	root = p;
}

/*
 * adddistance -- add in a distance to table.
 *
 * str:  label whose table entry is to be updated; it must already have
 *       been entered with addlabel(), otherwise error() is called.
 * t:    signed distance to record.
 *
 * Updates the per-label and overall sums (of |t| and of t*t) and counts,
 * tracks the smallest and largest signed distance for the label, and,
 * when a histogram has been requested, increments the matching bin.
 */
void adddistance(char *str,double t)
{
	double	bin;
	struct antab_rec *p = findan(str);
	if (!p) error("could not find annotation");

	p->sum += fabs(t);
	p->sumsq += (t*t);
	if (p->cnt==0) {
		/* first value for this label: entries start zero-filled, so
		   seed the range here rather than comparing against 0 */
		p->min = t;
		p->max = t;
	}
	else {
		if (t < p->min) p->min = t;
		if (t > p->max) p->max = t;
	}
	p->cnt++;

	totsum += fabs(t);
	totsumsq += (t*t);
	totcnt++;

	if (savehist) {
		/* bin i covers [(i-20)*0.005,(i-19)*0.005).  floor() keeps
		   values just below the lowest bin out of bin 0, and the
		   range test is done before converting to int */
		bin = floor(200*t+20);
		if ((0<=bin)&&(bin<=40)) mshist[(int)bin]++;
	}
}

/*
 * anreport -- report annotation table.
 *
 * Prints to the standard output one line per table entry giving its
 * count, the mean and standard deviation of the absolute distances, and
 * the smallest and largest signed distance, followed by an OVERALL line
 * computed from the running totals.  An entry (or the total) with a zero
 * count is reported with mean and standard deviation of 0 rather than
 * dividing by zero.
 */
void anreport()
{
	struct antab_rec *p;
	double	mean,var;

	printf("Label                 Count     Mean   StdDev      Min      Max\n");
	printf("---------------------------------------------------------------\n");
	for (p=root;p!=NULL;p=p->next) {
		mean = 0.0;
		var = 0.0;
		if (p->cnt > 0) {
			mean = p->sum / p->cnt;
			var = (p->sumsq-(p->sum * p->sum)/p->cnt)/p->cnt;
			/* rounding can leave a tiny negative value here */
			if (var < 0.0) var = 0.0;
		}
		printf("%-20s %6d %8.3f %8.3f %8.3f %8.3f\n",
			p->label,p->cnt,
			mean,
			sqrt(var),
			p->min,
			p->max);
	}
	printf("\n");

	mean = 0.0;
	var = 0.0;
	if (totcnt > 0) {
		mean = totsum / totcnt;
		var = (totsumsq - (totsum * totsum)/totcnt)/totcnt;
		if (var < 0.0) var = 0.0;
	}
	printf("%-20s %6d %8.3f %8.3f\n",
		"OVERALL",
		totcnt,
		mean,
		sqrt(var));
}

/*
 * anreporthist -- write the distance histogram.
 *
 * op:  open output stream.
 *
 * Writes one line per histogram bin, 41 in all: the value (bin-20)*0.005
 * followed by the count held in that bin.
 */
void anreporthist(FILE *op)
{
	int	bin=0;

	while (bin <= 40) {
		fprintf(op,"%8.3f %d\n",(bin-20)*0.005,mshist[bin]);
		bin++;
	}
}

/*
 * findname -- find name in dictionary.
 *
 * dict:   array of strings held in ascending strcmp() order.
 * dcnt:   number of entries in dict.
 * label:  string to look for.
 *
 * Binary search; returns the index of the matching entry, or -1 when
 * the label is absent or the dictionary is empty.
 */
int findname(char **dict,int dcnt,char *label)
{
	int	lo=0;
	int	hi=dcnt-1;
	int	mid,cmp;

	while (lo <= hi) {
		mid=(lo+hi)/2;
		cmp=strcmp(label,dict[mid]);
		if (cmp==0)
			return(mid);
		if (cmp < 0)
			hi = mid - 1;
		else
			lo = mid + 1;
	}
	return(-1);
}

/*
 * addname -- add name to dictionary.
 *
 * dict:   sorted array of strings; the caller must have left room for
 *         one more entry.
 * dcnt:   pointer to the entry count, incremented when a name is added.
 * label:  name to add; a private copy is stored.
 *
 * Returns 0 when the name was already present (nothing changes), or 1
 * after inserting the copy at its sorted position.
 */
int addname(char **dict,int *dcnt,char *label)
{
	int	slot;

	if (findname(dict,*dcnt,label) >= 0)
		return(0);

	/* shift larger entries up one place to open the insertion slot */
	for (slot=*dcnt; slot>0; slot--) {
		if (strcmp(label,dict[slot-1]) >= 0)
			break;
		dict[slot]=dict[slot-1];
	}
	dict[slot]=strsave(label);
	*dcnt += 1;
	return(1);
}

/*
 * getlabel -- get annotation label at time.
 *
 * item:  header of the annotation item (offset, frameduration and
 *        numframes are used).
 * atab:  the item's annotation records.
 * t:     time to look up.
 *
 * Each annotation's time is offset + posn*frameduration.  A time before
 * the first annotation gives the first label; otherwise the label of the
 * first annotation whose successor lies after t is returned, and the
 * last label when there is no such successor.
 */
char *getlabel(struct item_header *item,struct an_rec *atab,double t)
{
	int	idx=0;
	double	first,next;

	first=item->offset + atab[0].posn*item->frameduration;
	if (t < first)
		return(atab[0].label);

	while (idx < item->numframes-1) {
		next=item->offset + atab[idx+1].posn*item->frameduration;
		/* the lower bound is always the first annotation's time */
		if ((first <= t) && (t < next))
			return(atab[idx].label);
		idx++;
	}
	return(atab[item->numframes-1].label);
}

/*
 * printconfuse -- print confusion matrix.
 *
 * Prints cmat to the standard output with one row per input dictionary
 * entry and one column per output dictionary entry.  The row-label width
 * is that of the longest input label; every column is as wide as the
 * longest output label or the largest count, whichever needs more room.
 */
void printconfuse()
{
	int	i,j;
	int	ilmax=0;	/* longest input (row) label */
	int	colw=0;		/* common column width */
	int	dmax=0;		/* largest count in the matrix */
	int	len,digits;

	for (i=0;i<idcnt;i++) {
		len=(int)strlen(idict[i]);
		if (len>ilmax) ilmax=len;
	}
	for (j=0;j<odcnt;j++) {
		len=(int)strlen(odict[j]);
		if (len>colw) colw=len;
	}
	for (i=0;i<idcnt;i++)
		for (j=0;j<odcnt;j++)
			if (cmat[i][j] > dmax) dmax=cmat[i][j];

	/* number of decimal digits needed for the largest count */
	digits=1;
	while (dmax >= 10) {
		digits++;
		dmax /= 10;
	}
	if (digits > colw) colw=digits;

	/* heading line: blank row-label area then the output labels */
	printf("%*s ",ilmax," ");
	for (j=0;j<odcnt;j++) printf(" %*s",colw,odict[j]);
	printf("\n");

	/* one line per input label */
	for (i=0;i<idcnt;i++) {
		printf("%*s:",ilmax,idict[i]);
		for (j=0;j<odcnt;j++) printf(" %*d",colw,cmat[i][j]);
		printf("\n");
	}
}

/* minof3 -- return the smallest of the three values a, b and c */
int minof3(int a,int b,int c)
{
	int	smallest = (a < b) ? a : b;

	return((smallest < c) ? smallest : c);
}

/*
 * minof3idx -- say which of three values is the minimum.
 *
 * Returns 0 when a is smallest, 1 when b is smallest and 2 when c is.
 * Ties are resolved towards the later argument: 0 needs a strictly below
 * both b and c, and 1 needs b strictly below c.
 */
int minof3idx(int a,int b,int c)
{
	if ((a < b) && (a < c))
		return(0);
	if (!(a < b) && (b < c))
		return(1);
	return(2);
}

/* main program */
void main(argc,argv)
int	argc;
char	*argv[];
{
	/* option decoding */
	extern int	optind;		/* option index */
	extern char	*optarg;	/* option argument ptr */
	int		errflg = 0;	/* option error flag */
	int		c;		/* option switch */
	int		it;		/* item selection */
	char		*ty;		/* item sub type */
	/* file variables */
	char		filename[SFSMAXFILENAME]; /* SFS data file name */
	int		fid;		/* input file descriptor */
	char		*reftype="*";
	char		*tsttype="0";
	int			i,j;
	double		tref,ttst;
	double		stime,etime,t;
	char		*reflab,*tstlab;
	FILE		*op;
	int			l,u,d;
	int			ntotal,nsubst,ndelete,ninsert;

	/* decode switches */
	while ( (c = getopt(argc,argv,"Ii:r:t:aflm:h:s:")) != EOF ) switch (c) {
		case 'I' :	/* Identify */
			fprintf(stderr,"%s: V%s\n",PROGNAME,PROGVERS);
			exit(0);
			break;
		case 't' :	/* test item */
		case 'i' :	/* specific item */
			if (itspec(optarg,&it,&ty) == 0) {
				if (it == AN_TYPE)
					tsttype = ty;
				else
					error("unsuitable item specifier %s",optarg);
			}
			else
				error("illegal item specifier %s",optarg);
			break;
		case 'r' :	/* reference item */
			if (itspec(optarg,&it,&ty) == 0) {
				if (it == AN_TYPE)
					reftype = ty;
				else
					error("unsuitable item specifier %s",optarg);
			}
			else
				error("illegal item specifier %s",optarg);
			break;
		case 'a' :	/* alignment mode */
			doalign=1;
			doframe=0;
			dolabel=0;
			break;
		case 'f' :	/* frame mode */
			doalign=0;
			doframe=1;
			dolabel=0;
			break;
		case 'l' :	/* label mode */
			doalign=0;
			doframe=0;
			dolabel=1;
			break;
		case 'm' :	/* match file */
			strcpy(matchfilename,optarg);
			savematch++;
			break;
		case 'h' :	/* histogram file */
			strcpy(histfilename,optarg);
			savehist++;
			break;
		case 's' :	/* sampling rate for frame analysis */
			fdur = 1.0/atof(optarg);
			break;
		case '?' :	/* unknown */
			errflg++;
	}
	if (errflg || (argc<2))
		error("usage: %s (-I) (-r item) (-t item) (-a|-f|-l) (-m matches.out) (-h hist.out) file",PROGNAME);

	/* get filename */
	if (optind < argc)
		strcpy(filename,sfsfile(argv[optind]));
	else
		error("no database file specified",NULL);

	/* open file */
	if ((fid=sfsopen(filename,"r",NULL))<0)
		error("could not open '%s'",filename);

	/* find and load reference item */
	if (!sfsitem(fid,AN_TYPE,reftype,&refitem))
		error("could not find reference annotations");
	refan = (struct an_rec *)sfsbuffer(&refitem,refitem.numframes);
	sfsread(fid,0,refitem.numframes,refan);

	/* find and load test item */
	if (!sfsitem(fid,AN_TYPE,tsttype,&tstitem))
		error("could not find test annotations");
	tstan = (struct an_rec *)sfsbuffer(&tstitem,tstitem.numframes);
	sfsread(fid,0,tstitem.numframes,tstan);
	sfsclose(fid);

	if (doalign) {
		/* alignment mode */

		/* simple checks */
		if (refitem.numframes != tstitem.numframes)
			error("unequal numbers of annotations");

		/* get a list of all the annotation names */
		for (i=0;i<refitem.numframes;i++)
			addlabel(refan[i].label);

		if (savematch) {
			if ((op=fopen(matchfilename,"w"))==NULL)
				error("could not open '%s'",matchfilename);
		}

		/* process annotations */
		for (i=0;i<refitem.numframes;i++) {
			tref = refitem.offset + refan[i].posn*refitem.frameduration;
			ttst = tstitem.offset + tstan[i].posn*tstitem.frameduration;
			adddistance(refan[i].label,ttst - tref);
			if (savematch) {
				fprintf(op,"%s %.4f\n",refan[i].label,ttst - tref);
			}
		}

		/* report annotations */
		anreport();

		/* other reports */
		if (savematch) fclose(op);
		if (savehist) {
			if ((op=fopen(histfilename,"w"))==NULL)
				error("could not open '%s'",matchfilename);
			anreporthist(op);
			fclose(op);
		}
	}
	else if (doframe) {

		/* get a list of all input annotation names */
		idict=(char **)calloc(refitem.numframes,sizeof(char *));
		for (i=0;i<refitem.numframes;i++)
			addname(idict,&idcnt,refan[i].label);

		/* get a list of all output annotation names */
		odict=(char **)calloc(tstitem.numframes,sizeof(char *));
		for (i=0;i<tstitem.numframes;i++)
			addname(odict,&odcnt,tstan[i].label);

		/* get memory for confusion matrix */
		cmat=(int **)calloc(idcnt,sizeof(int *));
		for (i=0;i<idcnt;i++)
			cmat[i]=(int *)calloc(odcnt,sizeof(int));

		/* find start and end times */
		tref = refitem.offset + refan[0].posn*refitem.frameduration;
		ttst = tstitem.offset + tstan[0].posn*tstitem.frameduration;
		stime=(tref > ttst) ? tref : ttst;
		tref = refitem.offset + (refan[refitem.numframes-1].posn+refan[refitem.numframes-1].size)*refitem.frameduration;
		ttst = tstitem.offset + (tstan[tstitem.numframes-1].posn+tstan[tstitem.numframes-1].size)*tstitem.frameduration;
		etime=(tref < ttst) ? tref : ttst;

		if (savematch) {
			if (strcmp(matchfilename,"-")==0)
				op=stdout;
			else if ((op=fopen(matchfilename,"w"))==NULL)
				error("could not open '%s'",matchfilename);
		}

		for (t=stime;t<=etime;t+=fdur) {
			reflab = getlabel(&refitem,refan,t);
			tstlab = getlabel(&tstitem,tstan,t);
			cmat[findname(idict,idcnt,reflab)][findname(odict,odcnt,tstlab)]++;
			if (savematch)
				fprintf(op,"%s\t%s\n",reflab,tstlab);
		}

		if (savematch) {
			if (op!=stdout) fclose(op);
		}
		else
			printconfuse();
	}
	else if (dolabel) {
		/* get a list of all input and output annotation names */
		idict=(char **)calloc(refitem.numframes+tstitem.numframes,sizeof(char *));
		for (i=0;i<refitem.numframes;i++)
			addname(idict,&idcnt,refan[i].label);
		for (i=0;i<tstitem.numframes;i++)
			addname(idict,&idcnt,tstan[i].label);

		/* get memory for ref and test records */
		rrec=(int *)calloc(refitem.numframes,sizeof(int));
		trec=(int *)calloc(tstitem.numframes,sizeof(int));
		for (i=0;i<refitem.numframes;i++)
			rrec[i] = findname(idict,idcnt,refan[i].label);
		for (i=0;i<tstitem.numframes;i++)
			trec[i] = findname(idict,idcnt,tstan[i].label);

		/* get memory for distance matrix */
		dist=(struct dist_rec **)calloc(refitem.numframes,sizeof(struct dist_rec *));
		for (i=0;i<refitem.numframes;i++)
			dist[i]=(struct dist_rec *)calloc(tstitem.numframes,sizeof(struct dist_rec));

		/* do DP */
		for (i=0;i<refitem.numframes;i++) {
			for (j=0;j<tstitem.numframes;j++) {
				if ((i==0)&&(j==0)) {
					l=1000;
					u=1000;
					d=0;
				}
				else if (i==0) {
					l=dist[i][j-1].score;
					u=1000;
					d=1000;
				}
				else if (j==0) {
					l=1000;
					u=dist[i-1][j].score;
					d=1000;
				}
				else {
					l=dist[i][j-1].score;
					u=dist[i-1][j].score;
					d=dist[i-1][j-1].score;
				}
				dist[i][j].score = ((rrec[i]==trec[j]) ? 0 : 1) + minof3(l,u,d);
				dist[i][j].direc = minof3idx(l,u,d);
			}
		}

		if (savematch) {
			if (strcmp(matchfilename,"-")==0)
				op=stdout;
			else if ((op=fopen(matchfilename,"w"))==NULL)
				error("could not open '%s'",matchfilename);
		}

		/* read off alignment */
		i=refitem.numframes-1;
		j=tstitem.numframes-1;
		ntotal=nsubst=ndelete=ninsert=0;
		while (i >= 0) {
			switch (dist[i][j].direc) {
			case 0:	/* left = insertion */
				if (savematch) fprintf(op,"[]\t%s\n",tstan[j].label);
				ninsert++;
				j--;
				break;
			case 1:	/* up = deletion */
				if (savematch) fprintf(op,"%s\t[]\n",refan[i].label);
				ndelete++;
				i--;
				break;
			case 2:	/* diagonal = substitution */
				if (savematch) fprintf(op,"%s\t%s\n",refan[i].label,tstan[j].label);
				if (rrec[i]!=trec[j]) nsubst++;
				i--;
				j--;
				break;
			}
			ntotal++;
		}


		if (savematch) {
			if (op!=stdout) fclose(op);
		}
		else {
			printf("Subst=%d Delete=%d Insert=%d Total=%d",nsubst,ndelete,ninsert,ntotal);
			printf(" Accuracy=%.1f%%\n",(100.0*(ntotal-nsubst-ninsert)/ntotal));
		}

	}

	/* that's all folks! */
	exit(0);
}


