/*C_HEADER_FILE****************************************************************
FILE			:	KanjiDic.c
DESC			:	
TABS			:	4
OWNER			:	Fonix
DATE CREATED	:	12 Dec 2005	

(C) Copyright 2005 All rights reserved.
This is an unpublished work, and is confidential and proprietary: 
technology and information of fonix corporation.  No part of this
code may be reproduced, used or disclosed without written consent of 
fonix corporation in each and every instance.

  $Date:  $
  $Revision:  $


KanjiDic FORMAT 

	The first part of each line is of a fixed format, indicating which character the line is for, 
	while the rest is more free-format. 

	The first two bytes are the kanji itself. There is then a space, the 4-byte ASCII representation 
	of the hexadecimal coding of the two-byte JIS encoding, and another space. 

	The rest of the line is composed of a combination of three kinds of fields (which may be in 
	any order and interspersed): 

	information fields, beginning with an identifying letter and ending with a space. See below 
	for more information about these fields. 

	readings (with '-' to indicate prefixes/suffixes, and '.' to indicate the portion of the reading 
	that is okurigana). ON-yomi are in katakana and KUN-yomi are in hiragana. There may be several 
	classes of reading fields, with ordinary readings first, followed by members of the other 
	classes, if any. The current other classes, and their tagging, are: 
		where the kanji has special "nanori" (i.e. name) readings, these are preceded by the marker "T1"; 

		where the kanji is a radical, and the radical name is not already a reading, the radical 
		name is preceded by the marker "T2". 
		
		(Other Tn classes may be created at a later date.) 

	English meanings. Each such field begins with an open brace '{' and ends at the next close brace '}'. 

	INFORMATION FIELDS 

	There are currently a variety of predefined fields (programs using KANJIDIC should not make any assumptions about the presence or absence of any of these fields, as KANJIDIC is certain to be extended in the future): 

	B<num> -- the radical (Bushu) number. There is one per entry. As far as possible, this is the radical number used in the Nelson "Modern Japanese-English Character Dictionary" (i.e. the Classic, not the New Nelson). Where the classical or historical radical number differs from this, it is present as a separate C<num> entry. 

	C<num> -- the historical or classical radical number, as recorded in the KangXi Zidian (where this differs from the B<num> entry.) There will be at most one of these. 

	F<num> -- the frequency-of-use ranking. At most one per line. The 2,501 most-used characters have a ranking; those characters that lack this field are not ranked. The frequency is a number from 1 to 2,501 that expresses the relative frequency of occurrence of a character in modern Japanese. The data is based on an analysis of word frequencies in the Mainichi Shimbun over 4 years by Alexandre Girardi. From this the relative frequencies have been derived. Note: 
	these frequencies are biassed towards words and kanji used in newspaper articles, 
	the relative frequencies for the last few hundred kanji so graded are quite imprecise. 
	(Earlier editions of the KANJIDIC file used a frequency-of-use ranking from the National Language Research Institute (Tokyo), interpreted and adapted by Jack Halpern.) 

	G<num> -- the Jouyou grade level. At most one per line. G1 through G6 indicate Jouyou grades 1-6. G8 indicates general-use characters. G9 indicates Jinmeiyou ("for use in names") characters. If not present, it is a kanji outside these categories. 

	H<num> -- the index number in the New Japanese-English Character Dictionary, edited by Jack Halpern. At most one allowed per line. If not preset, the character is not in Halpern. 

	N<num> -- the index number in the "Modern Reader's Japanese-English Character Dictionary", edited by Andrew Nelson. At most one allowed per line. If not present, the character is not in Nelson, or is considered to be a non-standard version, in which case it may have a cross-reference code in the form: XNnnnn. (Note that many kanji currently used are what Nelson described as "non-standard" forms or glyphs.) 

	V<num> -- the index number in The New Nelson Japanese-English Character Dictionary, edited by John Haig. 

	D<code> -- the "D" codes will be progressively used for dictionary based codes. 
	DRnnnn - these are the codes developed by Father Joseph De Roo, and published in his book "2001 Kanji" (Bojinsha). Fr De Roo has given his permission for these codes to be included. 

	DKnnnn - the index numbers used by Jack Halpern in his Kanji Learners Dictionary, published by Kodansha in 1999. The numbers have been provided by Mr Halpern. 

	DOnnnn - the index numbers used in P.G. O'Neill's Essential Kanji (ISBN 0-8348-0222-8). The numbers have been provided by Glenn Rosenthal. 

	DSnnnn - the index numbers used in "A Guide To Reading and Writing Japanese" edited by Florence Sakade. 

	DTnnn - the index numbers used in the Tuttle Kanji Cards, compiled by Alexander Kask. 

	DCnnnn - the index numbers used in "The Kanji Way to Japanese Language Power" by Dale Crowley. 

	DJnnn - the index numbers used in "Kanji in Context" by Nishiguchi and Kono. 

	DGnnn - the index numbers used in "Japanese For Busy People" vols I-III, published by the AJLT. The codes are the volume.chapter. 

	DBnnn - the index numbers used in the "Kodansha Compact Kanji Guide". 

	P<code> -- the SKIP pattern code. The <code> is of the form "P<num>-<num>-<num>". The System of Kanji Indexing by Patterns (SKIP) is a scheme for the classification and rapid retrieval of Chinese characters on the basis of geometrical patterns. Developed by Jack Halpern, it first appeared in the New Japanese-English Character Dictionary (Kenkyusha, Tokyo 1990; NTC, Chicago 1993), and is being used in a series of dictionaries and learning tools called KIT (Kanji Integrated Tools). SKIP is protected by copyright, copyleft and patent laws. The commercial utilization of SKIP in any form is strictly forbidden without the written permission of Jack Halpern, the copyright holder (jhalpern@cc.win.or.jp). (A brief summary of the method is in Appendix C. See Appendix E. for some of the rules applied when counting strokes in some of the radicals.) 

	S<num> -- the stroke count. At least one per line. If more than one, the first is considered the accepted count, while subsequent ones are common miscounts. (See Appendix E. For some of the rules applied when counting strokes in some of the radicals.) 

	U<hexnum> -- the Unicode encoding of the kanji. See Appendix B for further information on this code. There is exactly one per line. 

	I<code> -- the index codes in the reference books by Spahn & Hadamitzky. These codes take two forms: 
	for The Kanji Dictionary (Tuttle 1996), they are in the form nxnn.n, e.g. 3k11.2, where the kanji has 3 strokes in the identifying radical, it is radical "k" in the S&H classification system, there are 11 other strokes, and it is the 2nd kanji in the 3k11 sequence. I am very grateful to Mark Spahn for providing the (almost) full list of these descriptor codes for the kanji in this file. At the time of writing some 800 kanji in the file lack the SH descriptor. This is because the book used a different glyph as the primary kanji. The gaps are gradually being filled in. Where the JIS X 0208 glyph is the second kanji for a particular descriptor code, it has a "-2" appended to the code. 

	for the Kanji & Kana book (Tuttle), they are in the form INnnnn, where nnnn is the number of the kanji referenced in that book (2nd edition.) 
	Qnnnn.n -- the "Four Corner" code for that kanji. This is a code invented by Wang Chen in 1928; it has since been widely used for dictionaries in China and Japan. In some cases there are two of these codes, as it can be a little ambiguous, and Morohashi has some kanji coded differently from their traditional Chinese codes. See Appendix D for an overview of the Four Corner System. Christian Wittern, who passed on these codes, comments that they are in need of proof-reading and thus users are advised to be cautious using the codes for serious scholarship. 

	MNnnnnnnn and MPnn.nnnn -- the index number and volume.page respectively of the kanji in the 13-volume Morohashi Daikanwajiten. In the MNnnn field, a terminal `P`, e.g. MN4879P, indicates that it is 4879' in the original. In some 500 cases, the number is terminated with an `X`, to indicate that the kanji in Morohashi has a close, but not identical, glyph to the form in the JIS X 0208 standard. 

	Ennnn -- the index number used in "A Guide To Remembering Japanese Characters" by Kenneth G. Henshall. There are 1945 kanji with these numbers (i.e. the Jouyou subset.) 

	Knnnn -- the index number in the Gakken Kanji Dictionary ("A New Dictionary of Kanji Usage"). Some of the numbers relate to the list at the back of the book, jouyou kanji not contained in the dictionary, and various historical tables at the end. 

	Lnnnn -- the index number used in "Remembering The Kanji" by James Heisig. 

	Onnnn -- the index number in "Japanese Names", by P.G. O'Neill. (Weatherhill, 1972) (A warning: some of the numbers end with 'A'. This is how they appear in the book; it is not a problem with the file.) 

	Wxxxx -- the romanized form of the Korean reading(s) of the kanji. Most of these kanji have one Korean reading, a few have two or more. The readings are in the (Republic of Korea) Ministry of Education style of romanization. 

	Yxxxxx -- the "Pinyin" of each kanji, i.e. the (Mandarin or Beijing) Chinese romanization. About 6,000 of the kanji have these. Obviously most of the native Japanese kokuji do not have Pinyin, however at least one does as it was taken into Chinese at a later date. 

	Xxxxxxx -- a cross-reference code. An entry of, say, XN1234 will mean that the user is referred to the kanji with the (unique) Nelson index of 1234. XJ0xxxx and XJ1xxxx are cross-references to the kanji with the JIS hexadecimal code of xxxx. The `0' means the reference is to a JIS X 0208 kanji, and the `1' references a JIS X 0212 kanji. 

	Zxxxxxx -- a mis-classification code. It means that this kanji is sometimes mis-classified as having the xxxxxx coding. In the case of the SKIP classifications, an extra letter code is used to indicate the type of mis-classification. ZPPn-n-n, ZSPn-n-n and ZBPn-n-n indicate mis-classification according to position, stroke-count and both position and stroke-count. (ZRPn-n-n codes are where Jim Breen & Jack Halpern are having a [hopefully temporary] disagreement over the number of strokes.) 
	If the final field of a line is not an English field, there is a final space. Each reading and information field is therefore bracketed by space characters (which makes it convenient for searches using programs like "grep".) 
	As far as possible all entries will have their yomikata and readings attached, even if they are a recognized variant of another kanji. This is to facilitate electronic searches using these fields as keys, and should not be taken as a recommendation to use such obscure kanji. 


*END_HEADER*******************************************************************/
#include "BuildJpDic.h"


/*FUNCTION_HEADER**********************
 * NAME:	;KanjiSortCmp
 * DESC: 	Compare function for sorting Kanji dictionary
 * IN:		
 * OUT:		
 * RETURN:	
 * NOTES:	
 *END_HEADER***************************/
int KanjiSortCmp(const void *arg1, const void *arg2)
{
	KANJIDIC *p1 = *(KANJIDIC **)arg1, 
			 *p2 = *(KANJIDIC **)arg2;

	if( p1->wcKanji == p2->wcKanji )		
		return 0;
	else if( p1->wcKanji > p2->wcKanji )	
		return 1;
	else	
		return -1;
}

/*FUNCTION_HEADER**********************
 * NAME:	;KanjiFindCmp
 * DESC: 	bsearch() callback that compares a kanji code against a dictionary entry
 * IN:		arg1 - pointer to the wchar_t key being looked up
			arg2 - pointer to an array element (a KANJIDIC *)
 * OUT:		
 * RETURN:	-1, 0 or 1 when the key is less than, equal to, or greater than
			the entry's wcKanji
 * NOTES:	
 *END_HEADER***************************/
int KanjiFindCmp(const void *arg1, const void *arg2)
{
	wchar_t wcKey = *(wchar_t *)arg1;
	KANJIDIC *pEntry = *(KANJIDIC **)arg2;

	if( wcKey < pEntry->wcKanji )
		return -1;

	return ( wcKey > pEntry->wcKanji ) ? 1 : 0;
}

/*FUNCTION_HEADER**********************
 * NAME:	;SortKanji
 * DESC: 	Sort an array of kanji dictionary entries into ascending wcKanji order
 * IN:		ppKanji - array of KANJIDIC pointers (sorted in place)
			nKanji - number of elements in ppKanji
 * OUT:		ppKanji is reordered
 * RETURN:	
 * NOTES:	A NULL array or a count below two is a no-op.
 *END_HEADER***************************/
void SortKanji(KANJIDIC **ppKanji, int nKanji)
{
	// Fewer than two entries are already sorted.  The guard also keeps a NULL
	// array (e.g. from calloc(0, ...)) and a negative count, which would wrap
	// to a huge size_t, away from qsort().
	if( ppKanji == NULL || nKanji < 2 )
		return;

	qsort((void *)ppKanji, (size_t)nKanji, sizeof(KANJIDIC *), KanjiSortCmp);
}

/*FUNCTION_HEADER**********************
 * NAME:	;FindKanji
 * DESC: 	Binary-search the dictionary for a kanji symbol
 * IN:		wc - kanji code to look up (compared against KANJIDIC.wcKanji)
			ppKanji - array of KANJIDIC pointers, sorted by wcKanji (see SortKanji)
			nKanji - number of elements in ppKanji
 * OUT:		
 * RETURN:	pointer to the matching entry, or NULL when there is no match or
			the array is NULL/empty
 * NOTES:	
 *END_HEADER***************************/
KANJIDIC *FindKanji(wchar_t wc, KANJIDIC **ppKanji, int nKanji)
{
	KANJIDIC
		**ppMatch;

	// bsearch() requires a valid base pointer even for zero elements, and a
	// negative count would be converted to a huge size_t.
	if( ppKanji == NULL || nKanji <= 0 )
		return NULL;

	ppMatch = (KANJIDIC **)bsearch(&wc, (void *)ppKanji, (size_t)nKanji, sizeof(KANJIDIC *), KanjiFindCmp);

	// bsearch returns a pointer to the array slot, which itself holds the entry pointer
	return ppMatch ? *ppMatch : NULL;
}

/*FUNCTION_HEADER**********************
 * NAME:	;ParseKanjiBuf
 * DESC: 	parse a line of text containing the kanji and its readings
 * IN:		sBuf - the line of text
 * OUT:		
 * RETURN:	pointer to a KANJIDIC structure
 * NOTES:	See the notes in the header of this file for info on the format
			of sBuf.
 *END_HEADER***************************/
KANJIDIC *ParseKanjiBuf(char *sBuf)
{
	unsigned char 
		*pc,
		*psTok,
		sDelim[] = " \t\r\n";
	wchar_t
		*pwc,
		wsTmp[64];
	int
		i,
		iReadingType=0;
	KANJIDIC 
		*pKanji=NULL;
	KANJIREADING
		*pReading=NULL;
	KOREAN_READING
		*pKorean=NULL;

	if( sBuf == NULL ||
		sBuf[0] == '#' )
		return NULL;

	// Get the EUC Kanji character (2 bytes)
	if( (psTok = strtok(sBuf, sDelim)) == NULL )
		return NULL;

	if( (pKanji = (KANJIDIC *)calloc(1, sizeof(KANJIDIC))) == NULL )
		return NULL;

	pc = psTok;
	pKanji->wcKanji = (*pc << 8) + (*(pc+1));


	// Get the 4-byte ASCII representation of the hexadecimal coding of the two-byte JIS encoding
	if( (psTok = strtok(NULL, sDelim)) == NULL )
		return NULL;

	while( (psTok = strtok(NULL, sDelim)) )
	{
		switch(*psTok)
		{
			case 'B':	break;		// the radical (Bushu) number
			case 'C':	break;		// the historical or classical radical number
			case 'F':	break;		// the frequency-of-use ranking
			case 'G':	break;
			case 'H':	break;
			case 'N':	break;
			case 'V':	break;
			case 'D':	break;
			case 'P':	break;
			case 'S':	break;		// Stroke count
			case 'U':	break;		// Unicode
			case 'I':	break;
			case 'Q':	break;
			case 'M':	break;
			case 'E':	break;
			case 'K':	break;
			case 'L':	break;
			case 'O':	break;
			case 'W':				// Korean reading	
				if( pKanji->pKorean == NULL )
				{
					pKanji->pKorean = pKorean = (KOREAN_READING *)calloc(1, sizeof(KOREAN_READING));
				}
				else
				{
					pKorean->pNext = (KOREAN_READING *)calloc(1, sizeof(KOREAN_READING));
					pKorean = pKorean->pNext;
				}
				psTok++;
				pKorean->sReading = strdup(psTok);
				break;
			case 'Y':	break;		// Pinyin
			case 'X':	break;		// Cross reference code
			case 'Z':	break;		// Kanji mis-classification code
			case '{':
				while( psTok && strchr(psTok, '}') == NULL )
					psTok = strtok(NULL, sDelim);

				break;		// English meaning
			case 'T':	
				if( *(psTok+1) == '1' )	
					iReadingType=1;	// the kanji has special "nanori" (i.e. name) reading
				else if( *(psTok+1) == '2' )	
					iReadingType=2;	// the kanji is a radical, and the radical name is not already a reading
				break;

			default:
				// Get a Kanji reading
				for(i=0, pc=psTok; *pc != ' ' && *pc != 0x00 && *pc != '\n'; i++)
				{
					if( *pc == '.' ||
						*pc == '-' )
					{
						wsTmp[i] = *pc;
						pc++;
					}
					else
					{
						wsTmp[i] = (*pc << 8) + (*(pc+1));
						pc+= 2;
					}
				}
				wsTmp[i] = 0x00;	// Null terminate

				if( pKanji->pReading == NULL )
				{
					pKanji->pReading = pReading = (KANJIREADING *)calloc(1, sizeof(KANJIREADING));
				}
				else
				{
					pReading->pNext = (KANJIREADING *)calloc(1, sizeof(KANJIREADING));
					pReading = pReading->pNext;
				}

				StrKata2Hira(wsTmp);		// Make sure that all symbols in the reading are in hiragana

				// If reading is XYZ.ABC then XYZ is the reading and ABC is the right context
				if( (pwc = wcschr(wsTmp, L'.')) != NULL )
				{
					*pwc = 0x00;	// Null terminate the reading
					pwc++;			// Move over to the right context definition
					pReading->pContext = (KANJICONTEXT *)calloc(1, sizeof(KANJICONTEXT));
					pReading->pContext->wsRight = wcsdup(pwc);
				}

				pReading->wsReading = wcsdup(wsTmp);
				pReading->iType = iReadingType;

				break;
		}
	}

	return pKanji;
}

/*FUNCTION_HEADER**********************
 * NAME:	;ReleaseParsedKanjiList
 * DESC: 	Free a pNext-linked chain of entries built from ParseKanjiBuf results
 * IN:		pHead - first entry of the chain (may be NULL)
 * OUT:		
 * RETURN:	
 * NOTES:	Releases the allocations ParseKanjiBuf makes: readings with their
			right-context, Korean readings, and the entry itself.
 *END_HEADER***************************/
static void ReleaseParsedKanjiList(KANJIDIC *pHead)
{
	KANJIDIC
		*pNextKanji;
	KANJIREADING
		*pReading,
		*pNextReading;
	KOREAN_READING
		*pKorean,
		*pNextKorean;

	while( pHead )
	{
		pNextKanji = pHead->pNext;

		for( pReading = pHead->pReading; pReading; pReading = pNextReading )
		{
			pNextReading = pReading->pNext;
			if( pReading->pContext )
			{
				free(pReading->pContext->wsRight);
				free(pReading->pContext);
			}
			free(pReading->wsReading);
			free(pReading);
		}

		for( pKorean = pHead->pKorean; pKorean; pKorean = pNextKorean )
		{
			pNextKorean = pKorean->pNext;
			free(pKorean->sReading);
			free(pKorean);
		}

		free(pHead);
		pHead = pNextKanji;
	}
}

/*FUNCTION_HEADER**********************
 * NAME:	;ReadKanjiDic
 * DESC: 	Load kanjidic into memory 
 * IN:		sFile - path and filename for kanjidic
 * OUT:		pnKanji - (optional) number of entries loaded; 0 on failure
 * RETURN:	array of KANJIDIC pointers sorted by wcKanji on success,
			NULL on failure (NULL file name, file can't be opened, out of memory)
 * NOTES:	The entries also stay chained through pNext in file order.
			Lines that ParseKanjiBuf rejects (e.g. '#' comments) are skipped.
			An input with no usable lines yields a non-NULL array and a count of 0.
 *END_HEADER***************************/
KANJIDIC **ReadKanjiDic(char *sFile, int *pnKanji)
{
	char 
		sBuf[2048];
	size_t
		nLen;
	int
		c,
		bTruncated;
	unsigned int 
		i,
		nKanji=0;
	KANJIDIC 
		**ppKanji=NULL,
		*pKanjiDic=NULL,		// head of the list, in file order
		*pKanji=NULL,			// tail of the list while reading
		*pTmp;
	FILE 
		*fp;

	if( pnKanji )
		*pnKanji = 0;

	if( sFile == NULL )
		return NULL;

	if( (fp = fopen(sFile, "rb")) == NULL )
	{
		printf("Error: Can't open %s\n", sFile);
		return NULL;
	}

	while( fgets(sBuf, sizeof(sBuf), fp) )
	{
		// A full buffer that does not end in a newline means the line was cut.
		// Check before parsing, because strtok() overwrites delimiters in sBuf.
		nLen = strlen(sBuf);
		bTruncated = ( nLen == sizeof(sBuf) - 1 && sBuf[nLen - 1] != '\n' );

		pTmp = ParseKanjiBuf(sBuf);

		// Discard the rest of an over-long line so that its tail is not
		// parsed as if it were a new dictionary entry
		if( bTruncated )
		{
			while( (c = fgetc(fp)) != EOF && c != '\n' )
				;
		}

		if( pTmp == NULL )
			continue;

		if( pKanjiDic == NULL )
			pKanjiDic = pTmp;
		else
			pKanji->pNext = pTmp;
		pKanji = pTmp;
		nKanji++;
	}

	fclose(fp);

	// Ask for at least one slot so that a NULL result always means failure,
	// regardless of how the platform treats calloc(0, ...)
	ppKanji = (KANJIDIC **)calloc(nKanji ? nKanji : 1, sizeof(KANJIDIC *));
	if( ppKanji == NULL )
	{
		ReleaseParsedKanjiList(pKanjiDic);
		return NULL;
	}

	for( i=0, pKanji=pKanjiDic; i<nKanji && pKanji; i++, pKanji=pKanji->pNext )
	{
		ppKanji[i] = pKanji;
	}

	if( pnKanji )
		*pnKanji = nKanji;

	SortKanji(ppKanji, nKanji);

	return ppKanji;
}

/*FUNCTION_HEADER**********************
 * NAME:	;ReadKanjiList
 * DESC: 	Load Kanji list into memory 
 * IN:		sFile - path and filename for the kanji list
 * OUT:		pnKanji - (optional) number of entries loaded; 0 on failure
 * RETURN:	array of KANJIDIC pointers sorted by wcKanji on success,
			NULL on failure (NULL file name, file can't be opened, out of memory)
 * NOTES:	
			Kanji list format should be:
			[N] XXXX K

			where N is the Kanji index
			XXXX is the Kanji hex value
			K is the Kanji character

			Only K is stored (in wcKanji); lines that do not match are skipped.
			The entries also stay chained through pNext in file order.
			NOTE(review): the file is read with fread() followed by fgetws() on
			the same binary stream, which presumably relies on the 16-bit
			wchar_t stream behaviour of the original toolchain - confirm before
			porting.
 *END_HEADER***************************/
KANJIDIC **ReadKanjiList(char *sFile, int *pnKanji)
{
	wchar_t
		wc,
		wsHex[12],
		wsBuf[512];
	unsigned short
		usHeader;
	int
		iIndex,
		bNoMem=0;
	unsigned int 
		i,
		nKanji=0;
	KANJIDIC 
		**ppKanji=NULL,
		*pKanjiDic=NULL,		// head of the list, in file order
		*pKanji=NULL,			// tail of the list while reading
		*pNew;
	FILE 
		*fp;

	if( pnKanji )
		*pnKanji = 0;

	if( sFile == NULL )
		return NULL;

	if( (fp = fopen(sFile, "rb")) == NULL )
	{
		printf("Error: Can't open %s\n", sFile);
		return NULL;
	}

	// Skip the 2-byte unicode header; a file too short to have one has no entries
	if( fread(&usHeader, sizeof(usHeader), 1, fp) == 1 )
	{
		while( fgetws(wsBuf, (int)(sizeof(wsBuf)/sizeof(wsBuf[0])), fp) )
		{
			// %ls / %lc always denote wide data; the width keeps wsHex from overflowing
			if( swscanf(wsBuf, L"[%d] %11ls %lc", &iIndex, wsHex, &wc) != 3 )
				continue;

			if( (pNew = (KANJIDIC *)calloc(1, sizeof(KANJIDIC))) == NULL )
			{
				bNoMem = 1;
				break;
			}
			pNew->wcKanji = wc;

			if( pKanjiDic == NULL )
				pKanjiDic = pNew;
			else
				pKanji->pNext = pNew;
			pKanji = pNew;
			nKanji++;
		}
	}

	fclose(fp);

	// Ask for at least one slot so that a NULL result always means failure,
	// regardless of how the platform treats calloc(0, ...)
	if( !bNoMem )
		ppKanji = (KANJIDIC **)calloc(nKanji ? nKanji : 1, sizeof(KANJIDIC *));

	if( ppKanji == NULL )
	{
		// Entries built here own no further allocations, so freeing the nodes is enough
		while( pKanjiDic )
		{
			pNew = pKanjiDic->pNext;
			free(pKanjiDic);
			pKanjiDic = pNew;
		}
		return NULL;
	}

	for( i=0, pKanji=pKanjiDic; i<nKanji && pKanji; i++, pKanji=pKanji->pNext )
	{
		ppKanji[i] = pKanji;
	}

	if( pnKanji )
		*pnKanji = nKanji;

	SortKanji(ppKanji, nKanji);

	return ppKanji;
}

/*FUNCTION_HEADER**********************
 * NAME:	;ConvertKanjiToList
 * DESC: 	Convert a Unicode file full of Kanji to a Kanji list of the format:
			[N] XXXX K
 * IN:		sKanjiIn - input file: a 0xfeff header followed by 16-bit character codes
			sKanjiListOut - output file to create
 * OUT:		
 * RETURN:	
 * NOTES:	The output starts with the same 0xfeff header.  Codes outside
			0x4e00-0x9FA5 are reported on stdout and not written to the list.
			If the input does not start with 0xfeff nothing is converted (the
			output file has already been created and is left empty).
 *END_HEADER***************************/
void ConvertKanjiToList(char *sKanjiIn, char *sKanjiListOut)
{
	unsigned short
		usCode;			// one 16-bit unit exactly as stored in the file
	wchar_t
		wc;
	int
		i=0;
	FILE 
		*fp,
		*fpOut;

	if( sKanjiIn == NULL || sKanjiListOut == NULL )
		return;

	if( (fp = fopen(sKanjiIn, "rb")) == NULL )
	{
		printf("Error: Can't open %s\n", sKanjiIn);
		return;
	}
	if( (fpOut = fopen(sKanjiListOut, "wb")) == NULL )
	{
		printf("Error: Can't open %s\n", sKanjiListOut);
		fclose(fp);
		return;
	}

	// Read the unicode header into a 16-bit variable: reading two bytes into a
	// wider wchar_t would leave its remaining bytes uninitialised
	if( fread(&usCode, sizeof(usCode), 1, fp) == 1 && usCode == 0xfeff )
	{
		fwrite(&usCode, sizeof(usCode), 1, fpOut);

		while( fread(&usCode, sizeof(usCode), 1, fp) == 1 )
		{
			wc = (wchar_t)usCode;

			// %lc takes a wide character on every implementation
			if( 0x4e00 <= wc && wc <= 0x9FA5)
				fwprintf(fpOut, L"[%d] %x %lc\n", i++, (unsigned int)wc, (wint_t)wc);
			else
				wprintf(L"Warning: Invalid character %x %lc\n", (unsigned int)wc, (wint_t)wc);
		}
	}

	// Both files are closed on every path, including a missing/invalid header
	fclose(fp);
	fclose(fpOut);
}

/*FUNCTION_HEADER**********************
 * NAME:	;FreeKanjiDic
 * DESC: 	Free the memory used to load the kanji dictionary
 * IN:		ppKanji - array of entries as returned by ReadKanjiDic/ReadKanjiList
			nKanji - number of entries in ppKanji
 * OUT:		
 * RETURN:	
 * NOTES:	Releases, for every entry, the reading list (reading text and
			right-context), the Korean reading list and the entry itself,
			then the array.  These are the allocations made by ParseKanjiBuf.
			NOTE(review): if other modules attach further allocations to
			KANJICONTEXT or KANJIDIC they must be released here as well.
 *END_HEADER***************************/
void FreeKanjiDic(KANJIDIC **ppKanji, unsigned int nKanji)
{
	unsigned int
		i;
	KANJIREADING
		*pReading,
		*pTmpReading;
	KOREAN_READING
		*pKorean,
		*pTmpKorean;

	if( ppKanji == NULL )
		return;

	// With nKanji == 0 the loop is skipped but the (empty) array is still freed
	for( i=0; i<nKanji; i++ )
	{
		if( ppKanji[i] == NULL )
			continue;

		pReading = ppKanji[i]->pReading;
		while( pReading )
		{
			pTmpReading = pReading;
			pReading = pReading->pNext;

			if( pTmpReading->pContext )
			{
				free(pTmpReading->pContext->wsRight);
				free(pTmpReading->pContext);
			}
			free(pTmpReading->wsReading);		// free(NULL) is a no-op
			free(pTmpReading);
		}

		pKorean = ppKanji[i]->pKorean;
		while( pKorean )
		{
			pTmpKorean = pKorean;
			pKorean = pKorean->pNext;

			free(pTmpKorean->sReading);
			free(pTmpKorean);
		}

		free(ppKanji[i]);
	}

	free(ppKanji);
}

