/*H_HEADER_FILE***************************************************************
FILE			: BuildJpDic.h
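DESC			: Shared declarations for the Japanese dictionary builder:
				  part-of-speech bit flags, dictionary / kanji / letter-tree
				  structures and the prototypes of the builder's functions.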
TABS			: 3
OWNER			: Fonix
DATE CREATED	: 3 November 2005

(C) Copyright 2005 All rights reserved.
This is an unpublished work, and is confidential and proprietary: 
technology and information of fonix corporation.  No part of this
code may be reproduced, used or disclosed without written consent of 
fonix corporation in each and every instance.

  $Date:  $
  $Revision:  $

*END_HEADER******************************************************************/
#ifndef	BUILDJPDIC_H
#define	BUILDJPDIC_H
#include <windows.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <wchar.h>

// Fallback boolean names. Each one is defined only when no macro of that
// name exists yet. Note that bool is a plain unsigned char here, so a value
// stored in it is reduced modulo 256 rather than normalised to 0 or 1.
#if !defined(bool)
#define bool unsigned char
#endif

#if !defined(true)
#define true 1
#endif

#if !defined(false)
#define false 0
#endif

// Main parts of speech: one bit each, all in the upper 16 bits of the mask
#define JP_ABBR			0x00010000		//	abbreviation
#define JP_ADJ			0x00020000		//	adjective
#define JP_ADV			0x00040000		//	adverb
#define JP_AUX			0x00080000		//	auxiliary word or phrase
#define JP_CONJ			0x00100000		//	conjunction
#define JP_NOUN			0x00200000		//	noun
#define JP_PART			0x00400000		//	particle
#define JP_PREF			0x00800000		//	prefix
#define JP_SUFX			0x01000000		//	suffix
#define JP_VERB			0x02000000		//	verb
#define JP_MISC			0x04000000		//	miscellaneous

// Every sub category below is its main category bit combined with one
// bit taken from the lower 16 bits of the mask.

// Adjective sub categories
#define JP_ADJ_NA		(JP_ADJ | 0x0001)		//	adj-na - adjectival nouns or quasi-adjectives (keiyoudoushi)
#define JP_ADJ_NO		(JP_ADJ | 0x0002)		//	adj-no - nouns which may take the genitive case particle "no"
#define JP_ADJ_PN		(JP_ADJ | 0x0004)		//	adj-pn - pre-noun adjectival (rentaishi)
#define JP_ADJ_S		(JP_ADJ | 0x0010)		//	adj-s - special adjective (e.g. ookii)
#define JP_ADJ_T		(JP_ADJ | 0x0020)		//	adj-t - "taru" adjective

// Adverb sub categories
#define JP_ADV_ADV		(JP_ADV | 0x0001)		//	adv - adverb (fukushi)
#define JP_ADV_N		(JP_ADV | 0x0002)		//	adv-n - adverbial noun
#define JP_ADV_FN		(JP_ADV | 0x0004)		//	n-adv - adverbial noun (fukushitekimeishi)

// Auxiliary sub categories
#define JP_AUX_WORD		(JP_AUX | 0x0001)		//	aux - auxiliary word or phrase
#define JP_AUX_VERB		(JP_AUX | 0x0002)		//	aux-v - auxiliary verb

// Noun sub categories
#define JP_NOUN_CMN		(JP_NOUN | 0x0001)		//	n - noun (common) (futsuumeishi)
#define JP_NOUN_ADV		(JP_NOUN | 0x0002)		//	n-adv - adverbial noun (fukushitekimeishi)
#define JP_NOUN_TMP		(JP_NOUN | 0x0004)		//	n-t - noun (temporal) (jisoumeishi)
#define JP_NOUN_SUF		(JP_NOUN | 0x0008)		//	n-suf - noun, used as a suffix
#define JP_NOUN_PREF	(JP_NOUN | 0x0010)		//	n-pref - noun, used as a prefix
#define JP_NOUN_SURU	(JP_NOUN | 0x0020)		//	vs - noun or participle which takes the aux. verb suru

// Verb sub categories (the number in parentheses is the entry count noted for each tag)
#define JP_VERB_V1		(JP_VERB | 0x0001)		//	v1 - Ichidan verb (2357)
#define JP_VERB_V5R		(JP_VERB | 0x0002)		//	v5r - Godan verb with `ru' ending (1503)
#define JP_VERB_V5S		(JP_VERB | 0x0004)		//	v5s - Godan verb with `su' ending (1236)
#define JP_VERB_V5K		(JP_VERB | 0x0008)		//	v5k - Godan verb with `ku' ending (538)
#define JP_VERB_V5M		(JP_VERB | 0x0010)		//	v5m - Godan verb with `mu' ending (464)
#define JP_VERB_V5U		(JP_VERB | 0x0020)		//	v5u - Godan verb with `u' ending (445)
#define JP_VERB_V5T		(JP_VERB | 0x0040)		//	v5t - Godan verb with `tsu' ending (115)
#define JP_VERB_VSS		(JP_VERB | 0x0080)		//	vs-s - suru verb - special class (102)
#define JP_VERB_V5G		(JP_VERB | 0x0100)		//	v5g - Godan verb with `gu' ending (77)
#define JP_VERB_V5B		(JP_VERB | 0x0200)		//	v5b - Godan verb with `bu' ending (64)
#define JP_VERB_VZ		(JP_VERB | 0x0400)		//	vz - zuru verb - (alternative form of -jiru verbs) (37)
#define JP_VERB_V5KS	(JP_VERB | 0x0800)		//	v5k-s - Godan verb - Iku/Yuku special class (29)
#define JP_VERB_VK		(JP_VERB | 0x1000)		//	vk - Kuru verb - special class (10)
#define JP_VERB_V5ARU	(JP_VERB | 0x2000)		//	v5aru - Godan verb - -aru special class (9)
#define JP_VERB_V5N		(JP_VERB | 0x4000)		//	v5n - Godan verb with `nu' ending (7)
#define JP_VERB_MISC	(JP_VERB | 0x8000)		//	v5u-s (5), v5uru (5), vs-i (1), vt (1), vi (0), neg-v (0), v5 (0)

// Miscellaneous sub categories
#define JP_MISC_FEM		(JP_MISC | 0x0001)		//	fem (17)
#define JP_MISC_MALE	(JP_MISC | 0x0002)		//	male (15)
#define JP_MISC_HON		(JP_MISC | 0x0004)		//	hon (67), hum (60), pol (38)
#define JP_MISC_GRAM	(JP_MISC | 0x0008)		//	gram - grammatical term (66)
#define JP_MISC_INTJ	(JP_MISC | 0x0010)		//	interjection (78)
#define JP_MISC_NUMB	(JP_MISC | 0x0020)		//	number (32)
#define JP_MISC_VULG	(JP_MISC | 0x0040)		//	vulgar (22), X (35)
#define JP_MISC_ARCH	(JP_MISC | 0x0080)		//	archaism (101)
#define JP_MISC_ATEJI	(JP_MISC | 0x0100)		//	ateji reading of the kanji (18)
#define JP_MISC_COL		(JP_MISC | 0x0200)		//	colloquialisms (106), exp (3833), fam (5), id (5), m_sl (0), sl (34)
#define JP_MISC_IRREG	(JP_MISC | 0x0400)		//	iK (79), ik (39), io (122)
#define JP_MISC_OLD		(JP_MISC | 0x0800)		//	oK (149), ok(21), obs (53), obsc (16)
#define JP_MISC_MA		(JP_MISC | 0x1000)		//	MA - martial arts term (46)
#define JP_MISC_KANA	(JP_MISC | 0x2000)		//	uk - only kana (1087)
#define JP_MISC_GIKUN	(JP_MISC | 0x4000)		//	gikun - gikun (meaning) reading (42)
#define JP_MISC_OTHER	(JP_MISC | 0x8000)		//	anything else

// Pairs a word-type string with a bit mask.
// NOTE(review): presumably uiBitMask holds one of the JP_* values and
// psWordType the matching tag text - confirm against the table using this type.
struct _tagJpWordType
{
	char			*psWordType;	// word-type string
	unsigned int	uiBitMask;		// bit mask that goes with psWordType
};
typedef struct _tagJpWordType JP_WORD_TYPE;

// Bitmask groupings were determined from the following statistics in edict
/********************************************************************
abbr			864
adj             1484, adj-pn 36, adj-s 0, 
adj-na			781
adj-no			234
adj-t			202
adv             1316, adv-n 0
arch            101
ateji           18
aux             5
aux-v           7
conj            75
col             106, exp 3833, fam 5, id 5, m-sl 0, sl 34
fem             17
hon             67, hum 60, pol 38
int             78
male			15
n               87789, n-adv 75, n-t 142, n-suf 33, n-pref 3
num             32
pref            36
prt             56
suf             70
v1              2357,	v5 0, v5u-s 5, v5g 77, v5n 7, 
						v5b 64, v5k-s 29, v5aru 9, v5uru 5, vi 0, vt 1
v5r				1503
v5s				1236
v5k				538
v5m				464
v5u				445
v5t				115
vs              1, vs-i 1, vs-s 102
vz              37
vk              10
vulg            22, X 35


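Listed as not used in bitflag
(NOTE(review): JP_MISC_GIKUN, JP_MISC_GRAM, JP_MISC_IRREG, JP_MISC_OLD, JP_MISC_MA and
JP_MISC_KANA are defined for gikun, gram, iK/ik/io, oK/ok/obs/obsc, MA and uk, so this
list appears to be out of date - confirm)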
ek              0
gikun           42
gram            66
iK              79
ik              39
io              122
oK              159	(NOTE(review): the JP_MISC_OLD comment gives 149 for oK - confirm which is right)
ok              21
obs             53
obsc            16
MA              46
neg             0
neg-v           0
qv              0
uK              0
uk              1087
************************************************************************************/


// One dictionary entry: the word, its reading, the mutually aligned forms of
// both, and its word-type information.
struct _tagJpDict
{
	wchar_t			*wsWord;		// dictionary word
	wchar_t			*wsKana;		// reading for this word
	wchar_t			*wsWordAlign;	// word aligned with kana
	wchar_t			*wsKanaAlign;	// kana aligned with word
	char			*sWordTypes;	// parts-of-speech and word types
	unsigned int	iBitFlags;		// bit flags (presumably JP_* values - confirm)
};
typedef struct _tagJpDict JPDICT, FAR *JPDICTPTR;

// Node of a singly linked list of contexts (text to the left and to the
// right) attached to a kanji reading.
struct _tagKanjiWord
{
	wchar_t					*wsLeft;	// left context
	wchar_t					*wsRight;	// right context
	struct _tagKanjiWord	*pNext;		// next Kanji context that uses this reading
};
typedef struct _tagKanjiWord KANJICONTEXT, FAR *KANJICONTEXTPTR;

// Node of a singly linked list of readings for one Kanji.
struct _tagKanjiReading
{
	wchar_t					*wsReading;	// possible reading (kana) for this Kanji
	int						iType;		// 1 - nanori (name), 2 - radical
	KANJICONTEXTPTR			pContext;	// list of words with this Kanji in a context that uses this reading
	struct _tagKanjiReading	*pNext;		// next reading in the list
};
typedef struct _tagKanjiReading KANJIREADING;

// Node of a singly linked list of Korean readings, each held as a char string.
struct _tagKoreanReading
{
	char						*sReading;	// reading text
	struct _tagKoreanReading	*pNext;		// next reading in the list
};
typedef struct _tagKoreanReading KOREAN_READING;

// Entry for a single Kanji character: its readings, its Korean readings and a
// link to another entry.
struct _tagKanjiDic
{
	wchar_t				wcKanji;	// a Kanji character
	KANJIREADING		*pReading;	// possible readings for this Kanji depending on context
	KOREAN_READING		*pKorean;	// list of Korean readings
	struct _tagKanjiDic	*pNext;		// next entry
};
typedef struct _tagKanjiDic KANJIDIC;

// Forward declaration of the branch tag, so that SYMBOL can hold a pointer to
// a branch before struct _tagBranch is defined. This is a plain tag
// declaration: a typedef without a declarator names nothing and only
// produces a compiler warning.
struct _tagBranch;

// One symbol stored in a branch of the letter tree.
typedef struct _tagSymbol
{
	wchar_t wcSym;				// Kana or Kanji symbol
	struct _tagBranch *pNext;	// branch linked from this symbol (presumably the next tree level - confirm)
	wchar_t *wsData;			// End of word data (full kana)

}SYMBOL, FAR *SYMBOLPTR;

// Branch structure using variable length pointers: a count together with a
// pointer to SYMBOL entries.
struct _tagBranch
{
	int		nSym;		// presumably the number of entries reachable through pSyms - confirm
	SYMBOL	*pSyms;		// symbols of this branch
};
typedef struct _tagBranch BRANCH, FAR *BRANCHPTR;

// Dictionary helpers. Both take a pointer to JPDICT entries plus an entry
// count; their implementations are not part of this header.
// NOTE(review): presumably pDict addresses an array of nEntries elements and
// FreeDictionary releases the strings each entry owns - confirm.
void FreeDictionary(JPDICTPTR pDict, int nEntries);
void SortDictionary(JPDICTPTR pDict, int nEntries);

// Letter-tree API. The tree is passed around as a pointer to its BRANCH.
// NOTE(review): ownership is not visible from this header; presumably a tree
// returned by BuildLetterTree is released with FreeLTree - confirm.
// NOTE(review): the units of the size functions (bytes vs. nodes) and the
// not-found result of FindWordInLTree are not visible here - confirm.
BRANCH *BuildLetterTree(JPDICTPTR pDict, int nEntries);
void FreeLTree(BRANCHPTR pLTree);
int GetLTreeSize(BRANCHPTR pLTree);
int GetLTreeSizeWithoutData(BRANCHPTR pLTree);
unsigned int GetDataSizeInLTree(BRANCHPTR pLTree);
wchar_t *FindWordInLTree(wchar_t *sWord, BRANCHPTR pLTree);

// NOTE(review): presumably writes the letter tree pLTree to the file named by
// sOutFile, with bBigEndian selecting the byte order of the output - confirm
// in the implementation. Returns nothing, so failures are not reported here.
void BuildJpDict(char *sOutFile, BRANCHPTR pLTree, bool bBigEndian);

// APIs from KanjiDic.c
// All three work on a KANJIDIC ** table plus an entry count.
// NOTE(review): the count is int (via int *pnKanji and int nKanji) in
// ReadKanjiDic and FindKanji but unsigned int in FreeKanjiDic, so callers
// convert implicitly between the two - consider unifying the type together
// with the definitions.
// NOTE(review): presumably ReadKanjiDic stores the number of entries in
// *pnKanji and the returned table is released with FreeKanjiDic - confirm.
KANJIDIC **ReadKanjiDic(char *sFile, int *pnKanji);
void FreeKanjiDic(KANJIDIC **ppKanji, unsigned int nKanji);
KANJIDIC *FindKanji(wchar_t wc, KANJIDIC **ppKanji, int nKanji);


// APIs from Kanji2Kana.c
// NOTE(review): characters are passed as unsigned short here while the
// structures in this header use wchar_t; the two only match where wchar_t is
// 16 bits wide - confirm this is intended.

// Hiragana tests, each taking one character code and returning int.
// NOTE(review): judging by the names, the Unicode/Euc variants test one
// encoding each; presumably nonzero means "is hiragana" - confirm.
int IsUnicodeHira(unsigned short wHira);
int IsEucHira(unsigned short wHira);
int IsHira(unsigned short wHira);

// Katakana tests, same shape as the hiragana ones.
int IsUnicodeKata(unsigned short wKata);
int IsEucKata(unsigned short wKata);
int IsKata(unsigned short wKata);

// Kanji tests, same shape as the hiragana ones.
int IsUnicodeKanji(unsigned short wKanji);
int IsEucKanji(unsigned short wKanji);
int IsKanji(unsigned short wKanji);

// Conversions between katakana and hiragana: the single-character forms
// return the result, the Str* forms take a non-const string and return nothing.
// NOTE(review): presumably the Str* forms convert in place and expect a
// 0-terminated string - confirm.
unsigned short Kata2Hira(unsigned short wKata);
void StrKata2Hira(unsigned short *wsKata);
unsigned short Hira2Kata(unsigned short wHira);
void StrHira2Kata(unsigned short *wsHira);

// NOTE(review): presumably writes the nKanji entries of ppKanji to the file
// named by sOutFile - confirm.
void WriteKanjiTable(char *sOutFile, KANJIDIC **ppKanji, unsigned int nKanji);

// APIs from CreateKoreanDict.c
// Takes the same arguments as WriteKanjiTable.
// NOTE(review): presumably writes a table built from the pKorean readings of
// the nKanji entries to the file named by sOutFile - confirm.
void WriteHanjaTable(char *sOutFile, KANJIDIC **ppKanji, unsigned int nKanji);




#endif
