/*
 * Jeffrey Friedl
 * Omron Corporation			ʳ
 * Nagaokakyoshi, Japan			617Ĺ
 *
 * jfriedl@nff.ncl.omron.co.jp
 *
 * This work is placed under the terms of the GNU General Purpose License
 * (the "GNU Copyleft").
 *
 */

#include "config.h"
#include "assert.h"
#include <ctype.h>
#include "romaji2kana.h"
#include "kanaid.h"

/*
 * When not compiling with GCC (__GNUC__ undefined), make the GCC-style
 * keywords __volatile__ and __inline__ expand to nothing, unless they
 * have already been defined as macros.
 */
#if !defined(__GNUC__)
#  if !defined(__volatile__)
#    define __volatile__ /*nothing; for use with volatile functions */
#  endif
#  if !defined(__inline__)
#    define __inline__ /*nothing; for use with inline functions */
#  endif
#endif

#if 0
# define ROMAJI_DEBUG
#endif

#ifdef TEST
# define ROMAJI_DEBUG
#endif
#ifdef ROMAJI_DEBUG
# include "output.h"
#endif

/* Number of elements in ARRAY; only meaningful for a real array. */
#define arraysize(array) (sizeof(array)/sizeof(array[0]))

/*
 * Flag carried by ordinary table entries.  romaji2kana() always ORs this
 * bit into the flags it searches with, so entries marked R2K_NORMAL match
 * whatever the user's flag settings are.  The check below refuses to
 * compile if the bit collides with one of the public flags listed.
 * NOTE(review): R2K_ALLOW_M_FOR_N, used in the romaji1 table, is not part
 * of this collision check -- confirm against romaji2kana.h.
 */
#define R2K_NORMAL  0x80
#if (R2K_NORMAL & (R2K_UNCONVERTED_PUNC_OK|R2K_NONASCII_OK|R2K_ALLOW_LONG_O_WITH_H))
#  error oops
#endif

/* Not referenced anywhere else in the visible part of this file. */
#define POSSIBLE_FLAGS_PER_ROMAJI   R2K_ALLOW_LONG_O_WITH_H

/*
 * Table-entry builders.  Both turn the bare ROMAJI and KANA tokens into
 * string literals with the '#' operator.  Y() takes an explicit flag (the
 * entry only matches when that flag is enabled); X() uses R2K_NORMAL.
 */
#define Y(ROMAJI, KANA, FLAG) { {#ROMAJI}, {#KANA}, FLAG }
#define X(ROMAJI, KANA)       { {#ROMAJI}, {#KANA}, R2K_NORMAL }

/*
 * When true, a table search stops as soon as it reaches an entry whose
 * first romaji letter is greater than the first letter being looked up.
 */
#define LISTS_ARE_IN_ORDER  1 /* make true only if the romaji entries below
				 are in alphabetical order within each array */

/*
 * One-letter romaji table.
 *   romaji - the single lowercase letter (exactly one byte; there is no
 *            room for a terminating NUL).
 *   kana   - the replacement, consumed by romaji2kana() two bytes
 *            (high, low) at a time.
 *   flags  - the entry matches only when one of these bits is also set in
 *            the flags romaji2kana() searches with.
 *
 * NOTE(review): the KANA arguments below show up blank in this copy of the
 * file; they look like two-byte characters lost in an encoding conversion.
 * Restore them from a pristine copy before relying on this table.
 */
static struct {
    const unsigned char romaji[1];
    const unsigned char kana[2]; 
    const unsigned char flags;
} romaji1[ ] = {
    X( a ,  ),
    X( e ,  ),
    Y( h ,  , R2K_ALLOW_LONG_O_WITH_H),
    X( i ,  ),
    Y( m ,  , R2K_ALLOW_M_FOR_N),
    X( n ,  ),
    X( o ,  ),
    X( u ,  ),
};

/*
 * Two-letter romaji table.
 *   romaji - the two lowercase letters (exactly two bytes; there is no
 *            room for a terminating NUL).
 *   kana   - the replacement: up to four bytes, consumed by romaji2kana()
 *            two bytes (high, low) at a time until a zero byte or the end
 *            of the array.
 *   flags  - the entry matches only when one of these bits is also set in
 *            the flags romaji2kana() searches with.
 *
 * Entries must stay grouped in ascending order of their first letter while
 * LISTS_ARE_IN_ORDER is true (the search stops early on that basis).
 *
 * NOTE(review): the KANA arguments below are blank or garbled in this copy
 * of the file; they look like two-byte characters damaged by an encoding
 * conversion.  Restore them from a pristine copy before relying on this
 * table.
 */
static struct {
    const unsigned char romaji[2];
    const unsigned char kana[4]; 
    const unsigned char flags;
} romaji2[ ] = {
    X( ba ,    ),
    X( be ,    ),
    X( bi ,    ),
    X( bo ,    ),
    X( bu ,    ),
    X( ca ,    ),
    X( co ,    ),
    X( cu ,    ),
    X( da ,    ),
    X( de ,    ),
    X( di ,    ),
    X( do ,    ),         /* make  */
    X( du ,    ),
    X( fa , դ ),
    X( fe , դ ),
    X( fi , դ ),
    X( fo , դ ),
    X( fu ,    ),
    X( ga ,    ),
    X( ge ,    ),             /* sure  */
    X( gi ,    ),
    X( go ,    ),
    X( gu ,    ),
    X( ha ,    ),
    X( he ,    ),
    X( hi ,    ),
    X( ho ,    ),
    X( hu ,    ),
    X( ja ,  ),               /* to */
    X( je ,  ),
    X( ji ,    ),
    X( jo ,  ),
    X( ju ,  ),
    X( ka ,    ),
    X( ke ,    ),
    X( ki ,    ),                  /* keep */
    X( ko ,    ),
    X( ku ,    ),
    X( la ,    ),
    X( le ,    ),
    X( li ,    ),
    X( lo ,    ),
    X( lu ,    ),                   /* in */
    X( ma ,    ),
    X( me ,    ),
    X( mi ,    ),
    X( mo ,    ),
    X( mu ,    ),
    X( na ,    ),
    X( ne ,    ),                      /* alphabetical */
    X( ni ,    ),
    X( no ,    ),
    X( nu ,    ),
    X( pa ,    ),
    X( pe ,    ),
    X( pi ,    ),
    X( po ,    ),
    X( pu ,    ),
    X( ra ,    ),                            /* order */
    X( re ,    ),
    X( ri ,    ),
    X( ro ,    ),
    X( ru ,    ),
    X( sa ,    ),
    X( se ,    ),
    X( si ,    ),
    X( so ,    ),
    X( su ,    ),
    X( ta ,    ),
    X( te ,    ),
    X( ti ,    ),
    X( to ,    ),
    X( tu ,    ),
    X( va ,  ),
    X( ve ,  ),
    X( vi ,  ),
    X( vo ,  ),
    X( vu ,    ),
    X( wa ,    ),
    X( we ,    ),
    X( wi ,    ),
    X( wo ,    ),
    X( xa ,    ),
    X( xe ,    ),
    X( xi ,    ),
    X( xo ,    ),
    X( xu ,    ),
    X( ya ,    ),
    X( yo ,    ),
    X( yu ,    ),
    X( za ,    ),
    X( ze ,    ),
    X( zi ,    ),
    X( zo ,    ),
    X( zu ,    ),
};

/*
 * Three-letter romaji table.
 *   romaji - the three lowercase letters (exactly three bytes; there is
 *            no room for a terminating NUL).
 *   kana   - the replacement: up to four bytes, consumed by romaji2kana()
 *            two bytes (high, low) at a time until a zero byte or the end
 *            of the array.
 *   flags  - the entry matches only when one of these bits is also set in
 *            the flags romaji2kana() searches with.
 *
 * Entries must stay grouped in ascending order of their first letter while
 * LISTS_ARE_IN_ORDER is true (the search stops early on that basis).
 *
 * NOTE(review): the KANA arguments below are blank or garbled in this copy
 * of the file; they look like two-byte characters damaged by an encoding
 * conversion.  Restore them from a pristine copy before relying on this
 * table.
 */
static struct {
    const unsigned char romaji[3];
    const unsigned char kana[4]; 
    const unsigned char flags;
} romaji3[ ] = {
    X( bya , Ӥ ),
    X( byo , Ӥ ),
    X( byu , Ӥ ),
    X( cha ,  ),
    X( che ,  ),
    X( chi ,    ),
    X( cho ,  ),
    X( chu ,  ),
    X( dya , ¤ ),
    X( dye , ¤ ),
    X( dyi , Ǥ ),
    X( dyo , ¤ ),
    X( dyu , ¤ ),
    X( dzi ,    ),             /*  make */
    X( dzu ,    ),
    X( gya ,  ),
    X( gyo ,  ),
    X( gyu ,  ),                 /* sure */
    X( hya , Ҥ ),
    X( hyo , Ҥ ),
    X( hyu , Ҥ ),
    X( jya ,  ),
    X( jyo ,  ),                    /* to */
    X( jyu ,  ),
    X( kya ,  ),
    X( kyo ,  ),
    X( kyu ,  ),
    X( mya , ߤ ),                       /* keep */
    X( myo , ߤ ),
    X( myu , ߤ ),
    X( nya , ˤ ),
    X( nyo , ˤ ),
    X( nyu , ˤ ),                          /* in */
    X( pya , Ԥ ),
    X( pyo , Ԥ ),
    X( pyu , Ԥ ),
    X( rya ,  ),
    X( ryo ,  ),                              /* alphabetical */
    X( ryu ,  ),
    X( sha ,  ),
    X( shi ,    ),
    X( sho ,  ),                                /* order */
    X( shu ,  ),
    X( sya ,  ),
    X( syi ,    ),
    X( syo ,  ),
    X( syu ,  ),
    X( tsu ,    ),
    X( tya ,  ),
    X( tye ,  ),
    X( tyi , Ƥ ),
    X( tyo ,  ),
    X( tyu ,  ),
    X( tzu ,    ),
    X( xka ,    ),
    X( xke ,    ),
    X( xtu ,    ),
    X( xwa ,    ),
    X( xya ,    ),
    X( xyo ,    ),
    X( xyu ,    ),
    X( zya ,  ),
    X( zye ,  ),
    X( zyo ,  ),
    X( zyu ,  ),
};

/* Current conversion mode; starts out as R2K_MIXED_MODE. */
static unsigned mode = R2K_MIXED_MODE;

/*
 * Install NEWMODE as the conversion mode and hand back the mode that
 * was in effect before the call.
 */
unsigned r2k_setmode(unsigned newmode)
{
    const unsigned previous = mode;

    mode = newmode;

    return previous;
}

/*
 * Flags in effect until r2k_setflag() is called.  May be overridden by
 * defining R2K_DEFAULT_FLAGS before this point.  The expansion is
 * parenthesized so that it stays a single value when it is used next to
 * operators that bind tighter than '|' (such as '&' or '==').
 */
#ifndef R2K_DEFAULT_FLAGS
# define R2K_DEFAULT_FLAGS  (R2K_UNCONVERTED_PUNC_OK | \
			     R2K_NONASCII_OK         | \
			     R2K_ALLOW_LONG_O_WITH_H)
#endif

/* Current conversion flags; romaji2kana() reads them on every call. */
static unsigned flags = R2K_DEFAULT_FLAGS;

/*
 * Replace the whole set of conversion flags with NEW and hand back the
 * set that was in effect before the call.
 */
unsigned r2k_setflag(unsigned new)
{
    const unsigned previous = flags;

    flags = new;

    return previous;
}

/*
 * Character sets consulted by romaji2kana() for each ASCII input byte:
 *   romaji2kana_pass      - copied to the output unchanged
 *   romaji2kana_omit      - dropped from the output
 *   romaji2kana_longvowel - treated as a long-vowel marker
 */
static const char *romaji2kana_pass      = "\t ";
static const char *romaji2kana_omit      = "'";
static const char *romaji2kana_longvowel = "-^";

/*
 * Shared body of the three setters below: remember what *SLOT held,
 * overwrite it with REPLACEMENT unless that is null, and return the
 * remembered value.
 */
static const char *r2k_exchange_set(const char **slot, const char *replacement)
{
    const char *previous = *slot;

    if (replacement != 0)
	*slot = replacement;

    return previous;
}

/*
 * Set the pass-through characters.  A null NEW leaves the setting alone,
 * so it can be used to query.  Returns the previous set.  The string is
 * not copied, only the pointer is kept.
 */
const char *r2k_setpass(const char *new)
{
    return r2k_exchange_set(&romaji2kana_pass, new);
}

/*
 * Set the characters to be omitted.  A null NEW leaves the setting alone.
 * Returns the previous set.  The string is not copied.
 */
const char *r2k_setomit(const char *new)
{
    return r2k_exchange_set(&romaji2kana_omit, new);
}

/*
 * Set the long-vowel marker characters.  A null NEW leaves the setting
 * alone.  Returns the previous set.  The string is not copied.
 */
const char *r2k_setlongvowel(const char *new)
{
    return r2k_exchange_set(&romaji2kana_longvowel, new);
}


/*
 * Locate the first occurrence of C in the NUL-terminated string STR.
 * Returns a pointer to that byte, or 0 if STR is null or C does not occur.
 * Unlike index(3), the terminating NUL itself is never matched, so
 * looking for '\0' always yields 0.
 */
static __inline__ const char *
is_char_in_string(const char *str, const char c)
{
    const char *scan;

    if (str == 0)
	return 0;

    for (scan = str; *scan != '\0'; scan++)
    {
	if (*scan == c)
	    return scan;
    }

    return 0;
}

/*
 * Convert the romaji from R to R_END to kana in buffer K whose
 * length is K_BUF_LEN.  If INFO is non-null, it will be filled
 * with the number of bytes of K filled, and a flag to note if what
 * was written to K differs from what was read from R (the normal case
 * is that, of course, it does).
 *
 * If K is zero, nothing is written, and INFO will contain values as if
 * it was written.
 *
 * The number of characters that the thing didn't know what to do
 * with are returned.
 */
int romaji2kana(const unsigned char *r,
		const unsigned char *r_end,
		unsigned char *k,
		unsigned k_buf_len,
		struct romaji2kana_info *info)
{
    const unsigned char *orig_k = k;           /* start of output buffer */
    unsigned char last_hi = 0, last_lo = 0;    /* last converted character */
    unsigned badtranscount = 0;                /* value to be returned */
    unsigned modified = 0;
    unsigned searchflags = flags | R2K_NORMAL;

    if (r == 0 || *r == '\0' || (k && k_buf_len < 3))
	return R2K_BAD_ARGS;

    #ifdef ROMAJI_DEBUG
      outputf("romaji2kana(\"%.*s\"): ", r_end - r, r);
    #endif

    /* put the char into the output buffer, aborting on overflow */
    #define out(c)                                                           \
    macro_start {                                                            \
	unsigned char value = (c); /* ensure C evaluated exactly once */     \
	if (orig_k)                                                          \
	{                                                                    \
	    if (k_buf_len == 0)                                              \
		return R2K_OVERFLOW;                                         \
	    k_buf_len--;                                                     \
	    *k = value;                                                      \
	} /* ... else we're just noting the size that would be output */     \
	k++;                                                                 \
    } macro_end

    /* While there's still romaji left to be converted.... */
    while (r < r_end)
    {
	unsigned char bite[3];
	int bite_size, upper;

	/* if not an ascii character, just pass through */
	if (!isascii(r[0]))
	{
	    if ((searchflags & R2K_NONASCII_OK) == 0)
		badtranscount++;
	    out(last_hi = r[0]); /* output high byte */
	    out(last_lo = r[1]); /* output low byte */
	    r += 2;
	    continue;
	}

	/* if the ASCII is to be passed through, do so */
	if (is_char_in_string(romaji2kana_pass, r[0]))
	{
	    out(*r++);
	    last_hi = 0;
	    continue;
	}

	/* if the ASCII is to be omitted, do so */
	if (is_char_in_string(romaji2kana_omit, r[0]))
	{
	    modified = 1;
	    last_hi = 0;
	    r++;
	    continue;
	}

	/*
	 * If the character indicates a long vowel and we've just output
	 * a character that has a vowel sound we can continue, output the
	 * appropriate character.
	 */
	if (is_char_in_string(romaji2kana_longvowel, r[0]))
	{
	    /* If the last character was katakana, just output the
	     * dash character֡. Otherwise, the appropriate vowel. */

	    if (last_hi == KID_KATA_HI)
	    {
		out(((const unsigned char *)"")[0]);
		out(((const unsigned char *)"")[1]);
		r++; /* skip the longness marker */
		last_hi = 0; /* so we won't trigger this again */
		continue;
	    }

	    if (last_hi == KID_HIRA_HI)
	    {
		unsigned char low = 0;
		switch(KANA_ID(last_hi, last_lo) & KID_VSOUND)
		{
		  case KID_A: low = ((const unsigned char *)"")[1];break;
		  case KID_I: low = ((const unsigned char *)"")[1];break;
		  case KID_U: low = ((const unsigned char *)"")[1];break;
		  case KID_E: low = ((const unsigned char *)"")[1];break;
		  case KID_O: low = ((const unsigned char *)"")[1];break;
		}
		if (low) {
		    out(KID_HIRA_HI);
		    out(low);
		    r++; /* skip the longness marker */
		    last_hi = 0; /* so we won't trigger this again */
		    continue;
		}
	    }
	}

	/*
	 * If the first two characters are the same, and not a vowel,
	 * we'll make it a small TSU (unless it's "n" in which case there
	 * will be special handling).
	 */
	if (&r[1] < r_end && isalpha(r[1]) && isascii(r[1]) &&
	    !is_char_in_string("aeiouAEIOU", r[0]) &&
	    (isupper(r[0]) ? r[0] : tolower(r[0])) ==
	    (isupper(r[1]) ? r[1] : tolower(r[1])))
	{
	    if (r[0] == 'n' || r[0] == 'N')
	    {
		out(last_hi = isupper(r[0]) ? KID_KATA_HI : KID_HIRA_HI);
		out(last_lo = ((const unsigned char *)"")[1]);
	    } else {
		out(last_hi = isupper(r[0]) ? KID_KATA_HI : KID_HIRA_HI);
		out(last_lo = ((const unsigned char *)"")[1]);
	    }
	    r++;
	    continue;
	}

	/*
	 * We'll try to bite off as large a chunk of romaji that makes
	 * sense. We'll bite until:
	 *	hit the biggest reasonable bite, or
	 *      run out of romaji, or
	 *	run into a non-ASCII char, or
	 *	run into a non-alphabetic, or
	 *	are paying attention to case and hit a different case.
	 *
	 * We'll convert to lowercase for the checking, but UPPER will
	 * remember if it was upper case or not (if it matters).
	 */
	upper = isupper(r[0]);

	for (bite_size = 0 ;
	        bite_size < sizeof(bite) && &r[bite_size] < r_end &&
		isascii(r[bite_size]) && isalpha(r[bite_size]) &&
		(mode != R2K_MIXED_MODE || upper == isupper(r[bite_size]))
	     ; bite_size++)
	{
	    bite[bite_size] = isupper(r[bite_size])
		? tolower(r[bite_size]) : r[bite_size];
	}

        #ifdef ROMAJI_DEBUG
	printf("trying bite [%.*s]\n", bite_size, bite);
	#endif

	/*
	 * Used below to check the current bite against an array of
	 * romaji->kana pairs.
	 */
	#define check(ARRAY, TEST)                                           \
	macro_start {                                                        \
	    int i;                                                           \
	    for (i = 0; i < arraysize(ARRAY); i++)                           \
	    {                                                                \
		if ((TEST) && (ARRAY[i].flags & searchflags))                \
		{                                                            \
		    kana = ARRAY[i].kana;  /* we have a winner */            \
		    max_kana_len = sizeof(ARRAY[0].kana);                    \
		    r += sizeof(ARRAY[0].romaji);                            \
		    goto copy_matched_kana;                                  \
		}                                                            \
		if ((LISTS_ARE_IN_ORDER) && ARRAY[i].romaji[0] > bite[0])    \
			break;                                               \
	    }                                                                \
	} macro_end


	/*
	 * Now check the bite against the static database.  If we don't find a
	 * match, we'll reduce the size of the bite until we get a match or
	 * find we can't even match a single character.
	 */
	switch (bite_size)
	{
	    const unsigned char *kana; /* Betcha've never seen a variable */
	    unsigned max_kana_len;     /* in a place like this before. */

	  case 3: check(romaji3, (romaji3[i].romaji[0] == bite[0] &&
				  romaji3[i].romaji[1] == bite[1] &&
				  romaji3[i].romaji[2] == bite[2]));
	          /* FALLTHROUGH: Mmm, no match... try below */

	  case 2: check(romaji2, (romaji2[i].romaji[0] == bite[0] &&
				  romaji2[i].romaji[1] == bite[1]));
	          /* FALLTHROUGH: Mmm, no match... try below */

	  case 1: check(romaji1, romaji1[i].romaji[0] == bite[0]);
	          /* FALLTHROUGH: no match */

	  default:
	    /* ack, couldn't find any match... skip the char */
	    if ((searchflags & R2K_UNCONVERTED_PUNC_OK) == 0 || isalpha(*r))
		badtranscount++;
	    out(*r++);
	    last_hi = 0;
	    break;

	  copy_matched_kana:
	    while (max_kana_len && kana[0])
	    {
		unsigned hi = kana[0], lo = kana[1];
		assert(hi == KID_HIRA_HI || hi == KID_KATA_HI);
		if (hi == KID_HIRA_HI)
		{
		    if (mode == R2K_ALL_HIRA_MODE)
			hi = KID_HIRA_HI;
		    else if (mode == R2K_ALL_KATA_MODE)
			hi = KID_KATA_HI;
		    else /* mode is mixed */
			hi = upper ? KID_KATA_HI : KID_HIRA_HI;
		}
		last_hi = hi;
		last_lo = lo;
		modified = 1;
		out(hi);
		out(lo);

		max_kana_len -= 2;
		kana += 2;
	    }
	    break;
	}
    }
    out(0); /* final string-ending null */

    if (info != 0)
    {
	info->k_buf_used = (k - orig_k);
	info->modified = modified;
    }

    #ifdef ROMAJI_DEBUG
      outputf("/ bytes used: %d", k - orig_k);
      if (orig_k)
	outputf("/ [%.*s]", k - orig_k, orig_k);
      outchar('\n');
    #endif

    return badtranscount;
}

#ifdef TEST

/*
 * Stand-alone test driver (built only when TEST is defined): convert the
 * romaji given as the first command-line argument and print the result.
 * Exits with status 1 when no argument is given or the conversion
 * reports an error (a negative return), 0 otherwise.
 */
int main(int argc, char *argv[])
{
    unsigned char kana[100];
    const unsigned char *romaji;
    const unsigned char *romaji_end;
    int i;

    if (argc < 2)
    {
	outputf("usage: romaji2kana ROMAJI\n");
	return 1;
    }

    /* romaji2kana() wants an explicit end pointer: find the closing NUL */
    romaji = (const unsigned char *)argv[1];
    for (romaji_end = romaji; *romaji_end != '\0'; romaji_end++)
	continue;

    i = romaji2kana(romaji, romaji_end, kana, sizeof(kana), 0);

    if (i < 0)
    {
	outputf("[%s] return is %d\n", argv[1], i);
	return 1;
    }

    outputf("[%s] kana is [%s] ret %d\n", argv[1], kana, i);
    return 0;
}

#endif
