genprops.c   [plain text]


/*
*******************************************************************************
*
*   Copyright (C) 1999-2005, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  genprops.c
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 1999dec08
*   created by: Markus W. Scherer
*
*   This program reads several of the Unicode character database text files,
*   parses them, and extracts most of the properties for each character.
*   It then writes a binary file containing the properties
*   that is designed to be used directly for random-access to
*   the properties of each Unicode character.
*/

#include <stdio.h>
#include <stdlib.h>
#include "unicode/utypes.h"
#include "unicode/uchar.h"
#include "unicode/putil.h"
#include "unicode/uclean.h"
#include "cmemory.h"
#include "cstring.h"
#include "unewdata.h"
#include "uoptions.h"
#include "uparse.h"
#include "uprops.h"
#include "propsvec.h"

U_CDECL_BEGIN
#include "genprops.h"
U_CDECL_END

#define LENGTHOF(array) (sizeof(array)/sizeof((array)[0]))

UBool beVerbose=FALSE, haveCopyright=TRUE;

/* prototypes --------------------------------------------------------------- */

static void
parseDB(const char *filename, UErrorCode *pErrorCode);

/* -------------------------------------------------------------------------- */

enum
{
    HELP_H,
    HELP_QUESTION_MARK,
    VERBOSE,
    COPYRIGHT,
    DESTDIR,
    SOURCEDIR,
    UNICODE_VERSION,
    ICUDATADIR,
    CSOURCE
};

/* Keep these values in sync with the above enums */
static UOption options[]={
    UOPTION_HELP_H,
    UOPTION_HELP_QUESTION_MARK,
    UOPTION_VERBOSE,
    UOPTION_COPYRIGHT,
    UOPTION_DESTDIR,
    UOPTION_SOURCEDIR,
    UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG),
    UOPTION_ICUDATADIR,
    UOPTION_DEF("csource", 'C', UOPT_NO_ARG)
};

extern int
main(int argc, char* argv[]) {
    char filename[300];
    const char *srcDir=NULL, *destDir=NULL, *suffix=NULL;
    char *basename=NULL;
    UErrorCode errorCode=U_ZERO_ERROR;

    U_MAIN_INIT_ARGS(argc, argv);

    /* preset then read command line options */
    options[DESTDIR].value=u_getDataDirectory();
    options[SOURCEDIR].value="";
    options[UNICODE_VERSION].value="";
    options[ICUDATADIR].value=u_getDataDirectory();
    argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);

    /* error handling, printing usage message */
    if(argc<0) {
        fprintf(stderr,
            "error in command line argument \"%s\"\n",
            argv[-argc]);
    }
    if(argc<0 || options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur) {
        /*
         * Broken into chucks because the C89 standard says the minimum
         * required supported string length is 509 bytes.
         */
        fprintf(stderr,
            "Usage: %s [-options] [suffix]\n"
            "\n"
            "read the UnicodeData.txt file and other Unicode properties files and\n"
            "create a binary file " DATA_NAME "." DATA_TYPE " with the character properties\n"
            "\n",
            argv[0]);
        fprintf(stderr,
            "Options:\n"
            "\t-h or -? or --help  this usage text\n"
            "\t-v or --verbose     verbose output\n"
            "\t-c or --copyright   include a copyright notice\n"
            "\t-u or --unicode     Unicode version, followed by the version like 3.0.0\n"
            "\t-C or --csource     generate a .c source file rather than the .icu binary\n");
        fprintf(stderr,
            "\t-d or --destdir     destination directory, followed by the path\n"
            "\t-s or --sourcedir   source directory, followed by the path\n"
            "\t-i or --icudatadir  directory for locating any needed intermediate data files,\n"
            "\t                    followed by path, defaults to %s\n"
            "\tsuffix              suffix that is to be appended with a '-'\n"
            "\t                    to the source file basenames before opening;\n"
            "\t                    'genprops new' will read UnicodeData-new.txt etc.\n",
            u_getDataDirectory());
        return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
    }

    /* get the options values */
    beVerbose=options[VERBOSE].doesOccur;
    haveCopyright=options[COPYRIGHT].doesOccur;
    srcDir=options[SOURCEDIR].value;
    destDir=options[DESTDIR].value;

    if(argc>=2) {
        suffix=argv[1];
    } else {
        suffix=NULL;
    }

    if(options[UNICODE_VERSION].doesOccur) {
        setUnicodeVersion(options[UNICODE_VERSION].value);
    }
    /* else use the default dataVersion in store.c */

    if (options[ICUDATADIR].doesOccur) {
        u_setDataDirectory(options[ICUDATADIR].value);
    }

    /* prepare the filename beginning with the source dir */
    uprv_strcpy(filename, srcDir);
    basename=filename+uprv_strlen(filename);
    if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) {
        *basename++=U_FILE_SEP_CHAR;
    }

    /* initialize */
    initStore();

    /* process UnicodeData.txt */
    writeUCDFilename(basename, "UnicodeData", suffix);
    parseDB(filename, &errorCode);

    /* process additional properties files */
    *basename=0;
    generateAdditionalProperties(filename, suffix, &errorCode);

    /* process parsed data */
    if(U_SUCCESS(errorCode)) {
        /* write the properties data file */
        generateData(destDir, options[CSOURCE].doesOccur);
    }

    exitStore();
    u_cleanup();
    return errorCode;
}

U_CFUNC void
writeUCDFilename(char *basename, const char *filename, const char *suffix) {
    int32_t length=(int32_t)uprv_strlen(filename);
    uprv_strcpy(basename, filename);
    if(suffix!=NULL) {
        basename[length++]='-';
        uprv_strcpy(basename+length, suffix);
        length+=(int32_t)uprv_strlen(suffix);
    }
    uprv_strcpy(basename+length, ".txt");
}

U_CFUNC UBool
isToken(const char *token, const char *s) {
    const char *z;
    int32_t j;

    s=u_skipWhitespace(s);
    for(j=0;; ++j) {
        if(token[j]!=0) {
            if(s[j]!=token[j]) {
                break;
            }
        } else {
            z=u_skipWhitespace(s+j);
            if(*z==';' || *z==0) {
                return TRUE;
            } else {
                break;
            }
        }
    }

    return FALSE;
}

U_CFUNC int32_t
getTokenIndex(const char *const tokens[], int32_t countTokens, const char *s) {
    const char *t, *z;
    int32_t i, j;

    s=u_skipWhitespace(s);
    for(i=0; i<countTokens; ++i) {
        t=tokens[i];
        if(t!=NULL) {
            for(j=0;; ++j) {
                if(t[j]!=0) {
                    if(s[j]!=t[j]) {
                        break;
                    }
                } else {
                    z=u_skipWhitespace(s+j);
                    if(*z==';' || *z==0 || *z=='#' || *z=='\r' || *z=='\n') {
                        return i;
                    } else {
                        break;
                    }
                }
            }
        }
    }
    return -1;
}

/* parser for UnicodeData.txt ----------------------------------------------- */

/* general categories */
const char *const
genCategoryNames[U_CHAR_CATEGORY_COUNT]={
    "Cn",
    "Lu", "Ll", "Lt", "Lm", "Lo", "Mn", "Me",
    "Mc", "Nd", "Nl", "No",
    "Zs", "Zl", "Zp",
    "Cc", "Cf", "Co", "Cs",
    "Pd", "Ps", "Pe", "Pc", "Po",
    "Sm", "Sc", "Sk", "So",
    "Pi", "Pf"
};

const char *const
decompositionTypeNames[U_DT_COUNT]={
    NULL,
    NULL,
    "compat",
    "circle",
    "final",
    "font",
    "fraction",
    "initial",
    "isolated",
    "medial",
    "narrow",
    "noBreak",
    "small",
    "square",
    "sub",
    "super",
    "vertical",
    "wide"
};

static struct {
    uint32_t first, last, props;
    char name[80];
} unicodeAreas[32];

static int32_t unicodeAreaIndex=0;

static void U_CALLCONV
unicodeDataLineFn(void *context,
                  char *fields[][2], int32_t fieldCount,
                  UErrorCode *pErrorCode) {
    Props p;
    char *end;
    static uint32_t prevCode=0;
    uint32_t value;
    int32_t i;

    /* reset the properties */
    uprv_memset(&p, 0, sizeof(Props));

    /* get the character code, field 0 */
    p.code=(uint32_t)uprv_strtoul(fields[0][0], &end, 16);
    if(end<=fields[0][0] || end!=fields[0][1]) {
        fprintf(stderr, "genprops: syntax error in field 0 at %s\n", fields[0][0]);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    /* get general category, field 2 */
    i=getTokenIndex(genCategoryNames, U_CHAR_CATEGORY_COUNT, fields[2][0]);
    if(i>=0) {
        p.generalCategory=(uint8_t)i;
    } else {
        fprintf(stderr, "genprops: unknown general category \"%s\" at code 0x%lx\n",
            fields[2][0], (unsigned long)p.code);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    /* get decomposition type, field 5 */
    if(fields[5][0]<fields[5][1]) {
        /* there is some decomposition */
        if(*fields[5][0]!='<') {
            /* canonical */
            i=U_DT_CANONICAL;
        } else {
            /* get compatibility type */
            end=fields[5][0]+1;
            while(end<fields[5][1] && *end!='>') {
                ++end;
            }
            *end='#';
            i=getTokenIndex(decompositionTypeNames, U_DT_COUNT, fields[5][0]+1);
            if(i<0) {
                fprintf(stderr, "genprops: unknown decomposition type \"%s\" at code 0x%lx\n",
                    fields[5][0], (unsigned long)p.code);
                *pErrorCode=U_PARSE_ERROR;
                exit(U_PARSE_ERROR);
            }
        }
        if(!upvec_setValue(pv, p.code, p.code+1, 2, (uint32_t)i, UPROPS_DT_MASK, pErrorCode)) {
            fprintf(stderr, "genprops error: unable to set decomposition type: %s\n", u_errorName(*pErrorCode));
            exit(*pErrorCode);
        }
    }

    /* decimal digit value, field 6 */
    if(fields[6][0]<fields[6][1]) {
        value=(uint32_t)uprv_strtoul(fields[6][0], &end, 10);
        if(end!=fields[6][1] || value>0x7fff) {
            fprintf(stderr, "genprops: syntax error in field 6 at code 0x%lx\n",
                (unsigned long)p.code);
            *pErrorCode=U_PARSE_ERROR;
            exit(U_PARSE_ERROR);
        }
        p.numericValue=(int32_t)value;
        p.numericType=1;
    }

    /* digit value, field 7 */
    if(fields[7][0]<fields[7][1]) {
        value=(uint32_t)uprv_strtoul(fields[7][0], &end, 10);
        if(end!=fields[7][1] || value>0x7fff) {
            fprintf(stderr, "genprops: syntax error in field 7 at code 0x%lx\n",
                (unsigned long)p.code);
            *pErrorCode=U_PARSE_ERROR;
            exit(U_PARSE_ERROR);
        }
        if(p.numericType==0) {
            p.numericValue=(int32_t)value;
            p.numericType=2;
        } else if((int32_t)value!=p.numericValue) {
            fprintf(stderr, "genprops error: numeric values in fields 6 & 7 different at code 0x%lx\n",
                (unsigned long)p.code);
            *pErrorCode=U_PARSE_ERROR;
            exit(U_PARSE_ERROR);
        }
    }

    /* numeric value, field 8 */
    if(fields[8][0]<fields[8][1]) {
        char *s=fields[8][0];
        UBool isNegative;

        /* get a possible minus sign */
        if(*s=='-') {
            isNegative=TRUE;
            ++s;
        } else {
            isNegative=FALSE;
        }

        value=(uint32_t)uprv_strtoul(s, &end, 10);
        if(value>0 && *end=='/') {
            /* field 8 may contain a fractional value, get the denominator */
            if(p.numericType>0) {
                fprintf(stderr, "genprops error: numeric values in fields 6..8 different at code 0x%lx\n",
                    (unsigned long)p.code);
                *pErrorCode=U_PARSE_ERROR;
                exit(U_PARSE_ERROR);
            }

            p.denominator=(uint32_t)uprv_strtoul(end+1, &end, 10);
            if(p.denominator==0) {
                fprintf(stderr, "genprops: denominator is 0 in field 8 at code 0x%lx\n",
                    (unsigned long)p.code);
                *pErrorCode=U_PARSE_ERROR;
                exit(U_PARSE_ERROR);
            }
        }
        if(end!=fields[8][1] || value>0x7fffffff) {
            fprintf(stderr, "genprops: syntax error in field 8 at code 0x%lx\n",
                (unsigned long)p.code);
            *pErrorCode=U_PARSE_ERROR;
            exit(U_PARSE_ERROR);
        }

        if(p.numericType==0) {
            if(isNegative) {
                p.numericValue=-(int32_t)value;
            } else {
                p.numericValue=(int32_t)value;
            }
            p.numericType=3;
        } else if((int32_t)value!=p.numericValue) {
            fprintf(stderr, "genprops error: numeric values in fields 6..8 different at code 0x%lx\n",
                (unsigned long)p.code);
            *pErrorCode=U_PARSE_ERROR;
            exit(U_PARSE_ERROR);
        }
    }

    value=makeProps(&p);

    if(*fields[1][0]=='<') {
        /* first or last entry of a Unicode area */
        size_t length=fields[1][1]-fields[1][0];

        if(length<9) {
            /* name too short for an area name */
        } else if(0==uprv_memcmp(", First>", fields[1][1]-8, 8)) {
            /* set the current area */
            if(unicodeAreas[unicodeAreaIndex].first==0xffffffff) {
                length-=9;
                unicodeAreas[unicodeAreaIndex].first=p.code;
                unicodeAreas[unicodeAreaIndex].props=value;
                uprv_memcpy(unicodeAreas[unicodeAreaIndex].name, fields[1][0]+1, length);
                unicodeAreas[unicodeAreaIndex].name[length]=0;
            } else {
                /* error: a previous area is incomplete */
                fprintf(stderr, "genprops: error - area \"%s\" is incomplete\n", unicodeAreas[unicodeAreaIndex].name);
                *pErrorCode=U_PARSE_ERROR;
                exit(U_PARSE_ERROR);
            }
            return;
        } else if(0==uprv_memcmp(", Last>", fields[1][1]-7, 7)) {
            /* check that the current area matches, and complete it with the last code point */
            length-=8;
            if( unicodeAreas[unicodeAreaIndex].props==value &&
                0==uprv_memcmp(unicodeAreas[unicodeAreaIndex].name, fields[1][0]+1, length) &&
                unicodeAreas[unicodeAreaIndex].name[length]==0 &&
                unicodeAreas[unicodeAreaIndex].first<p.code
            ) {
                unicodeAreas[unicodeAreaIndex].last=p.code;
                if(beVerbose) {
                    printf("Unicode area U+%04lx..U+%04lx \"%s\"\n",
                        (unsigned long)unicodeAreas[unicodeAreaIndex].first,
                        (unsigned long)unicodeAreas[unicodeAreaIndex].last,
                        unicodeAreas[unicodeAreaIndex].name);
                }
                unicodeAreas[++unicodeAreaIndex].first=0xffffffff;
            } else {
                /* error: different properties between first & last, different area name, first>=last */
                fprintf(stderr, "genprops: error - Last of area \"%s\" is incorrect\n", unicodeAreas[unicodeAreaIndex].name);
                *pErrorCode=U_PARSE_ERROR;
                exit(U_PARSE_ERROR);
            }
            return;
        } else {
            /* not an area name */
        }
    }

    /* check for non-character code points */
    if((p.code&0xfffe)==0xfffe || (uint32_t)(p.code-0xfdd0)<0x20) {
        fprintf(stderr, "genprops: error - properties for non-character code point U+%04lx\n",
                (unsigned long)p.code);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    /* check that the code points (p.code) are in ascending order */
    if(p.code<=prevCode && p.code>0) {
        fprintf(stderr, "genprops: error - UnicodeData entries out of order, U+%04lx after U+%04lx\n",
                (unsigned long)p.code, (unsigned long)prevCode);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }
    prevCode=p.code;

    /* properties for a single code point */
    addProps(p.code, value);
}

/* set repeated properties for the areas */
static void
repeatAreaProps() {
    uint32_t puaProps;
    int32_t i;
    UBool hasPlane15PUA, hasPlane16PUA;
    UErrorCode errorCode;

    /*
     * UnicodeData.txt before 3.0.1 did not contain the PUAs on
     * planes 15 and 16.
     * If that is the case, then we add them here, using the properties
     * from the BMP PUA.
     */
    puaProps=0;
    hasPlane15PUA=hasPlane16PUA=FALSE;

    for(i=0; i<unicodeAreaIndex; ++i) {
        repeatProps(unicodeAreas[i].first,
                    unicodeAreas[i].last,
                    unicodeAreas[i].props);
        if(unicodeAreas[i].first==0xe000) {
            puaProps=unicodeAreas[i].props;
        } else if(unicodeAreas[i].first==0xf0000) {
            hasPlane15PUA=TRUE;
        } else if(unicodeAreas[i].first==0x100000) {
            hasPlane16PUA=TRUE;
        }
    }

    if(puaProps!=0) {
        if(!hasPlane15PUA) {
            repeatProps(0xf0000, 0xffffd, puaProps);
        }
        if(!hasPlane16PUA) {
            repeatProps(0x100000, 0x10fffd, puaProps);
        }
    }

    /* Hangul have canonical decompositions */
    errorCode=U_ZERO_ERROR;
    if(!upvec_setValue(pv, 0xac00, 0xd7a4, 2, (uint32_t)U_DT_CANONICAL, UPROPS_DT_MASK, &errorCode)) {
        fprintf(stderr, "genprops error: unable to set decomposition type: %s\n", u_errorName(errorCode));
        exit(errorCode);
    }
}

static void
parseDB(const char *filename, UErrorCode *pErrorCode) {
    char *fields[15][2];

    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return;
    }

    /* while unicodeAreas[unicodeAreaIndex] is unused, set its first to a bogus value */
    unicodeAreas[0].first=0xffffffff;

    u_parseDelimitedFile(filename, ';', fields, 15, unicodeDataLineFn, NULL, pErrorCode);

    if(unicodeAreas[unicodeAreaIndex].first!=0xffffffff) {
        fprintf(stderr, "genprops: error - the last area \"%s\" from U+%04lx is incomplete\n",
            unicodeAreas[unicodeAreaIndex].name,
            (unsigned long)unicodeAreas[unicodeAreaIndex].first);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    repeatAreaProps();

    if(U_FAILURE(*pErrorCode)) {
        return;
    }
}

/*
 * Hey, Emacs, please set the following:
 *
 * Local Variables:
 * indent-tabs-mode: nil
 * End:
 *
 */