#include <stdio.h>
#include "unicode/utypes.h"
#include "unicode/putil.h"
#include "unicode/uclean.h"
#include "unicode/udata.h"
#include "cmemory.h"
#include "cstring.h"
#include "uarrsort.h"
#include "unewdata.h"
#include "uoptions.h"
#include "uparse.h"
/* Number of elements of a true array (not a pointer), as an int32_t. */
#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
/* Size in bytes of stringStore[]; lines are allocated upward from 0, words downward from this limit. */
#define STRING_STORE_SIZE 1000000
/* Size in bytes of groupStore[], the scratch buffer for one group's compressed strings. */
#define GROUP_STORE_SIZE 5000
/* A line's group number is its code value shifted right by GROUP_SHIFT, i.e. 32 lines per group. */
#define GROUP_SHIFT 5
#define LINES_PER_GROUP (1UL<<GROUP_SHIFT)
/* Mask that extracts a line's index within its group. */
#define GROUP_MASK (LINES_PER_GROUP-1)
/* Capacity of lines[]; compressLines() reuses the same array for the groups. */
#define MAX_LINE_COUNT 50000
/* Capacity of words[]. */
#define MAX_WORD_COUNT 20000
/* NOTE(review): MAX_GROUP_COUNT is not referenced anywhere in the visible source. */
#define MAX_GROUP_COUNT 5000
/* Name and type passed to udata_create() for the output file. */
#define DATA_NAME "unames"
#define DATA_TYPE "icu"
/* NOTE(review): VERSION_STRING is not referenced anywhere in the visible source. */
#define VERSION_STRING "unam"
/* Byte that addLine() inserts between the names stored for one code point. */
#define NAME_SEPARATOR_CHAR ';'
/* Output name used instead of DATA_NAME when Options.storeNames is FALSE. */
#define ISO_DATA_NAME "ucomment"
/*
 * Symbolic indexes into unicodeVersions[] below.
 * The enum constants and the table rows must stay in the same, ascending order:
 * findUnicodeVersion() returns a table index that the code compares against these constants.
 */
enum {
UNI_1_0,
UNI_1_1,
UNI_2_0,
UNI_3_0,
UNI_3_1,
UNI_3_2,
UNI_4_0,
UNI_4_0_1,
UNI_4_1,
UNI_5_0,
UNI_5_1,
UNI_VER_COUNT
};
/* Version numbers known to this tool, one 4-byte row per enum constant above. */
static const UVersionInfo
unicodeVersions[]={
{ 1, 0, 0, 0 },
{ 1, 1, 0, 0 },
{ 2, 0, 0, 0 },
{ 3, 0, 0, 0 },
{ 3, 1, 0, 0 },
{ 3, 2, 0, 0 },
{ 4, 0, 0, 0 },
{ 4, 0, 1, 0 },
{ 4, 1, 0, 0 },
{ 5, 0, 0, 0 },
{ 5, 1, 0, 0 }
};
/* Index of the Unicode version being processed; main() overwrites it from the -u option value. */
static int32_t ucdVersion=UNI_5_1;
/*
 * Map a 4-byte version number to an index into unicodeVersions[].
 *
 * Returns the index of the matching row; if there is no exact match, the index
 * of the closest older row. A version older than every row yields 0, and a
 * version newer than every row yields UNI_VER_COUNT.
 */
static int32_t
findUnicodeVersion(const UVersionInfo version) {
    int32_t idx=0;

    /* skip all rows that are strictly older than the requested version */
    while(idx<UNI_VER_COUNT && uprv_memcmp(version, unicodeVersions[idx], 4)>0) {
        ++idx;
    }

    /* stopped on a newer row (no exact match): step back to the older neighbor if one exists */
    if(idx>0 && idx<UNI_VER_COUNT && uprv_memcmp(version, unicodeVersions[idx], 4)<0) {
        return idx-1;
    }
    return idx;
}
/*
 * Header description handed to udata_create() in generateData().
 * The initializers are positional; the field order is defined by UDataInfo in
 * unicode/udata.h, which is not visible here.
 * The bytes 0x75 0x6e 0x61 0x6d are ASCII "unam".
 * main() overwrites dataInfo.dataVersion with the version given by the -u option.
 */
static UDataInfo dataInfo={
sizeof(UDataInfo),
0,
U_IS_BIG_ENDIAN,
U_CHARSET_FAMILY,
sizeof(UChar),
0,
{0x75, 0x6e, 0x61, 0x6d},
{1, 0, 0, 0},
{3, 0, 0, 0}
};
/* Output controls; main() sets all three from the command-line options. */
static UBool beVerbose=FALSE, beQuiet=FALSE, haveCopyright=TRUE;
/* Selects which fields of each input line lineFn() stores. */
typedef struct Options {
/* store field 1 of each input line */
UBool storeNames;
/* store field 10 of each input line */
UBool store10Names;
/* store field 11 of each input line */
UBool storeISOComments;
} Options;
/*
 * stringStore[] is shared by two allocators: allocLine() hands out bytes upward
 * from lineTop, allocWord() hands out bytes downward from wordBottom.
 * groupStore[] collects the compressed strings of the group currently being built.
 * lineLengths[] collects that group's line lengths, packed as nibbles.
 */
static uint8_t stringStore[STRING_STORE_SIZE],
groupStore[GROUP_STORE_SIZE],
lineLengths[LINES_PER_GROUP];
/*
 * lineTop: first free byte for allocLine().
 * groupBottom: value of lineTop when compressLines() starts, i.e. where the group strings begin.
 * wordBottom: lowest byte in use by allocWord().
 * lineLengthsTop: number of nibbles written to lineLengths[] for the current group.
 */
static uint32_t lineTop=0, groupBottom, wordBottom=STRING_STORE_SIZE, lineLengthsTop;
/*
 * One entry of lines[].
 * addLine() stores a code point with its names; addGroup() reuses the same
 * structure for a group, with code holding the group number.
 */
typedef struct {
/* code point (addLine) or group number (addGroup) */
uint32_t code;
/* addLine: length of the joined names; addGroup: length of the compressed strings without the length nibbles */
int16_t length;
/* start of this entry's bytes inside stringStore[] */
uint8_t *s;
} Line;
/* One dictionary word found in the names; entries live in words[]. */
typedef struct {
/* sort key: addWord() starts it at -(length+3), countWord() adds length-1 per occurrence */
int32_t weight;
/* number of occurrences counted by countWord() */
int16_t count;
/* length of the word in bytes */
int16_t length;
/* the word's bytes inside stringStore[], not NUL-terminated (addWord copies exactly length bytes) */
uint8_t *s;
} Word;
/* All stored lines, later overwritten in place with the groups by compressLines(). */
static Line lines[MAX_LINE_COUNT];
/* Dictionary of words collected by parseName(). */
static Word words[MAX_WORD_COUNT];
static uint32_t lineCount=0, wordCount=0;
/* Number of byte values used as lead bytes of two-byte tokens; set in compress(). */
static int16_t leadByteCount;
/* Upper bound for the number of lead bytes; tokens[] has 256 entries per possible lead byte. */
#define LEADBYTE_LIMIT 16
/*
 * Token table. During compression an entry is a word index, or
 * -1 for a byte that is stored literally (addLine() marks every byte that occurs in a line),
 * -2 for a lead byte of two-byte tokens (set in compress()).
 * generateData() replaces the word indexes with token string offsets before writing.
 */
static int16_t tokens[LEADBYTE_LIMIT*256];
static uint32_t tokenCount;
/* Forward declarations for the file-local functions defined below. */

/* set-up and parsing of the input file */
static void
init(void);
static void
parseDB(const char *filename, Options *options);
static void
parseName(char *name, int16_t length);
static int16_t
skipNoise(char *line, int16_t start, int16_t limit);
static int16_t
getWord(char *line, int16_t start, int16_t limit);
/* token assignment and compression of the lines into groups */
static void
compress(void);
static void
compressLines(void);
static int16_t
compressLine(uint8_t *s, int16_t length, int16_t *pGroupTop);
static int32_t
compareWords(const void *context, const void *word1, const void *word2);
/* output */
static void
generateData(const char *dataDir, Options *options);
static uint32_t
generateAlgorithmicData(UNewDataMemory *pData, Options *options);
/* lookups and bookkeeping for words, lines, groups and tokens */
static int16_t
findToken(uint8_t *s, int16_t length);
static Word *
findWord(char *s, int16_t length);
static Word *
addWord(char *s, int16_t length);
static void
countWord(Word *word);
static void
addLine(uint32_t code, char *names[], int16_t lengths[], int16_t count);
static void
addGroup(uint32_t groupMSB, uint8_t *strings, int16_t length);
static uint32_t
addToken(uint8_t *s, int16_t length);
static void
appendLineLength(int16_t length);
static void
appendLineLengthNibble(uint8_t nibble);
/* allocation inside stringStore[] */
static uint8_t *
allocLine(int32_t length);
static uint8_t *
allocWord(uint32_t length);
/*
 * Indexes into options[] below; main() reads options[CONSTANT], so the order
 * of these constants must match the order of the table rows.
 */
enum {
HELP_H,
HELP_QUESTION_MARK,
VERBOSE,
QUIET,
COPYRIGHT,
DESTDIR,
UNICODE,
UNICODE1_NAMES,
NO_ISO_COMMENTS,
ONLY_ISO_COMMENTS
};
/*
 * Command-line option table passed to u_parseArgs().
 * NOTE(review): the last two rows share the short-option character '\1';
 * presumably this means "no usable short form" - confirm against uoptions.h.
 */
static UOption options[]={
UOPTION_HELP_H,
UOPTION_HELP_QUESTION_MARK,
UOPTION_VERBOSE,
UOPTION_QUIET,
UOPTION_COPYRIGHT,
UOPTION_DESTDIR,
{ "unicode", NULL, NULL, NULL, 'u', UOPT_REQUIRES_ARG, 0 },
{ "unicode1-names", NULL, NULL, NULL, '1', UOPT_NO_ARG, 0 },
{ "no-iso-comments", NULL, NULL, NULL, '\1', UOPT_NO_ARG, 0 },
{ "only-iso-comments", NULL, NULL, NULL, '\1', UOPT_NO_ARG, 0 }
};
/*
 * Program entry point: parse the command line, read the input file, compress
 * the names and write the binary data file.
 *
 * Returns 0 on success. When the usage text is printed, returns
 * U_ILLEGAL_ARGUMENT_ERROR for a bad or missing argument and U_ZERO_ERROR when
 * help was requested. Fatal errors in the called stages terminate via exit().
 */
extern int
main(int argc, char* argv[]) {
UVersionInfo version;
/* defaults: store names and ISO comments, but not Unicode 1.0 names */
Options moreOptions={ TRUE, FALSE, TRUE };
UErrorCode errorCode = U_ZERO_ERROR;
U_MAIN_INIT_ARGS(argc, argv);
/* A U_FILE_ACCESS_ERROR from u_init() is tolerated; any other failure is fatal. */
u_init(&errorCode);
if (U_FAILURE(errorCode) && errorCode != U_FILE_ACCESS_ERROR) {
fprintf(stderr, "%s: can not initialize ICU. errorCode = %s\n",
argv[0], u_errorName(errorCode));
exit(1);
}
/* option defaults, used when the option is not given on the command line */
options[DESTDIR].value=u_getDataDirectory();
options[UNICODE].value="4.1";
argc=u_parseArgs(argc, argv, LENGTHOF(options), options);
if(argc<0) {
/* a negative result is the negated index of the offending argument */
fprintf(stderr,
"error in command line argument \"%s\"\n",
argv[-argc]);
} else if(argc<2) {
/* no filename left after option parsing: treat as an argument error */
argc=-1;
}
if(argc<0 || options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur) {
/*
 * NOTE(review): the usage text advertises standard input as the default,
 * but a missing filename always ends up here, so that default is never used.
 */
fprintf(stderr,
"Usage: %s [-1[+|-]] [-v[+|-]] [-c[+|-]] filename\n"
"\n"
"Read the UnicodeData.txt file and \n"
"create a binary file " DATA_NAME "." DATA_TYPE " with the character names\n"
"\n"
"\tfilename absolute path/filename for the Unicode database text file\n"
"\t\t(default: standard input)\n"
"\n",
argv[0]);
fprintf(stderr,
"Options:\n"
"\t-h or -? or --help this usage text\n"
"\t-v or --verbose verbose output\n"
"\t-q or --quiet no output\n"
"\t-c or --copyright include a copyright notice\n"
"\t-d or --destdir destination directory, followed by the path\n"
"\t-u or --unicode Unicode version, followed by the version like 3.0.0\n");
fprintf(stderr,
"\t-1 or --unicode1-names store Unicode 1.0 character names\n"
"\t --no-iso-comments do not store ISO comments\n"
"\t --only-iso-comments write ucomment.icu with only ISO comments\n");
return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
}
/* copy the option results into the file-level flags and the store options */
beVerbose=options[VERBOSE].doesOccur;
beQuiet=options[QUIET].doesOccur;
haveCopyright=options[COPYRIGHT].doesOccur;
moreOptions.store10Names=options[UNICODE1_NAMES].doesOccur;
moreOptions.storeISOComments=!options[NO_ISO_COMMENTS].doesOccur;
/* --only-iso-comments overrides the name options and --no-iso-comments */
if(options[ONLY_ISO_COMMENTS].doesOccur) {
moreOptions.storeNames=moreOptions.store10Names=FALSE;
moreOptions.storeISOComments=TRUE;
}
/* the requested version goes into the data header and selects the algorithmic ranges */
u_versionFromString(version, options[UNICODE].value);
uprv_memcpy(dataInfo.dataVersion, version, 4);
ucdVersion=findUnicodeVersion(version);
init();
/* argc>=2 always holds here (see above), so the "-" fallback is not reachable */
parseDB(argc>=2 ? argv[1] : "-", &moreOptions);
compress();
generateData(options[DESTDIR].value, &moreOptions);
u_cleanup();
return 0;
}
/* Reset the first 256 entries (the single-byte range) of tokens[] to 0. */
static void
init() {
    int slot=0;

    while(slot<256) {
        tokens[slot++]=0;
    }
}
/*
 * Trim a field in place.
 *
 * On input *pStart points to the start of the field and limit just behind it.
 * Leading whitespace is skipped with u_skipWhitespace(), trailing spaces and
 * tabs are cut off. On return *pStart points to the first remaining character.
 *
 * Returns the trimmed length.
 */
static int16_t
getName(char **pStart, char *limit) {
    char *first=(char *)u_skipWhitespace(*pStart);
    char *end=limit;

    for(; first<end; --end) {
        char last=end[-1];
        if(last!=' ' && last!='\t') {
            break;
        }
    }

    *pStart=first;
    return (int16_t)(end-first);
}
/*
 * Callback for u_parseDelimitedFile(): handle one line of the input file.
 *
 * context      the Options that select which fields are stored
 * fields       start/limit pointer pairs of the line's fields
 * fieldCount   number of fields (not used here)
 * pErrorCode   in/out error code; nothing is done if it already indicates a failure
 *
 * Depending on the options, field 1, field 10 and field 11 are trimmed,
 * their words are counted via parseName(), and the line is stored via addLine().
 * A value in field 1 or 10 that starts with '<' is treated as empty.
 * Lines without any stored text are ignored.
 *
 * The program exits with U_PARSE_ERROR for a non-character code point or for
 * code points that are not in strictly ascending order.
 */
static void U_CALLCONV
lineFn(void *context,
       char *fields[][2], int32_t fieldCount,
       UErrorCode *pErrorCode) {
    Options *storeOptions=(Options *)context;
    /*
     * Entries stay NULL (with length 0) for the fields that are not stored.
     * They must be initialized: all three pointers are read below when they
     * are passed to parseName() and addLine(), even if their length is 0.
     */
    char *names[3]={ NULL, NULL, NULL };
    int16_t lengths[3]={ 0, 0, 0 };
    /* code point of the previous stored line, for the ordering check */
    static uint32_t prevCode=0;
    uint32_t code=0;

    if(U_FAILURE(*pErrorCode)) {
        return;
    }

    /* field 0: the code point in hexadecimal */
    code=uprv_strtoul(fields[0][0], NULL, 16);

    if(storeOptions->storeNames) {
        names[0]=fields[1][0];
        lengths[0]=getName(names+0, fields[1][1]);
        if(names[0][0]=='<') {
            /* a value in angle brackets is not stored */
            lengths[0]=0;
        }
    }
    if(storeOptions->store10Names) {
        names[1]=fields[10][0];
        lengths[1]=getName(names+1, fields[10][1]);
        if(names[1][0]=='<') {
            lengths[1]=0;
        }
    }
    if(storeOptions->storeISOComments) {
        names[2]=fields[11][0];
        lengths[2]=getName(names+2, fields[11][1]);
    }

    /* nothing to store for this code point */
    if(lengths[0]+lengths[1]+lengths[2]==0) {
        return;
    }

    if(!U_IS_UNICODE_CHAR(code)) {
        fprintf(stderr, "gennames: error - properties for non-character code point U+%04lx\n",
                (unsigned long)code);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }
    if(code<=prevCode && code>0) {
        fprintf(stderr, "gennames: error - UnicodeData entries out of order, U+%04lx after U+%04lx\n",
                (unsigned long)code, (unsigned long)prevCode);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }
    prevCode=code;

    /* count the words of each stored text; a length of 0 makes parseName() a no-op */
    parseName(names[0], lengths[0]);
    parseName(names[1], lengths[1]);
    parseName(names[2], lengths[2]);

    if(storeOptions->storeNames) {
        addLine(code, names, lengths, 3);
    } else {
        /* without names, store only the third text as a single-entry line */
        addLine(code, names+2, lengths+2, 1);
    }
}
/*
 * Read the semicolon-delimited input file, calling lineFn() for each line,
 * then print the collected totals unless beQuiet is set.
 * Exits with the ICU error code if parsing fails.
 */
static void
parseDB(const char *filename, Options *storeOptions) {
    UErrorCode status=U_ZERO_ERROR;
    char *fieldBounds[15][2];

    u_parseDelimitedFile(filename, ';', fieldBounds, 15, lineFn, storeOptions, &status);
    if(U_FAILURE(status)) {
        fprintf(stderr, "gennames parse error: %s\n", u_errorName(status));
        exit(status);
    }

    if(beQuiet) {
        return;
    }
    printf("size of all names in the database: %lu\n",
           (unsigned long)lineTop);
    printf("number of named Unicode characters: %lu\n",
           (unsigned long)lineCount);
    printf("number of words in the dictionary from these names: %lu\n",
           (unsigned long)wordCount);
}
/*
 * Split name[0..length[ into words and count each occurrence in the dictionary.
 * Only words longer than one byte (including a trailing space or hyphen taken
 * by getWord()) are entered into words[].
 */
static void
parseName(char *name, int16_t length) {
    int16_t pos=0;

    while(pos<length) {
        int16_t afterNoise=skipNoise(name, pos, length);
        int16_t wordEnd, span;
        Word *entry;

        /* jump over non-word characters */
        if(afterNoise>pos) {
            pos=afterNoise;
        }
        if(pos==length) {
            break;
        }

        wordEnd=getWord(name, pos, length);
        span=(int16_t)(wordEnd-pos);
        if(span>1) {
            entry=findWord(name+pos, span);
            if(entry==NULL) {
                entry=addWord(name+pos, span);
            }
            countWord(entry);
        }
        pos=wordEnd;
    }
}
/*
 * Return whether c is one of the letters A-Z, a-z or the digits 0-9.
 * The letter tests are deliberately kept as three sub-ranges per case
 * (A-I, J-R, S-Z), exactly as before; do not merge them.
 */
static UBool U_INLINE
isWordChar(char c) {
    if('A'<=c && c<='I') {
        return TRUE;
    }
    if('J'<=c && c<='R') {
        return TRUE;
    }
    if('S'<=c && c<='Z') {
        return TRUE;
    }
    if('a'<=c && c<='i') {
        return TRUE;
    }
    if('j'<=c && c<='r') {
        return TRUE;
    }
    if('s'<=c && c<='z') {
        return TRUE;
    }
    return (UBool)('0'<=c && c<='9');
}
/*
 * Return the index of the first word character in line[start..limit[,
 * or limit if there is none.
 */
static int16_t
skipNoise(char *line, int16_t start, int16_t limit) {
    int16_t pos;

    for(pos=start; pos<limit; ++pos) {
        if(isWordChar(line[pos])) {
            break;
        }
    }
    return pos;
}
/*
 * Return the index just behind the word that starts at line[start].
 * The word is the run of word characters plus, if present, one directly
 * following space or hyphen. The result never exceeds limit.
 */
static int16_t
getWord(char *line, int16_t start, int16_t limit) {
    int16_t pos=start;

    for(; pos<limit && isWordChar(line[pos]); ++pos) {
    }

    /* a single trailing space or hyphen belongs to the word */
    if(pos<limit) {
        char next=line[pos];
        if(next==' ' || next=='-') {
            ++pos;
        }
    }
    return pos;
}
/*
 * Decide which dictionary words become tokens, fill tokens[] with word
 * indexes, then compress all lines via compressLines().
 *
 * On entry, tokens[b]==-1 for every byte value b that occurs literally in a
 * stored line (set by addLine()). Those byte values are never used as
 * single-byte tokens or lead bytes.
 * On exit, tokenCount, leadByteCount and wordCount are set, and tokens[]
 * holds word indexes, -1 (literal byte) or -2 (lead byte).
 */
static void
compress() {
uint32_t i, letterCount;
int16_t wordNumber;
UErrorCode errorCode;
/* sort by descending weight (see compareWords) */
/* NOTE(review): the error code from uprv_sortArray() is never checked here. */
errorCode=U_ZERO_ERROR;
uprv_sortArray(words, wordCount, sizeof(Word),
compareWords, NULL, FALSE, &errorCode);
/* drop the words at the end of the sorted array whose weight is below 1 */
while(wordCount>0 && words[wordCount-1].weight<1) {
--wordCount;
}
/* count the byte values at or above LEADBYTE_LIMIT that are used literally */
letterCount=0;
for(i=LEADBYTE_LIMIT; i<256; ++i) {
if(tokens[i]==-1) {
++letterCount;
}
}
if(!beQuiet) {
printf("number of letters used in the names: %d\n", (int)letterCount);
}
if(wordCount+letterCount<=256) {
/* everything fits into single bytes: hand out the non-literal byte values in order */
leadByteCount=0;
for(i=0, wordNumber=0; wordNumber<(int16_t)wordCount; ++i) {
if(tokens[i]!=-1) {
tokens[i]=wordNumber;
if(beVerbose) {
printf("tokens[0x%03x]: word%8ld \"%.*s\"\n",
(int)i, (long)words[wordNumber].weight,
words[wordNumber].length, words[wordNumber].s);
}
++wordNumber;
}
}
tokenCount=i;
} else {
/*
 * Two-byte tokens are needed. The first 256-letterCount words keep their
 * weight; every later word has its weight reduced by its occurrence count
 * before the tail of the array is sorted again.
 */
tokenCount=256-letterCount;
for(i=tokenCount; i<wordCount; ++i) {
words[i].weight-=words[i].count;
}
errorCode=U_ZERO_ERROR;
uprv_sortArray(words+tokenCount, wordCount-tokenCount, sizeof(Word),
compareWords, NULL, FALSE, &errorCode);
/* again drop the trailing words whose weight fell below 1 */
while(wordCount>0 && words[wordCount-1].weight<1) {
--wordCount;
}
/*
 * Estimate the token count assuming LEADBYTE_LIMIT-1 lead bytes, add room
 * for the skipped separator trail bytes (one per 255 tokens), and derive
 * the number of lead bytes from the result.
 */
tokenCount=wordCount+letterCount+(LEADBYTE_LIMIT-1);
tokenCount+=(tokenCount-256+254)/255;
leadByteCount=(int16_t)(tokenCount>>8);
if(leadByteCount<LEADBYTE_LIMIT) {
/* fewer lead bytes needed than assumed: give the unused ones back */
tokenCount-=(LEADBYTE_LIMIT-1)-leadByteCount;
} else {
/* too many words: clamp to the maximum table size and cut the word list */
leadByteCount=LEADBYTE_LIMIT-1;
tokenCount=LEADBYTE_LIMIT*256;
wordCount=tokenCount-letterCount-(LEADBYTE_LIMIT-1);
wordCount-=(tokenCount-256+254)/255;
}
/* token 0 always maps to word 0 */
tokens[0]=0;
if(beVerbose) {
printf("tokens[0x000]: word%8ld \"%.*s\"\n",
(long)words[0].weight,
words[0].length, words[0].s);
}
wordNumber=1;
/* byte values 1..leadByteCount are lead bytes */
for(i=1; (int16_t)i<=leadByteCount; ++i) {
tokens[i]=-2;
}
/* remaining single-byte values that are not literal bytes become single-byte tokens */
for(; i<256; ++i) {
if(tokens[i]!=-1) {
tokens[i]=wordNumber;
if(beVerbose) {
printf("tokens[0x%03x]: word%8ld \"%.*s\"\n",
(int)i, (long)words[wordNumber].weight,
words[wordNumber].length, words[wordNumber].s);
}
++wordNumber;
}
}
/* two-byte tokens for the remaining words; a trail byte equal to the separator is skipped */
for(; (uint32_t)wordNumber<wordCount; ++i) {
if((i&0xff)==NAME_SEPARATOR_CHAR) {
tokens[i]=-1;
} else {
tokens[i]=wordNumber;
if(beVerbose) {
printf("tokens[0x%03x]: word%8ld \"%.*s\"\n",
(int)i, (long)words[wordNumber].weight,
words[wordNumber].length, words[wordNumber].s);
}
++wordNumber;
}
}
tokenCount=i;
}
if(!beQuiet) {
printf("number of lead bytes: %d\n", leadByteCount);
printf("number of single-byte tokens: %lu\n",
(unsigned long)256-letterCount-leadByteCount);
printf("number of tokens: %lu\n", (unsigned long)tokenCount);
}
compressLines();
}
/*
 * Compress all stored lines and combine them into groups of LINES_PER_GROUP
 * consecutive code points.
 *
 * lines[] is reused for the result: lineCount is reset to 0 and addGroup()
 * appends one entry per group, so on return lineCount is the number of groups.
 * A group is only added after all of its lines have been read, so the group
 * entries never overtake the line entries that are still to be processed.
 */
static void
compressLines() {
Line *line=NULL;
/* outLine starts at 0xffffffff so that the first ++outLine wraps around to 0 */
uint32_t i=0, inLine, outLine=0xffffffff ,
groupMSB=0xffff, lineCount2;
int16_t groupTop=0;
/* the group strings are allocated behind everything stored so far */
groupBottom=lineTop;
lineCount2=lineCount;
lineCount=0;
while(i<lineCount2) {
line=lines+i++;
inLine=line->code;
if(inLine>>GROUP_SHIFT!=groupMSB) {
/* this line starts a new group: pad the previous group with zero lengths and store it */
while((++outLine&GROUP_MASK)!=0) {
appendLineLength(0);
}
if(groupTop>0) {
/* NOTE(review): this test runs only after compressLine() has already written to groupStore[]. */
if(groupTop>GROUP_STORE_SIZE) {
fprintf(stderr, "gennames: group store overflow\n");
exit(U_BUFFER_OVERFLOW_ERROR);
}
addGroup(groupMSB, groupStore, groupTop);
}
/* start the new group; outLine is set one before the group's first code point */
lineLengthsTop=0;
groupTop=0;
groupMSB=inLine>>GROUP_SHIFT;
outLine=(inLine&~GROUP_MASK)-1;
}
/* zero lengths for the code points without a line between the previous line and this one */
while(++outLine<inLine) {
appendLineLength(0);
}
appendLineLength(compressLine(line->s, line->length, &groupTop));
}
/* finish and store the last group */
if(line && groupMSB!=0xffff) {
while((++outLine&GROUP_MASK)!=0) {
appendLineLength(0);
}
if(groupTop>0) {
if(groupTop>GROUP_STORE_SIZE) {
fprintf(stderr, "gennames: group store overflow\n");
exit(U_BUFFER_OVERFLOW_ERROR);
}
addGroup(groupMSB, groupStore, groupTop);
}
}
if(!beQuiet) {
printf("number of groups: %lu\n", (unsigned long)lineCount);
}
}
static int16_t
compressLine(uint8_t *s, int16_t length, int16_t *pGroupTop) {
int16_t start, limit, token, groupTop=*pGroupTop;
start=0;
do {
limit=skipNoise((char *)s, start, length);
while(start<limit) {
groupStore[groupTop++]=s[start++];
}
if(start==length) {
break;
}
limit=getWord((char *)s, start, length);
if(limit-start==1) {
groupStore[groupTop++]=s[start++];
} else {
token=findToken(s+start, (int16_t)(limit-start));
if(token!=-1) {
if(token>0xff) {
groupStore[groupTop++]=(uint8_t)(token>>8);
}
groupStore[groupTop++]=(uint8_t)token;
start=limit;
} else {
while(start<limit) {
groupStore[groupTop++]=s[start++];
}
}
}
} while(start<length);
length=(int16_t)(groupTop-*pGroupTop);
*pGroupTop=groupTop;
return length;
}
static int32_t
compareWords(const void *context, const void *word1, const void *word2) {
return ((Word *)word2)->weight-((Word *)word1)->weight;
}
/*
 * Write the output data file into dataDir.
 *
 * Layout written after the data header:
 *   four 32-bit offsets (token strings, groups, group strings, algorithmic names),
 *   a 16-bit token count and the 16-bit token table,
 *   the token strings, padded to an even length,
 *   a 16-bit group count and three 16-bit words per group,
 *   the group strings, padded so that the next part starts at a multiple of 4,
 *   the algorithmic range data from generateAlgorithmicData().
 *
 * Exits on any error, including a mismatch between the calculated size and
 * the length reported by udata_finish().
 */
static void
generateData(const char *dataDir, Options *storeOptions) {
UNewDataMemory *pData;
UErrorCode errorCode=U_ZERO_ERROR;
uint16_t groupWords[3];
/* groupTop: end of the group strings; the token strings added below start here */
uint32_t i, groupTop=lineTop, offset, size,
tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset;
long dataLength;
int16_t token;
/* the output name depends on whether names or only ISO comments are stored */
pData=udata_create(dataDir,
DATA_TYPE, storeOptions->storeNames ? DATA_NAME : ISO_DATA_NAME,
&dataInfo,
haveCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode);
if(U_FAILURE(errorCode)) {
fprintf(stderr, "gennames: unable to create data memory, error %d\n", errorCode);
exit(errorCode);
}
/*
 * Replace each word index in tokens[] with the offset of a NUL-terminated
 * copy of the word, relative to the start of the token strings.
 * The -1 and -2 markers are kept.
 */
for(i=0; i<tokenCount; ++i) {
token=tokens[i];
if(token!=-1 && token!=-2) {
tokens[i]=(int16_t)(addToken(words[token].s, words[token].length)-groupTop);
}
}
/* round the token count up to a multiple of 256 and fill the added entries */
i=tokenCount;
tokenCount=(tokenCount+0xff)&~0xff;
if(!beQuiet && i<tokenCount) {
printf("number of tokens[] padding entries for data swapping: %lu\n", (unsigned long)(tokenCount-i));
}
for(; i<tokenCount; ++i) {
if((i&0xff)==NAME_SEPARATOR_CHAR) {
tokens[i]=-1;
} else {
tokens[i]=0;
}
}
/* offsets of the parts: 16 bytes of offsets, 2 bytes token count, 2 bytes per token, ... */
tokenStringOffset=4+4+4+4+2+2*tokenCount;
groupsOffset=(tokenStringOffset+(lineTop-groupTop)+1)&~1;
groupStringOffset=groupsOffset+2+6*lineCount;
algNamesOffset=(groupStringOffset+(groupTop-groupBottom)+3)&~3;
/* first pass with NULL only calculates the size of the algorithmic range data */
offset=generateAlgorithmicData(NULL, storeOptions);
size=algNamesOffset+offset;
if(!beQuiet) {
printf("size of the Unicode Names data:\n"
"total data length %lu, token strings %lu, compressed strings %lu, algorithmic names %lu\n",
(unsigned long)size, (unsigned long)(lineTop-groupTop),
(unsigned long)(groupTop-groupBottom), (unsigned long)offset);
}
udata_write32(pData, tokenStringOffset);
udata_write32(pData, groupsOffset);
udata_write32(pData, groupStringOffset);
udata_write32(pData, algNamesOffset);
udata_write16(pData, (uint16_t)tokenCount);
udata_writeBlock(pData, tokens, 2*tokenCount);
/* token strings, followed by one padding byte if their length is odd */
udata_writeBlock(pData, stringStore+groupTop, lineTop-groupTop);
if((lineTop-groupTop)&1) {
udata_writePadding(pData, 1);
}
/* after compressLines(), lineCount and lines[] describe the groups */
udata_write16(pData, (uint16_t)lineCount);
for(i=0; i<lineCount; ++i) {
/* per group: group number, then the 32-bit string offset as high and low 16-bit words */
groupWords[0]=(uint16_t)lines[i].code;
offset = (uint32_t)((lines[i].s - stringStore)-groupBottom);
groupWords[1]=(uint16_t)(offset>>16);
groupWords[2]=(uint16_t)(offset);
udata_writeBlock(pData, groupWords, 6);
}
udata_writeBlock(pData, stringStore+groupBottom, groupTop-groupBottom);
udata_writePadding(pData, algNamesOffset-(groupStringOffset+(groupTop-groupBottom)));
/* second pass actually writes the algorithmic range data */
generateAlgorithmicData(pData, storeOptions);
dataLength=udata_finish(pData, &errorCode);
if(U_FAILURE(errorCode)) {
fprintf(stderr, "gennames: error %d writing the output file\n", errorCode);
exit(errorCode);
}
/* consistency check of the size calculation above */
if(dataLength!=(long)size) {
fprintf(stderr, "gennames: data length %ld != calculated size %lu\n",
dataLength, (unsigned long)size);
exit(U_INTERNAL_PROGRAM_ERROR);
}
}
/*
 * Header of one algorithmic name range, written to the data file as a raw
 * block by generateAlgorithmicData(). The member order and types define the
 * binary layout and must not change.
 */
typedef struct AlgorithmicRange {
    /* first code point of the range */
    uint32_t rangeStart;
    /* last code point of the range */
    uint32_t rangeEnd;
    /* which algorithm applies to the range */
    uint8_t algorithmType;
    /* parameter of the algorithm */
    uint8_t algorithmVariant;
    /* size in bytes of this header plus the data that follows it */
    uint16_t rangeSize;
} AlgorithmicRange;
/*
 * Write, or only measure, the data for the algorithmic name ranges.
 *
 * pData         output stream, or NULL to only calculate the size
 * storeOptions  no ranges are produced when storeNames is FALSE
 *
 * Returns the number of bytes this data occupies when pData is NULL.
 * When pData is not NULL the data is written and the return value is 0,
 * because size is only increased in the measuring branches.
 * The writing and the measuring branches must be kept in step.
 */
static uint32_t
generateAlgorithmicData(UNewDataMemory *pData, Options *storeOptions) {
static char prefix[] = "CJK UNIFIED IDEOGRAPH-";
/* PREFIX_LENGTH counts the 22 characters plus the terminating NUL; PREFIX_LENGTH_4 is that rounded up to a multiple of 4 */
# define PREFIX_LENGTH 23
# define PREFIX_LENGTH_4 24
uint32_t countAlgRanges;
/* the three ideograph ranges use algorithm type 0 and differ in range and variant (4 or 5) */
static AlgorithmicRange cjkExtA={
0x3400, 0x4db5,
0, 4,
sizeof(AlgorithmicRange)+PREFIX_LENGTH_4
};
static AlgorithmicRange cjk={
0x4e00, 0x9fa5,
0, 4,
sizeof(AlgorithmicRange)+PREFIX_LENGTH_4
};
static AlgorithmicRange cjkExtB={
0x20000, 0x2a6d6,
0, 5,
sizeof(AlgorithmicRange)+PREFIX_LENGTH_4
};
/* NUL-separated strings: the syllable prefix, then three lists of name parts; some entries are empty */
static char jamo[]=
"HANGUL SYLLABLE \0"
"G\0GG\0N\0D\0DD\0R\0M\0B\0BB\0"
"S\0SS\0\0J\0JJ\0C\0K\0T\0P\0H\0"
"A\0AE\0YA\0YAE\0EO\0E\0YEO\0YE\0O\0"
"WA\0WAE\0OE\0YO\0U\0WEO\0WE\0WI\0"
"YU\0EU\0YI\0I\0"
"\0G\0GG\0GS\0N\0NJ\0NH\0D\0L\0LG\0LM\0"
"LB\0LS\0LT\0LP\0LH\0M\0B\0BS\0"
"S\0SS\0NG\0J\0C\0K\0T\0P\0H"
;
/* algorithm type 1, variant 3; followed in the file by the three factors (6 bytes) and jamo[] */
static AlgorithmicRange hangul={
0xac00, 0xd7a3,
1, 3,
sizeof(AlgorithmicRange)+6+sizeof(jamo)
};
static uint16_t hangulFactors[3]={
19, 21, 28
};
uint32_t size;
size=0;
/* the end of the main ideograph range depends on the Unicode version */
if(ucdVersion>=UNI_5_1) {
cjk.rangeEnd=0x9FC3;
} else if(ucdVersion>=UNI_4_1) {
cjk.rangeEnd=0x9FBB;
}
/* number of ranges: cjk and hangul always, cjkExtA from 3.0, cjkExtB from 3.1; none without names */
if(!storeOptions->storeNames) {
countAlgRanges=0;
} else if(ucdVersion>=UNI_3_1) {
countAlgRanges=4;
} else if(ucdVersion>=UNI_3_0) {
countAlgRanges=3;
} else {
countAlgRanges=2;
}
if(pData!=NULL) {
udata_write32(pData, countAlgRanges);
} else {
size+=4;
}
if(countAlgRanges==0) {
return size;
}
/* the ranges are written in ascending order of their start code points */
if(countAlgRanges>=3) {
if(pData!=NULL) {
udata_writeBlock(pData, &cjkExtA, sizeof(AlgorithmicRange));
udata_writeString(pData, prefix, PREFIX_LENGTH);
if(PREFIX_LENGTH<PREFIX_LENGTH_4) {
udata_writePadding(pData, PREFIX_LENGTH_4-PREFIX_LENGTH);
}
} else {
size+=sizeof(AlgorithmicRange)+PREFIX_LENGTH_4;
}
}
if(pData!=NULL) {
udata_writeBlock(pData, &cjk, sizeof(AlgorithmicRange));
udata_writeString(pData, prefix, PREFIX_LENGTH);
if(PREFIX_LENGTH<PREFIX_LENGTH_4) {
udata_writePadding(pData, PREFIX_LENGTH_4-PREFIX_LENGTH);
}
} else {
size+=sizeof(AlgorithmicRange)+PREFIX_LENGTH_4;
}
if(pData!=NULL) {
udata_writeBlock(pData, &hangul, sizeof(AlgorithmicRange));
udata_writeBlock(pData, hangulFactors, 6);
/* sizeof(jamo) includes the terminating NUL of the string literal */
udata_writeString(pData, jamo, sizeof(jamo));
} else {
size+=sizeof(AlgorithmicRange)+6+sizeof(jamo);
}
if(countAlgRanges>=4) {
if(pData!=NULL) {
udata_writeBlock(pData, &cjkExtB, sizeof(AlgorithmicRange));
udata_writeString(pData, prefix, PREFIX_LENGTH);
if(PREFIX_LENGTH<PREFIX_LENGTH_4) {
udata_writePadding(pData, PREFIX_LENGTH_4-PREFIX_LENGTH);
}
} else {
size+=sizeof(AlgorithmicRange)+PREFIX_LENGTH_4;
}
}
return size;
}
static int16_t
findToken(uint8_t *s, int16_t length) {
int16_t i, token;
for(i=0; i<(int16_t)tokenCount; ++i) {
token=tokens[i];
if(token>=0 && length==words[token].length && 0==uprv_memcmp(s, words[token].s, length)) {
return i;
}
}
return -1;
}
static Word *
findWord(char *s, int16_t length) {
uint32_t i;
for(i=0; i<wordCount; ++i) {
if(length==words[i].length && 0==uprv_memcmp(s, words[i].s, length)) {
return words+i;
}
}
return NULL;
}
static Word *
addWord(char *s, int16_t length) {
uint8_t *stringStart;
Word *word;
if(wordCount==MAX_WORD_COUNT) {
fprintf(stderr, "gennames: too many words\n");
exit(U_BUFFER_OVERFLOW_ERROR);
}
stringStart=allocWord(length);
uprv_memcpy(stringStart, s, length);
word=words+wordCount;
word->weight=-(length+1+2);
word->count=0;
word->length=length;
word->s=stringStart;
++wordCount;
return word;
}
/* Register one more occurrence of word: bump its count and add length-1 to its weight. */
static void
countWord(Word *word) {
    word->count++;
    word->weight+=word->length-1;
}
static void
addLine(uint32_t code, char *names[], int16_t lengths[], int16_t count) {
uint8_t *stringStart;
Line *line;
int16_t i, length;
if(lineCount==MAX_LINE_COUNT) {
fprintf(stderr, "gennames: too many lines\n");
exit(U_BUFFER_OVERFLOW_ERROR);
}
while(count>0 && lengths[count-1]==0) {
--count;
}
if(count==0) {
return;
}
i=count;
length=count-1;
while(i>0) {
length+=lengths[--i];
}
stringStart=allocLine(length);
length=0;
for(i=0; i<count; ++i) {
if(i>0) {
stringStart[length++]=NAME_SEPARATOR_CHAR;
}
if(lengths[i]>0) {
uprv_memcpy(stringStart+length, names[i], lengths[i]);
length+=lengths[i];
}
}
line=lines+lineCount;
line->code=code;
line->length=length;
line->s=stringStart;
++lineCount;
while(length>0) {
tokens[stringStart[--length]]=-1;
}
}
static void
addGroup(uint32_t groupMSB, uint8_t *strings, int16_t length) {
uint8_t *stringStart;
Line *line;
if(lineCount==MAX_LINE_COUNT) {
fprintf(stderr, "gennames: too many groups\n");
exit(U_BUFFER_OVERFLOW_ERROR);
}
lineLengthsTop=(lineLengthsTop+1)/2;
stringStart=allocLine(lineLengthsTop+length);
uprv_memcpy(stringStart, lineLengths, lineLengthsTop);
uprv_memcpy(stringStart+lineLengthsTop, strings, length);
line=lines+lineCount;
line->code=groupMSB;
line->length=length;
line->s=stringStart;
++lineCount;
}
static uint32_t
addToken(uint8_t *s, int16_t length) {
uint8_t *stringStart;
stringStart=allocLine(length+1);
uprv_memcpy(stringStart, s, length);
stringStart[length]=0;
return (uint32_t)(stringStart - stringStore);
}
/*
 * Append the length of one compressed line to the current group's length list.
 * Lengths below 12 take one nibble. Lengths 12..75 take two: the first is
 * 12..15 and carries the upper bits of length-12, the second the lower four bits.
 * Exits with U_BUFFER_OVERFLOW_ERROR for lengths of 76 or more.
 */
static void
appendLineLength(int16_t length) {
    if(length>=76) {
        fprintf(stderr, "gennames: compressed line too long\n");
        exit(U_BUFFER_OVERFLOW_ERROR);
    }

    if(length<12) {
        appendLineLengthNibble((uint8_t)length);
        return;
    }

    length-=12;
    appendLineLengthNibble((uint8_t)((length>>4)|12));
    appendLineLengthNibble((uint8_t)length);
}
/*
 * Append the low four bits of nibble to lineLengths[].
 * Nibbles at even positions go into the upper half of a byte and clear its
 * lower half; nibbles at odd positions fill the lower half.
 */
static void
appendLineLengthNibble(uint8_t nibble) {
    uint32_t byteIndex=lineLengthsTop/2;

    if(lineLengthsTop&1) {
        lineLengths[byteIndex]|=nibble&0xf;
    } else {
        lineLengths[byteIndex]=(uint8_t)(nibble<<4);
    }
    ++lineLengthsTop;
}
/*
 * Reserve length bytes at the lower, upward-growing end of stringStore[].
 * Returns the start of the reserved bytes.
 * Exits with U_MEMORY_ALLOCATION_ERROR if they would reach into the word area.
 */
static uint8_t *
allocLine(int32_t length) {
    uint8_t *start=stringStore+lineTop;
    uint32_t newTop=lineTop+length;

    if(newTop>wordBottom) {
        fprintf(stderr, "gennames: out of memory\n");
        exit(U_MEMORY_ALLOCATION_ERROR);
    }
    lineTop=newTop;
    return start;
}
/*
 * Reserve length bytes at the upper, downward-growing end of stringStore[].
 * Returns the start of the reserved bytes.
 * Exits with U_MEMORY_ALLOCATION_ERROR if the request does not fit between
 * the line area and the words stored so far.
 */
static uint8_t *
allocWord(uint32_t length) {
    /*
     * Compare length against wordBottom first: wordBottom-length is unsigned
     * and would wrap around to a huge value for length>wordBottom, which
     * would make the collision test with lineTop pass by mistake.
     */
    if(length>wordBottom || lineTop>wordBottom-length) {
        fprintf(stderr, "gennames: out of memory\n");
        exit(U_MEMORY_ALLOCATION_ERROR);
    }
    wordBottom-=length;
    return stringStore+wordBottom;
}