#include <stdio.h>
#include "unicode/putil.h"
#include "unicode/ucnv_err.h"
#include "ucnv_bld.h"
#include "ucnv_imp.h"
#include "ucnv_cnv.h"
#include "cstring.h"
#include "cmemory.h"
#include "uinvchar.h"
#include "filestrm.h"
#include "toolutil.h"
#include "uoptions.h"
#include "unicode/udata.h"
#include "unewdata.h"
#include "uparse.h"
#include "ucm.h"
#include "makeconv.h"
#include "genmbcs.h"
/*
 * Number of elements of a true array (never a pointer), as an int32_t.
 * The complete expansion is wrapped in parentheses so that the cast cannot
 * interact with operators at the point of use (e.g. "sizeof LENGTHOF(a)").
 */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
/* Set to 1 to compile in the extra progress output in main(). */
#define DEBUG 0
/*
 * Working state for one converter while it is being built: the parsed
 * mapping file, the table builders, and the header structures.
 */
typedef struct ConvData {
    UCMFile *ucm;                       /* parsed .ucm file; released with ucm_close() */
    NewConverter *cnvData;              /* base table builder, NULL when there is none */
    NewConverter *extData;              /* extension table builder, NULL when there is none */
    UConverterSharedData sharedData;    /* its staticData pointer refers to the member below */
    UConverterStaticData staticData;    /* first block written into the .cnv file */
} ConvData;
/*
 * Put *data into its initial state: every byte zero, both structSize
 * fields filled in, and sharedData linked to the embedded staticData.
 */
static void
initConvData(ConvData *data) {
    UConverterSharedData *shared=&data->sharedData;
    UConverterStaticData *fixed=&data->staticData;

    uprv_memset(data, 0, sizeof(*data));

    fixed->structSize=sizeof(*fixed);
    shared->structSize=sizeof(*shared);
    shared->staticData=fixed;
}
/*
 * Release what a ConvData owns: both table builders (through their own
 * close callbacks) and the UCMFile. Each pointer is reset to NULL
 * afterwards. A NULL data pointer is accepted and ignored.
 */
static void
cleanupConvData(ConvData *data) {
    NewConverter *table;

    if(data==NULL) {
        return;
    }

    table=data->cnvData;
    if(table!=NULL) {
        table->close(table);
        data->cnvData=NULL;
    }

    table=data->extData;
    if(table!=NULL) {
        table->close(table);
        data->extData=NULL;
    }

    ucm_close(data->ucm);
    data->ucm=NULL;
}
/* Per-type prototype static data, defined outside this file; indexed by conversion type. */
extern const UConverterStaticData * ucnv_converterStaticData[UCNV_NUMBER_OF_SUPPORTED_CONVERTER_TYPES];

/* Command-line switches. They have external linkage and are assigned in main(). */
UBool VERBOSE = FALSE;
UBool SMALL = FALSE;
UBool IGNORE_SISO_CHECK = FALSE;
UBool haveCopyright=TRUE;

/* Forward declarations for the helpers defined further down in this file. */
static void createConverter(ConvData *data, const char* converterName, UErrorCode *pErrorCode);
static void writeConverterData(ConvData *data, const char *cnvName, const char *cnvDir, UErrorCode *status);
/*
 * Data header descriptor passed to udata_create() for every .cnv file.
 * The initializer is positional, so the order of the entries must follow
 * the member order of UDataInfo. main() overwrites dataVersion with the
 * running ICU version before any file is written.
 */
static UDataInfo dataInfo={
sizeof(UDataInfo),
0,
U_IS_BIG_ENDIAN,
U_CHARSET_FAMILY,
sizeof(UChar),
0,
{0x63, 0x6e, 0x76, 0x74}, /* the ASCII bytes "cnvt", spelled as numbers rather than character literals */
{6, 2, 0, 0}, /* presumably formatVersion (main() prints its first two elements as the tool version) -- confirm against UDataInfo */
{0, 0, 0, 0} /* presumably dataVersion, replaced at startup in main() -- confirm against UDataInfo */
};
/*
 * Write one .cnv file.
 *
 * The file consists of the UConverterStaticData header followed by the
 * output of the base table builder and/or the extension table builder,
 * whichever of data->cnvData / data->extData is present.
 *
 * @param data    converter to write
 * @param cnvName file name without the "cnv" suffix
 * @param cnvDir  destination directory handed to udata_create()
 * @param status  in/out error code; nothing is done if it already holds a
 *                failure. Set to U_INTERNAL_PROGRAM_ERROR when the number of
 *                bytes reported by udata_finish() differs from the number
 *                counted here.
 */
static void
writeConverterData(ConvData *data, const char *cnvName, const char *cnvDir, UErrorCode *status)
{
    UNewDataMemory *mem = NULL;
    uint32_t sz2;
    uint32_t size = 0;
    int32_t tableType;

    if(U_FAILURE(*status))
    {
        return;
    }

    /* Tell each writer which tables the file will contain. */
    tableType=TABLE_NONE;
    if(data->cnvData!=NULL) {
        tableType|=TABLE_BASE;
    }
    if(data->extData!=NULL) {
        tableType|=TABLE_EXT;
    }

    mem = udata_create(cnvDir, "cnv", cnvName, &dataInfo, haveCopyright ? U_COPYRIGHT_STRING : NULL, status);
    if(U_FAILURE(*status))
    {
        fprintf(stderr, "Couldn't create the udata %s.%s: %s\n",
                cnvName,
                "cnv",
                u_errorName(*status));
        return;
    }

    if(VERBOSE)
    {
        printf("- Opened udata %s.%s\n", cnvName, "cnv");
    }

    /* The static data always comes first. */
    udata_writeBlock(mem, &data->staticData, sizeof(UConverterStaticData));
    size += sizeof(UConverterStaticData);

    if(tableType&TABLE_BASE) {
        size += data->cnvData->write(data->cnvData, &data->staticData, mem, tableType);
    }
    if(tableType&TABLE_EXT) {
        size += data->extData->write(data->extData, &data->staticData, mem, tableType);
    }

    sz2 = udata_finish(mem, status);
    if(U_FAILURE(*status))
    {
        /*
         * Keep the error reported by udata_finish() instead of replacing it
         * with the size-mismatch code below; main() prints it.
         */
        return;
    }

    if(size != sz2)
    {
        fprintf(stderr, "error: wrote %u bytes to the .cnv file but counted %u bytes\n",
                (unsigned int)sz2, (unsigned int)size);
        *status=U_INTERNAL_PROGRAM_ERROR;
    }

    if(VERBOSE)
    {
        printf("- Wrote %u bytes to the udata.\n", (unsigned int)sz2);
    }
}
/*
 * Indexes into the options[] array. The enumerators are consecutive and
 * start at zero, so their order has to match the order of that array.
 */
enum {
    OPT_HELP_H = 0,
    OPT_HELP_QUESTION_MARK,
    OPT_COPYRIGHT,
    OPT_VERSION,
    OPT_DESTDIR,
    OPT_VERBOSE,
    OPT_SMALL,
    OPT_IGNORE_SISO_CHECK,
    OPT_COUNT   /* one past the last option index */
};
/*
 * Command-line option table handed to u_parseArgs() in main().
 * main() addresses the entries with the OPT_* enumerators, so the entries
 * must stay in the same order as that enum.
 */
static UOption options[]={
UOPTION_HELP_H,
UOPTION_HELP_QUESTION_MARK,
UOPTION_COPYRIGHT,
UOPTION_VERSION,
UOPTION_DESTDIR,
UOPTION_VERBOSE,
/* The next two are documented in the usage text only in their long form;
 * the '\1' short-name value presumably means "no usable short option" -- confirm in uoptions.h. */
{ "small", NULL, NULL, NULL, '\1', UOPT_NO_ARG, 0 },
{ "ignore-siso-check", NULL, NULL, NULL, '\1', UOPT_NO_ARG, 0 }
};
int main(int argc, char* argv[])
{
ConvData data;
UErrorCode err = U_ZERO_ERROR, localError;
char outFileName[UCNV_MAX_FULL_FILE_NAME_LENGTH];
const char* destdir, *arg;
size_t destdirlen;
char* dot = NULL, *outBasename;
char cnvName[UCNV_MAX_FULL_FILE_NAME_LENGTH];
char cnvNameWithPkg[UCNV_MAX_FULL_FILE_NAME_LENGTH];
UVersionInfo icuVersion;
UBool printFilename;
err = U_ZERO_ERROR;
U_MAIN_INIT_ARGS(argc, argv);
u_getVersion(icuVersion);
uprv_memcpy(&dataInfo.dataVersion, &icuVersion, sizeof(UVersionInfo));
options[OPT_DESTDIR].value=u_getDataDirectory();
argc=u_parseArgs(argc, argv, LENGTHOF(options), options);
if(argc<0) {
fprintf(stderr,
"error in command line argument \"%s\"\n",
argv[-argc]);
} else if(argc<2) {
argc=-1;
}
if(argc<0 || options[OPT_HELP_H].doesOccur || options[OPT_HELP_QUESTION_MARK].doesOccur) {
FILE *stdfile=argc<0 ? stderr : stdout;
fprintf(stdfile,
"usage: %s [-options] files...\n"
"\tread .ucm codepage mapping files and write .cnv files\n"
"options:\n"
"\t-h or -? or --help this usage text\n"
"\t-V or --version show a version message\n"
"\t-c or --copyright include a copyright notice\n"
"\t-d or --destdir destination directory, followed by the path\n"
"\t-v or --verbose Turn on verbose output\n",
argv[0]);
fprintf(stdfile,
"\t --small Generate smaller .cnv files. They will be\n"
"\t significantly smaller but may not be compatible with\n"
"\t older versions of ICU and will require heap memory\n"
"\t allocation when loaded.\n"
"\t --ignore-siso-check Use SI/SO other than 0xf/0xe.\n");
return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
}
if(options[OPT_VERSION].doesOccur) {
printf("makeconv version %u.%u, ICU tool to read .ucm codepage mapping files and write .cnv files\n",
dataInfo.formatVersion[0], dataInfo.formatVersion[1]);
printf("%s\n", U_COPYRIGHT_STRING);
exit(0);
}
haveCopyright = options[OPT_COPYRIGHT].doesOccur;
destdir = options[OPT_DESTDIR].value;
VERBOSE = options[OPT_VERBOSE].doesOccur;
SMALL = options[OPT_SMALL].doesOccur;
if (options[OPT_IGNORE_SISO_CHECK].doesOccur) {
IGNORE_SISO_CHECK = TRUE;
}
if (destdir != NULL && *destdir != 0) {
uprv_strcpy(outFileName, destdir);
destdirlen = uprv_strlen(destdir);
outBasename = outFileName + destdirlen;
if (*(outBasename - 1) != U_FILE_SEP_CHAR) {
*outBasename++ = U_FILE_SEP_CHAR;
++destdirlen;
}
} else {
destdirlen = 0;
outBasename = outFileName;
}
#if DEBUG
{
int i;
printf("makeconv: processing %d files...\n", argc - 1);
for(i=1; i<argc; ++i) {
printf("%s ", argv[i]);
}
printf("\n");
fflush(stdout);
}
#endif
err = U_ZERO_ERROR;
printFilename = (UBool) (argc > 2 || VERBOSE);
for (++argv; --argc; ++argv)
{
arg = getLongPathname(*argv);
if(strlen(arg) >= UCNV_MAX_FULL_FILE_NAME_LENGTH)
{
fprintf(stderr, "%s\n", u_errorName(U_BUFFER_OVERFLOW_ERROR));
return U_BUFFER_OVERFLOW_ERROR;
}
if (destdirlen != 0)
{
const char *basename;
basename = findBasename(arg);
uprv_strcpy(outBasename, basename);
}
else
{
uprv_strcpy(outFileName, arg);
}
dot = uprv_strrchr(outBasename, '.');
if (dot)
{
*dot = '\0';
}
uprv_strcpy(cnvName, outBasename);
uprv_strcat(outBasename, CONVERTER_FILE_EXTENSION);
#if DEBUG
printf("makeconv: processing %s ...\n", arg);
fflush(stdout);
#endif
localError = U_ZERO_ERROR;
initConvData(&data);
createConverter(&data, arg, &localError);
if (U_FAILURE(localError))
{
fprintf(stderr, "Error creating converter for \"%s\" file for \"%s\" (%s)\n", outFileName, arg,
u_errorName(localError));
if(U_SUCCESS(err)) {
err = localError;
}
}
else
{
char *p;
p = strrchr(cnvName, U_FILE_SEP_CHAR);
if(p == NULL)
{
p = strrchr(cnvName, U_FILE_ALT_SEP_CHAR);
if(p == NULL)
{
p=cnvName;
}
}
else
{
p++;
}
if(uprv_stricmp(p,data.staticData.name))
{
fprintf(stderr, "Warning: %s%s claims to be '%s'\n",
cnvName, CONVERTER_FILE_EXTENSION,
data.staticData.name);
}
uprv_strcpy((char*)data.staticData.name, cnvName);
if(!uprv_isInvariantString((char*)data.staticData.name, -1)) {
fprintf(stderr,
"Error: A converter name must contain only invariant characters.\n"
"%s is not a valid converter name.\n",
data.staticData.name);
if(U_SUCCESS(err)) {
err = U_INVALID_TABLE_FORMAT;
}
}
uprv_strcpy(cnvNameWithPkg, cnvName);
localError = U_ZERO_ERROR;
writeConverterData(&data, cnvNameWithPkg, destdir, &localError);
if(U_FAILURE(localError))
{
fprintf(stderr, "Error writing \"%s\" file for \"%s\" (%s)\n", outFileName, arg,
u_errorName(localError));
if(U_SUCCESS(err)) {
err = localError;
}
}
else if (printFilename)
{
puts(outBasename);
}
}
fflush(stdout);
fflush(stderr);
cleanupConvData(&data);
}
return err;
}
/*
 * Derive platform and CCSID from a converter name.
 *
 * A name that begins with "ibm" in any letter case, optionally followed by
 * one '-', yields UCNV_IBM and the decimal number that follows. Any other
 * name yields UCNV_UNKNOWN and a CCSID of 0.
 *
 * @param name      NUL-terminated converter name
 * @param pPlatform receives the platform value
 * @param pCCSID    receives the CCSID
 */
static void
getPlatformAndCCSIDFromName(const char *name, int8_t *pPlatform, int32_t *pCCSID) {
    const char *digits;

    /* Reject early; the tests stop at the first mismatching character. */
    if( (name[0]!='i' && name[0]!='I') ||
        (name[1]!='b' && name[1]!='B') ||
        (name[2]!='m' && name[2]!='M')
    ) {
        *pPlatform=UCNV_UNKNOWN;
        *pCCSID=0;
        return;
    }

    digits=name+3;
    if(*digits=='-') {
        ++digits;
    }

    *pPlatform=UCNV_IBM;
    *pCCSID=(int32_t)uprv_strtoul(digits, NULL, 10);
}
/*
 * Read the header part of a .ucm file, up to and including the "CHARMAP"
 * line, and fill in data->staticData.
 *
 * Lines are first offered to ucm_parseHeaderLine(); when that returns
 * nonzero the line needs no further work here. Otherwise the <key>/value
 * pair is examined for code_set_name, subchar and subchar1.
 *
 * After the header, byte lengths and the conversion type are taken from the
 * state table. For a file without a base name, fields that are still zero
 * are filled from the prototype for the conversion type, if one exists.
 *
 * @param data          converter being built; data->ucm must be open
 * @param convFile      open stream positioned at the start of the file
 * @param converterName not used by this function
 * @param pErrorCode    in/out error code; set to U_INVALID_TABLE_FORMAT for
 *                      malformed or over-long header values
 */
static void
readHeader(ConvData *data,
           FileStream* convFile,
           const char* converterName,
           UErrorCode *pErrorCode) {
    char line[1024];
    char *s, *key, *value;
    const UConverterStaticData *prototype;
    UConverterStaticData *staticData;

    (void)converterName;

    if(U_FAILURE(*pErrorCode)) {
        return;
    }

    staticData=&data->staticData;
    staticData->platform=UCNV_IBM;
    staticData->subCharLen=0;

    while(T_FileStream_readLine(convFile, line, sizeof(line))) {
        if(ucm_parseHeaderLine(data->ucm, line, &key, &value)) {
            continue;
        }

        /* The mapping table starts here; the header is complete. */
        if(uprv_strcmp(line, "CHARMAP")==0) {
            break;
        }

        if(uprv_strcmp(key, "code_set_name")==0) {
            if(*value!=0) {
                /*
                 * value comes from a line of up to sizeof(line) bytes but the
                 * name field is a much smaller fixed-size array.
                 */
                if(uprv_strlen(value)>=sizeof(staticData->name)) {
                    fprintf(stderr, "error: <code_set_name> %s is too long\n", value);
                    *pErrorCode=U_INVALID_TABLE_FORMAT;
                    return;
                }
                uprv_strcpy((char *)staticData->name, value);
                getPlatformAndCCSIDFromName(value, &staticData->platform, &staticData->codepage);
            }
        } else if(uprv_strcmp(key, "subchar")==0) {
            uint8_t bytes[UCNV_EXT_MAX_BYTES];
            int8_t length;

            s=value;
            length=ucm_parseBytes(bytes, line, (const char **)&s);
            /* 1..4 bytes, and nothing may follow them on the line. */
            if(1<=length && length<=4 && *s==0) {
                staticData->subCharLen=length;
                uprv_memcpy(staticData->subChar, bytes, length);
            } else {
                fprintf(stderr, "error: illegal <subchar> %s\n", value);
                *pErrorCode=U_INVALID_TABLE_FORMAT;
                return;
            }
        } else if(uprv_strcmp(key, "subchar1")==0) {
            uint8_t bytes[UCNV_EXT_MAX_BYTES];

            s=value;
            /* Exactly one byte, and nothing may follow it on the line. */
            if(1==ucm_parseBytes(bytes, line, (const char **)&s) && *s==0) {
                staticData->subChar1=bytes[0];
            } else {
                fprintf(stderr, "error: illegal <subchar1> %s\n", value);
                *pErrorCode=U_INVALID_TABLE_FORMAT;
                return;
            }
        }
    }

    /* Take over what the state table parsing collected. */
    staticData->maxBytesPerChar=(int8_t)data->ucm->states.maxCharLength;
    staticData->minBytesPerChar=(int8_t)data->ucm->states.minCharLength;
    staticData->conversionType=data->ucm->states.conversionType;

    if(staticData->conversionType==UCNV_UNSUPPORTED_CONVERTER) {
        fprintf(stderr, "ucm error: missing conversion type (<uconv_class>)\n");
        *pErrorCode=U_INVALID_TABLE_FORMAT;
        return;
    }

    /* Only files without a base name get defaults from the prototype. */
    if(data->ucm->baseName[0]==0) {
        prototype=ucnv_converterStaticData[staticData->conversionType];
        if(prototype!=NULL) {
            if(staticData->name[0]==0) {
                uprv_strcpy((char *)staticData->name, prototype->name);
            }

            if(staticData->codepage==0) {
                staticData->codepage=prototype->codepage;
            }

            if(staticData->platform==0) {
                staticData->platform=prototype->platform;
            }

            if(staticData->minBytesPerChar==0) {
                staticData->minBytesPerChar=prototype->minBytesPerChar;
            }

            if(staticData->maxBytesPerChar==0) {
                staticData->maxBytesPerChar=prototype->maxBytesPerChar;
            }

            if(staticData->subCharLen==0) {
                staticData->subCharLen=prototype->subCharLen;
                if(prototype->subCharLen>0) {
                    uprv_memcpy(staticData->subChar, prototype->subChar, prototype->subCharLen);
                }
            }
        }
    }

    /* A negative outputType means "not set"; derive it from the maximum length. */
    if(data->ucm->states.outputType<0) {
        data->ucm->states.outputType=(int8_t)data->ucm->states.maxCharLength-1;
    }

    /* subchar1 is accepted only for MBCS or EBCDIC_STATEFUL with single-byte minimum. */
    if( staticData->subChar1!=0 &&
            (staticData->minBytesPerChar>1 ||
                (staticData->conversionType!=UCNV_MBCS &&
                 staticData->conversionType!=UCNV_EBCDIC_STATEFUL))
    ) {
        fprintf(stderr, "error: <subchar1> defined for a type other than MBCS or EBCDIC_STATEFUL\n");
        *pErrorCode=U_INVALID_TABLE_FORMAT;
    }
}
/*
 * Open and parse one .ucm file into data->ucm and data->staticData.
 *
 * The header is read first. A file without a base name is a base table
 * file: its state table is processed and used while reading the mappings.
 * After the first mapping table an optional second "CHARMAP" section is
 * read as well; any other trailing text only produces a message.
 *
 * The input stream is closed on every path that opened it, including the
 * error returns.
 *
 * @param data          receives the parsed file; data->ucm is newly opened
 * @param converterName path of the .ucm file
 * @param pErrorCode    in/out error code; U_FILE_ACCESS_ERROR when the file
 *                      cannot be opened, U_INVALID_TABLE_FORMAT for mixed
 *                      use of mapping precision flags
 * @return TRUE if the file is a base table file; FALSE for an extension-only
 *         file and on every early error return
 */
static UBool
readFile(ConvData *data, const char* converterName,
         UErrorCode *pErrorCode) {
    char line[1024];
    char *end;
    FileStream *convFile;

    UCMStates *baseStates;
    UBool dataIsBase;

    if(U_FAILURE(*pErrorCode)) {
        return FALSE;
    }

    data->ucm=ucm_open();

    convFile=T_FileStream_open(converterName, "r");
    if(convFile==NULL) {
        *pErrorCode=U_FILE_ACCESS_ERROR;
        return FALSE;
    }

    readHeader(data, convFile, converterName, pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        T_FileStream_close(convFile);
        return FALSE;
    }

    if(data->ucm->baseName[0]==0) {
        dataIsBase=TRUE;
        baseStates=&data->ucm->states;
        ucm_processStates(baseStates, IGNORE_SISO_CHECK);
    } else {
        dataIsBase=FALSE;
        baseStates=NULL;
    }

    /* First (or only) mapping table. */
    ucm_readTable(data->ucm, convFile, dataIsBase, baseStates, pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        T_FileStream_close(convFile);
        return FALSE;
    }

    /* Look at the first significant line after the table, if there is one. */
    while(T_FileStream_readLine(convFile, line, sizeof(line))) {
        /* Strip trailing line ends, spaces and tabs. */
        end=uprv_strchr(line, 0);
        while(line<end &&
              (*(end-1)=='\n' || *(end-1)=='\r' || *(end-1)==' ' || *(end-1)=='\t')) {
            --end;
        }
        *end=0;

        /* Comment lines and blank lines are not significant. */
        if(line[0]=='#' || u_skipWhitespace(line)==end) {
            continue;
        }

        if(0==uprv_strcmp(line, "CHARMAP")) {
            /* Second table; it is never read as the base table. */
            ucm_readTable(data->ucm, convFile, FALSE, baseStates, pErrorCode);
        } else {
            fprintf(stderr, "unexpected text after the base mapping table\n");
        }
        break;
    }

    T_FileStream_close(convFile);

    if(data->ucm->base->flagsType==UCM_FLAGS_MIXED || data->ucm->ext->flagsType==UCM_FLAGS_MIXED) {
        fprintf(stderr, "error: some entries have the mapping precision (with '|'), some do not\n");
        *pErrorCode=U_INVALID_TABLE_FORMAT;
    }

    return dataIsBase;
}
/*
 * Parse a .ucm file and build the in-memory tables for it.
 *
 * Base table file: an MBCS builder is opened, the substitution bytes are
 * validated against it, the base mappings are added, and whatever
 * ucm_moveMappings() transfers to the extension table is added to an
 * extension builder if that table ends up non-empty.
 *
 * Extension-only file: the file named by the base name (same directory,
 * ".ucm" suffix) is read as well. Its state table is used for validation,
 * missing length and substitution settings are inherited from it, and the
 * fallback flags are derived from its mappings. Base mappings that
 * MBCSOkForBaseFromUnicode() rejects are moved into the extension table
 * before the extension builder receives it. The temporary base data is
 * released on every path that read it.
 *
 * @param data          receives the result; (re)initialized here. The caller
 *                      releases it with cleanupConvData(), also on failure.
 * @param converterName path of the .ucm file
 * @param pErrorCode    in/out error code; nothing is done if it already
 *                      holds a failure
 */
static void
createConverter(ConvData *data, const char *converterName, UErrorCode *pErrorCode) {
    ConvData baseData;
    UBool dataIsBase;

    UConverterStaticData *staticData;
    UCMStates *states, *baseStates;

    if(U_FAILURE(*pErrorCode)) {
        return;
    }

    initConvData(data);

    dataIsBase=readFile(data, converterName, pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        return;
    }

    staticData=&data->staticData;
    states=&data->ucm->states;

    if(dataIsBase) {
        data->cnvData=MBCSOpen(data->ucm);
        if(data->cnvData==NULL) {
            *pErrorCode=U_MEMORY_ALLOCATION_ERROR;

        } else if(!data->cnvData->isValid(data->cnvData,
                            staticData->subChar, staticData->subCharLen)
        ) {
            fprintf(stderr, " the substitution character byte sequence is illegal in this codepage structure!\n");
            *pErrorCode=U_INVALID_TABLE_FORMAT;

        } else if(staticData->subChar1!=0 &&
                    !data->cnvData->isValid(data->cnvData, &staticData->subChar1, 1)
        ) {
            fprintf(stderr, " the subchar1 byte is illegal in this codepage structure!\n");
            *pErrorCode=U_INVALID_TABLE_FORMAT;

        } else if(
            data->ucm->ext->mappingsLength>0 &&
            !ucm_checkBaseExt(states, data->ucm->base, data->ucm->ext, data->ucm->ext, FALSE)
        ) {
            *pErrorCode=U_INVALID_TABLE_FORMAT;
        } else if(data->ucm->base->flagsType&UCM_FLAGS_EXPLICIT) {
            ucm_sortTable(data->ucm->base);
        }

        if(U_SUCCESS(*pErrorCode)) {
            if(
                !data->cnvData->addTable(data->cnvData, data->ucm->base, &data->staticData)
            ) {
                *pErrorCode=U_INVALID_TABLE_FORMAT;
            } else {
                /* Move the flagged base mappings to the extension table, then build it if non-empty. */
                ucm_moveMappings(data->ucm->base, data->ucm->ext);
                ucm_sortTable(data->ucm->ext);
                if(data->ucm->ext->mappingsLength>0) {
                    data->extData=CnvExtOpen(data->ucm);
                    if(data->extData==NULL) {
                        *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
                    } else if(
                        !data->extData->addTable(data->extData, data->ucm->ext, &data->staticData)
                    ) {
                        *pErrorCode=U_INVALID_TABLE_FORMAT;
                    }
                }
            }
        }
    } else {
        char baseFilename[500];
        char *basename;

        /*
         * Build "<directory of converterName>/<baseName>.ucm". Both steps are
         * length-checked because the input path and the base name come from
         * outside and baseFilename has a fixed size. Nothing has been
         * allocated for baseData yet, so returning here leaks nothing.
         */
        if(uprv_strlen(converterName)>=sizeof(baseFilename)) {
            fprintf(stderr, "error: the file name \"%s\" is too long\n", converterName);
            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
            return;
        }
        uprv_strcpy(baseFilename, converterName);
        basename=(char *)findBasename(baseFilename);
        if((size_t)(basename-baseFilename)+uprv_strlen(data->ucm->baseName)+uprv_strlen(".ucm")>=sizeof(baseFilename)) {
            fprintf(stderr, "error: the <icu:base> file name \"%s\" is too long\n", data->ucm->baseName);
            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
            return;
        }
        uprv_strcpy(basename, data->ucm->baseName);
        uprv_strcat(basename, ".ucm");

        initConvData(&baseData);

        dataIsBase=readFile(&baseData, baseFilename, pErrorCode);
        if(U_FAILURE(*pErrorCode)) {
            /* Nothing more to build; baseData is still released below. */
        } else if(!dataIsBase) {
            fprintf(stderr, "error: the <icu:base> file \"%s\" is not a base table file\n", baseFilename);
            *pErrorCode=U_INVALID_TABLE_FORMAT;
        } else {
            data->extData=CnvExtOpen(data->ucm);
            if(data->extData==NULL) {
                *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
            } else {
                UCMapping *m, *mLimit;
                uint8_t fallbackFlags;

                baseStates=&baseData.ucm->states;

                /* Lengths: DBCS is fixed at 2, otherwise inherit what is missing. */
                if(states->conversionType==UCNV_DBCS) {
                    staticData->minBytesPerChar=(int8_t)(states->minCharLength=2);
                } else if(states->minCharLength==0) {
                    staticData->minBytesPerChar=(int8_t)(states->minCharLength=baseStates->minCharLength);
                }
                if(states->maxCharLength<states->minCharLength) {
                    staticData->maxBytesPerChar=(int8_t)(states->maxCharLength=baseStates->maxCharLength);
                }

                /* Inherit the substitution sequence when this file has none. */
                if(staticData->subCharLen==0) {
                    uprv_memcpy(staticData->subChar, baseData.staticData.subChar, 4);
                    staticData->subCharLen=baseData.staticData.subCharLen;
                }

                /*
                 * Scan the base mappings for flag values 1 and 3; bit 0 and
                 * bit 1 of fallbackFlags record them. Stop once both are seen.
                 */
                fallbackFlags=0;
                for(m=baseData.ucm->base->mappings, mLimit=m+baseData.ucm->base->mappingsLength;
                    m<mLimit && fallbackFlags!=3;
                    ++m
                ) {
                    if(m->f==1) {
                        fallbackFlags|=1;
                    } else if(m->f==3) {
                        fallbackFlags|=2;
                    }
                }

                if(fallbackFlags&1) {
                    staticData->hasFromUnicodeFallback=TRUE;
                }
                if(fallbackFlags&2) {
                    staticData->hasToUnicodeFallback=TRUE;
                }

                /* Each substitution sequence must be exactly one character in the base states. */
                if(1!=ucm_countChars(baseStates, staticData->subChar, staticData->subCharLen)) {
                    fprintf(stderr, " the substitution character byte sequence is illegal in this codepage structure!\n");
                    *pErrorCode=U_INVALID_TABLE_FORMAT;

                } else if(staticData->subChar1!=0 && 1!=ucm_countChars(baseStates, &staticData->subChar1, 1)) {
                    fprintf(stderr, " the subchar1 byte is illegal in this codepage structure!\n");
                    *pErrorCode=U_INVALID_TABLE_FORMAT;

                } else if(
                    !ucm_checkValidity(data->ucm->ext, baseStates) ||
                    !ucm_checkBaseExt(baseStates, baseData.ucm->base, data->ucm->ext, data->ucm->ext, FALSE)
                ) {
                    *pErrorCode=U_INVALID_TABLE_FORMAT;
                } else {
                    if(states->maxCharLength>1) {
                        /*
                         * Flag every base mapping that fails the
                         * MBCSOkForBaseFromUnicode() test and move the
                         * flagged ones into this file's extension table.
                         */
                        const MBCSData *mbcsData=MBCSGetDummy();
                        int32_t needsMove=0;

                        for(m=baseData.ucm->base->mappings, mLimit=m+baseData.ucm->base->mappingsLength;
                            m<mLimit;
                            ++m
                        ) {
                            if(!MBCSOkForBaseFromUnicode(mbcsData, m->b.bytes, m->bLen, m->u, m->f)) {
                                m->f|=MBCS_FROM_U_EXT_FLAG;
                                m->moveFlag=UCM_MOVE_TO_EXT;
                                ++needsMove;
                            }
                        }

                        if(needsMove!=0) {
                            ucm_moveMappings(baseData.ucm->base, data->ucm->ext);
                            ucm_sortTable(data->ucm->ext);
                        }
                    }
                    if(!data->extData->addTable(data->extData, data->ucm->ext, &data->staticData)) {
                        *pErrorCode=U_INVALID_TABLE_FORMAT;
                    }
                }
            }
        }

        cleanupConvData(&baseData);
    }
}