#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <io.h>
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
# include <windows.h>
#else
# include <sys/time.h>
/*
 * Non-Windows stand-in for the Win32 timeGetTime() function:
 * returns the current wall-clock time in milliseconds, derived from
 * gettimeofday().
 *
 * The value is only meaningful for computing differences between two calls.
 * The arithmetic is done in unsigned long so that it wraps around
 * (well-defined) instead of overflowing a signed type where long/time_t
 * are 32 bits wide.
 */
static unsigned long
timeGetTime(void) {
    struct timeval t;

    gettimeofday(&t, NULL);
    return (unsigned long)t.tv_sec*1000UL+(unsigned long)t.tv_usec/1000UL;
}
#endif
#include "unicode/utypes.h"
#include "unicode/ucnv.h"
#include "unicode/ustring.h"
#define INPUT_CAPACITY (1024*1024)
#define INTERMEDIATE_CAPACITY 4096
#define INTERMEDIATE_SMALL_CAPACITY 20
#define OUTPUT_CAPACITY INPUT_CAPACITY
#define TARGET_MEASURE_TIME_MS 2000
#define PERCENT(a, b) (int)(((a)*200+1)/(2*(b)))
#define ARRAY_LENGTH(a) (sizeof(a)/sizeof((a)[0]))
/* input[] is filled once by readInput(); output[] is rewritten by every roundtrip() call. */
static UChar input[INPUT_CAPACITY], output[OUTPUT_CAPACITY];
/* Byte buffer shared by readBlock() (raw file data) and roundtrip() (encoded text). */
static char intermediate[INTERMEDIATE_CAPACITY];
/*
 * inputLength and countInputCodePoints are set by readInput();
 * encodedLength and outputLength are reset and recomputed by each roundtrip() call.
 */
static int32_t inputLength, encodedLength, outputLength, countInputCodePoints;
/*
 * Baseline values recorded by perEncAndCapacity() for the "UTF-8" run with
 * INTERMEDIATE_CAPACITY; they stay 0 until that run has completed.
 */
static int32_t utf8Length=0;
static double utf8Time=0.;
/*
 * Converter names that testPerformance() opens and measures, in this order.
 * "UTF-8" comes first; perEncAndCapacity() only prints the "% of UTF-8"
 * columns once the UTF-8 baseline has been recorded.
 */
static const char *const
utfNames[]={
"UTF-8",
"SCSU", "BOCU-1"
};
/*
 * Signature of the function that measureRoundtrips() calls repeatedly.
 * Both roundtrip() and noop() have this type. Failures are reported through
 * *pErrorCode.
 */
typedef void
RoundtripFn(UConverter *cnv, int32_t intermediateCapacity, UErrorCode *pErrorCode);
/*
 * Convert input[] to the converter's charset and back into output[],
 * one intermediate-buffer-sized chunk at a time.
 *
 * @param cnv                  converter used for both directions; it is reset first
 * @param intermediateCapacity number of bytes of intermediate[] to use per chunk
 * @param pErrorCode           in/out ICU error code; set to a failure code if a
 *                             conversion fails, or to U_INTERNAL_PROGRAM_ERROR if
 *                             the number of output UChars differs from inputLength
 *
 * Side effects: overwrites output[] and intermediate[], and sets the globals
 * encodedLength (total bytes produced by ucnv_fromUnicode) and outputLength.
 * On a conversion failure the function returns early and outputLength stays 0.
 */
static void
roundtrip(UConverter *cnv, int32_t intermediateCapacity, UErrorCode *pErrorCode) {
    const UChar *pIn, *pInLimit;
    UChar *pOut, *pOutLimit;
    char *pInter, *pInterLimit;
    const char *p; /* ucnv_toUnicode() takes a const char ** source */
    UBool flush;

    ucnv_reset(cnv);

    pIn=input;
    pInLimit=input+inputLength;

    pOut=output;
    pOutLimit=output+OUTPUT_CAPACITY;

    pInterLimit=intermediate+intermediateCapacity;

    encodedLength=outputLength=0;
    flush=FALSE;

    /* Keep going until all input is consumed and one flushing pass has been made. */
    while(pIn<pInLimit || !flush) {
        /* convert a chunk from UTF-16 into the intermediate buffer */
        pInter=intermediate;
        flush=(UBool)(pIn==pInLimit);
        ucnv_fromUnicode(cnv,
                         &pInter, pInterLimit,
                         &pIn, pInLimit,
                         NULL, flush,
                         pErrorCode);
        encodedLength+=(int32_t)(pInter-intermediate);

        if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
            /* the intermediate buffer is full: more chunks follow, so do not flush yet */
            flush=FALSE;
            *pErrorCode=U_ZERO_ERROR;
        } else if(U_FAILURE(*pErrorCode)) {
            return;
        }

        /* convert the chunk back to UTF-16, appending to output[] */
        p=intermediate;
        ucnv_toUnicode(cnv,
                       &pOut, pOutLimit,
                       &p, pInter,
                       NULL, flush,
                       pErrorCode);
        if(U_FAILURE(*pErrorCode)) {
            return;
        }
    }

    outputLength=(int32_t)(pOut-output);
    if(inputLength!=outputLength) {
        fprintf(stderr, "error: roundtrip failed, inputLength %d!=outputLength %d\n", inputLength, outputLength);
        *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
    }
}
/*
 * RoundtripFn that does nothing. perEncAndCapacity() times it with the same
 * iteration count as roundtrip() and subtracts the result, so that the
 * measurement-loop overhead is removed from the reported time.
 */
static void
noop(UConverter *cnv, int32_t intermediateCapacity, UErrorCode *pErrorCode) {
    /* intentionally empty; the casts only mark the parameters as unused */
    (void)cnv;
    (void)intermediateCapacity;
    (void)pErrorCode;
}
/*
 * Call fn up to n times (at least once) and return the elapsed time in
 * milliseconds as reported by timeGetTime().
 *
 * The loop stops early as soon as fn reports a failure through the error code.
 * After the loop, output[] is compared with input[] over inputLength UChars.
 *
 * @param fn                   function to time
 * @param cnv                  converter passed through to fn
 * @param encName              converter name, used only in the error message
 * @param intermediateCapacity passed through to fn
 * @param n                    number of iterations
 * @return elapsed milliseconds, or 0x7fffffff if fn failed or the
 *         buffers differ (a message is written to stderr in both cases)
 */
static unsigned long
measureRoundtrips(RoundtripFn *fn, UConverter *cnv, const char *encName, int32_t intermediateCapacity, int32_t n) {
    UErrorCode status=U_ZERO_ERROR;
    unsigned long elapsed;
    const unsigned long start=timeGetTime();

    for(;;) {
        fn(cnv, intermediateCapacity, &status);
        /* a failure stops the loop without touching the counter */
        if(U_FAILURE(status) || --n<=0) {
            break;
        }
    }
    elapsed=timeGetTime()-start;

    if(U_FAILURE(status)) {
        fprintf(stderr, "error in roundtrip conversion (%s): %s\n", encName, u_errorName(status));
        return 0x7fffffff;
    }

    if(u_memcmp(input, output, inputLength)!=0) {
        fprintf(stderr, "error: roundtrip failed, input[]!=output[]\n");
        return 0x7fffffff;
    }

    return elapsed;
}
/*
 * Measure and print the roundtrip performance of one converter with one
 * intermediate buffer capacity.
 *
 * The iteration count starts at 10 and is multiplied by 10 until one
 * measurement takes long enough; it is then scaled so that the final
 * measurement runs for roughly TARGET_MEASURE_TIME_MS. The time of an equal
 * number of noop() calls is subtracted as loop overhead.
 *
 * @param cnv                  open converter to measure
 * @param encName              converter name for the report
 * @param intermediateCapacity number of intermediate[] bytes roundtrip() may use
 *
 * If measureRoundtrips() reports a failure (it has already written a message
 * to stderr), no report is printed. The UTF-8 run with INTERMEDIATE_CAPACITY
 * records utf8Length/utf8Time as the baseline for later "% of UTF-8" columns.
 */
static void
perEncAndCapacity(UConverter *cnv, const char *encName, int32_t intermediateCapacity) {
    /* value returned by measureRoundtrips() when a measurement failed */
    const unsigned long measureFailed=0x7fffffff;
    double rtTime;
    unsigned long _time, overhead;
    int32_t n;

    /* find a rough iteration count; stop multiplying before n could overflow */
    n=10;
    for(;;) {
        _time=measureRoundtrips(roundtrip, cnv, encName, intermediateCapacity, n);
        if(_time<500 && _time<TARGET_MEASURE_TIME_MS/10 && n<=INT32_MAX/10) {
            n*=10;
        } else {
            break;
        }
    }
    if(_time==measureFailed) {
        return;
    }

    /* scale n towards the target time; 64-bit math because n*TARGET can exceed int32_t */
    if(_time>0 && _time<TARGET_MEASURE_TIME_MS) {
        int64_t scaled=((int64_t)n*TARGET_MEASURE_TIME_MS)/(int64_t)_time+1;
        n= scaled>INT32_MAX ? INT32_MAX : (int32_t)scaled;
    }

    /* the real measurement, minus the cost of the empty measurement loop */
    _time=measureRoundtrips(roundtrip, cnv, encName, intermediateCapacity, n);
    if(_time==measureFailed) {
        return;
    }
    overhead=measureRoundtrips(noop, cnv, encName, intermediateCapacity, n);
    if(overhead==measureFailed) {
        return;
    }
    /* clamp at 0 so that timer jitter cannot wrap the unsigned difference */
    _time= overhead<_time ? _time-overhead : 0;

    /* microseconds per roundtrip */
    rtTime=((double)_time*1000.)/(double)n;

    printf("* performance report for %8s:\n", encName);
    printf(" intermediate buffer capacity %8d B\n", intermediateCapacity);
    if(intermediateCapacity==INTERMEDIATE_CAPACITY && utf8Length!=0 && utf8Time>0.) {
        /* a usable UTF-8 baseline exists: show values relative to it */
        printf(" number of encoding bytes %8d B (%3d%% of UTF-8)\n", encodedLength, PERCENT(encodedLength, utf8Length));
        printf(" roundtrip conversion time %8g μs (%3d%% of UTF-8)\n", rtTime, PERCENT(rtTime, utf8Time));
    } else {
        printf(" number of encoding bytes %8d B\n", encodedLength);
        printf(" roundtrip conversion time %8g μs\n", rtTime);
    }
    printf(" average bytes/code point %8g B/cp\n", (double)encodedLength/countInputCodePoints);
    puts("");

    /* remember the UTF-8 results as the baseline for the other converters */
    if(intermediateCapacity==INTERMEDIATE_CAPACITY && 0==strcmp(encName, "UTF-8")) {
        utf8Length=encodedLength;
        utf8Time=rtTime;
    }
}
/*
 * Run the performance report for one converter twice: first with the large
 * intermediate buffer capacity, then with the small one.
 *
 * @param cnv     open converter to measure
 * @param encName converter name for the reports
 */
static void
perEnc(UConverter *cnv, const char *encName) {
    static const int32_t capacities[]={
        INTERMEDIATE_CAPACITY,
        INTERMEDIATE_SMALL_CAPACITY
    };
    int32_t which;

    for(which=0; which<(int32_t)ARRAY_LENGTH(capacities); ++which) {
        perEncAndCapacity(cnv, encName, capacities[which]);
    }
}
static void
testPerformance() {
UConverter *cnv;
UErrorCode errorCode;
int32_t i;
printf("number of code points %8d cp\n", countInputCodePoints);
printf("platform endianness: %8s-endian\n", U_IS_BIG_ENDIAN ? "big" : "little");
puts("");
for(i=0; i<ARRAY_LENGTH(utfNames); ++i) {
errorCode=U_ZERO_ERROR;
cnv=ucnv_open(utfNames[i], &errorCode);
if(U_SUCCESS(errorCode)) {
perEnc(cnv, utfNames[i]);
ucnv_close(cnv);
} else {
fprintf(stderr, "error opening converter for \"%s\" - %s\n", utfNames[i], u_errorName(errorCode));
}
}
}
/*
 * Read the next block of up to INTERMEDIATE_CAPACITY bytes from in into
 * intermediate[].
 *
 * fread() may deliver fewer bytes than requested, so reading continues until
 * the buffer is full or end-of-file is reached. Each read appends after the
 * bytes already stored, so that a short read is not overwritten by the next.
 *
 * @param in stream to read from
 * @return number of bytes now in intermediate[] (0 at end of file),
 *         or -1 if the stream reports a read error
 */
static int32_t
readBlock(FILE *in) {
    size_t blockLength=0;

    while(blockLength<INTERMEDIATE_CAPACITY && !feof(in)) {
        size_t length=fread(intermediate+blockLength, 1, INTERMEDIATE_CAPACITY-blockLength, in);
        if(ferror(in)) {
            return -1;
        }
        if(length==0) {
            /* nothing more was delivered; do not spin if feof() is not set yet */
            break;
        }
        blockLength+=length;
    }
    return (int32_t)blockLength;
}
/*
 * Read the whole stream, convert it to UTF-16 and store it in input[].
 *
 * @param in      stream to read from, block by block via readBlock()
 * @param encName name of the charset of the stream, or NULL to detect a
 *                Unicode signature in the first block (falling back to UTF-8
 *                if none is found)
 * @return TRUE on success; FALSE on a read error, if the converter cannot be
 *         opened, on a conversion error, or if the converted input is empty
 *
 * On success the globals inputLength and countInputCodePoints are set.
 * Error and warning messages are written to stderr.
 */
static UBool
readInput(FILE *in, const char *encName) {
    UConverter *cnv;
    UChar *pOut, *pOutLimit;
    const char *p, *limit;
    int32_t length;
    UErrorCode errorCode;

    pOut=input;
    pOutLimit=input+INPUT_CAPACITY;

    /*
     * Give p a defined value: if the first block is empty (or holds only a
     * signature), the conversion loop below never runs, but the final
     * flushing call still reads p.
     */
    p=intermediate;

    errorCode=U_ZERO_ERROR;

    /* read the first block and, if needed, try to detect the encoding */
    length=readBlock(in);
    if(length<0) {
        return FALSE;
    }

    if(encName==NULL) {
        int32_t signatureLength;

        encName=ucnv_detectUnicodeSignature(intermediate, length,
                                            &signatureLength,
                                            &errorCode);
        if(U_FAILURE(errorCode) || encName==NULL) {
            printf("no Unicode signature - using UTF-8\n");
            encName="UTF-8";
            errorCode=U_ZERO_ERROR;
        } else {
            printf("detected signature for %s (removing %d bytes)\n", encName, signatureLength);
            /* drop the signature bytes so that they are not converted as text */
            memmove(intermediate, intermediate+signatureLength, length-=signatureLength);
        }
    }

    cnv=ucnv_open(encName, &errorCode);
    if(U_FAILURE(errorCode)) {
        fprintf(stderr, "error: unable to ucnv_open(\"%s\") - %s\n", encName, u_errorName(errorCode));
        return FALSE;
    }

    /* convert block by block, appending to input[] */
    while(length>0) {
        p=intermediate;
        limit=p+length;

        ucnv_toUnicode(cnv,
                       &pOut, pOutLimit,
                       &p, limit,
                       NULL, FALSE,
                       &errorCode);
        if(U_FAILURE(errorCode)) {
            fprintf(stderr, "error converting input to UTF-16: %s\n", u_errorName(errorCode));
            ucnv_close(cnv);
            return FALSE;
        }

        length=readBlock(in);
        if(length<0) {
            ucnv_close(cnv);
            return FALSE;
        }
    }

    /* flush the converter with an empty source range */
    ucnv_toUnicode(cnv,
                   &pOut, pOutLimit,
                   &p, p,
                   NULL, TRUE,
                   &errorCode);
    ucnv_close(cnv);

    if(U_FAILURE(errorCode)) {
        fprintf(stderr, "error converting input to UTF-16: %s\n", u_errorName(errorCode));
        return FALSE;
    }

    inputLength=(int32_t)(pOut-input);
    countInputCodePoints=u_countChar32(input, inputLength);
    if(inputLength<=0) {
        fprintf(stderr, "warning: input is empty\n");
        return FALSE;
    }

    return TRUE;
}
/*
 * Write the command-line usage text to stderr.
 *
 * @param myName program name to show in the synopsis line
 */
static void
showUsage(const char *myName) {
    /* the only conversion in the format is the %s for the program name */
    static const char usageFormat[]=
        "Usage:\n"
        "%s [-e encoding-name] filename | '-'\n"
        " encoding-name must be the name of an encoding supported by ICU\n"
        " the filename of the input file with text to be used\n"
        " can be a dash (-) for standard input\n";

    fprintf(stderr, usageFormat, myName);
}
extern int
main(int argc, const char *argv[]) {
FILE *in;
const char *myName, *encName, *filename, *basename;
myName=argv[0];
if(argc<2) {
showUsage(myName);
return 1;
}
if(argv[1][0]=='-' && argv[1][1]=='e') {
encName=argv[1]+2;
--argc;
++argv;
if(*encName==0) {
if(argc<2) {
showUsage(myName);
return 1;
}
encName=argv[1];
--argc;
++argv;
}
} else {
encName=NULL;
}
if(argc<2) {
showUsage(myName);
return 1;
}
filename=argv[1];
if(filename[0]=='-' && filename[1]==0) {
filename="(standard input)";
in=stdin;
_setmode(_fileno(stdin), _O_BINARY);
} else {
in=fopen(filename, "rb");
if(in==NULL) {
fprintf(stderr, "error opening \"%s\"\n", filename);
showUsage(myName);
return 2;
}
}
basename=strrchr(filename, U_FILE_SEP_CHAR);
if(basename!=NULL) {
++basename;
} else {
basename=filename;
}
printf("# testing converter performance with file \"%s\"\n", basename);
if(!readInput(in, encName)) {
fprintf(stderr, "error reading \"%s\" (encoding %s)\n", filename, encName);
showUsage(myName);
return 2;
}
if(in!=stdin) {
fclose(in);
}
testPerformance();
return 0;
}