#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <stdint.h>
#include <string.h>
#include <time.h>
/* Attributes recorded for one code point.  fill_attribute() stores the
   14 text fields that follow the code point on an input line into the
   members below, in this order.  A NULL `name` marks a code point for
   which no line was read (fill_attributes() resets every name to NULL
   before parsing).  */
struct unicode_attribute
{
const char *name; /* field 1; NULL when the code point is absent */
const char *category; /* field 2; get_lbp() tests its first two characters */
const char *combining; /* field 3 */
const char *bidi; /* field 4 */
const char *decomposition; /* field 5 */
const char *decdigit; /* field 6 */
const char *digit; /* field 7 */
const char *numeric; /* field 8 */
int mirrored; /* nonzero when field 9 starts with 'Y' */
const char *oldname; /* field 10 */
const char *comment; /* field 11 */
unsigned int upper; /* field 12 parsed as hexadecimal, or NONE when empty */
unsigned int lower; /* field 13 parsed as hexadecimal, or NONE when empty */
unsigned int title; /* field 14 parsed as hexadecimal, or NONE when empty */
};
/* Value stored in the upper/lower/title members when the corresponding
   input field is empty (all bits of an unsigned int set).  */
#define NONE (~(unsigned int)0)
/* One attribute record per code point, indexed by the code point
   (0 .. 0x10FFFF).  */
struct unicode_attribute unicode_attributes [0x110000];
/* Return a heap copy of S.  Terminates the program when the copy cannot
   be allocated, so callers never store a NULL string.  */
static char *
checked_strdup (const char *s)
{
  char *copy = strdup (s);

  if (copy == NULL)
    {
      fprintf (stderr, "memory exhausted\n");
      exit (1);
    }
  return copy;
}

/* Return the shared literal "" for an empty FIELD, otherwise a heap
   copy of FIELD.  */
static const char *
dup_or_empty (const char *field)
{
  return field[0] == '\0' ? "" : checked_strdup (field);
}

/* Parse FIELD as a hexadecimal number; an empty FIELD yields NONE.  */
static unsigned int
hex_or_none (const char *field)
{
  return field[0] == '\0' ? NONE : strtoul (field, NULL, 16);
}

/* Store the attributes of code point I into unicode_attributes[I].

   I        code point; the program exits with "index too large" when it
            is 0x110000 or greater.
   field1   name; always copied, even when empty, so that a non-NULL
            name marks the entry as present.
   field2 .. field8, field10, field11
            text attributes; copied, or "" when empty.
   field9   mirrored flag; true when it starts with 'Y'.
   field12 .. field14
            upper/lower/title mappings as hexadecimal text; NONE when
            empty.

   The copies are never freed; they live until the program exits.  */
static void
fill_attribute (unsigned int i,
                const char *field1, const char *field2,
                const char *field3, const char *field4,
                const char *field5, const char *field6,
                const char *field7, const char *field8,
                const char *field9, const char *field10,
                const char *field11, const char *field12,
                const char *field13, const char *field14)
{
  struct unicode_attribute * uni;

  if (i >= 0x110000)
    {
      fprintf (stderr, "index too large\n");
      exit (1);
    }
  uni = &unicode_attributes[i];
  /* Every string copy goes through checked_strdup so that an allocation
     failure stops the program instead of leaving a NULL member that
     get_lbp() would later dereference.  */
  uni->name          = checked_strdup (field1);
  uni->category      = dup_or_empty (field2);
  uni->combining     = dup_or_empty (field3);
  uni->bidi          = dup_or_empty (field4);
  uni->decomposition = dup_or_empty (field5);
  uni->decdigit      = dup_or_empty (field6);
  uni->digit         = dup_or_empty (field7);
  uni->numeric       = dup_or_empty (field8);
  uni->mirrored      = (field9[0] == 'Y');
  uni->oldname       = dup_or_empty (field10);
  uni->comment       = dup_or_empty (field11);
  uni->upper         = hex_or_none (field12);
  uni->lower         = hex_or_none (field13);
  uni->title         = hex_or_none (field14);
}
/* Size of every field buffer handed to getfield().  */
#define FIELDLEN 120

/* Read characters from STREAM into BUFFER up to, but not including, the
   next DELIM.  Carriage returns are dropped.  Returns 1 and
   NUL-terminates BUFFER when DELIM was found; returns 0 when end of
   file (or a read error) came first, in which case BUFFER is left
   unterminated.  A field that would need FIELDLEN - 1 or more stored
   characters terminates the program.  */
static int
getfield (FILE *stream, char *buffer, int delim)
{
  char *out = buffer;
  int stored = 0;
  int ch;

  while ((ch = getc (stream)) != EOF)
    {
      if (ch == delim)
        {
          *out = '\0';
          return 1;
        }
      if (ch == '\r')
        continue;
      stored++;
      if (stored >= FIELDLEN - 1)
        {
          fprintf (stderr, "field too long\n");
          exit (1);
        }
      *out++ = ch;
    }
  return 0;
}
static void
fill_attributes (const char *unicodedata_filename)
{
unsigned int i, j;
FILE *stream;
char field0[FIELDLEN];
char field1[FIELDLEN];
char field2[FIELDLEN];
char field3[FIELDLEN];
char field4[FIELDLEN];
char field5[FIELDLEN];
char field6[FIELDLEN];
char field7[FIELDLEN];
char field8[FIELDLEN];
char field9[FIELDLEN];
char field10[FIELDLEN];
char field11[FIELDLEN];
char field12[FIELDLEN];
char field13[FIELDLEN];
char field14[FIELDLEN];
int lineno = 0;
for (i = 0; i < 0x110000; i++)
unicode_attributes[i].name = NULL;
stream = fopen (unicodedata_filename, "r");
if (stream == NULL)
{
fprintf (stderr, "error during fopen of '%s'\n", unicodedata_filename);
exit (1);
}
for (;;)
{
int n;
lineno++;
n = getfield (stream, field0, ';');
n += getfield (stream, field1, ';');
n += getfield (stream, field2, ';');
n += getfield (stream, field3, ';');
n += getfield (stream, field4, ';');
n += getfield (stream, field5, ';');
n += getfield (stream, field6, ';');
n += getfield (stream, field7, ';');
n += getfield (stream, field8, ';');
n += getfield (stream, field9, ';');
n += getfield (stream, field10, ';');
n += getfield (stream, field11, ';');
n += getfield (stream, field12, ';');
n += getfield (stream, field13, ';');
n += getfield (stream, field14, '\n');
if (n == 0)
break;
if (n != 15)
{
fprintf (stderr, "short line in'%s':%d\n",
unicodedata_filename, lineno);
exit (1);
}
i = strtoul (field0, NULL, 16);
if (field1[0] == '<'
&& strlen (field1) >= 9
&& !strcmp (field1 + strlen(field1) - 8, ", First>"))
{
lineno++;
n = getfield (stream, field0, ';');
n += getfield (stream, field1, ';');
n += getfield (stream, field2, ';');
n += getfield (stream, field3, ';');
n += getfield (stream, field4, ';');
n += getfield (stream, field5, ';');
n += getfield (stream, field6, ';');
n += getfield (stream, field7, ';');
n += getfield (stream, field8, ';');
n += getfield (stream, field9, ';');
n += getfield (stream, field10, ';');
n += getfield (stream, field11, ';');
n += getfield (stream, field12, ';');
n += getfield (stream, field13, ';');
n += getfield (stream, field14, '\n');
if (n != 15)
{
fprintf (stderr, "missing end range in '%s':%d\n",
unicodedata_filename, lineno);
exit (1);
}
if (!(field1[0] == '<'
&& strlen (field1) >= 8
&& !strcmp (field1 + strlen (field1) - 7, ", Last>")))
{
fprintf (stderr, "missing end range in '%s':%d\n",
unicodedata_filename, lineno);
exit (1);
}
field1[strlen (field1) - 7] = '\0';
j = strtoul (field0, NULL, 16);
for (; i <= j; i++)
fill_attribute (i, field1+1, field2, field3, field4, field5,
field6, field7, field8, field9, field10,
field11, field12, field13, field14);
}
else
{
fill_attribute (i, field1, field2, field3, field4, field5,
field6, field7, field8, field9, field10,
field11, field12, field13, field14);
}
}
if (ferror (stream) || fclose (stream))
{
fprintf (stderr, "error reading from '%s'\n", unicodedata_filename);
exit (1);
}
}
/* unicode_combining[c] is 1 for every code point c listed in the
   "(Combining)" section read by fill_combining(), 0 otherwise.  */
char unicode_combining[0x110000];

/* Fill unicode_combining[] from the file PROPLIST_FILENAME.

   Lines are skipped up to and including the first one containing
   "(Combining)".  Each following line names either a single code point
   ("XXXX") or a range ("XXXX..YYYY") in hexadecimal, until a line
   starting with '*' ends the section.  Any open, format or read error
   terminates the program.  */
static void
fill_combining (const char *proplist_filename)
{
  FILE *in;
  char line[100+1];

  memset (unicode_combining, 0, sizeof unicode_combining);

  in = fopen (proplist_filename, "r");
  if (in == NULL)
    {
      fprintf (stderr, "error during fopen of '%s'\n", proplist_filename);
      exit (1);
    }

  /* Look for the header line of the section.  */
  for (;;)
    {
      if (fscanf (in, "%100[^\n]\n", line) < 1)
        {
          fprintf (stderr, "no combining property found in '%s'\n",
                   proplist_filename);
          exit (1);
        }
      if (strstr (line, "(Combining)") != NULL)
        break;
    }

  /* Read the entries of the section.  */
  while (1)
    {
      unsigned int lo = 0, hi = 0;
      unsigned int cp;
      size_t len;
      int parsed;

      if (fscanf (in, "%100[^\n]\n", line) < 1)
        {
          fprintf (stderr, "premature end of combining property in '%s'\n",
                   proplist_filename);
          exit (1);
        }
      if (line[0] == '*')
        break;

      len = strlen (line);
      if (len >= 10 && line[4] == '.' && line[5] == '.')
        parsed = (sscanf (line, "%4X..%4X", &lo, &hi) >= 2);
      else if (len >= 4)
        {
          parsed = (sscanf (line, "%4X", &lo) >= 1);
          hi = lo;
        }
      else
        parsed = 0;

      if (!parsed)
        {
          fprintf (stderr, "parse error in combining property in '%s'\n",
                   proplist_filename);
          exit (1);
        }

      for (cp = lo; cp <= hi; cp++)
        unicode_combining[cp] = 1;
    }

  if (ferror (in) || fclose (in))
    {
      fprintf (stderr, "error reading from '%s'\n", proplist_filename);
      exit (1);
    }
}
/* unicode_width[c] is the width string of code point c: NULL when the
   code point has no unicode_attributes name, "N" by default, or the
   value read by fill_width().  */
const char * unicode_width[0x110000];

/* Fill unicode_width[] from the file WIDTH_FILENAME.

   Must run after fill_attributes(), because the default value depends
   on unicode_attributes[].name.  Lines starting with '#' are skipped.
   Every other line has the form "XXXX;W rest" or "XXXX..YYYY;W rest",
   where XXXX/YYYY are hexadecimal code points and W is the width string
   stored for the code point or for every code point of the range.

   Any open, format or read error, a code point of 0x110000 or above,
   or memory exhaustion terminates the program.  */
static void
fill_width (const char *width_filename)
{
  unsigned int i;
  FILE *stream;
  char field0[FIELDLEN];
  char field1[FIELDLEN];
  char field2[FIELDLEN];
  int lineno = 0;

  for (i = 0; i < 0x110000; i++)
    unicode_width[i] = (unicode_attributes[i].name != NULL ? "N" : NULL);

  stream = fopen (width_filename, "r");
  if (stream == NULL)
    {
      fprintf (stderr, "error during fopen of '%s'\n", width_filename);
      exit (1);
    }

  for (;;)
    {
      int n;
      int c;
      unsigned long first, last, cp;
      const char *range_sep;
      const char *width;

      lineno++;
      c = getc (stream);
      if (c == EOF)
        break;
      if (c == '#')
        {
          /* Comment line: discard it entirely.  */
          do c = getc (stream); while (c != EOF && c != '\n');
          continue;
        }
      ungetc (c, stream);
      n = getfield (stream, field0, ';');
      n += getfield (stream, field1, ' ');
      n += getfield (stream, field2, '\n');
      if (n == 0)
        break;
      if (n != 3)
        {
          fprintf (stderr, "short line in '%s':%d\n", width_filename, lineno);
          exit (1);
        }

      first = strtoul (field0, NULL, 16);
      range_sep = strstr (field0, "..");
      last = (range_sep != NULL ? strtoul (range_sep + 2, NULL, 16) : first);

      /* The values come straight from the file; refuse anything that
         would index past the end of unicode_width[].  */
      if (first >= 0x110000 || last >= 0x110000)
        {
          fprintf (stderr, "code point out of range in '%s':%d\n",
                   width_filename, lineno);
          exit (1);
        }

      /* The strings are read-only and never freed, so one copy can be
         shared by every code point of the range.  */
      width = strdup (field1);
      if (width == NULL)
        {
          fprintf (stderr, "memory exhausted\n");
          exit (1);
        }
      for (cp = first; cp <= last; cp++)
        unicode_width[cp] = width;
    }

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error reading from '%s'\n", width_filename);
      exit (1);
    }
}
/* Line breaking property identifiers.  The 26 values 0 .. 25 are all
   distinct.  get_lbp() uses them as bit positions (1 << LBP_xx), while
   fill_org_lbp() and the table written by output_lbp() store them as
   plain values.  The order below is not the numeric order.
   NOTE(review): the two-letter names presumably follow the line
   breaking classes of Unicode UAX #14 — confirm.  */
enum
{
LBP_BK = 0,
LBP_CM = 20,
LBP_ZW = 1,
LBP_IN = 2,
LBP_GL = 3,
LBP_CB = 22,
LBP_SP = 21,
LBP_BA = 4,
LBP_BB = 5,
LBP_B2 = 6,
LBP_HY = 7,
LBP_NS = 8,
LBP_OP = 9,
LBP_CL = 10,
LBP_QU = 11,
LBP_EX = 12,
LBP_ID = 13,
LBP_NU = 14,
LBP_IS = 15,
LBP_SY = 16,
LBP_AL = 17,
LBP_PR = 18,
LBP_PO = 19,
LBP_SA = 23,
LBP_AI = 24,
/* Used by get_lbp() when no other bit is set, and as the default value
   in fill_org_lbp().  */
LBP_XX = 25
};
/* Compute the line breaking property mask of the code point CH.

   The result is a bit mask in which each applicable property LBP_xx
   contributes the bit (1 << LBP_xx).  When no rule applies — which
   includes every CH whose unicode_attributes[] entry has a NULL name —
   the result is exactly (1 << LBP_XX).

   Reads unicode_attributes[] and unicode_width[]; CH is used as an
   index without a range check, so it must be below 0x110000.

   The order of the rules matters: the CM, PR, SA, ID/AI and AL/AI rules
   test bits that earlier rules may have set.  */
static int
get_lbp (unsigned int ch)
{
int attr = 0;
/* Only code points that were read from the attribute file are
   classified.  */
if (unicode_attributes[ch].name != NULL)
{
/* LBP_BK */
if (ch == 0x000A || ch == 0x000D || ch == 0x0085
|| ch == 0x000C
|| ch == 0x2028
|| ch == 0x2029 )
attr |= 1 << LBP_BK;
/* LBP_ZW */
if (ch == 0x200B )
attr |= 1 << LBP_ZW;
/* LBP_IN */
if (ch == 0x2024
|| ch == 0x2025
|| ch == 0x2026 )
attr |= 1 << LBP_IN;
/* LBP_GL */
if (ch == 0xFEFF
|| ch == 0x00A0
|| ch == 0x202F
|| ch == 0x2007
|| ch == 0x2011
|| ch == 0x0F0C )
attr |= 1 << LBP_GL;
/* LBP_CB */
if (ch == 0xFFFC )
attr |= 1 << LBP_CB;
/* LBP_SP */
if (ch == 0x0020 )
attr |= 1 << LBP_SP;
/* LBP_BA */
if (ch == 0x2000
|| ch == 0x2001
|| ch == 0x2002
|| ch == 0x2003
|| ch == 0x2004
|| ch == 0x2005
|| ch == 0x2006
|| ch == 0x2008
|| ch == 0x2009
|| ch == 0x200A
|| ch == 0x0009
|| ch == 0x058A
|| ch == 0x2010
|| ch == 0x2012
|| ch == 0x2013
|| ch == 0x00AD
|| ch == 0x0F0B
|| ch == 0x1361
|| ch == 0x1680
|| ch == 0x17D5
|| ch == 0x2027
|| ch == 0x007C )
attr |= 1 << LBP_BA;
/* LBP_BB */
if (ch == 0x00B4
|| ch == 0x02C8
|| ch == 0x02CC
|| ch == 0x1806 )
attr |= 1 << LBP_BB;
/* LBP_B2 */
if (ch == 0x2014 )
attr |= 1 << LBP_B2;
/* LBP_HY */
if (ch == 0x002D )
attr |= 1 << LBP_HY;
/* LBP_EX */
if (ch == 0x0021
|| ch == 0x003F
|| ch == 0xFE56
|| ch == 0xFE57
|| ch == 0xFF01
|| ch == 0xFF1F )
attr |= 1 << LBP_EX;
/* LBP_OP: category "Ps".  */
if (unicode_attributes[ch].category[0] == 'P'
&& unicode_attributes[ch].category[1] == 's')
attr |= 1 << LBP_OP;
/* LBP_CL: listed code points, or category "Pe".  */
if (ch == 0x3001
|| ch == 0x3002
|| ch == 0xFE50
|| ch == 0xFE52
|| ch == 0xFF0C
|| ch == 0xFF0E
|| ch == 0xFF61
|| ch == 0xFF64
|| (unicode_attributes[ch].category[0] == 'P'
&& unicode_attributes[ch].category[1] == 'e'))
attr |= 1 << LBP_CL;
/* LBP_QU: listed code points, or category "Pf" / "Pi".  */
if (ch == 0x0022
|| ch == 0x0027
|| (unicode_attributes[ch].category[0] == 'P'
&& (unicode_attributes[ch].category[1] == 'f'
|| unicode_attributes[ch].category[1] == 'i')))
attr |= 1 << LBP_QU;
/* LBP_CM: categories "Mn"/"Mc"/"Me", 0x1160..0x11F9, or categories
   "Cc"/"Cf" — unless BK, BA or GL was already set above.  */
if ((unicode_attributes[ch].category[0] == 'M'
&& (unicode_attributes[ch].category[1] == 'n'
|| unicode_attributes[ch].category[1] == 'c'
|| unicode_attributes[ch].category[1] == 'e'))
|| (ch >= 0x1160 && ch <= 0x11F9)
|| (unicode_attributes[ch].category[0] == 'C'
&& (unicode_attributes[ch].category[1] == 'c'
|| unicode_attributes[ch].category[1] == 'f')))
if (!(attr & ((1 << LBP_BK) | (1 << LBP_BA) | (1 << LBP_GL))))
attr |= 1 << LBP_CM;
/* LBP_NS: listed code points, category "Lm" with width starting 'W' or
   'H', category "Sk" with width starting 'W', or a name containing
   "HIRAGANA LETTER SMALL " / "KATAKANA LETTER SMALL ".
   NOTE(review): unicode_width[ch] is dereferenced here without the
   NULL test used by the ID/AI and AL/AI rules below; fill_width() gives
   every named code point a non-NULL default, which this relies on.  */
if (ch == 0x0E5A
|| ch == 0x0E5B
|| ch == 0x17D4
|| ch == 0x17D6
|| ch == 0x17D7
|| ch == 0x17D8
|| ch == 0x17D9
|| ch == 0x17DA
|| ch == 0x203C
|| ch == 0x2044
|| ch == 0x3005
|| ch == 0x301C
|| ch == 0x309B
|| ch == 0x309C
|| ch == 0x309D
|| ch == 0x309E
|| ch == 0x30FB
|| ch == 0x30FD
|| ch == 0xFE54
|| ch == 0xFE55
|| ch == 0xFF1A
|| ch == 0xFF1B
|| ch == 0xFF65
|| ch == 0xFF70
|| ch == 0xFF9E
|| ch == 0xFF9F
|| (unicode_attributes[ch].category[0] == 'L'
&& unicode_attributes[ch].category[1] == 'm'
&& (unicode_width[ch][0] == 'W'
|| unicode_width[ch][0] == 'H'))
|| (unicode_attributes[ch].category[0] == 'S'
&& unicode_attributes[ch].category[1] == 'k'
&& unicode_width[ch][0] == 'W')
|| strstr (unicode_attributes[ch].name, "HIRAGANA LETTER SMALL ") != NULL
|| strstr (unicode_attributes[ch].name, "KATAKANA LETTER SMALL ") != NULL)
attr |= 1 << LBP_NS;
/* LBP_NU: category "Nd" whose name does not contain "FULLWIDTH".  */
if (unicode_attributes[ch].category[0] == 'N'
&& unicode_attributes[ch].category[1] == 'd'
&& strstr (unicode_attributes[ch].name, "FULLWIDTH") == NULL)
attr |= 1 << LBP_NU;
/* LBP_IS */
if (ch == 0x002C
|| ch == 0x002E
|| ch == 0x003A
|| ch == 0x003B
|| ch == 0x0589 )
attr |= 1 << LBP_IS;
/* LBP_SY */
if (ch == 0x002F )
attr |= 1 << LBP_SY;
/* LBP_PO */
if (ch == 0x0025
|| ch == 0x00A2
|| ch == 0x00B0
|| ch == 0x2030
|| ch == 0x2031
|| ch == 0x2032
|| ch == 0x2033
|| ch == 0x2034
|| ch == 0x2035
|| ch == 0x2036
|| ch == 0x2037
|| ch == 0x20A7
|| ch == 0x2103
|| ch == 0x2109
|| ch == 0x2126
|| ch == 0xFE6A
|| ch == 0xFF05
|| ch == 0xFFE0 )
attr |= 1 << LBP_PO;
/* LBP_PR: listed code points or category "Sc" — unless PO was set
   above.  */
if (ch == 0x002B
|| ch == 0x005C
|| ch == 0x00B1
|| ch == 0x2116
|| ch == 0x2212
|| ch == 0x2213
|| (unicode_attributes[ch].category[0] == 'S'
&& unicode_attributes[ch].category[1] == 'c'))
if (!(attr & (1 << LBP_PO)))
attr |= 1 << LBP_PR;
/* LBP_SA: categories "Lm"/"Lo" inside three code point ranges — unless
   CM, NS, NU, BA or PR was set above.  */
if (((ch >= 0x0E00 && ch <= 0x0EFF)
|| (ch >= 0x1000 && ch <= 0x109F)
|| (ch >= 0x1780 && ch <= 0x17FF))
&& unicode_attributes[ch].category[0] == 'L'
&& (unicode_attributes[ch].category[1] == 'm'
|| unicode_attributes[ch].category[1] == 'o'))
if (!(attr & ((1 << LBP_CM) | (1 << LBP_NS) | (1 << LBP_NU) | (1 << LBP_BA) | (1 << LBP_PR))))
attr |= 1 << LBP_SA;
/* LBP_ID or LBP_AI: listed ranges and code points, names containing
   "FULLWIDTH LATIN ", and 0x3000..0x33FF when none of CM, NS, OP, CL is
   set.  A width starting with 'A' selects AI instead of ID.  */
if ((ch >= 0x1100 && ch <= 0x115F)
|| (ch >= 0x2E80 && ch <= 0x2FFF)
|| ch == 0x3000
|| (ch >= 0x3130 && ch <= 0x318F)
|| (ch >= 0x3400 && ch <= 0x4DBF)
|| (ch >= 0x4E00 && ch <= 0x9FAF)
|| (ch >= 0xF900 && ch <= 0xFAFF)
|| (ch >= 0xAC00 && ch <= 0xD7AF)
|| (ch >= 0xA000 && ch <= 0xA48C)
|| (ch >= 0xA490 && ch <= 0xA4C6)
|| ch == 0xFE62
|| ch == 0xFE63
|| ch == 0xFE64
|| ch == 0xFE65
|| ch == 0xFE66
|| (ch >= 0xFF10 && ch <= 0xFF19)
|| (ch >= 0x20000 && ch <= 0x2A6D6)
|| (ch >= 0x2F800 && ch <= 0x2FA1D)
|| strstr (unicode_attributes[ch].name, "FULLWIDTH LATIN ") != NULL
|| (ch >= 0x3000 && ch <= 0x33FF
&& !(attr & ((1 << LBP_CM) | (1 << LBP_NS) | (1 << LBP_OP) | (1 << LBP_CL))))
|| ch == 0xFE30
|| ch == 0xFE31
|| ch == 0xFE32
|| ch == 0xFE33
|| ch == 0xFE34
|| ch == 0xFE49
|| ch == 0xFE4A
|| ch == 0xFE4B
|| ch == 0xFE4C
|| ch == 0xFE4D
|| ch == 0xFE4E
|| ch == 0xFE4F
|| ch == 0xFE51
|| ch == 0xFE58
|| ch == 0xFE5F
|| ch == 0xFE60
|| ch == 0xFE61
|| ch == 0xFE68
|| ch == 0xFE6B
|| ch == 0xFF02
|| ch == 0xFF03
|| ch == 0xFF06
|| ch == 0xFF07
|| ch == 0xFF0A
|| ch == 0xFF0B
|| ch == 0xFF0D
|| ch == 0xFF0F
|| ch == 0xFF1C
|| ch == 0xFF1D
|| ch == 0xFF1E
|| ch == 0xFF20
|| ch == 0xFF3C
|| ch == 0xFF3E
|| ch == 0xFF3F
|| ch == 0xFF40
|| ch == 0xFF5C
|| ch == 0xFF5E
|| ch == 0xFFE2
|| ch == 0xFFE3
|| ch == 0xFFE4)
{
if (unicode_width[ch] != NULL
&& unicode_width[ch][0] == 'A')
attr |= 1 << LBP_AI;
else
attr |= 1 << LBP_ID;
}
/* LBP_AL or LBP_AI: categories "Lu"/"Ll"/"Lt"/"Lm"/"Lo" and
   "Sm"/"Sc"/"Sk"/"So" plus listed code points and ranges — unless CM,
   NS, ID, BA, BB, PO, PR, SA or CB is already set.  A width starting
   with 'A' selects AI instead of AL.  */
if ((unicode_attributes[ch].category[0] == 'L'
&& (unicode_attributes[ch].category[1] == 'u'
|| unicode_attributes[ch].category[1] == 'l'
|| unicode_attributes[ch].category[1] == 't'
|| unicode_attributes[ch].category[1] == 'm'
|| unicode_attributes[ch].category[1] == 'o'))
|| (unicode_attributes[ch].category[0] == 'S'
&& (unicode_attributes[ch].category[1] == 'm'
|| unicode_attributes[ch].category[1] == 'c'
|| unicode_attributes[ch].category[1] == 'k'
|| unicode_attributes[ch].category[1] == 'o'))
|| ch == 0x0023
|| ch == 0x0026
|| ch == 0x002A
|| ch == 0x0040
|| ch == 0x005F
|| ch == 0x00A1
|| ch == 0x00B2
|| ch == 0x00B3
|| ch == 0x00B7
|| ch == 0x00B9
|| ch == 0x00BC
|| ch == 0x00BD
|| ch == 0x00BE
|| ch == 0x00BF
|| ch == 0x037E
|| ch == 0x0387
|| ch == 0x055A
|| ch == 0x055B
|| ch == 0x055C
|| ch == 0x055D
|| ch == 0x055E
|| ch == 0x055F
|| ch == 0x05BE
|| ch == 0x05C0
|| ch == 0x05C3
|| ch == 0x05F3
|| ch == 0x05F4
|| ch == 0x060C
|| ch == 0x061B
|| ch == 0x061F
|| ch == 0x066A
|| ch == 0x066B
|| ch == 0x066C
|| ch == 0x066D
|| ch == 0x06D4
|| ch == 0x0700
|| ch == 0x0701
|| ch == 0x0702
|| ch == 0x0703
|| ch == 0x0704
|| ch == 0x0705
|| ch == 0x0706
|| ch == 0x0707
|| ch == 0x0708
|| ch == 0x0709
|| ch == 0x070A
|| ch == 0x070B
|| ch == 0x070C
|| ch == 0x070D
|| ch == 0x0964
|| ch == 0x0965
|| ch == 0x0970
|| ch == 0x09F4
|| ch == 0x09F5
|| ch == 0x09F6
|| ch == 0x09F7
|| ch == 0x09F8
|| ch == 0x09F9
|| ch == 0x0BF0
|| ch == 0x0BF1
|| ch == 0x0BF2
|| ch == 0x0DF4
|| ch == 0x0E4F
|| ch == 0x0F04
|| ch == 0x0F05
|| ch == 0x0F06
|| ch == 0x0F07
|| ch == 0x0F08
|| ch == 0x0F09
|| ch == 0x0F0A
|| ch == 0x0F0D
|| ch == 0x0F0E
|| ch == 0x0F0F
|| ch == 0x0F10
|| ch == 0x0F11
|| ch == 0x0F12
|| ch == 0x0F2A
|| ch == 0x0F2B
|| ch == 0x0F2C
|| ch == 0x0F2D
|| ch == 0x0F2E
|| ch == 0x0F2F
|| ch == 0x0F30
|| ch == 0x0F31
|| ch == 0x0F32
|| ch == 0x0F33
|| ch == 0x0F85
|| ch == 0x104A
|| ch == 0x104B
|| ch == 0x104C
|| ch == 0x104D
|| ch == 0x104E
|| ch == 0x104F
|| ch == 0x10FB
|| ch == 0x1362
|| ch == 0x1363
|| ch == 0x1364
|| ch == 0x1365
|| ch == 0x1366
|| ch == 0x1367
|| ch == 0x1368
|| ch == 0x1372
|| ch == 0x1373
|| ch == 0x1374
|| ch == 0x1375
|| ch == 0x1376
|| ch == 0x1377
|| ch == 0x1378
|| ch == 0x1379
|| ch == 0x137A
|| ch == 0x137B
|| ch == 0x137C
|| ch == 0x166D
|| ch == 0x166E
|| ch == 0x16EB
|| ch == 0x16EC
|| ch == 0x16ED
|| ch == 0x16EE
|| ch == 0x16EF
|| ch == 0x16F0
|| ch == 0x17DC
|| ch == 0x1800
|| ch == 0x1801
|| ch == 0x1802
|| ch == 0x1803
|| ch == 0x1804
|| ch == 0x1805
|| ch == 0x1807
|| ch == 0x1808
|| ch == 0x1809
|| ch == 0x180A
|| ch == 0x2015
|| ch == 0x2016
|| ch == 0x2017
|| ch == 0x2020
|| ch == 0x2021
|| ch == 0x2022
|| ch == 0x2023
|| ch == 0x2038
|| ch == 0x203B
|| ch == 0x203D
|| ch == 0x203E
|| ch == 0x203F
|| ch == 0x2040
|| ch == 0x2041
|| ch == 0x2042
|| ch == 0x2043
|| ch == 0x2048
|| ch == 0x2049
|| ch == 0x204A
|| ch == 0x204B
|| ch == 0x204C
|| ch == 0x204D
|| ch == 0x2070
|| ch == 0x2074
|| ch == 0x2075
|| ch == 0x2076
|| ch == 0x2077
|| ch == 0x2078
|| ch == 0x2079
|| ch == 0x2080
|| ch == 0x2081
|| ch == 0x2082
|| ch == 0x2083
|| ch == 0x2084
|| ch == 0x2085
|| ch == 0x2086
|| ch == 0x2087
|| ch == 0x2088
|| ch == 0x2089
|| (ch >= 0x2153 && ch <= 0x215E)
|| ch == 0x215F
|| (ch >= 0x2160 && ch <= 0x2183)
|| (ch >= 0x2460 && ch <= 0x2473)
|| (ch >= 0x2474 && ch <= 0x2487)
|| (ch >= 0x2488 && ch <= 0x249B)
|| ch == 0x24EA
|| (ch >= 0x2776 && ch <= 0x2793)
|| ch == 0x10320
|| ch == 0x10321
|| ch == 0x10322
|| ch == 0x10323
|| ch == 0x1034A)
if (!(attr & ((1 << LBP_CM) | (1 << LBP_NS) | (1 << LBP_ID) | (1 << LBP_BA) | (1 << LBP_BB) | (1 << LBP_PO) | (1 << LBP_PR) | (1 << LBP_SA) | (1 << LBP_CB))))
{
if (unicode_width[ch] != NULL
&& unicode_width[ch][0] == 'A')
attr |= 1 << LBP_AI;
else
attr |= 1 << LBP_AL;
}
}
/* Nothing matched: report LBP_XX.  */
if (attr == 0)
attr |= 1 << LBP_XX;
return attr;
}
static void
debug_output_lbp (FILE *stream)
{
unsigned int i;
for (i = 0; i < 0x110000; i++)
{
int attr = get_lbp (i);
if (attr != 1 << LBP_XX)
{
fprintf (stream, "0x%04X", i);
#define PRINT_BIT(attr,bit) \
if (attr & (1 << bit)) fprintf (stream, " " #bit);
PRINT_BIT(attr,LBP_BK);
PRINT_BIT(attr,LBP_CM);
PRINT_BIT(attr,LBP_ZW);
PRINT_BIT(attr,LBP_IN);
PRINT_BIT(attr,LBP_GL);
PRINT_BIT(attr,LBP_CB);
PRINT_BIT(attr,LBP_SP);
PRINT_BIT(attr,LBP_BA);
PRINT_BIT(attr,LBP_BB);
PRINT_BIT(attr,LBP_B2);
PRINT_BIT(attr,LBP_HY);
PRINT_BIT(attr,LBP_NS);
PRINT_BIT(attr,LBP_OP);
PRINT_BIT(attr,LBP_CL);
PRINT_BIT(attr,LBP_QU);
PRINT_BIT(attr,LBP_EX);
PRINT_BIT(attr,LBP_ID);
PRINT_BIT(attr,LBP_NU);
PRINT_BIT(attr,LBP_IS);
PRINT_BIT(attr,LBP_SY);
PRINT_BIT(attr,LBP_AL);
PRINT_BIT(attr,LBP_PR);
PRINT_BIT(attr,LBP_PO);
PRINT_BIT(attr,LBP_SA);
PRINT_BIT(attr,LBP_XX);
PRINT_BIT(attr,LBP_AI);
#undef PRINT_BIT
fprintf (stream, "\n");
}
}
}
/* Create (or truncate) the file FILENAME and write the debug_output_lbp()
   listing into it.  Terminates the program when the file cannot be
   opened or a write error occurred.  */
static void
debug_output_tables (const char *filename)
{
  FILE *out = fopen (filename, "w");

  if (!out)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  debug_output_lbp (out);

  if (ferror (out) != 0 || fclose (out) != 0)
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
/* unicode_org_lbp[c] is the LBP_xx value read for code point c by
   fill_org_lbp(), or LBP_XX when the file does not mention c.  */
int unicode_org_lbp[0x110000];

/* Fill unicode_org_lbp[] from the file LINEBREAK_FILENAME.

   Lines starting with '#' are skipped.  Every other line has the form
   "XXXX;PP rest" or "XXXX..YYYY;PP rest", where XXXX/YYYY are
   hexadecimal code points and PP is a property name: the part after
   "LBP_" of one of the enum constants, or "LF"/"CR" (stored as LBP_BK)
   or "SG" (stored as LBP_XX).

   Any open, format or read error, an unknown property name, or a code
   point of 0x110000 or above terminates the program.  */
static void
fill_org_lbp (const char *linebreak_filename)
{
  unsigned int i;
  FILE *stream;
  char field0[FIELDLEN];
  char field1[FIELDLEN];
  char field2[FIELDLEN];
  int lineno = 0;

  for (i = 0; i < 0x110000; i++)
    unicode_org_lbp[i] = LBP_XX;

  stream = fopen (linebreak_filename, "r");
  if (stream == NULL)
    {
      fprintf (stderr, "error during fopen of '%s'\n", linebreak_filename);
      exit (1);
    }

  for (;;)
    {
      int n;
      int c;
      int value;
      unsigned long first, last, cp;
      const char *range_sep;

      lineno++;
      c = getc (stream);
      if (c == EOF)
        break;
      if (c == '#')
        {
          /* Comment line: discard it entirely.  */
          do c = getc (stream); while (c != EOF && c != '\n');
          continue;
        }
      ungetc (c, stream);
      n = getfield (stream, field0, ';');
      n += getfield (stream, field1, ' ');
      n += getfield (stream, field2, '\n');
      if (n == 0)
        break;
      if (n != 3)
        {
          fprintf (stderr, "short line in '%s':%d\n", linebreak_filename,
                   lineno);
          exit (1);
        }

      /* #bit + 4 skips the "LBP_" prefix of the stringized constant.  */
#define TRY(bit) else if (strcmp (field1, #bit + 4) == 0) value = bit;
      if (false) {}
      TRY(LBP_BK)
      TRY(LBP_CM)
      TRY(LBP_ZW)
      TRY(LBP_IN)
      TRY(LBP_GL)
      TRY(LBP_CB)
      TRY(LBP_SP)
      TRY(LBP_BA)
      TRY(LBP_BB)
      TRY(LBP_B2)
      TRY(LBP_HY)
      TRY(LBP_NS)
      TRY(LBP_OP)
      TRY(LBP_CL)
      TRY(LBP_QU)
      TRY(LBP_EX)
      TRY(LBP_ID)
      TRY(LBP_NU)
      TRY(LBP_IS)
      TRY(LBP_SY)
      TRY(LBP_AL)
      TRY(LBP_PR)
      TRY(LBP_PO)
      TRY(LBP_SA)
      TRY(LBP_XX)
      TRY(LBP_AI)
#undef TRY
      else if (strcmp (field1, "LF") == 0) value = LBP_BK;
      else if (strcmp (field1, "CR") == 0) value = LBP_BK;
      else if (strcmp (field1, "SG") == 0) value = LBP_XX;
      else
        {
          fprintf (stderr, "unknown property value \"%s\" in '%s':%d\n",
                   field1, linebreak_filename, lineno);
          exit (1);
        }

      first = strtoul (field0, NULL, 16);
      range_sep = strstr (field0, "..");
      last = (range_sep != NULL ? strtoul (range_sep + 2, NULL, 16) : first);

      /* The values come straight from the file; refuse anything that
         would index past the end of unicode_org_lbp[].  */
      if (first >= 0x110000 || last >= 0x110000)
        {
          fprintf (stderr, "code point out of range in '%s':%d\n",
                   linebreak_filename, lineno);
          exit (1);
        }

      for (cp = first; cp <= last; cp++)
        unicode_org_lbp[cp] = value;
    }

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error reading from '%s'\n", linebreak_filename);
      exit (1);
    }
}
static void
debug_output_org_lbp (FILE *stream)
{
unsigned int i;
for (i = 0; i < 0x110000; i++)
{
int attr = unicode_org_lbp[i];
if (attr != LBP_XX)
{
fprintf (stream, "0x%04X", i);
#define PRINT_BIT(attr,bit) \
if (attr == bit) fprintf (stream, " " #bit);
PRINT_BIT(attr,LBP_BK);
PRINT_BIT(attr,LBP_CM);
PRINT_BIT(attr,LBP_ZW);
PRINT_BIT(attr,LBP_IN);
PRINT_BIT(attr,LBP_GL);
PRINT_BIT(attr,LBP_CB);
PRINT_BIT(attr,LBP_SP);
PRINT_BIT(attr,LBP_BA);
PRINT_BIT(attr,LBP_BB);
PRINT_BIT(attr,LBP_B2);
PRINT_BIT(attr,LBP_HY);
PRINT_BIT(attr,LBP_NS);
PRINT_BIT(attr,LBP_OP);
PRINT_BIT(attr,LBP_CL);
PRINT_BIT(attr,LBP_QU);
PRINT_BIT(attr,LBP_EX);
PRINT_BIT(attr,LBP_ID);
PRINT_BIT(attr,LBP_NU);
PRINT_BIT(attr,LBP_IS);
PRINT_BIT(attr,LBP_SY);
PRINT_BIT(attr,LBP_AL);
PRINT_BIT(attr,LBP_PR);
PRINT_BIT(attr,LBP_PO);
PRINT_BIT(attr,LBP_SA);
PRINT_BIT(attr,LBP_XX);
PRINT_BIT(attr,LBP_AI);
#undef PRINT_BIT
fprintf (stream, "\n");
}
}
}
/* Create (or truncate) the file FILENAME and write the
   debug_output_org_lbp() listing into it.  Terminates the program when
   the file cannot be opened or a write error occurred.  */
static void
debug_output_org_tables (const char *filename)
{
  FILE *out = fopen (filename, "w");

  if (!out)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  debug_output_org_lbp (out);

  if (ferror (out) != 0 || fclose (out) != 0)
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
/* Configuration macros defined before including "3level.h", whose
   contents are not visible here.
   NOTE(review): presumably the header uses TABLE to name the
   `struct lbp_table` type and the lbp_table_init / lbp_table_add /
   lbp_table_finalize functions called by output_lbp(), ELEMENT as the
   stored value type, DEFAULT as the value of unset entries, and
   xmalloc / xrealloc as its allocators — confirm against the header.
   Note that xmalloc / xrealloc are mapped to plain malloc / realloc
   here.  */
#define TABLE lbp_table
#define ELEMENT unsigned char
#define DEFAULT LBP_XX
#define xmalloc malloc
#define xrealloc realloc
#include "3level.h"
/* Write the line breaking property table as C source text to STREAM.

   Every code point whose get_lbp() mask is not (1 << LBP_XX) is added
   to a three-level table (p = 7, q = 9) under the index of its single
   set bit; the program aborts if a mask has no bit or more than one bit
   set.  The finalized table image t.result is then printed as:
     - five "#define lbrkprop_header_N" lines for its first five
       uint32_t words,
     - a `lbrkprop` struct with the arrays level1, level2 and level3.
   Level 1 and level 2 entries are converted from byte offsets inside
   t.result to indices into the next level, with -1 for a zero offset;
   level 3 entries are printed as LBP_xx names.

   All integer arguments printed with %d are cast to int: the ternaries
   below would otherwise have type size_t because of the division by
   sizeof, and the types of the size fields come from "3level.h", which
   is not visible here.  */
static void
output_lbp (FILE *stream)
{
  unsigned int i;
  struct lbp_table t;
  unsigned int level1_offset, level2_offset, level3_offset;

  t.p = 7;
  t.q = 9;
  lbp_table_init (&t);

  for (i = 0; i < 0x110000; i++)
    {
      int attr = get_lbp (i);

      /* Now attr should contain exactly one bit.  */
      if (attr == 0 || ((attr & (attr - 1)) != 0))
        abort ();

      if (attr != 1 << LBP_XX)
        {
          unsigned int log2_attr;
          for (log2_attr = 0; attr > 1; attr >>= 1, log2_attr++);

          lbp_table_add (&t, i, log2_attr);
        }
    }

  lbp_table_finalize (&t);

  /* Byte offsets of the three levels inside t.result: a header of five
     uint32_t words, then level 1, level 2 and level 3.  */
  level1_offset =
    5 * sizeof (uint32_t);
  level2_offset =
    5 * sizeof (uint32_t)
    + t.level1_size * sizeof (uint32_t);
  level3_offset =
    5 * sizeof (uint32_t)
    + t.level1_size * sizeof (uint32_t)
    + (t.level2_size << t.q) * sizeof (uint32_t);

  for (i = 0; i < 5; i++)
    fprintf (stream, "#define lbrkprop_header_%d %d\n", (int) i,
             (int) ((uint32_t *) t.result)[i]);
  fprintf (stream, "static const\n");
  fprintf (stream, "struct\n");
  fprintf (stream, " {\n");
  fprintf (stream, " int level1[%d];\n", (int) t.level1_size);
  fprintf (stream, " int level2[%d << %d];\n", (int) t.level2_size,
           (int) t.q);
  fprintf (stream, " unsigned char level3[%d << %d];\n",
           (int) t.level3_size, (int) t.p);
  fprintf (stream, " }\n");
  fprintf (stream, "lbrkprop =\n");
  fprintf (stream, "{\n");

  /* Level 1.  */
  fprintf (stream, " {");
  for (i = 0; i < t.level1_size; i++)
    {
      uint32_t offset;
      if (i > 0 && (i % 8) == 0)
        fprintf (stream, "\n ");
      offset = ((uint32_t *) (t.result + level1_offset))[i];
      fprintf (stream, " %5d%s",
               offset == 0
               ? -1
               : (int) ((offset - level2_offset) / sizeof (uint32_t)),
               (i+1 < t.level1_size ? "," : ""));
    }
  fprintf (stream, " },\n");

  /* Level 2.  */
  fprintf (stream, " {");
  if (t.level2_size << t.q > 8)
    fprintf (stream, "\n ");
  for (i = 0; i < t.level2_size << t.q; i++)
    {
      uint32_t offset;
      if (i > 0 && (i % 8) == 0)
        fprintf (stream, "\n ");
      offset = ((uint32_t *) (t.result + level2_offset))[i];
      fprintf (stream, " %5d%s",
               offset == 0
               ? -1
               : (int) ((offset - level3_offset) / sizeof (uint8_t)),
               (i+1 < t.level2_size << t.q ? "," : ""));
    }
  if (t.level2_size << t.q > 8)
    fprintf (stream, "\n ");
  fprintf (stream, " },\n");

  /* Level 3.  */
  fprintf (stream, " {");
  if (t.level3_size << t.p > 8)
    fprintf (stream, "\n ");
  for (i = 0; i < t.level3_size << t.p; i++)
    {
      unsigned char value = ((unsigned char *) (t.result + level3_offset))[i];
      const char *value_string;
      switch (value)
        {
#define CASE(x) case x: value_string = #x; break;
          CASE(LBP_BK);
          CASE(LBP_CM);
          CASE(LBP_ZW);
          CASE(LBP_IN);
          CASE(LBP_GL);
          CASE(LBP_CB);
          CASE(LBP_SP);
          CASE(LBP_BA);
          CASE(LBP_BB);
          CASE(LBP_B2);
          CASE(LBP_HY);
          CASE(LBP_NS);
          CASE(LBP_OP);
          CASE(LBP_CL);
          CASE(LBP_QU);
          CASE(LBP_EX);
          CASE(LBP_ID);
          CASE(LBP_NU);
          CASE(LBP_IS);
          CASE(LBP_SY);
          CASE(LBP_AL);
          CASE(LBP_PR);
          CASE(LBP_PO);
          CASE(LBP_SA);
          CASE(LBP_XX);
          CASE(LBP_AI);
#undef CASE
          default:
            /* A stored value that is not one of the LBP_xx constants.  */
            abort ();
        }
      if (i > 0 && (i % 8) == 0)
        fprintf (stream, "\n ");
      fprintf (stream, " %s%s", value_string,
               (i+1 < t.level3_size << t.p ? "," : ""));
    }
  if (t.level3_size << t.p > 8)
    fprintf (stream, "\n ");
  fprintf (stream, " }\n");
  fprintf (stream, "};\n");
}
/* Create (or truncate) the file FILENAME and write into it a two-line
   comment header naming the Unicode VERSION, an empty line, and the
   table produced by output_lbp().  Terminates the program when the file
   cannot be opened or a write error occurred.  */
static void
output_tables (const char *filename, const char *version)
{
  FILE *out = fopen (filename, "w");

  if (!out)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  fprintf (out, "/* Line breaking properties of Unicode characters. */\n");
  fprintf (out,
           "/* Generated automatically by gen-lbrkprop for Unicode %s. */\n",
           version);
  fprintf (out, "\n");

  output_lbp (out);

  if (ferror (out) != 0 || fclose (out) != 0)
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
/* Program entry point.  Expects exactly five arguments: the four input
   files named in the usage message and the Unicode version string.
   Reads the input files, then writes "lbrkprop.txt", "lbrkprop_org.txt"
   and "lbrkprop.h" into the current directory.  */
int
main (int argc, char * argv[])
{
  const char *unicodedata_file;
  const char *combining_file;
  const char *width_file;
  const char *linebreak_file;
  const char *version;

  if (argc != 6)
    {
      fprintf (stderr, "Usage: %s UnicodeData.txt Combining.txt EastAsianWidth.txt LineBreak.txt version\n",
               argv[0]);
      exit (1);
    }

  unicodedata_file = argv[1];
  combining_file = argv[2];
  width_file = argv[3];
  linebreak_file = argv[4];
  version = argv[5];

  /* Load the input tables; fill_width() depends on the result of
     fill_attributes().  */
  fill_attributes (unicodedata_file);
  fill_combining (combining_file);
  fill_width (width_file);
  fill_org_lbp (linebreak_file);

  /* Emit the two debugging listings and the generated header.  */
  debug_output_tables ("lbrkprop.txt");
  debug_output_org_tables ("lbrkprop_org.txt");
  output_tables ("lbrkprop.h", version);

  return 0;
}