#ifdef HAVE_CONFIG_H
# include <config.h>
#endif
#include "linebreak.h"
#include <stdlib.h>
#include <string.h>
#include "c-ctype.h"
#include "xsize.h"
#include "utf8-ucs4.h"
#ifdef unused
#include "utf16-ucs4.h"
/* Trivial "decoder" for UTF-32: one array element is one character.
   Stores the element at S in *PUC and reports that one unit was consumed.
   N (the number of available units) is not consulted.  */
static inline int
u32_mbtouc (unsigned int *puc, const unsigned int *s, size_t n)
{
  (void) n;
  puc[0] = s[0];
  return 1;
}
#endif
/* STREQ (s1, s2, c0, ..., c8) tests whether the strings S1 and S2 are equal.
   C0..C8 must be the first nine characters of S2 (padded with 0 after its
   terminating NUL).  With an optimizing GCC the comparison is unrolled
   character by character through the streqN helpers below; otherwise it is
   a plain strcmp.  */
#if defined (__GNUC__) && defined (__OPTIMIZE__)

/* Compare everything from offset 9 onwards.  */
static inline int
streq9 (const char *s1, const char *s2)
{
  return strcmp (s1 + 9, s2 + 9) == 0;
}

/* Each streqN checks position N of S1 against the expected character.  On a
   match, a NUL means both strings end here (equal); otherwise the comparison
   continues with position N+1.  */
static inline int
streq8 (const char *s1, const char *s2, char c8)
{
  return s1[8] == c8 && (c8 == 0 || streq9 (s1, s2));
}

static inline int
streq7 (const char *s1, const char *s2, char c7, char c8)
{
  return s1[7] == c7 && (c7 == 0 || streq8 (s1, s2, c8));
}

static inline int
streq6 (const char *s1, const char *s2, char c6, char c7, char c8)
{
  return s1[6] == c6 && (c6 == 0 || streq7 (s1, s2, c7, c8));
}

static inline int
streq5 (const char *s1, const char *s2, char c5, char c6, char c7, char c8)
{
  return s1[5] == c5 && (c5 == 0 || streq6 (s1, s2, c6, c7, c8));
}

static inline int
streq4 (const char *s1, const char *s2, char c4, char c5, char c6, char c7, char c8)
{
  return s1[4] == c4 && (c4 == 0 || streq5 (s1, s2, c5, c6, c7, c8));
}

static inline int
streq3 (const char *s1, const char *s2, char c3, char c4, char c5, char c6, char c7, char c8)
{
  return s1[3] == c3 && (c3 == 0 || streq4 (s1, s2, c4, c5, c6, c7, c8));
}

static inline int
streq2 (const char *s1, const char *s2, char c2, char c3, char c4, char c5, char c6, char c7, char c8)
{
  return s1[2] == c2 && (c2 == 0 || streq3 (s1, s2, c3, c4, c5, c6, c7, c8));
}

static inline int
streq1 (const char *s1, const char *s2, char c1, char c2, char c3, char c4, char c5, char c6, char c7, char c8)
{
  return s1[1] == c1 && (c1 == 0 || streq2 (s1, s2, c2, c3, c4, c5, c6, c7, c8));
}

static inline int
streq0 (const char *s1, const char *s2, char c0, char c1, char c2, char c3, char c4, char c5, char c6, char c7, char c8)
{
  return s1[0] == c0 && (c0 == 0 || streq1 (s1, s2, c1, c2, c3, c4, c5, c6, c7, c8));
}

#define STREQ(s1,s2,s20,s21,s22,s23,s24,s25,s26,s27,s28) \
  streq0 (s1, s2, s20, s21, s22, s23, s24, s25, s26, s27, s28)

#else

#define STREQ(s1,s2,s20,s21,s22,s23,s24,s25,s26,s27,s28) \
  (strcmp (s1, s2) == 0)

#endif
/* Returns 1 if ENCODING is exactly one of the encoding names listed below
   (compared case-sensitively), 0 otherwise.  */
static int
is_cjk_encoding (const char *encoding)
{
  return (STREQ (encoding, "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0)
          || STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
          || STREQ (encoding, "GBK", 'G', 'B', 'K', 0, 0, 0, 0, 0, 0)
          || STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0)
          || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0)
          || STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0)
          || STREQ (encoding, "CP949", 'C', 'P', '9', '4', '9', 0, 0, 0, 0)
          || STREQ (encoding, "JOHAB", 'J', 'O', 'H', 'A', 'B', 0, 0, 0, 0));
}
/* Returns 1 if ENCODING is exactly the string "UTF-8", 0 otherwise.  */
static int
is_utf8_encoding (const char *encoding)
{
  return STREQ (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0) ? 1 : 0;
}
int uc_width (unsigned int uc, const char *encoding);
/* Bitmap consulted by uc_width.  It consists of 16 blocks of 64 bytes; each
   block covers 512 consecutive code points.  For a block number IND taken
   from nonspacing_table_ind, the bit for code point UC is bit (UC & 7) of
   byte 64*IND + ((UC >> 3) & 63).  A set bit makes uc_width return 0
   (or -1 when 0 < UC < 0xa0) instead of a positive width.  */
static const unsigned char nonspacing_table_data[16*64] = {
0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80,
0xff, 0xff, 0xff, 0xff, 0x00, 0x20, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xe0, 0xff, 0xff, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x78, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0xfe, 0xff, 0xfb, 0xff, 0xff, 0xbb,
0x16, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x0f, 0x00, 0x3f, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0xf8, 0xff, 0x01, 0x00, 0x00, 0x01, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0xc0, 0xff, 0x9f, 0x3d, 0x00, 0x00,
0x00, 0x80, 0x02, 0x00, 0x00, 0x00, 0xff, 0xff,
0xff, 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0xc0, 0xff, 0x01, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10,
0xfe, 0x21, 0x1e, 0x00, 0x0c, 0x00, 0x00, 0x00,
0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10,
0x1e, 0x20, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00,
0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10,
0x86, 0x39, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00,
0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10,
0xbe, 0x21, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00,
0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x90,
0x0e, 0x20, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00,
0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x01, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc0,
0xc1, 0x3d, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10,
0x00, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x0e, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x04, 0x5c, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf2, 0x07,
0x80, 0x7f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf2, 0x1b,
0x00, 0x3f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0xa0, 0x02,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfe, 0x7f,
0xdf, 0x00, 0xff, 0xfe, 0xff, 0xff, 0xff, 0x1f,
0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0xe0, 0xc5, 0x02,
0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x1c, 0x00,
0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x0c, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb0, 0x3f,
0x40, 0xfe, 0x0f, 0x20, 0x00, 0x00, 0x00, 0x00,
0x00, 0x38, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x87, 0x0f, 0x04, 0x0e,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0xf8, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x0f, 0xfc, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0xff, 0xff, 0xff, 0x07, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0xfc, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0e,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x80, 0x03, 0x00, 0xf8,
0xe7, 0x0f, 0x00, 0x00, 0x00, 0x3c, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
};
/* First-level index used by uc_width, indexed by UC >> 9 (valid for
   UC >> 9 < 240).  An entry of -1 means uc_width performs no bitmap test for
   that 512-code-point page; a non-negative entry is the number of the
   64-byte block in nonspacing_table_data that holds the page's bitmap.  */
static const signed char nonspacing_table_ind[240] = {
0, 1, 2, 3, 4, 5, 6, 7,
8, -1, -1, 9, 10, -1, -1, -1,
11, -1, -1, -1, -1, -1, -1, -1,
12, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, 13, -1, 14,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1,
15, -1, -1, -1, -1, -1, -1, -1
};
/* Determines the number of screen columns needed for the character UC.
   Returns 0 for characters flagged in the nonspacing tables (and for the
   U+E0001, U+E0020..U+E007F and U+E0100..U+E01EF ranges), -1 for flagged
   characters in the range 0 < UC < 0xa0, 2 for the double-width ranges
   listed below, 2 for most other non-ASCII characters when ENCODING is one
   of the names accepted by is_cjk_encoding, and 1 otherwise.  */
int
uc_width (unsigned int uc, const char *encoding)
{
  const unsigned int page = uc >> 9;

  if (page < 240)
    {
      /* Two-level bitmap lookup: page index, then one bit per code point.  */
      const int block = nonspacing_table_ind[page];

      if (block >= 0
          && ((nonspacing_table_data[64*block + ((uc >> 3) & 63)] >> (uc & 7)) & 1) != 0)
        return (uc > 0 && uc < 0xa0) ? -1 : 0;
    }
  else if (page == (0xe0000 >> 9))
    {
      /* This page is not covered by the tables; handle it by range.  */
      if (uc == 0xe0001
          || (uc >= 0xe0020 && uc <= 0xe007f)
          || (uc >= 0xe0100 && uc <= 0xe01ef))
        return 0;
    }

  /* Ranges that always occupy two columns.  */
  if (uc >= 0x1100)
    {
      if (uc < 0x1160)
        return 2;
      if (uc >= 0x2e80 && uc < 0x4dc0 && uc != 0x303f)
        return 2;
      if (uc >= 0x4e00 && uc < 0xa4d0)
        return 2;
      if (uc >= 0xac00 && uc < 0xd7a4)
        return 2;
      if (uc >= 0xf900 && uc < 0xfb00)
        return 2;
      if (uc >= 0xfe30 && uc < 0xfe70)
        return 2;
      if (uc >= 0xff00 && uc < 0xff61)
        return 2;
      if (uc >= 0xffe0 && uc < 0xffe7)
        return 2;
      if (uc >= 0x20000 && uc <= 0x2fffd)
        return 2;
      if (uc >= 0x30000 && uc <= 0x3fffd)
        return 2;
    }

  /* Under the listed CJK encodings, everything in U+00A1..U+FF60 except
     U+20A9 is double-width as well.  */
  if (uc >= 0x00A1 && uc < 0xFF61 && uc != 0x20A9
      && is_cjk_encoding (encoding))
    return 2;

  return 1;
}
#ifdef unused
/* Sums uc_width over the characters of the UTF-8 string S of N bytes,
   stopping early at a NUL character.  Characters for which uc_width returns
   a negative value contribute nothing.  */
int
u8_width (const unsigned char *s, size_t n, const char *encoding)
{
  const unsigned char *const stop = s + n;
  int columns = 0;

  while (s < stop)
    {
      unsigned int uc;
      int w;

      s += u8_mbtouc (&uc, s, stop - s);
      if (uc == 0)
        break;

      w = uc_width (uc, encoding);
      columns += (w >= 0 ? w : 0);
    }

  return columns;
}
/* Sums uc_width over the characters of the UTF-16 string S of N units,
   stopping early at a NUL character.  Characters for which uc_width returns
   a negative value contribute nothing.  */
int
u16_width (const unsigned short *s, size_t n, const char *encoding)
{
  const unsigned short *const stop = s + n;
  int columns = 0;

  while (s < stop)
    {
      unsigned int uc;
      int w;

      s += u16_mbtouc (&uc, s, stop - s);
      if (uc == 0)
        break;

      w = uc_width (uc, encoding);
      columns += (w >= 0 ? w : 0);
    }

  return columns;
}
/* Sums uc_width over the N characters of the UTF-32 string S, stopping early
   at a NUL character.  Characters for which uc_width returns a negative
   value contribute nothing.  */
int
u32_width (const unsigned int *s, size_t n, const char *encoding)
{
  const unsigned int *const stop = s + n;
  int columns = 0;

  for (; s < stop; s++)
    {
      const unsigned int uc = *s;
      int w;

      if (uc == 0)
        break;

      w = uc_width (uc, encoding);
      columns += (w >= 0 ? w : 0);
    }

  return columns;
}
#endif
/* Line breaking classes, listed by numeric value.
   NOTE(review): the names look like the class abbreviations of Unicode
   UAX #14 - confirm against lbrkprop.h.  */
enum
{
  /* Handled before any table lookup: forces a mandatory break.  */
  LBP_BK = 0,

  /* Values 1..19 index lbrk_table (as value - 1).  */
  LBP_ZW = 1,
  LBP_IN = 2,
  LBP_GL = 3,
  LBP_BA = 4,
  LBP_BB = 5,
  LBP_B2 = 6,
  LBP_HY = 7,
  LBP_NS = 8,
  LBP_OP = 9,
  LBP_CL = 10,
  LBP_QU = 11,
  LBP_EX = 12,
  LBP_ID = 13,
  LBP_NU = 14,
  LBP_IS = 15,
  LBP_SY = 16,
  LBP_AL = 17,
  LBP_PR = 18,
  LBP_PO = 19,

  /* Treated specially by the *_possible_linebreaks loops, not via the table.  */
  LBP_CM = 20,
  LBP_SP = 21,

  /* Remapped to one of the table classes before the lookup.  */
  LBP_CB = 22,
  LBP_SA = 23,
  LBP_AI = 24,
  LBP_XX = 25
};
#include "lbrkprop.h"
/* Looks up the line breaking class of UC in the three-level lbrkprop table.
   Returns LBP_XX when UC lies beyond the first level or when either of the
   first two levels holds a negative (absent) entry.  */
static inline unsigned char
lbrkprop_lookup (unsigned int uc)
{
  const unsigned int index1 = uc >> lbrkprop_header_0;
  unsigned int index2;
  unsigned int index3;
  int lookup1;
  int lookup2;

  if (!(index1 < lbrkprop_header_1))
    return LBP_XX;

  lookup1 = lbrkprop.level1[index1];
  if (lookup1 < 0)
    return LBP_XX;

  index2 = (uc >> lbrkprop_header_2) & lbrkprop_header_3;
  lookup2 = lbrkprop.level2[lookup1 + index2];
  if (lookup2 < 0)
    return LBP_XX;

  index3 = uc & lbrkprop_header_4;
  return lbrkprop.level3[lookup2 + index3];
}
/* Entries of lbrk_table, as interpreted by the *_possible_linebreaks loops:
   D - a break is possible between the two characters;
   I - a break is possible only if spaces were seen between them;
   P - a break is prohibited.  */
#define D 1
#define I 2
#define P 3
/* Pair table: the row is selected by the class of the previous character
   (last_prop - 1), the column by the class of the current character
   (prop - 1).  Only classes with values 1..19 take part.  */
static const unsigned char lbrk_table[19][19] = {
{ P, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, },
{ P, I, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
{ P, I, I, I, I, I, I, I, I, P, I, P, I, I, P, P, I, I, I, },
{ P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
{ P, I, I, I, I, I, I, I, I, P, I, P, I, I, P, P, I, I, I, },
{ P, D, I, I, D, P, I, I, D, P, I, P, D, D, P, P, D, D, D, },
{ P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
{ P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
{ P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, },
{ P, D, I, I, D, D, I, P, D, P, I, P, D, D, P, P, D, D, I, },
{ P, I, I, I, I, I, I, I, P, P, I, P, I, I, P, P, I, I, I, },
{ P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
{ P, I, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, I, },
{ P, I, I, I, D, D, I, I, D, P, I, P, D, I, P, P, I, D, I, },
{ P, D, I, I, D, D, I, I, D, P, I, P, D, I, P, P, D, D, D, },
{ P, D, I, I, D, D, I, I, D, P, I, P, D, I, P, P, D, D, D, },
{ P, I, I, I, D, D, I, I, D, P, I, P, D, I, P, P, I, D, D, },
{ P, D, I, I, D, D, I, I, I, P, I, P, I, I, P, P, I, D, D, },
{ P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
};
/* Determines the line break points in the UTF-8 string S of N bytes and
   stores the result in P[0..N-1].  P[i] describes the position before the
   character starting at byte i and is one of UC_BREAK_MANDATORY,
   UC_BREAK_POSSIBLE or UC_BREAK_PROHIBITED; bytes that do not start a
   character keep UC_BREAK_PROHIBITED.  ENCODING only influences how LBP_AI
   characters are classified.  */
void
u8_possible_linebreaks (const unsigned char *s, size_t n, const char *encoding, char *p)
{
/* Class substituted for LBP_AI: LBP_ID under a CJK encoding, else LBP_AL.  */
int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL);
const unsigned char *s_end = s + n;
/* Class of the previous non-space character; LBP_BK at start of text or
   right after a mandatory break.  */
int last_prop = LBP_BK;
/* Positions in P of the last and the next-to-last space seen since the
   previous non-space character, or NULL.  */
char *seen_space = NULL;
char *seen_space2 = NULL;
/* Default for all bytes, in particular the trailing bytes of multibyte
   characters, which the loop below never visits.  */
memset (p, UC_BREAK_PROHIBITED, n);
while (s < s_end)
{
unsigned int uc;
int count = u8_mbtouc (&uc, s, s_end - s);
int prop = lbrkprop_lookup (uc);
if (prop == LBP_BK)
{
/* Mandatory break; forget everything about the preceding text.  */
*p = UC_BREAK_MANDATORY;
last_prop = LBP_BK;
seen_space = NULL;
seen_space2 = NULL;
}
else
{
/* Position in P that receives the table verdict.  */
char *q;
/* Fold the classes that have no row/column in lbrk_table.  */
switch (prop)
{
case LBP_AI:
prop = LBP_AI_REPLACEMENT;
break;
case LBP_CB:
prop = LBP_ID;
break;
case LBP_SA:
case LBP_XX:
prop = LBP_AL;
break;
}
q = p;
if (prop == LBP_CM)
{
/* No break before a LBP_CM character itself.  */
*p = UC_BREAK_PROHIBITED;
if (seen_space != NULL)
{
/* The character follows a space: treat the pair as one LBP_AL character
   located at the space, so the verdict is stored at the space's position
   and only the spaces before that one count as "seen".  */
q = seen_space;
seen_space = seen_space2;
prop = LBP_AL;
goto lookup_via_table;
}
/* Otherwise last_prop and the space bookkeeping are left untouched.  */
}
else if (prop == LBP_SP)
{
/* No break before a space; remember where it is.  */
*p = UC_BREAK_PROHIBITED;
seen_space2 = seen_space;
seen_space = p;
}
else
{
lookup_via_table:
/* Only classes 1..19 have table entries.  */
if (!(prop >= 1 && prop <= sizeof(lbrk_table) / sizeof(lbrk_table[0])))
abort ();
if (last_prop == LBP_BK)
{
/* Start of text or of a line: nothing to break from.  */
*q = UC_BREAK_PROHIBITED;
}
else
{
switch (lbrk_table [last_prop-1] [prop-1])
{
case D:
*q = UC_BREAK_POSSIBLE;
break;
case I:
/* Break allowed only when spaces separate the two characters.  */
*q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED);
break;
case P:
*q = UC_BREAK_PROHIBITED;
break;
default:
abort ();
}
}
last_prop = prop;
seen_space = NULL;
seen_space2 = NULL;
}
}
/* Advance input and output in lockstep, one character at a time.  */
s += count;
p += count;
}
}
#ifdef unused
/* Determines the line break points in the UTF-16 string S of N units and
   stores the result in P[0..N-1].  P[i] describes the position before the
   character starting at unit i and is one of UC_BREAK_MANDATORY,
   UC_BREAK_POSSIBLE or UC_BREAK_PROHIBITED; units that do not start a
   character keep UC_BREAK_PROHIBITED.  ENCODING only influences how LBP_AI
   characters are classified.  */
void
u16_possible_linebreaks (const unsigned short *s, size_t n, const char *encoding, char *p)
{
/* Class substituted for LBP_AI: LBP_ID under a CJK encoding, else LBP_AL.  */
int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL);
const unsigned short *s_end = s + n;
/* Class of the previous non-space character; LBP_BK at start of text or
   right after a mandatory break.  */
int last_prop = LBP_BK;
/* Positions in P of the last and the next-to-last space seen since the
   previous non-space character, or NULL.  */
char *seen_space = NULL;
char *seen_space2 = NULL;
/* Default for all units, in particular the second unit of characters that
   occupy two units, which the loop below never visits.  */
memset (p, UC_BREAK_PROHIBITED, n);
while (s < s_end)
{
unsigned int uc;
int count = u16_mbtouc (&uc, s, s_end - s);
int prop = lbrkprop_lookup (uc);
if (prop == LBP_BK)
{
/* Mandatory break; forget everything about the preceding text.  */
*p = UC_BREAK_MANDATORY;
last_prop = LBP_BK;
seen_space = NULL;
seen_space2 = NULL;
}
else
{
/* Position in P that receives the table verdict.  */
char *q;
/* Fold the classes that have no row/column in lbrk_table.  */
switch (prop)
{
case LBP_AI:
prop = LBP_AI_REPLACEMENT;
break;
case LBP_CB:
prop = LBP_ID;
break;
case LBP_SA:
case LBP_XX:
prop = LBP_AL;
break;
}
q = p;
if (prop == LBP_CM)
{
/* No break before a LBP_CM character itself.  */
*p = UC_BREAK_PROHIBITED;
if (seen_space != NULL)
{
/* The character follows a space: treat the pair as one LBP_AL character
   located at the space, so the verdict is stored at the space's position
   and only the spaces before that one count as "seen".  */
q = seen_space;
seen_space = seen_space2;
prop = LBP_AL;
goto lookup_via_table;
}
/* Otherwise last_prop and the space bookkeeping are left untouched.  */
}
else if (prop == LBP_SP)
{
/* No break before a space; remember where it is.  */
*p = UC_BREAK_PROHIBITED;
seen_space2 = seen_space;
seen_space = p;
}
else
{
lookup_via_table:
/* Only classes 1..19 have table entries.  */
if (!(prop >= 1 && prop <= sizeof(lbrk_table) / sizeof(lbrk_table[0])))
abort ();
if (last_prop == LBP_BK)
{
/* Start of text or of a line: nothing to break from.  */
*q = UC_BREAK_PROHIBITED;
}
else
{
switch (lbrk_table [last_prop-1] [prop-1])
{
case D:
*q = UC_BREAK_POSSIBLE;
break;
case I:
/* Break allowed only when spaces separate the two characters.  */
*q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED);
break;
case P:
*q = UC_BREAK_PROHIBITED;
break;
default:
abort ();
}
}
last_prop = prop;
seen_space = NULL;
seen_space2 = NULL;
}
}
/* Advance input and output in lockstep, one character at a time.  */
s += count;
p += count;
}
}
/* Determines the line break points in the UTF-32 string S of N characters
   and stores the result in P[0..N-1].  P[i] describes the position before
   character i and is one of UC_BREAK_MANDATORY, UC_BREAK_POSSIBLE or
   UC_BREAK_PROHIBITED.  ENCODING only influences how LBP_AI characters are
   classified.  Unlike the UTF-8/UTF-16 variants, P is not pre-filled: every
   element is assigned inside the loop because each character is one unit.  */
void
u32_possible_linebreaks (const unsigned int *s, size_t n, const char *encoding, char *p)
{
/* Class substituted for LBP_AI: LBP_ID under a CJK encoding, else LBP_AL.  */
int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL);
const unsigned int *s_end = s + n;
/* Class of the previous non-space character; LBP_BK at start of text or
   right after a mandatory break.  */
int last_prop = LBP_BK;
/* Positions in P of the last and the next-to-last space seen since the
   previous non-space character, or NULL.  */
char *seen_space = NULL;
char *seen_space2 = NULL;
while (s < s_end)
{
unsigned int uc = *s;
int prop = lbrkprop_lookup (uc);
if (prop == LBP_BK)
{
/* Mandatory break; forget everything about the preceding text.  */
*p = UC_BREAK_MANDATORY;
last_prop = LBP_BK;
seen_space = NULL;
seen_space2 = NULL;
}
else
{
/* Position in P that receives the table verdict.  */
char *q;
/* Fold the classes that have no row/column in lbrk_table.  */
switch (prop)
{
case LBP_AI:
prop = LBP_AI_REPLACEMENT;
break;
case LBP_CB:
prop = LBP_ID;
break;
case LBP_SA:
case LBP_XX:
prop = LBP_AL;
break;
}
q = p;
if (prop == LBP_CM)
{
/* No break before a LBP_CM character itself.  */
*p = UC_BREAK_PROHIBITED;
if (seen_space != NULL)
{
/* The character follows a space: treat the pair as one LBP_AL character
   located at the space, so the verdict is stored at the space's position
   and only the spaces before that one count as "seen".  */
q = seen_space;
seen_space = seen_space2;
prop = LBP_AL;
goto lookup_via_table;
}
/* Otherwise last_prop and the space bookkeeping are left untouched.  */
}
else if (prop == LBP_SP)
{
/* No break before a space; remember where it is.  */
*p = UC_BREAK_PROHIBITED;
seen_space2 = seen_space;
seen_space = p;
}
else
{
lookup_via_table:
/* Only classes 1..19 have table entries.  */
if (!(prop >= 1 && prop <= sizeof(lbrk_table) / sizeof(lbrk_table[0])))
abort ();
if (last_prop == LBP_BK)
{
/* Start of text or of a line: nothing to break from.  */
*q = UC_BREAK_PROHIBITED;
}
else
{
switch (lbrk_table [last_prop-1] [prop-1])
{
case D:
*q = UC_BREAK_POSSIBLE;
break;
case I:
/* Break allowed only when spaces separate the two characters.  */
*q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED);
break;
case P:
*q = UC_BREAK_PROHIBITED;
break;
default:
abort ();
}
}
last_prop = prop;
seen_space = NULL;
seen_space2 = NULL;
}
}
s++;
p++;
}
}
#endif
/* Chooses line breaks in the UTF-8 string S of N bytes so that lines fit in
   WIDTH columns where possible.  The text starts at START_COLUMN, and
   AT_END_COLUMNS extra columns are reserved after its last piece.  O, if not
   NULL, holds per-byte overrides (entries equal to UC_BREAK_UNDEFINED are
   ignored).  On return P[0..N-1] contains UC_BREAK_MANDATORY at mandatory
   breaks, UC_BREAK_POSSIBLE where a break was chosen, and
   UC_BREAK_PROHIBITED elsewhere.  Returns the column reached at the end.  */
int
u8_width_linebreaks (const unsigned char *s, size_t n,
                     int width, int start_column, int at_end_columns,
                     const char *o, const char *encoding,
                     char *p)
{
  const unsigned char *const stop = s + n;
  char *candidate = NULL;       /* last break opportunity on the current line */
  int line_column = start_column; /* column at that opportunity */
  int run_width = 0;            /* width of the unbreakable piece since then */

  u8_possible_linebreaks (s, n, encoding, p);

  while (s < stop)
    {
      unsigned int uc;
      int step = u8_mbtouc (&uc, s, stop - s);
      int mandatory;
      int possible;

      /* An explicit override wins over the computed classification.  */
      if (o != NULL && *o != UC_BREAK_UNDEFINED)
        *p = *o;

      mandatory = (*p == UC_BREAK_MANDATORY);
      possible = (*p == UC_BREAK_POSSIBLE);

      /* A piece ends here: if it overflows the line, break before it.  */
      if ((possible || mandatory)
          && candidate != NULL && line_column + run_width > width)
        {
          *candidate = UC_BREAK_POSSIBLE;
          line_column = 0;
        }

      if (mandatory)
        {
          /* A new line starts at column 0 with no pending opportunity.  */
          candidate = NULL;
          line_column = 0;
          run_width = 0;
        }
      else
        {
          int w;

          if (possible)
            {
              /* Remember the opportunity; it is only turned into a real
                 break later, through CANDIDATE, if needed.  */
              candidate = p;
              line_column += run_width;
              run_width = 0;
            }
          *p = UC_BREAK_PROHIBITED;

          w = uc_width (uc, encoding);
          if (w >= 0)
            run_width += w;
        }

      s += step;
      p += step;
      if (o != NULL)
        o += step;
    }

  /* The final piece must also leave room for AT_END_COLUMNS.  */
  if (candidate != NULL && line_column + run_width + at_end_columns > width)
    {
      *candidate = UC_BREAK_POSSIBLE;
      line_column = 0;
    }

  return line_column + run_width;
}
#ifdef unused
/* Chooses line breaks in the UTF-16 string S of N units so that lines fit in
   WIDTH columns where possible.  The text starts at START_COLUMN, and
   AT_END_COLUMNS extra columns are reserved after its last piece.  O, if not
   NULL, holds per-unit overrides (entries equal to UC_BREAK_UNDEFINED are
   ignored).  On return P[0..N-1] contains UC_BREAK_MANDATORY at mandatory
   breaks, UC_BREAK_POSSIBLE where a break was chosen, and
   UC_BREAK_PROHIBITED elsewhere.  Returns the column reached at the end.  */
int
u16_width_linebreaks (const unsigned short *s, size_t n,
                      int width, int start_column, int at_end_columns,
                      const char *o, const char *encoding,
                      char *p)
{
  const unsigned short *const stop = s + n;
  char *candidate = NULL;       /* last break opportunity on the current line */
  int line_column = start_column; /* column at that opportunity */
  int run_width = 0;            /* width of the unbreakable piece since then */

  u16_possible_linebreaks (s, n, encoding, p);

  while (s < stop)
    {
      unsigned int uc;
      int step = u16_mbtouc (&uc, s, stop - s);
      int mandatory;
      int possible;

      /* An explicit override wins over the computed classification.  */
      if (o != NULL && *o != UC_BREAK_UNDEFINED)
        *p = *o;

      mandatory = (*p == UC_BREAK_MANDATORY);
      possible = (*p == UC_BREAK_POSSIBLE);

      /* A piece ends here: if it overflows the line, break before it.  */
      if ((possible || mandatory)
          && candidate != NULL && line_column + run_width > width)
        {
          *candidate = UC_BREAK_POSSIBLE;
          line_column = 0;
        }

      if (mandatory)
        {
          /* A new line starts at column 0 with no pending opportunity.  */
          candidate = NULL;
          line_column = 0;
          run_width = 0;
        }
      else
        {
          int w;

          if (possible)
            {
              /* Remember the opportunity; it is only turned into a real
                 break later, through CANDIDATE, if needed.  */
              candidate = p;
              line_column += run_width;
              run_width = 0;
            }
          *p = UC_BREAK_PROHIBITED;

          w = uc_width (uc, encoding);
          if (w >= 0)
            run_width += w;
        }

      s += step;
      p += step;
      if (o != NULL)
        o += step;
    }

  /* The final piece must also leave room for AT_END_COLUMNS.  */
  if (candidate != NULL && line_column + run_width + at_end_columns > width)
    {
      *candidate = UC_BREAK_POSSIBLE;
      line_column = 0;
    }

  return line_column + run_width;
}
/* Chooses line breaks in the UTF-32 string S of N characters so that lines
   fit in WIDTH columns where possible.  The text starts at START_COLUMN, and
   AT_END_COLUMNS extra columns are reserved after its last piece.  O, if not
   NULL, holds per-character overrides (entries equal to UC_BREAK_UNDEFINED
   are ignored).  On return P[0..N-1] contains UC_BREAK_MANDATORY at
   mandatory breaks, UC_BREAK_POSSIBLE where a break was chosen, and
   UC_BREAK_PROHIBITED elsewhere.  Returns the column reached at the end.  */
int
u32_width_linebreaks (const unsigned int *s, size_t n,
                      int width, int start_column, int at_end_columns,
                      const char *o, const char *encoding,
                      char *p)
{
  const unsigned int *const stop = s + n;
  char *candidate = NULL;       /* last break opportunity on the current line */
  int line_column = start_column; /* column at that opportunity */
  int run_width = 0;            /* width of the unbreakable piece since then */

  u32_possible_linebreaks (s, n, encoding, p);

  for (; s < stop; s++, p++)
    {
      const unsigned int uc = *s;
      int mandatory;
      int possible;

      /* An explicit override wins over the computed classification.  */
      if (o != NULL && *o != UC_BREAK_UNDEFINED)
        *p = *o;

      mandatory = (*p == UC_BREAK_MANDATORY);
      possible = (*p == UC_BREAK_POSSIBLE);

      /* A piece ends here: if it overflows the line, break before it.  */
      if ((possible || mandatory)
          && candidate != NULL && line_column + run_width > width)
        {
          *candidate = UC_BREAK_POSSIBLE;
          line_column = 0;
        }

      if (mandatory)
        {
          /* A new line starts at column 0 with no pending opportunity.  */
          candidate = NULL;
          line_column = 0;
          run_width = 0;
        }
      else
        {
          int w;

          if (possible)
            {
              /* Remember the opportunity; it is only turned into a real
                 break later, through CANDIDATE, if needed.  */
              candidate = p;
              line_column += run_width;
              run_width = 0;
            }
          *p = UC_BREAK_PROHIBITED;

          w = uc_width (uc, encoding);
          if (w >= 0)
            run_width += w;
        }

      if (o != NULL)
        o++;
    }

  /* The final piece must also leave room for AT_END_COLUMNS.  */
  if (candidate != NULL && line_column + run_width + at_end_columns > width)
    {
      *candidate = UC_BREAK_POSSIBLE;
      line_column = 0;
    }

  return line_column + run_width;
}
#endif
#ifdef TEST1
#include <stdio.h>
/* Reads the remaining contents of STREAM into a freshly allocated,
   NUL-terminated buffer and returns it; the caller owns the buffer and
   releases it with free().  On a read error or when memory runs out, prints
   a message to stderr and exits with status 1.
   Sizes are tracked as size_t so that large inputs cannot overflow a signed
   int.  */
char *
read_file (FILE *stream)
{
  enum { READ_CHUNK = 4096 };
  char *buf = NULL;
  char *grown;
  size_t alloc = 0;
  size_t size = 0;

  while (! feof (stream))
    {
      size_t count;

      /* Make sure a full chunk fits; grow by 1.5x, or to fit if larger.  */
      if (size + READ_CHUNK > alloc)
        {
          size_t new_alloc = alloc + alloc / 2;

          if (new_alloc < size + READ_CHUNK)
            new_alloc = size + READ_CHUNK;
          grown = realloc (buf, new_alloc);
          if (grown == NULL)
            {
              fprintf (stderr, "out of memory\n");
              exit (1);
            }
          buf = grown;
          alloc = new_alloc;
        }

      count = fread (buf + size, 1, READ_CHUNK, stream);
      if (count == 0)
        {
          /* Zero bytes is either end of file (loop ends) or an error.  */
          if (ferror (stream))
            {
              perror ("fread");
              exit (1);
            }
        }
      else
        size += count;
    }

  /* Trim to the exact size plus the terminating NUL.  */
  grown = realloc (buf, size + 1);
  if (grown == NULL)
    {
      fprintf (stderr, "out of memory\n");
      exit (1);
    }
  buf = grown;
  buf[size] = '\0';
  return buf;
}
/* Test driver (TEST1).  Reads UTF-8 text from standard input.
   Without arguments: echoes the text, inserting the UTF-8 bytes E2 80 A7
   before every possible break and E2 86 B2 before every mandatory break.
   With one argument (a column width): echoes the text with a newline
   inserted at every break chosen by u8_width_linebreaks.
   Returns 0 on success, 1 for any other argument count.  */
int
main (int argc, char * argv[])
{
  if (argc == 1)
    {
      char *input = read_file (stdin);
      size_t length = strlen (input);
      /* Never request 0 bytes: malloc (0) may return NULL, which would both
         look like a failure and be passed on to memset.  */
      char *breaks = malloc (length > 0 ? length : 1);
      size_t i;

      if (breaks == NULL)
        {
          fprintf (stderr, "out of memory\n");
          exit (1);
        }

      u8_possible_linebreaks ((unsigned char *) input, length, "UTF-8", breaks);

      for (i = 0; i < length; i++)
        {
          switch (breaks[i])
            {
            case UC_BREAK_POSSIBLE:
              putc (0xe2, stdout); putc (0x80, stdout); putc (0xa7, stdout);
              break;
            case UC_BREAK_MANDATORY:
              putc (0xe2, stdout); putc (0x86, stdout); putc (0xb2, stdout);
              break;
            case UC_BREAK_PROHIBITED:
              break;
            default:
              abort ();
            }
          putc (input[i], stdout);
        }

      free (breaks);
      free (input);
      return 0;
    }
  else if (argc == 2)
    {
      int width = atoi (argv[1]);
      char *input = read_file (stdin);
      size_t length = strlen (input);
      char *breaks = malloc (length > 0 ? length : 1);
      size_t i;

      if (breaks == NULL)
        {
          fprintf (stderr, "out of memory\n");
          exit (1);
        }

      u8_width_linebreaks ((unsigned char *) input, length, width, 0, 0, NULL, "UTF-8", breaks);

      for (i = 0; i < length; i++)
        {
          switch (breaks[i])
            {
            case UC_BREAK_POSSIBLE:
              putc ('\n', stdout);
              break;
            case UC_BREAK_MANDATORY:
              break;
            case UC_BREAK_PROHIBITED:
              break;
            default:
              abort ();
            }
          putc (input[i], stdout);
        }

      free (breaks);
      free (input);
      return 0;
    }
  else
    return 1;
}
#endif
#if HAVE_ICONV
#include <iconv.h>
#include <errno.h>
#define UTF8_NAME "UTF-8"
/* Returns the number of bytes that converting the N bytes at S through CD
   produces, or (size_t)(-1) if the conversion fails.  The converted text
   itself is discarded: it is written into a scratch buffer that is reused
   for every chunk.  */
static size_t
iconv_string_length (iconv_t cd, const char *s, size_t n)
{
#define TMPBUFSIZE 4096
  char scratch[TMPBUFSIZE];
  const char *src = s;
  size_t src_left = n;
  size_t total = 0;

  while (src_left > 0)
    {
      char *dst = scratch;
      size_t dst_left = TMPBUFSIZE;
      size_t rc = iconv (cd, (ICONV_CONST char **) &src, &src_left, &dst, &dst_left);

      /* E2BIG only means the scratch buffer filled up; go around again.  */
      if (rc == (size_t)(-1) && errno != E2BIG)
        return (size_t)(-1);
      total += dst - scratch;
    }
#if defined _LIBICONV_VERSION \
    || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
  {
    /* Account for whatever the converter emits when flushed.  */
    char *dst = scratch;
    size_t dst_left = TMPBUFSIZE;
    size_t rc = iconv (cd, NULL, NULL, &dst, &dst_left);

    if (rc == (size_t)(-1))
      return (size_t)(-1);
    total += dst - scratch;
  }
  /* Return the converter to its initial state for the next use.  */
  iconv (cd, NULL, NULL, NULL, NULL);
#endif
  return total;
#undef TMPBUFSIZE
}
/* Converts the N bytes at S through CD into the buffer T, which must hold
   exactly M bytes (the length computed beforehand), and records in
   OFFTABLE[0..N-1] where each input character landed: OFFTABLE[i] is the
   offset in T of the output for the input character starting at byte i, or
   (size_t)(-1) if no character starts at byte i.  Aborts if the conversion
   fails or does not produce exactly M bytes.  */
static void
iconv_string_keeping_offsets (iconv_t cd, const char *s, size_t n,
size_t *offtable, char *t, size_t m)
{
size_t i;
const char *s_end;
const char *inptr;
char *outptr;
size_t outsize;
/* On this platform combination the output size passed to iconv is one byte
   larger than M; the final check below expects exactly that byte to remain.
   NOTE(review): presumably a workaround for an iconv quirk in old glibc -
   confirm.  */
#if !defined _LIBICONV_VERSION && (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1)
const size_t extra = 1;
#else
const size_t extra = 0;
#endif
/* Mark every input byte as "not the start of a character".  */
for (i = 0; i < n; i++)
offtable[i] = (size_t)(-1);
s_end = s + n;
inptr = s;
outptr = t;
outsize = m + extra;
while (inptr < s_end)
{
const char *saved_inptr;
size_t insize;
size_t res;
/* A character starts here; remember where its output begins.  */
offtable[inptr - s] = outptr - t;
saved_inptr = inptr;
res = (size_t)(-1);
/* Offer the converter 1, 2, 3, ... bytes until it no longer reports an
   incomplete input sequence (EINVAL), i.e. until one whole character has
   been converted or a different error occurred.  */
for (insize = 1; inptr + insize <= s_end; insize++)
{
res = iconv (cd, (ICONV_CONST char **) &inptr, &insize, &outptr, &outsize);
if (!(res == (size_t)(-1) && errno == EINVAL))
break;
/* An incomplete sequence must not have consumed any input.  */
if (inptr != saved_inptr)
abort ();
}
/* Any remaining failure (including a truncated final character) is fatal.  */
if (res == (size_t)(-1))
abort ();
}
#if defined _LIBICONV_VERSION \
|| !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
/* Flush the converter's pending output into T.  */
if (iconv (cd, NULL, NULL, &outptr, &outsize) == (size_t)(-1))
abort ();
#endif
/* The output must have filled T exactly.  */
if (outsize != extra)
abort ();
}
#endif
#if C_CTYPE_ASCII
/* Returns 1 if each of the N bytes at S satisfies c_isprint or c_isspace,
   0 as soon as one byte satisfies neither.  An empty string yields 1.  */
static int
is_all_ascii (const char *s, size_t n)
{
  size_t i;

  for (i = 0; i < n; i++)
    {
      const unsigned char c = (unsigned char) s[i];

      if (!c_isprint (c) && !c_isspace (c))
        return 0;
    }
  return 1;
}
#endif
#if defined unused || defined TEST2
/* Determines the line break points in the string S of N bytes, encoded in
   ENCODING, and stores one UC_BREAK_* value per byte in P[0..N-1].
   Strategy, in order of preference:
   1. UTF-8 input is handed to u8_possible_linebreaks directly;
   2. with iconv available, the text is converted to UTF-8, analysed there,
      and the result is mapped back to the original byte offsets;
   3. input consisting only of printable/whitespace ASCII is analysed as if
      it were UTF-8;
   4. otherwise only '\n' is reported, as a mandatory break.
   Does nothing when N is 0.  */
void
mbs_possible_linebreaks (const char *s, size_t n, const char *encoding,
char *p)
{
if (n == 0)
return;
if (is_utf8_encoding (encoding))
u8_possible_linebreaks ((const unsigned char *) s, n, encoding, p);
else
{
#if HAVE_ICONV
iconv_t to_utf8;
/* On the platforms tested below, conversion is not attempted for the listed
   encodings: to_utf8 is set to the failure value so that the fallbacks are
   used.  Each conditional block ends in a dangling "else" that binds to the
   next statement.
   NOTE(review): presumably these iconv implementations mishandle the listed
   encodings - confirm.  */
# if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0))
to_utf8 = (iconv_t)(-1);
else
# endif
# if defined __sun && !defined _LIBICONV_VERSION
if ( STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
|| STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0)
|| STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0)
|| STREQ (encoding, "BIG5-HKSCS", 'B', 'I', 'G', '5', '-', 'H', 'K', 'S', 'C')
|| STREQ (encoding, "GBK", 'G', 'B', 'K', 0, 0, 0, 0, 0, 0)
|| STREQ (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
to_utf8 = (iconv_t)(-1);
else
# endif
to_utf8 = iconv_open (UTF8_NAME, encoding);
if (to_utf8 != (iconv_t)(-1))
{
/* Length of the UTF-8 form, or (size_t)(-1) if S cannot be converted.  */
size_t m = iconv_string_length (to_utf8, s, n);
if (m != (size_t)(-1))
{
/* One allocation holds: the offset table (n entries), the UTF-8 text
   (m bytes) and its break array (m bytes).  */
size_t memory_size = xsum3 (xtimes (n, sizeof (size_t)), m, m);
char *memory =
(size_in_bounds_p (memory_size) ? malloc (memory_size) : NULL);
if (memory != NULL)
{
size_t *offtable = (size_t *) memory;
char *t = (char *) (offtable + n);
char *q = (char *) (t + m);
size_t i;
iconv_string_keeping_offsets (to_utf8, s, n, offtable, t, m);
u8_possible_linebreaks ((const unsigned char *) t, m, encoding, q);
/* Copy the verdicts back to the bytes that start a character in S; all
   other bytes stay UC_BREAK_PROHIBITED.  */
memset (p, UC_BREAK_PROHIBITED, n);
for (i = 0; i < n; i++)
if (offtable[i] != (size_t)(-1))
p[i] = q[offtable[i]];
free (memory);
iconv_close (to_utf8);
return;
}
}
/* Conversion or allocation failed; fall through to the fallbacks.  */
iconv_close (to_utf8);
}
#endif
#if C_CTYPE_ASCII
if (is_all_ascii (s, n))
{
u8_possible_linebreaks ((const unsigned char *) s, n, encoding, p);
return;
}
#endif
{
/* Last resort: only newlines are recognized, as mandatory breaks.  */
const char *s_end = s + n;
while (s < s_end)
{
*p = (*s == '\n' ? UC_BREAK_MANDATORY : UC_BREAK_PROHIBITED);
s++;
p++;
}
}
}
}
#endif
/* Chooses line breaks in the string S of N bytes, encoded in ENCODING, so
   that lines fit in WIDTH columns where possible, and stores one UC_BREAK_*
   value per byte in P[0..N-1].  START_COLUMN, AT_END_COLUMNS and the
   optional override array O (one entry per byte of S, or NULL) are passed
   through to u8_width_linebreaks.  Returns the column reached at the end.
   Strategy, in order of preference:
   1. UTF-8 input is handed to u8_width_linebreaks directly;
   2. with iconv available, the text (and O) is converted to UTF-8 offsets,
      processed there, and the result is mapped back;
   3. input consisting only of printable/whitespace ASCII is processed as if
      it were UTF-8;
   4. otherwise only '\n' and mandatory overrides are reported and
      START_COLUMN is returned unchanged.
   Returns START_COLUMN when N is 0.  */
int
mbs_width_linebreaks (const char *s, size_t n,
int width, int start_column, int at_end_columns,
const char *o, const char *encoding,
char *p)
{
if (n == 0)
return start_column;
if (is_utf8_encoding (encoding))
return u8_width_linebreaks ((const unsigned char *) s, n, width, start_column, at_end_columns, o, encoding, p);
else
{
#if HAVE_ICONV
iconv_t to_utf8;
/* On the platforms tested below, conversion is not attempted for the listed
   encodings: to_utf8 is set to the failure value so that the fallbacks are
   used.  Each conditional block ends in a dangling "else" that binds to the
   next statement.
   NOTE(review): presumably these iconv implementations mishandle the listed
   encodings - confirm.  */
# if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0))
to_utf8 = (iconv_t)(-1);
else
# endif
# if defined __sun && !defined _LIBICONV_VERSION
if ( STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
|| STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0)
|| STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0)
|| STREQ (encoding, "BIG5-HKSCS", 'B', 'I', 'G', '5', '-', 'H', 'K', 'S', 'C')
|| STREQ (encoding, "GBK", 'G', 'B', 'K', 0, 0, 0, 0, 0, 0)
|| STREQ (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
to_utf8 = (iconv_t)(-1);
else
# endif
to_utf8 = iconv_open (UTF8_NAME, encoding);
if (to_utf8 != (iconv_t)(-1))
{
/* Length of the UTF-8 form, or (size_t)(-1) if S cannot be converted.  */
size_t m = iconv_string_length (to_utf8, s, n);
if (m != (size_t)(-1))
{
/* One allocation holds: the offset table (n entries), the UTF-8 text
   (m bytes), its break array (m bytes) and, when overrides are given, the
   overrides re-indexed for the UTF-8 text (m bytes).  */
size_t memory_size =
xsum4 (xtimes (n, sizeof (size_t)), m, m,
(o != NULL ? m : 0));
char *memory =
(size_in_bounds_p (memory_size) ? malloc (memory_size) : NULL);
if (memory != NULL)
{
size_t *offtable = (size_t *) memory;
char *t = (char *) (offtable + n);
char *q = (char *) (t + m);
char *o8 = (o != NULL ? (char *) (q + m) : NULL);
int res_column;
size_t i;
iconv_string_keeping_offsets (to_utf8, s, n, offtable, t, m);
if (o != NULL)
{
/* Carry each override over to the offset of its character in the UTF-8
   text; everything else is left undefined.  */
memset (o8, UC_BREAK_UNDEFINED, m);
for (i = 0; i < n; i++)
if (offtable[i] != (size_t)(-1))
o8[offtable[i]] = o[i];
}
res_column =
u8_width_linebreaks ((const unsigned char *) t, m, width, start_column, at_end_columns, o8, encoding, q);
/* Copy the verdicts back to the bytes that start a character in S; all
   other bytes stay UC_BREAK_PROHIBITED.  */
memset (p, UC_BREAK_PROHIBITED, n);
for (i = 0; i < n; i++)
if (offtable[i] != (size_t)(-1))
p[i] = q[offtable[i]];
free (memory);
iconv_close (to_utf8);
return res_column;
}
}
/* Conversion or allocation failed; fall through to the fallbacks.  */
iconv_close (to_utf8);
}
#endif
#if C_CTYPE_ASCII
if (is_all_ascii (s, n))
{
return u8_width_linebreaks ((const unsigned char *) s, n, width, start_column, at_end_columns, o, encoding, p);
}
#endif
{
/* Last resort: no width computation at all.  Only newlines and mandatory
   overrides are reported, and the column is returned unchanged.  */
const char *s_end = s + n;
while (s < s_end)
{
*p = ((o != NULL && *o == UC_BREAK_MANDATORY) || *s == '\n'
? UC_BREAK_MANDATORY
: UC_BREAK_PROHIBITED);
s++;
p++;
if (o != NULL)
o++;
}
return start_column;
}
}
}
#ifdef TEST2
#include <stdio.h>
#include <locale.h>
/* Reads the remaining contents of STREAM into a freshly allocated,
   NUL-terminated buffer and returns it; the caller owns the buffer and
   releases it with free().  On a read error or when memory runs out, prints
   a message to stderr and exits with status 1.
   Sizes are tracked as size_t so that large inputs cannot overflow a signed
   int.  */
char *
read_file (FILE *stream)
{
  enum { READ_CHUNK = 4096 };
  char *buf = NULL;
  char *grown;
  size_t alloc = 0;
  size_t size = 0;

  while (! feof (stream))
    {
      size_t count;

      /* Make sure a full chunk fits; grow by 1.5x, or to fit if larger.  */
      if (size + READ_CHUNK > alloc)
        {
          size_t new_alloc = alloc + alloc / 2;

          if (new_alloc < size + READ_CHUNK)
            new_alloc = size + READ_CHUNK;
          grown = realloc (buf, new_alloc);
          if (grown == NULL)
            {
              fprintf (stderr, "out of memory\n");
              exit (1);
            }
          buf = grown;
          alloc = new_alloc;
        }

      count = fread (buf + size, 1, READ_CHUNK, stream);
      if (count == 0)
        {
          /* Zero bytes is either end of file (loop ends) or an error.  */
          if (ferror (stream))
            {
              perror ("fread");
              exit (1);
            }
        }
      else
        size += count;
    }

  /* Trim to the exact size plus the terminating NUL.  */
  grown = realloc (buf, size + 1);
  if (grown == NULL)
    {
      fprintf (stderr, "out of memory\n");
      exit (1);
    }
  buf = grown;
  buf[size] = '\0';
  return buf;
}
/* Test driver (TEST2).  Reads text in the locale's encoding from standard
   input.
   Without arguments: echoes the text with '|' inserted before every
   possible break found by mbs_possible_linebreaks.
   With one argument (a column width): echoes the text with a newline
   inserted at every break chosen by mbs_width_linebreaks.
   Returns 0 on success, 1 for any other argument count.  */
int
main (int argc, char * argv[])
{
  setlocale (LC_CTYPE, "");
  if (argc == 1)
    {
      char *input = read_file (stdin);
      size_t length = strlen (input);
      /* Never request 0 bytes: malloc (0) may return NULL, which would look
         like an allocation failure.  */
      char *breaks = malloc (length > 0 ? length : 1);
      size_t i;

      if (breaks == NULL)
        {
          fprintf (stderr, "out of memory\n");
          exit (1);
        }

      mbs_possible_linebreaks (input, length, locale_charset (), breaks);

      for (i = 0; i < length; i++)
        {
          switch (breaks[i])
            {
            case UC_BREAK_POSSIBLE:
              putc ('|', stdout);
              break;
            case UC_BREAK_MANDATORY:
              break;
            case UC_BREAK_PROHIBITED:
              break;
            default:
              abort ();
            }
          putc (input[i], stdout);
        }

      free (breaks);
      free (input);
      return 0;
    }
  else if (argc == 2)
    {
      int width = atoi (argv[1]);
      char *input = read_file (stdin);
      size_t length = strlen (input);
      char *breaks = malloc (length > 0 ? length : 1);
      size_t i;

      if (breaks == NULL)
        {
          fprintf (stderr, "out of memory\n");
          exit (1);
        }

      mbs_width_linebreaks (input, length, width, 0, 0, NULL, locale_charset (), breaks);

      for (i = 0; i < length; i++)
        {
          switch (breaks[i])
            {
            case UC_BREAK_POSSIBLE:
              putc ('\n', stdout);
              break;
            case UC_BREAK_MANDATORY:
              break;
            case UC_BREAK_PROHIBITED:
              break;
            default:
              abort ();
            }
          putc (input[i], stdout);
        }

      free (breaks);
      free (input);
      return 0;
    }
  else
    return 1;
}
#endif