/* read-stringtable.c */
#ifdef HAVE_CONFIG_H
# include <config.h>
#endif
#include "read-stringtable.h"
#include <assert.h>
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "error.h"
#include "error-progname.h"
#include "read-po-abstract.h"
#include "po-hash.h"
#include "xalloc.h"
#include "exit.h"
#include "utf8-ucs4.h"
#include "ucs4-utf8.h"
#include "gettext.h"
#define _(str) gettext (str)
/* Name of the file currently being read; used in error and warning
   messages.  Set and cleared by stringtable_parse.  */
static const char *real_file_name;
/* Current lexical position (file name and line number).  Declared extern:
   the definition lives in another translation unit.  */
extern lex_pos_ty gram_pos;
/* The input stream currently being parsed.  */
static FILE *fp;
/* Phase 1 pushback stack.  phase1_ungetc pushes bytes onto it and
   phase1_getc pops them, most recently pushed first.  It holds at most 4
   bytes; neither function checks for overflow.  */
static unsigned char phase1_pushback[4];
static int phase1_pushback_length;
/* Phase 1: fetch the next raw byte of the input.
   Bytes previously handed to phase1_ungetc are delivered first, most
   recently pushed first.  Returns EOF when the stream is exhausted.  A read
   error is reported through error () with status EXIT_FAILURE.  */
static int
phase1_getc ()
{
  int byte;

  /* Drain the pushback stack before touching the stream.  */
  if (phase1_pushback_length != 0)
    {
      phase1_pushback_length--;
      return phase1_pushback[phase1_pushback_length];
    }

  byte = getc (fp);
  if (byte != EOF)
    return byte;

  /* EOF from getc can mean either end of file or an I/O error.  */
  if (ferror (fp))
    error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
           real_file_name);
  return EOF;
}
/* Phase 1: push the byte C back so that the next phase1_getc returns it.
   EOF is silently ignored.  No check is made against the capacity of
   phase1_pushback.  */
static void
phase1_ungetc (int c)
{
  if (c == EOF)
    return;

  phase1_pushback[phase1_pushback_length] = c;
  phase1_pushback_length++;
}
/* End-of-input marker returned by phase2_getc and the phases built on it.  */
#define UEOF -1
/* Phase 2 pushback stack of decoded characters.  phase2_ungetc pushes and
   phase2_getc pops, most recently pushed first.  It holds at most 4
   characters; no overflow check is made.  */
static int phase2_pushback[4];
static int phase2_pushback_length;
/* Input encodings that phase2_getc knows how to decode.  */
enum enc
{
/* Not yet decided; phase2_getc inspects the first bytes of the input.  */
enc_undetermined,
/* Two bytes per character, high byte first (selected by bytes FE FF).  */
enc_ucs2be,
/* Two bytes per character, low byte first (selected by bytes FF FE).  */
enc_ucs2le,
/* UTF-8 (selected by bytes EF BB BF).  */
enc_utf8,
/* One byte per character; used when no byte order mark was recognized.  */
enc_iso8859_1
};
/* Encoding of the current input.  stringtable_parse resets it to
   enc_undetermined before parsing starts.  */
static enum enc encoding;
/* Phase 2: fetch the next character of the input, decoding the byte stream
   delivered by phase 1.
   Characters handed to phase2_ungetc are delivered first.  On the first
   real read the encoding is chosen from the leading bytes: FE FF selects
   UCS-2 big endian, FF FE selects UCS-2 little endian, EF BB BF selects
   UTF-8, anything else selects ISO-8859-1 (and the inspected bytes are
   pushed back so they are decoded as data).
   Returns UEOF at end of input, also when the input ends in the middle of a
   multi-byte character.  */
static int
phase2_getc ()
{
  if (phase2_pushback_length != 0)
    {
      phase2_pushback_length--;
      return phase2_pushback[phase2_pushback_length];
    }

  if (encoding == enc_undetermined)
    {
      int b0, b1;

      b0 = phase1_getc ();
      if (b0 == EOF)
        /* Empty input: the encoding stays undetermined.  */
        return UEOF;

      b1 = phase1_getc ();
      if (b1 == EOF)
        {
          /* A single byte cannot be a byte order mark.  */
          phase1_ungetc (b0);
          encoding = enc_iso8859_1;
        }
      else if (b0 == 0xfe && b1 == 0xff)
        encoding = enc_ucs2be;
      else if (b0 == 0xff && b1 == 0xfe)
        encoding = enc_ucs2le;
      else
        {
          int b2 = phase1_getc ();

          if (b0 == 0xef && b1 == 0xbb && b2 == 0xbf)
            encoding = enc_utf8;
          else
            {
              /* No byte order mark.  Give the bytes back in reverse order
                 so that they are re-read in their original order.  If b2
                 is EOF, phase1_ungetc ignores it.  */
              phase1_ungetc (b2);
              phase1_ungetc (b1);
              phase1_ungetc (b0);
              encoding = enc_iso8859_1;
            }
        }
    }

  switch (encoding)
    {
    case enc_ucs2be:
    case enc_ucs2le:
      {
        /* Two bytes make one character; only the byte order differs.  */
        int first, second;

        first = phase1_getc ();
        if (first == EOF)
          return UEOF;
        second = phase1_getc ();
        if (second == EOF)
          return UEOF;

        if (encoding == enc_ucs2be)
          return (first << 8) + second;
        return first + (second << 8);
      }

    case enc_utf8:
      {
        /* min_lead[k] is the smallest lead byte for which byte k + 1 of
           the sequence is read.  */
        static const unsigned char min_lead[5] =
          { 0xc0, 0xe0, 0xf0, 0xf8, 0xfc };
        unsigned char seq[6];
        unsigned int len;
        unsigned int uc;
        int byte;

        byte = phase1_getc ();
        if (byte == EOF)
          return UEOF;
        seq[0] = byte;
        len = 1;

        /* Keep reading while the lead byte asks for more and the byte read
           last (if it is not the lead byte itself) is a continuation byte
           in the range 0x80..0xBF.  */
        while (len < 6
               && seq[0] >= min_lead[len - 1]
               && (len == 1 || (seq[len - 1] ^ 0x80) < 0x40))
          {
            byte = phase1_getc ();
            if (byte == EOF)
              return UEOF;
            seq[len] = byte;
            len++;
          }

        u8_mbtouc (&uc, seq, len);
        return uc;
      }

    case enc_iso8859_1:
      {
        /* Each byte is the character itself.  */
        int byte = phase1_getc ();

        return (byte == EOF ? UEOF : byte);
      }

    default:
      abort ();
    }
}
/* Phase 2: push the character C back so that the next phase2_getc returns
   it.  UEOF is silently ignored.  No check is made against the capacity of
   phase2_pushback.  */
static void
phase2_ungetc (int c)
{
  if (c == UEOF)
    return;

  phase2_pushback[phase2_pushback_length] = c;
  phase2_pushback_length++;
}
/* Phase 3: same as phase 2, but additionally keeps gram_pos.line_number up
   to date by counting every newline that is read.  */
static int
phase3_getc ()
{
  int ch = phase2_getc ();

  if (ch == '\n')
    gram_pos.line_number++;

  return ch;
}
/* Phase 3: push the character C back, undoing the line count adjustment
   that phase3_getc made if C is a newline.  */
static void
phase3_ungetc (int c)
{
  if (c == '\n')
    gram_pos.line_number--;

  phase2_ungetc (c);
}
/* Convert BUFLEN characters starting at BUFFER into a NUL-terminated UTF-8
   string.
   Returns a block obtained from xmalloc.  It is sized for the worst case of
   6 bytes per character plus the terminator.  Encoding each character is
   expected to succeed; this is checked with assert.  */
static char *
conv_from_ucs4 (const int *buffer, size_t buflen)
{
  unsigned char *result = (unsigned char *) xmalloc (6 * buflen + 1);
  unsigned char *out = result;
  size_t i;

  for (i = 0; i < buflen; i++)
    {
      unsigned int uc = buffer[i];
      int written = u8_uctomb (out, uc, 6);

      assert (written > 0);
      out += written;
    }

  *out = '\0';
  /* Sanity check: the worst-case allocation was not exceeded.  */
  assert (out - result <= 6 * buflen);

  return (char *) result;
}
/* Parse the LENGTH characters at STRING as a double-quoted string with
   backslash escapes.
   The text must start with '"' and must contain the closing '"'; otherwise
   NULL is returned.  Anything after the closing quote is not looked at.
   Recognized escapes: one to three octal digits, 'u' or 'U' followed by up
   to four hexadecimal digits, and the letters a b t r n v f.  Any other
   escaped character stands for itself.
   On success the decoded string is returned as UTF-8, as produced by
   conv_from_ucs4.  The decoded characters are collected in a static buffer
   that is reused across calls.  */
static char *
parse_escaped_string (const int *string, size_t length)
{
  static int *chars;
  static size_t chars_allocated;
  static size_t chars_used;
  const int *end = string + length;
  int c;

  /* The opening quote is mandatory.  */
  if (string == end || *string != '"')
    return NULL;
  string++;

  chars_used = 0;
  for (;;)
    {
      /* Running out of input before the closing quote is a failure.  */
      if (string == end)
        return NULL;
      c = *string++;
      if (c == '"')
        break;

      if (c == '\\')
        {
          if (string == end)
            return NULL;
          c = *string++;

          if (c >= '0' && c <= '7')
            {
              /* Octal escape: this digit plus up to two more.  */
              unsigned int value = c - '0';
              int digits = 1;

              while (digits < 3
                     && string != end
                     && *string >= '0' && *string <= '7')
                {
                  value = value * 8 + (*string - '0');
                  string++;
                  digits++;
                }
              c = value;
            }
          else if (c == 'u' || c == 'U')
            {
              /* Hexadecimal escape: up to four digits, possibly none.  */
              unsigned int value = 0;
              int digits;

              for (digits = 0; digits < 4 && string != end; digits++)
                {
                  int h = *string;
                  int nibble;

                  if (h >= '0' && h <= '9')
                    nibble = h - '0';
                  else if (h >= 'A' && h <= 'F')
                    nibble = h - 'A' + 10;
                  else if (h >= 'a' && h <= 'f')
                    nibble = h - 'a' + 10;
                  else
                    break;

                  value = value * 16 + nibble;
                  string++;
                }
              c = value;
            }
          else
            {
              /* Single-letter escapes; everything else is kept as is.  */
              switch (c)
                {
                case 'a':
                  c = '\a';
                  break;
                case 'b':
                  c = '\b';
                  break;
                case 't':
                  c = '\t';
                  break;
                case 'r':
                  c = '\r';
                  break;
                case 'n':
                  c = '\n';
                  break;
                case 'v':
                  c = '\v';
                  break;
                case 'f':
                  c = '\f';
                  break;
                }
            }
        }

      /* Append the decoded character, growing the buffer when full.  */
      if (chars_used >= chars_allocated)
        {
          chars_allocated = 2 * chars_allocated + 10;
          chars = xrealloc (chars, chars_allocated * sizeof (int));
        }
      chars[chars_used++] = c;
    }

  return conv_from_ucs4 (chars, chars_used);
}
/* Accumulated flags for the next message, as a ", "-separated list built by
   special_comment_add, or NULL when no flag has been added.  The storage is
   heap-allocated and released by special_comment_reset or
   special_comment_finish.  */
static char *special_comment;
/* Discard any accumulated flags without reporting them.  */
static inline void
special_comment_reset ()
{
  /* free (NULL) does nothing, so no guard is needed.  */
  free (special_comment);
  special_comment = NULL;
}
/* Append FLAG to the accumulated flag list.  The first flag is stored as a
   copy; every later one is appended after a ", " separator.  */
static void
special_comment_add (const char *flag)
{
  size_t old_len;
  size_t flag_len;

  if (special_comment == NULL)
    {
      special_comment = xstrdup (flag);
      return;
    }

  old_len = strlen (special_comment);
  flag_len = strlen (flag);

  /* Room for the old text, the separator, the new flag and the NUL.  */
  special_comment = xrealloc (special_comment, old_len + 2 + flag_len + 1);
  memcpy (special_comment + old_len, ", ", 2);
  /* Copying flag_len + 1 bytes carries the terminating NUL along.  */
  memcpy (special_comment + old_len + 2, flag, flag_len + 1);
}
/* Report the accumulated flags, if any, through
   po_callback_comment_special, then release them.  */
static inline void
special_comment_finish ()
{
  if (special_comment == NULL)
    return;

  po_callback_comment_special (special_comment);
  free (special_comment);
  special_comment = NULL;
}
/* Accumulation buffer for the comment line being read (used by
   comment_start, comment_add and comment_line_end): BUFLEN characters are
   in use out of BUFMAX allocated.  */
static int *buffer;
static size_t bufmax;
static size_t buflen;
/* Set by comment_line_end when it sees the comment "Flag: unmatched";
   passed as the last argument of po_callback_message.  */
static bool next_is_obsolete;
/* Set by comment_line_end when it sees the comment "Flag: untranslated".  */
static bool next_is_fuzzy;
/* The string decoded by comment_line_end from a comment of the form
   = "..." (via parse_escaped_string), or NULL if none was found.  */
static char *fuzzy_msgstr;
/* While true, phase4_getc asks comment_line_end to look for a fuzzy msgstr
   in a C comment that ends on the line where it started.  */
static bool expect_fuzzy_msgstr_as_c_comment;
/* While true, phase4_getc asks comment_line_end to look for a fuzzy msgstr
   in a C++ style comment.  */
static bool expect_fuzzy_msgstr_as_cxx_comment;
/* Begin accumulating a new comment line: mark the comment buffer as empty.
   The allocated storage is kept for reuse.  */
static inline void
comment_start ()
{
buflen = 0;
}
/* Append the character C to the comment line being accumulated, enlarging
   the buffer when it is full.  */
static inline void
comment_add (int c)
{
  if (bufmax <= buflen)
    {
      size_t grown = 2 * bufmax + 10;

      buffer = xrealloc (buffer, grown * sizeof (int));
      bufmax = grown;
    }

  buffer[buflen] = c;
  buflen++;
}
/* Finish the comment line held in the comment buffer and act on it.
   CHARS_TO_REMOVE characters are dropped from the end first (the newline,
   or the closing run of stars plus the slash), then trailing blanks and
   tabs.
   If TEST_FOR_FUZZY_MSGSTR is true and the line has the form
     = "escaped string"
   optionally followed by a semicolon, the decoded string is stored in
   fuzzy_msgstr and nothing else happens.
   Otherwise the line is classified by its prefix:
     "Flag: untranslated"     adds the flag "fuzzy" and sets next_is_fuzzy
     "Flag: unmatched"        sets next_is_obsolete
     "Flag: <text>"           adds <text> as a flag
     "Comment: <text>"        passes <text> to po_callback_comment_dot
     "File: <name>:<number>"  passes both to po_callback_comment_filepos
   and anything else goes to po_callback_comment.
   NOTE(review): the converted line is never freed here.  Whether the
   po_callback_* functions keep the pointer is not visible from this file,
   so confirm before adding a free.  */
static inline void
comment_line_end (size_t chars_to_remove, bool test_for_fuzzy_msgstr)
{
  char *text;

  buflen -= chars_to_remove;

  /* Drop trailing blanks and tabs.  */
  while (buflen >= 1)
    {
      int last = buffer[buflen - 1];

      if (last != ' ' && last != '\t')
        break;
      buflen--;
    }

  if (test_for_fuzzy_msgstr
      && buflen > 2 && buffer[0] == '=' && buffer[1] == ' ')
    {
      /* Skip the leading "= " and leave out one trailing semicolon.  */
      size_t body_len = buflen - 2;

      if (buffer[buflen - 1] == ';')
        body_len--;

      fuzzy_msgstr = parse_escaped_string (buffer + 2, body_len);
      if (fuzzy_msgstr != NULL)
        return;
    }

  text = conv_from_ucs4 (buffer, buflen);

  if (strcmp (text, "Flag: untranslated") == 0)
    {
      special_comment_add ("fuzzy");
      next_is_fuzzy = true;
      return;
    }

  if (strcmp (text, "Flag: unmatched") == 0)
    {
      next_is_obsolete = true;
      return;
    }

  if (strncmp (text, "Flag: ", 6) == 0)
    {
      special_comment_add (text + 6);
      return;
    }

  if (strncmp (text, "Comment: ", 9) == 0)
    {
      po_callback_comment_dot (text + 9);
      return;
    }

  if (strncmp (text, "File: ", 6) == 0)
    {
      /* Accept only if a non-empty, fully numeric part follows the last
         colon.  */
      char *colon = strrchr (text + 6, ':');

      if (colon != NULL && colon[1] != '\0')
        {
          char *tail;
          unsigned long number = strtoul (colon + 1, &tail, 10);

          if (*tail == '\0')
            {
              /* Cut the text at the colon so that only the file name
                 remains.  */
              *colon = '\0';
              po_callback_comment_filepos (text + 6, number);
              return;
            }
        }
    }

  po_callback_comment (text);
}
/* Phase 4: same as phase 3, but with comments replaced.
   A C comment is replaced by a single space and a C++ style comment by a
   newline.  A slash that starts no comment is returned as is.  Each line
   of comment text is handed to comment_line_end after leading blanks and
   tabs have been dropped.
   For a C comment, the stars directly after the opening, and the stars
   directly before the closing slash, are not part of the comment text.  An
   unterminated C comment yields a space as well; its last, unfinished line
   is not handed to comment_line_end.  */
static int
phase4_getc ()
{
  int c;

  c = phase3_getc ();
  if (c != '/')
    return c;

  c = phase3_getc ();

  if (c == '*')
    {
      /* C comment.  */
      bool prev_star = false;   /* previous character was a '*' */
      size_t star_run = 0;      /* stars stored at the end of the line */
      bool multi_line = false;  /* a newline was seen inside the comment */

      comment_start ();

      /* Swallow the stars that directly follow the opening; they still
         count as "previous character was a star", so that a comment like
         slash-star-star-slash is closed correctly.  */
      do
        {
          c = phase3_getc ();
          if (c == '*')
            prev_star = true;
        }
      while (c == '*');
      phase3_ungetc (c);

      for (;;)
        {
          c = phase3_getc ();
          if (c == UEOF)
            return ' ';

          /* Blanks and tabs at the start of a line are not stored.  */
          if (buflen != 0 || (c != ' ' && c != '\t'))
            comment_add (c);

          if (c == '\n')
            {
              multi_line = true;
              /* Remove the newline that was just stored.  */
              comment_line_end (1, false);
              comment_start ();
              prev_star = false;
              star_run = 0;
            }
          else if (c == '*')
            {
              prev_star = true;
              star_run++;
            }
          else if (c == '/' && prev_star)
            {
              /* End of comment: remove the stored stars and the slash.  A
                 fuzzy msgstr is looked for only in a comment that did not
                 span several lines.  */
              comment_line_end (star_run + 1,
                                expect_fuzzy_msgstr_as_c_comment
                                && !multi_line);
              return ' ';
            }
          else
            {
              prev_star = false;
              star_run = 0;
            }
        }
    }

  if (c == '/')
    {
      /* C++ style comment: runs to the end of the line or of the input.  */
      comment_start ();
      while ((c = phase3_getc ()) != '\n' && c != UEOF)
        {
          if (buflen != 0 || (c != ' ' && c != '\t'))
            comment_add (c);
        }
      comment_line_end (0, expect_fuzzy_msgstr_as_cxx_comment);
      return '\n';
    }

  /* Not a comment after all: give the second character back.  */
  phase3_ungetc (c);
  return '/';
}
/* Phase 4: push the character C back.  This simply forwards to
   phase3_ungetc, so C will be re-read through phase 3 (including its line
   counting) on the next phase3_getc or phase4_getc call.  */
static inline void
phase4_ungetc (int c)
{
phase3_ungetc (c);
}
/* Return true if C is one of the characters skipped between tokens: space,
   tab, carriage return, newline, form feed or backspace.  */
static bool
is_whitespace (int c)
{
  switch (c)
    {
    case ' ':
    case '\t':
    case '\r':
    case '\n':
    case '\f':
    case '\b':
      return true;

    default:
      return false;
    }
}
/* Return true if C can only occur inside a quoted string, that is, if it
   ends an unquoted token.  ASCII letters, ASCII digits and the punctuation
   characters ! # $ % & * + - . / : ? @ | ~ _ ^ may occur in an unquoted
   token; every other value needs quoting.  */
static bool
is_quotable (int c)
{
  static const char unquoted_punctuation[] = "!#$%&*+-./:?@|~_^";
  bool digit = (c >= '0' && c <= '9');
  bool upper = (c >= 'A' && c <= 'Z');
  bool lower = (c >= 'a' && c <= 'z');

  if (digit || upper || lower)
    return false;

  /* strchr converts its argument to char and also matches the terminating
     NUL, so restrict the lookup to non-zero 7-bit values.  */
  if (c > 0 && c < 0x80 && strchr (unquoted_punctuation, c) != NULL)
    return false;

  return true;
}
/* Read the next string token from the input.
   Whitespace and comments before the token are skipped.  The position of
   the token's first character is stored in *POS.
   Returns NULL if the end of the input is reached before any token starts.
   Otherwise returns the token as UTF-8, as produced by conv_from_ucs4.

   A token is either
     - a double-quoted string.  Inside it, a backslash introduces an escape:
       one to three octal digits, 'u' or 'U' followed by up to four
       hexadecimal digits, or one of the letters a b t r n v f.  Any other
       escaped character stands for itself.  A missing closing quote
       produces a warning and the text read so far is returned; or
     - an unquoted run of characters for which is_quotable is false.  The
       character that ends the run is pushed back, so that a delimiter
       written directly after the token (as in key=value;) is still seen by
       the caller.  If the token starts with a character that needs
       quoting, a warning is issued, that character is consumed and an
       empty string is returned.

   The characters are collected in a static buffer reused across calls.  */
static char *
read_string (lex_pos_ty *pos)
{
  static int *buffer;
  static size_t bufmax;
  static size_t buflen;
  int c;

  /* Skip whitespace before the string.  Comments are processed by
     phase4_getc and come back as whitespace.  */
  do
    c = phase4_getc ();
  while (is_whitespace (c));

  if (c == UEOF)
    /* No more string.  */
    return NULL;

  *pos = gram_pos;
  buflen = 0;

  if (c == '"')
    {
      /* Read a string enclosed in double quotes.  Phase 3 is used here
         because comment markers have no meaning inside a string.  */
      for (;;)
        {
          c = phase3_getc ();
          if (c == UEOF || c == '"')
            break;

          if (c == '\\')
            {
              c = phase3_getc ();
              if (c == UEOF)
                break;

              if (c >= '0' && c <= '7')
                {
                  /* Octal escape: this digit plus up to two more.  */
                  unsigned int n = 0;
                  int j = 0;

                  for (;;)
                    {
                      n = n * 8 + (c - '0');
                      if (++j == 3)
                        break;
                      c = phase3_getc ();
                      if (!(c >= '0' && c <= '7'))
                        {
                          phase3_ungetc (c);
                          break;
                        }
                    }
                  c = n;
                }
              else if (c == 'u' || c == 'U')
                {
                  /* Hexadecimal escape: up to four digits.  */
                  unsigned int n = 0;
                  int j;

                  for (j = 0; j < 4; j++)
                    {
                      c = phase3_getc ();
                      if (c >= '0' && c <= '9')
                        n = n * 16 + (c - '0');
                      else if (c >= 'A' && c <= 'F')
                        n = n * 16 + (c - 'A' + 10);
                      else if (c >= 'a' && c <= 'f')
                        n = n * 16 + (c - 'a' + 10);
                      else
                        {
                          phase3_ungetc (c);
                          break;
                        }
                    }
                  c = n;
                }
              else
                {
                  /* Single-letter escapes; anything else is kept as is.  */
                  switch (c)
                    {
                    case 'a': c = '\a'; break;
                    case 'b': c = '\b'; break;
                    case 't': c = '\t'; break;
                    case 'r': c = '\r'; break;
                    case 'n': c = '\n'; break;
                    case 'v': c = '\v'; break;
                    case 'f': c = '\f'; break;
                    default: break;
                    }
                }
            }

          if (buflen >= bufmax)
            {
              bufmax = 2 * bufmax + 10;
              buffer = xrealloc (buffer, bufmax * sizeof (int));
            }
          buffer[buflen++] = c;
        }

      if (c == UEOF)
        {
          error_with_progname = false;
          error (0, 0, _("%s:%lu: warning: unterminated string"),
                 real_file_name, (unsigned long) gram_pos.line_number);
          error_with_progname = true;
        }
    }
  else
    {
      /* Read a token outside quotes.  */
      if (is_quotable (c))
        {
          error_with_progname = false;
          error (0, 0, _("%s:%lu: warning: syntax error"),
                 real_file_name, (unsigned long) gram_pos.line_number);
          error_with_progname = true;
        }

      for (; c != UEOF && !is_quotable (c); c = phase4_getc ())
        {
          if (buflen >= bufmax)
            {
              bufmax = 2 * bufmax + 10;
              buffer = xrealloc (buffer, bufmax * sizeof (int));
            }
          buffer[buflen++] = c;
        }

      /* The loop stopped on the first character that is not part of the
         token.  That character belongs to the caller: it may be the '=' or
         ';' that follows the token without intervening whitespace.  Give
         it back instead of dropping it.  UEOF is ignored by the pushback
         functions.  When no token character was read at all, the offending
         character stays consumed, as the warning above already covers that
         case.  */
      if (buflen > 0)
        phase4_ungetc (c);
    }

  return conv_from_ucs4 (buffer, buflen);
}
void
stringtable_parse (abstract_po_reader_ty *pop, FILE *file,
const char *real_filename, const char *logical_filename)
{
fp = file;
real_file_name = real_filename;
gram_pos.file_name = xstrdup (real_file_name);
gram_pos.line_number = 1;
encoding = enc_undetermined;
expect_fuzzy_msgstr_as_c_comment = false;
expect_fuzzy_msgstr_as_cxx_comment = false;
for (;;)
{
char *msgid;
lex_pos_ty msgid_pos;
char *msgstr;
lex_pos_ty msgstr_pos;
int c;
special_comment_reset ();
next_is_obsolete = false;
next_is_fuzzy = false;
fuzzy_msgstr = NULL;
msgid = read_string (&msgid_pos);
if (msgid == NULL)
break;
special_comment_finish ();
do
c = phase4_getc ();
while (is_whitespace (c));
if (c == UEOF)
{
error_with_progname = false;
error (0, 0, _("%s:%lu: warning: unterminated key/value pair"),
real_file_name, (unsigned long) gram_pos.line_number);
error_with_progname = true;
break;
}
if (c == ';')
{
msgstr = "";
msgstr_pos = msgid_pos;
po_callback_message (msgid, &msgid_pos, NULL,
msgstr, strlen (msgstr) + 1, &msgstr_pos,
false, next_is_obsolete);
}
else if (c == '=')
{
msgstr = read_string (&msgstr_pos);
if (msgstr == NULL)
{
error_with_progname = false;
error (0, 0, _("%s:%lu: warning: unterminated key/value pair"),
real_file_name, (unsigned long) gram_pos.line_number);
error_with_progname = true;
break;
}
expect_fuzzy_msgstr_as_c_comment = next_is_fuzzy;
do
{
c = phase4_getc ();
if (fuzzy_msgstr != NULL)
expect_fuzzy_msgstr_as_c_comment = false;
}
while (is_whitespace (c));
expect_fuzzy_msgstr_as_c_comment = false;
if (c == ';')
{
if (fuzzy_msgstr == NULL && next_is_fuzzy)
{
do
c = phase3_getc ();
while (c == ' ');
phase3_ungetc (c);
expect_fuzzy_msgstr_as_cxx_comment = true;
c = phase4_getc ();
phase4_ungetc (c);
expect_fuzzy_msgstr_as_cxx_comment = false;
}
if (fuzzy_msgstr != NULL && strcmp (msgstr, msgid) == 0)
msgstr = fuzzy_msgstr;
po_callback_message (msgid, &msgid_pos, NULL,
msgstr, strlen (msgstr) + 1, &msgstr_pos,
false, next_is_obsolete);
}
else
{
error_with_progname = false;
error (0, 0, _("\
%s:%lu: warning: syntax error, expected ';' after string"),
real_file_name, (unsigned long) gram_pos.line_number);
error_with_progname = true;
break;
}
}
else
{
error_with_progname = false;
error (0, 0, _("\
%s:%lu: warning: syntax error, expected '=' or ';' after string"),
real_file_name, (unsigned long) gram_pos.line_number);
error_with_progname = true;
break;
}
}
fp = NULL;
real_file_name = NULL;
gram_pos.line_number = 0;
}