#ifdef HAVE_CONFIG_H
# include "config.h"
#endif
#include <assert.h>
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "message.h"
#include "xgettext.h"
#include "x-python.h"
#include "error.h"
#include "error-progname.h"
#include "xalloc.h"
#include "exit.h"
#include "po-charset.h"
#include "uniname.h"
#include "utf16-ucs4.h"
#include "ucs4-utf8.h"
#include "gettext.h"
#define _(s) gettext(s)
/* When true, every string literal is extracted, not only the arguments
   of keyword calls (see extract_parenthesized).  */
static bool extract_all = false;
/* Maps keyword names to their encoded argument numbers,
   argnum1 + (argnum2 << 10); filled by x_python_keyword.  */
static hash_table keywords;
/* True until the built-in keywords have been installed by init_keywords
   or disabled by x_python_keyword (NULL).  */
static bool default_keywords = true;
/* Request that all string literals be extracted: sets the file-level
   extract_all flag that extract_parenthesized consults.  */
void
x_python_extract_all ()
{
extract_all = true;
}
/* Register the keyword specification NAME.
   A NULL NAME only disables the built-in default keywords.
   Otherwise NAME is handed to split_keywordspec, which reports where the
   identifier part ends and up to two argument numbers.  Specifications
   whose identifier part contains a ':' are silently ignored.  A first
   argument number of 0 is treated as 1.  The keyword is stored in the
   'keywords' table with the value argnum1 + (argnum2 << 10).  */
void
x_python_keyword (const char *name)
{
  const char *spec_end;
  const char *first_colon;
  int singular_arg;
  int plural_arg;

  if (name == NULL)
    {
      default_keywords = false;
      return;
    }

  /* Create the table lazily on first use.  */
  if (keywords.table == NULL)
    init_hash (&keywords, 100);

  split_keywordspec (name, &spec_end, &singular_arg, &plural_arg);

  /* A colon inside the identifier part cannot occur in a name this
     lexer produces, so such a keyword is not recorded.  */
  first_colon = strchr (name, ':');
  if (first_colon != NULL && first_colon < spec_end)
    return;

  if (singular_arg == 0)
    singular_arg = 1;

  insert_entry (&keywords, name, spec_end - name,
                (void *) (long) (singular_arg + (plural_arg << 10)));
}
/* Install the built-in keyword set, unless it has already been installed
   or the defaults were disabled through x_python_keyword (NULL).  */
static void
init_keywords ()
{
  /* Built-in keyword specifications, registered in this order.  */
  static const char *const builtin_specs[] =
    {
      "gettext",
      "ugettext",
      "dgettext:2",
      "ngettext:1,2",
      "ungettext:1,2",
      "dngettext:2,3",
      "_"
    };
  size_t idx;

  if (!default_keywords)
    return;

  for (idx = 0; idx < sizeof builtin_specs / sizeof builtin_specs[0]; idx++)
    x_python_keyword (builtin_specs[idx]);

  /* Make later calls a no-op.  */
  default_keywords = false;
}
/* Record the pass-python-format flag for each string argument of the
   built-in gettext-style functions, via xgettext_record_flag.  */
void
init_flag_table_python ()
{
  /* Flag specifications, recorded in this order.  */
  static const char *const flag_specs[] =
    {
      "gettext:1:pass-python-format",
      "ugettext:1:pass-python-format",
      "dgettext:2:pass-python-format",
      "ngettext:1:pass-python-format",
      "ngettext:2:pass-python-format",
      "ungettext:1:pass-python-format",
      "ungettext:2:pass-python-format",
      "dngettext:2:pass-python-format",
      "dngettext:3:pass-python-format",
      "_:1:pass-python-format"
    };
  size_t idx;

  for (idx = 0; idx < sizeof flag_specs / sizeof flag_specs[0]; idx++)
    xgettext_record_flag (flag_specs[idx]);
}
/* Name of the file being read; used in the read-error message.  */
static const char *real_file_name;
/* File name used in warnings and in the positions of extracted messages.  */
static char *logical_file_name;
/* Current line number; kept up to date by phase1_getc and phase1_ungetc.  */
static int line_number;
/* The stream currently being scanned.  */
static FILE *fp;
/* Stack of pushed-back bytes: the byte pushed last is read first.  */
static unsigned char phase1_pushback[UNINAME_MAX + 4];
/* Number of bytes currently stored in phase1_pushback.  */
static int phase1_pushback_length;
/* Phase 1: read one byte, taking pushed-back bytes first.
   Returns EOF at end of input.  A read error on the stream is reported
   through error () with EXIT_FAILURE.  The line counter is advanced for
   every newline returned.  */
static int
phase1_getc ()
{
  int ch;

  if (phase1_pushback_length != 0)
    {
      /* Pop the most recently pushed-back byte.  */
      phase1_pushback_length--;
      ch = phase1_pushback[phase1_pushback_length];
    }
  else if ((ch = getc (fp)) == EOF)
    {
      if (ferror (fp))
        error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
               real_file_name);
      return EOF;
    }

  if (ch == '\n')
    line_number++;

  return ch;
}
/* Push the byte C back so that the next phase1_getc returns it.
   EOF is never pushed back.  Pushing back a newline undoes the line
   counter increment made when it was read.
   Aborts if the fixed-size pushback buffer is already full, instead of
   writing past its end.  */
static void
phase1_ungetc (int c)
{
  if (c == EOF)
    return;

  if (c == '\n')
    --line_number;

  /* The buffer is sized for the longest pushback this lexer performs;
     exceeding it would be an internal error.  */
  if (phase1_pushback_length
      >= (int) (sizeof phase1_pushback / sizeof phase1_pushback[0]))
    abort ();

  phase1_pushback[phase1_pushback_length++] = c;
}
/* Accumulates the text of the comment line currently being read.  */
static char *buffer;
/* Number of bytes allocated for buffer.  */
static size_t bufmax;
/* Number of bytes of buffer currently in use.  */
static size_t buflen;
/* Begin accumulating a new comment line: discards the current contents
   of the comment buffer (the allocation itself is kept).  */
static inline void
comment_start ()
{
buflen = 0;
}
/* Append the byte C to the comment buffer.
   Bytes below 0x80 are stored unchanged; other bytes are stored as the
   two-byte UTF-8 sequence for the code point with the same value.  */
static inline void
comment_add (int c)
{
  unsigned char byte = (unsigned char) c;
  bool is_ascii = byte < 0x80;
  size_t needed = buflen + (is_ascii ? 1 : 2);

  if (needed > bufmax)
    {
      bufmax = 2 * bufmax + 10;
      buffer = xrealloc (buffer, bufmax);
    }

  if (is_ascii)
    {
      buffer[buflen++] = c;
      return;
    }

  /* Lead byte carries the top two bits, trail byte the low six.  */
  buffer[buflen++] = 0xc0 | (byte >> 6);
  buffer[buflen++] = 0x80 | (byte & 0x3f);
}
/* Finish the current comment line: strip trailing spaces and tabs,
   NUL-terminate the buffer and pass it to xgettext_comment_add.  */
static inline void
comment_line_end ()
{
  for (; buflen > 0; buflen--)
    {
      char last = buffer[buflen - 1];
      if (last != ' ' && last != '\t')
        break;
    }

  /* Make room for the terminating NUL.  */
  if (bufmax <= buflen)
    {
      bufmax = 2 * bufmax + 10;
      buffer = xrealloc (buffer, bufmax);
    }
  buffer[buflen] = '\0';

  xgettext_comment_add (buffer);
}
/* Line number of the most recent '#' comment (set by phase2_getc).  */
static int last_comment_line;
/* Line number of the most recent token that is not whitespace or a
   newline (set by phase5_get).  */
static int last_non_comment_line;
/* Phase 2: read one character with line continuations and comments
   handled.
   - A backslash directly followed by a newline is dropped, together
     with the newline.
   - A '#' starts a comment that runs to the end of the line.  Its text,
     without leading and trailing blanks, is handed to the comment
     machinery, and the terminating newline (or EOF) is returned.
   - Anything else is returned unchanged.  */
static int
phase2_getc ()
{
  for (;;)
    {
      int ch = phase1_getc ();

      if (ch == '\\')
        {
          int following = phase1_getc ();
          if (following == '\n')
            /* Line continuation: keep reading.  */
            continue;
          phase1_ungetc (following);
          return '\\';
        }

      if (ch != '#')
        return ch;

      /* Comment: collect it up to, but not including, the line end.  */
      last_comment_line = line_number;
      comment_start ();
      while ((ch = phase1_getc ()) != EOF && ch != '\n')
        {
          /* Skip blanks while nothing has been stored yet.  */
          bool leading_blank = (buflen == 0 && (ch == ' ' || ch == '\t'));
          if (!leading_blank)
            comment_add (ch);
        }
      comment_line_end ();
      return ch;
    }
}
/* Push back a character obtained from phase2_getc; simply forwards to
   phase1_ungetc.  */
static void
phase2_ungetc (int c)
{
phase1_ungetc (c);
}
/* Kinds of token produced by the phase 5 lexer.  */
typedef enum token_type_ty
{
  token_type_eof,       /* end of input */
  token_type_lparen,    /* ( */
  token_type_rparen,    /* ) */
  token_type_comma,     /* , */
  token_type_string,    /* string literal */
  token_type_symbol,    /* identifier or number */
  token_type_other      /* anything else, including a reported newline */
} token_type_ty;

/* A lexical token.  */
typedef struct token_ty
{
  token_type_ty type;
  char *string;         /* set for token_type_string and token_type_symbol */
  int line_number;      /* line on which the token was read */
} token_ty;
/* Out-of-band results of phase7_getuc.  Both are negative, so they can
   never collide with a character value.  */
enum
{
  P7_EOF = -1,          /* end of input reached inside a string */
  P7_STRING_END = -2    /* end of the string literal */
};
/* Return the value of the hexadecimal digit C, or -1 if C is not a
   hexadecimal digit (this includes EOF).  */
static int
phase7_hex_digit_value (int c)
{
  if (c >= '0' && c <= '9')
    return c - '0';
  if (c >= 'A' && c <= 'F')
    return c - 'A' + 10;
  if (c >= 'a' && c <= 'f')
    return c - 'a' + 10;
  return -1;
}

/* Read exactly NDIGITS hexadecimal digits through phase1_getc.
   On success, store the digits in DIGITS[0..NDIGITS-1] and their value
   in *VALUEP, and return true.  On failure, push back everything that
   was read, so the input is as it was before the call, and return
   false.  */
static bool
phase7_read_hex_digits (unsigned char *digits, int ndigits,
                        unsigned int *valuep)
{
  unsigned int value = 0;
  int i;

  for (i = 0; i < ndigits; i++)
    {
      int c1 = phase1_getc ();
      int digit = phase7_hex_digit_value (c1);

      if (digit < 0)
        {
          phase1_ungetc (c1);
          while (--i >= 0)
            phase1_ungetc (digits[i]);
          return false;
        }
      value = (value << 4) + digit;
      digits[i] = c1;
    }

  *valuep = value;
  return true;
}

/* Phase 7: read the next character of a string literal whose opening
   quote(s) have already been consumed.

   QUOTE_CHAR is the quote character delimiting the literal.
   TRIPLE is true for a triple-quoted literal.
   INTERPRET_ANSIC enables the backslash escapes handled in the switch
   below (octal, \x, \n, ...), and together with INTERPRET_UNICODE also
   \U and \N{...}.
   INTERPRET_UNICODE enables \uXXXX.
   *BACKSLASH_COUNTER counts the backslashes returned in a row; when
   INTERPRET_ANSIC is false it decides whether a quote or newline
   following them is escaped.

   Returns P7_EOF at end of input, P7_STRING_END at the end of the
   literal (also after warning about a newline in a single-quoted
   literal), and otherwise the character value.  An escape sequence
   that cannot be interpreted is left in the input and a plain
   backslash is returned.  */
static int
phase7_getuc (int quote_char,
              bool triple, bool interpret_ansic, bool interpret_unicode,
              unsigned int *backslash_counter)
{
  int c;

  for (;;)
    {
      /* Read through phase 1: inside a string, '#' must not start a
         comment as it would in phase 2.  */
      c = phase1_getc ();

      if (c == EOF)
        return P7_EOF;

      if (c == quote_char
          && (interpret_ansic || (*backslash_counter & 1) == 0))
        {
          if (triple)
            {
              /* Only three quotes in a row end the literal.  */
              int c1 = phase1_getc ();
              if (c1 == quote_char)
                {
                  int c2 = phase1_getc ();
                  if (c2 == quote_char)
                    return P7_STRING_END;
                  phase1_ungetc (c2);
                }
              phase1_ungetc (c1);
              return c;
            }
          else
            return P7_STRING_END;
        }

      if (c == '\n')
        {
          if (triple)
            {
              *backslash_counter = 0;
              return '\n';
            }
          /* Without escape interpretation, a newline preceded by an odd
             number of backslashes is part of the string.  */
          if (!(interpret_ansic || (*backslash_counter & 1) == 0))
            {
              *backslash_counter = 0;
              return '\n';
            }
          /* Unescaped newline in a single-quoted literal.  Pushing the
             newline back first restores line_number for the warning.  */
          phase1_ungetc (c);
          error_with_progname = false;
          error (0, 0, _("%s:%d: warning: unterminated string"),
                 logical_file_name, line_number);
          error_with_progname = true;
          return P7_STRING_END;
        }

      if (c != '\\')
        {
          *backslash_counter = 0;
          return c;
        }

      /* A backslash.  With no interpretation at all, return it as is.  */
      if (!interpret_ansic && !interpret_unicode)
        {
          ++*backslash_counter;
          return '\\';
        }

      /* Look at the character after the backslash.  */
      c = phase1_getc ();
      if (c == EOF)
        {
          ++*backslash_counter;
          return '\\';
        }

      if (interpret_ansic)
        switch (c)
          {
          case '\n':
            /* Backslash-newline is dropped; read the next character.  */
            continue;
          case '\\':
            ++*backslash_counter;
            return c;
          case '\'': case '"':
            *backslash_counter = 0;
            return c;
          case 'a':
            *backslash_counter = 0;
            return '\a';
          case 'b':
            *backslash_counter = 0;
            return '\b';
          case 'f':
            *backslash_counter = 0;
            return '\f';
          case 'n':
            *backslash_counter = 0;
            return '\n';
          case 'r':
            *backslash_counter = 0;
            return '\r';
          case 't':
            *backslash_counter = 0;
            return '\t';
          case 'v':
            *backslash_counter = 0;
            return '\v';
          case '0': case '1': case '2': case '3': case '4':
          case '5': case '6': case '7':
            {
              /* Octal escape of one to three digits.  */
              int n = c - '0';

              c = phase1_getc ();
              if (c != EOF)
                {
                  if (c >= '0' && c <= '7')
                    {
                      n = (n << 3) + (c - '0');
                      c = phase1_getc ();
                      if (c != EOF)
                        {
                          if (c >= '0' && c <= '7')
                            n = (n << 3) + (c - '0');
                          else
                            phase1_ungetc (c);
                        }
                    }
                  else
                    phase1_ungetc (c);
                }
              *backslash_counter = 0;
              return (unsigned char) n;
            }
          case 'x':
            {
              /* Hexadecimal escape of exactly two digits.  */
              int c1 = phase1_getc ();
              int n1 = phase7_hex_digit_value (c1);

              if (n1 >= 0)
                {
                  int c2 = phase1_getc ();
                  int n2 = phase7_hex_digit_value (c2);

                  if (n2 >= 0)
                    {
                      *backslash_counter = 0;
                      return (unsigned char) ((n1 << 4) + n2);
                    }
                  phase1_ungetc (c2);
                }
              phase1_ungetc (c1);
              phase1_ungetc (c);
              ++*backslash_counter;
              return '\\';
            }
          default:
            break;
          }

      if (interpret_unicode)
        {
          if (c == 'u')
            {
              /* \uXXXX: exactly four hexadecimal digits.  */
              unsigned char buf[4];
              unsigned int n;

              if (phase7_read_hex_digits (buf, 4, &n))
                {
                  *backslash_counter = 0;
                  return n;
                }
              phase1_ungetc (c);
              ++*backslash_counter;
              return '\\';
            }

          if (interpret_ansic)
            {
              if (c == 'U')
                {
                  /* \UXXXXXXXX: exactly eight hexadecimal digits.  */
                  unsigned char buf[8];
                  unsigned int n;
                  int i;

                  if (!phase7_read_hex_digits (buf, 8, &n))
                    {
                      phase1_ungetc (c);
                      ++*backslash_counter;
                      return '\\';
                    }
                  if (n < 0x110000)
                    {
                      *backslash_counter = 0;
                      return n;
                    }

                  error_with_progname = false;
                  error (0, 0, _("%s:%d: warning: invalid Unicode character"),
                         logical_file_name, line_number);
                  error_with_progname = true;

                  /* Leave the eight digits in the input.  */
                  for (i = 8; --i >= 0; )
                    phase1_ungetc (buf[i]);
                  phase1_ungetc (c);
                  ++*backslash_counter;
                  return '\\';
                }

              if (c == 'N')
                {
                  int c1 = phase1_getc ();
                  if (c1 == '{')
                    {
                      unsigned char buf[UNINAME_MAX + 1];
                      int i;
                      unsigned int n;

                      for (i = 0; i < UNINAME_MAX; i++)
                        {
                          int c2 = phase1_getc ();
                          if (!(c2 >= ' ' && c2 <= '~'))
                            {
                              phase1_ungetc (c2);
                              while (--i >= 0)
                                phase1_ungetc (buf[i]);
                              phase1_ungetc (c1);
                              phase1_ungetc (c);
                              ++*backslash_counter;
                              return '\\';
                            }
                          if (c2 == '}')
                            break;
                          buf[i] = c2;
                        }

                      /* i < UNINAME_MAX exactly when the loop ended at a
                         closing brace.  Otherwise no '}' was consumed:
                         the name is not looked up and no '}' may be
                         pushed back.  */
                      if (i < UNINAME_MAX)
                        {
                          buf[i] = '\0';

                          n = unicode_name_character ((char *) buf);
                          if (n != UNINAME_INVALID)
                            {
                              *backslash_counter = 0;
                              return n;
                            }

                          phase1_ungetc ('}');
                        }
                      while (--i >= 0)
                        phase1_ungetc (buf[i]);
                    }
                  phase1_ungetc (c1);
                  phase1_ungetc (c);
                  ++*backslash_counter;
                  return '\\';
                }
            }
        }

      /* Unrecognized escape: keep the character for the next call and
         return the backslash itself.  */
      phase1_ungetc (c);
      ++*backslash_counter;
      return '\\';
    }
}
/* Number of currently open '(', '[' and '{' (maintained by phase5_get);
   while it is positive, newlines are not reported as tokens.  */
static int open_pbb;
/* Stack of tokens returned through phase5_unget.  */
static token_ty phase5_pushback[2];
/* Number of tokens currently stored in phase5_pushback.  */
static int phase5_pushback_length;
/* Phase 5: read the next token into *TP.
   A token pushed back with phase5_unget is returned first.  Otherwise
   characters are read through phase2_getc (and through phase1_getc /
   phase7_getuc inside string literals) and classified.
   For token_type_symbol and token_type_string, tp->string is freshly
   allocated; string tokens are stored as UTF-8.  tp->line_number is the
   value of line_number just before the token's first character is
   read.  */
static void
phase5_get (token_ty *tp)
{
int c;
/* Serve pushed-back tokens first.  */
if (phase5_pushback_length)
{
*tp = phase5_pushback[--phase5_pushback_length];
return;
}
for (;;)
{
tp->line_number = line_number;
c = phase2_getc ();
/* First switch: end of input, blanks and newlines.  */
switch (c)
{
case EOF:
tp->type = token_type_eof;
return;
case ' ':
case '\t':
case '\f':
/* Skip horizontal whitespace.  */
continue;
case '\n':
/* Forget the saved comments once a non-comment token has been seen
   on a line after the last comment line.  */
if (last_non_comment_line > last_comment_line)
xgettext_comment_reset ();
/* Inside open parentheses, brackets or braces a newline is skipped
   rather than reported.  */
if (open_pbb > 0)
continue;
tp->type = token_type_other;
return;
}
/* Anything that reaches this point is a real token.  */
last_non_comment_line = tp->line_number;
switch (c)
{
case '.':
{
/* A '.' not followed by a digit is an ordinary token.  */
int c1 = phase2_getc ();
phase2_ungetc (c1);
if (!(c1 >= '0' && c1 <= '9'))
{
tp->type = token_type_other;
return;
}
}
/* '.' followed by a digit: fall through and scan it like a symbol.  */
/* Letters other than R, r, U, u (possible string prefixes, handled
   further below), underscore and digits start a symbol.  */
case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
case 'M': case 'N': case 'O': case 'P': case 'Q':
case 'S': case 'T': case 'V': case 'W': case 'X':
case 'Y': case 'Z':
case '_':
case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
case 'm': case 'n': case 'o': case 'p': case 'q':
case 's': case 't': case 'v': case 'w': case 'x':
case 'y': case 'z':
case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9':
symbol:
{
/* Scratch buffer for the symbol text, reused across calls.  */
static char *buffer;
static int bufmax;
int bufpos;
bufpos = 0;
for (;;)
{
if (bufpos >= bufmax)
{
bufmax = 2 * bufmax + 10;
buffer = xrealloc (buffer, bufmax);
}
buffer[bufpos++] = c;
c = phase2_getc ();
switch (c)
{
case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
case 'Y': case 'Z':
case '_':
case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
case 's': case 't': case 'u': case 'v': case 'w': case 'x':
case 'y': case 'z':
case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9':
/* Still inside the symbol.  */
continue;
default:
/* First character after the symbol: leave it for the next token.  */
phase2_ungetc (c);
break;
}
break;
}
/* Make room for the terminating NUL.  */
if (bufpos >= bufmax)
{
bufmax = 2 * bufmax + 10;
buffer = xrealloc (buffer, bufmax);
}
buffer[bufpos] = '\0';
tp->string = xstrdup (buffer);
tp->type = token_type_symbol;
return;
}
/* String literals.  The case labels below are inside this block so that
   they share its variables; the block is entered only through them.  */
{
/* Scratch buffer of UTF-16 code units, reused across calls.  */
static unsigned short *buffer;
static int bufmax;
int bufpos;
int quote_char;
bool interpret_ansic;
bool interpret_unicode;
bool triple;
unsigned int backslash_counter;
case 'R': case 'r':
{
/* r"..." or r'...': no escape interpretation.  Without a quote right
   after it, the letter starts a symbol.  */
int c1 = phase1_getc ();
if (c1 == '"' || c1 == '\'')
{
quote_char = c1;
interpret_ansic = false;
interpret_unicode = false;
goto string;
}
phase1_ungetc (c1);
goto symbol;
}
case 'U': case 'u':
{
/* u"...": both kinds of escapes.  ur"...": only \u escapes.
   Otherwise the letter starts a symbol.  */
int c1 = phase1_getc ();
if (c1 == '"' || c1 == '\'')
{
quote_char = c1;
interpret_ansic = true;
interpret_unicode = true;
goto string;
}
if (c1 == 'R' || c1 == 'r')
{
int c2 = phase1_getc ();
if (c2 == '"' || c2 == '\'')
{
quote_char = c2;
interpret_ansic = false;
interpret_unicode = true;
goto string;
}
phase1_ungetc (c2);
}
phase1_ungetc (c1);
goto symbol;
}
case '"': case '\'':
/* Plain string literal.  */
quote_char = c;
interpret_ansic = true;
interpret_unicode = false;
string:
/* Two more quote characters right after the opening quote make this a
   triple-quoted literal.  */
triple = false;
{
int c1 = phase1_getc ();
if (c1 == quote_char)
{
int c2 = phase1_getc ();
if (c2 == quote_char)
triple = true;
else
{
phase1_ungetc (c2);
phase1_ungetc (c1);
}
}
else
phase1_ungetc (c1);
}
backslash_counter = 0;
bufpos = 0;
/* Collect the characters of the literal as UTF-16 code units.  */
for (;;)
{
int uc = phase7_getuc (quote_char, triple, interpret_ansic,
interpret_unicode, &backslash_counter);
unsigned int len;
if (uc == P7_EOF || uc == P7_STRING_END)
break;
assert (uc >= 0 && uc < 0x110000);
/* Values of 0x10000 and above need two code units.  */
len = (uc < 0x10000 ? 1 : 2);
if (bufpos + len > bufmax)
{
bufmax = 2 * bufmax + 10;
buffer =
xrealloc (buffer, bufmax * sizeof (unsigned short));
}
if (uc < 0x10000)
buffer[bufpos++] = uc;
else
{
buffer[bufpos++] = 0xd800 + ((uc - 0x10000) >> 10);
buffer[bufpos++] = 0xdc00 + ((uc - 0x10000) & 0x3ff);
}
}
/* Convert the collected UTF-16 to a NUL-terminated UTF-8 string.  */
{
int pos;
unsigned char *utf8_string;
unsigned char *q;
/* Sized for at most 3 output bytes per code unit, plus the NUL; the
   assert after the loop checks this bound.  */
utf8_string = (unsigned char *) xmalloc (3 * bufpos + 1);
for (pos = 0, q = utf8_string; pos < bufpos; )
{
unsigned int uc;
int n;
pos += u16_mbtouc (&uc, buffer + pos, bufpos - pos);
n = u8_uctomb (q, uc, 6);
assert (n > 0);
q += n;
}
*q = '\0';
assert (q - utf8_string <= 3 * bufpos);
tp->string = (char *) utf8_string;
}
tp->type = token_type_string;
return;
}
/* Punctuation.  Opening and closing brackets of all three kinds keep
   open_pbb up to date; it never drops below zero.  */
case '(':
open_pbb++;
tp->type = token_type_lparen;
return;
case ')':
if (open_pbb > 0)
open_pbb--;
tp->type = token_type_rparen;
return;
case ',':
tp->type = token_type_comma;
return;
case '[': case '{':
open_pbb++;
tp->type = token_type_other;
return;
case ']': case '}':
if (open_pbb > 0)
open_pbb--;
tp->type = token_type_other;
return;
default:
tp->type = token_type_other;
return;
}
}
}
/* Push the token *TP back so that the next phase5_get returns it.
   An end-of-input token is never pushed back.
   Aborts if the fixed-size pushback array is already full, instead of
   writing past its end.  */
static void
phase5_unget (token_ty *tp)
{
  if (tp->type == token_type_eof)
    return;

  if (phase5_pushback_length
      >= (int) (sizeof phase5_pushback / sizeof phase5_pushback[0]))
    abort ();

  phase5_pushback[phase5_pushback_length++] = *tp;
}
/* Read the next token into *TP, merging a run of adjacent string
   literals into a single string token.  The first token that is not a
   string is pushed back for the next call.  */
static void
x_python_lex (token_ty *tp)
{
  token_ty next;

  phase5_get (tp);
  if (tp->type != token_type_string)
    return;

  for (phase5_get (&next);
       next.type == token_type_string;
       phase5_get (&next))
    {
      size_t head_len = strlen (tp->string);
      size_t tail_len = strlen (next.string);

      /* Append next.string, including its terminating NUL.  */
      tp->string = xrealloc (tp->string, head_len + tail_len + 1);
      memcpy (tp->string + head_len, next.string, tail_len + 1);
      free (next.string);
    }

  phase5_unget (&next);
}
/* Flag table for the file being scanned; set by extract_python and
   looked up by symbol name in extract_parenthesized.  */
static flag_context_list_table_ty *flag_context_list_table;
/* Scan tokens up to the closing parenthesis that matches the current
   nesting level, recursing for every nested opening parenthesis, and
   record the strings found in MLP.

   OUTER_CONTEXT and CONTEXT_ITER provide the flag context for the
   arguments at this level.
   COMMAS_TO_SKIP is the number of commas before the argument to
   extract; 0 means the current argument, a negative value means none.
   PLURAL_COMMAS, if positive, is the number of commas between the
   singular argument and the plural argument.

   Returns true when end of input is reached, false at a closing
   parenthesis.  */
static bool
extract_parenthesized (message_list_ty *mlp,
                       flag_context_ty outer_context,
                       flag_context_list_iterator_ty context_iter,
                       int commas_to_skip, int plural_commas)
{
  /* Singular message still waiting for its plural form, if any.  */
  message_ty *pending_singular = NULL;
  /* True while the previous token was a keyword symbol.  */
  bool after_keyword = false;
  /* Parameters for the argument list of that keyword.  */
  int keyword_commas_to_skip = -1;
  int keyword_plural_commas = 0;
  /* Context iterator to hand to the next nested argument list.  */
  flag_context_list_iterator_ty next_context_iter =
    passthrough_context_list_iterator;
  /* Flag context of the argument currently being scanned.  */
  flag_context_ty inner_context =
    inherited_context (outer_context,
                       flag_context_list_iterator_advance (&context_iter));

  for (;;)
    {
      token_ty token;

      x_python_lex (&token);

      switch (token.type)
        {
        case token_type_eof:
          return true;

        case token_type_rparen:
          return false;

        case token_type_symbol:
          {
            size_t symbol_len = strlen (token.string);
            void *keyword_value;

            after_keyword =
              (find_entry (&keywords, token.string, symbol_len,
                           &keyword_value)
               == 0);
            if (after_keyword)
              {
                /* Decode argnum1 + (argnum2 << 10).  */
                int encoded = (int) (long) keyword_value;
                int argnum1 = encoded & ((1 << 10) - 1);
                int argnum2 = encoded >> 10;

                keyword_commas_to_skip = argnum1 - 1;
                keyword_plural_commas =
                  (argnum2 > argnum1 ? argnum2 - argnum1 : 0);
              }

            next_context_iter =
              flag_context_list_iterator (
                flag_context_list_table_lookup (flag_context_list_table,
                                                token.string, symbol_len));
            free (token.string);
            break;
          }

        case token_type_lparen:
          {
            /* The keyword parameters apply only directly after a keyword.  */
            int sub_commas_to_skip =
              after_keyword ? keyword_commas_to_skip : -1;
            int sub_plural_commas =
              after_keyword ? keyword_plural_commas : 0;

            if (extract_parenthesized (mlp, inner_context, next_context_iter,
                                       sub_commas_to_skip, sub_plural_commas))
              return true;
            next_context_iter = null_context_list_iterator;
            after_keyword = false;
            break;
          }

        case token_type_comma:
          if (commas_to_skip > 0)
            commas_to_skip--;
          else if (commas_to_skip == 0)
            {
              if (pending_singular != NULL && plural_commas > 0)
                {
                  /* Start counting towards the plural argument.  */
                  commas_to_skip = plural_commas - 1;
                  plural_commas = 0;
                }
              else
                commas_to_skip = -1;
            }
          inner_context =
            inherited_context (outer_context,
                               flag_context_list_iterator_advance (
                                 &context_iter));
          next_context_iter = passthrough_context_list_iterator;
          after_keyword = false;
          break;

        case token_type_string:
          {
            lex_pos_ty pos;

            pos.file_name = logical_file_name;
            pos.line_number = token.line_number;

            if (extract_all)
              remember_a_message (mlp, token.string, inner_context, &pos);
            else if (commas_to_skip != 0)
              /* Not the argument we are looking for.  */
              free (token.string);
            else if (pending_singular != NULL)
              {
                remember_a_message_plural (pending_singular, token.string,
                                           inner_context, &pos);
                pending_singular = NULL;
              }
            else
              {
                message_ty *mp =
                  remember_a_message (mlp, token.string,
                                      inner_context, &pos);
                if (plural_commas > 0)
                  pending_singular = mp;
              }

            next_context_iter = null_context_list_iterator;
            after_keyword = false;
            break;
          }

        case token_type_other:
          next_context_iter = null_context_list_iterator;
          after_keyword = false;
          break;

        default:
          abort ();
        }
    }
}
/* Scan the Python source read from F and add the extracted messages to
   the first message list of MDLP.
   REAL_FILENAME is used in read-error messages; a copy of
   LOGICAL_FILENAME is used in warnings and message positions.
   FLAG_TABLE is the flag table consulted for every symbol.  */
void
extract_python (FILE *f,
                const char *real_filename, const char *logical_filename,
                flag_context_list_table_ty *flag_table,
                msgdomain_list_ty *mdlp)
{
  message_list_ty *mlp = mdlp->item[0]->messages;
  bool reached_eof;

  /* String tokens are produced in UTF-8 (see phase5_get).  */
  xgettext_current_source_encoding = po_charset_utf8;

  /* Set up the reader state.  */
  fp = f;
  line_number = 1;
  real_file_name = real_filename;
  logical_file_name = xstrdup (logical_filename);

  /* Set up the lexer state.  */
  open_pbb = 0;
  last_non_comment_line = -1;
  last_comment_line = -1;

  flag_context_list_table = flag_table;
  init_keywords ();

  /* extract_parenthesized returns false at an unbalanced closing
     parenthesis; restart it until end of input is reached.  */
  do
    reached_eof = extract_parenthesized (mlp, null_context,
                                         null_context_list_iterator,
                                         -1, 0);
  while (!reached_eof);

  /* Clear the reader state.  The copy of the logical file name is not
     freed here.  */
  line_number = 0;
  logical_file_name = NULL;
  real_file_name = NULL;
  fp = NULL;
}