/* x-sh.c   [plain text] */


/* xgettext sh backend.
   Copyright (C) 2003 Free Software Foundation, Inc.
   Written by Bruno Haible <bruno@clisp.org>, 2003.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software Foundation,
   Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */

#ifdef HAVE_CONFIG_H
# include "config.h"
#endif

#include <errno.h>
#include <limits.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "message.h"
#include "xgettext.h"
#include "x-sh.h"
#include "error.h"
#include "xalloc.h"
#include "exit.h"
#include "hash.h"
#include "gettext.h"

/* Shorthand used to wrap user-visible message strings: expands to a
   gettext() call on the string.  */
#define _(s) gettext(s)

/* The sh syntax is defined in POSIX:2001, see
     http://www.opengroup.org/onlinepubs/007904975/utilities/xcu_chap02.html
   Summary of sh syntax:
   - Input is broken into words, which are then subject to
     - tilde expansion ~...
     - command substitution `...`
     - variable substitution $var
     - arithmetic substitution $((...))
     - field splitting at whitespace (IFS)
     - wildcard pattern expansion *?
     - quote removal
   - Strings are enclosed in "..."; command substitution, variable
     substitution and arithmetic substitution are performed here as well.
   - '...' is a string without substitutions.
   - The list of resulting words is split into commands by semicolon and
     newline.
   - '#' at the beginning of a word introduces a comment until end of line.
   The parser is implemented in bash-2.05b/parse.y.  */


/* ====================== Keyword set customization.  ====================== */

/* If true extract all strings.  Set by x_sh_extract_all.  */
static bool extract_all = false;

/* Table of keywords registered through x_sh_keyword.  The value stored for
   each keyword packs two argument numbers as  argnum1 + (argnum2 << 10).  */
static hash_table keywords;
/* True as long as the built-in default keywords still have to be installed;
   cleared by x_sh_keyword (NULL) and by init_keywords.  */
static bool default_keywords = true;


/* Switch the extractor into "extract all strings" mode by setting the
   file-level extract_all flag, which read_command consults for every
   constant word it reads.  */
void
x_sh_extract_all ()
{
  extract_all = true;
}


/* Register a keyword specification.
   NAME == NULL disables the built-in default keywords.
   Otherwise NAME is handed to split_keywordspec(); unless a colon remains
   inside the identifier part (which signals a failed parse), the identifier
   is entered into the keywords table together with its two argument
   numbers, packed as  argnum1 + (argnum2 << 10).  A first argument number
   of 0 is replaced by 1.  */
void
x_sh_keyword (const char *name)
{
  const char *spec_end;
  const char *first_colon;
  int singular_arg;
  int plural_arg;

  if (name == NULL)
    {
      default_keywords = false;
      return;
    }

  /* The table is created lazily, on the first registration.  */
  if (keywords.table == NULL)
    init_hash (&keywords, 100);

  split_keywordspec (name, &spec_end, &singular_arg, &plural_arg);

  /* The characters between name and spec_end should form a valid C
     identifier.  A colon in there means that split_keywordspec() could not
     parse the specification; such a keyword is silently ignored.  */
  first_colon = strchr (name, ':');
  if (first_colon != NULL && first_colon < spec_end)
    return;

  if (singular_arg == 0)
    singular_arg = 1;
  insert_entry (&keywords, name, spec_end - name,
                (void *) (long) (singular_arg + (plural_arg << 10)));
}

/* Finish initializing the keywords hash table.
   Called after argument processing, before each file is processed.
   Installs the built-in keyword specifications once, unless the default
   keywords have been disabled or were installed already.  */
static void
init_keywords ()
{
  static const char *const builtin_specs[] =
    {
      "gettext",
      "ngettext:1,2",
      "eval_gettext",
      "eval_ngettext:1,2"
    };
  size_t i;

  if (!default_keywords)
    return;

  for (i = 0; i < sizeof (builtin_specs) / sizeof (builtin_specs[0]); i++)
    x_sh_keyword (builtin_specs[i]);

  /* Make sure the defaults are not registered a second time.  */
  default_keywords = false;
}

/* Record the format string flags of the built-in sh keywords by passing
   each "function:argnum:flag" specification to xgettext_record_flag, in
   the order listed below.  */
void
init_flag_table_sh ()
{
  static const char *const flag_specs[] =
    {
      "gettext:1:pass-sh-format",
      "ngettext:1:pass-sh-format",
      "ngettext:2:pass-sh-format",
      "eval_gettext:1:sh-format",
      "eval_ngettext:1:sh-format",
      "eval_ngettext:2:sh-format"
    };
  size_t i;

  for (i = 0; i < sizeof (flag_specs) / sizeof (flag_specs[0]); i++)
    xgettext_record_flag (flag_specs[i]);
}


/* ======================== Reading of characters.  ======================== */

/* Real filename, used in error messages about the input file.
   Set by extract_sh for the duration of one extraction.  */
static const char *real_file_name;

/* Logical filename and line number, used to label the extracted messages.
   line_number is kept up to date by the character reading and pushback
   functions below.  */
static char *logical_file_name;
static int line_number;

/* The input file stream.  Set by extract_sh.  */
static FILE *fp;


/* Fetch the next character from the input file.
   Returns the character, or EOF at end of input.  A read error is reported
   through error() with EXIT_FAILURE status.  Every newline that is read
   advances line_number.  */
static int
do_getc ()
{
  int ch = getc (fp);

  if (ch == '\n')
    {
      line_number++;
      return ch;
    }

  if (ch == EOF && ferror (fp))
    error (EXIT_FAILURE, errno, _("\
error while reading \"%s\""), real_file_name);

  return ch;
}

/* Put back the last fetched character, not EOF.
   Undoes the line_number increment that do_getc performed for a newline.  */
static void
do_ungetc (int c)
{
  line_number -= (c == '\n' ? 1 : 0);
  ungetc (c, fp);
}


/* Remove backslash followed by newline from the input stream.
   Cope with potentially 2 characters of pushback.  */

/* Stack of characters pushed back by phase1_ungetc; phase1_getc pops from
   its top before reading new input.
   Maximum used guaranteed to be < 4.  */
static int phase1_pushback[4];
/* Number of characters currently stored in phase1_pushback.  */
static int phase1_pushback_length;

/* Return the next character of the input, with every backslash-newline
   sequence removed.  Characters pushed back through phase1_ungetc are
   returned first; a pushed back newline is counted in line_number again.
   A backslash that is not followed by a newline is returned as is, and the
   character after it is given back to the stream.  */
static int
phase1_getc ()
{
  int c;
  int next;

  if (phase1_pushback_length > 0)
    {
      phase1_pushback_length--;
      c = phase1_pushback[phase1_pushback_length];
      if (c == '\n')
        line_number++;
      return c;
    }

  while ((c = do_getc ()) == '\\')
    {
      next = do_getc ();
      if (next != '\n')
        {
          /* Not a line continuation: keep the backslash.  */
          if (next != EOF)
            do_ungetc (next);
          return '\\';
        }
      /* Backslash-newline: drop both characters and read on.  */
    }

  return c;
}

/* Push the character C back so that the next phase1_getc call returns it.
   EOF is ignored.  For a newline, line_number is decremented; phase1_getc
   increments it again when the newline is delivered.  */
static void
phase1_ungetc (int c)
{
  if (c == EOF)
    return;

  if (c == '\n')
    line_number--;

  phase1_pushback[phase1_pushback_length] = c;
  phase1_pushback_length++;
}


/* ========================== Reading of tokens.  ========================== */


/* A token consists of a sequence of characters.
   The characters are stored in a heap allocated array that is not
   NUL terminated; see string_of_token for the conversion to a C string.  */
struct token
{
  int allocated;		/* number of allocated chars */
  int charcount;		/* number of used chars */
  char *chars;			/* the token's constituents */
};

/* Initialize a 'struct token': the token starts out empty, with room for
   10 characters.  */
static inline void
init_token (struct token *tp)
{
  tp->charcount = 0;
  tp->allocated = 10;
  tp->chars = (char *) xmalloc (tp->allocated * sizeof (char));
}

/* Free the memory pointed to by a 'struct token'.
   Only the character array is released; the 'struct token' object itself
   is left to the caller.  */
static inline void
free_token (struct token *tp)
{
  free (tp->chars);
}

/* Ensure there is enough room in the token for one more character.
   When the array is full, its capacity is doubled.  */
static inline void
grow_token (struct token *tp)
{
  int capacity;

  if (tp->charcount != tp->allocated)
    return;

  capacity = tp->allocated * 2;
  tp->chars = (char *) xrealloc (tp->chars, capacity * sizeof (char));
  tp->allocated = capacity;
}

/* Convert a struct token * to a char*.
   Returns a freshly allocated, NUL terminated copy of the token's
   characters.  */
static char *
string_of_token (const struct token *tp)
{
  const int len = tp->charcount;
  char *copy = (char *) xmalloc (len + 1);

  memcpy (copy, tp->chars, len);
  copy[len] = '\0';

  return copy;
}


/* ========================= Accumulating messages ========================= */


/* The message list that receives the extracted messages.
   Set by extract_sh from the first domain of its msgdomain_list_ty
   argument.  */
static message_list_ty *mlp;


/* ========================= Accumulating comments ========================= */


/* Buffer holding the text of the comment line that is being read.  */
static char *buffer;
/* Allocated size of buffer.  */
static size_t bufmax;
/* Number of characters currently stored in buffer.  */
static size_t buflen;

/* Begin accumulating a new comment line: empty the comment buffer.
   The allocated storage is kept for reuse.  */
static inline void
comment_start ()
{
  buflen = 0;
}

/* Append the character C to the comment buffer, enlarging the buffer
   first when it is full.  */
static inline void
comment_add (int c)
{
  if (buflen >= bufmax)
    {
      size_t new_size = 2 * bufmax + 10;

      buffer = xrealloc (buffer, new_size);
      bufmax = new_size;
    }

  buffer[buflen] = c;
  buflen++;
}

/* Finish the current comment line: strip trailing spaces and tabs,
   NUL terminate the buffer and hand it to xgettext_comment_add.  */
static inline void
comment_line_end ()
{
  /* Drop trailing blanks.  */
  while (buflen > 0)
    {
      char last = buffer[buflen - 1];

      if (last != ' ' && last != '\t')
        break;
      buflen--;
    }

  /* Make room for the terminating NUL.  */
  if (buflen >= bufmax)
    {
      size_t new_size = 2 * bufmax + 10;

      buffer = xrealloc (buffer, new_size);
      bufmax = new_size;
    }

  buffer[buflen] = '\0';
  xgettext_comment_add (buffer);
}


/* These are for tracking whether comments count as immediately before
   keyword.
   last_comment_line is the line number of the most recent '#' comment,
   last_non_comment_line the line number at which the most recent
   non-comment word ended.  Both are reset to -1 by extract_sh.  */
static int last_comment_line;
static int last_non_comment_line;


/* ========================= Debackslashification ========================== */

/* This state tracks the effect of backquotes, double-quotes and single-quotes
   on the parsing of backslashes.  We make a single pass through the input
   file, keeping the state up to date.  This is much faster than accumulating
   strings and processing them with explicit debackslashification, like the
   shell does it.  */

/* The number of nested `...` or "`...`" constructs.  Assumed to be <= 32.
   Incremented by saw_opening_backquote, decremented by
   saw_closing_backquote.  */
static unsigned int nested_backquotes;

/* A bit mask indicating which of the currently open `...` or "`...`"
   constructs is with double-quotes: "`...`".
   A bit value of 1 stands for "`...`", a bit value of 0 stands for `...`.
   Bit position 0 designates the outermost backquotes nesting,
   bit position 1 the second-outermost backquotes nesting,
   ...
   bit position (nested_backquotes-1) the innermost backquotes nesting.  */
static unsigned int open_doublequotes_mask;

/* A bit indicating whether a double-quote is currently open inside the
   innermost backquotes nesting.  */
static bool open_doublequote;

/* A bit indicating whether a single-quote is currently open inside the
   innermost backquotes nesting.  */
static bool open_singlequote;


/* Functions to update the state.  */

/* Record that an opening backquote was read.
   Must not happen inside single-quotes.  If a double-quote is open, this
   is remembered in the mask bit of the new nesting level; inside the new
   nesting no double-quote is open.  */
static inline void
saw_opening_backquote ()
{
  if (open_singlequote)
    abort ();

  if (open_doublequote)
    {
      unsigned int level_bit = (unsigned int) 1 << nested_backquotes;

      open_doublequotes_mask = open_doublequotes_mask | level_bit;
      open_doublequote = false;
    }

  ++nested_backquotes;
}

/* Record that a closing backquote was read.
   Leaves the innermost nesting level, restores the double-quote state that
   was saved for it in the mask, and clears that mask bit.  */
static inline void
saw_closing_backquote ()
{
  unsigned int depth = nested_backquotes - 1;

  nested_backquotes = depth;
  open_doublequote = ((open_doublequotes_mask >> depth) & 1) != 0;
  open_doublequotes_mask &= ((unsigned int) 1 << depth) - 1;

  /* Just for safety.  */
  open_singlequote = false;
}

/* Record that an opening double-quote was read.
   Aborts if any quote is already open at the current nesting level.  */
static inline void
saw_opening_doublequote ()
{
  if (open_singlequote)
    abort ();
  if (open_doublequote)
    abort ();

  open_doublequote = true;
}

/* Record that a closing double-quote was read.
   Aborts unless exactly a double-quote is open at the current nesting
   level.  */
static inline void
saw_closing_doublequote ()
{
  if (open_singlequote)
    abort ();
  if (!open_doublequote)
    abort ();

  open_doublequote = false;
}

/* Record that an opening single-quote was read.
   Aborts if any quote is already open at the current nesting level.  */
static inline void
saw_opening_singlequote ()
{
  if (open_doublequote)
    abort ();
  if (open_singlequote)
    abort ();

  open_singlequote = true;
}

/* Record that a closing single-quote was read.
   Aborts unless exactly a single-quote is open at the current nesting
   level.  */
static inline void
saw_closing_singlequote ()
{
  if (open_doublequote)
    abort ();
  if (!open_singlequote)
    abort ();

  open_singlequote = false;
}


/* ========================== Reading of commands ========================== */

/* We are only interested in constant strings.  Other words need not be
   represented precisely.
   The last three values are pseudo words that terminate a command.  */
enum word_type
{
  t_string,	/* constant string */
  t_other,	/* other string */
  t_separator,	/* command separator: semicolon or newline */
  t_redirect,	/* redirection: one of < > >| << <<- >> <> <& >& */
  t_backquote,	/* closing '`' pseudo word */
  t_paren,	/* closing ')' pseudo word */
  t_eof		/* EOF marker */
};

/* A word as returned by read_word.
   The token and line_number_at_start members are meaningful only when
   type is t_string; in that case token is heap allocated and is released
   by free_word.  */
struct word
{
  enum word_type type;
  struct token *token;		/* for t_string */
  int line_number_at_start;	/* for t_string */
};

/* Free the memory pointed to by a 'struct word'.
   Only words of type t_string own a token; for all other types there is
   nothing to release.  */
static inline void
free_word (struct word *wp)
{
  if (wp->type != t_string)
    return;

  free_token (wp->token);
  free (wp->token);
}

/* Convert a t_string token to a char*.
   Aborts if the word is not of type t_string.  Returns a freshly
   allocated, NUL terminated copy of the word's characters.  */
static char *
string_of_word (const struct word *wp)
{
  if (wp->type != t_string)
    abort ();

  return string_of_token (wp->token);
}


/* Whitespace recognition.
   Only the unquoted space, tab and newline characters count as
   whitespace.  */

static inline bool
is_whitespace (int c)
{
  switch (c)
    {
    case ' ':
    case '\t':
    case '\n':
      return true;

    default:
      return false;
    }
}

/* Operator character recognition.
   Tells whether the unquoted character C can start a shell operator.  */

static inline bool
is_operator_start (int c)
{
  switch (c)
    {
    case '|':
    case '&':
    case ';':
    case '<':
    case '>':
    case '(':
    case ')':
      return true;

    default:
      return false;
    }
}


/* Denotation of a quoted character.
   The distinction between quoted and unquoted character is important only for
   the special, whitespace and operator characters; it is irrelevant for
   alphanumeric characters, '\\' and many others.
   A quoted character is represented by its value shifted just above the
   'unsigned char' range, so converting it to 'unsigned char' yields the
   character again.  */
#define QUOTED(c) (UCHAR_MAX + 1 + (c))
/* Values in the 'unsigned char' range are implicitly unquoted.  Among these,
   the following are important:
     '"'         opening or closing double quote
     '\''        opening or closing single quote
     '$'         the unknown result of a dollar expansion
     '`'         does not occur - replaced with OPENING_BACKQUOTE or
                 CLOSING_BACKQUOTE
 */
/* Pseudo characters for backquotes; their values lie above the range of
   the QUOTED characters.  */
#define OPENING_BACKQUOTE (2 * (UCHAR_MAX + 1) + '`')
#define CLOSING_BACKQUOTE (3 * (UCHAR_MAX + 1) + '`')

/* Stack of characters pushed back by phase2_ungetc; phase2_getc pops from
   its top and returns the stored value unchanged, without interpreting it
   again.
   Maximum used guaranteed to be < 4.  */
static int phase2_pushback[4];
/* Number of characters currently stored in phase2_pushback.  */
static int phase2_pushback_length;

/* Forward declaration of local functions.  */
static void phase2_ungetc (int c);

/* Return the next character, with backslashes removed.
   The result is QUOTED(c) for some unsigned char c, if the next character
   is escaped sufficiently often to make it a regular constituent character,
   or simply an 'unsigned char' if it has its special meaning (of special,
   whitespace or operator character), or OPENING_BACKQUOTE, CLOSING_BACKQUOTE,
   EOF.
   Characters that were pushed back through phase2_ungetc are returned
   unchanged, without being interpreted a second time.
   It's the caller's responsibility to update the state.  */
static int
phase2_getc ()
{
  int c;

  if (phase2_pushback_length)
    {
      c = phase2_pushback[--phase2_pushback_length];
      if (c == '\n')
	++line_number;
      return c;
    }

  c = phase1_getc ();
  if (c == EOF)
    return c;
  if (c == '\'')
    return (open_doublequote ? QUOTED (c) : c);
  if (!open_singlequote)
    {
      if (c == '"' || c == '$')
	return c;
      if (c == '`')
	return (nested_backquotes > 0 ? CLOSING_BACKQUOTE : OPENING_BACKQUOTE);
    }
  if (c == '\\')
    {
      /* Number of debackslashification passes that are active at the
	 current point.  */
      unsigned int debackslashify =
	nested_backquotes + (open_singlequote ? 0 : 1);
      /* Normal number of backslashes that yield a single backslash in the
	 final output.  */
      unsigned int expected_count =
	(unsigned int) 1 << debackslashify;
      /* Number of backslashes found.  */
      unsigned int count;

      for (count = 1; count < expected_count; count++)
	{
	  c = phase1_getc ();
	  if (c != '\\')
	    break;
	}
      if (count == expected_count)
	return '\\';

      /* The count of backslashes is > 0 and < expected_count, therefore the
	 result depends on c, the first character after the backslashes.
	 Note: The formulas below don't necessarily have a logic; they were
	 empirically determined such that 1. the xgettext-30 test succeeds,
	 2. the behaviour for count == 0 would correspond to the one without
	 any backslash.  */
      if (c == '\'')
	{
	  if (!open_singlequote && count > (expected_count >> 1))
	    {
	      phase1_ungetc (c);
	      return '\\';
	    }
	  else
	    return (open_doublequote ? QUOTED (c) : c);
	}
      else if (c == '"')
	{
	  /* Each debackslashification pass converts \\ to \ and \" to ";
	     passes corresponding to `...` drop a lone " whereas passes
	     corresponding to "`...`" leave it alone.  Therefore, the
	     minimum number of backslashes needed to get one double-quote
	     in the end is  open_doublequotes_mask + 1.  */
	  if (open_singlequote)
	    {
	      if (count > open_doublequotes_mask)
		{
		  /* Push the double-quote back at the phase1 level, like the
		     other branches do, so that the next call interprets it
		     in the current state and returns QUOTED ('"').  Pushing
		     it back at the phase2 level would hand the caller a raw,
		     unquoted '"' while a single-quote is open, which makes
		     saw_opening_doublequote () abort.  */
		  phase1_ungetc (c);
		  return '\\';
		}
	      else
		return QUOTED (c);
	    }
	  else
	    {
	      if (count > open_doublequotes_mask)
		return QUOTED (c);
	      else
		/* Some of the count values <= open_doublequotes_mask are
		   actually invalid here, but we assume a syntactically
		   correct input file anyway.  */
		return c;
	    }
	}
      else if (c == '`')
	{
	  /* FIXME: This code looks fishy.  */
	  if (count == expected_count - 1)
	    return c;
	  else
	    /* Some of the count values < expected_count - 1 are
	       actually invalid here, but we assume a syntactically
	       correct input file anyway.  */
	    if (nested_backquotes > 0 && !open_singlequote
		&& count >= (expected_count >> 2))
	      return OPENING_BACKQUOTE;
	    else
	      return CLOSING_BACKQUOTE;
	}
      else if (c == '$')
	{
	  if (open_singlequote)
	    return QUOTED (c);
	  if (count >= (expected_count >> 1))
	    return QUOTED (c);
	  else
	    return c;
	}
      else
	{
	  /* When not followed by a quoting character or backslash or dollar,
	     a backslash survives a debackslashification pass unmodified.
	     Therefore each debackslashification pass performs a
	       count := (count + 1) >> 1
	     operation.  Therefore the minimum number of backslashes needed
	     to get one backslash in the end is  (expected_count >> 1) + 1.  */
	  if (open_doublequote || open_singlequote)
	    {
	      if (count > 0)
		{
		  phase1_ungetc (c);
		  return '\\';
		}
	      else
		return QUOTED (c);
	    }
	  else
	    {
	      if (count > (expected_count >> 1))
		{
		  phase1_ungetc (c);
		  return '\\';
		}
	      else if (count > 0)
		return QUOTED (c);
	      else
		return c;
	    }
	}
    }

  /* Inside quotes, every other character is a regular constituent.  */
  return (open_singlequote || open_doublequote ? QUOTED (c) : c);
}

/* Push the value C, as returned by phase2_getc, back so that the next
   phase2_getc call returns it again unchanged.  EOF is ignored.  For an
   unquoted newline, line_number is decremented; phase2_getc increments it
   again when the newline is delivered.  */
static void
phase2_ungetc (int c)
{
  if (c == EOF)
    return;

  if (c == '\n')
    line_number--;

  phase2_pushback[phase2_pushback_length] = c;
  phase2_pushback_length++;
}


/* Context lookup table.  Set by extract_sh from its flag_table argument and
   consulted by read_command with the name of each command.  */
static flag_context_list_table_ty *flag_context_list_table;


/* Forward declaration of local functions.
   read_word and read_command_list call each other: nested command
   substitutions inside a word are parsed as command lists.  */
static enum word_type read_command_list (int looking_for,
					 flag_context_ty outer_context);



/* Read the next word.
   'looking_for' denotes a parse terminator, either CLOSING_BACKQUOTE, ')'
   or '\0'.
   The result is stored in *WP.  wp->type is always set.  wp->token is valid
   only if wp->type is t_string on return; the caller then releases it
   through free_word.  For every other type, a token allocated here has
   already been freed before returning.
   'context' is passed on to the nested command lists of `...` and $(...)
   and is used for the messages extracted from $"..." strings.  */
static void
read_word (struct word *wp, int looking_for, flag_context_ty context)
{
  int c;
  bool all_unquoted_digits;

  /* Skip whitespace and comments in front of the word.  A newline is
     reported as a command separator.  */
  do
    {
      c = phase2_getc ();
      if (c == '#')
	{
	  /* Skip a comment up to end of line.  */
	  last_comment_line = line_number;
	  comment_start ();
	  for (;;)
	    {
	      c = phase1_getc ();
	      if (c == EOF || c == '\n')
		break;
	      /* We skip all leading white space, but not EOLs.  */
	      if (!(buflen == 0 && (c == ' ' || c == '\t')))
		comment_add (c);
	    }
	  comment_line_end ();
	}
      /* Here c may also be the newline (or EOF) that ended a comment.  */
      if (c == '\n')
	{
	  /* Comments assumed to be grouped with a message must immediately
	     precede it, with no non-whitespace token on a line between
	     both.  */
	  if (last_non_comment_line > last_comment_line)
	    xgettext_comment_reset ();
	  wp->type = t_separator;
	  return;
	}
    }
  while (is_whitespace (c));

  if (c == EOF)
    {
      wp->type = t_eof;
      return;
    }

  if (c == '<' || c == '>')
    {
      /* Recognize the redirection operators < > >| << <<- >> <> <& >&  */
      int c2 = phase2_getc ();
      if ((c == '<' ? c2 == '<' : c2 == '|') || c2 == '>' || c2 == '&')
	{
	  if (c == '<' && c2 == '<')
	    {
	      int c3 = phase2_getc ();
	      if (c3 != '-')
		phase2_ungetc (c3);
	    }
	}
      else
	phase2_ungetc (c2);
      wp->type = t_redirect;
      return;
    }

  /* The terminator the caller is waiting for ends the enclosing command
     list.  */
  if (looking_for == CLOSING_BACKQUOTE && c == CLOSING_BACKQUOTE)
    {
      saw_closing_backquote ();
      wp->type = t_backquote;
      last_non_comment_line = line_number;
      return;
    }

  if (looking_for == ')' && c == ')')
    {
      wp->type = t_paren;
      last_non_comment_line = line_number;
      return;
    }

  /* Any other operator character: only ';' separates commands.  */
  if (is_operator_start (c))
    {
      wp->type = (c == ';' ? t_separator : t_other);
      return;
    }

  /* Everything else starts a regular word.  It is assumed to be a constant
     string until a construct is seen that makes it non-constant.  */
  wp->type = t_string;
  wp->token = (struct token *) xmalloc (sizeof (struct token));
  init_token (wp->token);
  wp->line_number_at_start = line_number;
  all_unquoted_digits = true;

  /* Process the word's characters, starting with the c already read.  */
  for (;; c = phase2_getc ())
    {
      if (c == EOF)
	break;

      if (all_unquoted_digits && (c == '<' || c == '>'))
	{
	  /* Recognize the redirection operators < > >| << <<- >> <> <& >&
	     prefixed with a nonempty sequence of unquoted digits.  */
	  int c2 = phase2_getc ();
	  if ((c == '<' ? c2 == '<' : c2 == '|') || c2 == '>' || c2 == '&')
	    {
	      if (c == '<' && c2 == '<')
		{
		  int c3 = phase2_getc ();
		  if (c3 != '-')
		    phase2_ungetc (c3);
		}
	    }
	  else
	    phase2_ungetc (c2);

	  /* The digits read so far were a file descriptor number, not a
	     string.  */
	  wp->type = t_redirect;
	  free_token (wp->token);
	  free (wp->token);

	  last_non_comment_line = line_number;

	  return;
	}

      all_unquoted_digits = all_unquoted_digits && (c >= '0' && c <= '9');

      if (c == '$')
	{
	  int c2 = phase2_getc ();
	  if (c2 == '(')
	    {
	      int c3 = phase2_getc ();
	      if (c3 == '(')
		{
		  /* Arithmetic expression.  Skip until the matching closing
		     parenthesis.  */
		  unsigned int depth = 2;

		  do
		    {
		      c = phase2_getc ();
		      if (c == '(')
			depth++;
		      else if (c == ')')
			if (--depth == 0)
			  break;
		    }
		  while (c != EOF);
		}
	      else
		{
		  /* Command substitution.  */
		  phase2_ungetc (c3);
		  read_command_list (')', context);
		}
	    }
	  else if (c2 == '\'' && !open_singlequote)
	    {
	      /* Bash builtin for string with ANSI-C escape sequences.  */
	      saw_opening_singlequote ();
	      for (;;)
		{
		  c = phase2_getc ();
		  if (c == EOF)
		    break;
		  if (c == '\'')
		    {
		      saw_closing_singlequote ();
		      break;
		    }
		  if (c == '\\')
		    {
		      /* NOTE(review): while open_singlequote is set,
			 phase2_getc returns ordinary characters as
			 QUOTED (c), so the unquoted letter and digit case
			 labels below may never match - confirm.  */
		      c = phase2_getc ();
		      switch (c)
			{
			default:
			  /* Not a known escape: keep the backslash and
			     reread the character.  */
			  phase2_ungetc (c);
			  c = '\\';
			  break;

			case '\\':
			  break;
			case '\'':
			  /* Don't call saw_closing_singlequote () here.  */
			  break;

			case 'a':
			  c = '\a';
			  break;
			case 'b':
			  c = '\b';
			  break;
			case 'e':
			  c = 0x1b; /* ESC */
			  break;
			case 'f':
			  c = '\f';
			  break;
			case 'n':
			  c = '\n';
			  break;
			case 'r':
			  c = '\r';
			  break;
			case 't':
			  c = '\t';
			  break;
			case 'v':
			  c = '\v';
			  break;

			case 'x':
			  /* One or two hexadecimal digits.  */
			  c = phase2_getc ();
			  if ((c >= '0' && c <= '9')
			      || (c >= 'A' && c <= 'F')
			      || (c >= 'a' && c <= 'f'))
			    {
			      int n;

			      if (c >= '0' && c <= '9')
				n = c - '0';
			      else if (c >= 'A' && c <= 'F')
				n = 10 + c - 'A';
			      else if (c >= 'a' && c <= 'f')
				n = 10 + c - 'a';
			      else
				abort ();

			      c = phase2_getc ();
			      if ((c >= '0' && c <= '9')
				  || (c >= 'A' && c <= 'F')
				  || (c >= 'a' && c <= 'f'))
				{
				  if (c >= '0' && c <= '9')
				    n = n * 16 + c - '0';
				  else if (c >= 'A' && c <= 'F')
				    n = n * 16 + 10 + c - 'A';
				  else if (c >= 'a' && c <= 'f')
				    n = n * 16 + 10 + c - 'a';
				  else
				    abort ();
				}
			      else
				phase2_ungetc (c);

			      c = n;
			    }
			  else
			    {
			      /* No hexadecimal digit: keep the backslash and
				 reread the 'x' and the character after it.  */
			      phase2_ungetc (c);
			      phase2_ungetc ('x');
			      c = '\\';
			    }
			  break;

			case '0': case '1': case '2': case '3':
			case '4': case '5': case '6': case '7':
			  /* Up to three octal digits.  */
			  {
			    int n = c - '0';

			    c = phase2_getc ();
			    if (c >= '0' && c <= '7')
			      {
				n = n * 8 + c - '0';

				c = phase2_getc ();
				if (c >= '0' && c <= '7')
				  n = n * 8 + c - '0';
				else
				  phase2_ungetc (c);
			      }
			    else
			      phase2_ungetc (c);

			    c = n;
			  }
			  break;
			}
		    }
		  if (wp->type == t_string)
		    {
		      grow_token (wp->token);
		      wp->token->chars[wp->token->charcount++] =
			(unsigned char) c;
		    }
		}
	      /* The result is a literal string.  Don't change wp->type.  */
	      continue;
	    }
	  else if (c2 == '"' && !open_doublequote)
	    {
	      /* Bash builtin for internationalized string.  */
	      lex_pos_ty pos;
	      struct token string;

	      saw_opening_doublequote ();
	      pos.file_name = logical_file_name;
	      pos.line_number = line_number;
	      init_token (&string);
	      /* Collect everything up to the closing double-quote.  */
	      for (;;)
		{
		  c = phase2_getc ();
		  if (c == EOF)
		    break;
		  if (c == '"')
		    {
		      saw_closing_doublequote ();
		      break;
		    }
		  grow_token (&string);
		  string.chars[string.charcount++] = (unsigned char) c;
		}
	      remember_a_message (mlp, string_of_token (&string),
				  context, &pos);
	      free_token (&string);

	      error_with_progname = false;
	      error (0, 0, _("%s:%lu: warning: the syntax $\"...\" is deprecated due to security reasons; use eval_gettext instead"),
		     pos.file_name, (unsigned long) pos.line_number);
	      error_with_progname = true;

	      /* The result at runtime is not constant. Therefore we
		 change wp->type.  */
	    }
	  else
	    phase2_ungetc (c2);
	  /* Every '$' construct except $'...' makes the word
	     non-constant.  */
	  wp->type = t_other;
	  continue;
	}

      if (c == '\'')
	{
	  if (!open_singlequote)
	    {
	      /* Handle an opening single quote.  */
	      saw_opening_singlequote ();
	    }
	  else
	    {
	      /* Handle a closing single quote.  */
	      saw_closing_singlequote ();
	    }
	  continue;
	}

      if (c == '"')
	{
	  if (!open_doublequote)
	    {
	      /* Handle an opening double quote.  */
	      saw_opening_doublequote ();
	    }
	  else
	    {
	      /* Handle a closing double quote.  */
	      saw_closing_doublequote ();
	    }
	  continue;
	}

      if (c == OPENING_BACKQUOTE)
	{
	  /* Handle an opening backquote.  */
	  saw_opening_backquote ();

	  read_command_list (CLOSING_BACKQUOTE, context);

	  wp->type = t_other;
	  continue;
	}
      /* NOTE(review): if this is reached while looking_for is not
	 CLOSING_BACKQUOTE, the character is pushed back below and the next
	 read_word call sees it again - confirm that malformed input cannot
	 make the caller loop.  */
      if (c == CLOSING_BACKQUOTE)
	break;

      /* Unquoted whitespace or an operator character ends the word.  */
      if (!open_singlequote && !open_doublequote
	  && (is_whitespace (c) || is_operator_start (c)))
	break;

      /* Converting to 'unsigned char' strips the QUOTED marker.  */
      if (wp->type == t_string)
	{
	  grow_token (wp->token);
	  wp->token->chars[wp->token->charcount++] = (unsigned char) c;
	}
    }

  /* Give back the character that ended the word; this does nothing for
     EOF.  */
  phase2_ungetc (c);

  /* Only constant strings keep their token.  */
  if (wp->type != t_string)
    {
      free_token (wp->token);
      free (wp->token);
    }
  last_non_comment_line = line_number;
}


/* Read the next command.
   'looking_for' denotes a parse terminator, either CLOSING_BACKQUOTE, ')'
   or '\0'.
   'outer_context' is combined, through inherited_context, with the flag
   context of each argument position.
   Messages found at the argument positions of registered keywords (or in
   all constant words, if extract_all is set) are added to mlp.
   Returns the type of the word that terminated the command.  */
static enum word_type
read_command (int looking_for, flag_context_ty outer_context)
{
  /* Read the words that make up the command.
     Here we completely ignore field splitting at whitespace and wildcard
     expansions; i.e. we assume that the source is written in such a way that
     every word in the program determines exactly one word in the resulting
     command.
     But we do not require that the 'gettext'/'ngettext' command is the
     first in the command; this is because 1. we want to allow for prefixes
     like "$verbose" that may expand to nothing, and 2. it's a big effort
     to know where a command starts in a $(for ...) or $(case ...) compound
     command.  */
  int arg = 0;			/* Current argument number.  */
  bool arg_of_redirect = false;	/* True right after a redirection operator.  */
  /* Assigned whenever a word is treated as the function position, which
     happens before arg becomes nonzero.  */
  flag_context_list_iterator_ty context_iter;
  /* Both argnum1 and argnum2 are negative while no keyword is active; every
     word is then treated as a possible function name.  */
  int argnum1 = -1;		/* First string position.  */
  int argnum2 = -1;		/* Plural string position.  */
  message_ty *plural_mp = NULL;	/* Remember the msgid.  */

  for (;;)
    {
      struct word inner;
      flag_context_ty inner_context;

      if (arg == 0)
	inner_context = null_context;
      else
	inner_context =
	  inherited_context (outer_context,
			     flag_context_list_iterator_advance (
			       &context_iter));

      read_word (&inner, looking_for, inner_context);

      /* Recognize end of command.  */
      /* None of these word types owns a token, so there is nothing to
	 free before returning.  */
      if (inner.type == t_separator
	  || inner.type == t_backquote || inner.type == t_paren
	  || inner.type == t_eof)
	return inner.type;

      if (extract_all)
	{
	  if (inner.type == t_string)
	    {
	      lex_pos_ty pos;

	      pos.file_name = logical_file_name;
	      pos.line_number = inner.line_number_at_start;
	      remember_a_message (mlp, string_of_word (&inner),
				  inner_context, &pos);
	    }
	}

      if (arg_of_redirect)
	{
	  /* Ignore arguments of redirection operators.  */
	  arg_of_redirect = false;
	}
      else if (inner.type == t_redirect)
	{
	  /* Ignore this word and the following one.  */
	  arg_of_redirect = true;
	}
      else
	{
	  if (argnum1 < 0 && argnum2 < 0)
	    {
	      /* This is the function position.  */
	      arg = 0;
	      if (inner.type == t_string)
		{
		  char *function_name = string_of_word (&inner);
		  void *keyword_value;

		  if (find_entry (&keywords,
				  function_name, strlen (function_name),
				  &keyword_value)
		      == 0)
		    {
		      /* Unpack the value stored by x_sh_keyword: argnum1 in
			 the low 10 bits, argnum2 in the bits above.  */
		      argnum1 = (int) (long) keyword_value & ((1 << 10) - 1);
		      argnum2 = (int) (long) keyword_value >> 10;
		    }

		  context_iter =
		    flag_context_list_iterator (
		      flag_context_list_table_lookup (
			flag_context_list_table,
			function_name, strlen (function_name)));

		  free (function_name);
		}
	      else
		context_iter = null_context_list_iterator;
	    }
	  else
	    {
	      /* These are the argument positions.
		 Extract a string if we have reached the right
		 argument position.  */
	      if (arg == argnum1)
		{
		  if (inner.type == t_string)
		    {
		      lex_pos_ty pos;
		      message_ty *mp;

		      pos.file_name = logical_file_name;
		      pos.line_number = inner.line_number_at_start;
		      mp = remember_a_message (mlp, string_of_word (&inner),
					       inner_context, &pos);
		      /* Keep the message only if a plural form is expected
			 at a later argument position.  */
		      if (argnum2 > 0)
			plural_mp = mp;
		    }
		}
	      else if (arg == argnum2)
		{
		  if (inner.type == t_string && plural_mp != NULL)
		    {
		      lex_pos_ty pos;

		      pos.file_name = logical_file_name;
		      pos.line_number = inner.line_number_at_start;
		      remember_a_message_plural (plural_mp, string_of_word (&inner),
						 inner_context, &pos);
		    }
		}

	      if (arg >= argnum1 && arg >= argnum2)
		{
		  /* Stop looking for arguments of the last function_name.  */
		  /* FIXME: What about context_iter?  */
		  argnum1 = -1;
		  argnum2 = -1;
		  plural_mp = NULL;
		}
	    }

	  arg++;
	}

      free_word (&inner);
    }
}


/* Read a list of commands.
   'looking_for' denotes a parse terminator, either CLOSING_BACKQUOTE, ')'
   or '\0'.
   Commands are read one after the other as long as they end with a command
   separator.
   Returns the type of the word that terminated the command list.  */
static enum word_type
read_command_list (int looking_for, flag_context_ty outer_context)
{
  enum word_type terminator;

  do
    terminator = read_command (looking_for, outer_context);
  while (terminator == t_separator);

  return terminator;
}


/* Scan a shell script and add its translatable strings to MDLP.
   F is the input stream, REAL_FILENAME the name used in error messages
   about the input file, LOGICAL_FILENAME the name used to label the
   extracted messages, and FLAG_TABLE the flag context lookup table.
   The messages are added to the first message list of MDLP.
   All parser state is reset on entry, so that every file is parsed
   independently of the files processed before.  */
void
extract_sh (FILE *f,
	    const char *real_filename, const char *logical_filename,
	    flag_context_list_table_ty *flag_table,
	    msgdomain_list_ty *mdlp)
{
  mlp = mdlp->item[0]->messages;

  fp = f;
  real_file_name = real_filename;
  logical_file_name = xstrdup (logical_filename);
  line_number = 1;

  /* Discard characters that might still be pushed back from a previously
     processed file; they must not be read as part of this one.  */
  phase1_pushback_length = 0;
  phase2_pushback_length = 0;

  last_comment_line = -1;
  last_non_comment_line = -1;

  nested_backquotes = 0;
  open_doublequotes_mask = 0;
  open_doublequote = false;
  open_singlequote = false;

  flag_context_list_table = flag_table;

  init_keywords ();

  /* Eat tokens until eof is seen.  */
  read_command_list ('\0', null_context);

  /* Do not keep references to the caller's stream and file names.  */
  fp = NULL;
  real_file_name = NULL;
  logical_file_name = NULL;
  line_number = 0;
}