parser.rb [plain text]

require 'strscan'

##
# A recursive-descent parser for RDoc markup.
#
# The parser tokenizes an input string then parses the tokens into a Document.
# Documents can be converted into output formats by writing a visitor like
# RDoc::Markup::ToHTML.
#
# The parser only handles the block-level constructs Paragraph, List,
# ListItem, Heading, Verbatim, BlankLine and Rule.  Inline markup such as
# <tt>\+blah\+</tt> is handled separately by RDoc::Markup::AttributeManager.
#
# To see what markup the Parser implements read RDoc.  To see how to use
# RDoc markup to format text in your program read RDoc::Markup.

class RDoc::Markup::Parser

  include RDoc::Text

  ##
  # List token types

  LIST_TOKENS = [
    :BULLET,
    :LABEL,
    :LALPHA,
    :NOTE,
    :NUMBER,
    :UALPHA,
  ]

  ##
  # Parser error subclass

  class Error < RuntimeError; end

  ##
  # Raised when the parser is unable to handle the given markup

  class ParseError < Error; end

  ##
  # Enables display of debugging information

  attr_accessor :debug

  ##
  # Token accessor

  attr_reader :tokens

  ##
  # Parses +str+ into a Document.
  #
  # Use RDoc::Markup#parse instead of this method.

  def self.parse str
    parser = new
    parser.tokenize str
    doc = RDoc::Markup::Document.new
    parser.parse doc
  end

  ##
  # Returns a token stream for +str+, for testing

  def self.tokenize str
    parser = new
    parser.tokenize str
    parser.tokens
  end

  ##
  # Creates a new Parser.  See also ::parse

  def initialize
    @binary_input   = nil
    @current_token  = nil
    @debug          = false
    @have_encoding  = Object.const_defined? :Encoding
    @have_byteslice = ''.respond_to? :byteslice
    @input          = nil
    @input_encoding = nil
    @line           = 0
    @line_pos       = 0
    @s              = nil
    @tokens         = []
  end

  ##
  # Builds a Heading of +level+

  def build_heading level
    type, text, = get

    text = case type
           when :TEXT then
             skip :NEWLINE
             text
           else
             unget
             ''
           end

    RDoc::Markup::Heading.new level, text
  end

  ##
  # Builds a List flush to +margin+

  def build_list margin
    p :list_start => margin if @debug

    list = RDoc::Markup::List.new
    label = nil

    until @tokens.empty? do
      type, data, column, = get

      case type
      when *LIST_TOKENS then
        if column < margin || (list.type && list.type != type) then
          unget
          break
        end

        list.type = type
        peek_type, _, column, = peek_token

        case type
        when :NOTE, :LABEL then
          label = [] unless label

          if peek_type == :NEWLINE then
            # description not on the same line as LABEL/NOTE
            # skip the trailing newline & any blank lines below
            while peek_type == :NEWLINE
              get
              peek_type, _, column, = peek_token
            end

            # we may be:
            #   - at end of stream
            #   - at a column < margin:
            #         [text]
            #       blah blah blah
            #   - at the same column, but with a different type of list item
            #       [text]
            #       * blah blah
            #   - at the same column, with the same type of list item
            #       [one]
            #       [two]
            # In all cases, we have an empty description.
            # In the last case only, we continue.
            if peek_type.nil? || column < margin then
              empty = true
            elsif column == margin then
              case peek_type
              when type
                empty = :continue
              when *LIST_TOKENS
                empty = true
              else
                empty = false
              end
            else
              empty = false
            end

            if empty then
              label << data
              next if empty == :continue
              break
            end
          end
        else
          data = nil
        end

        if label then
          data = label << data
          label = nil
        end

        list_item = RDoc::Markup::ListItem.new data
        parse list_item, column
        list << list_item

      else
        unget
        break
      end
    end

    p :list_end => margin if @debug

    if list.empty? then
      return nil unless label
      return nil unless [:LABEL, :NOTE].include? list.type

      list_item = RDoc::Markup::ListItem.new label, RDoc::Markup::BlankLine.new
      list << list_item
    end

    list
  end

  ##
  # Builds a Paragraph that is flush to +margin+

  def build_paragraph margin
    p :paragraph_start => margin if @debug

    paragraph = RDoc::Markup::Paragraph.new

    until @tokens.empty? do
      type, data, column, = get

      if type == :TEXT and column == margin then
        paragraph << data

        break if peek_token.first == :BREAK

        data << ' ' if skip :NEWLINE
      else
        unget
        break
      end
    end

    paragraph.parts.last.sub!(/ \z/, '') # cleanup

    p :paragraph_end => margin if @debug

    paragraph
  end

  ##
  # Builds a Verbatim that is indented from +margin+.
  #
  # The verbatim block is shifted left (the least indented lines start in
  # column 0).  Each part of the verbatim is one line of text, always
  # terminated by a newline.  Blank lines always consist of a single newline
  # character, and there is never a single newline at the end of the verbatim.

  def build_verbatim margin
    p :verbatim_begin => margin if @debug
    verbatim = RDoc::Markup::Verbatim.new

    min_indent = nil
    generate_leading_spaces = true
    line = ''

    until @tokens.empty? do
      type, data, column, = get

      if type == :NEWLINE then
        line << data
        verbatim << line
        line = ''
        generate_leading_spaces = true
        next
      end

      if column <= margin
        unget
        break
      end

      if generate_leading_spaces then
        indent = column - margin
        line << ' ' * indent
        min_indent = indent if min_indent.nil? || indent < min_indent
        generate_leading_spaces = false
      end

      case type
      when :HEADER then
        line << '=' * data
        _, _, peek_column, = peek_token
        peek_column ||= column + data
        indent = peek_column - column - data
        line << ' ' * indent
      when :RULE then
        width = 2 + data
        line << '-' * width
        _, _, peek_column, = peek_token
        peek_column ||= column + width
        indent = peek_column - column - width
        line << ' ' * indent
      when :BREAK, :TEXT then
        line << data
      else # *LIST_TOKENS
        list_marker = case type
                      when :BULLET then data
                      when :LABEL  then "[#{data}]"
                      when :NOTE   then "#{data}::"
                      else # :LALPHA, :NUMBER, :UALPHA
                        "#{data}."
                      end
        line << list_marker
        peek_type, _, peek_column = peek_token
        unless peek_type == :NEWLINE then
          peek_column ||= column + list_marker.length
          indent = peek_column - column - list_marker.length
          line << ' ' * indent
        end
      end

    end

    verbatim << line << "\n" unless line.empty?
    verbatim.parts.each { |p| p.slice!(0, min_indent) unless p == "\n" } if min_indent > 0
    verbatim.normalize

    p :verbatim_end => margin if @debug

    verbatim
  end

  ##
  # The character offset for the input string at the given +byte_offset+

  def char_pos byte_offset
    if @have_byteslice then
      @input.byteslice(0, byte_offset).length
    elsif @have_encoding then
      matched = @binary_input[0, byte_offset]
      matched.force_encoding @input_encoding
      matched.length
    else
      byte_offset
    end
  end

  ##
  # Pulls the next token from the stream.

  def get
    @current_token = @tokens.shift
    p :get => @current_token if @debug
    @current_token
  end

  ##
  # Parses the tokens into an array of RDoc::Markup::XXX objects,
  # and appends them to the passed +parent+ RDoc::Markup::YYY object.
  #
  # Exits at the end of the token stream, or when it encounters a token
  # in a column less than +indent+ (unless it is a NEWLINE).
  #
  # Returns +parent+.

  def parse parent, indent = 0
    p :parse_start => indent if @debug

    until @tokens.empty? do
      type, data, column, = get

      case type
      when :BREAK then
        parent << RDoc::Markup::BlankLine.new
        skip :NEWLINE, false
        next
      when :NEWLINE then
        # trailing newlines are skipped below, so this is a blank line
        parent << RDoc::Markup::BlankLine.new
        skip :NEWLINE, false
        next
      end

      # indentation change: break or verbatim
      if column < indent then
        unget
        break
      elsif column > indent then
        unget
        parent << build_verbatim(indent)
        next
      end

      # indentation is the same
      case type
      when :HEADER then
        parent << build_heading(data)
      when :RULE then
        parent << RDoc::Markup::Rule.new(data)
        skip :NEWLINE
      when :TEXT then
        unget
        parent << build_paragraph(indent)
      when *LIST_TOKENS then
        unget
        parent << build_list(indent)
      else
        type, data, column, line = @current_token
        raise ParseError, "Unhandled token #{type} (#{data.inspect}) at #{line}:#{column}"
      end
    end

    p :parse_end => indent if @debug

    parent

  end

  ##
  # Returns the next token on the stream without modifying the stream

  def peek_token
    token = @tokens.first || []
    p :peek => token if @debug
    token
  end

  ##
  # Creates the StringScanner

  def setup_scanner input
    @line     = 0
    @line_pos = 0
    @input    = input.dup

    if @have_encoding and not @have_byteslice then
      @input_encoding = @input.encoding
      @binary_input   = @input.force_encoding Encoding::BINARY
    end

    @s = StringScanner.new input
  end

  ##
  # Skips the next token if its type is +token_type+.
  #
  # Optionally raises an error if the next token is not of the expected type.

  def skip token_type, error = true
    type, = get
    return unless type # end of stream
    return @current_token if token_type == type
    unget
    raise ParseError, "expected #{token_type} got #{@current_token.inspect}" if error
  end

  ##
  # Turns text +input+ into a stream of tokens

  def tokenize input
    setup_scanner input

    until @s.eos? do
      pos = @s.pos

      # leading spaces will be reflected by the column of the next token
      # the only thing we loose are trailing spaces at the end of the file
      next if @s.scan(/ +/)

      # note: after BULLET, LABEL, etc.,
      # indent will be the column of the next non-newline token

      @tokens << case
                 # [CR]LF => :NEWLINE
                 when @s.scan(/\r?\n/) then
                   token = [:NEWLINE, @s.matched, *token_pos(pos)]
                   @line_pos = char_pos @s.pos
                   @line += 1
                   token
                 # === text => :HEADER then :TEXT
                 when @s.scan(/(=+)(\s*)/) then
                   level = @s[1].length
                   header = [:HEADER, level, *token_pos(pos)]

                   if @s[2] =~ /^\r?\n/ then
                     @s.pos -= @s[2].length
                     header
                   else
                     pos = @s.pos
                     @s.scan(/.*/)
                     @tokens << header
                     [:TEXT, @s.matched.sub(/\r$/, ''), *token_pos(pos)]
                   end
                 # --- (at least 3) and nothing else on the line => :RULE
                 when @s.scan(/(-{3,}) *\r?$/) then
                   [:RULE, @s[1].length - 2, *token_pos(pos)]
                 # * or - followed by white space and text => :BULLET
                 when @s.scan(/([*-]) +(\S)/) then
                   @s.pos -= @s[2].bytesize # unget \S
                   [:BULLET, @s[1], *token_pos(pos)]
                 # A. text, a. text, 12. text => :UALPHA, :LALPHA, :NUMBER
                 when @s.scan(/([a-z]|\d+)\. +(\S)/i) then
                   # FIXME if tab(s), the column will be wrong
                   # either support tabs everywhere by first expanding them to
                   # spaces, or assume that they will have been replaced
                   # before (and provide a check for that at least in debug
                   # mode)
                   list_label = @s[1]
                   @s.pos -= @s[2].bytesize # unget \S
                   list_type =
                     case list_label
                     when /[a-z]/ then :LALPHA
                     when /[A-Z]/ then :UALPHA
                     when /\d/    then :NUMBER
                     else
                       raise ParseError, "BUG token #{list_label}"
                     end
                   [list_type, list_label, *token_pos(pos)]
                 # [text] followed by spaces or end of line => :LABEL
                 when @s.scan(/\[(.*?)\]( +|\r?$)/) then
                   [:LABEL, @s[1], *token_pos(pos)]
                 # text:: followed by spaces or end of line => :NOTE
                 when @s.scan(/(.*?)::( +|\r?$)/) then
                   [:NOTE, @s[1], *token_pos(pos)]
                 # anything else: :TEXT
                 else @s.scan(/(.*?)(  )?\r?$/)
                   token = [:TEXT, @s[1], *token_pos(pos)]

                   if @s[2] then
                     @tokens << token
                     [:BREAK, @s[2], *token_pos(pos + @s[1].length)]
                   else
                     token
                   end
                 end
    end

    self
  end

  ##
  # Calculates the column (by character) and line of the current token from
  # +scanner+ based on +byte_offset+.

  def token_pos byte_offset
    offset = char_pos byte_offset

    [offset - @line_pos, @line]
  end

  ##
  # Returns the current token to the token stream

  def unget
    token = @current_token
    p :unget => token if @debug
    raise Error, 'too many #ungets' if token == @tokens.first
    @tokens.unshift token if token
  end

end