sentence.rb   [plain text]


# == sentence library
#
# = Features
#
# * syntax based sentences generation
# * sentence operations such as substitution.
#
# = Example
#
# Some arithmetic expressions using "+", "-", "*" and "/" are generated as follows.
#
#  require 'sentence'
#  Sentence.each({
#    :exp => [["num"],
#             [:exp, "+", :exp],
#             [:exp, "-", :exp],
#             [:exp, "*", :exp],
#             [:exp, "/", :exp]]
#  }, :exp, 2) {|sent| p sent }
#  #=>
#  #<Sentence: "num">
#  #<Sentence: ("num") "+" ("num")>
#  #<Sentence: ("num") "+" (("num") "+" ("num"))>
#  #<Sentence: ("num") "+" (("num") "-" ("num"))>
#  #<Sentence: ("num") "+" (("num") "*" ("num"))>
#  #<Sentence: ("num") "+" (("num") "/" ("num"))>
#  #<Sentence: (("num") "+" ("num")) "+" ("num")>
#  ...
#
# Sentence.each takes 3 arguments.
# The first argument is the syntax for the expressions.
# The second argument, :exp, is a generating nonterminal.
# The third argument, 2, limits derivation to restrict results finitely.
#
# Some arithmetic expressions including parenthesis can be generated as follows.
#
#  syntax = {
#    :factor => [["n"],
#                ["(", :exp, ")"]],
#    :term => [[:factor],
#              [:term, "*", :factor],
#              [:term, "/", :factor]],
#    :exp => [[:term],
#             [:exp, "+", :term],
#             [:exp, "-", :term]]
#  }
#  Sentence.each(syntax, :exp, 2) {|sent| p sent }
#  #=>
#  #<Sentence: (("n"))>
#  #<Sentence: (("(" ((("n"))) ")"))>
#  #<Sentence: (("(" ((("(" ((("n"))) ")"))) ")"))>
#  #<Sentence: (("(" (((("n")) "*" ("n"))) ")"))>
#  #<Sentence: (("(" (((("n")) "/" ("n"))) ")"))>
#  #<Sentence: (("(" (((("n"))) "+" (("n"))) ")"))>
#  #<Sentence: (("(" (((("n"))) "-" (("n"))) ")"))>
#  #<Sentence: ((("n")) "*" ("n"))>
#  #<Sentence: ((("n")) "*" ("(" ((("n"))) ")"))>
#  ...
#
#  Sentence#to_s can be used to concatenate strings
#  in a sentence:
#
#  Sentence.each(syntax, :exp, 2) {|sent| p sent.to_s }
#  #=>
#  "n"
#  "(n)"
#  "((n))"
#  "(n*n)"
#  "(n/n)"
#  "(n+n)"
#  "(n-n)"
#  "n*n"
#  "n*(n)"
#  ...
#

# Sentence() instantiates a sentence object.
#
#  Sentence("foo", "bar")
#  #=> #<Sentence: "foo" "bar">
#
#  Sentence("foo", ["bar", "baz"])
#  #=> #<Sentence: "foo" ("bar" "baz")>
#
def Sentence(*ary)
  Sentence.new(ary)
end

# Sentence class represents a tree with string leaves.
#
class Sentence
  # _ary_ represents a tree.
  # It should be a possibly nested array which contains strings.
  #
  # Note that _ary_ is not copied.
  # Don't modify _ary_ after the sentence object is instantiated.
  #
  #  Sentence.new(["a", "pen"])
  #  #<Sentence: "a" "pen">
  #
  #  Sentence.new(["I", "have", ["a", "pen"]])
  #  #<Sentence: "I" "have" ("a" "pen")>
  #
  def initialize(ary)
    @sent = ary
  end

  # returns a string which is concatenation of all strings.
  # No separator is used.
  #
  #  Sentence("2", "+", "3").to_s
  #  "2+3"
  #
  #  Sentence("2", "+", ["3", "*", "5"]).to_s
  #  "2+3*5"
  #
  def to_s
    @sent.join('')
  end

  # returns a string which is concatenation of all strings separated by _sep_.
  # If _sep_ is not given, single space is used.
  #
  #  Sentence("I", "have", ["a", "pen"]).join
  #  "I have a pen"
  #
  #  Sentence("I", "have", ["a", "pen"]).join("/")
  #  "I/have/a/pen"
  #
  #  Sentence("a", [], "b").join("/")
  #  "a/b"
  #
  def join(sep=' ')
    @sent.flatten.join(sep)
  end

  # returns a tree as a nested array.
  #
  # Note that the result is not copied.
  # Don't modify the result.
  #
  #  Sentence(["foo", "bar"], "baz").to_a
  #  #=> [["foo", "bar"], "baz"]
  #
  def to_a
    @sent
  end

  # returns <i>i</i>th element as a sentence or string.
  #
  #  s = Sentence(["foo", "bar"], "baz")
  #  s    #=> #<Sentence: ("foo" "bar") "baz">
  #  s[0] #=> #<Sentence: "foo" "bar">
  #  s[1] #=> "baz"
  #
  def [](i)
    e = @sent[i]
    e.respond_to?(:to_ary) ? Sentence.new(e) : e
  end

  # returns the number of top level elements.
  #
  #  Sentence.new(%w[foo bar]).length
  #  #=> 2
  #
  #  Sentence(%w[2 * 7], "+", %w[3 * 5]).length
  #  #=> 3
  #
  def length
    @sent.length
  end

  # iterates over children.
  #
  #  Sentence(%w[2 * 7], "+", %w[3 * 5]).each {|v| p v }
  #  #=>
  #  #<Sentence: "2" "*" "7">
  #  "+"
  #  #<Sentence: "3" "*" "5">
  #
  def each # :yield: element
    @sent.each_index {|i|
      yield self[i]
    }
  end
  include Enumerable

  def inspect
    "#<#{self.class}: #{inner_inspect(@sent, '')}>"
  end

  # :stopdoc:
  def inner_inspect(ary, r)
    first = true
    ary.each {|obj|
      r << ' ' if !first
      first = false
      if obj.respond_to? :to_ary
        r << '('
        inner_inspect(obj, r)
        r << ')'
      else
        r << obj.inspect
      end
    }
    r
  end
  # :startdoc:

  # returns new sentence object which
  # _target_ is substituted by the block.
  #
  # Sentence#subst invokes <tt>_target_ === _string_</tt> for each
  # string in the sentence.
  # The strings which === returns true are substituted by the block.
  # The block is invoked with the substituting string.
  #
  #  Sentence.new(%w[2 + 3]).subst("+") { "*" }
  #  #<Sentence: "2" "*" "3">
  #
  #  Sentence.new(%w[2 + 3]).subst(/\A\d+\z/) {|s| ((s.to_i)*2).to_s }
  #  #=> #<Sentence: "4" "+" "6">
  #
  def subst(target, &b) # :yield: string
    Sentence.new(subst_rec(@sent, target, &b))
  end

  # :stopdoc:
  def subst_rec(obj, target, &b)
    if obj.respond_to? :to_ary
      a = []
      obj.each {|e| a << subst_rec(e, target, &b) }
      a
    elsif target === obj
      yield obj
    else
      obj
    end
  end
  # :startdoc:

  # find a subsentence and return it.
  # The block is invoked for each subsentence in preorder manner.
  # The first subsentence which the block returns true is returned.
  #
  #  Sentence(%w[2 * 7], "+", %w[3 * 5]).find_subtree {|s| s[1] == "*" }
  #  #=> #<Sentence: "2" "*" "7">
  #
  def find_subtree(&b) # :yield: sentence
    find_subtree_rec(@sent, &b)
  end

  # :stopdoc:
  def find_subtree_rec(obj, &b)
    if obj.respond_to? :to_ary
      s = Sentence.new(obj)
      if b.call s
        return s
      else
        obj.each {|e|
          r = find_subtree_rec(e, &b)
          return r if r
        }
      end
    end
    nil
  end
  # :startdoc:

  # returns a new sentence object which expands according to the condition
  # given by the block.
  #
  # The block is invoked for each subsentence.
  # The subsentences which the block returns true are
  # expanded into parent.
  #
  #  s = Sentence(%w[2 * 7], "+", %w[3 * 5])
  #  #=> #<Sentence: ("2" "*" "7") "+" ("3" "*" "5")>
  #
  #  s.expand { true }
  #  #=> #<Sentence: "2" "*" "7" "+" "3" "*" "5">
  #
  #  s.expand {|s| s[0] == "3" }
  #  #=> #<Sentence: (("2" "*" "7") "+" "3" "*" "5")>
  #
  def expand(&b) # :yield: sentence
    Sentence.new(expand_rec(@sent, &b))
  end

  # :stopdoc:
  def expand_rec(obj, r=[], &b)
    if obj.respond_to? :to_ary
      obj.each {|o|
        s = Sentence.new(o)
        if b.call s
          expand_rec(o, r, &b)
        else
          a = []
          expand_rec(o, a, &b)
          r << a
        end
      }
    else
      r << obj
    end
    r
  end
  # :startdoc:

  # Sentence.each generates sentences
  # by deriving the start symbol _sym_ using _syntax_.
  # The derivation is restricted by an positive integer _limit_ to
  # avoid infinite generation.
  #
  # Sentence.each yields the block with a generated sentence.
  #
  #  Sentence.each({
  #    :exp => [["n"],
  #             [:exp, "+", :exp],
  #             [:exp, "*", :exp]]
  #    }, :exp, 1) {|sent| p sent }
  #  #=>
  #  #<Sentence: "n">
  #  #<Sentence: ("n") "+" ("n")>
  #  #<Sentence: ("n") "*" ("n")>
  #
  #  Sentence.each({
  #    :exp => [["n"],
  #             [:exp, "+", :exp],
  #             [:exp, "*", :exp]]
  #    }, :exp, 2) {|sent| p sent }
  #  #=>
  #  #<Sentence: "n">
  #  #<Sentence: ("n") "+" ("n")>
  #  #<Sentence: ("n") "+" (("n") "+" ("n"))>
  #  #<Sentence: ("n") "+" (("n") "*" ("n"))>
  #  #<Sentence: (("n") "+" ("n")) "+" ("n")>
  #  #<Sentence: (("n") "*" ("n")) "+" ("n")>
  #  #<Sentence: ("n") "*" ("n")>
  #  #<Sentence: ("n") "*" (("n") "+" ("n"))>
  #  #<Sentence: ("n") "*" (("n") "*" ("n"))>
  #  #<Sentence: (("n") "+" ("n")) "*" ("n")>
  #  #<Sentence: (("n") "*" ("n")) "*" ("n")>
  #
  def Sentence.each(syntax, sym, limit)
    Gen.new(syntax).each_tree(sym, limit) {|tree|
      yield Sentence.new(tree)
    }
  end

  # Sentence.expand_syntax returns an expanded syntax:
  # * No rule derives to empty sequence
  # * Underivable rule simplified
  # * No channel rule
  # * Symbols which has zero or one choices are not appered in rhs.
  #
  # Note that the rules which can derive empty and non-empty
  # sequences are modified to derive only non-empty sequences.
  #
  #  Sentence.expand_syntax({
  #    :underivable1 => [],
  #    :underivable2 => [[:underivable1]],
  #    :underivable3 => [[:underivable3]],
  #    :empty_only1 => [[]],
  #    :empty_only2 => [[:just_empty1, :just_empty1]],
  #    :empty_or_not => [[], ["foo"]],
  #    :empty_or_not_2 => [[:empty_or_not, :empty_or_not]],
  #    :empty_or_not_3 => [[:empty_or_not, :empty_or_not, :empty_or_not]],
  #    :empty_or_not_4 => [[:empty_or_not_2, :empty_or_not_2]],
  #    :channel1 => [[:channeled_data]],
  #    :channeled_data => [["a", "b"], ["c", "d"]],
  #    :single_choice => [["single", "choice"]],
  #    :single_choice_2 => [[:single_choice, :single_choice]],
  #  })
  #  #=>
  #  {
  #   :underivable1=>[], # underivable rules are simplified to [].
  #   :underivable2=>[],
  #   :underivable3=>[],
  #   :empty_only1=>[], # derivation to empty sequence are removed.
  #   :empty_only2=>[],
  #   :empty_or_not=>[["foo"]], # empty sequences are removed too.
  #   :empty_or_not_2=>[["foo"], ["foo", "foo"]],
  #   :empty_or_not_3=>[["foo"], ["foo", "foo"], ["foo", "foo", "foo"]],
  #   :empty_or_not_4=> [["foo"], ["foo", "foo"], [:empty_or_not_2, :empty_or_not_2]],
  #   :channel1=>[["a", "b"], ["c", "d"]], # channel rules are removed.
  #   :channeled_data=>[["a", "b"], ["c", "d"]],
  #   :single_choice=>[["single", "choice"]], # single choice rules are expanded.
  #   :single_choice_2=>[["single", "choice", "single", "choice"]],
  #  }
  #
  #  Sentence.expand_syntax({
  #    :factor => [["n"],
  #                ["(", :exp, ")"]],
  #    :term => [[:factor],
  #              [:term, "*", :factor],
  #              [:term, "/", :factor]],
  #    :exp => [[:term],
  #             [:exp, "+", :term],
  #             [:exp, "-", :term]]
  #  })
  #  #=>
  #  {:exp=> [["n"],
  #           ["(", :exp, ")"],
  #           [:exp, "+", :term],
  #           [:exp, "-", :term],
  #           [:term, "*", :factor],
  #           [:term, "/", :factor]],
  #   :factor=> [["n"],
  #              ["(", :exp, ")"]],
  #   :term=> [["n"],
  #            ["(", :exp, ")"],
  #            [:term, "*", :factor],
  #            [:term, "/", :factor]]
  #  }
  #
  def Sentence.expand_syntax(syntax)
    Sentence::Gen.expand_syntax(syntax)
  end

  # :stopdoc:
  class Gen
    def Gen.each_tree(syntax, sym, limit, &b)
      Gen.new(syntax).each_tree(sym, limit, &b)
    end

    def Gen.each_string(syntax, sym, limit, &b)
      Gen.new(syntax).each_string(sym, limit, &b)
    end

    def initialize(syntax)
      @syntax = syntax
    end

    def self.expand_syntax(syntax)
      syntax = simplify_underivable_rules(syntax)
      syntax = simplify_emptyonly_rules(syntax)
      syntax = make_rules_no_empseq(syntax)
      syntax = expand_channel_rules(syntax)

      syntax = expand_noalt_rules(syntax)
      syntax = reorder_rules(syntax)
    end

    def self.simplify_underivable_rules(syntax)
      deribable_syms = {}
      changed = true
      while changed
        changed = false
        syntax.each {|sym, rules|
          next if deribable_syms[sym]
          rules.each {|rhs|
            if rhs.all? {|e| String === e || deribable_syms[e] }
              deribable_syms[sym] = true
              changed = true
              break
            end
          }
        }
      end
      result = {}
      syntax.each {|sym, rules|
        if deribable_syms[sym]
          rules2 = []
          rules.each {|rhs|
            rules2 << rhs if rhs.all? {|e| String === e || deribable_syms[e] }
          }
          result[sym] = rules2.uniq
        else
          result[sym] = []
        end
      }
      result
    end

    def self.simplify_emptyonly_rules(syntax)
      justempty_syms = {}
      changed = true
      while changed
        changed = false
        syntax.each {|sym, rules|
          next if justempty_syms[sym]
          if !rules.empty? && rules.all? {|rhs| rhs.all? {|e| justempty_syms[e] } }
            justempty_syms[sym] = true
            changed = true
          end
        }
      end
      result = {}
      syntax.each {|sym, rules|
        result[sym] = rules.map {|rhs| rhs.reject {|e| justempty_syms[e] } }.uniq
      }
      result
    end

    def self.expand_emptyable_syms(rhs, emptyable_syms)
      if rhs.empty?
        yield []
      else
        first = rhs[0]
        rest = rhs[1..-1]
        if emptyable_syms[first]
          expand_emptyable_syms(rest, emptyable_syms) {|rhs2|
            yield [first] + rhs2
            yield rhs2
          }
        else
          expand_emptyable_syms(rest, emptyable_syms) {|rhs2|
            yield [first] + rhs2
          }
        end
      end
    end

    def self.make_rules_no_empseq(syntax)
      emptyable_syms = {}
      changed = true
      while changed
        changed = false
        syntax.each {|sym, rules|
          next if emptyable_syms[sym]
          rules.each {|rhs|
            if rhs.all? {|e| emptyable_syms[e] }
              emptyable_syms[sym] = true
              changed = true
              break
            end
          }
        }
      end
      result = {}
      syntax.each {|sym, rules|
        rules2 = []
        rules.each {|rhs|
          expand_emptyable_syms(rhs, emptyable_syms) {|rhs2|
            next if rhs2.empty?
            rules2 << rhs2
          }
        }
        result[sym] = rules2.uniq
      }
      result
    end

    def self.expand_channel_rules(syntax)
      channel_rules = {}
      syntax.each {|sym, rules|
        channel_rules[sym] = {sym=>true}
        rules.each {|rhs|
          if rhs.length == 1 && Symbol === rhs[0]
            channel_rules[sym][rhs[0]] = true
          end
        }
      }
      changed = true
      while changed
        changed = false
        channel_rules.each {|sym, set|
          n1 = set.size
          set.keys.each {|s|
            set.update(channel_rules[s])
          }
          n2 = set.size
          changed = true if n1 < n2
        }
      end
      result = {}
      syntax.each {|sym, rules|
        rules2 = []
        channel_rules[sym].each_key {|s|
          syntax[s].each {|rhs|
            unless rhs.length == 1 && Symbol === rhs[0]
              rules2 << rhs
            end
          }
        }
        result[sym] = rules2.uniq
      }
      result
    end

    def self.expand_noalt_rules(syntax)
      noalt_syms = {}
      syntax.each {|sym, rules|
        if rules.length == 1
          noalt_syms[sym] = true
        end
      }
      result = {}
      syntax.each {|sym, rules|
        rules2 = []
        rules.each {|rhs|
          rhs2 = []
          rhs.each {|e|
            if noalt_syms[e]
              rhs2.concat syntax[e][0]
            else
              rhs2 << e
            end
          }
          rules2 << rhs2
        }
        result[sym] = rules2.uniq
      }
      result
    end

    def self.reorder_rules(syntax)
      result = {}
      syntax.each {|sym, rules|
        result[sym] = rules.sort_by {|rhs|
          [rhs.find_all {|e| Symbol === e }.length, rhs.length]
        }
      }
      result
    end

    def each_tree(sym, limit)
      generate_from_sym(sym, limit) {|_, tree|
        yield tree
      }
      nil
    end

    def each_string(sym, limit)
      generate_from_sym(sym, limit) {|_, tree|
        yield [tree].join('')
      }
      nil
    end

    def generate_from_sym(sym, limit, &b)
      return if limit < 0
      if String === sym
        yield limit, sym
      else
        rules = @syntax[sym]
        raise "undefined rule: #{sym}" if !rules
        rules.each {|rhs|
          if rhs.length == 1 || rules.length == 1
            limit1 = limit
          else
            limit1 = limit-1
          end
          generate_from_rhs(rhs, limit1, &b)
        }
      end
      nil
    end

    def generate_from_rhs(rhs, limit)
      return if limit < 0
      if rhs.empty?
        yield limit, []
      else
        generate_from_sym(rhs[0], limit) {|limit1, child|
          generate_from_rhs(rhs[1..-1], limit1) {|limit2, arr|
            yield limit2, [child, *arr]
          }
        }
      end
      nil
    end
  end
  # :startdoc:

end