class Rouge::RegexLexer

@abstract A stateful lexer that uses sets of regular expressions to tokenize a string. Most lexers are subclasses of RegexLexer.

Constants

MAX_NULL_SCANS

The number of successive scans permitted without consuming the input stream. If this is exceeded, the match fails.

Public Class Methods

append(name, &b) click to toggle source
# File lib/rouge/regex_lexer.rb, line 195
# Rebuild an existing state definition by handing the block to the
# definition's `appended` method, then register the result through
# `replace_state`.
#
# @param name [#to_s] name of an already defined state
# @raise [RuntimeError] if no definition is registered under that name
def self.append(name, &b)
  key = name.to_s
  definition = state_definitions[key]
  raise "no such state #{key.inspect}" unless definition

  replace_state(key, definition.appended(&b))
end
get_state(name) click to toggle source

@private

# File lib/rouge/regex_lexer.rb, line 202
# Resolve a state name to a State object.
#
# An argument that already is a State is returned untouched. Otherwise the
# compiled state is looked up in `states` under the Symbol form of the name;
# on a miss, the definition stored in `state_definitions` under the String
# form of the name is compiled with `to_state(self)` and cached.
#
# @private
# @param name [State, #to_sym, #to_s]
# @raise [RuntimeError] if no definition exists for the name
def self.get_state(name)
  return name if name.is_a?(State)

  cache_key = name.to_sym
  cached = states[cache_key]
  return cached if cached

  definition = state_definitions[name.to_s]
  raise "unknown state: #{name.inspect}" unless definition

  states[cache_key] = definition.to_state(self)
end
prepend(name, &b) click to toggle source
# File lib/rouge/regex_lexer.rb, line 189
# Rebuild an existing state definition by handing the block to the
# definition's `prepended` method, then register the result through
# `replace_state`.
#
# @param name [#to_s] name of an already defined state
# @raise [RuntimeError] if no definition is registered under that name
def self.prepend(name, &b)
  key = name.to_s
  definition = state_definitions[key]
  raise "no such state #{key.inspect}" unless definition

  replace_state(key, definition.prepended(&b))
end
replace_state(name, new_defn) click to toggle source
# File lib/rouge/regex_lexer.rb, line 162
# Install a new definition for a state and drop its compiled, cached form.
#
# The two hashes use different key types: `get_state` caches compiled
# states in `states` under `name.to_sym` and reads definitions from
# `state_definitions` under `name.to_s`. The name is therefore normalized
# separately for each hash; using the raw argument for both would leave a
# stale compiled state in the cache (String argument) or store the
# definition under a key that is never looked up (Symbol argument).
#
# @param name [#to_sym, #to_s] the state's name
# @param new_defn the replacement definition
# @return the new definition
def self.replace_state(name, new_defn)
  # A nil entry makes the `||=` in get_state recompile on next access.
  states[name.to_sym] = nil
  state_definitions[name.to_s] = new_defn
end
start(&b) click to toggle source

Specify an action to be run every fresh lex.

@example

start { puts "I'm lexing a new string!" }
# File lib/rouge/regex_lexer.rb, line 178
# Register a block to be run at the beginning of every fresh lex.
# The block is added to `start_procs`; `reset!` later evaluates each
# registered block with `instance_eval` on the lexer instance.
#
# @example
#   start { puts "I'm lexing a new string!" }
def self.start(&b)
  start_procs << b
end
start_procs() click to toggle source

The routines to run at the beginning of a fresh lex. @see start

# File lib/rouge/regex_lexer.rb, line 169
# The routines to run at the beginning of a fresh lex (see `start`).
# Created on first access as an InheritableList wrapping the superclass's
# `start_procs`, then memoized in `@start_procs`.
def self.start_procs
  return @start_procs if @start_procs

  @start_procs = InheritableList.new(superclass.start_procs)
end
state(name, &b) click to toggle source

Define a new state for this lexer with the given name. The block will be evaluated in the context of a {StateDSL}.

# File lib/rouge/regex_lexer.rb, line 184
# Define a new state for this lexer. The block is handed to StateDSL.new
# together with the stringified name, and the resulting definition is
# stored in `state_definitions` under that string.
#
# @param name [#to_s] the state's name
# @return the newly created StateDSL
def self.state(name, &b)
  key = name.to_s
  state_definitions[key] = StateDSL.new(key, &b)
end
state_definitions() click to toggle source
# File lib/rouge/regex_lexer.rb, line 157
# The state definitions registered for this lexer (keyed by String name in
# `state` and `get_state`). Created on first access as an InheritableHash
# wrapping the superclass's `state_definitions`, then memoized.
def self.state_definitions
  return @state_definitions if @state_definitions

  @state_definitions = InheritableHash.new(superclass.state_definitions)
end
states() click to toggle source

The states hash for this lexer. @see state

# File lib/rouge/regex_lexer.rb, line 153
# The hash of compiled states for this lexer, created empty on first
# access and memoized in `@states`. `get_state` fills it lazily.
def self.states
  return @states if @states

  @states = {}
end

Public Instance Methods

delegate(lexer, text=nil) click to toggle source

Delegate the lex to another lexer. The lex method will be called with `:continue` set to true, so that reset! will not be called. In this way, a single lexer can be repeatedly delegated to while maintaining its own internal state stack.

@param [#lex] lexer

The lexer or lexer class to delegate to

@param [String] text

The text to delegate.  This defaults to the last matched string.
# File lib/rouge/regex_lexer.rb, line 363
# Hand part of the lex over to another lexer and re-emit every token it
# produces through `yield_token`. The other lexer's `lex` is invoked with
# `:continue => true`.
#
# @param lexer [#lex] the lexer or lexer class to delegate to
# @param text [String, nil] the text to delegate; when nil, the last
#   matched string (`@current_stream[0]`) is used
def delegate(lexer, text=nil)
  puts "    delegating to #{lexer.inspect}" if @debug
  source = text || @current_stream[0]

  lexer.lex(source, continue: true) do |type, value|
    puts "    delegated token: #{type.inspect}, #{value.inspect}" if @debug
    yield_token(type, value)
  end
end
get_state(state_name) click to toggle source

@private

# File lib/rouge/regex_lexer.rb, line 212
# Instance-level shortcut for the class-level `get_state`: forwards
# `state_name` to `self.class.get_state` and returns its result.
#
# @private
def get_state(state_name)
  self.class.get_state(state_name)
end
goto(state_name) click to toggle source

Replace the head of the stack with the given state.

# File lib/rouge/regex_lexer.rb, line 407
# Replace the state on top of the stack with the given state.
#
# @param state_name the name of the state (resolved via `get_state`)
# @raise [RuntimeError] if the stack is empty
# @return the state that is now on top of the stack
def goto(state_name)
  frames = stack
  raise 'empty stack!' if frames.empty?

  puts "    going to state :#{state_name} " if @debug
  frames[-1] = get_state(state_name)
end
group(tok) click to toggle source

@deprecated

Yield a token with the next matched group. Subsequent calls to this method will yield subsequent groups.

# File lib/rouge/regex_lexer.rb, line 342
# Formerly yielded a token for the next matched group. The method no
# longer does any work: every call raises and points at `groups`.
#
# @deprecated use {#groups} instead
# @raise [RuntimeError] always
def group(tok)
  raise RuntimeError, "RegexLexer#group is deprecated: use #groups instead"
end
groups(*tokens) click to toggle source

Yield tokens corresponding to the matched groups of the current match.

# File lib/rouge/regex_lexer.rb, line 348
# Yield one token per capture group of the current match: the first token
# type is paired with group 1, the second with group 2, and so on.
#
# @param tokens the token types, in capture-group order
# @return [Array] the token types that were passed in
def groups(*tokens)
  tokens.each.with_index(1) do |type, group_no|
    yield_token(type, @current_stream[group_no])
  end
end
in_state?(state_name) click to toggle source

Check if `state_name` is in the state stack.

# File lib/rouge/regex_lexer.rb, line 422
# Check whether a state with the given name is anywhere in the state stack.
#
# @param state_name [#to_s] the name to look for; compared against each
#   stacked state's `name` after conversion to a String
# @return [Boolean]
def in_state?(state_name)
  wanted = state_name.to_s
  stack.any? { |frame| frame.name == wanted }
end
pop!(times=1) click to toggle source

Pop the state stack. If a number is passed in, it will be popped that number of times.

# File lib/rouge/regex_lexer.rb, line 396
# Pop the state stack.
#
# @param times [Integer] how many states to remove (default 1)
# @raise [RuntimeError] if the stack is already empty before popping
# @return [nil]
def pop!(times=1)
  frames = stack
  raise 'empty stack!' if frames.empty?

  puts "    popping stack: #{times}" if @debug
  frames.pop(times)
  nil
end
push(state_name=nil, &b) click to toggle source

Push a state onto the stack. If no state name is given and you've passed a block, a state will be dynamically created using the {StateDSL}.

# File lib/rouge/regex_lexer.rb, line 380
# Push a state onto the stack.
#
# - With a state name, the state is resolved through `get_state`.
# - With only a block, a state is created on the fly: the block is passed
#   to StateDSL.new (named after the block's `inspect` string) and compiled
#   with `to_state(self.class)`.
# - With neither, the current top of the stack is pushed again.
#
# @param state_name the state to push, or nil
# @return the state stack after the push
def push(state_name=nil, &b)
  target =
    case
    when state_name   then get_state(state_name)
    when block_given? then StateDSL.new(b.inspect, &b).to_state(self.class)
    else self.state # nothing given: duplicate the top of the stack
    end

  puts "    pushing :#{target.name}" if @debug
  stack.push(target)
end
recurse(text=nil) click to toggle source
# File lib/rouge/regex_lexer.rb, line 373
# Delegate to this lexer's own class: calls `delegate(self.class, text)`.
#
# @param text [String, nil] the text to lex; when nil, `delegate` falls
#   back to the last matched string
def recurse(text=nil)
  delegate(self.class, text)
end
reset!() click to toggle source

Reset this lexer to its initial state. This runs all of the start_procs.

# File lib/rouge/regex_lexer.rb, line 233
# Reset this lexer to its initial state: forget the state stack and the
# current stream, then evaluate every block from the class's `start_procs`
# in the context of this instance.
def reset!
  @stack = @current_stream = nil

  hooks = self.class.start_procs
  puts "start blocks" if @debug && hooks.any?
  hooks.each { |hook| instance_eval(&hook) }
end
reset_stack() click to toggle source

Reset the stack back to `[:root]`.

# File lib/rouge/regex_lexer.rb, line 415
# Empty the state stack and push the `:root` state, so the stack is back
# to `[:root]`.
#
# @return the state stack
def reset_stack
  puts '    resetting stack' if @debug
  stack.clear.push(get_state(:root))
end
stack() click to toggle source

The state stack. This is initially the single state `[:root]`. It is an error for this stack to be empty. @see state

# File lib/rouge/regex_lexer.rb, line 219
# The state stack. Built on first access as an array holding only the
# `:root` state and memoized in `@stack` (which `reset!` clears).
def stack
  return @stack if @stack

  @stack = [get_state(:root)]
end
state() click to toggle source

The current state, i.e. the one on top of the state stack.

NB: if the state stack is empty, this will throw an error rather than returning nil.

# File lib/rouge/regex_lexer.rb, line 227
# The current state, i.e. the last element of the state stack.
#
# @raise [RuntimeError] if the stack has no (truthy) top element
def state
  top = stack.last
  raise 'empty stack!' unless top

  top
end
state?(state_name) click to toggle source

Check if `state_name` is the state on top of the state stack.

# File lib/rouge/regex_lexer.rb, line 430
# Check whether the state on top of the stack has the given name.
#
# @param state_name [#to_s] compared, as a String, with the current
#   state's `name`
# @return [Boolean]
def state?(state_name)
  wanted = state_name.to_s
  wanted == state.name
end
step(state, stream) click to toggle source

Runs one step of the lex. Rules in the current state are tried until one matches, at which point its callback is called.

@return true if a rule was tried successfully @return false otherwise.

# File lib/rouge/regex_lexer.rb, line 288
# Run one step of the lex: try the rules of `state` in order until one
# matches at the current position of `stream`, then run that rule's
# callback.
#
# @param state the state whose `rules` are tried; an entry that is itself
#   a State is treated as a mixin and searched recursively
# @param stream the scanner to match against (the StringScanner created
#   in `stream_tokens` when called from there)
# @return [Boolean] true if a rule matched and was accepted; false if no
#   rule matched, or if a zero-length match pushed the count of
#   consecutive non-consuming matches past MAX_NULL_SCANS
def step(state, stream)
  state.rules.each do |rule|
    if rule.is_a?(State)
      # Mixin: try the nested state's rules in place; stop at the first hit.
      puts "  entering mixin #{rule.name}" if @debug
      return true if step(rule, stream)
      puts "  exiting  mixin #{rule.name}" if @debug
    else
      puts "  trying #{rule.inspect}" if @debug

      # XXX HACK XXX
      # StringScanner's implementation of ^ is b0rken.
      # see http://bugs.ruby-lang.org/issues/7092
      # TODO: this doesn't cover cases like /(a|^b)/, but it's
      # the most common, for now...
      next if rule.beginning_of_line && !stream.beginning_of_line?

      # `skip` advances the stream past the match and gives its length,
      # or a falsy value when the rule does not match here.
      if (size = stream.skip(rule.re))
        puts "    got #{stream[0].inspect}" if @debug

        # The callback runs in the context of this lexer instance.
        instance_exec(stream, &rule.callback)

        if size.zero?
          # Nothing was consumed. Count it so a run of empty matches cannot
          # loop forever. Note the callback above has already run by the
          # time this step is reported as a failure.
          @null_steps += 1
          if @null_steps > MAX_NULL_SCANS
            puts "    too many scans without consuming the string!" if @debug
            return false
          end
        else
          # Input was consumed: the run of empty matches is over.
          @null_steps = 0
        end

        return true
      end
    end
  end

  # No rule in this state (or its mixins) matched.
  false
end
stream_tokens(str, &b) click to toggle source

This implements the lexer protocol, by yielding [token, value] pairs.

The process for lexing works as follows, until the stream is empty:

  1. We look at the state on top of the stack (which by default is `[:root]`).

  2. Each rule in that state is tried until one is successful. If one is found, that rule's callback is evaluated - which may yield tokens and manipulate the state stack. Otherwise, one character is consumed with an `'Error'` token, and we continue at (1.)

@see step step (where (2.) is implemented)

# File lib/rouge/regex_lexer.rb, line 255
# Implements the lexer protocol by yielding [token, value] pairs to the
# given block.
#
# Until the input is exhausted, the rules of the state on top of the stack
# are tried via `step`. When `step` reports failure, a single character is
# consumed and yielded with the Error token before trying again.
#
# @param str [String] the text to lex
# @return [nil]
def stream_tokens(str, &b)
  scanner = StringScanner.new(str)

  @current_stream = scanner
  @output_stream  = b
  @states         = self.class.states
  @null_steps     = 0

  until scanner.eos?
    if @debug
      puts "lexer: #{self.class.tag}"
      puts "stack: #{stack.map(&:name).map(&:to_sym).inspect}"
      puts "stream: #{scanner.peek(20).inspect}"
    end

    next if step(state, scanner)

    # No rule applied: emit the offending character as an Error token.
    puts "    no match, yielding Error" if @debug
    b.call(Token::Tokens::Error, scanner.getch)
  end
end
token(tok, val=@current_stream[0]) click to toggle source

Yield a token.

@param tok

the token type

@param val

(optional) the string value to yield.  If absent, this defaults
to the entire last match.
# File lib/rouge/regex_lexer.rb, line 334
# Yield a token by forwarding to `yield_token`.
#
# @param tok the token type
# @param val the string value to yield; when omitted it defaults to
#   `@current_stream[0]`, i.e. the entire last match
def token(tok, val=@current_stream[0])
  yield_token(tok, val)
end

Private Instance Methods

yield_token(tok, val) click to toggle source
# File lib/rouge/regex_lexer.rb, line 435
# Send a [token, value] pair to the output block stored in
# `@output_stream`. Nil and empty values are dropped silently.
#
# @param tok the token type
# @param val [String, nil] the matched text
def yield_token(tok, val)
  return if val.nil?
  return if val.empty?

  puts "    yielding #{tok.qualname}, #{val.inspect}" if @debug
  @output_stream.yield(tok, val)
end