class YARP::LexCompat
This class is responsible for lexing the source using YARP and then converting those tokens to be compatible with Ripper. In the vast majority of cases, this is a one-to-one mapping of the token type. Everything else generally lines up. However, there are a few cases that require special handling.
Constants
- RIPPER
This is a mapping of YARP token types to Ripper token types. This is a many-to-one mapping because we split up our token types, whereas Ripper tends to group them.
Attributes
filepath[R]
source[R]
Public Class Methods
new(source, filepath = "")
click to toggle source
# File yarp/lex_compat.rb, line 554
#
# Remembers the source text to lex and the path it was read from.
#
# source   - the value later handed to YARP.lex by #result.
# filepath - the path associated with the source. Defaults to "", and a
#            falsy value (nil or false) is also stored as "".
def initialize(source, filepath = "")
  @filepath = filepath || ""
  @source = source
end
Public Instance Methods
result()
click to toggle source
# File yarp/lex_compat.rb, line 559
#
# Lexes the source with YARP.lex and converts every resulting token into a
# Ripper-shaped token ([[lineno, column], event, value, lex_state]) wrapped
# in one of the comparison token classes (Token, IgnoreStateToken,
# ParamToken, IdentToken, ...). Heredoc tokens are buffered and re-emitted
# so that the final ordering can be compared against Ripper's output.
#
# Returns a ParseResult built from the converted tokens (with the trailing
# EOF token dropped and the list sorted by location) together with the
# comments, errors and warnings of the YARP lex result.
#
# Raises StandardError if fewer tokens remain than were lexed (excluding
# the EOF token).
def result
  tokens = []

  # State machine used to reorder heredoc tokens; one of :default,
  # :heredoc_opened or :heredoc_closed. heredoc_stack holds one list of
  # heredocs per nesting level.
  state = :default
  heredoc_stack = [[]]

  result = YARP.lex(source, @filepath)
  result_value = result.value
  previous_state = nil

  # In previous versions of Ruby, Ripper wouldn't flush the bom before the
  # first token, so we had to have a hack in place to account for that. This
  # checks for that behavior.
  bom_flushed = Ripper.lex("\xEF\xBB\xBF# test")[0][0][1] == 0
  bom = source.byteslice(0..2) == "\xEF\xBB\xBF"

  result_value.each_with_index do |(token, lex_state), index|
    lineno = token.location.start_line
    column = token.location.start_column

    # If there's a UTF-8 byte-order mark at the start of the file, then for
    # certain tokens ripper sets the first token back by 3 bytes. It also
    # keeps the byte order mark in the first token's value. This is weird,
    # and I don't want to mirror that in our parser. So instead, we'll match
    # up the columns and values here.
    if bom && lineno == 1
      column -= 3

      if index == 0 && column == 0 && !bom_flushed
        flushed =
          case token.type
          when :BACK_REFERENCE, :INSTANCE_VARIABLE, :CLASS_VARIABLE,
               :GLOBAL_VARIABLE, :NUMBERED_REFERENCE, :PERCENT_LOWER_I,
               :PERCENT_LOWER_X, :PERCENT_LOWER_W, :PERCENT_UPPER_I,
               :PERCENT_UPPER_W, :STRING_BEGIN
            true
          when :REGEXP_BEGIN, :SYMBOL_BEGIN
            token.value.start_with?("%")
          else
            false
          end

        unless flushed
          column -= 3
          # NOTE(review): prepend mutates the string in place; presumably
          # token.value returns the same string object below, so the value
          # read after this block carries the BOM - confirm.
          value = token.value
          value.prepend(String.new("\xEF\xBB\xBF", encoding: value.encoding))
        end
      end
    end

    event = RIPPER.fetch(token.type)
    value = token.value
    lex_state = Ripper::Lexer::State.new(lex_state)

    token =
      case event
      when :on___end__
        EndContentToken.new([[lineno, column], event, value, lex_state])
      when :on_comment
        IgnoreStateToken.new([[lineno, column], event, value, lex_state])
      when :on_heredoc_end
        # Heredoc end tokens can be emitted in an odd order, so we don't
        # want to bother comparing the state on them.
        IgnoreStateToken.new([[lineno, column], event, value, lex_state])
      when :on_ident
        if lex_state == Ripper::EXPR_END
          # If we have an identifier that follows a method name like:
          #
          #     def foo bar
          #
          # then Ripper will mark bar as END|LABEL if there is a local in a
          # parent scope named bar because it hasn't pushed the local table
          # yet. We do this more accurately, so we need to allow comparing
          # against both END and END|LABEL.
          ParamToken.new([[lineno, column], event, value, lex_state])
        elsif lex_state == Ripper::EXPR_END | Ripper::EXPR_LABEL
          # In the event that we're comparing identifiers, we're going to
          # allow a little divergence. Ripper doesn't account for local
          # variables introduced through named captures in regexes, and we
          # do, which accounts for this difference.
          IdentToken.new([[lineno, column], event, value, lex_state])
        else
          Token.new([[lineno, column], event, value, lex_state])
        end
      when :on_embexpr_end
        IgnoreStateToken.new([[lineno, column], event, value, lex_state])
      when :on_ignored_nl
        # Ignored newlines can occasionally have a LABEL state attached to
        # them which doesn't actually impact anything. We don't mirror that
        # state so we ignore it.
        IgnoredNewlineToken.new([[lineno, column], event, value, lex_state])
      when :on_regexp_end
        # On regex end, Ripper scans and then sets end state, so the ripper
        # lexed output is begin, when it should be end. YARP sets lex state
        # correctly to end state, but we want to be able to compare against
        # Ripper's lexed state. So here, if it's a regexp end token, we
        # output the state as the previous state, solely for the sake of
        # comparison.
        previous_token = result_value[index - 1][0]
        lex_state =
          if RIPPER.fetch(previous_token.type) == :on_embexpr_end
            # If the previous token is embexpr_end, then we have to do even
            # more processing. The end of an embedded expression sets the
            # state to the state that it had at the beginning of the
            # embedded expression. So we have to go and find that state and
            # set it here.
            counter = 1
            current_index = index - 1

            # Walk backwards until the embexpr_beg matching the embexpr_end
            # just before this token is found; nested embexpr_end tokens
            # increase the depth counter and embexpr_beg tokens decrease it.
            until counter == 0
              current_index -= 1
              current_event = RIPPER.fetch(result_value[current_index][0].type)
              counter += { on_embexpr_beg: -1, on_embexpr_end: 1 }[current_event] || 0
            end

            Ripper::Lexer::State.new(result_value[current_index][1])
          else
            previous_state
          end

        Token.new([[lineno, column], event, value, lex_state])
      when :on_eof
        previous_token = result_value[index - 1][0]

        # If we're at the end of the file and the previous token was a
        # comment and there is still whitespace after the comment, then
        # Ripper will append an on_nl token (even though there isn't
        # necessarily a newline). We mirror that here.
        start_offset = previous_token.location.end_offset
        end_offset = token.location.start_offset

        if previous_token.type == :COMMENT && start_offset < end_offset
          # NOTE(review): the +3 presumably compensates for token offsets
          # not counting the 3-byte BOM that source still contains - confirm.
          if bom
            start_offset += 3
            end_offset += 3
          end

          tokens << Token.new([[lineno, 0], :on_nl, source.byteslice(start_offset...end_offset), lex_state])
        end

        Token.new([[lineno, column], event, value, lex_state])
      else
        Token.new([[lineno, column], event, value, lex_state])
      end

    # Remembered so that the next on_regexp_end token can report it.
    previous_state = lex_state

    # The order in which tokens appear in our lexer is different from the
    # order that they appear in Ripper. When we hit the declaration of a
    # heredoc in YARP, we skip forward and lex the rest of the content of
    # the heredoc before going back and lexing at the end of the heredoc
    # identifier.
    #
    # To match up to ripper, we keep a small state variable around here to
    # track whether we're in the middle of a heredoc or not. In this way we
    # can shuffle around the token to match Ripper's output.
    case state
    when :default
      # The default state is when there are no heredocs at all. In this
      # state we can append the token to the list of tokens and move on.
      tokens << token

      # If we get the declaration of a heredoc, then we open a new heredoc
      # and move into the heredoc_opened state.
      if event == :on_heredoc_beg
        state = :heredoc_opened
        heredoc_stack.last << Heredoc.build(token)
      end
    when :heredoc_opened
      # The heredoc_opened state is when we've seen the declaration of a
      # heredoc and are now lexing the body of the heredoc. In this state we
      # push tokens onto the most recently created heredoc.
      heredoc_stack.last.last << token

      case event
      when :on_heredoc_beg
        # If we receive a heredoc declaration while lexing the body of a
        # heredoc, this means we have nested heredocs. In this case we'll
        # push a new heredoc onto the stack and stay in the heredoc_opened
        # state since we're now lexing the body of the new heredoc.
        heredoc_stack << [Heredoc.build(token)]
      when :on_heredoc_end
        # If we receive the end of a heredoc, then we're done lexing the
        # body of the heredoc. In this case we now have a completed heredoc
        # but need to wait for the next newline to push it into the token
        # stream.
        state = :heredoc_closed
      end
    when :heredoc_closed
      if %i[on_nl on_ignored_nl on_comment].include?(event) || (event == :on_tstring_content && value.end_with?("\n"))
        if heredoc_stack.size > 1
          # A nested level just finished: move its heredocs' tokens into the
          # most recent heredoc of the enclosing level and keep lexing that
          # enclosing heredoc's body.
          flushing = heredoc_stack.pop
          heredoc_stack.last.last << token

          flushing.each do |heredoc|
            heredoc.to_a.each do |flushed_token|
              heredoc_stack.last.last << flushed_token
            end
          end

          state = :heredoc_opened
          next
        end
      elsif event == :on_heredoc_beg
        # Another heredoc is declared before the line ended; record it at
        # the current level and go back to collecting heredoc body tokens.
        tokens << token
        state = :heredoc_opened
        heredoc_stack.last << Heredoc.build(token)
        next
      elsif heredoc_stack.size > 1
        # Still inside a nested level with no line end yet: the token is
        # attached to the most recent heredoc of the enclosing level.
        heredoc_stack[-2].last << token
        next
      end

      # Reached when no branch above called `next`: flush every heredoc of
      # the current level into the token stream, then emit this token.
      heredoc_stack.last.each do |heredoc|
        tokens.concat(heredoc.to_a)
      end

      heredoc_stack.last.clear
      state = :default

      tokens << token
    end
  end

  # Drop the EOF token from the list
  tokens = tokens[0...-1]

  # We sort by location to compare against Ripper's output
  tokens.sort_by!(&:location)

  if result_value.size - 1 > tokens.size
    raise StandardError, "Lost tokens when performing lex_compat"
  end

  ParseResult.new(tokens, result.comments, result.errors, result.warnings, [])
end