@@ -226,7 +226,7 @@ def state
     end

     # Tokens where state should be ignored
-    # used for :on_comment, :on_heredoc_end, :on_embexpr_end
+    # used for :on_sp, :on_comment, :on_heredoc_end, :on_embexpr_end
     class IgnoreStateToken < Token
       def ==(other) # :nodoc:
         self[0...-1] == other[0...-1]
@@ -611,10 +611,10 @@ def self.build(opening)
     BOM_FLUSHED = RUBY_VERSION >= "3.3.0"
     private_constant :BOM_FLUSHED

-    attr_reader :source, :options
+    attr_reader :options

-    def initialize(source, **options)
-      @source = source
+    def initialize(code, **options)
+      @code = code
       @options = options
     end

@@ -624,12 +624,14 @@ def result
       state = :default
       heredoc_stack = [[]] #: Array[Array[Heredoc::PlainHeredoc | Heredoc::DashHeredoc | Heredoc::DedentingHeredoc]]

-      result = Prism.lex(source, **options)
+      result = Prism.lex(@code, **options)
+      source = result.source
       result_value = result.value
       previous_state = nil #: State?
       last_heredoc_end = nil #: Integer?
+      eof_token = nil

-      bom = source.byteslice(0..2) == "\xEF\xBB\xBF"
+      bom = source.slice(0, 3) == "\xEF\xBB\xBF"

       result_value.each_with_index do |(token, lex_state), index|
         lineno = token.location.start_line
@@ -741,6 +743,7 @@ def result

           Token.new([[lineno, column], event, value, lex_state])
         when :on_eof
+          eof_token = token
           previous_token = result_value[index - 1][0]

           # If we're at the end of the file and the previous token was a
@@ -763,7 +766,7 @@ def result
                 end_offset += 3
               end

-              tokens << Token.new([[lineno, 0], :on_nl, source.byteslice(start_offset...end_offset), lex_state])
+              tokens << Token.new([[lineno, 0], :on_nl, source.slice(start_offset, end_offset - start_offset), lex_state])
             end
           end

@@ -857,7 +860,89 @@ def result
       # We sort by location to compare against Ripper's output
       tokens.sort_by!(&:location)

-      Result.new(tokens, result.comments, result.magic_comments, result.data_loc, result.errors, result.warnings, Source.for(source))
+      # Add :on_sp tokens
+      tokens = add_on_sp_tokens(tokens, source, result.data_loc, bom, eof_token)
+
+      Result.new(tokens, result.comments, result.magic_comments, result.data_loc, result.errors, result.warnings, source)
+    end
+
+    def add_on_sp_tokens(tokens, source, data_loc, bom, eof_token)
+      new_tokens = []
+
+      prev_token_state = Translation::Ripper::Lexer::State.cached(Translation::Ripper::EXPR_BEG)
+      prev_token_end = bom ? 3 : 0
+
+      tokens.each do |token|
+        line, column = token.location
+        start_offset = source.line_to_byte_offset(line) + column
+        # Ripper reports columns on line 1 without counting the BOM, so we adjust to get the real offset
+        start_offset += 3 if line == 1 && bom
+
+        if start_offset > prev_token_end
+          sp_value = source.slice(prev_token_end, start_offset - prev_token_end)
+          sp_line = source.line(prev_token_end)
+          sp_column = source.column(prev_token_end)
+          # Ripper reports columns on line 1 without counting the BOM
+          sp_column -= 3 if sp_line == 1 && bom
+          continuation_index = sp_value.byteindex("\\")
+
+          # Ripper emits up to three :on_sp tokens when line continuations are used
+          if continuation_index
+            next_whitespace_index = continuation_index + 1
+            next_whitespace_index += 1 if sp_value.byteslice(next_whitespace_index) == "\r"
+            next_whitespace_index += 1
+            first_whitespace = sp_value[0...continuation_index]
+            continuation = sp_value[continuation_index...next_whitespace_index]
+            second_whitespace = sp_value[next_whitespace_index..]
+
+            new_tokens << IgnoreStateToken.new([
+              [sp_line, sp_column],
+              :on_sp,
+              first_whitespace,
+              prev_token_state
+            ]) unless first_whitespace.empty?
+
+            new_tokens << IgnoreStateToken.new([
+              [sp_line, sp_column + continuation_index],
+              :on_sp,
+              continuation,
+              prev_token_state
+            ])
+
+            new_tokens << IgnoreStateToken.new([
+              [sp_line + 1, 0],
+              :on_sp,
+              second_whitespace,
+              prev_token_state
+            ]) unless second_whitespace.empty?
+          else
+            new_tokens << IgnoreStateToken.new([
+              [sp_line, sp_column],
+              :on_sp,
+              sp_value,
+              prev_token_state
+            ])
+          end
+        end
+
+        new_tokens << token
+        prev_token_state = token.state
+        prev_token_end = start_offset + token.value.bytesize
+      end
+
+      unless data_loc # no trailing :on_sp with __END__ as it is always preceded by :on_nl
+        end_offset = eof_token.location.end_offset
+        if prev_token_end < end_offset
+          new_tokens << IgnoreStateToken.new([
+            [source.line(prev_token_end), source.column(prev_token_end)],
+            :on_sp,
+            source.slice(prev_token_end, end_offset - prev_token_end),
+            prev_token_state
+          ])
+        end
+      end
+
+      new_tokens
     end
   end

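The three-way split in add_on_sp_tokens mirrors how Ripper itself tokenizes whitespace around a line continuation. A quick sketch against stdlib Ripper (the expected output is inferred from the comments in the diff above, so treat the exact positions as an assumption):

    require "ripper"

    # Whitespace around a trailing backslash comes back as up to three
    # :on_sp tokens: the run before the backslash, the backslash-newline
    # pair itself, and the indentation at the start of the next line.
    pp Ripper.lex("1 \\\n 2").map { |pos, event, value, _state| [pos, event, value] }
    # Expected shape:
    #   [[[1, 0], :on_int, "1"],
    #    [[1, 1], :on_sp, " "],
    #    [[1, 2], :on_sp, "\\\n"],
    #    [[2, 0], :on_sp, " "],
    #    [[2, 1], :on_int, "2"]]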
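The start_offset += 3 and sp_column -= 3 adjustments exist because Ripper reports line-1 columns relative to the text after the byte order mark, while byte offsets into the source still include it. A sketch of the discrepancy being compensated for (again inferred from the comments in the diff; the reported position is the assumption):

    require "ripper"

    source = "\xEF\xBB\xBFx = 1\n"
    pos, event, value = Ripper.lex(source).first
    # Expected: pos == [1, 0] even though "x" sits at byte offset 3,
    # so byte-offset math on line 1 has to add the BOM's 3 bytes back.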
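Since emitting :on_sp closes the main token-stream gap between the two lexers, a round-trip comparison makes a reasonable smoke test. A minimal sketch, assuming this branch is applied (on releases without this change the arrays differ wherever whitespace appears, since Prism.lex_compat omitted :on_sp tokens):

    require "prism"
    require "ripper"

    source = "foo = 1 \\\n  + 2\n"

    # Compare position, event, and value only; lexer state is compared
    # by its own rules (see IgnoreStateToken#==), not as a plain value.
    prism_tokens  = Prism.lex_compat(source).value.map { |token| token[0, 3] }
    ripper_tokens = Ripper.lex(source).map { |token| token[0, 3] }

    pp prism_tokens == ripper_tokens # expected: true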