From c49e725ff1635ef369d18bf9ada41d0f1ebb1bfa Mon Sep 17 00:00:00 2001 From: Artemis System Date: Fri, 13 Jun 2025 12:32:50 +0200 Subject: [PATCH 1/2] Implement tab support --- src/Language/PureScript/CST/Convert.hs | 2 +- src/Language/PureScript/CST/Lexer.hs | 41 +++++--- src/Language/PureScript/CST/Monad.hs | 2 +- src/Language/PureScript/CST/Positions.hs | 126 ++++++++++++----------- src/Language/PureScript/CST/Print.hs | 8 +- src/Language/PureScript/CST/Types.hs | 4 +- src/Language/PureScript/CST/Utils.hs | 2 +- 7 files changed, 103 insertions(+), 82 deletions(-) diff --git a/src/Language/PureScript/CST/Convert.hs b/src/Language/PureScript/CST/Convert.hs index 59b68adf1d..aab94a718d 100644 --- a/src/Language/PureScript/CST/Convert.hs +++ b/src/Language/PureScript/CST/Convert.hs @@ -50,7 +50,7 @@ comments :: [Comment a] -> [C.Comment] comments = mapMaybe comment sourcePos :: SourcePos -> Pos.SourcePos -sourcePos (SourcePos line col) = Pos.SourcePos line col +sourcePos (SourcePos line (tabs, col)) = Pos.SourcePos line (tabs + col) sourceSpan :: String -> SourceRange -> Pos.SourceSpan sourceSpan name (SourceRange start end) = Pos.SourceSpan name (sourcePos start) (sourcePos end) diff --git a/src/Language/PureScript/CST/Lexer.hs b/src/Language/PureScript/CST/Lexer.hs index 726a76f26a..15f459e652 100644 --- a/src/Language/PureScript/CST/Lexer.hs +++ b/src/Language/PureScript/CST/Lexer.hs @@ -22,7 +22,7 @@ import Data.Text.PureScript qualified as Text import Language.PureScript.CST.Errors (ParserErrorInfo(..), ParserErrorType(..)) import Language.PureScript.CST.Monad (LexResult, LexState(..), ParserM(..), throw) import Language.PureScript.CST.Layout (LayoutDelim(..), insertLayout, lytToken, unwindLayout) -import Language.PureScript.CST.Positions (advanceLeading, advanceToken, advanceTrailing, applyDelta, textDelta) +import Language.PureScript.CST.Positions (advanceLeading, advanceToken, advanceTrailing, applyDelta, textDelta, PosDelta (PosDelta)) import Language.PureScript.CST.Types (Comment(..), LineFeed(..), SourcePos(..), SourceRange(..), SourceStyle(..), SourceToken(..), Token(..), TokenAnn(..)) -- | Stops at the first lexing error and replaces it with TokEof. Otherwise, @@ -50,10 +50,10 @@ lex' lexComments src = do let (leading, src') = lexComments src lexWithState $ LexState - { lexPos = advanceLeading (SourcePos 1 1) leading + { lexPos = advanceLeading (SourcePos 1 (0, 1)) leading , lexLeading = leading , lexSource = src' - , lexStack = [(SourcePos 0 0, LytRoot)] + , lexStack = [(SourcePos 0 (0, 0), LytRoot)] } -- | Lexes according to top-level declaration context rules. @@ -61,13 +61,13 @@ lexTopLevel :: Text -> [LexResult] lexTopLevel src = do let (leading, src') = comments src - lexPos = advanceLeading (SourcePos 1 1) leading + lexPos = advanceLeading (SourcePos 1 (0, 1)) leading hd = Right $ lytToken lexPos TokLayoutStart tl = lexWithState $ LexState { lexPos = lexPos , lexLeading = leading , lexSource = src' - , lexStack = [(lexPos, LytWhere), (SourcePos 0 0, LytRoot)] + , lexStack = [(lexPos, LytWhere), (SourcePos 0 (0, 0), LytRoot)] } hd : tl @@ -90,7 +90,7 @@ lexWithState = go pos = applyDelta lexPos chunkDelta pure $ Left ( state { lexSource = lexSource' } - , ParserErrorInfo (SourceRange pos $ applyDelta pos (0, 1)) [] lexStack err + , ParserErrorInfo (SourceRange pos $ applyDelta pos (PosDelta 0 0 1)) [] lexStack err ) onSuccess _ (TokEof, _) = @@ -183,26 +183,30 @@ breakComments = k0 [] case mbComm of Just comm -> k0 (comm : acc') Nothing -> pure (reverse acc', []) - else - k1 acc' (goWs [] $ Text.unpack lines) + else do + tabs <- nextWhile (== '\t') + k1 acc' (goWs [] $ Text.unpack (lines <> tabs)) k1 trl acc = do - ws <- nextWhile (\c -> c == ' ' || isLineFeed c) + ws <- nextWhile (\c -> c == ' ' || c == '\t' || isLineFeed c) let acc' = goWs acc $ Text.unpack ws mbComm <- comment case mbComm of Just comm -> k1 trl (comm : acc') Nothing -> pure (reverse trl, reverse acc') - goWs a ('\r' : '\n' : ls) = goWs (Line CRLF : a) ls - goWs a ('\r' : ls) = goWs (Line CRLF : a) ls - goWs a ('\n' : ls) = goWs (Line LF : a) ls + goWs a ('\r' : '\n' : ls) = goTab CRLF a 0 ls + goWs a ('\r' : ls) = goTab CRLF a 0 ls + goWs a ('\n' : ls) = goTab LF a 0 ls goWs a (' ' : ls) = goSpace a 1 ls goWs a _ = a goSpace a !n (' ' : ls) = goSpace a (n + 1) ls goSpace a n ls = goWs (Space n : a) ls + goTab ending a !n ('\t' : ls) = goTab ending a (n + 1) ls + goTab ending a n ls = goWs (Line ending n : a) ls + isBlockComment = Parser $ \inp _ ksucc -> case Text.uncons inp of Just ('-', inp2) -> @@ -260,14 +264,21 @@ breakShebang = shebangComment >>= \case Just ('\r', inp2) -> case Text.uncons inp2 of Just ('\n', inp3) -> - Just (Line CRLF, inp3) + Just (unconsTabs CRLF inp3) _ -> - Just (Line CRLF, inp2) + Just (unconsTabs CRLF inp2) Just ('\n', inp2) -> - Just (Line LF, inp2) + Just (unconsTabs LF inp2) _ -> Nothing + unconsTabs :: LineFeed -> Text -> (Comment LineFeed, Text) + unconsTabs l inp = case Text.takeWhile (== '\t') inp of + "" -> (Line l 0, inp) + str -> do + let tabs = Text.length str + (Line l tabs, Text.drop tabs str) + unconsShebang :: Text -> Maybe (Text, Text) unconsShebang = fmap ("#!",) . Text.stripPrefix "#!" diff --git a/src/Language/PureScript/CST/Monad.hs b/src/Language/PureScript/CST/Monad.hs index 2b79f1a9b3..de027e3962 100644 --- a/src/Language/PureScript/CST/Monad.hs +++ b/src/Language/PureScript/CST/Monad.hs @@ -103,7 +103,7 @@ mkParserError stack toks ty = } where range = case NE.nonEmpty toks of - Nothing -> SourceRange (SourcePos 0 0) (SourcePos 0 0) + Nothing -> SourceRange (SourcePos 0 (0, 0)) (SourcePos 0 (0, 0)) Just neToks -> widen (tokRange . tokAnn $ NE.head neToks) (tokRange . tokAnn $ NE.last neToks) diff --git a/src/Language/PureScript/CST/Positions.hs b/src/Language/PureScript/CST/Positions.hs index 20d5724271..3f61cb647b 100644 --- a/src/Language/PureScript/CST/Positions.hs +++ b/src/Language/PureScript/CST/Positions.hs @@ -11,10 +11,16 @@ import Data.Foldable (foldl') import Data.List.NonEmpty qualified as NE import Data.Maybe (fromMaybe) import Data.Text (Text) -import Data.Void (Void) +import Data.Void (Void, absurd) import Data.Text qualified as Text import Language.PureScript.CST.Types +data PosDelta = PosDelta + { posDeltaLines :: {-# UNPACK #-} !Int + , posDeltaTabs :: {-# UNPACK #-} !Int + , posDeltaColumns :: {-# UNPACK #-} !Int + } + advanceToken :: SourcePos -> Token -> SourcePos advanceToken pos = applyDelta pos . tokenDelta @@ -22,77 +28,81 @@ advanceLeading :: SourcePos -> [Comment LineFeed] -> SourcePos advanceLeading = foldl' $ \a -> applyDelta a . commentDelta lineDelta advanceTrailing :: SourcePos -> [Comment Void] -> SourcePos -advanceTrailing = foldl' $ \a -> applyDelta a . commentDelta (const (0, 0)) +advanceTrailing = foldl' $ \a -> applyDelta a . commentDelta absurd -tokenDelta :: Token -> (Int, Int) +tokenDelta :: Token -> PosDelta tokenDelta = \case - TokLeftParen -> (0, 1) - TokRightParen -> (0, 1) - TokLeftBrace -> (0, 1) - TokRightBrace -> (0, 1) - TokLeftSquare -> (0, 1) - TokRightSquare -> (0, 1) - TokLeftArrow ASCII -> (0, 2) - TokLeftArrow Unicode -> (0, 1) - TokRightArrow ASCII -> (0, 2) - TokRightArrow Unicode -> (0, 1) - TokRightFatArrow ASCII -> (0, 2) - TokRightFatArrow Unicode -> (0, 1) - TokDoubleColon ASCII -> (0, 2) - TokDoubleColon Unicode -> (0, 1) - TokForall ASCII -> (0, 6) - TokForall Unicode -> (0, 1) - TokEquals -> (0, 1) - TokPipe -> (0, 1) - TokTick -> (0, 1) - TokDot -> (0, 1) - TokComma -> (0, 1) - TokUnderscore -> (0, 1) - TokBackslash -> (0, 1) - TokLowerName qual name -> (0, qualDelta qual + Text.length name) - TokUpperName qual name -> (0, qualDelta qual + Text.length name) - TokOperator qual sym -> (0, qualDelta qual + Text.length sym) - TokSymbolName qual sym -> (0, qualDelta qual + Text.length sym + 2) - TokSymbolArr Unicode -> (0, 3) - TokSymbolArr ASCII -> (0, 4) - TokHole hole -> (0, Text.length hole + 1) - TokChar raw _ -> (0, Text.length raw + 2) - TokInt raw _ -> (0, Text.length raw) - TokNumber raw _ -> (0, Text.length raw) + TokLeftParen -> PosDelta 0 0 1 + TokRightParen -> PosDelta 0 0 1 + TokLeftBrace -> PosDelta 0 0 1 + TokRightBrace -> PosDelta 0 0 1 + TokLeftSquare -> PosDelta 0 0 1 + TokRightSquare -> PosDelta 0 0 1 + TokLeftArrow ASCII -> PosDelta 0 0 2 + TokLeftArrow Unicode -> PosDelta 0 0 1 + TokRightArrow ASCII -> PosDelta 0 0 2 + TokRightArrow Unicode -> PosDelta 0 0 1 + TokRightFatArrow ASCII -> PosDelta 0 0 2 + TokRightFatArrow Unicode -> PosDelta 0 0 1 + TokDoubleColon ASCII -> PosDelta 0 0 2 + TokDoubleColon Unicode -> PosDelta 0 0 1 + TokForall ASCII -> PosDelta 0 0 6 + TokForall Unicode -> PosDelta 0 0 1 + TokEquals -> PosDelta 0 0 1 + TokPipe -> PosDelta 0 0 1 + TokTick -> PosDelta 0 0 1 + TokDot -> PosDelta 0 0 1 + TokComma -> PosDelta 0 0 1 + TokUnderscore -> PosDelta 0 0 1 + TokBackslash -> PosDelta 0 0 1 + TokLowerName qual name -> PosDelta 0 0 (qualDelta qual + Text.length name) + TokUpperName qual name -> PosDelta 0 0 (qualDelta qual + Text.length name) + TokOperator qual sym -> PosDelta 0 0 (qualDelta qual + Text.length sym) + TokSymbolName qual sym -> PosDelta 0 0 (qualDelta qual + Text.length sym + 2) + TokSymbolArr Unicode -> PosDelta 0 0 3 + TokSymbolArr ASCII -> PosDelta 0 0 4 + TokHole hole -> PosDelta 0 0 (Text.length hole + 1) + TokChar raw _ -> PosDelta 0 0 (Text.length raw + 2) + TokInt raw _ -> PosDelta 0 0 (Text.length raw) + TokNumber raw _ -> PosDelta 0 0 (Text.length raw) TokString raw _ -> multiLine 1 $ textDelta raw TokRawString raw -> multiLine 3 $ textDelta raw - TokLayoutStart -> (0, 0) - TokLayoutSep -> (0, 0) - TokLayoutEnd -> (0, 0) - TokEof -> (0, 0) + TokLayoutStart -> PosDelta 0 0 0 + TokLayoutSep -> PosDelta 0 0 0 + TokLayoutEnd -> PosDelta 0 0 0 + TokEof -> PosDelta 0 0 0 qualDelta :: [Text] -> Int qualDelta = foldr ((+) . (+ 1) . Text.length) 0 -multiLine :: Int -> (Int, Int) -> (Int, Int) -multiLine n (0, c) = (0, c + n + n) -multiLine n (l, c) = (l, c + n) +multiLine :: Int -> PosDelta -> PosDelta +multiLine n (PosDelta 0 0 c) = PosDelta 0 0 (c + n + n) +multiLine n (PosDelta l i c) = PosDelta l i (c + n) -commentDelta :: (a -> (Int, Int)) -> Comment a -> (Int, Int) +commentDelta :: (a -> Int -> PosDelta) -> Comment a -> PosDelta commentDelta k = \case Comment raw -> textDelta raw - Space n -> (0, n) - Line a -> k a + Space n -> PosDelta 0 0 n + Line n i -> k n i -lineDelta :: LineFeed -> (Int, Int) -lineDelta _ = (1, 1) +lineDelta :: LineFeed -> Int -> PosDelta +lineDelta _ tabs = PosDelta 1 tabs 1 -textDelta :: Text -> (Int, Int) -textDelta = Text.foldl' go (0, 0) +textDelta :: Text -> PosDelta +textDelta = Text.foldl' go (PosDelta 0 0 0) where - go (!l, !c) = \case - '\n' -> (l + 1, 1) - _ -> (l, c + 1) - -applyDelta :: SourcePos -> (Int, Int) -> SourcePos -applyDelta (SourcePos l c) = \case - (0, n) -> SourcePos l (c + n) - (k, d) -> SourcePos (l + k) d + go (PosDelta l i c) = \case + '\n' -> PosDelta (l + 1) 0 1 + '\t' -> PosDelta l (i + 1) c + _ -> PosDelta l i (c + 1) + +applyDelta :: SourcePos -> PosDelta -> SourcePos +applyDelta pos delta = case (pos, delta) of + (SourcePos l (i, c), PosDelta 0 0 n) -> SourcePos l (i, c + n) + (SourcePos l (i, 1), PosDelta 0 t n) -> SourcePos l (i + t, n) + -- If we encounter tabs mid-line, treat them as 1 character + (SourcePos l (i, c), PosDelta 0 t n) -> SourcePos l (i, c + t + n - 1) + (SourcePos l _, PosDelta k t n) -> SourcePos (l + k) (t, n) sepLast :: Separated a -> a sepLast (Separated hd []) = hd diff --git a/src/Language/PureScript/CST/Print.hs b/src/Language/PureScript/CST/Print.hs index f6d300ab67..505909ed13 100644 --- a/src/Language/PureScript/CST/Print.hs +++ b/src/Language/PureScript/CST/Print.hs @@ -17,6 +17,7 @@ import Data.Text (Text) import Data.Text qualified as Text import Language.PureScript.CST.Types (Comment(..), LineFeed(..), Module, SourceStyle(..), SourceToken(..), Token(..), TokenAnn(..)) import Language.PureScript.CST.Flatten (flattenModule) +import Data.Void (Void) printToken :: Token -> Text printToken = printToken' True @@ -86,11 +87,10 @@ printLeadingComment :: Comment LineFeed -> Text printLeadingComment = \case Comment raw -> raw Space n -> Text.replicate n " " - Line LF -> "\n" - Line CRLF -> "\r\n" + Line LF n -> "\n" <> Text.replicate n "\t" + Line CRLF n -> "\r\n" <> Text.replicate n "\t" -printTrailingComment :: Comment void -> Text +printTrailingComment :: Comment Void -> Text printTrailingComment = \case Comment raw -> raw Space n -> Text.replicate n " " - Line _ -> "" diff --git a/src/Language/PureScript/CST/Types.hs b/src/Language/PureScript/CST/Types.hs index ba90f7e95b..781b1d4700 100644 --- a/src/Language/PureScript/CST/Types.hs +++ b/src/Language/PureScript/CST/Types.hs @@ -21,7 +21,7 @@ import Language.PureScript.PSString (PSString) data SourcePos = SourcePos { srcLine :: {-# UNPACK #-} !Int - , srcColumn :: {-# UNPACK #-} !Int + , srcColumn :: {-# UNPACK #-} !(Int, Int) } deriving (Show, Eq, Ord, Generic, NFData) data SourceRange = SourceRange @@ -32,7 +32,7 @@ data SourceRange = SourceRange data Comment l = Comment !Text | Space {-# UNPACK #-} !Int - | Line !l + | Line !l !Int deriving (Show, Eq, Ord, Generic, Functor, NFData) data LineFeed = LF | CRLF diff --git a/src/Language/PureScript/CST/Utils.hs b/src/Language/PureScript/CST/Utils.hs index 68dcf7d87c..fd4f04ae13 100644 --- a/src/Language/PureScript/CST/Utils.hs +++ b/src/Language/PureScript/CST/Utils.hs @@ -73,7 +73,7 @@ opName n = OpName (N.coerceOpName <$> n) placeholder :: SourceToken placeholder = SourceToken - { tokAnn = TokenAnn (SourceRange (SourcePos 0 0) (SourcePos 0 0)) [] [] + { tokAnn = TokenAnn (SourceRange (SourcePos 0 (0, 0)) (SourcePos 0 (0, 0))) [] [] , tokValue = TokLowerName [] "" } From b0b29804f70153b7f4e8dec4905ea3cbff8a4a7d Mon Sep 17 00:00:00 2001 From: Artemis System Date: Fri, 13 Jun 2025 12:34:32 +0200 Subject: [PATCH 2/2] Remove failing tab indent test --- tests/purs/failing/Whitespace1.out | 10 ---------- tests/purs/failing/Whitespace1.purs | 5 ----- 2 files changed, 15 deletions(-) delete mode 100644 tests/purs/failing/Whitespace1.out delete mode 100644 tests/purs/failing/Whitespace1.purs diff --git a/tests/purs/failing/Whitespace1.out b/tests/purs/failing/Whitespace1.out deleted file mode 100644 index 299c3ddb53..0000000000 --- a/tests/purs/failing/Whitespace1.out +++ /dev/null @@ -1,10 +0,0 @@ -Error found: -at tests/purs/failing/Whitespace1.purs:5:1 - 5:2 (line 5, column 1 - line 5, column 2) - - Unable to parse module: - Illegal whitespace character U+0009 - - -See https://github.com/purescript/documentation/blob/master/errors/ErrorParsingModule.md for more information, -or to contribute content related to this error. - diff --git a/tests/purs/failing/Whitespace1.purs b/tests/purs/failing/Whitespace1.purs deleted file mode 100644 index b73805a0c7..0000000000 --- a/tests/purs/failing/Whitespace1.purs +++ /dev/null @@ -1,5 +0,0 @@ --- @shouldFailWith ErrorParsingModule -module Main where - -test = do - test