Regex Haskell Alex-正则表达式匹配错误的字符串?
我正在尝试为基于缩进的语法编写lexer,但在匹配缩进时遇到了问题 这是我的密码:Regex Haskell Alex-正则表达式匹配错误的字符串?,regex,haskell,lexical-analysis,alex,Regex,Haskell,Lexical Analysis,Alex,我正在尝试为基于缩进的语法编写lexer,但在匹配缩进时遇到了问题 这是我的密码: { module Lexer ( main ) where import System.IO.Unsafe } %wrapper "monadUserState" $whitespace = [\ \t\b] $digit = 0-9 -- digits $alpha = [A-Za-z] $lett
{
module Lexer ( main ) where
import System.IO.Unsafe
}
%wrapper "monadUserState"
$whitespace = [\ \t\b]
$digit = 0-9 -- digits
$alpha = [A-Za-z]
$letter = [a-zA-Z] -- alphabetic characters
$ident = [$letter $digit _] -- identifier character
$indent = [\ \t]
@number = [$digit]+
@identifier = $alpha($alpha|_|$digit)*
error:-
@identifier { mkL LVarId }
\n $whitespace* \n { skip }
\n $whitespace* { setIndent }
$whitespace+ { skip }
{
data Lexeme = Lexeme AlexPosn LexemeClass (Maybe String)
instance Show Lexeme where
show (Lexeme _ LEOF _) = " Lexeme EOF"
show (Lexeme p cl mbs) = " Lexeme class=" ++ show cl ++ showap p ++ showst mbs
where
showap pp = " posn=" ++ showPosn pp
showst Nothing = ""
showst (Just s) = " string=" ++ show s
instance Eq Lexeme where
(Lexeme _ cls1 _) == (Lexeme _ cls2 _) = cls1 == cls2
showPosn :: AlexPosn -> String
showPosn (AlexPn _ line col) = show line ++ ':': show col
tokPosn :: Lexeme -> AlexPosn
tokPosn (Lexeme p _ _) = p
data LexemeClass
= LVarId
| LTIndent Int
| LTDedent Int
| LIndent
| LDedent
| LEOF
deriving (Show, Eq)
mkL :: LexemeClass -> AlexInput -> Int -> Alex Lexeme
mkL c (p, _, _, str) len = return (Lexeme p c (Just (take len str)))
data AlexUserState = AlexUserState { indent :: Int }
alexInitUserState :: AlexUserState
alexInitUserState = AlexUserState 0
type Action = AlexInput -> Int -> Alex Lexeme
getLexerIndentLevel :: Alex Int
getLexerIndentLevel = Alex $ \s@AlexState{alex_ust=ust} -> Right (s, indent ust)
setLexerIndentLevel :: Int -> Alex ()
setLexerIndentLevel i = Alex $ \s@AlexState{alex_ust=ust} -> Right (s{alex_ust=(AlexUserState i)}, ())
setIndent :: Action
setIndent input@(p, _, _, str) i = do
--let !x = unsafePerformIO $ putStrLn $ "|matched string: " ++ str ++ "|"
lastIndent <- getLexerIndentLevel
currIndent <- countIndent (drop 1 str) 0 -- first char is always \n
if (lastIndent < currIndent) then
do setLexerIndentLevel currIndent
mkL (LTIndent (currIndent - lastIndent)) input i
else if (lastIndent > currIndent) then
do setLexerIndentLevel currIndent
mkL (LTDedent (lastIndent - currIndent)) input i
else alexMonadScan
where
countIndent str total
| take 1 str == "\t" = do skip input 1
countIndent (drop 1 str) (total+1)
| take 4 str == " " = do skip input 4
countIndent (drop 4 str) (total+1)
| otherwise = return total
alexEOF :: Alex Lexeme
alexEOF = return (Lexeme undefined LEOF Nothing)
scanner :: String -> Either String [Lexeme]
scanner str =
let loop = do
tok@(Lexeme _ cl _) <- alexMonadScan
if (cl == LEOF)
then return [tok]
else do toks <- loop
return (tok:toks)
in runAlex str loop
addIndentations :: [Lexeme] -> [Lexeme]
addIndentations (lex@(Lexeme pos (LTIndent c) _):ls) =
concat [iter lex c, addIndentations ls]
where iter lex c = if c == 0 then []
else (Lexeme pos LIndent Nothing):(iter lex (c-1))
addIndentations (lex@(Lexeme pos (LTDedent c) _):ls) =
concat [iter lex c, addIndentations ls]
where iter lex c = if c == 0 then []
else (Lexeme pos LDedent Nothing):(iter lex (c-1))
addIndentations (l:ls) = l:(addIndentations ls)
addIndentations [] = []
main = do
s <- getContents
return ()
print $ fmap addIndentations (scanner s)
}
因此,调用setIndent
时不只是使用空格。在返回缩进的词素后,字符串的其他部分被省略
这是亚历克斯的错误吗?或者我做错了什么?所以我没有详细分析您的代码,但我确实注意到:
setIndent :: Action
setIndent input@(p, _, _, str) i = do
--let !x = unsafePerformIO $ putStrLn $ "|matched string: " ++ str ++ "|"
请注意,str
是输入的其余部分,而不仅仅是当前标记。要获取当前令牌,您需要take i str
。也许这会给您一种印象,即令牌匹配的输入比实际情况更多
当然,我们在GHC自己的lexer中处理缩进,所以您可能会想(尽管正如您所料,它相当大和复杂)。谢谢您的回复,所以我更改了代码,现在它正在处理
take I str
,但没有任何更改。setIndent
返回缩进标记后,Alex会忽略该行的其余部分(因为Alex操作函数不会决定输入字符串的下一部分,我认为它与我的setIndent
函数无关),您的正则表达式表示它匹配一个换行符,后跟零个或多个空格字符,这看起来与调试输出中发生的情况一模一样。它匹配空格字符,但也匹配空格字符后面的一些附加字符,并且这些字符没有被匹配换行后空格字符的正则表达式后面的正则表达式匹配。如果您实际运行代码并键入一些缩进行(缩进是制表符的4个空格),您将看到缩进标记之后的标记不知何故被省略了。
setIndent :: Action
setIndent input@(p, _, _, str) i = do
--let !x = unsafePerformIO $ putStrLn $ "|matched string: " ++ str ++ "|"