Haskell程序在尝试使用Aeson解析115MB JSON文件时内存不足

Haskell程序在尝试使用Aeson解析115MB JSON文件时内存不足,json,parsing,haskell,out-of-memory,aeson,Json,Parsing,Haskell,Out Of Memory,Aeson,我的Haskell程序在尝试解析115MB JSON文件时内存不足。我怀疑我正在做一些你不应该在Haskell中做的事情-在程序的前一步,我耗尽了内存,因为我是在Strings而不是ByteStrings上操作的-但我无法弄清楚是什么 我已将我的程序浓缩为以下MWE: {-# LANGUAGE GeneralizedNewtypeDeriving, OverloadedStrings, FlexibleInstances #-} --------------------------------

我的Haskell程序在尝试解析115MB JSON文件时内存不足。我怀疑我正在做一些你不应该在Haskell中做的事情-在程序的前一步,我耗尽了内存,因为我是在
String
s而不是
ByteString
s上操作的-但我无法弄清楚是什么

我已将我的程序浓缩为以下MWE:

{-# LANGUAGE GeneralizedNewtypeDeriving, OverloadedStrings, FlexibleInstances #-}

----------------------------------------
-- Imports
----------------------------------------

import System.Environment
  ( getArgs )
import Control.Monad
  ( mzero
  , when
  )

import qualified Data.ByteString.Lazy.Char8 as BS
import Data.Aeson
import Data.Maybe
import Data.Scientific
  ( Scientific )

import qualified Data.Map as M
import qualified Data.Text as T
import qualified Data.Vector as V


----------------------------------------
-- Data types
----------------------------------------

newtype Natural
  = Natural Integer
  deriving (Show, Eq, Ord)

instance Num Natural where
    fromInteger = toNatural
    x + y = toNatural (fromNatural x + fromNatural y)
    x - y = let r = fromNatural x - fromNatural y
            in if r < 0
               then error "Subtraction yielded a negative value"
               else toNatural r
    x * y = toNatural (fromNatural x * fromNatural y)
    abs x = x
    signum x = toNatural $ signum $ fromNatural x

instance Enum Natural where
  toEnum = toNatural . toInteger
  fromEnum = fromInteger . fromNatural

instance Real Natural where
  toRational (Natural i) = toRational i

instance Integral Natural where
  quotRem (Natural x) (Natural y) =
    ( toNatural $ quot x y
    , toNatural $ rem x y
    )
  toInteger (Natural i) = i

instance FromJSON Natural where
  parseJSON (Number sn) = return $ sn2nat sn
  parseJSON _ = mzero

instance ToJSON Natural where
  toJSON i = toJSON (fromNatural i)

----------------------------------------

data PatternMatchset
  = PatternMatchset
      { pmTarget :: TargetMachineID
      , pmMatches :: [PatternMatch]
      , pmTime :: Maybe Double
      }
  deriving (Show)

instance FromJSON PatternMatchset where
  parseJSON (Object v) =
    PatternMatchset
      <$> v .: "target-machine-id"
      <*> v .: "match-data"
      <*> v .: "time"
  parseJSON _ = mzero

instance ToJSON PatternMatchset where
  toJSON m =
    object [ "target-machine-id" .= (pmTarget m)
           , "match-data"        .= (pmMatches m)
           , "time"              .= (pmTime m)
           ]

----------------------------------------

data PatternMatch
  = PatternMatch
      { pmInstrID :: InstructionID
      , pmMatchID :: MatchID
      , pmMatch :: Match NodeID
      }
  deriving (Show)

instance FromJSON PatternMatch where
  parseJSON (Object v) =
    PatternMatch
      <$> v .: "instr-id"
      <*> v .: "match-id"
      <*> v .: "match"
  parseJSON _ = mzero

instance ToJSON PatternMatch where
  toJSON m =
    object [ "instr-id"   .= (pmInstrID m)
           , "match-id"   .= (pmMatchID m)
           , "match"      .= (pmMatch m)
           ]

----------------------------------------

data Match n
  = Match { f2pMaps :: M.Map n [n]
          , p2fMaps :: M.Map n [n]
          }
  deriving (Show, Eq, Ord)

instance FromJSON (Match NodeID) where
  parseJSON v@(Array _) =
    do list <- parseJSON v
       return $ toMatch list
  parseJSON _ = mzero

instance ToJSON (Match NodeID) where
  toJSON m = toJSON $ fromMatch m

----------------------------------------

data Mapping n
  = Mapping
      { fNode :: n
      , pNode :: n
      }
  deriving (Show, Eq, Ord)

instance FromJSON (Mapping NodeID) where
  parseJSON v@(Array _) =
    do list <- parseJSON v
       when (length list /= 2) mzero
       return Mapping { fNode = head list
                      , pNode = last list
                      }
  parseJSON _ = mzero

instance ToJSON (Mapping NodeID) where
  toJSON m = Array (V.fromList [toJSON $ fNode m, toJSON $ pNode m])

----------------------------------------

newtype MatchID
  = MatchID Natural
  deriving (Show, Eq, Ord, Num, Enum, Real, Integral)

instance FromJSON MatchID where
  parseJSON (Number sn) = return $ toMatchID $ sn2nat sn
  parseJSON _ = mzero

instance ToJSON MatchID where
  toJSON mid = toJSON (fromMatchID mid)

----------------------------------------

newtype NodeID
  = NodeID Natural
  deriving (Show, Eq, Ord, Num, Enum, Real, Integral)

instance FromJSON NodeID where
  parseJSON (Number sn) = return $ toNodeID $ sn2nat sn
  parseJSON _ = mzero

instance ToJSON NodeID where
  toJSON mid = toJSON (fromNodeID mid)

----------------------------------------

newtype InstructionID
  = InstructionID Natural
  deriving (Show, Eq, Ord, Num, Enum, Real, Integral)

instance FromJSON InstructionID where
  parseJSON (Number sn) = return $ toInstructionID $ sn2nat sn
  parseJSON _ = mzero

instance ToJSON InstructionID where
  toJSON mid = toJSON (fromInstructionID mid)

----------------------------------------

newtype TargetMachineID
  = TargetMachineID String
  deriving (Show, Eq)

instance FromJSON TargetMachineID where
  parseJSON (String s) = return $ toTargetMachineID $ T.unpack s
  parseJSON _ = mzero

instance ToJSON TargetMachineID where
  toJSON tmid = toJSON (fromTargetMachineID tmid)


----------------------------------------
-- Help functions
----------------------------------------

-- | Converts an 'Integral' into a 'Natural'. If conversion fails, 'Nothing' is
-- returned.
maybeToNatural :: (Integral i) => i -> Maybe Natural
maybeToNatural x
  | x < 0     = Nothing
  | otherwise = Just $ Natural $ toInteger x

-- | Converts an 'Integral' into a 'Natural'. If conversion fails, an error is
-- reported.
toNatural :: (Integral i) => i -> Natural
toNatural x =
  let n = maybeToNatural x
  in if isJust n
     then fromJust n
     else error $ "toNatural: negative number: " ++
                  show (toInteger x :: Integer)

-- | Converts a 'Natural' into an 'Integer'.
fromNatural :: Natural -> Integer
fromNatural (Natural i) = i

-- | Converts a scientific number to a natural number. If the number is not an
-- non-negative then an error occurs.
sn2nat :: Scientific -> Natural
sn2nat sn =
  let int_value = round sn
  in if fromInteger int_value /= sn
     then error $ "sn2nat: not an integer: " ++ show sn
     else toNatural int_value

fromTargetMachineID :: TargetMachineID -> String
fromTargetMachineID (TargetMachineID i) = i

toTargetMachineID :: String -> TargetMachineID
toTargetMachineID = TargetMachineID

fromMatchID :: MatchID -> Natural
fromMatchID (MatchID i) = i

toMatchID :: (Integral i) => i -> MatchID
toMatchID = MatchID . toNatural

fromNodeID :: NodeID -> Natural
fromNodeID (NodeID i) = i

toNodeID :: (Integral i) => i -> NodeID
toNodeID = NodeID . toNatural

fromInstructionID :: InstructionID -> Natural
fromInstructionID (InstructionID i) = i

toInstructionID :: (Integral i) => i -> InstructionID
toInstructionID = InstructionID . toNatural

toMatch :: Ord n => [Mapping n] -> Match n
toMatch ms =
  let insert (n1, n2) m = M.insertWith (++) n1 [n2] m
  in Match { f2pMaps = foldr insert M.empty $
                       map (\m -> (fNode m, pNode m)) ms
           , p2fMaps = foldr insert M.empty $
                       map (\m -> (pNode m, fNode m)) ms
           }

fromMatch :: Ord n => Match n -> [Mapping n]
fromMatch m =
  M.foldrWithKey
    (\fn pns ms -> (ms ++ map (\pn -> Mapping { fNode = fn, pNode = pn }) pns))
    []
    (f2pMaps m)


----------------------------------------
-- Main program
----------------------------------------

main :: IO ()
main =
  do args <- getArgs
     when (length args == 0) $
       error $ "No input file"
     when (length args > 1) $
       error $ "Too many arguments"
     let file = head args
     str <- BS.readFile file
     let pmset = decode str
     when (isNothing pmset) $
       error $ "Failed to parse JSON"
     putStrLn $ BS.unpack $ encode (fromJust pmset :: PatternMatchset)
上面的程序只是解析JSON文件,将其转换回JSON并打印数据。要获得更大的输入文件,只需复制粘贴
匹配数据列表中的对象并将其附加到列表中即可

我尝试使用-O2标志编译程序,但没有效果。

尝试更改:

putStrLn $ BS.unpack $ encode (fromJust pmset :: PatternMatchset)


前者使我的机器陷入地狱。后者完成得很好。

FYI,您在示例输入中的
匹配数据
列表后缺少一个逗号。@pat:确实是这样。现在修好了。谢谢。啊,是的,还有一个
字符串要处理。不幸的是,在我的机器上,我仍然没有足够的内存来进行大的输入…您使用的是什么版本的编译器?我使用的是堆栈1.5.1下的ghc 8.0.2。进程的最大常驻集大小为2621968384字节,输入文件为115688713字节。您有多少物理内存?有多少交换空间?我有16GB,不知道交换的情况。我还突然想到,我正在使用的文件没有空格,这意味着在上面的格式中,它扩展到大约500MB。
putStrLn $ BS.unpack $ encode (fromJust pmset :: PatternMatchset)
 BS.putStrLn $ encode (fromJust pmset :: PatternMatchset)