1

My Haskell program runs out of memory when trying to parse a 115MB JSON file. I suspect I'm doing something that you shouldn't do in Haskell - at an earlier step in the program, I ran out of memory because I was operating on Strings instead of ByteStrings - but I am unable to figure out what.

I've condensed my program down into the following MWE:

{-# LANGUAGE GeneralizedNewtypeDeriving, OverloadedStrings, FlexibleInstances #-}

----------------------------------------
-- Imports
----------------------------------------

import System.Environment
  ( getArgs )
import Control.Monad
  ( mzero
  , when
  )

import qualified Data.ByteString.Lazy.Char8 as BS
import Data.Aeson
import Data.Maybe
import Data.Scientific
  ( Scientific )

import qualified Data.Map as M
import qualified Data.Text as T
import qualified Data.Vector as V


----------------------------------------
-- Data types
----------------------------------------

newtype Natural
  = Natural Integer
  deriving (Show, Eq, Ord)

instance Num Natural where
    fromInteger = toNatural
    x + y = toNatural (fromNatural x + fromNatural y)
    x - y = let r = fromNatural x - fromNatural y
            in if r < 0
               then error "Subtraction yielded a negative value"
               else toNatural r
    x * y = toNatural (fromNatural x * fromNatural y)
    abs x = x
    signum x = toNatural $ signum $ fromNatural x

instance Enum Natural where
  toEnum = toNatural . toInteger
  fromEnum = fromInteger . fromNatural

instance Real Natural where
  toRational (Natural i) = toRational i

instance Integral Natural where
  quotRem (Natural x) (Natural y) =
    ( toNatural $ quot x y
    , toNatural $ rem x y
    )
  toInteger (Natural i) = i

instance FromJSON Natural where
  parseJSON (Number sn) = return $ sn2nat sn
  parseJSON _ = mzero

instance ToJSON Natural where
  toJSON i = toJSON (fromNatural i)

----------------------------------------

data PatternMatchset
  = PatternMatchset
      { pmTarget :: TargetMachineID
      , pmMatches :: [PatternMatch]
      , pmTime :: Maybe Double
      }
  deriving (Show)

instance FromJSON PatternMatchset where
  parseJSON (Object v) =
    PatternMatchset
      <$> v .: "target-machine-id"
      <*> v .: "match-data"
      <*> v .: "time"
  parseJSON _ = mzero

instance ToJSON PatternMatchset where
  toJSON m =
    object [ "target-machine-id" .= (pmTarget m)
           , "match-data"        .= (pmMatches m)
           , "time"              .= (pmTime m)
           ]

----------------------------------------

data PatternMatch
  = PatternMatch
      { pmInstrID :: InstructionID
      , pmMatchID :: MatchID
      , pmMatch :: Match NodeID
      }
  deriving (Show)

instance FromJSON PatternMatch where
  parseJSON (Object v) =
    PatternMatch
      <$> v .: "instr-id"
      <*> v .: "match-id"
      <*> v .: "match"
  parseJSON _ = mzero

instance ToJSON PatternMatch where
  toJSON m =
    object [ "instr-id"   .= (pmInstrID m)
           , "match-id"   .= (pmMatchID m)
           , "match"      .= (pmMatch m)
           ]

----------------------------------------

data Match n
  = Match { f2pMaps :: M.Map n [n]
          , p2fMaps :: M.Map n [n]
          }
  deriving (Show, Eq, Ord)

instance FromJSON (Match NodeID) where
  parseJSON v@(Array _) =
    do list <- parseJSON v
       return $ toMatch list
  parseJSON _ = mzero

instance ToJSON (Match NodeID) where
  toJSON m = toJSON $ fromMatch m

----------------------------------------

data Mapping n
  = Mapping
      { fNode :: n
      , pNode :: n
      }
  deriving (Show, Eq, Ord)

instance FromJSON (Mapping NodeID) where
  parseJSON v@(Array _) =
    do list <- parseJSON v
       when (length list /= 2) mzero
       return Mapping { fNode = head list
                      , pNode = last list
                      }
  parseJSON _ = mzero

instance ToJSON (Mapping NodeID) where
  toJSON m = Array (V.fromList [toJSON $ fNode m, toJSON $ pNode m])

----------------------------------------

newtype MatchID
  = MatchID Natural
  deriving (Show, Eq, Ord, Num, Enum, Real, Integral)

instance FromJSON MatchID where
  parseJSON (Number sn) = return $ toMatchID $ sn2nat sn
  parseJSON _ = mzero

instance ToJSON MatchID where
  toJSON mid = toJSON (fromMatchID mid)

----------------------------------------

newtype NodeID
  = NodeID Natural
  deriving (Show, Eq, Ord, Num, Enum, Real, Integral)

instance FromJSON NodeID where
  parseJSON (Number sn) = return $ toNodeID $ sn2nat sn
  parseJSON _ = mzero

instance ToJSON NodeID where
  toJSON mid = toJSON (fromNodeID mid)

----------------------------------------

newtype InstructionID
  = InstructionID Natural
  deriving (Show, Eq, Ord, Num, Enum, Real, Integral)

instance FromJSON InstructionID where
  parseJSON (Number sn) = return $ toInstructionID $ sn2nat sn
  parseJSON _ = mzero

instance ToJSON InstructionID where
  toJSON mid = toJSON (fromInstructionID mid)

----------------------------------------

newtype TargetMachineID
  = TargetMachineID String
  deriving (Show, Eq)

instance FromJSON TargetMachineID where
  parseJSON (String s) = return $ toTargetMachineID $ T.unpack s
  parseJSON _ = mzero

instance ToJSON TargetMachineID where
  toJSON tmid = toJSON (fromTargetMachineID tmid)


----------------------------------------
-- Help functions
----------------------------------------

-- | Converts an 'Integral' into a 'Natural'. If conversion fails, 'Nothing' is
-- returned.
maybeToNatural :: (Integral i) => i -> Maybe Natural
maybeToNatural x
  | x < 0     = Nothing
  | otherwise = Just $ Natural $ toInteger x

-- | Converts an 'Integral' into a 'Natural'. If conversion fails, an error is
-- reported.
toNatural :: (Integral i) => i -> Natural
toNatural x =
  let n = maybeToNatural x
  in if isJust n
     then fromJust n
     else error $ "toNatural: negative number: " ++
                  show (toInteger x :: Integer)

-- | Converts a 'Natural' into an 'Integer'.
fromNatural :: Natural -> Integer
fromNatural (Natural i) = i

-- | Converts a scientific number to a natural number. If the number is not an
-- non-negative then an error occurs.
sn2nat :: Scientific -> Natural
sn2nat sn =
  let int_value = round sn
  in if fromInteger int_value /= sn
     then error $ "sn2nat: not an integer: " ++ show sn
     else toNatural int_value

fromTargetMachineID :: TargetMachineID -> String
fromTargetMachineID (TargetMachineID i) = i

toTargetMachineID :: String -> TargetMachineID
toTargetMachineID = TargetMachineID

fromMatchID :: MatchID -> Natural
fromMatchID (MatchID i) = i

toMatchID :: (Integral i) => i -> MatchID
toMatchID = MatchID . toNatural

fromNodeID :: NodeID -> Natural
fromNodeID (NodeID i) = i

toNodeID :: (Integral i) => i -> NodeID
toNodeID = NodeID . toNatural

fromInstructionID :: InstructionID -> Natural
fromInstructionID (InstructionID i) = i

toInstructionID :: (Integral i) => i -> InstructionID
toInstructionID = InstructionID . toNatural

toMatch :: Ord n => [Mapping n] -> Match n
toMatch ms =
  let insert (n1, n2) m = M.insertWith (++) n1 [n2] m
  in Match { f2pMaps = foldr insert M.empty $
                       map (\m -> (fNode m, pNode m)) ms
           , p2fMaps = foldr insert M.empty $
                       map (\m -> (pNode m, fNode m)) ms
           }

fromMatch :: Ord n => Match n -> [Mapping n]
fromMatch m =
  M.foldrWithKey
    (\fn pns ms -> (ms ++ map (\pn -> Mapping { fNode = fn, pNode = pn }) pns))
    []
    (f2pMaps m)


----------------------------------------
-- Main program
----------------------------------------

main :: IO ()
main =
  do args <- getArgs
     when (length args == 0) $
       error $ "No input file"
     when (length args > 1) $
       error $ "Too many arguments"
     let file = head args
     str <- BS.readFile file
     let pmset = decode str
     when (isNothing pmset) $
       error $ "Failed to parse JSON"
     putStrLn $ BS.unpack $ encode (fromJust pmset :: PatternMatchset)

The input is of the following format:

{
  "match-data": [
    {
      "instr-id": 31,
      "match": [
        [2354, 5],
        [2343, 3],
        [2341, 10],
        [2340, 9],
        [1478, 8],
        [1476, 6]
      ],
      "match-id": 0
    }
  ],
  "target-machine-id": "Architecture",
  "time": 27.642428397
}

The program above simply parses the JSON file, converts it back to JSON and prints the data. To get a larger input file, simply copy-paste the object within the match-data list and append it to the list.

I've tried compiling the program using -O2 flag, to no avail.

2
  • FYI, you are missing a comma after the match-data list in the example input. Commented Nov 28, 2017 at 15:53
  • @pat: Indeed I am. Fixed now. Thanks. Commented Nov 28, 2017 at 16:29

1 Answer 1

1

Try changing:

putStrLn $ BS.unpack $ encode (fromJust pmset :: PatternMatchset)

to

 BS.putStrLn $ encode (fromJust pmset :: PatternMatchset)

The former caused my machine to go into swap hell. The latter completed just fine.

Sign up to request clarification or add additional context in comments.

5 Comments

Ah yes, another String to get rid of. Unfortunately, on my machine I still run out of memory for the large input...
What version of what compiler are you using?
I'm using ghc 8.0.2 under stack 1.5.1. The process had a maximum resident set size of 2,621,968,384 bytes, with a 115,688,713 byte input file.
How much physical memory do you have? How much swap space?
I've got 16GB, dunno about the swap. I also just occurred to me that the file I'm using has no whitespace, meaning that in the format above it expands to about 500MB.

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.