Why does my MapReduce implementation (Real World Haskell) using iteratee IO also fail with "Too many open files"?

I'm implementing a Haskell program that compares every line of a file with every other line in the file. A single-threaded version can be implemented as follows:

distance :: Int -> Int -> Int
distance a b = (a-b)*(a-b)

sumOfDistancesOnSmallFile :: FilePath -> IO Int
sumOfDistancesOnSmallFile path = do
              fileContents <- readFile path
              return $ allDistances $ map read $ lines $ fileContents
              where
                  allDistances (x:xs) = (allDistances xs) + ( sum $ map (distance x) xs)
                  allDistances _ = 0
Because I want the program to scale with file size as well, I initially used lazy IO. However, that failed with "Too many open files", which I asked about in a previous question (the GC was releasing the file handles too late). The full lazy-IO version is posted there.
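
To illustrate that failure mode, here is a minimal sketch (not the original code): readFile opens its handle immediately but only closes it once the string has been fully consumed, so opening one lazy reader per chunk before consuming anything accumulates open handles until the process limit is hit.

import Control.Monad (forM)

-- Hypothetical: open one lazy reader per chunk up front. No contents are
-- demanded until the final sum, so every handle stays open in the meantime.
main :: IO ()
main = do
    chunks <- forM [1 .. 5000 :: Int] $ \_ -> readFile "input.txt"
    print (sum (map length chunks))   -- handles only get closed here (or by GC)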

As the accepted answer there explains, strict IO solves that problem. It did indeed fix "Too many open files" for a 2k-line file, but then failed with "Out of memory" on a 50k-line file.
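
The strict variant (a sketch of the idea, not the accepted answer's exact code) forces the whole contents while the handle is still open, so the handle can be closed right away; the price is that every chunk then lives in memory simultaneously, which matches the out-of-memory failure at 50k lines.

import Control.Exception (evaluate)
import System.IO (withFile, IOMode(ReadMode), hGetContents)

-- Force the full file contents before the handle is closed.
readFileStrict :: FilePath -> IO String
readFileStrict path = withFile path ReadMode $ \h -> do
    contents <- hGetContents h
    _ <- evaluate (length contents)   -- demand everything while h is open
    return contents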

Note that the first, single-threaded implementation (without mapReduce) is able to handle the 50k file.

The alternative that interests me most is iteratee IO. I expected it to solve both the file-handle and the memory exhaustion. However, my implementation still fails with "Too many open files" on a 2k-line file.
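
As background: in the enumerator package, an Enumerator feeds chunks of a stream to an Iteratee, which consumes them incrementally in constant space. A minimal sketch of such a pipeline (assuming the package's Data.Enumerator API; unrelated to the failure itself):

import Data.Enumerator (run_, enumList, ($$))
import qualified Data.Enumerator.List as EL

-- Stream a pure list (in chunks of 256) into a summing iteratee.
sumIt :: IO Int
sumIt = run_ (enumList 256 [1 .. 10 :: Int] $$ EL.fold (+) 0)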

The iteratee-IO version has the same mapReduce function as the book, but a modified chunkedFileEnum so that it works with an Enumerator.

So my question is: what is wrong with the iteratee-IO-based implementation below? Where is the laziness?

import Control.Monad.IO.Class (liftIO)
import Control.Monad.Trans (MonadIO, liftIO)
import System.IO

import qualified Data.Enumerator.List as EL
import qualified Data.Enumerator.Text as ET
import Data.Enumerator hiding (map, filter, head, sequence)

import Data.Text(Text)
import Data.Text.Read
import Data.Maybe

import qualified Data.ByteString.Char8 as Str
import Control.Exception (bracket,finally)
import Control.Monad(forM,liftM)
import Control.Parallel.Strategies
import Control.Parallel
import Control.DeepSeq (NFData)
import Data.Int (Int64)

--Goal: in a file with n values, calculate the sum of all n*(n-1)/2 squared distances

--My operation for one value pair
distance :: Int -> Int -> Int
distance a b = (a-b)*(a-b)

combineDistances :: [Int] -> Int
combineDistances = sum

--Test file generation
createTestFile :: Int -> FilePath -> IO ()
createTestFile n path = writeFile path $ unlines $ map show $ take n $ infiniteList 0 1
        where infiniteList :: Int->Int-> [Int]
              infiniteList i j = (i + j) : infiniteList j (i+j)
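              -- Note: infiniteList 0 1 yields the Fibonacci numbers 1,2,3,5,8,...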

--Applying my operation simply on a file 
--(Actually does NOT throw an Out of memory on a file generated by createTestFile 50000)
--But I want to use multiple cores..
sumOfDistancesOnSmallFile :: FilePath -> IO Int
sumOfDistancesOnSmallFile path = do
                  fileContents <- readFile path
                  return $ allDistances $ map read $ lines $ fileContents
                  where
                      allDistances (x:xs) = (allDistances xs) + (sum $ map (distance x) xs)
                      allDistances _ = 0

--Setting up an enumerator of read values from a text stream
readerEnumerator :: (Monad m, Integral a) => Reader a -> Step a m b -> Iteratee Text m b
readerEnumerator reader = joinI . (EL.concatMapM transformer)
                            where transformer input = case reader input of
                                         Right (val, remainder) -> return [val]
                                         Left err -> return [0]

readEnumerator :: (Monad m, Integral a) => Step a m b -> Iteratee Text m b
readEnumerator = readerEnumerator (signed decimal)

--The iteratee version of my operation
distancesFirstToTailIt :: Monad m=> Iteratee Int m Int
distancesFirstToTailIt = do
    maybeNum <- EL.head
    maybe (return 0) distancesOneToManyIt maybeNum

distancesOneToManyIt :: Monad m=> Int -> Iteratee Int m Int
distancesOneToManyIt base = do
    maybeNum <- EL.head
    maybe (return 0) combineNextDistance maybeNum
    where combineNextDistance nextNum = do
              rest <- distancesOneToManyIt base
              return $ combineDistances [(distance base nextNum),rest]

--The mapreduce algorithm
mapReduce :: Strategy b -- evaluation strategy for mapping
          -> (a -> b)   -- map function
          -> Strategy c -- evaluation strategy for reduction
          -> ([b] -> c) -- reduce function
          -> [a]        -- list to map over
          -> c
mapReduce mapStrat mapFunc reduceStrat reduceFunc input =
          mapResult `pseq` reduceResult
          where mapResult    = parMap mapStrat mapFunc input
                reduceResult = reduceFunc mapResult `using` reduceStrat

--Applying the iteratee operation using mapreduce
sumOfDistancesOnFileWithIt :: FilePath -> IO Int
sumOfDistancesOnFileWithIt path = chunkedFileEnum chunkByLinesTails (distancesUsingMapReduceIt) path

distancesUsingMapReduceIt :: [Enumerator Text IO Int] -> IO Int
distancesUsingMapReduceIt = mapReduce rpar (runEnumeratorAsMapFunc)
                                      rpar (sumValuesAsReduceFunc)
                            where runEnumeratorAsMapFunc :: Enumerator Text IO Int -> IO Int
                                  runEnumeratorAsMapFunc = (\source->run_ (source $$ readEnumerator $$ distancesFirstToTailIt))
                                  sumValuesAsReduceFunc :: [IO Int] -> IO Int
                                  sumValuesAsReduceFunc = liftM sum . sequence
                              

--Working with (file)chunk enumerators:
data ChunkSpec = CS{
    chunkOffset :: !Int
    , chunkLength :: !Int
    } deriving (Eq,Show)

chunkedFileEnum :: (NFData a, MonadIO m) =>
                (FilePath-> IO [ChunkSpec])
           ->   ([Enumerator Text m b]->IO a)
           ->   FilePath
           ->   IO a
chunkedFileEnum chunkCreator funcOnChunks path = do
    (chunks, handles)<- chunkedEnum chunkCreator path
    r <- funcOnChunks chunks
    (rdeepseq r `seq` (return r)) `finally` mapM_ hClose handles

chunkedEnum ::  MonadIO m=>
                (FilePath -> IO [ChunkSpec])
            ->  FilePath
            ->  IO ([Enumerator Text m b], [Handle])
chunkedEnum chunkCreator path = do
    chunks <- chunkCreator path
    liftM unzip . forM chunks $ \spec -> do
        h <- openFile path ReadMode
        hSeek h AbsoluteSeek (fromIntegral (chunkOffset spec))
        let chunk = ET.enumHandle h --Note:chunklength not taken into account, so just to EOF
        return (chunk,h)

-- returns a set of chunks representing tails . lines . readFile
chunkByLinesTails :: FilePath -> IO [ChunkSpec]
chunkByLinesTails path = do
    bracket (openFile path ReadMode) hClose $ \h -> do
        totalSize <- fromIntegral `liftM` hFileSize h
        let chunkSize = 1
            findChunks offset = do
                let newOffset = offset + chunkSize
                hSeek h AbsoluteSeek (fromIntegral newOffset)
                let findNewline lineSeekOffset = do
                        eof <- hIsEOF h
                        if eof
                            then return [CS offset (totalSize - offset)]
                            else do
                                bytes <- Str.hGet h 256
                                case Str.elemIndex '\n' bytes of
                                    Just n -> do
                                        nextChunks <- findChunks (lineSeekOffset + n + 1)
                                        return (CS offset (totalSize - offset) : nextChunks)
                                    Nothing -> findNewline (lineSeekOffset + Str.length bytes)
                findNewline newOffset
        findChunks 0
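
For reference, a minimal driver sketch for the listing above (hypothetical file name; assumes the module compiles as posted; the post's failure already occurs at 2k lines):

main :: IO ()
main = do
    let path = "distances.txt"            -- hypothetical test file
    createTestFile 2000 path              -- 2k lines already triggers the failure
    result <- sumOfDistancesOnFileWithIt path
    print result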
As the error says: too many open files. I expected Haskell to run most of the program sequentially and to evaluate only some "sparks" in parallel. However, as sclv mentioned, Haskell always sparks the evaluations.

That is usually not a problem in a purely functional program, but it is when the sparks deal with IO (resources). I scaled the parallelism described in the Real World Haskell book up far too much. So my conclusion is: when sparks handle IO resources, parallelize only on a limited scale; in the purely functional parts, excessive parallelism can succeed.

So the answer to my post is: don't apply MapReduce across the resource-handling (IO) layer of the program, but inside its purely functional parts.
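
To make that concrete, here is a sketch of the restructuring (my own illustration, under the assumption that the whole file fits in memory): do the IO sequentially through a single handle, then confine the Strategies parallelism to the pure distance computation, so no spark ever owns a file handle.

import Control.Parallel.Strategies (parMap, rdeepseq)
import Data.List (tails)

distance :: Int -> Int -> Int
distance a b = (a - b) * (a - b)

-- Sequential IO (one handle), parallel pure computation.
sumOfDistancesPureParallel :: FilePath -> IO Int
sumOfDistancesPureParallel path = do
    nums <- (map read . lines) `fmap` readFile path :: IO [Int]
    return $ sum $ parMap rdeepseq oneToMany (tails nums)
  where
    oneToMany (x:xs) = sum (map (distance x) xs)
    oneToMany []     = 0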
A cost-centre profile of the run shows where the time and allocation went:

                                                                                               individual    inherited
COST CENTRE              MODULE                                               no.    entries  %time %alloc   %time %alloc

MAIN                     MAIN                                                   1            0   0.0    0.3   100.0  100.0
  main                    Main                                                1648           2   0.0    0.0    50.0   98.9
    sumOfDistancesOnFileWithIt MapReduceTest                                  1649           1   0.0    0.0    50.0   98.9
      chunkedFileEnum       MapReduceTest                                     1650           1   0.0    0.0    50.0   98.9
        chunkedEnum          MapReduceTest                                    1651         495   0.0   24.2    50.0   98.9
          lineOffsets         MapReduceTest                                   1652           1  50.0   74.6    50.0   74.6