2015-07-21 61 views
9

我正試圖寫入一個非常大量的數據到恆定內存中的文件。爲什麼不能在常量內存中運行?

import qualified Data.ByteString.Lazy as B 

{- Creates and writes num grids of dimensions aa x aa -} 
writeGrids :: Int -> Int -> IO() 
writeGrids num aa = do 
    rng <- newPureMT 
    let (grids,shuffleds) = createGrids rng aa 
    createDirectoryIfMissing True "data/grids/" 
    B.writeFile (gridFileName num aa) 
       (encode (take num grids)) 
    B.writeFile (shuffledFileName num aa) 
       (encode (take num shuffleds)) 

但是,這會消耗與num大小成正比的內存。我知道createGrids是一個足夠懶惰的函數,因爲我已經通過將error "not lazy enough"(如Haskell wiki here建議的)附加到它返回的列表末尾來測試它,並且不會引發錯誤。 take是在Data.List中定義的惰性函數。 encode也是在Data.Binary中定義的懶惰函數。 B.writeFileData.ByteString.Lazy中定義。

下面是完整的代碼,以便能夠執行它:

import Control.Arrow (first) 
import Data.Binary 
import GHC.Float (double2Float) 
import System.Random (next) 
import System.Random.Mersenne.Pure64 (PureMT, newPureMT, randomDouble) 
import System.Random.Shuffle (shuffle') 
import qualified Data.ByteString.Lazy as B 

main :: IO() 
main = writeGrids 1000 64 

{- Creates and writes num grids of dimensions aa x aa -} 
writeGrids :: Int -> Int -> IO() 
writeGrids num aa = do 
    rng <- newPureMT 
    let (grids,shuffleds) = createGrids rng aa 
    B.writeFile "grids.bin" (encode (take num grids)) 
    B.writeFile "shuffleds.bin" (encode (take num shuffleds)) 

{- a random number generator, dimension of grids to make 
    returns a pair of lists, the first is a list of grids of dimensions 
    aa x aa, the second is a list of the shuffled grids corresponding to the first list -} 
createGrids :: PureMT -> Int -> ([[(Float,Float)]],[[(Float,Float)]]) 
createGrids rng aa = (grids,shuffleds) where 
     rs = randomFloats rng 
     grids = map (getGridR aa) (chunksOf (2 * aa * aa) rs) 
     shuffleds = shuffler (aa * aa) rng grids 

{- length of each grid, a random number generator, a list of grids 
    returns a the list with each grid shuffled -} 
shuffler :: Int -> PureMT -> [[(Float,Float)]] -> [[(Float,Float)]] 
shuffler n rng (xs:xss) = shuffle' xs n rng : shuffler n (snd (next rng))   xss 
shuffler _ _ [] = [] 

{- divides list into chunks of size n -} 
chunksOf :: Int -> [a] -> [[a]] 
chunksOf n = go 
    where go xs = case splitAt n xs of 
       (ys,zs) | null ys -> [] 
         | otherwise -> ys : go zs 

{- dimension of grid, list of random floats [0,1] 
    returns a list of (x,y) points of length n^2 such that all 
    points are in the range [0,1] and the points are a randomly 
    perturbed regular grid -} 
getGridR :: Int -> [Float] -> [(Float,Float)] 
getGridR n rs = pts where 
    nn = n * n 
    (irs,jrs) = splitAt nn rs 
    n' = fromIntegral n 
    grid = [ (p,q) | p <- [0..n'-1], q <- [0..n'-1] ] 
    pts = zipWith (\(p,q) (ir,jr) -> ((p+ir)/n',(q+jr)/n')) grid (zip irs jrs) 

{- an infinite list of random floats in range [0,1] -} 
randomFloats :: PureMT -> [Float] 
randomFloats rng = let (d,rng') = first double2Float (randomDouble rng) 
        in d : randomFloats rng' 

所需的軟件包是: ,字節串 ,二進制 ,隨機 ,梅森 - 隨機pure64 ,隨機洗牌

回答

10

內存使用的兩個原因:

第一個Data.Binary.encode似乎不會在恆定的空間中運行。下面的程序使用910 MB內存:

import Data.Binary 
import qualified Data.ByteString.Lazy as B 

len = 10000000 :: Int 

main = B.writeFile "grids.bin" $ encode [0..len] 

如果我們從len留下0出來,我們得到97 MB的內存使用情況。

與此相反,下面的程序使用1 MB:

import qualified Data.ByteString.Lazy.Char8 as B 

main = B.writeFile "grids.bin" $ B.pack $ show [0..(1000000::Int)] 

,在你的程序shuffleds包含的內容grids參考,其中防止grids垃圾收集。所以當我們打印grids時,我們也對它進行評估,然後它必須坐在內存中直到我們完成打印shuffleds。以下版本的程序仍然消耗大量內存,但如果我們用B.writeFile註釋掉兩行中的一行,它會使用恆定的空間。

import qualified Data.ByteString.Lazy.Char8 as B 

writeGrids :: Int -> Int -> IO() 
writeGrids num aa = do 
    rng <- newPureMT 
    let (grids,shuffleds) = createGrids rng aa 
    B.writeFile "grids.bin" (B.pack $ show (take num grids)) 
    B.writeFile "shuffleds.bin" (B.pack $ show (take num shuffleds)) 
+8

重新第一點:該'Binary'實例'[α]'迫使列表與'length' – jberryman

+5

二進制編碼字節串的格式的脊柱'word64與大端長度|| data'。這解釋了jberryman提到的嚴格脊柱。人們可以很容易地對塊進行序列化並終止布爾值,從而使編碼形式像「Word16 big endian chunk size ||塊||假||塊|| .. || TRUE'。這應該能夠以O(塊大小)運行。要保持以前的格式,你必須重寫第一個字段,使你的IO稍微複雜一些。 –

6

爲了什麼是值得的,這裏是一個完整的解決方案,結合每個人的想法。內存消耗恆定在〜6MB(編譯-O2)。在變化

import Control.Arrow (first) 
import Control.Monad.State (state, evalState) 
import Data.Binary 
import GHC.Float (double2Float) 
import System.Random (next) 
import System.Random.Mersenne.Pure64 (PureMT, newPureMT, randomDouble) 
import System.Random.Shuffle (shuffle') 
import qualified Data.ByteString as B (hPut) 
import qualified Pipes.Binary as P (encode) 
import qualified Pipes.Prelude as P (zip, mapM, drain) 
import Pipes (runEffect, (>->)) 
import System.IO (withFile, IOMode(AppendMode)) 

main :: IO() 
main = writeGrids 1000 64 

{- Creates and writes num grids of dimensions aa x aa -} 
writeGrids :: Int -> Int -> IO() 
writeGrids num aa = do 
    rng <- newPureMT 
    let (grids, shuffleds) = createGrids rng aa 
     gridFile = "grids.bin" 
     shuffledFile = "shuffleds.bin" 
     encoder = P.encode . SerList . take num 
    writeFile gridFile "" 
    writeFile shuffledFile "" 
    withFile gridFile AppendMode $ \hGr -> 
     withFile shuffledFile AppendMode $ \hSh -> 
      runEffect 
       $ P.zip (encoder grids) (encoder shuffleds) 
       >-> P.mapM (\(ch1, ch2) -> B.hPut hGr ch1 >> B.hPut hSh ch2) 
       >-> P.drain -- discards the stream of() results. 

{- a random number generator, dimension of grids to make 
    returns a pair of lists, the first is a list of grids of dimensions 
    aa x aa, the second is a list of the shuffled grids corresponding to the first list -} 
createGrids :: PureMT -> Int -> ([[(Float,Float)]], [[(Float,Float)]]) 
createGrids rng aa = unzip gridsAndShuffleds where 
     rs = randomFloats rng 
     grids = map (getGridR aa) (chunksOf (2 * aa * aa) rs) 
     gridsAndShuffleds = shuffler (aa * aa) rng grids 

{- length of each grid, a random number generator, a list of grids 
    returns a the list with each grid shuffled -} 
shuffler :: Int -> PureMT -> [[(Float,Float)]] -> [([(Float,Float)], [(Float,Float)])] 
shuffler n rng xss = evalState (traverse oneShuffle xss) rng 
    where 
    oneShuffle xs = state $ \r -> ((xs, shuffle' xs n r), snd (next r)) 

newtype SerList a = SerList { runSerList :: [a] } 
    deriving (Show) 

instance Binary a => Binary (SerList a) where 
    put (SerList (x:xs)) = put False >> put x >> put (SerList xs) 
    put _    = put True 
    get = do 
     stop <- get :: Get Bool 
     if stop 
      then return (SerList []) 
      else do 
       x   <- get 
       SerList xs <- get 
       return (SerList (x : xs)) 

{- divides list into chunks of size n -} 
chunksOf :: Int -> [a] -> [[a]] 
chunksOf n = go 
    where go xs = case splitAt n xs of 
       (ys,zs) | null ys -> [] 
         | otherwise -> ys : go zs 

{- dimension of grid, list of random floats [0,1] 
    returns a list of (x,y) points of length n^2 such that all 
    points are in the range [0,1] and the points are a randomly 
    perturbed regular grid -} 
getGridR :: Int -> [Float] -> [(Float,Float)] 
getGridR n rs = pts where 
    nn = n * n 
    (irs,jrs) = splitAt nn rs 
    n' = fromIntegral n 
    grid = [ (p,q) | p <- [0..n'-1], q <- [0..n'-1] ] 
    pts = zipWith (\(p,q) (ir,jr) -> ((p+ir)/n',(q+jr)/n')) grid (zip irs jrs) 

{- an infinite list of random floats in range [0,1] -} 
randomFloats :: PureMT -> [Float] 
randomFloats rng = let (d,rng') = first double2Float (randomDouble rng) 
        in d : randomFloats rng' 

評論:

  • shuffler現已與State函子遍歷。它通過輸入列表單次生成一個配對列表,其中每個網格都與其混洗版本配對。 createGrids然後(懶洋洋地)解壓這個列表。

  • 這些文件是用pipes機器寫成的,鬆散地受到this answer(我最初使用P.foldM寫這個)的啓發。請注意,我使用的hPut是嚴格的字節字符串之一,因爲它作用於生產者提供的嚴格的塊(使用P.zip(本質上,它是一對懶惰的成對提供塊的惰性字節串))。

  • SerList是否存在自定義Binary實例Thomas M. DuBuisson暗示。請注意,我並沒有在實例的get方法中對懶惰和嚴格性有太多的想法。如果這樣會導致麻煩,this question看起來很有用。

相關問題