addressbook/src/Control/Addressbook/Streaming.hs

74 lines
2.1 KiB
Haskell
Raw Normal View History

2020-12-10 23:08:03 +02:00
module Control.Addressbook.Streaming where
import Conduit
import qualified Data.Conduit.Binary as CB
import qualified Data.Conduit.Combinators as C
2020-12-11 17:55:35 +02:00
import qualified Data.Conduit.List as CL
2020-12-10 23:08:03 +02:00
import Data.Email
import Data.Email.Header
2020-12-10 23:20:38 +02:00
(Header(..))
2020-12-10 23:08:03 +02:00
2020-12-10 23:20:38 +02:00
import qualified Data.Foldable as F
2020-12-11 17:55:35 +02:00
import Data.Maybe
(fromMaybe)
import System.Environment
(lookupEnv)
import System.FilePath
((</>))
import Data.Set (Set)
import Data.ByteString (ByteString)
import qualified Data.Set as Set
import qualified Data.ByteString.Lazy as LBS
import Data.Char (ord)
import qualified Data.ByteString.Lazy.Char8 as LBC
import System.IO.Unsafe (unsafeInterleaveIO)
import Control.Parallel.Strategies (rseq, parMap)
import qualified Data.List as L
2021-11-04 10:10:35 +02:00
import Control.Monad (unless)
2022-02-07 15:03:00 +02:00
import System.Posix (touchFile)
2020-12-10 23:20:38 +02:00
2020-12-10 23:13:23 +02:00
combine :: (MonadUnliftIO m, MonadResource m, MonadThrow m, MonadIO m) => ConduitM FilePath Header m ()
2020-12-10 23:08:03 +02:00
combine = await >>= \case
Nothing -> pure ()
Just path -> (CB.sourceFile path .| parseEmail) >> combine
chunks :: Int -> [a] -> [[a]]
chunks n = L.unfoldr $ \case
[] -> Nothing
xs -> Just (splitAt n xs)
2020-12-10 23:08:03 +02:00
run :: IO ()
run = do
2020-12-11 17:55:35 +02:00
datDir <- fromMaybe "./" <$> lookupEnv "HOME"
2021-10-29 22:59:04 +03:00
let datFile = datDir </> ".addressbook.dat"
2022-02-07 15:03:00 +02:00
touchFile datFile
2021-10-29 22:59:04 +03:00
original <- Set.fromList . map LBS.toStrict . lbsLines <$> LBS.readFile datFile
xs <- LBS.getContents >>= stream
2021-11-04 10:11:37 +02:00
let set = original `Set.union` F.fold (parMap rseq F.fold (chunks 200 xs))
2021-11-04 10:10:35 +02:00
unless (original == set) $
runResourceT $
runConduit $
CL.sourceList (Set.elems set)
.| C.map (<> "\n")
.| CB.sinkFileCautious datFile
2020-12-10 23:20:38 +02:00
where
separate = \case
From x -> [x]
To xs -> F.toList xs
-- A set of (locally) unique addresses. Composes with parMap
2021-10-29 22:59:04 +03:00
lbsLines = LBS.split (fromIntegral $ ord '\n')
stream :: LBS.ByteString -> IO [Set ByteString]
2020-12-10 23:20:38 +02:00
stream =
traverse (unsafeInterleaveIO . parse . LBC.unpack)
. filter (not . LBS.null)
2021-10-29 22:59:04 +03:00
. lbsLines
parse path =
runResourceT $
runConduit $
CB.sourceFile path .| parseEmail .| C.concatMap separate .| C.foldMap Set.singleton