diff --git a/addressbook.cabal b/addressbook.cabal index 571b0ca..ef0188e 100644 --- a/addressbook.cabal +++ b/addressbook.cabal @@ -56,7 +56,7 @@ executable addressbook , text hs-source-dirs: app default-language: Haskell2010 - ghc-options: -Wall -threaded + ghc-options: -Wall -threaded -eventlog test-suite addressbook-test import: deps diff --git a/src/Control/Addressbook/Streaming.hs b/src/Control/Addressbook/Streaming.hs index cd04bfd..efdedfe 100644 --- a/src/Control/Addressbook/Streaming.hs +++ b/src/Control/Addressbook/Streaming.hs @@ -31,35 +31,46 @@ import qualified Data.Set as Set import qualified Data.ByteString.Lazy as LBS import Data.Char (ord) import qualified Data.ByteString.Lazy.Char8 as LBC +import System.IO.Unsafe (unsafeInterleaveIO) +import Control.Parallel.Strategies (using, parList, rseq, parBuffer, rdeepseq, parMap) +import qualified Data.List as L +import Control.Concurrent (getNumCapabilities) +import Debug.Trace (traceShow) combine :: (MonadUnliftIO m, MonadResource m, MonadThrow m, MonadIO m) => ConduitM FilePath Header m () combine = await >>= \case Nothing -> pure () Just path -> (CB.sourceFile path .| parseEmail) >> combine +chunks :: Int -> [a] -> [[a]] +chunks n = L.unfoldr $ \case + [] -> Nothing + xs -> Just (splitAt n xs) + run :: IO () run = do datDir <- fromMaybe "./" <$> lookupEnv "HOME" - x <- LBS.getContents >>= stream + xs <- LBS.getContents >>= stream + let x = F.fold (parMap rseq F.fold (chunks 200 xs)) runResourceT $ runConduit (CL.sourceList (Set.elems x) .| C.map (<> "\n") .| CB.sinkFileCautious (datDir ".addressbook.dat")) where separate = \case From x -> [x] To xs -> F.toList xs - stream :: LBS.ByteString -> IO (Set ByteString) - stream xs = F.fold <$> traverse (parse . LBC.unpack) (LBS.split (fromIntegral $ ord '\n') xs) + stream :: LBS.ByteString -> IO ([Set ByteString]) + stream = traverse (unsafeInterleaveIO . parse . LBC.unpack) . filter (not . LBS.null) . LBS.split (fromIntegral $ ord '\n') parse path = runResourceT $ runConduit $ - CB.sourceFile path .| parseEmail .| C.concatMap separate .| CT.encode CT.utf8 .| C.foldMap Set.singleton + CB.sourceFile path .| parseEmail .| C.concatMap separate .| C.foldMap Set.singleton run_ :: IO () run_ = do datDir <- fromMaybe "./" <$> lookupEnv "HOME" runResourceT $ do x <- runConduit stream - runConduit (CL.sourceList (Map.keys x) .| C.map (<> "\n") .| CB.sinkFileCautious (datDir ".addressbook.dat")) + runConduit (CL.sourceList (Set.elems x) .| C.map (<> "\n") .| CB.sinkFileCautious (datDir ".addressbook.dat")) where separate = \case From x -> [x] @@ -71,5 +82,4 @@ run_ = do .| C.map T.unpack .| combine .| C.concatMap separate - .| CT.encode CT.utf8 - .| C.foldMap (`Map.singleton` ()) + .| C.foldMap (Set.singleton) diff --git a/src/Data/Email.hs b/src/Data/Email.hs index 72ee8a2..b34e50f 100644 --- a/src/Data/Email.hs +++ b/src/Data/Email.hs @@ -5,12 +5,9 @@ import Data.Email.Header import Conduit import qualified Data.Conduit.Combinators as C -import qualified Data.Conduit.Text as CT import Data.ByteString (ByteString) parseEmail :: (MonadUnliftIO m, MonadThrow m, Monad m) => ConduitM ByteString Header m () -parseEmail = catchC (CT.decode CT.utf8) err .| CT.lines .| C.concatMap decode - where - err e = liftIO (print @CT.TextException e) >> yield "" +parseEmail = C.linesUnboundedAscii .| C.concatMap decode diff --git a/src/Data/Email/Header.hs b/src/Data/Email/Header.hs index 2651757..6c3d918 100644 --- a/src/Data/Email/Header.hs +++ b/src/Data/Email/Header.hs @@ -1,29 +1,25 @@ +{-# LANGUAGE OverloadedStrings #-} module Data.Email.Header where -import Data.Text - (Text) -import qualified Data.Text as T - import qualified Data.Foldable as F -import Data.Attoparsec.Text +import Data.Attoparsec.ByteString.Char8 import Data.Vector (Vector) import qualified Data.Vector as V -import Data.Char - (isSpace) - import Control.Applicative ((<|>)) +import Data.ByteString (ByteString) +import qualified Data.ByteString.Char8 as BC data Header - = From !Text - | To !(Vector Text) + = From !ByteString + | To !(Vector ByteString) deriving (Show, Eq) -decode :: Text -> Either String Header +decode :: ByteString -> Either String Header decode = parseOnly parseHeader where parseHeader :: Parser Header @@ -33,23 +29,23 @@ decode = parseOnly parseHeader parseTo :: Parser Header parseTo = To <$> (string "To:" *> emptySpace *> emails) emptySpace = many' space - emails :: Parser (Vector Text) + emails :: Parser (Vector ByteString) emails = V.fromList <$> (bracketEmail <|> email) `sepBy` char ',' - bracketEmail :: Parser Text + bracketEmail :: Parser ByteString bracketEmail = do _ <- manyTill anyChar (char '<') email - email :: Parser Text + email :: Parser ByteString email = do _ <- many' space - name <- T.pack <$> many' (satisfy (\c -> not (isSpace c) && c /= '@')) + name <- BC.pack <$> many' (satisfy (\c -> not (isSpace c) && c /= '@')) _ <- char '@' - rest <- T.pack <$> many' (satisfy (\c -> not (isSpace c) && c /= ',' && c /= '>')) + rest <- BC.pack <$> many' (satisfy (\c -> not (isSpace c) && c /= ',' && c /= '>')) _ <- many' (notChar ',') pure (name <> "@" <> rest) -encode :: Header -> Text +encode :: Header -> ByteString encode = \case From addr -> "From: " <> addr - To addrs -> "To: " <> T.intercalate ", " (F.toList addrs) + To addrs -> "To: " <> BC.intercalate ", " (F.toList addrs)