From 41c666fe937d25f35af012f2309e307bce2fb78d Mon Sep 17 00:00:00 2001 From: Mats Rauhala Date: Fri, 29 Oct 2021 22:41:46 +0300 Subject: [PATCH] Make the analysis parallel (#2) Co-authored-by: Mats Rauhala Reviewed-on: https://git.rauhala.info/MasseR/addressbook/pulls/2 Co-authored-by: Mats Rauhala Co-committed-by: Mats Rauhala --- addressbook.cabal | 3 +- default.nix | 7 +++-- src/Control/Addressbook/Streaming.hs | 46 ++++++++++++++++++---------- src/Data/Email.hs | 5 +-- src/Data/Email/Header.hs | 32 +++++++++---------- test/Test/Data/Email/Header.hs | 13 ++++---- 6 files changed, 58 insertions(+), 48 deletions(-) diff --git a/addressbook.cabal b/addressbook.cabal index 6bf76b3..ef0188e 100644 --- a/addressbook.cabal +++ b/addressbook.cabal @@ -41,6 +41,7 @@ library , vector , containers , filepath + , parallel hs-source-dirs: src default-language: Haskell2010 ghc-options: -Wall @@ -55,7 +56,7 @@ executable addressbook , text hs-source-dirs: app default-language: Haskell2010 - ghc-options: -Wall -threaded + ghc-options: -Wall -threaded -eventlog test-suite addressbook-test import: deps diff --git a/default.nix b/default.nix index d687fff..1431253 100644 --- a/default.nix +++ b/default.nix @@ -1,7 +1,7 @@ { mkDerivation, attoparsec, base, bytestring, conduit , conduit-extra, containers, criterion, filepath, hedgehog , hedgehog-corpus, HUnit, lens, lib, mtl, optparse-applicative -, tasty, tasty-hedgehog, tasty-hunit, text, vector +, parallel, tasty, tasty-hedgehog, tasty-hunit, text, vector }: mkDerivation { pname = "addressbook"; @@ -11,10 +11,11 @@ mkDerivation { isExecutable = true; libraryHaskellDepends = [ attoparsec base bytestring conduit conduit-extra containers - filepath lens mtl text vector + filepath lens mtl parallel text vector ]; executableHaskellDepends = [ - base criterion hedgehog-corpus optparse-applicative text + base bytestring containers criterion hedgehog-corpus + optparse-applicative text ]; testHaskellDepends = [ base bytestring conduit conduit-extra containers hedgehog diff --git a/src/Control/Addressbook/Streaming.hs b/src/Control/Addressbook/Streaming.hs index a8989fd..5c22554 100644 --- a/src/Control/Addressbook/Streaming.hs +++ b/src/Control/Addressbook/Streaming.hs @@ -1,23 +1,18 @@ module Control.Addressbook.Streaming where -import qualified Data.Text as T import Conduit import qualified Data.Conduit.Binary as CB import qualified Data.Conduit.Combinators as C import qualified Data.Conduit.List as CL -import qualified Data.Conduit.Text as CT import Data.Email import Data.Email.Header (Header(..)) -import System.IO - (stdin) import qualified Data.Foldable as F -import qualified Data.Map.Strict as Map import Data.Maybe (fromMaybe) @@ -25,28 +20,47 @@ import System.Environment (lookupEnv) import System.FilePath (()) +import Data.Set (Set) +import Data.ByteString (ByteString) +import qualified Data.Set as Set +import qualified Data.ByteString.Lazy as LBS +import Data.Char (ord) +import qualified Data.ByteString.Lazy.Char8 as LBC +import System.IO.Unsafe (unsafeInterleaveIO) +import Control.Parallel.Strategies (rseq, parMap) +import qualified Data.List as L combine :: (MonadUnliftIO m, MonadResource m, MonadThrow m, MonadIO m) => ConduitM FilePath Header m () combine = await >>= \case Nothing -> pure () Just path -> (CB.sourceFile path .| parseEmail) >> combine +chunks :: Int -> [a] -> [[a]] +chunks n = L.unfoldr $ \case + [] -> Nothing + xs -> Just (splitAt n xs) + run :: IO () run = do datDir <- fromMaybe "./" <$> lookupEnv "HOME" - runResourceT $ do - x <- runConduit stream - runConduit (CL.sourceList (Map.keys x) .| C.map (<> "\n") .| CB.sinkFileCautious (datDir ".addressbook.dat")) + xs <- LBS.getContents >>= stream + let set = F.fold (parMap rseq F.fold (chunks 20 xs)) + runResourceT $ + runConduit $ + CL.sourceList (Set.elems set) + .| C.map (<> "\n") + .| CB.sinkFileCautious (datDir ".addressbook.dat") where separate = \case From x -> [x] To xs -> F.toList xs + -- A set of (locally) unique addresses. Composes with parMap + stream :: LBS.ByteString -> IO [Set ByteString] stream = - CB.sourceHandle stdin - .| CT.decode CT.utf8 - .| CT.lines - .| C.map T.unpack - .| combine - .| C.concatMap separate - .| CT.encode CT.utf8 - .| C.foldMap (`Map.singleton` ()) + traverse (unsafeInterleaveIO . parse . LBC.unpack) + . filter (not . LBS.null) + . LBS.split (fromIntegral $ ord '\n') + parse path = + runResourceT $ + runConduit $ + CB.sourceFile path .| parseEmail .| C.concatMap separate .| C.foldMap Set.singleton diff --git a/src/Data/Email.hs b/src/Data/Email.hs index 72ee8a2..b34e50f 100644 --- a/src/Data/Email.hs +++ b/src/Data/Email.hs @@ -5,12 +5,9 @@ import Data.Email.Header import Conduit import qualified Data.Conduit.Combinators as C -import qualified Data.Conduit.Text as CT import Data.ByteString (ByteString) parseEmail :: (MonadUnliftIO m, MonadThrow m, Monad m) => ConduitM ByteString Header m () -parseEmail = catchC (CT.decode CT.utf8) err .| CT.lines .| C.concatMap decode - where - err e = liftIO (print @CT.TextException e) >> yield "" +parseEmail = C.linesUnboundedAscii .| C.concatMap decode diff --git a/src/Data/Email/Header.hs b/src/Data/Email/Header.hs index 2651757..6c3d918 100644 --- a/src/Data/Email/Header.hs +++ b/src/Data/Email/Header.hs @@ -1,29 +1,25 @@ +{-# LANGUAGE OverloadedStrings #-} module Data.Email.Header where -import Data.Text - (Text) -import qualified Data.Text as T - import qualified Data.Foldable as F -import Data.Attoparsec.Text +import Data.Attoparsec.ByteString.Char8 import Data.Vector (Vector) import qualified Data.Vector as V -import Data.Char - (isSpace) - import Control.Applicative ((<|>)) +import Data.ByteString (ByteString) +import qualified Data.ByteString.Char8 as BC data Header - = From !Text - | To !(Vector Text) + = From !ByteString + | To !(Vector ByteString) deriving (Show, Eq) -decode :: Text -> Either String Header +decode :: ByteString -> Either String Header decode = parseOnly parseHeader where parseHeader :: Parser Header @@ -33,23 +29,23 @@ decode = parseOnly parseHeader parseTo :: Parser Header parseTo = To <$> (string "To:" *> emptySpace *> emails) emptySpace = many' space - emails :: Parser (Vector Text) + emails :: Parser (Vector ByteString) emails = V.fromList <$> (bracketEmail <|> email) `sepBy` char ',' - bracketEmail :: Parser Text + bracketEmail :: Parser ByteString bracketEmail = do _ <- manyTill anyChar (char '<') email - email :: Parser Text + email :: Parser ByteString email = do _ <- many' space - name <- T.pack <$> many' (satisfy (\c -> not (isSpace c) && c /= '@')) + name <- BC.pack <$> many' (satisfy (\c -> not (isSpace c) && c /= '@')) _ <- char '@' - rest <- T.pack <$> many' (satisfy (\c -> not (isSpace c) && c /= ',' && c /= '>')) + rest <- BC.pack <$> many' (satisfy (\c -> not (isSpace c) && c /= ',' && c /= '>')) _ <- many' (notChar ',') pure (name <> "@" <> rest) -encode :: Header -> Text +encode :: Header -> ByteString encode = \case From addr -> "From: " <> addr - To addrs -> "To: " <> T.intercalate ", " (F.toList addrs) + To addrs -> "To: " <> BC.intercalate ", " (F.toList addrs) diff --git a/test/Test/Data/Email/Header.hs b/test/Test/Data/Email/Header.hs index bc58f71..d6cc750 100644 --- a/test/Test/Data/Email/Header.hs +++ b/test/Test/Data/Email/Header.hs @@ -10,11 +10,12 @@ import qualified Hedgehog.Corpus as Corpus import qualified Hedgehog.Gen as Gen import qualified Hedgehog.Range as Range -import Data.Text -import qualified Data.Text as T +import Data.ByteString +import qualified Data.ByteString as T import qualified Data.Vector as V import Data.Email.Header +import qualified Data.ByteString.Char8 as BC genHeader :: Gen Header genHeader = Gen.choice @@ -22,17 +23,17 @@ genHeader = Gen.choice , To . V.fromList <$> Gen.list (Range.linear 0 10) genEmail ] -genEmail :: Gen Text +genEmail :: Gen ByteString genEmail = do name <- Gen.element Corpus.simpsons domain <- Gen.element Corpus.cooking tld <- Gen.element ["com","fi","org"] pure $ name <> "@" <> domain <> "." <> tld -wrapped :: Char -> Text -> Char -> Text -wrapped l x r = T.singleton l <> x <> T.singleton r +wrapped :: Char -> ByteString -> Char -> ByteString +wrapped l x r = BC.singleton l <> x <> BC.singleton r -genComment :: Gen Text +genComment :: Gen ByteString genComment = do x <- Gen.element Corpus.simpsons Gen.element [x, wrapped '"' x '"']