Make the analysis parallel (#2)
Co-authored-by: Mats Rauhala <mats.rauhala@iki.fi> Reviewed-on: #2 Co-authored-by: Mats Rauhala <masse@rauhala.info> Co-committed-by: Mats Rauhala <masse@rauhala.info>
This commit is contained in:
parent
1c4e766f92
commit
41c666fe93
@ -41,6 +41,7 @@ library
|
|||||||
, vector
|
, vector
|
||||||
, containers
|
, containers
|
||||||
, filepath
|
, filepath
|
||||||
|
, parallel
|
||||||
hs-source-dirs: src
|
hs-source-dirs: src
|
||||||
default-language: Haskell2010
|
default-language: Haskell2010
|
||||||
ghc-options: -Wall
|
ghc-options: -Wall
|
||||||
@ -55,7 +56,7 @@ executable addressbook
|
|||||||
, text
|
, text
|
||||||
hs-source-dirs: app
|
hs-source-dirs: app
|
||||||
default-language: Haskell2010
|
default-language: Haskell2010
|
||||||
ghc-options: -Wall -threaded
|
ghc-options: -Wall -threaded -eventlog
|
||||||
|
|
||||||
test-suite addressbook-test
|
test-suite addressbook-test
|
||||||
import: deps
|
import: deps
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
{ mkDerivation, attoparsec, base, bytestring, conduit
|
{ mkDerivation, attoparsec, base, bytestring, conduit
|
||||||
, conduit-extra, containers, criterion, filepath, hedgehog
|
, conduit-extra, containers, criterion, filepath, hedgehog
|
||||||
, hedgehog-corpus, HUnit, lens, lib, mtl, optparse-applicative
|
, hedgehog-corpus, HUnit, lens, lib, mtl, optparse-applicative
|
||||||
, tasty, tasty-hedgehog, tasty-hunit, text, vector
|
, parallel, tasty, tasty-hedgehog, tasty-hunit, text, vector
|
||||||
}:
|
}:
|
||||||
mkDerivation {
|
mkDerivation {
|
||||||
pname = "addressbook";
|
pname = "addressbook";
|
||||||
@ -11,10 +11,11 @@ mkDerivation {
|
|||||||
isExecutable = true;
|
isExecutable = true;
|
||||||
libraryHaskellDepends = [
|
libraryHaskellDepends = [
|
||||||
attoparsec base bytestring conduit conduit-extra containers
|
attoparsec base bytestring conduit conduit-extra containers
|
||||||
filepath lens mtl text vector
|
filepath lens mtl parallel text vector
|
||||||
];
|
];
|
||||||
executableHaskellDepends = [
|
executableHaskellDepends = [
|
||||||
base criterion hedgehog-corpus optparse-applicative text
|
base bytestring containers criterion hedgehog-corpus
|
||||||
|
optparse-applicative text
|
||||||
];
|
];
|
||||||
testHaskellDepends = [
|
testHaskellDepends = [
|
||||||
base bytestring conduit conduit-extra containers hedgehog
|
base bytestring conduit conduit-extra containers hedgehog
|
||||||
|
@ -1,23 +1,18 @@
|
|||||||
module Control.Addressbook.Streaming where
|
module Control.Addressbook.Streaming where
|
||||||
|
|
||||||
import qualified Data.Text as T
|
|
||||||
|
|
||||||
import Conduit
|
import Conduit
|
||||||
import qualified Data.Conduit.Binary as CB
|
import qualified Data.Conduit.Binary as CB
|
||||||
import qualified Data.Conduit.Combinators as C
|
import qualified Data.Conduit.Combinators as C
|
||||||
import qualified Data.Conduit.List as CL
|
import qualified Data.Conduit.List as CL
|
||||||
import qualified Data.Conduit.Text as CT
|
|
||||||
|
|
||||||
import Data.Email
|
import Data.Email
|
||||||
import Data.Email.Header
|
import Data.Email.Header
|
||||||
(Header(..))
|
(Header(..))
|
||||||
|
|
||||||
import System.IO
|
|
||||||
(stdin)
|
|
||||||
|
|
||||||
import qualified Data.Foldable as F
|
import qualified Data.Foldable as F
|
||||||
|
|
||||||
import qualified Data.Map.Strict as Map
|
|
||||||
|
|
||||||
import Data.Maybe
|
import Data.Maybe
|
||||||
(fromMaybe)
|
(fromMaybe)
|
||||||
@ -25,28 +20,47 @@ import System.Environment
|
|||||||
(lookupEnv)
|
(lookupEnv)
|
||||||
import System.FilePath
|
import System.FilePath
|
||||||
((</>))
|
((</>))
|
||||||
|
import Data.Set (Set)
|
||||||
|
import Data.ByteString (ByteString)
|
||||||
|
import qualified Data.Set as Set
|
||||||
|
import qualified Data.ByteString.Lazy as LBS
|
||||||
|
import Data.Char (ord)
|
||||||
|
import qualified Data.ByteString.Lazy.Char8 as LBC
|
||||||
|
import System.IO.Unsafe (unsafeInterleaveIO)
|
||||||
|
import Control.Parallel.Strategies (rseq, parMap)
|
||||||
|
import qualified Data.List as L
|
||||||
|
|
||||||
combine :: (MonadUnliftIO m, MonadResource m, MonadThrow m, MonadIO m) => ConduitM FilePath Header m ()
|
combine :: (MonadUnliftIO m, MonadResource m, MonadThrow m, MonadIO m) => ConduitM FilePath Header m ()
|
||||||
combine = await >>= \case
|
combine = await >>= \case
|
||||||
Nothing -> pure ()
|
Nothing -> pure ()
|
||||||
Just path -> (CB.sourceFile path .| parseEmail) >> combine
|
Just path -> (CB.sourceFile path .| parseEmail) >> combine
|
||||||
|
|
||||||
|
chunks :: Int -> [a] -> [[a]]
|
||||||
|
chunks n = L.unfoldr $ \case
|
||||||
|
[] -> Nothing
|
||||||
|
xs -> Just (splitAt n xs)
|
||||||
|
|
||||||
run :: IO ()
|
run :: IO ()
|
||||||
run = do
|
run = do
|
||||||
datDir <- fromMaybe "./" <$> lookupEnv "HOME"
|
datDir <- fromMaybe "./" <$> lookupEnv "HOME"
|
||||||
runResourceT $ do
|
xs <- LBS.getContents >>= stream
|
||||||
x <- runConduit stream
|
let set = F.fold (parMap rseq F.fold (chunks 20 xs))
|
||||||
runConduit (CL.sourceList (Map.keys x) .| C.map (<> "\n") .| CB.sinkFileCautious (datDir </> ".addressbook.dat"))
|
runResourceT $
|
||||||
|
runConduit $
|
||||||
|
CL.sourceList (Set.elems set)
|
||||||
|
.| C.map (<> "\n")
|
||||||
|
.| CB.sinkFileCautious (datDir </> ".addressbook.dat")
|
||||||
where
|
where
|
||||||
separate = \case
|
separate = \case
|
||||||
From x -> [x]
|
From x -> [x]
|
||||||
To xs -> F.toList xs
|
To xs -> F.toList xs
|
||||||
|
-- A set of (locally) unique addresses. Composes with parMap
|
||||||
|
stream :: LBS.ByteString -> IO [Set ByteString]
|
||||||
stream =
|
stream =
|
||||||
CB.sourceHandle stdin
|
traverse (unsafeInterleaveIO . parse . LBC.unpack)
|
||||||
.| CT.decode CT.utf8
|
. filter (not . LBS.null)
|
||||||
.| CT.lines
|
. LBS.split (fromIntegral $ ord '\n')
|
||||||
.| C.map T.unpack
|
parse path =
|
||||||
.| combine
|
runResourceT $
|
||||||
.| C.concatMap separate
|
runConduit $
|
||||||
.| CT.encode CT.utf8
|
CB.sourceFile path .| parseEmail .| C.concatMap separate .| C.foldMap Set.singleton
|
||||||
.| C.foldMap (`Map.singleton` ())
|
|
||||||
|
@ -5,12 +5,9 @@ import Data.Email.Header
|
|||||||
|
|
||||||
import Conduit
|
import Conduit
|
||||||
import qualified Data.Conduit.Combinators as C
|
import qualified Data.Conduit.Combinators as C
|
||||||
import qualified Data.Conduit.Text as CT
|
|
||||||
|
|
||||||
import Data.ByteString
|
import Data.ByteString
|
||||||
(ByteString)
|
(ByteString)
|
||||||
|
|
||||||
parseEmail :: (MonadUnliftIO m, MonadThrow m, Monad m) => ConduitM ByteString Header m ()
|
parseEmail :: (MonadUnliftIO m, MonadThrow m, Monad m) => ConduitM ByteString Header m ()
|
||||||
parseEmail = catchC (CT.decode CT.utf8) err .| CT.lines .| C.concatMap decode
|
parseEmail = C.linesUnboundedAscii .| C.concatMap decode
|
||||||
where
|
|
||||||
err e = liftIO (print @CT.TextException e) >> yield ""
|
|
||||||
|
@ -1,29 +1,25 @@
|
|||||||
|
{-# LANGUAGE OverloadedStrings #-}
|
||||||
module Data.Email.Header where
|
module Data.Email.Header where
|
||||||
|
|
||||||
import Data.Text
|
|
||||||
(Text)
|
|
||||||
import qualified Data.Text as T
|
|
||||||
|
|
||||||
import qualified Data.Foldable as F
|
import qualified Data.Foldable as F
|
||||||
|
|
||||||
import Data.Attoparsec.Text
|
import Data.Attoparsec.ByteString.Char8
|
||||||
|
|
||||||
import Data.Vector
|
import Data.Vector
|
||||||
(Vector)
|
(Vector)
|
||||||
import qualified Data.Vector as V
|
import qualified Data.Vector as V
|
||||||
|
|
||||||
import Data.Char
|
|
||||||
(isSpace)
|
|
||||||
|
|
||||||
import Control.Applicative
|
import Control.Applicative
|
||||||
((<|>))
|
((<|>))
|
||||||
|
import Data.ByteString (ByteString)
|
||||||
|
import qualified Data.ByteString.Char8 as BC
|
||||||
|
|
||||||
data Header
|
data Header
|
||||||
= From !Text
|
= From !ByteString
|
||||||
| To !(Vector Text)
|
| To !(Vector ByteString)
|
||||||
deriving (Show, Eq)
|
deriving (Show, Eq)
|
||||||
|
|
||||||
decode :: Text -> Either String Header
|
decode :: ByteString -> Either String Header
|
||||||
decode = parseOnly parseHeader
|
decode = parseOnly parseHeader
|
||||||
where
|
where
|
||||||
parseHeader :: Parser Header
|
parseHeader :: Parser Header
|
||||||
@ -33,23 +29,23 @@ decode = parseOnly parseHeader
|
|||||||
parseTo :: Parser Header
|
parseTo :: Parser Header
|
||||||
parseTo = To <$> (string "To:" *> emptySpace *> emails)
|
parseTo = To <$> (string "To:" *> emptySpace *> emails)
|
||||||
emptySpace = many' space
|
emptySpace = many' space
|
||||||
emails :: Parser (Vector Text)
|
emails :: Parser (Vector ByteString)
|
||||||
emails = V.fromList <$> (bracketEmail <|> email) `sepBy` char ','
|
emails = V.fromList <$> (bracketEmail <|> email) `sepBy` char ','
|
||||||
bracketEmail :: Parser Text
|
bracketEmail :: Parser ByteString
|
||||||
bracketEmail = do
|
bracketEmail = do
|
||||||
_ <- manyTill anyChar (char '<')
|
_ <- manyTill anyChar (char '<')
|
||||||
email
|
email
|
||||||
email :: Parser Text
|
email :: Parser ByteString
|
||||||
email = do
|
email = do
|
||||||
_ <- many' space
|
_ <- many' space
|
||||||
name <- T.pack <$> many' (satisfy (\c -> not (isSpace c) && c /= '@'))
|
name <- BC.pack <$> many' (satisfy (\c -> not (isSpace c) && c /= '@'))
|
||||||
_ <- char '@'
|
_ <- char '@'
|
||||||
rest <- T.pack <$> many' (satisfy (\c -> not (isSpace c) && c /= ',' && c /= '>'))
|
rest <- BC.pack <$> many' (satisfy (\c -> not (isSpace c) && c /= ',' && c /= '>'))
|
||||||
_ <- many' (notChar ',')
|
_ <- many' (notChar ',')
|
||||||
pure (name <> "@" <> rest)
|
pure (name <> "@" <> rest)
|
||||||
|
|
||||||
|
|
||||||
encode :: Header -> Text
|
encode :: Header -> ByteString
|
||||||
encode = \case
|
encode = \case
|
||||||
From addr -> "From: " <> addr
|
From addr -> "From: " <> addr
|
||||||
To addrs -> "To: " <> T.intercalate ", " (F.toList addrs)
|
To addrs -> "To: " <> BC.intercalate ", " (F.toList addrs)
|
||||||
|
@ -10,11 +10,12 @@ import qualified Hedgehog.Corpus as Corpus
|
|||||||
import qualified Hedgehog.Gen as Gen
|
import qualified Hedgehog.Gen as Gen
|
||||||
import qualified Hedgehog.Range as Range
|
import qualified Hedgehog.Range as Range
|
||||||
|
|
||||||
import Data.Text
|
import Data.ByteString
|
||||||
import qualified Data.Text as T
|
import qualified Data.ByteString as T
|
||||||
import qualified Data.Vector as V
|
import qualified Data.Vector as V
|
||||||
|
|
||||||
import Data.Email.Header
|
import Data.Email.Header
|
||||||
|
import qualified Data.ByteString.Char8 as BC
|
||||||
|
|
||||||
genHeader :: Gen Header
|
genHeader :: Gen Header
|
||||||
genHeader = Gen.choice
|
genHeader = Gen.choice
|
||||||
@ -22,17 +23,17 @@ genHeader = Gen.choice
|
|||||||
, To . V.fromList <$> Gen.list (Range.linear 0 10) genEmail
|
, To . V.fromList <$> Gen.list (Range.linear 0 10) genEmail
|
||||||
]
|
]
|
||||||
|
|
||||||
genEmail :: Gen Text
|
genEmail :: Gen ByteString
|
||||||
genEmail = do
|
genEmail = do
|
||||||
name <- Gen.element Corpus.simpsons
|
name <- Gen.element Corpus.simpsons
|
||||||
domain <- Gen.element Corpus.cooking
|
domain <- Gen.element Corpus.cooking
|
||||||
tld <- Gen.element ["com","fi","org"]
|
tld <- Gen.element ["com","fi","org"]
|
||||||
pure $ name <> "@" <> domain <> "." <> tld
|
pure $ name <> "@" <> domain <> "." <> tld
|
||||||
|
|
||||||
wrapped :: Char -> Text -> Char -> Text
|
wrapped :: Char -> ByteString -> Char -> ByteString
|
||||||
wrapped l x r = T.singleton l <> x <> T.singleton r
|
wrapped l x r = BC.singleton l <> x <> BC.singleton r
|
||||||
|
|
||||||
genComment :: Gen Text
|
genComment :: Gen ByteString
|
||||||
genComment = do
|
genComment = do
|
||||||
x <- Gen.element Corpus.simpsons
|
x <- Gen.element Corpus.simpsons
|
||||||
Gen.element [x, wrapped '"' x '"']
|
Gen.element [x, wrapped '"' x '"']
|
||||||
|
Loading…
Reference in New Issue
Block a user