Compare commits

..

5 Commits

Author SHA1 Message Date
5182c47a19 Fix bug in parser 2021-11-04 10:24:22 +02:00
a47d4903cb Increase chunk size back to 200 2021-11-04 10:11:37 +02:00
00e585281c Avoid unneeded writes 2021-11-04 10:10:35 +02:00
03860c370a Share old info 2021-10-29 22:59:04 +03:00
41c666fe93 Make the analysis parallel (#2)
Co-authored-by: Mats Rauhala <mats.rauhala@iki.fi>
Reviewed-on: #2
Co-authored-by: Mats Rauhala <masse@rauhala.info>
Co-committed-by: Mats Rauhala <masse@rauhala.info>
2021-10-29 22:41:46 +03:00
3 changed files with 49 additions and 29 deletions

View File

@ -29,6 +29,7 @@ import qualified Data.ByteString.Lazy.Char8 as LBC
import System.IO.Unsafe (unsafeInterleaveIO)
import Control.Parallel.Strategies (rseq, parMap)
import qualified Data.List as L
import Control.Monad (unless)
combine :: (MonadUnliftIO m, MonadResource m, MonadThrow m, MonadIO m) => ConduitM FilePath Header m ()
combine = await >>= \case
@ -43,23 +44,27 @@ chunks n = L.unfoldr $ \case
run :: IO ()
run = do
datDir <- fromMaybe "./" <$> lookupEnv "HOME"
let datFile = datDir </> ".addressbook.dat"
original <- Set.fromList . map LBS.toStrict . lbsLines <$> LBS.readFile datFile
xs <- LBS.getContents >>= stream
let set = F.fold (parMap rseq F.fold (chunks 20 xs))
runResourceT $
runConduit $
CL.sourceList (Set.elems set)
.| C.map (<> "\n")
.| CB.sinkFileCautious (datDir </> ".addressbook.dat")
let set = original `Set.union` F.fold (parMap rseq F.fold (chunks 200 xs))
unless (original == set) $
runResourceT $
runConduit $
CL.sourceList (Set.elems set)
.| C.map (<> "\n")
.| CB.sinkFileCautious datFile
where
separate = \case
From x -> [x]
To xs -> F.toList xs
-- A set of (locally) unique addresses. Composes with parMap
lbsLines = LBS.split (fromIntegral $ ord '\n')
stream :: LBS.ByteString -> IO [Set ByteString]
stream =
traverse (unsafeInterleaveIO . parse . LBC.unpack)
. filter (not . LBS.null)
. LBS.split (fromIntegral $ ord '\n')
. lbsLines
parse path =
runResourceT $
runConduit $

View File

@ -1,4 +1,5 @@
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE ApplicativeDo #-}
module Data.Email.Header where
import qualified Data.Foldable as F
@ -21,28 +22,39 @@ data Header
-- | Decode a single header line into a 'Header' by running 'parseHeader'
-- through 'parseOnly'. The result is 'Right' with the parsed header, or
-- 'Left' with a 'String' when the input does not parse.
decode :: ByteString -> Either String Header
decode = parseOnly parseHeader
where
-- Accept either header form; the "From:" parser is tried first.
parseHeader :: Parser Header
parseHeader = parseFrom <|> parseTo
-- "From:" prefix, optional whitespace, then one address (bracketed or bare).
parseFrom :: Parser Header
parseFrom = From <$> (string "From:" *> emptySpace *> (bracketEmail <|> email))
-- "To:" prefix, optional whitespace, then a list of addresses.
parseTo :: Parser Header
parseTo = To <$> (string "To:" *> emptySpace *> emails)
-- Zero or more 'space' matches.
emptySpace = many' space
-- Addresses separated by ','; each is tried as bracketed first, then bare.
emails :: Parser (Vector ByteString)
emails = V.fromList <$> (bracketEmail <|> email) `sepBy` char ','
-- Skip everything up to and including the first '<', then read an address.
bracketEmail :: Parser ByteString
bracketEmail = do
_ <- manyTill anyChar (char '<')
email
-- Read @name\@rest@: name runs up to '@' (no whitespace), rest stops at
-- whitespace, ',' or '>'.
email :: Parser ByteString
email = do
_ <- many' space
name <- BC.pack <$> many' (satisfy (\c -> not (isSpace c) && c /= '@'))
_ <- char '@'
rest <- BC.pack <$> many' (satisfy (\c -> not (isSpace c) && c /= ',' && c /= '>'))
-- Discard whatever trails the address, up to (not including) the next ','.
_ <- many' (notChar ',')
pure (name <> "@" <> rest)
{-# INLINE decode #-}
-- | Parser for a single header: tries 'parseFrom' first and falls back to
-- 'parseTo' via '<|>'.
parseHeader :: Parser Header
parseHeader = parseFrom <|> parseTo
{-# INLINE parseHeader #-}
-- | Parser for a @From:@ header. Consumes the literal prefix and any
-- following whitespace, then reads one address (bracketed form is tried
-- before the bare form) and wraps it in 'From'.
parseFrom :: Parser Header
parseFrom = do
  _ <- string "From:"
  emptySpace
  sender <- bracketEmail <|> email
  pure (From sender)
-- | Parser for a @To:@ header. Consumes the literal prefix and any
-- following whitespace, then reads the address list with 'emails' and
-- wraps it in 'To'.
parseTo :: Parser Header
parseTo = do
  _ <- string "To:"
  emptySpace
  recipients <- emails
  pure (To recipients)
-- | Consume zero or more 'space' matches and discard them.
emptySpace :: Parser ()
emptySpace = many' space *> pure ()
-- | Parser for a comma-separated list of addresses, collected into a
-- 'Vector'. Each element is tried as a bracketed address first and as a
-- bare address second.
emails :: Parser (Vector ByteString)
emails = fmap V.fromList (sepBy address (char ','))
  where
    address = bracketEmail <|> email
-- | Parser for an address introduced by @<@: everything up to and including
-- the first @<@ is skipped, then 'email' reads the address itself.
bracketEmail :: Parser ByteString
bracketEmail = manyTill anyChar (char '<') *> email
{-# INLINE bracketEmail #-}
-- | Parser for a bare @local\@domain@ address.
--
-- Leading whitespace is skipped. The local part runs up to the @\@@ and the
-- domain stops at @,@, @>@ or @<@; neither part may contain whitespace.
-- Anything left before the next @,@ is consumed and dropped.
email :: Parser ByteString
email = do
  localPart <- many' space *> token (/= '@')
  domain <- char '@' *> token (\c -> c /= ',' && c /= '>' && c /= '<')
  -- Swallow trailing junk so a following ',' separator lines up for 'sepBy'.
  _ <- many' (notChar ',')
  pure (localPart <> "@" <> domain)
  where
    -- Longest (possibly empty) run of non-space characters passing @allowed@.
    token allowed =
      BC.pack <$> many' (satisfy (\c -> not (isSpace c) && allowed c))
{-# INLINE email #-}
encode :: Header -> ByteString

View File

@ -31,4 +31,7 @@ tests = testGroup "Data.Email"
[ testCase "Can parse a sample email" $ do
got <- parseToList sample
got @?= [ From "me@example.com", To ["you@example.com"]]
, testCase "Combined mailto" $ do
let got = decode "To: James Doe<james.doe@example.com<mailto:james.doe@example.com>>"
got @?= Right (To ["james.doe@example.com"])
]