Compare commits

...

6 Commits

Author SHA1 Message Date
6f0d8d0372 Add flakes 2022-02-07 15:03:00 +02:00
5182c47a19 Fix bug in parser 2021-11-04 10:24:22 +02:00
a47d4903cb Increase chunk size back to 200 2021-11-04 10:11:37 +02:00
00e585281c Avoid unneeded writes 2021-11-04 10:10:35 +02:00
03860c370a Share old info 2021-10-29 22:59:04 +03:00
41c666fe93 Make the analysis parallel (#2)
Co-authored-by: Mats Rauhala <mats.rauhala@iki.fi>
Reviewed-on: #2
Co-authored-by: Mats Rauhala <masse@rauhala.info>
Co-committed-by: Mats Rauhala <masse@rauhala.info>
2021-10-29 22:41:46 +03:00
9 changed files with 213 additions and 65 deletions

View File

@ -41,6 +41,8 @@ library
, vector , vector
, containers , containers
, filepath , filepath
, parallel
, unix
hs-source-dirs: src hs-source-dirs: src
default-language: Haskell2010 default-language: Haskell2010
ghc-options: -Wall ghc-options: -Wall
@ -55,7 +57,7 @@ executable addressbook
, text , text
hs-source-dirs: app hs-source-dirs: app
default-language: Haskell2010 default-language: Haskell2010
ghc-options: -Wall -threaded ghc-options: -Wall -threaded -eventlog
test-suite addressbook-test test-suite addressbook-test
import: deps import: deps

View File

@ -1,7 +1,7 @@
{ mkDerivation, attoparsec, base, bytestring, conduit { mkDerivation, attoparsec, base, bytestring, conduit
, conduit-extra, containers, criterion, filepath, hedgehog , conduit-extra, containers, criterion, filepath, hedgehog
, hedgehog-corpus, HUnit, lens, lib, mtl, optparse-applicative , hedgehog-corpus, HUnit, lens, lib, mtl, optparse-applicative
, tasty, tasty-hedgehog, tasty-hunit, text, vector , parallel, tasty, tasty-hedgehog, tasty-hunit, text, unix, vector
}: }:
mkDerivation { mkDerivation {
pname = "addressbook"; pname = "addressbook";
@ -11,10 +11,11 @@ mkDerivation {
isExecutable = true; isExecutable = true;
libraryHaskellDepends = [ libraryHaskellDepends = [
attoparsec base bytestring conduit conduit-extra containers attoparsec base bytestring conduit conduit-extra containers
filepath lens mtl text vector filepath lens mtl parallel text unix vector
]; ];
executableHaskellDepends = [ executableHaskellDepends = [
base criterion hedgehog-corpus optparse-applicative text base bytestring containers criterion hedgehog-corpus
optparse-applicative text
]; ];
testHaskellDepends = [ testHaskellDepends = [
base bytestring conduit conduit-extra containers hedgehog base bytestring conduit conduit-extra containers hedgehog

62
flake.lock Normal file
View File

@ -0,0 +1,62 @@
{
"nodes": {
"easy-hls": {
"inputs": {
"nixpkgs": [
"nixpkgs"
]
},
"locked": {
"lastModified": 1637250802,
"narHash": "sha256-/crlHEVB148PGQLZCsHOR9L5qgvCAfRSocIoKgmMAhA=",
"owner": "jkachmar",
"repo": "easy-hls-nix",
"rev": "7c123399ef8a67dc0e505d9cf7f2c7f64f1cd847",
"type": "github"
},
"original": {
"owner": "jkachmar",
"repo": "easy-hls-nix",
"type": "github"
}
},
"flake-utils": {
"locked": {
"lastModified": 1637014545,
"narHash": "sha256-26IZAc5yzlD9FlDT54io1oqG/bBoyka+FJk5guaX4x4=",
"owner": "numtide",
"repo": "flake-utils",
"rev": "bba5dcc8e0b20ab664967ad83d24d64cb64ec4f4",
"type": "github"
},
"original": {
"owner": "numtide",
"repo": "flake-utils",
"type": "github"
}
},
"nixpkgs": {
"locked": {
"lastModified": 1637312849,
"narHash": "sha256-OhVZopkyryEfLyPwcXk2IQsdi80lj6TY1YFoMNZ4hCQ=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "e4806bb4416f88c20f8be0b8ef9b5b09ff9022a6",
"type": "github"
},
"original": {
"id": "nixpkgs",
"type": "indirect"
}
},
"root": {
"inputs": {
"easy-hls": "easy-hls",
"flake-utils": "flake-utils",
"nixpkgs": "nixpkgs"
}
}
},
"root": "root",
"version": 7
}

53
flake.nix Normal file
View File

@ -0,0 +1,53 @@
{
description = "addressbook";
inputs = {
easy-hls = {
url = "github:jkachmar/easy-hls-nix";
inputs.nixpkgs.follows = "nixpkgs";
};
flake-utils = {
url = "github:numtide/flake-utils";
};
};
outputs = { self, nixpkgs, flake-utils, easy-hls }:
{ overlay = final: prev: {
haskellPackages = prev.haskellPackages.override ( old: {
overrides = final.lib.composeExtensions (old.overrides or (_: _: {})) (f: p: {
addressbook = f.callPackage ./. {};
});
});
};
}
//
flake-utils.lib.eachSystem ["x86_64-linux" "x86_64-darwin"] ( system:
let
pkgs = import nixpkgs { inherit system; overlays = [ self.overlay ]; };
hp = pkgs.haskellPackages;
hls = (easy-hls.withGhcs [ hp.ghc ]).${system};
in
rec {
packages = { inherit (hp) addressbook; };
defaultPackage = packages.addressbook;
apps.addressbook = {
type = "app";
program = "${hp.addressbook}/bin/addressbook";
};
devShell = hp.shellFor {
packages = h: [h.addressbook];
withHoogle = true;
buildInputs = with pkgs; [
entr
cabal-install
hp.hlint
stylish-haskell
ghcid
hls
];
};
}
);
}

View File

@ -1,23 +1,18 @@
module Control.Addressbook.Streaming where module Control.Addressbook.Streaming where
import qualified Data.Text as T
import Conduit import Conduit
import qualified Data.Conduit.Binary as CB import qualified Data.Conduit.Binary as CB
import qualified Data.Conduit.Combinators as C import qualified Data.Conduit.Combinators as C
import qualified Data.Conduit.List as CL import qualified Data.Conduit.List as CL
import qualified Data.Conduit.Text as CT
import Data.Email import Data.Email
import Data.Email.Header import Data.Email.Header
(Header(..)) (Header(..))
import System.IO
(stdin)
import qualified Data.Foldable as F import qualified Data.Foldable as F
import qualified Data.Map.Strict as Map
import Data.Maybe import Data.Maybe
(fromMaybe) (fromMaybe)
@ -25,28 +20,54 @@ import System.Environment
(lookupEnv) (lookupEnv)
import System.FilePath import System.FilePath
((</>)) ((</>))
import Data.Set (Set)
import Data.ByteString (ByteString)
import qualified Data.Set as Set
import qualified Data.ByteString.Lazy as LBS
import Data.Char (ord)
import qualified Data.ByteString.Lazy.Char8 as LBC
import System.IO.Unsafe (unsafeInterleaveIO)
import Control.Parallel.Strategies (rseq, parMap)
import qualified Data.List as L
import Control.Monad (unless)
import System.Posix (touchFile)
combine :: (MonadUnliftIO m, MonadResource m, MonadThrow m, MonadIO m) => ConduitM FilePath Header m () combine :: (MonadUnliftIO m, MonadResource m, MonadThrow m, MonadIO m) => ConduitM FilePath Header m ()
combine = await >>= \case combine = await >>= \case
Nothing -> pure () Nothing -> pure ()
Just path -> (CB.sourceFile path .| parseEmail) >> combine Just path -> (CB.sourceFile path .| parseEmail) >> combine
chunks :: Int -> [a] -> [[a]]
chunks n = L.unfoldr $ \case
[] -> Nothing
xs -> Just (splitAt n xs)
run :: IO () run :: IO ()
run = do run = do
datDir <- fromMaybe "./" <$> lookupEnv "HOME" datDir <- fromMaybe "./" <$> lookupEnv "HOME"
runResourceT $ do let datFile = datDir </> ".addressbook.dat"
x <- runConduit stream touchFile datFile
runConduit (CL.sourceList (Map.keys x) .| C.map (<> "\n") .| CB.sinkFileCautious (datDir </> ".addressbook.dat")) original <- Set.fromList . map LBS.toStrict . lbsLines <$> LBS.readFile datFile
xs <- LBS.getContents >>= stream
let set = original `Set.union` F.fold (parMap rseq F.fold (chunks 200 xs))
unless (original == set) $
runResourceT $
runConduit $
CL.sourceList (Set.elems set)
.| C.map (<> "\n")
.| CB.sinkFileCautious datFile
where where
separate = \case separate = \case
From x -> [x] From x -> [x]
To xs -> F.toList xs To xs -> F.toList xs
-- A set of (locally) unique addresses. Composes with parMap
lbsLines = LBS.split (fromIntegral $ ord '\n')
stream :: LBS.ByteString -> IO [Set ByteString]
stream = stream =
CB.sourceHandle stdin traverse (unsafeInterleaveIO . parse . LBC.unpack)
.| CT.decode CT.utf8 . filter (not . LBS.null)
.| CT.lines . lbsLines
.| C.map T.unpack parse path =
.| combine runResourceT $
.| C.concatMap separate runConduit $
.| CT.encode CT.utf8 CB.sourceFile path .| parseEmail .| C.concatMap separate .| C.foldMap Set.singleton
.| C.foldMap (`Map.singleton` ())

View File

@ -5,12 +5,9 @@ import Data.Email.Header
import Conduit import Conduit
import qualified Data.Conduit.Combinators as C import qualified Data.Conduit.Combinators as C
import qualified Data.Conduit.Text as CT
import Data.ByteString import Data.ByteString
(ByteString) (ByteString)
parseEmail :: (MonadUnliftIO m, MonadThrow m, Monad m) => ConduitM ByteString Header m () parseEmail :: (MonadUnliftIO m, MonadThrow m, Monad m) => ConduitM ByteString Header m ()
parseEmail = catchC (CT.decode CT.utf8) err .| CT.lines .| C.concatMap decode parseEmail = C.linesUnboundedAscii .| C.concatMap decode
where
err e = liftIO (print @CT.TextException e) >> yield ""

View File

@ -1,55 +1,63 @@
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE ApplicativeDo #-}
module Data.Email.Header where module Data.Email.Header where
import Data.Text
(Text)
import qualified Data.Text as T
import qualified Data.Foldable as F import qualified Data.Foldable as F
import Data.Attoparsec.Text import Data.Attoparsec.ByteString.Char8
import Data.Vector import Data.Vector
(Vector) (Vector)
import qualified Data.Vector as V import qualified Data.Vector as V
import Data.Char
(isSpace)
import Control.Applicative import Control.Applicative
((<|>)) ((<|>))
import Data.ByteString (ByteString)
import qualified Data.ByteString.Char8 as BC
data Header data Header
= From !Text = From !ByteString
| To !(Vector Text) | To !(Vector ByteString)
deriving (Show, Eq) deriving (Show, Eq)
decode :: Text -> Either String Header decode :: ByteString -> Either String Header
decode = parseOnly parseHeader decode = parseOnly parseHeader
where {-# INLINE decode #-}
parseHeader :: Parser Header
parseHeader = parseFrom <|> parseTo parseHeader :: Parser Header
parseFrom :: Parser Header parseHeader = parseFrom <|> parseTo
parseFrom = From <$> (string "From:" *> emptySpace *> (bracketEmail <|> email)) {-# INLINE parseHeader #-}
parseTo :: Parser Header
parseTo = To <$> (string "To:" *> emptySpace *> emails) parseFrom :: Parser Header
emptySpace = many' space parseFrom = From <$> (string "From:" *> emptySpace *> (bracketEmail <|> email))
emails :: Parser (Vector Text)
emails = V.fromList <$> (bracketEmail <|> email) `sepBy` char ',' parseTo :: Parser Header
bracketEmail :: Parser Text parseTo = To <$> (string "To:" *> emptySpace *> emails)
bracketEmail = do
_ <- manyTill anyChar (char '<') emptySpace :: Parser ()
email emptySpace = () <$ many' space
email :: Parser Text
email = do emails :: Parser (Vector ByteString)
_ <- many' space emails = V.fromList <$> (bracketEmail <|> email) `sepBy` char ','
name <- T.pack <$> many' (satisfy (\c -> not (isSpace c) && c /= '@'))
_ <- char '@' bracketEmail :: Parser ByteString
rest <- T.pack <$> many' (satisfy (\c -> not (isSpace c) && c /= ',' && c /= '>')) bracketEmail = do
_ <- many' (notChar ',') _ <- manyTill anyChar (char '<')
pure (name <> "@" <> rest) email
{-# INLINE bracketEmail #-}
email :: Parser ByteString
email = do
_ <- many' space
name <- BC.pack <$> many' (satisfy (\c -> not (isSpace c) && c /= '@'))
_ <- char '@'
rest <- BC.pack <$> many' (satisfy (\c -> not (isSpace c) && c /= ',' && c /= '>' && c /= '<'))
_ <- many' (notChar ',')
pure (name <> "@" <> rest)
{-# INLINE email #-}
encode :: Header -> Text encode :: Header -> ByteString
encode = \case encode = \case
From addr -> "From: " <> addr From addr -> "From: " <> addr
To addrs -> "To: " <> T.intercalate ", " (F.toList addrs) To addrs -> "To: " <> BC.intercalate ", " (F.toList addrs)

View File

@ -31,4 +31,7 @@ tests = testGroup "Data.Email"
[ testCase "Can parse a sample email" $ do [ testCase "Can parse a sample email" $ do
got <- parseToList sample got <- parseToList sample
got @?= [ From "me@example.com", To ["you@example.com"]] got @?= [ From "me@example.com", To ["you@example.com"]]
, testCase "Combined mailto" $ do
let got = decode "To: James Doe<james.doe@example.com<mailto:james.doe@example.com>>"
got @?= Right (To ["james.doe@example.com"])
] ]

View File

@ -10,11 +10,12 @@ import qualified Hedgehog.Corpus as Corpus
import qualified Hedgehog.Gen as Gen import qualified Hedgehog.Gen as Gen
import qualified Hedgehog.Range as Range import qualified Hedgehog.Range as Range
import Data.Text import Data.ByteString
import qualified Data.Text as T import qualified Data.ByteString as T
import qualified Data.Vector as V import qualified Data.Vector as V
import Data.Email.Header import Data.Email.Header
import qualified Data.ByteString.Char8 as BC
genHeader :: Gen Header genHeader :: Gen Header
genHeader = Gen.choice genHeader = Gen.choice
@ -22,17 +23,17 @@ genHeader = Gen.choice
, To . V.fromList <$> Gen.list (Range.linear 0 10) genEmail , To . V.fromList <$> Gen.list (Range.linear 0 10) genEmail
] ]
genEmail :: Gen Text genEmail :: Gen ByteString
genEmail = do genEmail = do
name <- Gen.element Corpus.simpsons name <- Gen.element Corpus.simpsons
domain <- Gen.element Corpus.cooking domain <- Gen.element Corpus.cooking
tld <- Gen.element ["com","fi","org"] tld <- Gen.element ["com","fi","org"]
pure $ name <> "@" <> domain <> "." <> tld pure $ name <> "@" <> domain <> "." <> tld
wrapped :: Char -> Text -> Char -> Text wrapped :: Char -> ByteString -> Char -> ByteString
wrapped l x r = T.singleton l <> x <> T.singleton r wrapped l x r = BC.singleton l <> x <> BC.singleton r
genComment :: Gen Text genComment :: Gen ByteString
genComment = do genComment = do
x <- Gen.element Corpus.simpsons x <- Gen.element Corpus.simpsons
Gen.element [x, wrapped '"' x '"'] Gen.element [x, wrapped '"' x '"']