Compare commits

..

6 Commits

Author SHA1 Message Date
6f0d8d0372 Add flakes 2022-02-07 15:03:00 +02:00
5182c47a19 Fix bug in parser 2021-11-04 10:24:22 +02:00
a47d4903cb Increase chunk size back to 200 2021-11-04 10:11:37 +02:00
00e585281c Avoid unneeded writes 2021-11-04 10:10:35 +02:00
03860c370a Share old info 2021-10-29 22:59:04 +03:00
41c666fe93 Make the analysis parallel (#2)
Co-authored-by: Mats Rauhala <mats.rauhala@iki.fi>
Reviewed-on: #2
Co-authored-by: Mats Rauhala <masse@rauhala.info>
Co-committed-by: Mats Rauhala <masse@rauhala.info>
2021-10-29 22:41:46 +03:00
7 changed files with 169 additions and 31 deletions

View File

@ -42,6 +42,7 @@ library
, containers , containers
, filepath , filepath
, parallel , parallel
, unix
hs-source-dirs: src hs-source-dirs: src
default-language: Haskell2010 default-language: Haskell2010
ghc-options: -Wall ghc-options: -Wall

View File

@ -1,7 +1,7 @@
{ mkDerivation, attoparsec, base, bytestring, conduit { mkDerivation, attoparsec, base, bytestring, conduit
, conduit-extra, containers, criterion, filepath, hedgehog , conduit-extra, containers, criterion, filepath, hedgehog
, hedgehog-corpus, HUnit, lens, lib, mtl, optparse-applicative , hedgehog-corpus, HUnit, lens, lib, mtl, optparse-applicative
, parallel, tasty, tasty-hedgehog, tasty-hunit, text, vector , parallel, tasty, tasty-hedgehog, tasty-hunit, text, unix, vector
}: }:
mkDerivation { mkDerivation {
pname = "addressbook"; pname = "addressbook";
@ -11,7 +11,7 @@ mkDerivation {
isExecutable = true; isExecutable = true;
libraryHaskellDepends = [ libraryHaskellDepends = [
attoparsec base bytestring conduit conduit-extra containers attoparsec base bytestring conduit conduit-extra containers
filepath lens mtl parallel text vector filepath lens mtl parallel text unix vector
]; ];
executableHaskellDepends = [ executableHaskellDepends = [
base bytestring containers criterion hedgehog-corpus base bytestring containers criterion hedgehog-corpus

62
flake.lock Normal file
View File

@ -0,0 +1,62 @@
{
"nodes": {
"easy-hls": {
"inputs": {
"nixpkgs": [
"nixpkgs"
]
},
"locked": {
"lastModified": 1637250802,
"narHash": "sha256-/crlHEVB148PGQLZCsHOR9L5qgvCAfRSocIoKgmMAhA=",
"owner": "jkachmar",
"repo": "easy-hls-nix",
"rev": "7c123399ef8a67dc0e505d9cf7f2c7f64f1cd847",
"type": "github"
},
"original": {
"owner": "jkachmar",
"repo": "easy-hls-nix",
"type": "github"
}
},
"flake-utils": {
"locked": {
"lastModified": 1637014545,
"narHash": "sha256-26IZAc5yzlD9FlDT54io1oqG/bBoyka+FJk5guaX4x4=",
"owner": "numtide",
"repo": "flake-utils",
"rev": "bba5dcc8e0b20ab664967ad83d24d64cb64ec4f4",
"type": "github"
},
"original": {
"owner": "numtide",
"repo": "flake-utils",
"type": "github"
}
},
"nixpkgs": {
"locked": {
"lastModified": 1637312849,
"narHash": "sha256-OhVZopkyryEfLyPwcXk2IQsdi80lj6TY1YFoMNZ4hCQ=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "e4806bb4416f88c20f8be0b8ef9b5b09ff9022a6",
"type": "github"
},
"original": {
"id": "nixpkgs",
"type": "indirect"
}
},
"root": {
"inputs": {
"easy-hls": "easy-hls",
"flake-utils": "flake-utils",
"nixpkgs": "nixpkgs"
}
}
},
"root": "root",
"version": 7
}

53
flake.nix Normal file
View File

@ -0,0 +1,53 @@
{
description = "addressbook";
inputs = {
easy-hls = {
url = "github:jkachmar/easy-hls-nix";
inputs.nixpkgs.follows = "nixpkgs";
};
flake-utils = {
url = "github:numtide/flake-utils";
};
};
outputs = { self, nixpkgs, flake-utils, easy-hls }:
{ overlay = final: prev: {
haskellPackages = prev.haskellPackages.override ( old: {
overrides = final.lib.composeExtensions (old.overrides or (_: _: {})) (f: p: {
addressbook = f.callPackage ./. {};
});
});
};
}
//
flake-utils.lib.eachSystem ["x86_64-linux" "x86_64-darwin"] ( system:
let
pkgs = import nixpkgs { inherit system; overlays = [ self.overlay ]; };
hp = pkgs.haskellPackages;
hls = (easy-hls.withGhcs [ hp.ghc ]).${system};
in
rec {
packages = { inherit (hp) addressbook; };
defaultPackage = packages.addressbook;
apps.addressbook = {
type = "app";
program = "${hp.addressbook}/bin/addressbook";
};
devShell = hp.shellFor {
packages = h: [h.addressbook];
withHoogle = true;
buildInputs = with pkgs; [
entr
cabal-install
hp.hlint
stylish-haskell
ghcid
hls
];
};
}
);
}

View File

@ -29,6 +29,8 @@ import qualified Data.ByteString.Lazy.Char8 as LBC
import System.IO.Unsafe (unsafeInterleaveIO) import System.IO.Unsafe (unsafeInterleaveIO)
import Control.Parallel.Strategies (rseq, parMap) import Control.Parallel.Strategies (rseq, parMap)
import qualified Data.List as L import qualified Data.List as L
import Control.Monad (unless)
import System.Posix (touchFile)
combine :: (MonadUnliftIO m, MonadResource m, MonadThrow m, MonadIO m) => ConduitM FilePath Header m () combine :: (MonadUnliftIO m, MonadResource m, MonadThrow m, MonadIO m) => ConduitM FilePath Header m ()
combine = await >>= \case combine = await >>= \case
@ -43,23 +45,28 @@ chunks n = L.unfoldr $ \case
run :: IO () run :: IO ()
run = do run = do
datDir <- fromMaybe "./" <$> lookupEnv "HOME" datDir <- fromMaybe "./" <$> lookupEnv "HOME"
let datFile = datDir </> ".addressbook.dat"
touchFile datFile
original <- Set.fromList . map LBS.toStrict . lbsLines <$> LBS.readFile datFile
xs <- LBS.getContents >>= stream xs <- LBS.getContents >>= stream
let set = F.fold (parMap rseq F.fold (chunks 20 xs)) let set = original `Set.union` F.fold (parMap rseq F.fold (chunks 200 xs))
unless (original == set) $
runResourceT $ runResourceT $
runConduit $ runConduit $
CL.sourceList (Set.elems set) CL.sourceList (Set.elems set)
.| C.map (<> "\n") .| C.map (<> "\n")
.| CB.sinkFileCautious (datDir </> ".addressbook.dat") .| CB.sinkFileCautious datFile
where where
separate = \case separate = \case
From x -> [x] From x -> [x]
To xs -> F.toList xs To xs -> F.toList xs
-- A set of (locally) unique addresses. Composes with parMap -- A set of (locally) unique addresses. Composes with parMap
lbsLines = LBS.split (fromIntegral $ ord '\n')
stream :: LBS.ByteString -> IO [Set ByteString] stream :: LBS.ByteString -> IO [Set ByteString]
stream = stream =
traverse (unsafeInterleaveIO . parse . LBC.unpack) traverse (unsafeInterleaveIO . parse . LBC.unpack)
. filter (not . LBS.null) . filter (not . LBS.null)
. LBS.split (fromIntegral $ ord '\n') . lbsLines
parse path = parse path =
runResourceT $ runResourceT $
runConduit $ runConduit $

View File

@ -1,4 +1,5 @@
{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE ApplicativeDo #-}
module Data.Email.Header where module Data.Email.Header where
import qualified Data.Foldable as F import qualified Data.Foldable as F
@ -21,28 +22,39 @@ data Header
decode :: ByteString -> Either String Header decode :: ByteString -> Either String Header
decode = parseOnly parseHeader decode = parseOnly parseHeader
where {-# INLINE decode #-}
parseHeader :: Parser Header parseHeader :: Parser Header
parseHeader = parseFrom <|> parseTo parseHeader = parseFrom <|> parseTo
{-# INLINE parseHeader #-}
parseFrom :: Parser Header parseFrom :: Parser Header
parseFrom = From <$> (string "From:" *> emptySpace *> (bracketEmail <|> email)) parseFrom = From <$> (string "From:" *> emptySpace *> (bracketEmail <|> email))
parseTo :: Parser Header parseTo :: Parser Header
parseTo = To <$> (string "To:" *> emptySpace *> emails) parseTo = To <$> (string "To:" *> emptySpace *> emails)
emptySpace = many' space
emptySpace :: Parser ()
emptySpace = () <$ many' space
emails :: Parser (Vector ByteString) emails :: Parser (Vector ByteString)
emails = V.fromList <$> (bracketEmail <|> email) `sepBy` char ',' emails = V.fromList <$> (bracketEmail <|> email) `sepBy` char ','
bracketEmail :: Parser ByteString bracketEmail :: Parser ByteString
bracketEmail = do bracketEmail = do
_ <- manyTill anyChar (char '<') _ <- manyTill anyChar (char '<')
email email
{-# INLINE bracketEmail #-}
email :: Parser ByteString email :: Parser ByteString
email = do email = do
_ <- many' space _ <- many' space
name <- BC.pack <$> many' (satisfy (\c -> not (isSpace c) && c /= '@')) name <- BC.pack <$> many' (satisfy (\c -> not (isSpace c) && c /= '@'))
_ <- char '@' _ <- char '@'
rest <- BC.pack <$> many' (satisfy (\c -> not (isSpace c) && c /= ',' && c /= '>')) rest <- BC.pack <$> many' (satisfy (\c -> not (isSpace c) && c /= ',' && c /= '>' && c /= '<'))
_ <- many' (notChar ',') _ <- many' (notChar ',')
pure (name <> "@" <> rest) pure (name <> "@" <> rest)
{-# INLINE email #-}
encode :: Header -> ByteString encode :: Header -> ByteString

View File

@ -31,4 +31,7 @@ tests = testGroup "Data.Email"
[ testCase "Can parse a sample email" $ do [ testCase "Can parse a sample email" $ do
got <- parseToList sample got <- parseToList sample
got @?= [ From "me@example.com", To ["you@example.com"]] got @?= [ From "me@example.com", To ["you@example.com"]]
, testCase "Combined mailto" $ do
let got = decode "To: James Doe<james.doe@example.com<mailto:james.doe@example.com>>"
got @?= Right (To ["james.doe@example.com"])
] ]