From 87f6eb00f6e649b3c59aa371a4bfa1b5b4dc4f52 Mon Sep 17 00:00:00 2001 From: Mats Rauhala Date: Fri, 29 Oct 2021 17:22:29 +0300 Subject: [PATCH] Bring the addressbook back up to date - bytestring-trie is not available - hls - modern shell.nix - base - Re-implement trie, with benchmarks - Realize my implementation of trie is slower than Data.Map, use that instead --- addressbook.cabal | 42 ++++++++++++--- bench/Bench.hs | 54 ++++++++++++++++++++ default.nix | 20 ++++---- easy-hls-nix.json | 11 ++++ easy-hsl-nix.json | 10 ++++ shell.nix | 17 +++++-- src/Control/Addressbook/Query.hs | 16 +++--- src/Data/Trie.hs | 87 ++++++++++++++++++++++++++++++++ test/MyLibTest.hs | 2 + test/Test/Data/Trie.hs | 45 +++++++++++++++++ 10 files changed, 278 insertions(+), 26 deletions(-) create mode 100644 bench/Bench.hs create mode 100644 easy-hls-nix.json create mode 100644 easy-hsl-nix.json create mode 100644 src/Data/Trie.hs create mode 100644 test/Test/Data/Trie.hs diff --git a/addressbook.cabal b/addressbook.cabal index a1b4f92..6bf76b3 100644 --- a/addressbook.cabal +++ b/addressbook.cabal @@ -16,8 +16,13 @@ maintainer: mats.rauhala@iki.fi -- category: extra-source-files: CHANGELOG.md +common deps + build-depends: base >=4.13.0.0 && <4.15 + library + import: deps exposed-modules: MyLib + , Data.Trie , Data.Email.Header , Data.Email , Control.Addressbook.Streaming @@ -26,15 +31,13 @@ library -- other-extensions: default-extensions: OverloadedStrings LambdaCase - build-depends: base ^>=4.13.0.0 - , attoparsec + build-depends: attoparsec , mtl , conduit , conduit-extra , bytestring , lens , text - , bytestring-trie , vector , containers , filepath @@ -43,27 +46,29 @@ library ghc-options: -Wall executable addressbook + import: deps main-is: Main.hs -- other-modules: -- other-extensions: - build-depends: base ^>=4.13.0.0, addressbook + build-depends: addressbook , optparse-applicative , text hs-source-dirs: app default-language: Haskell2010 - ghc-options: -Wall + ghc-options: -Wall -threaded test-suite addressbook-test + import: deps default-language: Haskell2010 default-extensions: OverloadedStrings LambdaCase other-modules: Test.Data.Email.Header , Test.Data.Email + , Test.Data.Trie type: exitcode-stdio-1.0 hs-source-dirs: test main-is: MyLibTest.hs - build-depends: base ^>=4.13.0.0 - , addressbook + build-depends: addressbook , tasty , tasty-hedgehog , tasty-hunit @@ -75,4 +80,27 @@ test-suite addressbook-test , vector , conduit , conduit-extra + , containers ghc-options: -Wall + + +-- I know there's a benchmark type, but haskellPackages.shellFor doesn't seem +-- to support it and I don't have the time to figure it out +executable addressbook-bench + import: deps + default-language: Haskell2010 + default-extensions: OverloadedStrings + LambdaCase + -- other-modules: Test.Data.Email.Header + -- , Test.Data.Email + -- , Test.Data.Trie + hs-source-dirs: bench + main-is: Bench.hs + build-depends: addressbook + , criterion + , hedgehog-corpus + , bytestring + , containers + ghc-options: -Wall + + diff --git a/bench/Bench.hs b/bench/Bench.hs new file mode 100644 index 0000000..1761ab3 --- /dev/null +++ b/bench/Bench.hs @@ -0,0 +1,54 @@ +module Main where + +import Criterion.Main (defaultMain) +import Data.ByteString (ByteString) +import Hedgehog.Corpus (muppets, simpsons, vegetables, viruses) +import Criterion +import qualified Data.Map.Strict as M +import qualified Data.ByteString.Char8 as B +import qualified Data.Trie as Trie + +setupEnv :: IO ([(ByteString, ())], [(ByteString, ())]) +setupEnv = pure (small, big) + where + small = + [ (firstname <> "." <> surname <> "@" <> domain <> "." <> tld, ()) + | firstname <- simpsons + , surname <- muppets + , domain <- vegetables + , tld <- ["com", "fi", "co.uk", "info", "org"] + ] + big = + [ (firstname <> "." <> surname <> "@" <> domain1 <> "." <> domain2 <> "." <> tld, ()) + | firstname <- simpsons + , surname <- muppets + , domain1 <- vegetables + , domain2 <- viruses + , tld <- ["com", "fi", "co.uk", "info", "org"] + ] + +main :: IO () +main = defaultMain + [ env setupEnv $ \ ~(small,big) -> bgroup "main" + [ bgroup "small" + [ bgroup "Data.Map" + [ bench "length . toList . fromList" $ whnf (length . M.toList . M.fromList) small + , bench "length . filter (\"homer\" `isPrefixOf`) . keys . fromList" $ whnf (length . M.keys . M.filterWithKey (\k _ -> "homer" `B.isPrefixOf` k) . M.fromList) small + ] + , bgroup "Data.Trie" + [ bench "length . toList . fromList" $ whnf (length . Trie.toList . Trie.fromList) small + , bench "length . keys . submap \"homer\" . fromList" $ whnf (length . Trie.keys . Trie.submap "homer" . Trie.fromList) small + ] + ] + , bgroup "big" + [ bgroup "Data.Map" + [ bench "length . toList . fromList" $ whnf (length . M.toList . M.fromList) big + , bench "length . filter (\"homer\" `isPrefixOf`) . keys . fromList" $ whnf (length . M.keys . M.filterWithKey (\k _ -> "homer" `B.isPrefixOf` k) . M.fromList) big + ] + , bgroup "Data.Trie" + [ bench "length . toList . fromList" $ whnf (length . Trie.toList . Trie.fromList) big + , bench "length . keys . submap \"homer\" . fromList" $ whnf (length . Trie.keys . Trie.submap "homer" . Trie.fromList) big + ] + ] + ] + ] diff --git a/default.nix b/default.nix index 4df2eca..d687fff 100644 --- a/default.nix +++ b/default.nix @@ -1,6 +1,6 @@ -{ mkDerivation, attoparsec, base, bytestring, bytestring-trie -, conduit, conduit-extra, containers, filepath, hedgehog -, hedgehog-corpus, HUnit, lens, mtl, optparse-applicative, stdenv +{ mkDerivation, attoparsec, base, bytestring, conduit +, conduit-extra, containers, criterion, filepath, hedgehog +, hedgehog-corpus, HUnit, lens, lib, mtl, optparse-applicative , tasty, tasty-hedgehog, tasty-hunit, text, vector }: mkDerivation { @@ -10,13 +10,15 @@ mkDerivation { isLibrary = true; isExecutable = true; libraryHaskellDepends = [ - attoparsec base bytestring bytestring-trie conduit conduit-extra - containers filepath lens mtl text vector + attoparsec base bytestring conduit conduit-extra containers + filepath lens mtl text vector + ]; + executableHaskellDepends = [ + base criterion hedgehog-corpus optparse-applicative text ]; - executableHaskellDepends = [ base optparse-applicative text ]; testHaskellDepends = [ - base bytestring conduit conduit-extra hedgehog hedgehog-corpus - HUnit tasty tasty-hedgehog tasty-hunit text vector + base bytestring conduit conduit-extra containers hedgehog + hedgehog-corpus HUnit tasty tasty-hedgehog tasty-hunit text vector ]; - license = stdenv.lib.licenses.bsd3; + license = lib.licenses.bsd3; } diff --git a/easy-hls-nix.json b/easy-hls-nix.json new file mode 100644 index 0000000..13bd576 --- /dev/null +++ b/easy-hls-nix.json @@ -0,0 +1,11 @@ +{ + "url": "https://github.com/ssbothwell/easy-hls-nix", + "rev": "393ccab35104d5d49e0ff9eadf7b8654e87abffd", + "date": "2021-09-16T11:13:40-07:00", + "path": "/nix/store/dsfqcsiahsp9rkip4fsqzz32x0swa3d4-easy-hls-nix", + "sha256": "0q1qxlkzjqx2hvf9k2cp5a98vlvsj13lap6hm7gl1kkqp88ai1dw", + "fetchLFS": false, + "fetchSubmodules": false, + "deepClone": false, + "leaveDotGit": false +} diff --git a/easy-hsl-nix.json b/easy-hsl-nix.json new file mode 100644 index 0000000..c024c9e --- /dev/null +++ b/easy-hsl-nix.json @@ -0,0 +1,10 @@ +{ + "url": "https://github.com/jkachmar/easy-hls-nix", + "rev": "291cf77f512a7121bb6801cde35ee1e8b7287f91", + "date": "2021-04-13T16:12:58-04:00", + "path": "/nix/store/wlmckg348q32ylfpg5dv3d5x88nkbi59-easy-hls-nix", + "sha256": "1bvbcp9zwmh53sm16ycp8phhc6vzc72a71sf0bvyjgfbn6zp68bc", + "fetchSubmodules": false, + "deepClone": false, + "leaveDotGit": false +} diff --git a/shell.nix b/shell.nix index 0df029a..48ff83f 100644 --- a/shell.nix +++ b/shell.nix @@ -1,15 +1,24 @@ with (import {}); -let addressbook = haskellPackages.callPackage ./. {}; +let + hp = haskellPackages.extend (self: super: { + addressbook = self.callPackage ./. {}; + }); + easy-hls = callPackage easy-hls-src { ghcVersions = [ hp.ghc.version ]; }; + easy-hls-src = fetchFromGitHub { + owner = "ssbothwell"; + repo = "easy-hls-nix"; + inherit (builtins.fromJSON (builtins.readFile ./easy-hls-nix.json)) rev sha256; + }; in -mkShell { - name = "addressbook-shell"; +hp.shellFor { + packages = h: [h.addressbook]; buildInputs = [ ghcid + easy-hls stylish-haskell haskellPackages.cabal-install - (haskellPackages.ghcWithHoogle (_: addressbook.buildInputs ++ addressbook.propagatedBuildInputs)) ]; } diff --git a/src/Control/Addressbook/Query.hs b/src/Control/Addressbook/Query.hs index 02ce4c3..ca43a54 100644 --- a/src/Control/Addressbook/Query.hs +++ b/src/Control/Addressbook/Query.hs @@ -12,7 +12,7 @@ import qualified Data.Conduit.Combinators as C import qualified Data.Conduit.List as CL import qualified Data.Conduit.Text as CT -import qualified Data.Trie as Trie +import qualified Data.Map.Strict as Map import System.IO (stdout) @@ -26,22 +26,26 @@ import System.FilePath import Control.Exception (catch) +import Data.ByteString (ByteString) +import qualified Data.ByteString.Char8 as B +-- XXX: The current Data.Trie implementation is much slower than Data.Set query :: Text -> IO () query prefix = do datDir <- fromMaybe "./" <$> lookupEnv "HOME" - state <- catch @IOError (runResourceT $ runConduit $ readState datDir) (\_ -> pure Trie.empty) + state <- catch @IOError (runResourceT $ runConduit $ readState datDir) (\_ -> pure Map.empty) runConduit $ outputResults state where - readState :: FilePath -> ConduitM () Void (ResourceT IO) (Trie.Trie [Text]) + readState :: FilePath -> ConduitM () Void (ResourceT IO) (Map.Map ByteString [Text]) readState dir = CB.sourceFile (dir ".addressbook.dat") .| CT.decode CT.utf8 .| CT.lines - .| C.foldMap (\s -> Trie.singleton (TE.encodeUtf8 $ T.toLower s) [s]) - outputResults :: Trie.Trie [Text] -> ConduitM () Void IO () + .| C.foldMap (\s -> Map.singleton (TE.encodeUtf8 $ T.toLower s) [s]) + prefixB = TE.encodeUtf8 prefix + outputResults :: Map.Map ByteString [Text] -> ConduitM () Void IO () outputResults state = - CL.sourceList (Trie.elems $ Trie.submap (TE.encodeUtf8 prefix) state) + CL.sourceList (Map.elems $ Map.filterWithKey (\k _ -> prefixB `B.isPrefixOf` k) state) .| C.concat .| C.map (<> "\n") .| CT.encode CT.utf8 diff --git a/src/Data/Trie.hs b/src/Data/Trie.hs new file mode 100644 index 0000000..304c228 --- /dev/null +++ b/src/Data/Trie.hs @@ -0,0 +1,87 @@ +-- The bytestring-trie is marked as broken. Trie is a simple(ish) datastructure, implement one here +{-# LANGUAGE ScopedTypeVariables #-} +{-# LANGUAGE DeriveFunctor #-} +{-# LANGUAGE TypeApplications #-} +{-# LANGUAGE DeriveFoldable #-} +module Data.Trie where +import Data.ByteString (ByteString) +import Data.Word (Word8) +import Data.Foldable (foldl') +import Data.Semigroup (Last(..)) +import qualified Data.ByteString as B +import Data.Coerce (coerce) +import Data.IntMap (IntMap) +import qualified Data.IntMap.Strict as M + +data Pair a b = Pair !a !b + deriving (Functor, Foldable) + +data Trie a + = Empty + | Branch !(Maybe (Pair ByteString a)) !(IntMap (Trie a)) + deriving (Functor, Foldable) + +instance Semigroup a => Semigroup (Trie a) where + (<>) = union + +instance Monoid a => Monoid (Trie a) where + mempty = Empty + +instance Show a => Show (Trie a) where + show = show . toList + +keys :: Trie a -> [ByteString] +keys Empty = [] +keys (Branch Nothing m) = concatMap keys (M.elems m) +keys (Branch (Just (Pair b _)) m) = b : concatMap keys (M.elems m) + +union :: Semigroup a => Trie a -> Trie a -> Trie a +union Empty r = r +union l Empty = l +union (Branch m_pa cl) (Branch ma cr) = Branch (merge m_pa ma) (M.unionWith union cl cr) + where + merge (Just (Pair x a)) (Just (Pair _ b)) = Just $ Pair x (a <> b) + merge Nothing r = r + merge l Nothing = l + +unionR :: forall a. Trie a -> Trie a -> Trie a +unionR a b = coerce @(Trie (Last a)) $ union (coerce a) (coerce b) + +singleton :: forall a. ByteString -> a -> Trie a +singleton bs a = go (B.uncons bs) + where + go :: Maybe (Word8, ByteString) -> Trie a + go Nothing = + let x = Pair bs a + in x `seq` Branch (Just (Pair bs a)) M.empty + go (Just (w,c)) = + let y = M.singleton (fromIntegral w) (go (B.uncons c)) + in y `seq` Branch Nothing y + +empty :: Trie a +empty = Empty + +elems :: Trie a -> [a] +elems = foldr (:) [] + +submap :: forall a. ByteString -> Trie a -> Trie a +submap bs = go (B.uncons bs) + where + go :: Maybe (Word8, ByteString) -> Trie a -> Trie a + go _ Empty = Empty + go Nothing t = t + go (Just (w,cs)) (Branch _ c) = maybe empty (go (B.uncons cs)) $ M.lookup (fromIntegral w) c + +insert :: forall a. ByteString -> a -> Trie a -> Trie a +insert bs a = (`unionR` singleton bs a) + +fromList :: [(ByteString, a)] -> Trie a +fromList = foldl' (\acc (k,v) -> insert k v acc) empty + +toList :: forall a. Trie a -> [(ByteString, a)] +toList = go + where + go :: Trie a -> [(ByteString, a)] + go Empty = [] + go (Branch Nothing m) = concatMap (\(_, child) -> go child) (M.toList m) + go (Branch (Just (Pair a b)) m) = (a, b) : concatMap (\(_, child) -> go child) (M.toList m) diff --git a/test/MyLibTest.hs b/test/MyLibTest.hs index 3f29032..522edac 100644 --- a/test/MyLibTest.hs +++ b/test/MyLibTest.hs @@ -4,11 +4,13 @@ import Test.Tasty import qualified Test.Data.Email as Data.Email import qualified Test.Data.Email.Header as Data.Email.Header +import qualified Test.Data.Trie as Data.Trie tests :: TestTree tests = testGroup "tests" [ Data.Email.Header.tests , Data.Email.tests + , Data.Trie.tests ] main :: IO () diff --git a/test/Test/Data/Trie.hs b/test/Test/Data/Trie.hs new file mode 100644 index 0000000..f779fbb --- /dev/null +++ b/test/Test/Data/Trie.hs @@ -0,0 +1,45 @@ +module Test.Data.Trie where + +import Test.Tasty +import Test.Tasty.Hedgehog (testProperty) +import Hedgehog +import Data.Monoid (Sum(..)) +import Data.ByteString (ByteString) +import Data.Map (Map) +import qualified Hedgehog.Gen as Gen +import qualified Hedgehog.Range as Range +import qualified Data.Trie as T +import qualified Data.Map.Strict as M + +genMap :: Gen (Map ByteString (Sum Int)) +genMap = Gen.map (Range.linear 0 100) genT + where + genT = (,) <$> Gen.bytes (Range.linear 0 20) <*> fmap Sum (Gen.integral (Range.linear 0 10)) + +propIsomorphic :: Property +propIsomorphic = property $ do + m <- forAll genMap + let wanted = m + got = M.fromList . T.toList . T.fromList . M.toList $ wanted + wanted === got + +propElems :: Property +propElems = property $ do + m <- forAll genMap + let wanted = M.elems m + got = T.elems . T.fromList . M.toList $ m + wanted === got + +propKeys :: Property +propKeys = property $ do + m <- forAll genMap + let wanted = M.keys m + got = T.keys . T.fromList . M.toList $ m + wanted === got + +tests :: TestTree +tests = testGroup "Data.Trie" + [ testProperty "isomorphic to Map" $ propIsomorphic + , testProperty "isomorphic elems" $ propElems + , testProperty "isomorphic keys" $ propKeys + ]