Bring the addressbook back up to date

- bytestring-trie is not available
- hls
- modern shell.nix
- base
- Re-implement trie, with benchmarks
- Realize my implementation of trie is slower than Data.Map, use that
  instead
This commit is contained in:
2021-10-29 17:22:29 +03:00
parent a672fecbc9
commit 87f6eb00f6
10 changed files with 278 additions and 26 deletions

View File

@ -12,7 +12,7 @@ import qualified Data.Conduit.Combinators as C
import qualified Data.Conduit.List as CL
import qualified Data.Conduit.Text as CT
import qualified Data.Trie as Trie
import qualified Data.Map.Strict as Map
import System.IO
(stdout)
@ -26,22 +26,26 @@ import System.FilePath
import Control.Exception
(catch)
import Data.ByteString (ByteString)
import qualified Data.ByteString.Char8 as B
-- XXX: The current Data.Trie implementation is much slower than Data.Set
query :: Text -> IO ()
query prefix = do
datDir <- fromMaybe "./" <$> lookupEnv "HOME"
state <- catch @IOError (runResourceT $ runConduit $ readState datDir) (\_ -> pure Trie.empty)
state <- catch @IOError (runResourceT $ runConduit $ readState datDir) (\_ -> pure Map.empty)
runConduit $ outputResults state
where
readState :: FilePath -> ConduitM () Void (ResourceT IO) (Trie.Trie [Text])
readState :: FilePath -> ConduitM () Void (ResourceT IO) (Map.Map ByteString [Text])
readState dir =
CB.sourceFile (dir </> ".addressbook.dat")
.| CT.decode CT.utf8
.| CT.lines
.| C.foldMap (\s -> Trie.singleton (TE.encodeUtf8 $ T.toLower s) [s])
outputResults :: Trie.Trie [Text] -> ConduitM () Void IO ()
.| C.foldMap (\s -> Map.singleton (TE.encodeUtf8 $ T.toLower s) [s])
prefixB = TE.encodeUtf8 prefix
outputResults :: Map.Map ByteString [Text] -> ConduitM () Void IO ()
outputResults state =
CL.sourceList (Trie.elems $ Trie.submap (TE.encodeUtf8 prefix) state)
CL.sourceList (Map.elems $ Map.filterWithKey (\k _ -> prefixB `B.isPrefixOf` k) state)
.| C.concat
.| C.map (<> "\n")
.| CT.encode CT.utf8

87
src/Data/Trie.hs Normal file
View File

@ -0,0 +1,87 @@
-- The bytestring-trie is marked as broken. Trie is a simple(ish) datastructure, implement one here
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE DeriveFunctor #-}
{-# LANGUAGE TypeApplications #-}
{-# LANGUAGE DeriveFoldable #-}
module Data.Trie where
import Data.ByteString (ByteString)
import Data.Word (Word8)
import Data.Foldable (foldl')
import Data.Semigroup (Last(..))
import qualified Data.ByteString as B
import Data.Coerce (coerce)
import Data.IntMap (IntMap)
import qualified Data.IntMap.Strict as M
data Pair a b = Pair !a !b
deriving (Functor, Foldable)
data Trie a
= Empty
| Branch !(Maybe (Pair ByteString a)) !(IntMap (Trie a))
deriving (Functor, Foldable)
instance Semigroup a => Semigroup (Trie a) where
(<>) = union
instance Monoid a => Monoid (Trie a) where
mempty = Empty
instance Show a => Show (Trie a) where
show = show . toList
keys :: Trie a -> [ByteString]
keys Empty = []
keys (Branch Nothing m) = concatMap keys (M.elems m)
keys (Branch (Just (Pair b _)) m) = b : concatMap keys (M.elems m)
union :: Semigroup a => Trie a -> Trie a -> Trie a
union Empty r = r
union l Empty = l
union (Branch m_pa cl) (Branch ma cr) = Branch (merge m_pa ma) (M.unionWith union cl cr)
where
merge (Just (Pair x a)) (Just (Pair _ b)) = Just $ Pair x (a <> b)
merge Nothing r = r
merge l Nothing = l
unionR :: forall a. Trie a -> Trie a -> Trie a
unionR a b = coerce @(Trie (Last a)) $ union (coerce a) (coerce b)
singleton :: forall a. ByteString -> a -> Trie a
singleton bs a = go (B.uncons bs)
where
go :: Maybe (Word8, ByteString) -> Trie a
go Nothing =
let x = Pair bs a
in x `seq` Branch (Just (Pair bs a)) M.empty
go (Just (w,c)) =
let y = M.singleton (fromIntegral w) (go (B.uncons c))
in y `seq` Branch Nothing y
empty :: Trie a
empty = Empty
elems :: Trie a -> [a]
elems = foldr (:) []
submap :: forall a. ByteString -> Trie a -> Trie a
submap bs = go (B.uncons bs)
where
go :: Maybe (Word8, ByteString) -> Trie a -> Trie a
go _ Empty = Empty
go Nothing t = t
go (Just (w,cs)) (Branch _ c) = maybe empty (go (B.uncons cs)) $ M.lookup (fromIntegral w) c
insert :: forall a. ByteString -> a -> Trie a -> Trie a
insert bs a = (`unionR` singleton bs a)
fromList :: [(ByteString, a)] -> Trie a
fromList = foldl' (\acc (k,v) -> insert k v acc) empty
toList :: forall a. Trie a -> [(ByteString, a)]
toList = go
where
go :: Trie a -> [(ByteString, a)]
go Empty = []
go (Branch Nothing m) = concatMap (\(_, child) -> go child) (M.toList m)
go (Branch (Just (Pair a b)) m) = (a, b) : concatMap (\(_, child) -> go child) (M.toList m)