solidabis-koodihaaste/src/Data/NGram.hs

31 lines
855 B
Haskell

{-# LANGUAGE OverloadedStrings #-}
{-|
Module : Data.NGram
Description : N-gram utilities
Copyright : (c) Mats Rauhala, 2019
License : BSD3
Maintainer : mats.rauhala@iki.fi
Stability : experimental
Portability : POSIX
Utilities for building n-gram models
-}
module Data.NGram where
import Data.List (unfoldr)
import Data.Map.Strict (Map)
import qualified Data.Map.Strict as M
import Data.Monoid (Sum (..))
import Data.Text (Text)
import qualified Data.Text as T
-- | Build an n-gram frequency map.
--
-- The input is cut into consecutive, non-overlapping chunks of @n@
-- characters; the final chunk may be shorter when the input length is not
-- a multiple of @n@. Each chunk is lower-cased (after splitting) and used as
-- a key whose value is the number of times that chunk occurred.
--
-- A non-positive @n@ or an empty input yields an empty map.
--
-- >>> ngram 2 "abAB"
-- fromList [("ab",Sum {getSum = 2})]
ngram :: Int -> Text -> Map Text (Sum Int)
ngram n = M.fromListWith (<>) . unfoldr step
  where
    -- Peel one chunk off the front of the remaining input; stop as soon as
    -- nothing can be taken (input exhausted, or @n@ is not positive).
    step :: Text -> Maybe ((Text, Sum Int), Text)
    step remaining
      | T.null chunk = Nothing
      | otherwise = Just ((T.toLower chunk, Sum 1), rest)
      where
        (chunk, rest) = T.splitAt n remaining