31 lines
855 B
Haskell
31 lines
855 B
Haskell
{-# LANGUAGE OverloadedStrings #-}
|
|
{-|
Module      : Data.NGram
Description : N-gram utilities
Copyright   : (c) Mats Rauhala, 2019
License     : BSD3
Maintainer  : mats.rauhala@iki.fi
Stability   : experimental
Portability : POSIX

Utilities for building n-gram models
-}
|
|
module Data.NGram where
|
|
|
|
import Data.List (unfoldr)
|
|
import Data.Map.Strict (Map)
|
|
import qualified Data.Map.Strict as M
|
|
import Data.Monoid (Sum (..))
|
|
import Data.Text (Text)
|
|
import qualified Data.Text as T
|
|
|
|
-- | Build an n-gram frequency map.
--
-- The input is cut into consecutive, non-overlapping chunks of @n@
-- characters; the final chunk may be shorter when the input length is not a
-- multiple of @n@. Each chunk is lower-cased /after/ it has been cut and is
-- then used as a key, with the value counting how often that key occurred.
--
-- Empty input, or an @n@ of zero or less, yields an empty map.
ngram :: Int -> Text -> Map Text (Sum Int)
ngram n text =
  M.fromListWith (<>) [(T.toLower chunk, Sum 1) | chunk <- T.chunksOf n text]
|