33 lines
900 B
Haskell
33 lines
900 B
Haskell
{-# LANGUAGE RecordWildCards #-}
|
|
module Data.Language where
|
|
|
|
import Data.Foldable (fold)
|
|
import Data.Map.Strict (Map)
|
|
import qualified Data.Map.Strict as M
|
|
import Data.Maybe (fromMaybe)
|
|
import Data.Monoid (Sum (..))
|
|
import Data.Text (Text)
|
|
|
|
import Data.NGram
|
|
|
|
data Model
|
|
= Model { parts :: Map Text (Sum Int)
|
|
, total :: Sum Int }
|
|
|
|
ngram_size :: Int
|
|
ngram_size = 3
|
|
|
|
buildModel :: Text -> Model
|
|
buildModel str =
|
|
let parts = ngram ngram_size str
|
|
total = fold parts
|
|
in Model{..}
|
|
|
|
goodness :: Model -> Text -> Double
|
|
goodness Model{..} str =
|
|
let comparison = M.keys $ ngram ngram_size str
|
|
ranksum = getSum $ foldMap (Sum . log . elm) comparison
|
|
in ranksum / fromIntegral (length comparison)
|
|
where
|
|
elm x = fromIntegral (getSum (fromMaybe 1 $ M.lookup x parts)) / fromIntegral (getSum total)
|