solidabis-koodihaaste/src/Data/Language.hs

33 lines
900 B
Haskell

{-# LANGUAGE RecordWildCards #-}
module Data.Language where
import Data.Foldable (fold)
import Data.Map.Strict (Map)
import qualified Data.Map.Strict as M
import Data.Maybe (fromMaybe)
import Data.Monoid (Sum (..))
import Data.Text (Text)
import Data.NGram
data Model
= Model { parts :: Map Text (Sum Int)
, total :: Sum Int }
ngram_size :: Int
ngram_size = 3
buildModel :: Text -> Model
buildModel str =
let parts = ngram ngram_size str
total = fold parts
in Model{..}
goodness :: Model -> Text -> Double
goodness Model{..} str =
let comparison = M.keys $ ngram ngram_size str
ranksum = getSum $ foldMap (Sum . log . elm) comparison
in ranksum / fromIntegral (length comparison)
where
elm x = fromIntegral (getSum (fromMaybe 1 $ M.lookup x parts)) / fromIntegral (getSum total)