solidabis-koodihaaste/src/Data/Language.hs

{-# LANGUAGE RecordWildCards #-}
{-|
Module      : Data.Language
Description : The language model
Copyright   : (c) Mats Rauhala, 2019
License     : BSD3
Maintainer  : mats.rauhala@iki.fi
Stability   : experimental
Portability : POSIX

An n-gram model over a language. It should be trained with regular text.

The idea behind this module is to compare a sentence's n-grams against the
model: the higher the result, the higher the probability that the sentence is
in the same language as the model.
-}
module Data.Language where

import Data.Foldable (fold)
import Data.Map.Strict (Map)
import qualified Data.Map.Strict as M
import Data.Maybe (fromMaybe)
import Data.Monoid (Sum (..))
import Data.Text (Text)

import Data.NGram

-- | Model represents the language
data Model
  = Model { parts :: Map Text (Sum Int)
          , total :: Sum Int }

-- | Has* style pattern for accessing the language model
class HasModel a where
  getModel :: a -> Model
  setModel :: a -> Model -> a

instance HasModel Model where
  getModel = id
  setModel = const
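
-- A hedged sketch, not part of the original module: the Has* pattern above is
-- typically used to embed the 'Model' in a larger application environment and
-- access it generically. The record 'AppEnv' and its fields are hypothetical.
--
-- @
-- data AppEnv = AppEnv { appModel :: Model, appName :: Text }  -- hypothetical environment
--
-- instance HasModel AppEnv where
--   getModel = appModel
--   setModel env m = env { appModel = m }
-- @
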
-- | The default n-gram size
ngramSize :: Int
ngramSize = 3

-- | Build a language model out of the contents of a corpus
buildModel :: Text -> Model
buildModel str =
  let parts = ngram ngramSize str
      total = fold parts
  in Model{..}
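
-- A hedged usage sketch, not part of the original module, assuming 'ngram'
-- from "Data.NGram" counts the n-grams of the given size in its input text.
-- The training string below is a hypothetical stand-in for a real corpus.
--
-- @
-- import qualified Data.Text as T
--
-- exampleModel :: Model
-- exampleModel = buildModel (T.pack "some plain training text in the target language")
-- @
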
-- | Calculate the 'goodness' of a sentence
--
-- A higher number means a higher probability of the sentence being part of
-- the same language as the model.
goodness :: Model -> Text -> Double
goodness Model{..} str =
  let comparison = M.keys $ ngram ngramSize str
      ranksum = getSum $ foldMap (Sum . log . elm) comparison
  in ranksum / fromIntegral (length comparison)
  where
    -- Relative frequency of an n-gram within the model; unseen n-grams fall
    -- back to a count of 1 so the logarithm stays finite.
    elm x = fromIntegral (getSum (fromMaybe 1 $ M.lookup x parts))
          / fromIntegral (getSum total)
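
-- A hedged usage sketch, not part of the original module: with one trained
-- 'Model' per language, the most likely language of a sentence can be picked
-- by comparing 'goodness' scores. 'maximumBy' is from "Data.List" and
-- 'comparing' from "Data.Ord"; the @(name, model)@ pairs are supplied by the
-- caller, the list is assumed non-empty, and 'classify' itself is hypothetical.
--
-- @
-- import Data.List (maximumBy)
-- import Data.Ord (comparing)
--
-- classify :: [(Text, Model)] -> Text -> Text
-- classify models sentence =
--   fst (maximumBy (comparing (flip goodness sentence . snd)) models)
-- @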