61 lines
1.8 KiB
Haskell
61 lines
1.8 KiB
Haskell
{-# LANGUAGE RecordWildCards #-}
|
|
{-|
|
|
Module : Data.Language
|
|
Description : The language model
|
|
Copyright : (c) Mats Rauhala, 2019
|
|
License : BSD3
|
|
Maintainer : mats.rauhala@iki.fi
|
|
Stability : experimental
|
|
Portability : POSIX
|
|
|
|
An n-gram model over a language. Should be trained with regular text.
|
|
|
|
The idea behind this module is to compare a sentence n-grams against the model. Higher the results, higher the probability of it being the same language as the model.
|
|
-}
|
|
module Data.Language where
|
|
|
|
import Data.Foldable (fold)
|
|
import Data.Map.Strict (Map)
|
|
import qualified Data.Map.Strict as M
|
|
import Data.Maybe (fromMaybe)
|
|
import Data.Monoid (Sum (..))
|
|
import Data.Text (Text)
|
|
|
|
import Data.NGram
|
|
|
|
-- | Model represents the language
|
|
data Model
|
|
= Model { parts :: Map Text (Sum Int)
|
|
, total :: Sum Int }
|
|
|
|
-- | Has* style pattern for accessing the language model
|
|
class HasModel a where
|
|
getModel :: a -> Model
|
|
setModel :: a -> Model -> a
|
|
|
|
instance HasModel Model where
|
|
getModel = id
|
|
setModel = const
|
|
|
|
-- | The default n-gram size
|
|
ngramSize :: Int
|
|
ngramSize = 3
|
|
|
|
-- | Build a language model out of a corpus content
|
|
buildModel :: Text -> Model
|
|
buildModel str =
|
|
let parts = ngram ngramSize str
|
|
total = fold parts
|
|
in Model{..}
|
|
|
|
-- | Calculate the 'goodness' of a sentence
|
|
--
|
|
-- Higher numbers means higher probability of being part of the same language as the model
|
|
goodness :: Model -> Text -> Double
|
|
goodness Model{..} str =
|
|
let comparison = M.keys $ ngram ngramSize str
|
|
ranksum = getSum $ foldMap (Sum . log . elm) comparison
|
|
in ranksum / fromIntegral (length comparison)
|
|
where
|
|
elm x = fromIntegral (getSum (fromMaybe 1 $ M.lookup x parts)) / fromIntegral (getSum total)
|