solidabis-koodihaaste/src/Data/Language.hs

{-# LANGUAGE RecordWildCards #-}
{-|
Module      : Data.Language
Description : The language model
Copyright   : (c) Mats Rauhala, 2019
License     : BSD3
Maintainer  : mats.rauhala@iki.fi
Stability   : experimental
Portability : POSIX

An n-gram model over a language. Should be trained with regular text.

The idea behind this module is to compare a sentence n-grams against the model. Higher the results, higher the probability of it being the same language as the model.
-}
module Data.Language where

import           Data.Foldable   (fold)
import           Data.Map.Strict (Map)
import qualified Data.Map.Strict as M
import           Data.Maybe      (fromMaybe)
import           Data.Monoid     (Sum (..))
import           Data.Text       (Text)

import           Data.NGram

-- | Model represents the language
data Model
  = Model { parts :: Map Text (Sum Int)
          , total :: Sum Int }

-- | Has* style pattern for accessing the language model
class HasModel a where
  getModel :: a -> Model
  setModel :: a -> Model -> a

instance HasModel Model where
  getModel = id
  setModel = const

-- | The default n-gram size
ngramSize :: Int
ngramSize = 3

-- | Build a language model out of a corpus content
buildModel :: Text -> Model
buildModel str =
  let parts = ngram ngramSize str
      total = fold parts
  in Model{..}

-- | Calculate the 'goodness' of a sentence
--
-- Higher numbers means higher probability of being part of the same language as the model
goodness :: Model -> Text -> Double
goodness Model{..} str =
  let comparison = M.keys $ ngram ngramSize str
      ranksum = getSum $ foldMap (Sum . log . elm) comparison
  in ranksum / fromIntegral (length comparison)
  where
    elm x = fromIntegral (getSum (fromMaybe 1 $ M.lookup x parts)) / fromIntegral (getSum total)