From 046ae33fb4c0c4d070dff7591c04741d6ed9a830 Mon Sep 17 00:00:00 2001 From: Mats Rauhala Date: Tue, 15 Oct 2019 20:22:31 +0300 Subject: [PATCH] Helpers --- src/Data/Caesar.hs | 16 ++++++++++++++++ src/Data/Language.hs | 31 +++++++++++++++++++++++++++++++ src/Data/NGram.hs | 17 +++++++++++++++++ 3 files changed, 64 insertions(+) diff --git a/src/Data/Caesar.hs b/src/Data/Caesar.hs index 6956edb..ce3c37a 100644 --- a/src/Data/Caesar.hs +++ b/src/Data/Caesar.hs @@ -1 +1,17 @@ module Data.Caesar where + +type Caesar = Char + +next :: Char -> Char +next c = + case c of + '.' -> '.' + ' ' -> ' ' + 'ö' -> 'ä' + 'ä' -> 'å' + 'å' -> 'z' + 'a' -> 'ö' + x -> pred x + +caesar :: Int -> Caesar -> Caesar +caesar n x = foldr ($) x (replicate n next) diff --git a/src/Data/Language.hs b/src/Data/Language.hs index dde92a5..69df3a3 100644 --- a/src/Data/Language.hs +++ b/src/Data/Language.hs @@ -1 +1,32 @@ +{-# LANGUAGE RecordWildCards #-} module Data.Language where + +import Data.Foldable (fold) +import Data.Map.Strict (Map) +import qualified Data.Map.Strict as M +import Data.Maybe (fromMaybe) +import Data.Monoid (Sum (..)) +import Data.Text (Text) + +import Data.NGram + +data Model + = Model { parts :: Map Text (Sum Int) + , total :: Sum Int } + +ngram_size :: Int +ngram_size = 3 + +buildModel :: Text -> Model +buildModel str = + let parts = ngram ngram_size str + total = fold parts + in Model{..} + +goodness :: Model -> Text -> Double +goodness Model{..} str = + let comparison = M.keys $ ngram ngram_size str + ranksum = getSum $ foldMap (Sum . log . elm) comparison + in ranksum / fromIntegral (length comparison) + where + elm x = fromIntegral (getSum (fromMaybe 1 $ M.lookup x parts)) / fromIntegral (getSum total) diff --git a/src/Data/NGram.hs b/src/Data/NGram.hs index 19d7351..3c0efe6 100644 --- a/src/Data/NGram.hs +++ b/src/Data/NGram.hs @@ -1 +1,18 @@ +{-# LANGUAGE OverloadedStrings #-} module Data.NGram where + +import Data.List (unfoldr) +import Data.Map.Strict (Map) +import qualified Data.Map.Strict as M +import Data.Monoid (Sum (..)) +import Data.Text (Text) +import qualified Data.Text as T + +ngram :: Int -> Text -> Map Text (Sum Int) +ngram n = M.unionsWith (<>) . unfoldr go + where + go :: Text -> Maybe (Map Text (Sum Int), Text) + go str = + case T.splitAt n str of + ("", _) -> Nothing + (xs, ys) -> Just (M.singleton (T.toLower xs) 1, ys)