From 4adc75c33c2a3cc0f64a993232bc7c34c884bfc9 Mon Sep 17 00:00:00 2001 From: Mats Rauhala Date: Fri, 12 Nov 2021 17:25:47 +0200 Subject: [PATCH] Parse RSS and have published timestamps --- FeedMonad-demo/app/Main.hs | 13 +++++- FeedMonad/FeedMonad.cabal | 3 ++ FeedMonad/src/Data/Entry.hs | 2 + FeedMonad/src/Database.hs | 44 ++++++++++++++++- FeedMonad/src/Feed/Parser.hs | 88 ++++++++++++++++++++++++++++++++++ FeedMonad/src/FeedMonad.hs | 91 ++++++++++++++++++------------------ 6 files changed, 193 insertions(+), 48 deletions(-) create mode 100644 FeedMonad/src/Feed/Parser.hs diff --git a/FeedMonad-demo/app/Main.hs b/FeedMonad-demo/app/Main.hs index a97641a..2be415e 100644 --- a/FeedMonad-demo/app/Main.hs +++ b/FeedMonad-demo/app/Main.hs @@ -13,8 +13,19 @@ import Control.Lens myFeeds :: [ Category FeedId ] myFeeds = - [ Category "News" + [ Category "Code" [Leaf (FeedId (URL "https://github.com/feediron/feediron-recipes/commits/master.atom"))] + , Category "News" + [ Category "Yle" + [ Leaf (FeedId (URL "https://feeds.yle.fi/uutiset/v1/majorHeadlines/YLE_UUTISET.rss")) + , Leaf (FeedId (URL "https://feeds.yle.fi/uutiset/v1/recent.rss?publisherIds=YLE_UUTISET&concepts=18-147345")) + ] + ] + , Category "Programming" + [ Leaf (FeedId (URL "https://reddit.com/r/haskell.rss")) + , Leaf (FeedId (URL "https://discourse.dhall-lang.org/latest.rss")) + , Leaf (FeedId (URL "https://www.haskellforall.com/feeds/posts/default")) + ] ] myFilters :: Middleware diff --git a/FeedMonad/FeedMonad.cabal b/FeedMonad/FeedMonad.cabal index 5ce5462..54fc97b 100644 --- a/FeedMonad/FeedMonad.cabal +++ b/FeedMonad/FeedMonad.cabal @@ -34,6 +34,7 @@ library Control.Monad.App Control.Monad.HTTP Data.URL + Feed.Parser -- Modules included in this library but not exported. -- other-modules: @@ -55,6 +56,8 @@ library , xdg-basedir , free , feed + , xml-conduit + , time hs-source-dirs: src default-language: Haskell2010 ghc-options: -Wall diff --git a/FeedMonad/src/Data/Entry.hs b/FeedMonad/src/Data/Entry.hs index 8cdcfcd..badcb27 100644 --- a/FeedMonad/src/Data/Entry.hs +++ b/FeedMonad/src/Data/Entry.hs @@ -9,6 +9,7 @@ import Control.Lens import Data.SafeCopy import GHC.Generics (Generic) import Data.URL (URL ) +import Data.Time (UTCTime) newtype Tag = Tag Text @@ -22,6 +23,7 @@ data Entry = Entry , entryContent :: ByteString , entryScore :: Int , entryTags :: Set Tag + , entryPublished :: UTCTime } deriving (Eq, Ord, Show) diff --git a/FeedMonad/src/Database.hs b/FeedMonad/src/Database.hs index 5e67f3e..949e7ed 100644 --- a/FeedMonad/src/Database.hs +++ b/FeedMonad/src/Database.hs @@ -13,9 +13,12 @@ module Database , EntryId(..) -- * Update functions , SaveEntry(..) + , SaveEntries(..) -- * Query functions , GetEntries(..) , GetEntry(..) + , CountEntries(..) + , UnseenEntries(..) ) where @@ -28,6 +31,9 @@ import Control.Lens import Control.Applicative ((<|>)) import Control.Monad.Reader (asks) import Data.URL (URL) +import qualified Data.Map.Strict as M +import Data.Set (Set) +import qualified Data.Set as S newtype FeedId = FeedId URL deriving (Show, Eq, Ord, Generic) @@ -49,16 +55,52 @@ emptyFeedMonadState = FeedMonadState mempty feeds :: Lens' FeedMonadState (Map FeedId (Map EntryId Entry)) feeds = lens _feeds (\ fms f -> fms{_feeds=f}) + -- | Save the entry to the database saveEntry :: FeedId -> EntryId -> Entry -> Update FeedMonadState () saveEntry fid eid en = modifying (feeds . at fid . non mempty . at eid) (<|> Just en) +-- | Save a batch of entries to the database +-- +-- Saves the new entries, preferring the old entries over the new entries +saveEntries :: FeedId -> [Entry] -> Update FeedMonadState () +saveEntries fid entries = modifying (feeds . at fid) (\old -> Just (toEntryMap entries) <> old) + where + toEntryMap = foldMap (\e -> M.singleton (EntryId (entryURL e)) e) + getEntries :: FeedId -> Query FeedMonadState [Entry] getEntries fid = asks (toListOf (feeds . ix fid . traversed)) +-- | Count the number of entries in a feed +countEntries :: FeedId -> Query FeedMonadState Int +countEntries fid = asks (lengthOf (feeds . ix fid . traversed)) + getEntry :: FeedId -> EntryId -> Query FeedMonadState (Maybe Entry) getEntry fid eid = asks (preview (feeds . ix fid . ix eid)) +-- | Filter the unseen entries +-- +unseenEntries :: FeedId -> Set EntryId -> Query FeedMonadState (Set EntryId) +-- This seems to be an unfortunate effect of acid-state :/. If I try +-- to insert all the entries and let the 'Map' handle updates, the +-- acid-state event gets humongous. With my tests with only a few +-- feeds, each run uses about a megabyte of disk space. Multiply this +-- by 48 times a day, 365 times a year, you would get 17G a year. +-- +-- Even though the save isn't now done in a single atomic operation, +-- this should be fine, because at worst we would be inserting +-- duplicates and the 'saveEntries' function already handles that. +unseenEntries fid s = do + keys <- asks (foldMapOf (feeds . ix fid) M.keysSet) + pure $ s `S.difference` keys + deriveSafeCopy 0 'base ''FeedMonadState -makeAcidic ''FeedMonadState ['saveEntry, 'getEntries, 'getEntry] +makeAcidic ''FeedMonadState + [ 'saveEntry + , 'getEntries + , 'getEntry + , 'saveEntries + , 'countEntries + , 'unseenEntries + ] diff --git a/FeedMonad/src/Feed/Parser.hs b/FeedMonad/src/Feed/Parser.hs new file mode 100644 index 0000000..5943572 --- /dev/null +++ b/FeedMonad/src/Feed/Parser.hs @@ -0,0 +1,88 @@ +{-# LANGUAGE NamedFieldPuns #-} +{-# LANGUAGE OverloadedStrings #-} +module Feed.Parser where +import Control.Lens (from, lazy, re, view, _Just) +import Data.ByteString.Lazy (ByteString) +import Data.Entry (Entry(..), Tag(Tag)) +import Data.Foldable (toList) +import Data.Maybe (mapMaybe) +import qualified Data.Set as S +import Data.Text.Strict.Lens (unpacked, utf8) +import Data.URL (URL(URL), _URL) +import Debug.Trace (trace) +import qualified Text.Atom.Feed as Atom +import Text.Feed.Types (Feed(..)) +import qualified Text.RSS.Syntax as RSS +import Text.XML + ( Document(Document, documentEpilogue, documentPrologue, documentRoot) + , Element(..) + , Node(NodeElement) + , Prologue(Prologue, prologueAfter, prologueBefore, prologueDoctype) + , def + , fromXMLElement + , renderLBS + ) +import Data.Time (parseTimeM, rfc822DateFormat, defaultTimeLocale, iso8601DateFormat) + +parseAtom :: Atom.Feed -> [Entry] +parseAtom Atom.Feed{Atom.feedEntries=es} = mapMaybe parseEntry es + where + parseEntry :: Atom.Entry -> Maybe Entry + parseEntry atomEntry = Entry + <$> view (unpacked . from _URL . re _Just) entryId + <*> title + <*> content + <*> pure 0 + <*> pure mempty + <*> (parseTimeM True defaultTimeLocale (iso8601DateFormat (Just "%H:%M:%S%Q%EZ")) . view unpacked =<< entryPublished) + where + content = + case entryContent of + Just (Atom.TextContent txt) -> Just $ view (re utf8 . lazy) txt + Just (Atom.HTMLContent txt) -> Just $ view (re utf8 . lazy) txt + Just (Atom.XHTMLContent xml) -> Just $ renderElement (toList $ fromXMLElement xml) + _ -> Nothing + title = + case entryTitle of + Atom.TextString s -> Just s + _ -> Nothing + Atom.Entry{ Atom.entryId + , Atom.entryTitle + , Atom.entryContent + , Atom.entryPublished } = atomEntry + +renderElement :: [Element] -> ByteString +renderElement els = renderLBS def doc + where + el = Element {elementName = "span", elementAttributes = mempty, elementNodes = map NodeElement els} + doc = Document {documentPrologue = prologue, documentRoot = el, documentEpilogue = []} + prologue = Prologue {prologueBefore = [], prologueDoctype = Nothing, prologueAfter = []} + +parseRSS :: RSS.RSS -> [Entry] +parseRSS RSS.RSS{RSS.rssChannel=RSS.RSSChannel{RSS.rssItems = items}} = + mapMaybe parseItem items + where + parseItem :: RSS.RSSItem -> Maybe Entry + parseItem item = Entry + <$> (URL . view unpacked <$> rssItemLink) + <*> rssItemTitle + <*> (regularContent <> Just otherContent) + <*> pure 0 + <*> pure (foldMap (S.singleton . Tag . RSS.rssCategoryValue) rssItemCategories) + <*> (parseTimeM True defaultTimeLocale rfc822DateFormat . view unpacked =<< rssItemPubDate) + where + regularContent = view (re utf8 . lazy) <$> rssItemContent + otherContent = renderElement (concatMap (toList . fromXMLElement) rssItemOther) + RSS.RSSItem { RSS.rssItemTitle + , RSS.rssItemLink + , RSS.rssItemCategories + , RSS.rssItemContent + , RSS.rssItemOther + , RSS.rssItemPubDate + } = item + +parseEntries :: Feed -> [Entry] +parseEntries (AtomFeed atom) = parseAtom atom +parseEntries (RSSFeed rss) = parseRSS rss +parseEntries (RSS1Feed _rss1) = trace "rss1" [] +parseEntries (XMLFeed _xml) = trace "xml" [] diff --git a/FeedMonad/src/FeedMonad.hs b/FeedMonad/src/FeedMonad.hs index 0378a79..67febd5 100644 --- a/FeedMonad/src/FeedMonad.hs +++ b/FeedMonad/src/FeedMonad.hs @@ -1,27 +1,32 @@ -{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE DeriveFunctor #-} +{-# LANGUAGE NamedFieldPuns #-} +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE TupleSections #-} module FeedMonad where -import Data.Text (Text) -import Middleware (Middleware) -import Numeric.Natural (Natural) -import Data.Category (Category) -import Data.Acid (openLocalState, AcidState (closeAcidState)) import Control.Exception (bracket) -import Database (emptyFeedMonadState, FeedId (FeedId)) -import Network.HTTP.Client.TLS (newTlsManager) -import Control.Monad.App (runApp) -import Data.Environment +import Control.Monad.App (App, runApp) +import Control.Monad.HTTP (execute, fetch) +import Control.Monad.Reader (asks) import Control.Monad.Trans (liftIO) -import Data.Foldable (for_, traverse_) -import Control.Monad.HTTP (fetch, execute) +import Data.Acid (AcidState(closeAcidState), openLocalState, query, update) +import Data.Category (Category) +import Data.Environment +import Data.Foldable (for_, toList) +import Data.Text (Text) +import Database + ( CountEntries(CountEntries) + , FeedId(FeedId) + , SaveEntries(SaveEntries) + , emptyFeedMonadState, UnseenEntries (UnseenEntries), EntryId (EntryId) + ) +import Feed.Parser (parseEntries) +import Middleware (Middleware) +import Network.HTTP.Client.TLS (newTlsManager) +import Numeric.Natural (Natural) import Text.Feed.Import (parseFeedSource) -import Text.Feed.Types -import qualified Text.Atom.Feed as Atom -import Data.Entry -import Control.Lens -import Data.Text.Strict.Lens (utf8, unpacked) -import Data.URL (_URL) +import Data.Entry (Entry(entryURL)) +import qualified Data.Map.Strict as M newtype Minutes = Minutes Natural @@ -47,37 +52,31 @@ defaultConfig = FeedMonad , secretToken = "i am a secret" } -parseAtom :: Atom.Feed -> [Entry] -parseAtom Atom.Feed{Atom.feedEntries=es} = map parseEntry es + +updateFeeds :: FeedMonad -> App () +updateFeeds f = do + mgr <- asks environmentManager + st <- asks environmentAcidState + for_ (feeds f) $ \c -> for_ c $ \fid -> liftIO $ do + let FeedId u = fid + entries <- maybe [] parseEntries . parseFeedSource <$> liftIO (execute mgr (fetch u)) + finalEntries <- foldMap (\e -> M.singleton (EntryId $ entryURL e) e) <$> traverse (execute mgr . filters f pure) entries + newEntries <- query st (UnseenEntries fid (M.keysSet finalEntries)) + update st (SaveEntries fid (foldMap (\eid -> toList $ M.lookup eid finalEntries) newEntries)) + +queryCategory :: FeedMonad -> App [Category (FeedId, Int)] +queryCategory = traverse (traverse q) . feeds where - parseEntry :: Atom.Entry -> Entry - parseEntry atomEntry = Entry - { entryURL = view (unpacked . from _URL) $ Atom.entryId atomEntry - , entryTitle = - case Atom.entryTitle atomEntry of - Atom.TextString txt -> txt - _ -> "Title supported" - , entryContent = - case Atom.entryContent atomEntry of - Just (Atom.TextContent txt) -> view (re utf8 . lazy) txt - Just (Atom.HTMLContent html) -> view (re utf8 . lazy) html - Just _ -> "Content not supported" - Nothing -> "" - , entryScore = 0 - , entryTags = mempty - } + q :: FeedId -> App (FeedId, Int) + q fid = do + st <- asks environmentAcidState + (fid, ) <$> liftIO (query st (CountEntries fid)) defaultMain :: FeedMonad -> IO () defaultMain f = bracket (openLocalState emptyFeedMonadState) closeAcidState $ \st -> do mgr <- newTlsManager - runApp (Environment mgr st) $ - for_ (feeds f) $ \c -> for_ c $ \fid -> do - let FeedId u = fid - feed <- parseFeedSource <$> liftIO (execute mgr (fetch u)) - case feed of - Nothing -> pure () - Just (AtomFeed atom) -> liftIO (traverse (execute mgr . filters f pure) (parseAtom atom) >>= traverse_ print) - Just (RSSFeed _rssFeed) -> liftIO $ putStrLn "rssFeed" - Just (RSS1Feed _rss1Feed) -> liftIO $ putStrLn "rss1Feed" - Just (XMLFeed _xmlFeed) -> liftIO $ putStrLn "xmlFeed" + runApp (Environment mgr st) $ do + updateFeeds f + cat <- queryCategory f + liftIO $ print cat