Parse RSS and have published timestamps

Mats Rauhala 2021-11-12 17:25:47 +02:00
parent e1b4155a25
commit 4adc75c33c
6 changed files with 193 additions and 48 deletions
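The heart of the change is timestamp parsing: RSS <pubDate> values are parsed with the RFC 822 format and Atom <published> values with an ISO 8601 format, both via parseTimeM from the time package (see the new Feed.Parser module below). A minimal standalone sketch of the two calls, with made-up sample date strings:

import Data.Time
  (UTCTime, defaultTimeLocale, iso8601DateFormat, parseTimeM, rfc822DateFormat)

-- RSS <pubDate> style, as parsed in parseRSS below.
rssDate :: Maybe UTCTime
rssDate =
  parseTimeM True defaultTimeLocale rfc822DateFormat
    "Fri, 12 Nov 2021 15:25:47 GMT"

-- Atom <published> style, as parsed in parseAtom below.
atomDate :: Maybe UTCTime
atomDate =
  parseTimeM True defaultTimeLocale (iso8601DateFormat (Just "%H:%M:%S%Q%EZ"))
    "2021-11-12T17:25:47+02:00"

Both calls return Nothing rather than failing when a feed carries a malformed date, which is why the parser below silently drops entries without a parsable published timestamp.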

View File

@@ -13,8 +13,19 @@ import Control.Lens
 myFeeds :: [Category FeedId]
 myFeeds =
-  [ Category "News"
+  [ Category "Code"
     [Leaf (FeedId (URL "https://github.com/feediron/feediron-recipes/commits/master.atom"))]
+  , Category "News"
+    [ Category "Yle"
+      [ Leaf (FeedId (URL "https://feeds.yle.fi/uutiset/v1/majorHeadlines/YLE_UUTISET.rss"))
+      , Leaf (FeedId (URL "https://feeds.yle.fi/uutiset/v1/recent.rss?publisherIds=YLE_UUTISET&concepts=18-147345"))
+      ]
+    ]
+  , Category "Programming"
+    [ Leaf (FeedId (URL "https://reddit.com/r/haskell.rss"))
+    , Leaf (FeedId (URL "https://discourse.dhall-lang.org/latest.rss"))
+    , Leaf (FeedId (URL "https://www.haskellforall.com/feeds/posts/default"))
+    ]
   ]

 myFilters :: Middleware
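For orientation, the Category type used above comes from Data.Category, which this commit does not touch; judging from the nesting here and the for_/traverse usage over it in FeedMonad.hs further down, its shape is presumably something like the following sketch (the actual definition may differ):

{-# LANGUAGE DeriveFoldable #-}
{-# LANGUAGE DeriveFunctor #-}
{-# LANGUAGE DeriveTraversable #-}

import Data.Text (Text)

-- Assumed shape only: a labelled tree whose leaves carry the payload,
-- here one FeedId per subscribed feed.
data Category a
  = Category Text [Category a]
  | Leaf a
  deriving (Show, Functor, Foldable, Traversable)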

View File

@@ -34,6 +34,7 @@ library
     Control.Monad.App
     Control.Monad.HTTP
     Data.URL
+    Feed.Parser

   -- Modules included in this library but not exported.
   -- other-modules:
@@ -55,6 +56,8 @@ library
     , xdg-basedir
     , free
     , feed
+    , xml-conduit
+    , time
   hs-source-dirs: src
   default-language: Haskell2010
   ghc-options: -Wall

View File

@@ -9,6 +9,7 @@ import Control.Lens
 import Data.SafeCopy
 import GHC.Generics (Generic)
 import Data.URL (URL)
+import Data.Time (UTCTime)

 newtype Tag = Tag Text
@@ -22,6 +23,7 @@ data Entry = Entry
   , entryContent :: ByteString
   , entryScore :: Int
   , entryTags :: Set Tag
+  , entryPublished :: UTCTime
   }
   deriving (Eq, Ord, Show)

View File

@@ -13,9 +13,12 @@ module Database
   , EntryId(..)
   -- * Update functions
   , SaveEntry(..)
+  , SaveEntries(..)
   -- * Query functions
   , GetEntries(..)
   , GetEntry(..)
+  , CountEntries(..)
+  , UnseenEntries(..)
   )
 where
@@ -28,6 +31,9 @@ import Control.Lens
 import Control.Applicative ((<|>))
 import Control.Monad.Reader (asks)
 import Data.URL (URL)
+import qualified Data.Map.Strict as M
+import Data.Set (Set)
+import qualified Data.Set as S

 newtype FeedId = FeedId URL
   deriving (Show, Eq, Ord, Generic)
@@ -49,16 +55,52 @@ emptyFeedMonadState = FeedMonadState mempty
 feeds :: Lens' FeedMonadState (Map FeedId (Map EntryId Entry))
 feeds = lens _feeds (\fms f -> fms{_feeds=f})

 -- | Save the entry to the database
 saveEntry :: FeedId -> EntryId -> Entry -> Update FeedMonadState ()
 saveEntry fid eid en = modifying (feeds . at fid . non mempty . at eid) (<|> Just en)

+-- | Save a batch of entries to the database
+--
+-- Saves the new entries, preferring the old entries over the new ones
+saveEntries :: FeedId -> [Entry] -> Update FeedMonadState ()
+saveEntries fid entries = modifying (feeds . at fid) (\old -> Just (toEntryMap entries) <> old)
+  where
+    toEntryMap = foldMap (\e -> M.singleton (EntryId (entryURL e)) e)
+
 getEntries :: FeedId -> Query FeedMonadState [Entry]
 getEntries fid = asks (toListOf (feeds . ix fid . traversed))

+-- | Count the number of entries in a feed
+countEntries :: FeedId -> Query FeedMonadState Int
+countEntries fid = asks (lengthOf (feeds . ix fid . traversed))
+
 getEntry :: FeedId -> EntryId -> Query FeedMonadState (Maybe Entry)
 getEntry fid eid = asks (preview (feeds . ix fid . ix eid))

+-- | Filter a set of entry ids down to the ones the store has not seen yet
+--
+-- This seems to be an unfortunate effect of acid-state :/. If I try
+-- to insert all the entries and let the 'Map' handle updates, the
+-- acid-state event gets humongous. In my tests with only a few feeds,
+-- each run uses about a megabyte of disk space. Multiply that by 48
+-- runs a day, 365 days a year, and you get roughly 17G a year.
+--
+-- Even though the save is no longer a single atomic operation, this
+-- should be fine, because at worst we would be inserting duplicates,
+-- and the 'saveEntries' function already handles that.
+unseenEntries :: FeedId -> Set EntryId -> Query FeedMonadState (Set EntryId)
+unseenEntries fid s = do
+  keys <- asks (foldMapOf (feeds . ix fid) M.keysSet)
+  pure $ s `S.difference` keys
+
 deriveSafeCopy 0 'base ''FeedMonadState

-makeAcidic ''FeedMonadState ['saveEntry, 'getEntries, 'getEntry]
+makeAcidic ''FeedMonadState
+  [ 'saveEntry
+  , 'getEntries
+  , 'getEntry
+  , 'saveEntries
+  , 'countEntries
+  , 'unseenEntries
+  ]
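The intended call pattern for the new events is the one updateFeeds uses further down: index the freshly parsed entries by id, ask the store which ids it has not seen, and persist only those. As a standalone sketch (the helper name saveNew is made up; st is the AcidState handle opened in defaultMain, and the type signature is left off since FeedMonadState itself is not imported by callers):

import Data.Acid (query, update)
import Data.Maybe (mapMaybe)
import qualified Data.Map.Strict as M
import qualified Data.Set as S

import Data.Entry (Entry(entryURL))
import Database (EntryId(..), SaveEntries(..), UnseenEntries(..))

-- Keep the acid-state event small by only writing entries whose ids
-- the store reports as unseen.
saveNew st fid parsed = do
  let byId = foldMap (\e -> M.singleton (EntryId (entryURL e)) e) parsed
  unseen <- query st (UnseenEntries fid (M.keysSet byId))
  update st (SaveEntries fid (mapMaybe (`M.lookup` byId) (S.toList unseen)))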

View File

@@ -0,0 +1,88 @@
+{-# LANGUAGE NamedFieldPuns #-}
+{-# LANGUAGE OverloadedStrings #-}
+module Feed.Parser where
+
+import Control.Lens (from, lazy, re, view, _Just)
+import Data.ByteString.Lazy (ByteString)
+import Data.Entry (Entry(..), Tag(Tag))
+import Data.Foldable (toList)
+import Data.Maybe (mapMaybe)
+import qualified Data.Set as S
+import Data.Text.Strict.Lens (unpacked, utf8)
+import Data.URL (URL(URL), _URL)
+import Debug.Trace (trace)
+import qualified Text.Atom.Feed as Atom
+import Text.Feed.Types (Feed(..))
+import qualified Text.RSS.Syntax as RSS
+import Text.XML
+  ( Document(Document, documentEpilogue, documentPrologue, documentRoot)
+  , Element(..)
+  , Node(NodeElement)
+  , Prologue(Prologue, prologueAfter, prologueBefore, prologueDoctype)
+  , def
+  , fromXMLElement
+  , renderLBS
+  )
+import Data.Time (parseTimeM, rfc822DateFormat, defaultTimeLocale, iso8601DateFormat)
+
+parseAtom :: Atom.Feed -> [Entry]
+parseAtom Atom.Feed{Atom.feedEntries=es} = mapMaybe parseEntry es
+  where
+    parseEntry :: Atom.Entry -> Maybe Entry
+    parseEntry atomEntry = Entry
+      <$> view (unpacked . from _URL . re _Just) entryId
+      <*> title
+      <*> content
+      <*> pure 0
+      <*> pure mempty
+      <*> (parseTimeM True defaultTimeLocale (iso8601DateFormat (Just "%H:%M:%S%Q%EZ")) . view unpacked =<< entryPublished)
+      where
+        content =
+          case entryContent of
+            Just (Atom.TextContent txt) -> Just $ view (re utf8 . lazy) txt
+            Just (Atom.HTMLContent txt) -> Just $ view (re utf8 . lazy) txt
+            Just (Atom.XHTMLContent xml) -> Just $ renderElement (toList $ fromXMLElement xml)
+            _ -> Nothing
+        title =
+          case entryTitle of
+            Atom.TextString s -> Just s
+            _ -> Nothing
+        Atom.Entry{ Atom.entryId
+                  , Atom.entryTitle
+                  , Atom.entryContent
+                  , Atom.entryPublished } = atomEntry
+
+renderElement :: [Element] -> ByteString
+renderElement els = renderLBS def doc
+  where
+    el = Element {elementName = "span", elementAttributes = mempty, elementNodes = map NodeElement els}
+    doc = Document {documentPrologue = prologue, documentRoot = el, documentEpilogue = []}
+    prologue = Prologue {prologueBefore = [], prologueDoctype = Nothing, prologueAfter = []}
+
+parseRSS :: RSS.RSS -> [Entry]
+parseRSS RSS.RSS{RSS.rssChannel=RSS.RSSChannel{RSS.rssItems = items}} =
+  mapMaybe parseItem items
+  where
+    parseItem :: RSS.RSSItem -> Maybe Entry
+    parseItem item = Entry
+      <$> (URL . view unpacked <$> rssItemLink)
+      <*> rssItemTitle
+      <*> (regularContent <> Just otherContent)
+      <*> pure 0
+      <*> pure (foldMap (S.singleton . Tag . RSS.rssCategoryValue) rssItemCategories)
+      <*> (parseTimeM True defaultTimeLocale rfc822DateFormat . view unpacked =<< rssItemPubDate)
+      where
+        regularContent = view (re utf8 . lazy) <$> rssItemContent
+        otherContent = renderElement (concatMap (toList . fromXMLElement) rssItemOther)
+        RSS.RSSItem { RSS.rssItemTitle
+                    , RSS.rssItemLink
+                    , RSS.rssItemCategories
+                    , RSS.rssItemContent
+                    , RSS.rssItemOther
+                    , RSS.rssItemPubDate
+                    } = item
+
+parseEntries :: Feed -> [Entry]
+parseEntries (AtomFeed atom) = parseAtom atom
+parseEntries (RSSFeed rss) = parseRSS rss
+parseEntries (RSS1Feed _rss1) = trace "rss1" []
+parseEntries (XMLFeed _xml) = trace "xml" []
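The parser sits behind Text.Feed.Import.parseFeedSource, as FeedMonad.hs does below; a minimal sketch of the pure path from a fetched feed body to entries, assuming the body is a lazy ByteString as elsewhere in this codebase (the helper name is made up):

import Data.ByteString.Lazy (ByteString)

import Data.Entry (Entry)
import Feed.Parser (parseEntries)
import Text.Feed.Import (parseFeedSource)

-- Bytes in, entries out; a body that does not parse as an Atom or RSS
-- feed (or parses as RSS1/raw XML, which are still unhandled) yields [].
entriesFromBytes :: ByteString -> [Entry]
entriesFromBytes = maybe [] parseEntries . parseFeedSource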

View File

@@ -1,27 +1,32 @@
-{-# LANGUAGE OverloadedStrings #-}
 {-# LANGUAGE DeriveFunctor #-}
+{-# LANGUAGE NamedFieldPuns #-}
+{-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE TupleSections #-}
 module FeedMonad where

-import Data.Text (Text)
-import Middleware (Middleware)
-import Numeric.Natural (Natural)
-import Data.Category (Category)
-import Data.Acid (openLocalState, AcidState (closeAcidState))
 import Control.Exception (bracket)
-import Database (emptyFeedMonadState, FeedId (FeedId))
-import Network.HTTP.Client.TLS (newTlsManager)
-import Control.Monad.App (runApp)
-import Data.Environment
+import Control.Monad.App (App, runApp)
+import Control.Monad.HTTP (execute, fetch)
+import Control.Monad.Reader (asks)
 import Control.Monad.Trans (liftIO)
-import Data.Foldable (for_, traverse_)
-import Control.Monad.HTTP (fetch, execute)
+import Data.Acid (AcidState(closeAcidState), openLocalState, query, update)
+import Data.Category (Category)
+import Data.Environment
+import Data.Foldable (for_, toList)
+import Data.Text (Text)
+import Database
+  ( CountEntries(CountEntries)
+  , FeedId(FeedId)
+  , SaveEntries(SaveEntries)
+  , emptyFeedMonadState, UnseenEntries (UnseenEntries), EntryId (EntryId)
+  )
+import Feed.Parser (parseEntries)
+import Middleware (Middleware)
+import Network.HTTP.Client.TLS (newTlsManager)
+import Numeric.Natural (Natural)
 import Text.Feed.Import (parseFeedSource)
-import Text.Feed.Types
-import qualified Text.Atom.Feed as Atom
-import Data.Entry
-import Control.Lens
-import Data.Text.Strict.Lens (utf8, unpacked)
-import Data.URL (_URL)
+import Data.Entry (Entry(entryURL))
+import qualified Data.Map.Strict as M

 newtype Minutes = Minutes Natural
@@ -47,37 +52,31 @@ defaultConfig = FeedMonad
   , secretToken = "i am a secret"
   }

-parseAtom :: Atom.Feed -> [Entry]
-parseAtom Atom.Feed{Atom.feedEntries=es} = map parseEntry es
-  where
-    parseEntry :: Atom.Entry -> Entry
-    parseEntry atomEntry = Entry
-      { entryURL = view (unpacked . from _URL) $ Atom.entryId atomEntry
-      , entryTitle =
-          case Atom.entryTitle atomEntry of
-            Atom.TextString txt -> txt
-            _ -> "Title supported"
-      , entryContent =
-          case Atom.entryContent atomEntry of
-            Just (Atom.TextContent txt) -> view (re utf8 . lazy) txt
-            Just (Atom.HTMLContent html) -> view (re utf8 . lazy) html
-            Just _ -> "Content not supported"
-            Nothing -> ""
-      , entryScore = 0
-      , entryTags = mempty
-      }
+updateFeeds :: FeedMonad -> App ()
+updateFeeds f = do
+  mgr <- asks environmentManager
+  st <- asks environmentAcidState
+  for_ (feeds f) $ \c -> for_ c $ \fid -> liftIO $ do
+    let FeedId u = fid
+    entries <- maybe [] parseEntries . parseFeedSource <$> liftIO (execute mgr (fetch u))
+    finalEntries <- foldMap (\e -> M.singleton (EntryId $ entryURL e) e) <$> traverse (execute mgr . filters f pure) entries
+    newEntries <- query st (UnseenEntries fid (M.keysSet finalEntries))
+    update st (SaveEntries fid (foldMap (\eid -> toList $ M.lookup eid finalEntries) newEntries))
+
+queryCategory :: FeedMonad -> App [Category (FeedId, Int)]
+queryCategory = traverse (traverse q) . feeds
+  where
+    q :: FeedId -> App (FeedId, Int)
+    q fid = do
+      st <- asks environmentAcidState
+      (fid, ) <$> liftIO (query st (CountEntries fid))

 defaultMain :: FeedMonad -> IO ()
 defaultMain f =
   bracket (openLocalState emptyFeedMonadState) closeAcidState $ \st -> do
     mgr <- newTlsManager
-    runApp (Environment mgr st) $
-      for_ (feeds f) $ \c -> for_ c $ \fid -> do
-        let FeedId u = fid
-        feed <- parseFeedSource <$> liftIO (execute mgr (fetch u))
-        case feed of
-          Nothing -> pure ()
-          Just (AtomFeed atom) -> liftIO (traverse (execute mgr . filters f pure) (parseAtom atom) >>= traverse_ print)
-          Just (RSSFeed _rssFeed) -> liftIO $ putStrLn "rssFeed"
-          Just (RSS1Feed _rss1Feed) -> liftIO $ putStrLn "rss1Feed"
-          Just (XMLFeed _xmlFeed) -> liftIO $ putStrLn "xmlFeed"
+    runApp (Environment mgr st) $ do
+      updateFeeds f
+      cat <- queryCategory f
+      liftIO $ print cat
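Finally, a sketch of how the user-facing configuration from the first file of this commit would plug in, assuming myFeeds and myFilters are in scope from that file and that feeds and filters are record fields of FeedMonad, as their use above suggests:

module Main where

import FeedMonad (FeedMonad(..), defaultConfig, defaultMain)

-- myFeeds :: [Category FeedId] and myFilters :: Middleware are assumed
-- to be the definitions from the first file of this commit.
main :: IO ()
main = defaultMain defaultConfig { feeds = myFeeds, filters = myFilters }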