Parse RSS and have published timestamps

This commit is contained in:
Mats Rauhala 2021-11-12 17:25:47 +02:00
parent e1b4155a25
commit 4adc75c33c
6 changed files with 193 additions and 48 deletions

View File

@ -13,8 +13,19 @@ import Control.Lens
myFeeds :: [ Category FeedId ]
myFeeds =
[ Category "News"
[ Category "Code"
[Leaf (FeedId (URL "https://github.com/feediron/feediron-recipes/commits/master.atom"))]
, Category "News"
[ Category "Yle"
[ Leaf (FeedId (URL "https://feeds.yle.fi/uutiset/v1/majorHeadlines/YLE_UUTISET.rss"))
, Leaf (FeedId (URL "https://feeds.yle.fi/uutiset/v1/recent.rss?publisherIds=YLE_UUTISET&concepts=18-147345"))
]
]
, Category "Programming"
[ Leaf (FeedId (URL "https://reddit.com/r/haskell.rss"))
, Leaf (FeedId (URL "https://discourse.dhall-lang.org/latest.rss"))
, Leaf (FeedId (URL "https://www.haskellforall.com/feeds/posts/default"))
]
]
myFilters :: Middleware

View File

@ -34,6 +34,7 @@ library
Control.Monad.App
Control.Monad.HTTP
Data.URL
Feed.Parser
-- Modules included in this library but not exported.
-- other-modules:
@ -55,6 +56,8 @@ library
, xdg-basedir
, free
, feed
, xml-conduit
, time
hs-source-dirs: src
default-language: Haskell2010
ghc-options: -Wall

View File

@ -9,6 +9,7 @@ import Control.Lens
import Data.SafeCopy
import GHC.Generics (Generic)
import Data.URL (URL )
import Data.Time (UTCTime)
newtype Tag = Tag Text
@ -22,6 +23,7 @@ data Entry = Entry
, entryContent :: ByteString
, entryScore :: Int
, entryTags :: Set Tag
, entryPublished :: UTCTime
}
deriving (Eq, Ord, Show)

View File

@ -13,9 +13,12 @@ module Database
, EntryId(..)
-- * Update functions
, SaveEntry(..)
, SaveEntries(..)
-- * Query functions
, GetEntries(..)
, GetEntry(..)
, CountEntries(..)
, UnseenEntries(..)
)
where
@ -28,6 +31,9 @@ import Control.Lens
import Control.Applicative ((<|>))
import Control.Monad.Reader (asks)
import Data.URL (URL)
import qualified Data.Map.Strict as M
import Data.Set (Set)
import qualified Data.Set as S
newtype FeedId = FeedId URL
deriving (Show, Eq, Ord, Generic)
@ -49,16 +55,52 @@ emptyFeedMonadState = FeedMonadState mempty
feeds :: Lens' FeedMonadState (Map FeedId (Map EntryId Entry))
feeds = lens _feeds (\ fms f -> fms{_feeds=f})
-- | Save the entry to the database
saveEntry :: FeedId -> EntryId -> Entry -> Update FeedMonadState ()
saveEntry fid eid en = modifying (feeds . at fid . non mempty . at eid) (<|> Just en)
-- | Save a batch of entries to the database
--
-- Saves the new entries, preferring the old entries over the new entries
saveEntries :: FeedId -> [Entry] -> Update FeedMonadState ()
saveEntries fid entries = modifying (feeds . at fid) (\old -> Just (toEntryMap entries) <> old)
where
toEntryMap = foldMap (\e -> M.singleton (EntryId (entryURL e)) e)
getEntries :: FeedId -> Query FeedMonadState [Entry]
getEntries fid = asks (toListOf (feeds . ix fid . traversed))
-- | Count the number of entries in a feed
countEntries :: FeedId -> Query FeedMonadState Int
countEntries fid = asks (lengthOf (feeds . ix fid . traversed))
getEntry :: FeedId -> EntryId -> Query FeedMonadState (Maybe Entry)
getEntry fid eid = asks (preview (feeds . ix fid . ix eid))
-- | Filter the unseen entries
--
unseenEntries :: FeedId -> Set EntryId -> Query FeedMonadState (Set EntryId)
-- This seems to be an unfortunate effect of acid-state :/. If I try
-- to insert all the entries and let the 'Map' handle updates, the
-- acid-state event gets humongous. With my tests with only a few
-- feeds, each run uses about a megabyte of disk space. Multiply this
-- by 48 times a day, 365 times a year, you would get 17G a year.
--
-- Even though the save isn't now done in a single atomic operation,
-- this should be fine, because at worst we would be inserting
-- duplicates and the 'saveEntries' function already handles that.
unseenEntries fid s = do
keys <- asks (foldMapOf (feeds . ix fid) M.keysSet)
pure $ s `S.difference` keys
deriveSafeCopy 0 'base ''FeedMonadState
makeAcidic ''FeedMonadState ['saveEntry, 'getEntries, 'getEntry]
makeAcidic ''FeedMonadState
[ 'saveEntry
, 'getEntries
, 'getEntry
, 'saveEntries
, 'countEntries
, 'unseenEntries
]

View File

@ -0,0 +1,88 @@
{-# LANGUAGE NamedFieldPuns #-}
{-# LANGUAGE OverloadedStrings #-}
module Feed.Parser where
import Control.Lens (from, lazy, re, view, _Just)
import Data.ByteString.Lazy (ByteString)
import Data.Entry (Entry(..), Tag(Tag))
import Data.Foldable (toList)
import Data.Maybe (mapMaybe)
import qualified Data.Set as S
import Data.Text.Strict.Lens (unpacked, utf8)
import Data.URL (URL(URL), _URL)
import Debug.Trace (trace)
import qualified Text.Atom.Feed as Atom
import Text.Feed.Types (Feed(..))
import qualified Text.RSS.Syntax as RSS
import Text.XML
( Document(Document, documentEpilogue, documentPrologue, documentRoot)
, Element(..)
, Node(NodeElement)
, Prologue(Prologue, prologueAfter, prologueBefore, prologueDoctype)
, def
, fromXMLElement
, renderLBS
)
import Data.Time (parseTimeM, rfc822DateFormat, defaultTimeLocale, iso8601DateFormat)
parseAtom :: Atom.Feed -> [Entry]
parseAtom Atom.Feed{Atom.feedEntries=es} = mapMaybe parseEntry es
where
parseEntry :: Atom.Entry -> Maybe Entry
parseEntry atomEntry = Entry
<$> view (unpacked . from _URL . re _Just) entryId
<*> title
<*> content
<*> pure 0
<*> pure mempty
<*> (parseTimeM True defaultTimeLocale (iso8601DateFormat (Just "%H:%M:%S%Q%EZ")) . view unpacked =<< entryPublished)
where
content =
case entryContent of
Just (Atom.TextContent txt) -> Just $ view (re utf8 . lazy) txt
Just (Atom.HTMLContent txt) -> Just $ view (re utf8 . lazy) txt
Just (Atom.XHTMLContent xml) -> Just $ renderElement (toList $ fromXMLElement xml)
_ -> Nothing
title =
case entryTitle of
Atom.TextString s -> Just s
_ -> Nothing
Atom.Entry{ Atom.entryId
, Atom.entryTitle
, Atom.entryContent
, Atom.entryPublished } = atomEntry
renderElement :: [Element] -> ByteString
renderElement els = renderLBS def doc
where
el = Element {elementName = "span", elementAttributes = mempty, elementNodes = map NodeElement els}
doc = Document {documentPrologue = prologue, documentRoot = el, documentEpilogue = []}
prologue = Prologue {prologueBefore = [], prologueDoctype = Nothing, prologueAfter = []}
parseRSS :: RSS.RSS -> [Entry]
parseRSS RSS.RSS{RSS.rssChannel=RSS.RSSChannel{RSS.rssItems = items}} =
mapMaybe parseItem items
where
parseItem :: RSS.RSSItem -> Maybe Entry
parseItem item = Entry
<$> (URL . view unpacked <$> rssItemLink)
<*> rssItemTitle
<*> (regularContent <> Just otherContent)
<*> pure 0
<*> pure (foldMap (S.singleton . Tag . RSS.rssCategoryValue) rssItemCategories)
<*> (parseTimeM True defaultTimeLocale rfc822DateFormat . view unpacked =<< rssItemPubDate)
where
regularContent = view (re utf8 . lazy) <$> rssItemContent
otherContent = renderElement (concatMap (toList . fromXMLElement) rssItemOther)
RSS.RSSItem { RSS.rssItemTitle
, RSS.rssItemLink
, RSS.rssItemCategories
, RSS.rssItemContent
, RSS.rssItemOther
, RSS.rssItemPubDate
} = item
parseEntries :: Feed -> [Entry]
parseEntries (AtomFeed atom) = parseAtom atom
parseEntries (RSSFeed rss) = parseRSS rss
parseEntries (RSS1Feed _rss1) = trace "rss1" []
parseEntries (XMLFeed _xml) = trace "xml" []

View File

@ -1,27 +1,32 @@
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE DeriveFunctor #-}
{-# LANGUAGE NamedFieldPuns #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE TupleSections #-}
module FeedMonad where
import Data.Text (Text)
import Middleware (Middleware)
import Numeric.Natural (Natural)
import Data.Category (Category)
import Data.Acid (openLocalState, AcidState (closeAcidState))
import Control.Exception (bracket)
import Database (emptyFeedMonadState, FeedId (FeedId))
import Network.HTTP.Client.TLS (newTlsManager)
import Control.Monad.App (runApp)
import Data.Environment
import Control.Monad.App (App, runApp)
import Control.Monad.HTTP (execute, fetch)
import Control.Monad.Reader (asks)
import Control.Monad.Trans (liftIO)
import Data.Foldable (for_, traverse_)
import Control.Monad.HTTP (fetch, execute)
import Data.Acid (AcidState(closeAcidState), openLocalState, query, update)
import Data.Category (Category)
import Data.Environment
import Data.Foldable (for_, toList)
import Data.Text (Text)
import Database
( CountEntries(CountEntries)
, FeedId(FeedId)
, SaveEntries(SaveEntries)
, emptyFeedMonadState, UnseenEntries (UnseenEntries), EntryId (EntryId)
)
import Feed.Parser (parseEntries)
import Middleware (Middleware)
import Network.HTTP.Client.TLS (newTlsManager)
import Numeric.Natural (Natural)
import Text.Feed.Import (parseFeedSource)
import Text.Feed.Types
import qualified Text.Atom.Feed as Atom
import Data.Entry
import Control.Lens
import Data.Text.Strict.Lens (utf8, unpacked)
import Data.URL (_URL)
import Data.Entry (Entry(entryURL))
import qualified Data.Map.Strict as M
newtype Minutes = Minutes Natural
@ -47,37 +52,31 @@ defaultConfig = FeedMonad
, secretToken = "i am a secret"
}
parseAtom :: Atom.Feed -> [Entry]
parseAtom Atom.Feed{Atom.feedEntries=es} = map parseEntry es
updateFeeds :: FeedMonad -> App ()
updateFeeds f = do
mgr <- asks environmentManager
st <- asks environmentAcidState
for_ (feeds f) $ \c -> for_ c $ \fid -> liftIO $ do
let FeedId u = fid
entries <- maybe [] parseEntries . parseFeedSource <$> liftIO (execute mgr (fetch u))
finalEntries <- foldMap (\e -> M.singleton (EntryId $ entryURL e) e) <$> traverse (execute mgr . filters f pure) entries
newEntries <- query st (UnseenEntries fid (M.keysSet finalEntries))
update st (SaveEntries fid (foldMap (\eid -> toList $ M.lookup eid finalEntries) newEntries))
queryCategory :: FeedMonad -> App [Category (FeedId, Int)]
queryCategory = traverse (traverse q) . feeds
where
parseEntry :: Atom.Entry -> Entry
parseEntry atomEntry = Entry
{ entryURL = view (unpacked . from _URL) $ Atom.entryId atomEntry
, entryTitle =
case Atom.entryTitle atomEntry of
Atom.TextString txt -> txt
_ -> "Title supported"
, entryContent =
case Atom.entryContent atomEntry of
Just (Atom.TextContent txt) -> view (re utf8 . lazy) txt
Just (Atom.HTMLContent html) -> view (re utf8 . lazy) html
Just _ -> "Content not supported"
Nothing -> ""
, entryScore = 0
, entryTags = mempty
}
q :: FeedId -> App (FeedId, Int)
q fid = do
st <- asks environmentAcidState
(fid, ) <$> liftIO (query st (CountEntries fid))
defaultMain :: FeedMonad -> IO ()
defaultMain f =
bracket (openLocalState emptyFeedMonadState) closeAcidState $ \st -> do
mgr <- newTlsManager
runApp (Environment mgr st) $
for_ (feeds f) $ \c -> for_ c $ \fid -> do
let FeedId u = fid
feed <- parseFeedSource <$> liftIO (execute mgr (fetch u))
case feed of
Nothing -> pure ()
Just (AtomFeed atom) -> liftIO (traverse (execute mgr . filters f pure) (parseAtom atom) >>= traverse_ print)
Just (RSSFeed _rssFeed) -> liftIO $ putStrLn "rssFeed"
Just (RSS1Feed _rss1Feed) -> liftIO $ putStrLn "rss1Feed"
Just (XMLFeed _xmlFeed) -> liftIO $ putStrLn "xmlFeed"
runApp (Environment mgr st) $ do
updateFeeds f
cat <- queryCategory f
liftIO $ print cat