Parse RSS and have published timestamps
This commit is contained in:
parent
e1b4155a25
commit
4adc75c33c
@ -13,8 +13,19 @@ import Control.Lens
|
||||
|
||||
myFeeds :: [ Category FeedId ]
|
||||
myFeeds =
|
||||
[ Category "News"
|
||||
[ Category "Code"
|
||||
[Leaf (FeedId (URL "https://github.com/feediron/feediron-recipes/commits/master.atom"))]
|
||||
, Category "News"
|
||||
[ Category "Yle"
|
||||
[ Leaf (FeedId (URL "https://feeds.yle.fi/uutiset/v1/majorHeadlines/YLE_UUTISET.rss"))
|
||||
, Leaf (FeedId (URL "https://feeds.yle.fi/uutiset/v1/recent.rss?publisherIds=YLE_UUTISET&concepts=18-147345"))
|
||||
]
|
||||
]
|
||||
, Category "Programming"
|
||||
[ Leaf (FeedId (URL "https://reddit.com/r/haskell.rss"))
|
||||
, Leaf (FeedId (URL "https://discourse.dhall-lang.org/latest.rss"))
|
||||
, Leaf (FeedId (URL "https://www.haskellforall.com/feeds/posts/default"))
|
||||
]
|
||||
]
|
||||
|
||||
myFilters :: Middleware
|
||||
|
@ -34,6 +34,7 @@ library
|
||||
Control.Monad.App
|
||||
Control.Monad.HTTP
|
||||
Data.URL
|
||||
Feed.Parser
|
||||
|
||||
-- Modules included in this library but not exported.
|
||||
-- other-modules:
|
||||
@ -55,6 +56,8 @@ library
|
||||
, xdg-basedir
|
||||
, free
|
||||
, feed
|
||||
, xml-conduit
|
||||
, time
|
||||
hs-source-dirs: src
|
||||
default-language: Haskell2010
|
||||
ghc-options: -Wall
|
||||
|
@ -9,6 +9,7 @@ import Control.Lens
|
||||
import Data.SafeCopy
|
||||
import GHC.Generics (Generic)
|
||||
import Data.URL (URL )
|
||||
import Data.Time (UTCTime)
|
||||
|
||||
|
||||
newtype Tag = Tag Text
|
||||
@ -22,6 +23,7 @@ data Entry = Entry
|
||||
, entryContent :: ByteString
|
||||
, entryScore :: Int
|
||||
, entryTags :: Set Tag
|
||||
, entryPublished :: UTCTime
|
||||
}
|
||||
deriving (Eq, Ord, Show)
|
||||
|
||||
|
@ -13,9 +13,12 @@ module Database
|
||||
, EntryId(..)
|
||||
-- * Update functions
|
||||
, SaveEntry(..)
|
||||
, SaveEntries(..)
|
||||
-- * Query functions
|
||||
, GetEntries(..)
|
||||
, GetEntry(..)
|
||||
, CountEntries(..)
|
||||
, UnseenEntries(..)
|
||||
)
|
||||
where
|
||||
|
||||
@ -28,6 +31,9 @@ import Control.Lens
|
||||
import Control.Applicative ((<|>))
|
||||
import Control.Monad.Reader (asks)
|
||||
import Data.URL (URL)
|
||||
import qualified Data.Map.Strict as M
|
||||
import Data.Set (Set)
|
||||
import qualified Data.Set as S
|
||||
|
||||
newtype FeedId = FeedId URL
|
||||
deriving (Show, Eq, Ord, Generic)
|
||||
@ -49,16 +55,52 @@ emptyFeedMonadState = FeedMonadState mempty
|
||||
feeds :: Lens' FeedMonadState (Map FeedId (Map EntryId Entry))
|
||||
feeds = lens _feeds (\ fms f -> fms{_feeds=f})
|
||||
|
||||
|
||||
-- | Save the entry to the database
|
||||
saveEntry :: FeedId -> EntryId -> Entry -> Update FeedMonadState ()
|
||||
saveEntry fid eid en = modifying (feeds . at fid . non mempty . at eid) (<|> Just en)
|
||||
|
||||
-- | Save a batch of entries to the database
|
||||
--
|
||||
-- Saves the new entries, preferring the old entries over the new entries
|
||||
saveEntries :: FeedId -> [Entry] -> Update FeedMonadState ()
|
||||
saveEntries fid entries = modifying (feeds . at fid) (\old -> Just (toEntryMap entries) <> old)
|
||||
where
|
||||
toEntryMap = foldMap (\e -> M.singleton (EntryId (entryURL e)) e)
|
||||
|
||||
getEntries :: FeedId -> Query FeedMonadState [Entry]
|
||||
getEntries fid = asks (toListOf (feeds . ix fid . traversed))
|
||||
|
||||
-- | Count the number of entries in a feed
|
||||
countEntries :: FeedId -> Query FeedMonadState Int
|
||||
countEntries fid = asks (lengthOf (feeds . ix fid . traversed))
|
||||
|
||||
getEntry :: FeedId -> EntryId -> Query FeedMonadState (Maybe Entry)
|
||||
getEntry fid eid = asks (preview (feeds . ix fid . ix eid))
|
||||
|
||||
-- | Filter the unseen entries
|
||||
--
|
||||
unseenEntries :: FeedId -> Set EntryId -> Query FeedMonadState (Set EntryId)
|
||||
-- This seems to be an unfortunate effect of acid-state :/. If I try
|
||||
-- to insert all the entries and let the 'Map' handle updates, the
|
||||
-- acid-state event gets humongous. With my tests with only a few
|
||||
-- feeds, each run uses about a megabyte of disk space. Multiply this
|
||||
-- by 48 times a day, 365 times a year, you would get 17G a year.
|
||||
--
|
||||
-- Even though the save isn't now done in a single atomic operation,
|
||||
-- this should be fine, because at worst we would be inserting
|
||||
-- duplicates and the 'saveEntries' function already handles that.
|
||||
unseenEntries fid s = do
|
||||
keys <- asks (foldMapOf (feeds . ix fid) M.keysSet)
|
||||
pure $ s `S.difference` keys
|
||||
|
||||
deriveSafeCopy 0 'base ''FeedMonadState
|
||||
|
||||
makeAcidic ''FeedMonadState ['saveEntry, 'getEntries, 'getEntry]
|
||||
makeAcidic ''FeedMonadState
|
||||
[ 'saveEntry
|
||||
, 'getEntries
|
||||
, 'getEntry
|
||||
, 'saveEntries
|
||||
, 'countEntries
|
||||
, 'unseenEntries
|
||||
]
|
||||
|
88
FeedMonad/src/Feed/Parser.hs
Normal file
88
FeedMonad/src/Feed/Parser.hs
Normal file
@ -0,0 +1,88 @@
|
||||
{-# LANGUAGE NamedFieldPuns #-}
|
||||
{-# LANGUAGE OverloadedStrings #-}
|
||||
module Feed.Parser where
|
||||
import Control.Lens (from, lazy, re, view, _Just)
|
||||
import Data.ByteString.Lazy (ByteString)
|
||||
import Data.Entry (Entry(..), Tag(Tag))
|
||||
import Data.Foldable (toList)
|
||||
import Data.Maybe (mapMaybe)
|
||||
import qualified Data.Set as S
|
||||
import Data.Text.Strict.Lens (unpacked, utf8)
|
||||
import Data.URL (URL(URL), _URL)
|
||||
import Debug.Trace (trace)
|
||||
import qualified Text.Atom.Feed as Atom
|
||||
import Text.Feed.Types (Feed(..))
|
||||
import qualified Text.RSS.Syntax as RSS
|
||||
import Text.XML
|
||||
( Document(Document, documentEpilogue, documentPrologue, documentRoot)
|
||||
, Element(..)
|
||||
, Node(NodeElement)
|
||||
, Prologue(Prologue, prologueAfter, prologueBefore, prologueDoctype)
|
||||
, def
|
||||
, fromXMLElement
|
||||
, renderLBS
|
||||
)
|
||||
import Data.Time (parseTimeM, rfc822DateFormat, defaultTimeLocale, iso8601DateFormat)
|
||||
|
||||
parseAtom :: Atom.Feed -> [Entry]
|
||||
parseAtom Atom.Feed{Atom.feedEntries=es} = mapMaybe parseEntry es
|
||||
where
|
||||
parseEntry :: Atom.Entry -> Maybe Entry
|
||||
parseEntry atomEntry = Entry
|
||||
<$> view (unpacked . from _URL . re _Just) entryId
|
||||
<*> title
|
||||
<*> content
|
||||
<*> pure 0
|
||||
<*> pure mempty
|
||||
<*> (parseTimeM True defaultTimeLocale (iso8601DateFormat (Just "%H:%M:%S%Q%EZ")) . view unpacked =<< entryPublished)
|
||||
where
|
||||
content =
|
||||
case entryContent of
|
||||
Just (Atom.TextContent txt) -> Just $ view (re utf8 . lazy) txt
|
||||
Just (Atom.HTMLContent txt) -> Just $ view (re utf8 . lazy) txt
|
||||
Just (Atom.XHTMLContent xml) -> Just $ renderElement (toList $ fromXMLElement xml)
|
||||
_ -> Nothing
|
||||
title =
|
||||
case entryTitle of
|
||||
Atom.TextString s -> Just s
|
||||
_ -> Nothing
|
||||
Atom.Entry{ Atom.entryId
|
||||
, Atom.entryTitle
|
||||
, Atom.entryContent
|
||||
, Atom.entryPublished } = atomEntry
|
||||
|
||||
renderElement :: [Element] -> ByteString
|
||||
renderElement els = renderLBS def doc
|
||||
where
|
||||
el = Element {elementName = "span", elementAttributes = mempty, elementNodes = map NodeElement els}
|
||||
doc = Document {documentPrologue = prologue, documentRoot = el, documentEpilogue = []}
|
||||
prologue = Prologue {prologueBefore = [], prologueDoctype = Nothing, prologueAfter = []}
|
||||
|
||||
parseRSS :: RSS.RSS -> [Entry]
|
||||
parseRSS RSS.RSS{RSS.rssChannel=RSS.RSSChannel{RSS.rssItems = items}} =
|
||||
mapMaybe parseItem items
|
||||
where
|
||||
parseItem :: RSS.RSSItem -> Maybe Entry
|
||||
parseItem item = Entry
|
||||
<$> (URL . view unpacked <$> rssItemLink)
|
||||
<*> rssItemTitle
|
||||
<*> (regularContent <> Just otherContent)
|
||||
<*> pure 0
|
||||
<*> pure (foldMap (S.singleton . Tag . RSS.rssCategoryValue) rssItemCategories)
|
||||
<*> (parseTimeM True defaultTimeLocale rfc822DateFormat . view unpacked =<< rssItemPubDate)
|
||||
where
|
||||
regularContent = view (re utf8 . lazy) <$> rssItemContent
|
||||
otherContent = renderElement (concatMap (toList . fromXMLElement) rssItemOther)
|
||||
RSS.RSSItem { RSS.rssItemTitle
|
||||
, RSS.rssItemLink
|
||||
, RSS.rssItemCategories
|
||||
, RSS.rssItemContent
|
||||
, RSS.rssItemOther
|
||||
, RSS.rssItemPubDate
|
||||
} = item
|
||||
|
||||
parseEntries :: Feed -> [Entry]
|
||||
parseEntries (AtomFeed atom) = parseAtom atom
|
||||
parseEntries (RSSFeed rss) = parseRSS rss
|
||||
parseEntries (RSS1Feed _rss1) = trace "rss1" []
|
||||
parseEntries (XMLFeed _xml) = trace "xml" []
|
@ -1,27 +1,32 @@
|
||||
{-# LANGUAGE OverloadedStrings #-}
|
||||
{-# LANGUAGE DeriveFunctor #-}
|
||||
{-# LANGUAGE NamedFieldPuns #-}
|
||||
{-# LANGUAGE OverloadedStrings #-}
|
||||
{-# LANGUAGE TupleSections #-}
|
||||
module FeedMonad where
|
||||
|
||||
import Data.Text (Text)
|
||||
import Middleware (Middleware)
|
||||
import Numeric.Natural (Natural)
|
||||
import Data.Category (Category)
|
||||
import Data.Acid (openLocalState, AcidState (closeAcidState))
|
||||
import Control.Exception (bracket)
|
||||
import Database (emptyFeedMonadState, FeedId (FeedId))
|
||||
import Network.HTTP.Client.TLS (newTlsManager)
|
||||
import Control.Monad.App (runApp)
|
||||
import Data.Environment
|
||||
import Control.Monad.App (App, runApp)
|
||||
import Control.Monad.HTTP (execute, fetch)
|
||||
import Control.Monad.Reader (asks)
|
||||
import Control.Monad.Trans (liftIO)
|
||||
import Data.Foldable (for_, traverse_)
|
||||
import Control.Monad.HTTP (fetch, execute)
|
||||
import Data.Acid (AcidState(closeAcidState), openLocalState, query, update)
|
||||
import Data.Category (Category)
|
||||
import Data.Environment
|
||||
import Data.Foldable (for_, toList)
|
||||
import Data.Text (Text)
|
||||
import Database
|
||||
( CountEntries(CountEntries)
|
||||
, FeedId(FeedId)
|
||||
, SaveEntries(SaveEntries)
|
||||
, emptyFeedMonadState, UnseenEntries (UnseenEntries), EntryId (EntryId)
|
||||
)
|
||||
import Feed.Parser (parseEntries)
|
||||
import Middleware (Middleware)
|
||||
import Network.HTTP.Client.TLS (newTlsManager)
|
||||
import Numeric.Natural (Natural)
|
||||
import Text.Feed.Import (parseFeedSource)
|
||||
import Text.Feed.Types
|
||||
import qualified Text.Atom.Feed as Atom
|
||||
import Data.Entry
|
||||
import Control.Lens
|
||||
import Data.Text.Strict.Lens (utf8, unpacked)
|
||||
import Data.URL (_URL)
|
||||
import Data.Entry (Entry(entryURL))
|
||||
import qualified Data.Map.Strict as M
|
||||
|
||||
|
||||
newtype Minutes = Minutes Natural
|
||||
@ -47,37 +52,31 @@ defaultConfig = FeedMonad
|
||||
, secretToken = "i am a secret"
|
||||
}
|
||||
|
||||
parseAtom :: Atom.Feed -> [Entry]
|
||||
parseAtom Atom.Feed{Atom.feedEntries=es} = map parseEntry es
|
||||
|
||||
updateFeeds :: FeedMonad -> App ()
|
||||
updateFeeds f = do
|
||||
mgr <- asks environmentManager
|
||||
st <- asks environmentAcidState
|
||||
for_ (feeds f) $ \c -> for_ c $ \fid -> liftIO $ do
|
||||
let FeedId u = fid
|
||||
entries <- maybe [] parseEntries . parseFeedSource <$> liftIO (execute mgr (fetch u))
|
||||
finalEntries <- foldMap (\e -> M.singleton (EntryId $ entryURL e) e) <$> traverse (execute mgr . filters f pure) entries
|
||||
newEntries <- query st (UnseenEntries fid (M.keysSet finalEntries))
|
||||
update st (SaveEntries fid (foldMap (\eid -> toList $ M.lookup eid finalEntries) newEntries))
|
||||
|
||||
queryCategory :: FeedMonad -> App [Category (FeedId, Int)]
|
||||
queryCategory = traverse (traverse q) . feeds
|
||||
where
|
||||
parseEntry :: Atom.Entry -> Entry
|
||||
parseEntry atomEntry = Entry
|
||||
{ entryURL = view (unpacked . from _URL) $ Atom.entryId atomEntry
|
||||
, entryTitle =
|
||||
case Atom.entryTitle atomEntry of
|
||||
Atom.TextString txt -> txt
|
||||
_ -> "Title supported"
|
||||
, entryContent =
|
||||
case Atom.entryContent atomEntry of
|
||||
Just (Atom.TextContent txt) -> view (re utf8 . lazy) txt
|
||||
Just (Atom.HTMLContent html) -> view (re utf8 . lazy) html
|
||||
Just _ -> "Content not supported"
|
||||
Nothing -> ""
|
||||
, entryScore = 0
|
||||
, entryTags = mempty
|
||||
}
|
||||
q :: FeedId -> App (FeedId, Int)
|
||||
q fid = do
|
||||
st <- asks environmentAcidState
|
||||
(fid, ) <$> liftIO (query st (CountEntries fid))
|
||||
|
||||
defaultMain :: FeedMonad -> IO ()
|
||||
defaultMain f =
|
||||
bracket (openLocalState emptyFeedMonadState) closeAcidState $ \st -> do
|
||||
mgr <- newTlsManager
|
||||
runApp (Environment mgr st) $
|
||||
for_ (feeds f) $ \c -> for_ c $ \fid -> do
|
||||
let FeedId u = fid
|
||||
feed <- parseFeedSource <$> liftIO (execute mgr (fetch u))
|
||||
case feed of
|
||||
Nothing -> pure ()
|
||||
Just (AtomFeed atom) -> liftIO (traverse (execute mgr . filters f pure) (parseAtom atom) >>= traverse_ print)
|
||||
Just (RSSFeed _rssFeed) -> liftIO $ putStrLn "rssFeed"
|
||||
Just (RSS1Feed _rss1Feed) -> liftIO $ putStrLn "rss1Feed"
|
||||
Just (XMLFeed _xmlFeed) -> liftIO $ putStrLn "xmlFeed"
|
||||
runApp (Environment mgr st) $ do
|
||||
updateFeeds f
|
||||
cat <- queryCategory f
|
||||
liftIO $ print cat
|
||||
|
Loading…
x
Reference in New Issue
Block a user