Parse RSS and have published timestamps
This commit is contained in:
parent
e1b4155a25
commit
4adc75c33c
@ -13,8 +13,19 @@ import Control.Lens
|
|||||||
|
|
||||||
myFeeds :: [ Category FeedId ]
|
myFeeds :: [ Category FeedId ]
|
||||||
myFeeds =
|
myFeeds =
|
||||||
[ Category "News"
|
[ Category "Code"
|
||||||
[Leaf (FeedId (URL "https://github.com/feediron/feediron-recipes/commits/master.atom"))]
|
[Leaf (FeedId (URL "https://github.com/feediron/feediron-recipes/commits/master.atom"))]
|
||||||
|
, Category "News"
|
||||||
|
[ Category "Yle"
|
||||||
|
[ Leaf (FeedId (URL "https://feeds.yle.fi/uutiset/v1/majorHeadlines/YLE_UUTISET.rss"))
|
||||||
|
, Leaf (FeedId (URL "https://feeds.yle.fi/uutiset/v1/recent.rss?publisherIds=YLE_UUTISET&concepts=18-147345"))
|
||||||
|
]
|
||||||
|
]
|
||||||
|
, Category "Programming"
|
||||||
|
[ Leaf (FeedId (URL "https://reddit.com/r/haskell.rss"))
|
||||||
|
, Leaf (FeedId (URL "https://discourse.dhall-lang.org/latest.rss"))
|
||||||
|
, Leaf (FeedId (URL "https://www.haskellforall.com/feeds/posts/default"))
|
||||||
|
]
|
||||||
]
|
]
|
||||||
|
|
||||||
myFilters :: Middleware
|
myFilters :: Middleware
|
||||||
|
@ -34,6 +34,7 @@ library
|
|||||||
Control.Monad.App
|
Control.Monad.App
|
||||||
Control.Monad.HTTP
|
Control.Monad.HTTP
|
||||||
Data.URL
|
Data.URL
|
||||||
|
Feed.Parser
|
||||||
|
|
||||||
-- Modules included in this library but not exported.
|
-- Modules included in this library but not exported.
|
||||||
-- other-modules:
|
-- other-modules:
|
||||||
@ -55,6 +56,8 @@ library
|
|||||||
, xdg-basedir
|
, xdg-basedir
|
||||||
, free
|
, free
|
||||||
, feed
|
, feed
|
||||||
|
, xml-conduit
|
||||||
|
, time
|
||||||
hs-source-dirs: src
|
hs-source-dirs: src
|
||||||
default-language: Haskell2010
|
default-language: Haskell2010
|
||||||
ghc-options: -Wall
|
ghc-options: -Wall
|
||||||
|
@ -9,6 +9,7 @@ import Control.Lens
|
|||||||
import Data.SafeCopy
|
import Data.SafeCopy
|
||||||
import GHC.Generics (Generic)
|
import GHC.Generics (Generic)
|
||||||
import Data.URL (URL )
|
import Data.URL (URL )
|
||||||
|
import Data.Time (UTCTime)
|
||||||
|
|
||||||
|
|
||||||
newtype Tag = Tag Text
|
newtype Tag = Tag Text
|
||||||
@ -22,6 +23,7 @@ data Entry = Entry
|
|||||||
, entryContent :: ByteString
|
, entryContent :: ByteString
|
||||||
, entryScore :: Int
|
, entryScore :: Int
|
||||||
, entryTags :: Set Tag
|
, entryTags :: Set Tag
|
||||||
|
, entryPublished :: UTCTime
|
||||||
}
|
}
|
||||||
deriving (Eq, Ord, Show)
|
deriving (Eq, Ord, Show)
|
||||||
|
|
||||||
|
@ -13,9 +13,12 @@ module Database
|
|||||||
, EntryId(..)
|
, EntryId(..)
|
||||||
-- * Update functions
|
-- * Update functions
|
||||||
, SaveEntry(..)
|
, SaveEntry(..)
|
||||||
|
, SaveEntries(..)
|
||||||
-- * Query functions
|
-- * Query functions
|
||||||
, GetEntries(..)
|
, GetEntries(..)
|
||||||
, GetEntry(..)
|
, GetEntry(..)
|
||||||
|
, CountEntries(..)
|
||||||
|
, UnseenEntries(..)
|
||||||
)
|
)
|
||||||
where
|
where
|
||||||
|
|
||||||
@ -28,6 +31,9 @@ import Control.Lens
|
|||||||
import Control.Applicative ((<|>))
|
import Control.Applicative ((<|>))
|
||||||
import Control.Monad.Reader (asks)
|
import Control.Monad.Reader (asks)
|
||||||
import Data.URL (URL)
|
import Data.URL (URL)
|
||||||
|
import qualified Data.Map.Strict as M
|
||||||
|
import Data.Set (Set)
|
||||||
|
import qualified Data.Set as S
|
||||||
|
|
||||||
newtype FeedId = FeedId URL
|
newtype FeedId = FeedId URL
|
||||||
deriving (Show, Eq, Ord, Generic)
|
deriving (Show, Eq, Ord, Generic)
|
||||||
@ -49,16 +55,52 @@ emptyFeedMonadState = FeedMonadState mempty
|
|||||||
feeds :: Lens' FeedMonadState (Map FeedId (Map EntryId Entry))
|
feeds :: Lens' FeedMonadState (Map FeedId (Map EntryId Entry))
|
||||||
feeds = lens _feeds (\ fms f -> fms{_feeds=f})
|
feeds = lens _feeds (\ fms f -> fms{_feeds=f})
|
||||||
|
|
||||||
|
|
||||||
-- | Save the entry to the database
|
-- | Save the entry to the database
|
||||||
saveEntry :: FeedId -> EntryId -> Entry -> Update FeedMonadState ()
|
saveEntry :: FeedId -> EntryId -> Entry -> Update FeedMonadState ()
|
||||||
saveEntry fid eid en = modifying (feeds . at fid . non mempty . at eid) (<|> Just en)
|
saveEntry fid eid en = modifying (feeds . at fid . non mempty . at eid) (<|> Just en)
|
||||||
|
|
||||||
|
-- | Save a batch of entries to the database
|
||||||
|
--
|
||||||
|
-- Saves the new entries, preferring the old entries over the new entries
|
||||||
|
saveEntries :: FeedId -> [Entry] -> Update FeedMonadState ()
|
||||||
|
saveEntries fid entries = modifying (feeds . at fid) (\old -> Just (toEntryMap entries) <> old)
|
||||||
|
where
|
||||||
|
toEntryMap = foldMap (\e -> M.singleton (EntryId (entryURL e)) e)
|
||||||
|
|
||||||
getEntries :: FeedId -> Query FeedMonadState [Entry]
|
getEntries :: FeedId -> Query FeedMonadState [Entry]
|
||||||
getEntries fid = asks (toListOf (feeds . ix fid . traversed))
|
getEntries fid = asks (toListOf (feeds . ix fid . traversed))
|
||||||
|
|
||||||
|
-- | Count the number of entries in a feed
|
||||||
|
countEntries :: FeedId -> Query FeedMonadState Int
|
||||||
|
countEntries fid = asks (lengthOf (feeds . ix fid . traversed))
|
||||||
|
|
||||||
getEntry :: FeedId -> EntryId -> Query FeedMonadState (Maybe Entry)
|
getEntry :: FeedId -> EntryId -> Query FeedMonadState (Maybe Entry)
|
||||||
getEntry fid eid = asks (preview (feeds . ix fid . ix eid))
|
getEntry fid eid = asks (preview (feeds . ix fid . ix eid))
|
||||||
|
|
||||||
|
-- | Filter the unseen entries
|
||||||
|
--
|
||||||
|
unseenEntries :: FeedId -> Set EntryId -> Query FeedMonadState (Set EntryId)
|
||||||
|
-- This seems to be an unfortunate effect of acid-state :/. If I try
|
||||||
|
-- to insert all the entries and let the 'Map' handle updates, the
|
||||||
|
-- acid-state event gets humongous. With my tests with only a few
|
||||||
|
-- feeds, each run uses about a megabyte of disk space. Multiply this
|
||||||
|
-- by 48 times a day, 365 times a year, you would get 17G a year.
|
||||||
|
--
|
||||||
|
-- Even though the save isn't now done in a single atomic operation,
|
||||||
|
-- this should be fine, because at worst we would be inserting
|
||||||
|
-- duplicates and the 'saveEntries' function already handles that.
|
||||||
|
unseenEntries fid s = do
|
||||||
|
keys <- asks (foldMapOf (feeds . ix fid) M.keysSet)
|
||||||
|
pure $ s `S.difference` keys
|
||||||
|
|
||||||
deriveSafeCopy 0 'base ''FeedMonadState
|
deriveSafeCopy 0 'base ''FeedMonadState
|
||||||
|
|
||||||
makeAcidic ''FeedMonadState ['saveEntry, 'getEntries, 'getEntry]
|
makeAcidic ''FeedMonadState
|
||||||
|
[ 'saveEntry
|
||||||
|
, 'getEntries
|
||||||
|
, 'getEntry
|
||||||
|
, 'saveEntries
|
||||||
|
, 'countEntries
|
||||||
|
, 'unseenEntries
|
||||||
|
]
|
||||||
|
88
FeedMonad/src/Feed/Parser.hs
Normal file
88
FeedMonad/src/Feed/Parser.hs
Normal file
@ -0,0 +1,88 @@
|
|||||||
|
{-# LANGUAGE NamedFieldPuns #-}
|
||||||
|
{-# LANGUAGE OverloadedStrings #-}
|
||||||
|
module Feed.Parser where
|
||||||
|
import Control.Lens (from, lazy, re, view, _Just)
|
||||||
|
import Data.ByteString.Lazy (ByteString)
|
||||||
|
import Data.Entry (Entry(..), Tag(Tag))
|
||||||
|
import Data.Foldable (toList)
|
||||||
|
import Data.Maybe (mapMaybe)
|
||||||
|
import qualified Data.Set as S
|
||||||
|
import Data.Text.Strict.Lens (unpacked, utf8)
|
||||||
|
import Data.URL (URL(URL), _URL)
|
||||||
|
import Debug.Trace (trace)
|
||||||
|
import qualified Text.Atom.Feed as Atom
|
||||||
|
import Text.Feed.Types (Feed(..))
|
||||||
|
import qualified Text.RSS.Syntax as RSS
|
||||||
|
import Text.XML
|
||||||
|
( Document(Document, documentEpilogue, documentPrologue, documentRoot)
|
||||||
|
, Element(..)
|
||||||
|
, Node(NodeElement)
|
||||||
|
, Prologue(Prologue, prologueAfter, prologueBefore, prologueDoctype)
|
||||||
|
, def
|
||||||
|
, fromXMLElement
|
||||||
|
, renderLBS
|
||||||
|
)
|
||||||
|
import Data.Time (parseTimeM, rfc822DateFormat, defaultTimeLocale, iso8601DateFormat)
|
||||||
|
|
||||||
|
parseAtom :: Atom.Feed -> [Entry]
|
||||||
|
parseAtom Atom.Feed{Atom.feedEntries=es} = mapMaybe parseEntry es
|
||||||
|
where
|
||||||
|
parseEntry :: Atom.Entry -> Maybe Entry
|
||||||
|
parseEntry atomEntry = Entry
|
||||||
|
<$> view (unpacked . from _URL . re _Just) entryId
|
||||||
|
<*> title
|
||||||
|
<*> content
|
||||||
|
<*> pure 0
|
||||||
|
<*> pure mempty
|
||||||
|
<*> (parseTimeM True defaultTimeLocale (iso8601DateFormat (Just "%H:%M:%S%Q%EZ")) . view unpacked =<< entryPublished)
|
||||||
|
where
|
||||||
|
content =
|
||||||
|
case entryContent of
|
||||||
|
Just (Atom.TextContent txt) -> Just $ view (re utf8 . lazy) txt
|
||||||
|
Just (Atom.HTMLContent txt) -> Just $ view (re utf8 . lazy) txt
|
||||||
|
Just (Atom.XHTMLContent xml) -> Just $ renderElement (toList $ fromXMLElement xml)
|
||||||
|
_ -> Nothing
|
||||||
|
title =
|
||||||
|
case entryTitle of
|
||||||
|
Atom.TextString s -> Just s
|
||||||
|
_ -> Nothing
|
||||||
|
Atom.Entry{ Atom.entryId
|
||||||
|
, Atom.entryTitle
|
||||||
|
, Atom.entryContent
|
||||||
|
, Atom.entryPublished } = atomEntry
|
||||||
|
|
||||||
|
renderElement :: [Element] -> ByteString
|
||||||
|
renderElement els = renderLBS def doc
|
||||||
|
where
|
||||||
|
el = Element {elementName = "span", elementAttributes = mempty, elementNodes = map NodeElement els}
|
||||||
|
doc = Document {documentPrologue = prologue, documentRoot = el, documentEpilogue = []}
|
||||||
|
prologue = Prologue {prologueBefore = [], prologueDoctype = Nothing, prologueAfter = []}
|
||||||
|
|
||||||
|
parseRSS :: RSS.RSS -> [Entry]
|
||||||
|
parseRSS RSS.RSS{RSS.rssChannel=RSS.RSSChannel{RSS.rssItems = items}} =
|
||||||
|
mapMaybe parseItem items
|
||||||
|
where
|
||||||
|
parseItem :: RSS.RSSItem -> Maybe Entry
|
||||||
|
parseItem item = Entry
|
||||||
|
<$> (URL . view unpacked <$> rssItemLink)
|
||||||
|
<*> rssItemTitle
|
||||||
|
<*> (regularContent <> Just otherContent)
|
||||||
|
<*> pure 0
|
||||||
|
<*> pure (foldMap (S.singleton . Tag . RSS.rssCategoryValue) rssItemCategories)
|
||||||
|
<*> (parseTimeM True defaultTimeLocale rfc822DateFormat . view unpacked =<< rssItemPubDate)
|
||||||
|
where
|
||||||
|
regularContent = view (re utf8 . lazy) <$> rssItemContent
|
||||||
|
otherContent = renderElement (concatMap (toList . fromXMLElement) rssItemOther)
|
||||||
|
RSS.RSSItem { RSS.rssItemTitle
|
||||||
|
, RSS.rssItemLink
|
||||||
|
, RSS.rssItemCategories
|
||||||
|
, RSS.rssItemContent
|
||||||
|
, RSS.rssItemOther
|
||||||
|
, RSS.rssItemPubDate
|
||||||
|
} = item
|
||||||
|
|
||||||
|
parseEntries :: Feed -> [Entry]
|
||||||
|
parseEntries (AtomFeed atom) = parseAtom atom
|
||||||
|
parseEntries (RSSFeed rss) = parseRSS rss
|
||||||
|
parseEntries (RSS1Feed _rss1) = trace "rss1" []
|
||||||
|
parseEntries (XMLFeed _xml) = trace "xml" []
|
@ -1,27 +1,32 @@
|
|||||||
{-# LANGUAGE OverloadedStrings #-}
|
|
||||||
{-# LANGUAGE DeriveFunctor #-}
|
{-# LANGUAGE DeriveFunctor #-}
|
||||||
|
{-# LANGUAGE NamedFieldPuns #-}
|
||||||
|
{-# LANGUAGE OverloadedStrings #-}
|
||||||
|
{-# LANGUAGE TupleSections #-}
|
||||||
module FeedMonad where
|
module FeedMonad where
|
||||||
|
|
||||||
import Data.Text (Text)
|
|
||||||
import Middleware (Middleware)
|
|
||||||
import Numeric.Natural (Natural)
|
|
||||||
import Data.Category (Category)
|
|
||||||
import Data.Acid (openLocalState, AcidState (closeAcidState))
|
|
||||||
import Control.Exception (bracket)
|
import Control.Exception (bracket)
|
||||||
import Database (emptyFeedMonadState, FeedId (FeedId))
|
import Control.Monad.App (App, runApp)
|
||||||
import Network.HTTP.Client.TLS (newTlsManager)
|
import Control.Monad.HTTP (execute, fetch)
|
||||||
import Control.Monad.App (runApp)
|
import Control.Monad.Reader (asks)
|
||||||
import Data.Environment
|
|
||||||
import Control.Monad.Trans (liftIO)
|
import Control.Monad.Trans (liftIO)
|
||||||
import Data.Foldable (for_, traverse_)
|
import Data.Acid (AcidState(closeAcidState), openLocalState, query, update)
|
||||||
import Control.Monad.HTTP (fetch, execute)
|
import Data.Category (Category)
|
||||||
|
import Data.Environment
|
||||||
|
import Data.Foldable (for_, toList)
|
||||||
|
import Data.Text (Text)
|
||||||
|
import Database
|
||||||
|
( CountEntries(CountEntries)
|
||||||
|
, FeedId(FeedId)
|
||||||
|
, SaveEntries(SaveEntries)
|
||||||
|
, emptyFeedMonadState, UnseenEntries (UnseenEntries), EntryId (EntryId)
|
||||||
|
)
|
||||||
|
import Feed.Parser (parseEntries)
|
||||||
|
import Middleware (Middleware)
|
||||||
|
import Network.HTTP.Client.TLS (newTlsManager)
|
||||||
|
import Numeric.Natural (Natural)
|
||||||
import Text.Feed.Import (parseFeedSource)
|
import Text.Feed.Import (parseFeedSource)
|
||||||
import Text.Feed.Types
|
import Data.Entry (Entry(entryURL))
|
||||||
import qualified Text.Atom.Feed as Atom
|
import qualified Data.Map.Strict as M
|
||||||
import Data.Entry
|
|
||||||
import Control.Lens
|
|
||||||
import Data.Text.Strict.Lens (utf8, unpacked)
|
|
||||||
import Data.URL (_URL)
|
|
||||||
|
|
||||||
|
|
||||||
newtype Minutes = Minutes Natural
|
newtype Minutes = Minutes Natural
|
||||||
@ -47,37 +52,31 @@ defaultConfig = FeedMonad
|
|||||||
, secretToken = "i am a secret"
|
, secretToken = "i am a secret"
|
||||||
}
|
}
|
||||||
|
|
||||||
parseAtom :: Atom.Feed -> [Entry]
|
|
||||||
parseAtom Atom.Feed{Atom.feedEntries=es} = map parseEntry es
|
updateFeeds :: FeedMonad -> App ()
|
||||||
|
updateFeeds f = do
|
||||||
|
mgr <- asks environmentManager
|
||||||
|
st <- asks environmentAcidState
|
||||||
|
for_ (feeds f) $ \c -> for_ c $ \fid -> liftIO $ do
|
||||||
|
let FeedId u = fid
|
||||||
|
entries <- maybe [] parseEntries . parseFeedSource <$> liftIO (execute mgr (fetch u))
|
||||||
|
finalEntries <- foldMap (\e -> M.singleton (EntryId $ entryURL e) e) <$> traverse (execute mgr . filters f pure) entries
|
||||||
|
newEntries <- query st (UnseenEntries fid (M.keysSet finalEntries))
|
||||||
|
update st (SaveEntries fid (foldMap (\eid -> toList $ M.lookup eid finalEntries) newEntries))
|
||||||
|
|
||||||
|
queryCategory :: FeedMonad -> App [Category (FeedId, Int)]
|
||||||
|
queryCategory = traverse (traverse q) . feeds
|
||||||
where
|
where
|
||||||
parseEntry :: Atom.Entry -> Entry
|
q :: FeedId -> App (FeedId, Int)
|
||||||
parseEntry atomEntry = Entry
|
q fid = do
|
||||||
{ entryURL = view (unpacked . from _URL) $ Atom.entryId atomEntry
|
st <- asks environmentAcidState
|
||||||
, entryTitle =
|
(fid, ) <$> liftIO (query st (CountEntries fid))
|
||||||
case Atom.entryTitle atomEntry of
|
|
||||||
Atom.TextString txt -> txt
|
|
||||||
_ -> "Title supported"
|
|
||||||
, entryContent =
|
|
||||||
case Atom.entryContent atomEntry of
|
|
||||||
Just (Atom.TextContent txt) -> view (re utf8 . lazy) txt
|
|
||||||
Just (Atom.HTMLContent html) -> view (re utf8 . lazy) html
|
|
||||||
Just _ -> "Content not supported"
|
|
||||||
Nothing -> ""
|
|
||||||
, entryScore = 0
|
|
||||||
, entryTags = mempty
|
|
||||||
}
|
|
||||||
|
|
||||||
defaultMain :: FeedMonad -> IO ()
|
defaultMain :: FeedMonad -> IO ()
|
||||||
defaultMain f =
|
defaultMain f =
|
||||||
bracket (openLocalState emptyFeedMonadState) closeAcidState $ \st -> do
|
bracket (openLocalState emptyFeedMonadState) closeAcidState $ \st -> do
|
||||||
mgr <- newTlsManager
|
mgr <- newTlsManager
|
||||||
runApp (Environment mgr st) $
|
runApp (Environment mgr st) $ do
|
||||||
for_ (feeds f) $ \c -> for_ c $ \fid -> do
|
updateFeeds f
|
||||||
let FeedId u = fid
|
cat <- queryCategory f
|
||||||
feed <- parseFeedSource <$> liftIO (execute mgr (fetch u))
|
liftIO $ print cat
|
||||||
case feed of
|
|
||||||
Nothing -> pure ()
|
|
||||||
Just (AtomFeed atom) -> liftIO (traverse (execute mgr . filters f pure) (parseAtom atom) >>= traverse_ print)
|
|
||||||
Just (RSSFeed _rssFeed) -> liftIO $ putStrLn "rssFeed"
|
|
||||||
Just (RSS1Feed _rss1Feed) -> liftIO $ putStrLn "rss1Feed"
|
|
||||||
Just (XMLFeed _xmlFeed) -> liftIO $ putStrLn "xmlFeed"
|
|
||||||
|
Loading…
Reference in New Issue
Block a user