FeedMonad/FeedMonad/src/Feed/Parser.hs

95 lines
3.9 KiB
Haskell

{-# LANGUAGE NamedFieldPuns #-}
{-# LANGUAGE OverloadedStrings #-}
module Feed.Parser where
import Control.Lens (from, lazy, re, view, _Just)
import Data.ByteString.Lazy (ByteString)
import Data.Entry (Entry(..), Tag(Tag))
import Data.Foldable (toList)
import qualified Data.Set as S
import Data.Text.Strict.Lens (unpacked, utf8)
import Data.URL (URL(URL), _URL)
import Debug.Trace (trace)
import qualified Text.Atom.Feed as Atom
import Text.Feed.Types (Feed(..))
import qualified Text.RSS.Syntax as RSS
import Text.XML
( Document(Document, documentEpilogue, documentPrologue, documentRoot)
, Element(..)
, Node(NodeElement)
, Prologue(Prologue, prologueAfter, prologueBefore, prologueDoctype)
, def
, fromXMLElement
, renderLBS
)
import Data.Time (parseTimeM, rfc822DateFormat, defaultTimeLocale, iso8601DateFormat)
import Data.Text (Text)
import qualified Data.Text as T
parseAtom :: Atom.Feed -> [Either Text Entry]
parseAtom Atom.Feed{Atom.feedEntries=es} = map parseEntry es
where
parseEntry :: Atom.Entry -> Either Text Entry
parseEntry atomEntry = Entry
<$> note "Missing entry id" (view (unpacked . from _URL . re _Just) entryId)
<*> note "Missing title" title
<*> note "Missing content" content
<*> pure 0
<*> pure mempty
<*> note ("Missing time: " <> T.pack (show entryUpdated)) (parseTimeM True defaultTimeLocale (iso8601DateFormat (Just "%H:%M:%S%Q%EZ")) . view unpacked $ entryUpdated)
where
content =
case entryContent of
Just (Atom.TextContent txt) -> Just $ view (re utf8 . lazy) txt
Just (Atom.HTMLContent txt) -> Just $ view (re utf8 . lazy) txt
Just (Atom.XHTMLContent xml) -> Just $ renderElement (toList $ fromXMLElement xml)
_ -> Nothing
title =
case entryTitle of
Atom.TextString s -> Just s
_ -> Nothing
Atom.Entry{ Atom.entryId
, Atom.entryTitle
, Atom.entryContent
, Atom.entryUpdated } = atomEntry
renderElement :: [Element] -> ByteString
renderElement els = renderLBS def doc
where
el = Element {elementName = "span", elementAttributes = mempty, elementNodes = map NodeElement els}
doc = Document {documentPrologue = prologue, documentRoot = el, documentEpilogue = []}
prologue = Prologue {prologueBefore = [], prologueDoctype = Nothing, prologueAfter = []}
-- | Add context to a Maybe by converting it into an Either
note :: e -> Maybe a -> Either e a
note e Nothing = Left e
note _ (Just a) = Right a
parseRSS :: RSS.RSS -> [Either Text Entry]
parseRSS RSS.RSS{RSS.rssChannel=RSS.RSSChannel{RSS.rssItems = items}} =
map parseItem items
where
parseItem :: RSS.RSSItem -> Either Text Entry
parseItem item = Entry
<$> note "Missing entry url" (URL . view unpacked <$> rssItemLink)
<*> note "Missing title" rssItemTitle
<*> note "Missing content" (regularContent <> Just otherContent)
<*> pure 0
<*> pure (foldMap (S.singleton . Tag . RSS.rssCategoryValue) rssItemCategories)
<*> note ("Missing time: " <> T.pack (show rssItemPubDate)) (parseTimeM True defaultTimeLocale rfc822DateFormat . view unpacked =<< rssItemPubDate)
where
regularContent = view (re utf8 . lazy) <$> rssItemContent
otherContent = renderElement (concatMap (toList . fromXMLElement) rssItemOther)
RSS.RSSItem { RSS.rssItemTitle
, RSS.rssItemLink
, RSS.rssItemCategories
, RSS.rssItemContent
, RSS.rssItemOther
, RSS.rssItemPubDate
} = item
parseEntries :: Feed -> [Either Text Entry]
parseEntries (AtomFeed atom) = parseAtom atom
parseEntries (RSSFeed rss) = parseRSS rss
parseEntries (RSS1Feed _rss1) = trace "rss1" []
parseEntries (XMLFeed _xml) = trace "xml" []