95 lines
3.9 KiB
Haskell
95 lines
3.9 KiB
Haskell
{-# LANGUAGE NamedFieldPuns #-}
|
|
{-# LANGUAGE OverloadedStrings #-}
|
|
module Feed.Parser where
|
|
import Control.Lens (from, lazy, re, view, _Just)
|
|
import Data.ByteString.Lazy (ByteString)
|
|
import Data.Entry (Entry(..), Tag(Tag))
|
|
import Data.Foldable (toList)
|
|
import qualified Data.Set as S
|
|
import Data.Text.Strict.Lens (unpacked, utf8)
|
|
import Data.URL (URL(URL), _URL)
|
|
import Debug.Trace (trace)
|
|
import qualified Text.Atom.Feed as Atom
|
|
import Text.Feed.Types (Feed(..))
|
|
import qualified Text.RSS.Syntax as RSS
|
|
import Text.XML
|
|
( Document(Document, documentEpilogue, documentPrologue, documentRoot)
|
|
, Element(..)
|
|
, Node(NodeElement)
|
|
, Prologue(Prologue, prologueAfter, prologueBefore, prologueDoctype)
|
|
, def
|
|
, fromXMLElement
|
|
, renderLBS
|
|
)
|
|
import Data.Time (parseTimeM, rfc822DateFormat, defaultTimeLocale, iso8601DateFormat)
|
|
import Data.Text (Text)
|
|
import qualified Data.Text as T
|
|
|
|
parseAtom :: Atom.Feed -> [Either Text Entry]
|
|
parseAtom Atom.Feed{Atom.feedEntries=es} = map parseEntry es
|
|
where
|
|
parseEntry :: Atom.Entry -> Either Text Entry
|
|
parseEntry atomEntry = Entry
|
|
<$> note "Missing entry id" (view (unpacked . from _URL . re _Just) entryId)
|
|
<*> note "Missing title" title
|
|
<*> note "Missing content" content
|
|
<*> pure 0
|
|
<*> pure mempty
|
|
<*> note ("Missing time: " <> T.pack (show entryUpdated)) (parseTimeM True defaultTimeLocale (iso8601DateFormat (Just "%H:%M:%S%Q%EZ")) . view unpacked $ entryUpdated)
|
|
where
|
|
content =
|
|
case entryContent of
|
|
Just (Atom.TextContent txt) -> Just $ view (re utf8 . lazy) txt
|
|
Just (Atom.HTMLContent txt) -> Just $ view (re utf8 . lazy) txt
|
|
Just (Atom.XHTMLContent xml) -> Just $ renderElement (toList $ fromXMLElement xml)
|
|
_ -> Nothing
|
|
title =
|
|
case entryTitle of
|
|
Atom.TextString s -> Just s
|
|
_ -> Nothing
|
|
Atom.Entry{ Atom.entryId
|
|
, Atom.entryTitle
|
|
, Atom.entryContent
|
|
, Atom.entryUpdated } = atomEntry
|
|
|
|
renderElement :: [Element] -> ByteString
|
|
renderElement els = renderLBS def doc
|
|
where
|
|
el = Element {elementName = "span", elementAttributes = mempty, elementNodes = map NodeElement els}
|
|
doc = Document {documentPrologue = prologue, documentRoot = el, documentEpilogue = []}
|
|
prologue = Prologue {prologueBefore = [], prologueDoctype = Nothing, prologueAfter = []}
|
|
|
|
-- | Add context to a Maybe by converting it into an Either
|
|
note :: e -> Maybe a -> Either e a
|
|
note e Nothing = Left e
|
|
note _ (Just a) = Right a
|
|
|
|
parseRSS :: RSS.RSS -> [Either Text Entry]
|
|
parseRSS RSS.RSS{RSS.rssChannel=RSS.RSSChannel{RSS.rssItems = items}} =
|
|
map parseItem items
|
|
where
|
|
parseItem :: RSS.RSSItem -> Either Text Entry
|
|
parseItem item = Entry
|
|
<$> note "Missing entry url" (URL . view unpacked <$> rssItemLink)
|
|
<*> note "Missing title" rssItemTitle
|
|
<*> note "Missing content" (regularContent <> Just otherContent)
|
|
<*> pure 0
|
|
<*> pure (foldMap (S.singleton . Tag . RSS.rssCategoryValue) rssItemCategories)
|
|
<*> note ("Missing time: " <> T.pack (show rssItemPubDate)) (parseTimeM True defaultTimeLocale rfc822DateFormat . view unpacked =<< rssItemPubDate)
|
|
where
|
|
regularContent = view (re utf8 . lazy) <$> rssItemContent
|
|
otherContent = renderElement (concatMap (toList . fromXMLElement) rssItemOther)
|
|
RSS.RSSItem { RSS.rssItemTitle
|
|
, RSS.rssItemLink
|
|
, RSS.rssItemCategories
|
|
, RSS.rssItemContent
|
|
, RSS.rssItemOther
|
|
, RSS.rssItemPubDate
|
|
} = item
|
|
|
|
parseEntries :: Feed -> [Either Text Entry]
|
|
parseEntries (AtomFeed atom) = parseAtom atom
|
|
parseEntries (RSSFeed rss) = parseRSS rss
|
|
parseEntries (RSS1Feed _rss1) = trace "rss1" []
|
|
parseEntries (XMLFeed _xml) = trace "xml" []
|