module Text.XmlHtml.HTML.Parse where
import Control.Applicative
import Control.Monad
import Data.Char
import Data.List
import Data.Maybe
import Text.XmlHtml.Common
import Text.XmlHtml.HTML.Meta
import Text.XmlHtml.TextParser
import qualified Text.XmlHtml.XML.Parse as XML
import qualified Text.Parsec as P
import qualified Data.HashSet as S
import qualified Data.HashMap.Strict as M
import Data.Text (Text)
import qualified Data.Text as T
docFragment :: Encoding -> Parser Document
docFragment e = do
(dt, nodes1) <- prolog
(nodes2, Matched) <- content Nothing
return $ HtmlDocument e dt (nodes1 ++ nodes2)
prolog :: Parser (Maybe DocType, [Node])
prolog = do
_ <- optional XML.xmlDecl
nodes1 <- many XML.misc
rest <- optional $ do
dt <- docTypeDecl
nodes2 <- many XML.misc
return (dt, nodes2)
case rest of
Nothing -> return (Nothing, catMaybes nodes1)
Just (dt, nodes2) -> return (Just dt, catMaybes (nodes1 ++ nodes2))
docTypeDecl :: Parser DocType
docTypeDecl = do
P.try $ do
_ <- text "<!"
decl <- XML.name
when (T.toLower decl /= "doctype") $ fail "Expected DOCTYPE"
XML.whiteSpace
tag <- XML.name
_ <- optional XML.whiteSpace
extid <- externalID
_ <- optional XML.whiteSpace
intsub <- XML.internalDoctype
_ <- P.char '>'
return (DocType tag extid intsub)
externalID :: Parser ExternalID
externalID = do
tok <- optional $ T.toLower <$> XML.name
case tok of
Just "system" -> systemID
Just "public" -> publicID
Just _ -> fail "Expected SYSTEM or PUBLIC"
Nothing -> return NoExternalID
where
systemID = do
XML.whiteSpace
System <$> XML.systemLiteral
publicID = do
XML.whiteSpace
pid <- XML.pubIdLiteral
XML.whiteSpace
sid <- XML.systemLiteral
return (Public pid sid)
data ElemResult = Matched
| ImplicitLast Text
| ImplicitNext Text Text [(Text, Text)] Bool
finishElement :: Text -> Text -> [(Text, Text)] -> Bool
-> Parser (Node, ElemResult)
finishElement t tbase a b = do
if b then return (Element t a [], Matched)
else nonEmptyElem
where
nonEmptyElem
| isRawText tbase a = do
c <- XML.cdata "<" $ P.try (endTag t)
return (Element t a [c], Matched)
| tbase `S.member` endOmittableLast = tagContents optional
| otherwise = tagContents (fmap Just)
tagContents modifier = do
(c,r1) <- content (Just tbase)
case r1 of
Matched -> do
r2 <- modifier (endTag t)
case r2 of
Nothing -> return (Element t a c, Matched)
Just rr -> return (Element t a c, rr)
ImplicitLast tag | T.toCaseFold tag == T.toCaseFold t -> do
return (Element t a c, Matched)
end -> do
return (Element t a c, end)
emptyOrStartTag :: Parser (Text, Text, [(Text, Text)], Bool)
emptyOrStartTag = do
t <- P.try $ P.char '<' *> XML.name
let tbase = T.toLower $ snd $ T.breakOnEnd ":" t
a <- many $ P.try $ do
XML.whiteSpace
attribute
when (hasDups a) $ fail "Duplicate attribute names in element"
_ <- optional XML.whiteSpace
e <- fmap isJust $ optional (P.char '/')
let e' = e || (tbase `S.member` voidTags)
_ <- P.char '>'
return (t, tbase, a, e')
where
hasDups a = length (nub (map fst a)) < length a
attrName :: Parser Text
attrName = takeWhile1 isAttrName
where isAttrName c | c `elem` "\0 \"\'>/=" = False
| isControlChar c = False
| otherwise = True
isControlChar :: Char -> Bool
isControlChar c | c >= '\x007F' && c <= '\x009F' = True
| c >= '\xFDD0' && c <= '\xFDEF' = True
| otherwise = False
quotedAttrValue :: Parser Text
quotedAttrValue = singleQuoted <|> doubleQuoted
where
singleQuoted = P.char '\'' *> refTill "&\'" <* P.char '\''
doubleQuoted = P.char '\"' *> refTill "&\"" <* P.char '\"'
refTill end = T.concat <$> many
(takeWhile1 (not . (`elem` end)) <|> reference)
unquotedAttrValue :: Parser Text
unquotedAttrValue = refTill " \"\'=<>&`"
where
refTill end = T.concat <$> some
(takeWhile1 (not . (`elem` end)) <|> reference)
attrValue :: Parser Text
attrValue = quotedAttrValue <|> unquotedAttrValue
attribute :: Parser (Text, Text)
attribute = do
n <- attrName
v <- optional $ do
_ <- P.try $ do
_ <- optional XML.whiteSpace
P.char '='
_ <- optional XML.whiteSpace
attrValue
return $ maybe (n,"") (n,) v
endTag :: Text -> Parser ElemResult
endTag s = do
_ <- text "</"
t <- XML.name
let sbase = T.toLower $ snd $ T.breakOnEnd ":" s
r <- if (T.toCaseFold s == T.toCaseFold t)
then return Matched
else if sbase `S.member` endOmittableLast
then return (ImplicitLast t)
else fail $ "mismatched tags: </" ++ T.unpack t ++
"> found inside <" ++ T.unpack s ++ "> tag"
_ <- optional XML.whiteSpace
_ <- text ">"
return r
content :: Maybe Text -> Parser ([Node], ElemResult)
content parent = do
(ns, end) <- readText
return (coalesceText (catMaybes ns), end)
where
readText = do
s <- optional XML.charData
t <- optional whileMatched
case t of
Nothing -> return ([s], Matched)
Just (tt, m) -> return (s:tt, m)
whileMatched = do
(n,end) <- (,Matched) <$> (:[]) <$> Just <$> TextNode <$> reference
<|> (,Matched) <$> (:[]) <$> XML.cdSect
<|> (,Matched) <$> (:[]) <$> XML.processingInstruction
<|> (,Matched) <$> (:[]) <$> XML.comment
<|> doElement
case end of
Matched -> do
(ns, end') <- readText
return (n ++ ns, end')
_ -> do
return (n, end)
doElement = do
(t,tb, a,b) <- emptyOrStartTag
handle t tb a b
handle t tb a b = do
if breaksTag tb parent
then return ([Nothing], ImplicitNext t tb a b)
else do
(n,end) <- finishElement t tb a b
case end of
ImplicitNext t' tb' a' b' -> do
(ns,end') <- handle t' tb' a' b'
return (Just n : ns, end')
_ -> return ([Just n], end)
breaksTag _ Nothing = False
breaksTag child (Just tag) = case M.lookup tag endOmittableNext of
Nothing -> False
Just s -> S.member child s
coalesceText (TextNode s : TextNode t : ns)
= coalesceText (TextNode (T.append s t) : ns)
coalesceText (n:ns)
= n : coalesceText ns
coalesceText []
= []
reference :: Parser Text
reference = do
_ <- P.char '&'
r <- (Left <$> P.try finishCharRef)
<|> (Right <$> P.try finishEntityRef)
<|> (Left <$> return '&')
case r of
Left c -> do
when (not (isValidChar c)) $ fail $
"Reference is not a valid character"
return (T.singleton c)
Right nm -> case M.lookup nm predefinedRefs of
Nothing -> fail $ "Unknown entity reference: " ++ T.unpack nm
Just t -> return t
finishCharRef :: Parser Char
finishCharRef = P.char '#' *> (hexCharRef <|> decCharRef)
where
decCharRef = do
ds <- some digit
_ <- P.char ';'
let c = chr $ foldl' (\a b -> 10 * a + b) 0 ds
return c
where
digit = do
d <- P.satisfy (\c -> c >= '0' && c <= '9')
return (ord d ord '0')
hexCharRef = do
_ <- P.char 'x' <|> P.char 'X'
ds <- some digit
_ <- P.char ';'
let c = chr $ foldl' (\a b -> 16 * a + b) 0 ds
return c
where
digit = num <|> upper <|> lower
num = do
d <- P.satisfy (\c -> c >= '0' && c <= '9')
return (ord d ord '0')
upper = do
d <- P.satisfy (\c -> c >= 'A' && c <= 'F')
return (10 + ord d ord 'A')
lower = do
d <- P.satisfy (\c -> c >= 'a' && c <= 'f')
return (10 + ord d ord 'a')
finishEntityRef :: Parser Text
finishEntityRef = XML.name <* P.char ';'