Created
January 10, 2012 08:23
-
-
Save vshabanov/1587842 to your computer and use it in GitHub Desktop.
Более шустрый парсер для tagsoup (где-то раз в 200 быстрее)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{-# LANGUAGE OverloadedStrings, BangPatterns, FlexibleInstances #-} | |
-- | Тупейший (и, надеюсь, быстрый) парсер xml -> теги. | |
-- | |
-- Выдает где-то от 40МБ/сек (а где-то и 80 и 200) | |
-- | |
-- Все теги и атрибуты возвращаются в lowercase (строка меняется INPLACE, | |
-- для уменьшения числа аллокаций и фрагментации) | |
-- | |
-- После профилирования выяснилось, | |
-- что TagSoup отжирает 75% памяти и >50% времени. Более того, | |
-- оказалось у него внутри весь парсинг идет в string и только потом | |
-- трансформация в Text. Да еще и позиции обрабатываются. В общем куча | |
-- лишней работы. | |
-- | |
-- Посмотрел другие библиотеки: | |
-- * hexpat -- все проверяет, google.ru не разгребает | |
-- * libxml -- то же самое | |
-- | |
-- Помимо всего прочего, TagSoup работает с unicode и пришлось | |
-- использовать Text, хотя это нафиг не нужно. Мы можем тупо работать | |
-- со strict bytestring (все равно целиком скачиваем) и всю работу | |
-- проводить в utf-8 | |
-- | |
-- | |
module Lib.XMLParser | |
( parse, render, parseT, renderT ) | |
where | |
import Data.Bits | |
import Control.Monad | |
import Text.HTML.TagSoup (Tag(..),parseTags,renderTags) | |
import Text.HTML.TagSoup.Entity | |
--import Text.HTML.TagSoup (Tag(..)) | |
import qualified Data.ByteString as B8 | |
import qualified Data.ByteString.Char8 as B | |
import qualified Data.ByteString.Internal as B | |
import Foreign | |
import Data.String | |
-- import Downloader | |
import qualified Data.ByteString.Lazy.Char8 as BL | |
import Data.Char | |
import qualified Data.Text as T | |
import qualified Data.Text.Encoding as T | |
instance IsString Word8 where | |
fromString [c] = B.c2w c | |
type P = Ptr Word8 | |
r :: P -> Word8 | |
r ptr = B.inlinePerformIO (peek ptr) | |
{-# INLINE r #-} | |
instance Enum (Ptr Word8) where | |
succ p = p `plusPtr` 1 | |
toEnum = error "toEnum: Ptr Word8" | |
fromEnum = error "toEnum: Ptr Word8" | |
pp :: Enum a => a -> a | |
pp !x = succ x | |
mm !y = pred y | |
{-# INLINE mm #-} | |
{-# INLINE pp #-} | |
-- делаем INPLACE toLower, хотя вроде это не особо влияет, судя по | |
-- Parser.perfTest (но он и фрагментацию не показывает). | |
-- А вот проверка перед escape/unescape уменьшает отжираемую память | |
-- toLowerBS = B.map toLower | |
toLowerBS bs@(B.PS fp o l) = B.inlinePerformIO $ withForeignPtr fp $ \ p -> do | |
let go !o 0 = return bs | |
go !o l = do | |
w <- peekByteOff p o | |
pokeByteOff p o (B.c2w $ toLower $ B.w2c w) | |
go (o+1) (l-1) | |
go o l | |
parse :: B.ByteString -> [Tag B.ByteString] | |
parse s = B.inlinePerformIO $ withForeignPtr fp $ \ p -> | |
return $ | |
map fixTag $ filter (/= TagText "") $ | |
dat (p `plusPtr` offset) len 0 | |
-- [TagText s] | |
where (fp, offset, len) = B.toForeignPtr s | |
fixTag (TagOpen t a) = | |
TagOpen t (map (\(a,v) -> (toLowerBS a, unescapeHtml v)) $ | |
reverse a) | |
-- fixTag (TagText t) = TagText (unescapeHtml t) | |
fixTag t = t | |
-- По-правильному надо делать так | |
-- http://www.w3.org/TR/html5/tokenization.html | |
-- и tagsoup так пытается. Я упростил часть state-ов | |
-- (особенно comment-ы) и убрал разгребание символов | |
mkS !left !n = B.fromForeignPtr fp (offset + len - left - n) n | |
mkText unescape !left !n | |
| unescape = TagText $ unescapeHtml $ mkS left n | |
| otherwise = TagText $ mkS left n | |
-- script/style/CDATA оставляем как есть | |
script :: P -> Int -> Int -> [Tag B.ByteString] | |
script p 0 0 = [] | |
script p 0 n = [mkText False 0 n] | |
script !p !l !n | |
| nextIC p l (toLowerUpperPairs "</script>") = | |
mkText False l n : TagClose "script" | |
: dat (p `plusPtr` 9) (l - 9) 0 | |
| otherwise = script (pp p) (mm l) (pp n) | |
style :: P -> Int -> Int -> [Tag B.ByteString] | |
style p 0 0 = [] | |
style p 0 n = [mkText False 0 n] | |
style !p !l !n | |
| nextIC p l (toLowerUpperPairs "</style>") = | |
mkText False l n : TagClose "style" | |
: dat (p `plusPtr` 8) (l - 8) 0 | |
| otherwise = style (pp p) (mm l) (pp n) | |
dat :: P -> Int -> Int -> [Tag B.ByteString] | |
dat p 0 0 = [] | |
dat p 0 n = [mkText True 0 n] | |
dat !p !left !n | |
| r p == "<" = mkText True left n : tagOpen (pp p) (mm left) | |
| otherwise = dat (pp p) (mm left) (pp n) | |
tdat !t !p !l = t : case t of | |
TagOpen "script" _ -> script p l 0 | |
TagOpen "style" _ -> style p l 0 | |
_ -> dat p l 0 | |
tagOpen p 0 = dat p 0 1 | |
tagOpen !p !left | |
| r p == "!" = markupDeclOpen (pp p) (mm left) | |
| alpha (r p) || r p == "?" = tagName True (pp p) (mm left) 1 | |
| r p == "/" = closeTagOpen (pp p) (mm left) | |
| otherwise = dat p left 1 | |
closeTagOpen p 0 = dat p 0 2 | |
closeTagOpen !p !left | |
| alphaBQ (r p) = tagName False (pp p) (mm left) 1 | |
| otherwise = dat p 0 2 | |
tagName o p 0 n = [] -- ничего не выводим, если тег | |
-- закрылся раньше времени | |
tagName !o !p !left !n | |
| space (r p) = beforeAttName tag (pp p) (mm left) | |
| r p == ">" = tdat tag (pp p) (mm left) | |
| r p == "?" || r p == "/" = | |
selfClosingStartTag tag (pp p) (mm left) | |
| r p == "'" || r p == "\"" = attValue (r p) tag "" (pp p) (mm left) 0 | |
| otherwise = tagName o (pp p) (mm left) (pp n) | |
where tag | o = TagOpen (toLowerBS $ mkS left n) [] | |
| otherwise = TagClose (toLowerBS $ mkS left n) | |
markupDeclOpen p 0 = dat p 0 2 | |
markupDeclOpen !p !l | |
| alpha (r p) = tagName True (pp p) (mm l) 1 | |
| next p l "--" = commentStart (p `plusPtr` 2) (l - 2) 0 | |
| next p l "[CDATA[" = cdataSection (p `plusPtr` 7) (l - 7) 0 | |
| otherwise = dat p l 2 | |
beforeAttName t p 0 = [t] | |
beforeAttName !t !p !l | |
| space (r p) = beforeAttName t (pp p) (mm l) | |
| r p == ">" = tdat t (pp p) (mm l) | |
| r p == "?" || r p == "/" = selfClosingStartTag t (pp p) (mm l) | |
| r p == "'" || r p == "\"" = attValue (r p) t "" (pp p) (mm l) 0 | |
| otherwise = attName t (pp p) (mm l) 1 | |
attName t p 0 n = [t] | |
attName !t !p !l !n | |
| space (r p) = afterAttName t (mkS l n) (pp p) (mm l) | |
| r p == ">" = tdat (addAttr t (mkS l n) "") (pp p) (mm l) | |
| r p == "?" || r p == "/" = | |
selfClosingStartTag (addAttr t (mkS l n) "") (pp p) (mm l) | |
| r p == "=" = beforeAttValue t (mkS l n) (pp p) (mm l) | |
| otherwise = attName t (pp p) (mm l) (pp n) | |
afterAttName t a p 0 = [t] | |
afterAttName !t !a !p !l | |
| space (r p) = afterAttName t a (pp p) (mm l) | |
| r p == "=" = beforeAttValue t a (pp p) (mm l) | |
| r p == ">" = tdat (addAttr t a "") (pp p) (mm l) | |
| r p == "?" || r p == "/" = | |
selfClosingStartTag (addAttr t a "") (pp p) (mm l) | |
| r p == "'" || r p == "\"" = attValue (r p) t a (pp p) (mm l) 0 | |
| otherwise = attName (addAttr t a "") (pp p) (mm l) 1 | |
beforeAttValue t a p 0 = [t] | |
beforeAttValue !t !a !p !l | |
| space (r p) = beforeAttValue t a (pp p) (mm l) | |
| r p == ">" = tdat (addAttr t a "") (pp p) (mm l) | |
-- | (r p == "?" || r p == "/") && | |
-- (l == 0 || (r (pp p) == ">")) = | |
-- selfClosingStartTag (addAttr t a "") (pp p) (mm l) | |
| r p == "'" || r p == "\"" = attValue (r p) t a (pp p) (mm l) 0 | |
| otherwise = attValueUnquoted t a (pp p) (mm l) 1 | |
attValue end t a p 0 n = [t] | |
attValue !end !t !a !p !l !n | |
| r p == end = beforeAttName (addAttr t a (mkS l n)) (pp p) (mm l) | |
| otherwise = attValue end t a (pp p) (mm l) (pp n) | |
attValueUnquoted t a p 0 n = [t] | |
attValueUnquoted !t !a !p !l !n | |
| space (r p) = beforeAttName (addAttr t a (mkS l n)) (pp p) (mm l) | |
| r p == ">" -- || r p == "/" | |
= beforeAttName (addAttr t a (mkS l n)) p l | |
| otherwise = attValueUnquoted t a (pp p) (mm l) (pp n) | |
commentStart p 0 n = [] -- комменты никуда не выводим | |
commentStart !p !l !n | |
| next p l "-->" = dat (p `plusPtr` 3) (l - 3) 0 | |
| otherwise = commentStart (pp p) (mm l) (pp n) | |
cdataSection p 0 n = dat p 0 n | |
cdataSection !p !l !n | |
| next p l "]]>" = mkText False l n : dat (p `plusPtr` 3) (l-3) 0 | |
| otherwise = cdataSection (pp p) (mm l) (pp n) | |
selfClosingStartTag t p 0 = closeTag t [] | |
selfClosingStartTag !t !p !l | |
| r p == ">" = closeTag t $ dat (pp p) (mm l) 0 | |
| otherwise = beforeAttName t p l | |
addAttr (TagOpen !tn !ta) !a !v = TagOpen tn ((a, v):ta) | |
addAttr t _ _ = t | |
closeTag !t@(TagOpen !tn _) !r = t : TagClose tn : r | |
closeTag !t !r = t : r | |
space 0x20 = True | |
space !n = n >= 9 && n <= 13 -- \t\n\v\f\r | |
alpha !c = (c >= "a" && c <= "z") || (c >= "A" && c <= "Z") | |
alphaBQ !c = alpha c || c == "?" || c == "!" | |
next p l [] = True | |
next p 0 _ = False | |
next !p !l (x:xs) = (r p == B.c2w x) && next (pp p) (mm l) xs | |
toLowerUpperPairs s = [(B.c2w $ toLower x, B.c2w $ toUpper x) | x <- s] | |
nextIC p l [] = True | |
nextIC p 0 _ = False | |
nextIC !p !l ((a,b):xs) = (r p == a || r p == b) && nextIC (pp p) (mm l) xs | |
{-# INLINE parse #-} | |
-- ord maxBound = 1114111 = 0x10FFFF =   | |
-- максимум 8 символов между & и ; | |
-- буквенные обозначения также не больше 8 символов | |
-- т.е. искать ';' после '&' можно в пределах ближайших 9 символов | |
unescapeHtml :: B.ByteString -> B.ByteString | |
unescapeHtml s | |
| not (B8.elem "&" s) = s | |
| B.length s' == 0 = s -- дабы лишняя аллокация сразу прибилась | |
| otherwise = s' | |
where s' = B.inlinePerformIO $ withForeignPtr fp $ \ src -> | |
B.createAndTrim (len*2) $ \ dst -> do | |
(ch, dst') <- go (src `plusPtr` offset) dst len False | |
return $ if ch then dst' `minusPtr` dst else 0 | |
(fp, offset, len) = B.toForeignPtr s | |
go :: P -> P -> Int -> Bool -> IO (Bool, P) | |
go !s !d !0 !c = return (c, d) | |
go !s !d !l !c | |
-- | r s .&. (0x8 + 0x4) == 0x8 | |
| r s /= "&" = | |
poke d (r s) >> go (pp s) (pp d) (mm l) c | |
| otherwise = entity (pp s) d (mm l) 9 [] c | |
-- add !c !s !d !n !l = | |
-- poke d c >> go (s `plusPtr` n) (pp d) (l - n) True | |
entity !s !d !l !n !acc !c | |
| n == 0 || l == 0 = | |
poke d "&" >> go (s `plusPtr` (n-9)) (pp d) (l+9-n) c | |
| r s /= ";" = | |
entity (pp s) d (mm l) (mm n) (B.w2c (r s) : acc) c | |
| otherwise = case lookupEntity $ reverse acc of | |
Nothing -> | |
poke d "&" >> go (s `plusPtr` (n-9)) (pp d) (l+9-n) c | |
Just i -> do | |
let put !d [] = go (pp s) d (mm l) True | |
put !d (x:xs) = | |
poke d x >> put (pp d) xs | |
put d $ encodeChar i | |
encodeChar :: Char -> [Word8] | |
encodeChar = map fromIntegral . go . ord | |
where | |
go oc | |
| oc <= 0x7f = [oc] | |
| oc <= 0x7ff = [ 0xc0 + (oc `shiftR` 6) | |
, 0x80 + oc .&. 0x3f | |
] | |
| oc <= 0xffff = [ 0xe0 + (oc `shiftR` 12) | |
, 0x80 + ((oc `shiftR` 6) .&. 0x3f) | |
, 0x80 + oc .&. 0x3f | |
] | |
| otherwise = [ 0xf0 + (oc `shiftR` 18) | |
, 0x80 + ((oc `shiftR` 12) .&. 0x3f) | |
, 0x80 + ((oc `shiftR` 6) .&. 0x3f) | |
, 0x80 + oc .&. 0x3f | |
] | |
bst = T.decodeUtf8With (\ _ -> fmap B.w2c) | |
parseT = map textTag . parse | |
textTag (TagOpen t a) = TagOpen (bst t) [(bst n, bst v) | (n,v) <- a] | |
textTag (TagClose t) = TagClose (bst t) | |
textTag (TagText t) = TagText (bst t) | |
textTag (TagComment t) = TagComment (bst t) | |
textTag (TagWarning t) = TagWarning (bst t) | |
textTag (TagPosition r c) = TagPosition r c | |
escapeHtmlT :: T.Text -> T.Text | |
escapeHtmlT s | |
| not $ T.any (\ c -> c=='&'||c=='<'||c=='>'||c=='\''||c=='"') s = s | |
| otherwise = bst $ escapeHtml $ T.encodeUtf8 s | |
escapeHtml :: B.ByteString -> B.ByteString | |
escapeHtml s | |
| Nothing <- B8.find (\ c -> c=="&"||c=="<"||c==">"||c=="'"||c=="\"") s = s | |
| B.length s' == 0 = s | |
| otherwise = s' | |
where s' = B.inlinePerformIO $ withForeignPtr fp $ \ src -> | |
B.createAndTrim (len*6) $ \ dst -> do | |
(ch, dst') <- go (src `plusPtr` offset) dst len False | |
return $ if ch then dst' `minusPtr` dst else 0 | |
(fp, offset, len) = B.toForeignPtr s | |
go :: P -> P -> Int -> Bool -> IO (Bool, P) | |
go !s !d !0 !c = return (c, d) | |
go !s !d !l !c | |
| r s == "&" = add s d l $ map B.c2w "&" | |
| r s == "<" = add s d l $ map B.c2w "<" | |
| r s == ">" = add s d l $ map B.c2w ">" | |
| r s == "'" = add s d l $ map B.c2w "'" | |
| r s == "\"" = add s d l $ map B.c2w """ | |
| otherwise = | |
poke d (r s) >> go (pp s) (pp d) (mm l) c | |
add !s !d !l [] = go (pp s) d (mm l) True | |
add !s !d !l (x:xs) = do | |
poke d x | |
add s (pp d) l xs | |
{-# INLINE unescapeHtml #-} | |
-- nTest f n u = do | |
-- c <- urlGetContents u | |
-- forM_ [1..n] $ \ i -> print $ length $ f $ B.concat [B.pack (show i), c] | |
-- test u = urlGetContents u >>= print . length . parse | |
-- test2 u = urlGetContents u >>= print . length . parseTags | |
render = render' escapeHtml B.concat | |
renderT = render' escapeHtmlT T.concat | |
render' escape concat = go [] | |
where go acc [] = concat $ reverse acc | |
go acc (TagOpen t as : TagClose tc : ts) | t == tc = | |
go ("/>" : renderAtts (reverse as) (t : "<" : acc)) ts | |
go acc (TagOpen t as : ts) = | |
go (">" : renderAtts (reverse as) (t : "<" : acc)) ts | |
go acc (TagClose t : ts) = | |
go (">" : t : "</" : acc) ts | |
go acc (TagText t : ts) = | |
go (escape t : acc) ts | |
renderAtts [] r = r | |
renderAtts ((a,v):as) r = | |
"\"" : escape v : "=\"" : a : " " : renderAtts as r | |
{-# INLINE render #-} | |
-- -- | Использует тот факт, что внутри у нас одна строка и теги подряд | |
-- -- можно не рендерить, а тупо брать смещения. | |
-- renderSeqTags [] = "" | |
-- renderSeqTags (t:ts) = | |
-- where tagS (TagOpen |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment