"""
Working of off feedparser 6.0.8.
To do:
* [x] don't read entire file in memory
* [x] allow fallback to reading entire file in memory
* allow using alternative (defusedxml, lxml) sax handlers
* [x] fix content order bug
* [~] fix loose parsing
* still need to document that fallback only works for seekable streams
* full-featured parse()
"""
import io
import codecs
import xml.sax
import feedparser as fp
from feedparser.encodings import convert_to_utf8
from feedparser.sanitizer import replace_doctype
from feedparser.urls import make_safe_absolute_uri
from feedparser.util import FeedParserDict
from prefixfilewrapper import PrefixFileWrapper, StreamFactory, MissingEncoding

def convert_file_prefix_to_utf8(http_headers, file, result):
    # based on https://gist.github.com/lemon24/dbe0f5f0cad3be3e1646c61cb026061d
    prefix_len = 2**12
    prefix = file.read(prefix_len)

    # we call convert_to_utf8() up to 4 times,
    # to make sure we eventually land on a code point boundary
    for _ in range(4):
        fake_result = {}
        converted_prefix = convert_to_utf8(http_headers, prefix, fake_result)
        if not fake_result.get('bozo'):
            break

        # check if the prefix we have is actually the whole thing
        if len(prefix) < prefix_len:
            break

        # otherwise, extend the prefix by one byte and try again
        byte = file.read(1)
        if not byte:
            break
        prefix += byte
        prefix_len += 1

    result.update(fake_result)
    return converted_prefix
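
# why up to 4 times: a UTF-8 code point is at most 4 bytes long, so a prefix
# cut mid code point needs at most 3 more bytes to reach a boundary; a minimal
# illustration (not part of the original gist; kept in a string, like the
# other inactive examples in this gist, so it doesn't run on import):
"""
data = 'naïve'.encode('utf-8')         # ï is 2 bytes long
result = {}
convert_to_utf8({}, data[:3], result)  # prefix ends mid code point
print(result.get('bozo'))              # expect True (utf-8 decoding failed)
result = {}
convert_to_utf8({}, data[:4], result)  # one byte later, on a boundary
print(result.get('bozo'))              # expect no bozo
"""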
OPTIMISTIC_ENCODING_DETECTION = True

def convert_file_to_utf8(http_headers, file, result, optimistic_encoding_detection):
    if optimistic_encoding_detection:
        prefix = convert_file_prefix_to_utf8(http_headers, file, result)
        result['version'], prefix, entities = replace_doctype(prefix)
        file = PrefixFileWrapper(prefix, file)
    else:
        # this shouldn't increase memory usage if file is BytesIO,
        # since BytesIO does copy-on-write; https://bugs.python.org/issue22003
        data = convert_to_utf8(http_headers, file.read(), result)
        result['version'], data, entities = replace_doctype(data)
        # still need to be able to reset() to the "beginning"
        file = PrefixFileWrapper(b'', io.BytesIO(data))
    return StreamFactory(file, result.get('encoding')), entities
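
# an illustrative sketch of the two modes above (hypothetical input data;
# inactive example, following this gist's convention):
"""
data = b'<?xml version="1.0" encoding="utf-8"?><rss version="2.0"/>'
result = {}
factory, entities = convert_file_to_utf8({}, io.BytesIO(data), result, True)
print(result.get('encoding'))  # utf-8, detected from the first few KiB only
result = {}
factory, entities = convert_file_to_utf8({}, io.BytesIO(data), result, False)
print(result.get('encoding'))  # same, but the whole file was read into memory
"""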

def parse(
    file,
    response_headers=None,
    resolve_relative_uris=None,
    sanitize_html=None,
    optimistic_encoding_detection=None,
):
    # similar to the original code
    if sanitize_html is None:
        sanitize_html = fp.SANITIZE_HTML
    if resolve_relative_uris is None:
        resolve_relative_uris = fp.RESOLVE_RELATIVE_URIS
    if optimistic_encoding_detection is None:
        optimistic_encoding_detection = OPTIMISTIC_ENCODING_DETECTION

    result = FeedParserDict(
        bozo=False,
        entries=[],
        feed=FeedParserDict(),
        headers={},
    )
    result['headers'].update(response_headers or {})

    original_file = file
    stream_factory, entities = convert_file_to_utf8(
        result['headers'], file, result, optimistic_encoding_detection
    )
    # at this point, if the original file was seekable,
    # file.reset() will seek the original file to its initial tell();
    # also, file.close() will be ignored
    # (because the sax parser closes the file when done, and we don't want that)
    # TODO: when implementing parse() for real, if the file object is not from the user, close it

    # similar to the original code
    use_strict_parser = bool(result['encoding'])

    contentloc = result['headers'].get('content-location', '')
    href = result.get('href', '')
    baseuri = make_safe_absolute_uri(href, contentloc) or make_safe_absolute_uri(contentloc) or href

    baselang = result['headers'].get('content-language', None)
    if isinstance(baselang, bytes):
        baselang = baselang.decode('utf-8', 'ignore')

    if not fp.api._XML_AVAILABLE:
        use_strict_parser = False

    if use_strict_parser:
        feedparser = StrictFeedParser(baseuri, baselang, 'utf-8')
        feedparser.resolve_relative_uris = resolve_relative_uris
        feedparser.sanitize_html = sanitize_html
        saxparser = xml.sax.make_parser(fp.api.PREFERRED_XML_PARSERS)
        saxparser.setFeature(xml.sax.handler.feature_namespaces, 1)
        try:
            # don't process external general entities, if the parser allows it
            saxparser.setFeature(xml.sax.handler.feature_external_ges, 0)
        except xml.sax.SAXNotSupportedException:
            pass
        saxparser.setContentHandler(feedparser)
        saxparser.setErrorHandler(feedparser)

        source = xml.sax.xmlreader.InputSource()
        # if an encoding was detected, decode the file on the fly;
        # otherwise, pass it as-is and let the SAX parser deal with it
        try:
            source.setCharacterStream(stream_factory.get_text_file())
        except MissingEncoding:
            source.setByteStream(stream_factory.get_binary_file())

        try:
            saxparser.parse(source)
        except xml.sax.SAXException as e:
            result['bozo'] = 1
            result['bozo_exception'] = feedparser.exc or e
            use_strict_parser = False

    if not use_strict_parser:
        # note: falling back to the loose parser only works with seekable files;
        # getting a second stream resets the first one, and otherwise raises
        # "io.UnsupportedOperation: underlying stream is not seekable"

        # decode the file on the fly;
        # if an encoding was detected, use it;
        # otherwise, assume utf-8 and do your best
        data = stream_factory.get_text_file('utf-8', 'replace').read()

        # similar to the original code
        feedparser = LooseFeedParser(baseuri, baselang, 'utf-8', entities)
        feedparser.resolve_relative_uris = resolve_relative_uris
        feedparser.sanitize_html = sanitize_html
        feedparser.feed(data)

    result['feed'] = feedparser.feeddata
    result['entries'] = feedparser.entries
    result['version'] = result['version'] or feedparser.version
    result['namespaces'] = feedparser.namespaces_in_use
    return result
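
# an illustrative sketch of the seekable-stream limitation noted above:
# a feed that fails strict parsing forces a second stream, which needs a
# reset() of the underlying file (inactive example, not in the original gist):
"""
class NotSeekable(io.BytesIO):
    def tell(self): raise io.UnsupportedOperation
    def seek(self, *args): raise io.UnsupportedOperation

parse(io.BytesIO(b'<rss>not well-formed'))   # falls back to the loose parser
parse(NotSeekable(b'<rss>not well-formed'))  # raises io.UnsupportedOperation
"""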

class _SummaryMixin:
    # for Atom, always treat summary as summary, never as content
    # (the "fix content order bug" item in the to-do list above)
    def _start_summary(self, attrs_d):
        if not self.version.startswith('atom'):
            return super()._start_summary(attrs_d)
        self._summaryKey = 'summary'
        self.push_content(self._summaryKey, attrs_d, 'text/plain', 1)


class StrictFeedParser(_SummaryMixin, fp.api.StrictFeedParser): pass
class LooseFeedParser(_SummaryMixin, fp.api.LooseFeedParser): pass
"""
# after passing file as-is to source (no encoding handling, no loose parsing)
error: _feeds/https-www-reddit-com-r-oilshell-rss.rss: loose parser not implemented
better 11.6 28
feedparser 13.9 56
noop 0.0 18
# after we convert_to_utf8() only the prefix and avoid reading the whole file
error: _feeds/https-www-reddit-com-r-oilshell-rss.rss: loose parser not implemented
better 10.1 32
feedparser 10.6 60
noop 0.0 19
better results are the same as for feedparser
# after PrefixFileWrapper and optimistic_encoding_detection
error: _feeds/https-www-reddit-com-r-oilshell-rss.rss: 'PrefixFileWrapper' object has no attribute 'seek'
better 10.1 31
feedparser 10.6 60
noop 0.0 19
# after StreamFactory
better 10.9 33.2
better_bytes 10.4 33.9
feedparser 10.8 60.6
noop 0.0 20.0
"""
if __name__ == "__main__":
from textwrap import dedent
from pprint import pprint, pformat
import sys, feedparser, difflib
lines = dedent("""\
<?xml version="1.0" encoding="utf-8"?>
<?xml version="1.0" encoding="UTF-8"?>
<?xml version="1.0" encoding="UTF-8" ?>
<?xml version="1.0" encoding="utf-8" standalone="yes" ?>
<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom">
<?xml version="1.0"?>
<rss version='2.0'>
""").splitlines()
"""
for line in lines:
line = line.encode('gbk')
result = {}
headers = {'content-type': 'application/xml; charset=ms932'}
data = convert_to_utf8(headers, line, result)
print(line, data, result, sep='\n', end='\n\n')
"""
"""
for path in sys.stdin:
path = path.rstrip()
with open(path, 'rb') as f:
original = feedparser.parse(f)
f.seek(0)
try:
better = parse(f)
except NotImplementedError:
continue
if original != better:
content_equal = []
for eo, eb in zip(original.entries, better.entries):
eoa = ([eo.summary] if eo.summary else []) + [c.value for c in eo.content]
eba = ([eb.summary] if eb.summary else []) + [c.value for c in eb.content]
content_equal.append(set(eoa) == set(eba))
if all(content_equal):
continue
print('===', path)
print(*difflib.ndiff(pformat(original).splitlines(), pformat(better).splitlines()), sep='\n')
# the only one that's different is _feeds/https-sobolevn-me-feed-xml.atom
# but i checked it by hand and it looks OK (it has both content and summary)
"""
with open('index.xml', 'rb') as f:
pprint(parse(f))

# benchmark script (separate file; original name not shown)
import sys, time, resource, io
import feedparser, atoma, betterfeedparser


def feedparser_parse(path, file):
    return feedparser.parse(
        file,
        resolve_relative_uris=False,
        sanitize_html=False,
    )

def better_parse(path, file):
    return betterfeedparser.parse(
        file,
        resolve_relative_uris=False,
        sanitize_html=False,
    )

def better_bytes_parse(path, file):
    # like better_parse, but reads the whole file into memory first
    return betterfeedparser.parse(
        io.BytesIO(file.read()),
        resolve_relative_uris=False,
        sanitize_html=False,
    )

def atoma_parse(path, file):
    return getattr(atoma, f'parse_{path.rpartition(".")[2]}_file')(file)

def noop_parse(*_): pass


impl = sys.argv[1]
parse = locals()[f'{impl}_parse']

timings = 0
for line in sys.stdin:
    path = line.rstrip()
    with open(path, 'rb') as file:
        try:
            start = time.perf_counter()
            parse(path, file)
            end = time.perf_counter()
            timings += end - start
        except Exception as e:
            print(f'error: {path}: {e}', file=sys.stderr)

# ru_maxrss is in bytes on macOS and in kibibytes on Linux; convert to MiB
maxrss = (
    resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    / 2 ** (20 if sys.platform == 'darwin' else 10)
)
print(impl, round(timings, 1), round(maxrss, 1))

# prefixfilewrapper.py
import codecs
import io


class PrefixFileWrapper:
    """
    >>> import io
    >>> file = io.StringIO('abcdef')
    >>> file.read(2)
    'ab'
    >>> wrapped = PrefixFileWrapper(file.read(2).upper(), file)
    >>> wrapped.read()
    'CDef'
    >>> wrapped.reset()
    >>> wrapped.read()
    'CDef'
    >>>
    """

    def __init__(self, prefix, file):
        self.prefix = prefix
        self.file = file
        try:
            self.file_initial_offset = file.tell()
        except OSError:
            self.file_initial_offset = None
        self.offset = 0

    def reset(self):
        # raises io.UnsupportedOperation if the underlying stream is not seekable
        self.file.seek(self.file_initial_offset)
        self.offset = 0

    def read(self, size=-1):
        # empty read, to get an empty bytes or str of the same type as the file
        buffer = self.file.read(0)

        if self.offset < len(self.prefix):
            if size < 0:
                # only the part of the prefix not read yet
                chunk = self.prefix[self.offset:]
            else:
                chunk = self.prefix[self.offset : self.offset + size]
                size -= len(chunk)
            buffer += chunk
            self.offset += len(chunk)

        while True:
            chunk = self.file.read(size)
            if not chunk:
                break
            buffer += chunk
            self.offset += len(chunk)
            if size <= 0:
                break
            size -= len(chunk)

        return buffer

    def close(self):
        # don't close the underlying file
        # (the sax parser closes the file when done, and we don't want that)
        pass


class MissingEncoding(io.UnsupportedOperation):
    pass


class StreamFactory:

    def __init__(self, file, encoding=None):
        self.file = file
        self.encoding = encoding
        self.should_reset = False

    def get_text_file(self, fallback_encoding=None, errors='strict'):
        encoding = self.encoding or fallback_encoding
        if encoding is None:
            raise MissingEncoding("cannot create text stream without encoding")
        reader_factory = codecs.getreader(encoding)
        reader = reader_factory(self.file, errors)
        self.reset()
        return reader

    def get_binary_file(self):
        self.reset()
        return self.file

    def reset(self):
        # the first stream doesn't need a reset,
        # since the underlying file is already at the expected offset
        if self.should_reset:
            self.file.reset()
        self.should_reset = True
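
# an illustrative sketch of the reset-on-second-use behavior above
# (hypothetical data; inactive example, not in the original gist):
"""
factory = StreamFactory(PrefixFileWrapper(b'', io.BytesIO(b'<feed/>')), 'utf-8')
text = factory.get_text_file()                    # first stream: no reset needed
assert text.read() == '<feed/>'
text = factory.get_text_file('utf-8', 'replace')  # later streams reset() first
assert text.read() == '<feed/>'
binary = factory.get_binary_file()                # also resets; returns the raw file
assert binary.read() == b'<feed/>'
"""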

# tests for prefixfilewrapper.py (separate file; original name not shown)
import io

from prefixfilewrapper import PrefixFileWrapper

# commented out so it doesn't mess with memory measurements
#import pytest


def make_file_in_the_middle(data):
    # a file with stuff before its initial offset
    prefix = b'zzzzz'
    rv = io.BytesIO(prefix + data)
    rv.seek(len(prefix))
    return rv

class make_file_one_by_one(io.BytesIO):
    # a file that returns at most one byte per read() call
    def read(self, size=-1):
        if size <= 0:
            return super().read(size)
        return super().read(1)


"""
@pytest.mark.parametrize('make_file', [
    io.BytesIO,
    make_file_in_the_middle,
    make_file_one_by_one,
])
"""
def test_pfw(make_file):
    f = PrefixFileWrapper(b'abc', make_file(b'def'))

    assert f.read() == b'abcdef'
    assert f.read() == b''
    f.reset()

    assert f.read(2) == b'ab'
    assert f.read(2) == b'cd'
    assert f.read(2) == b'ef'
    assert f.read(2) == b''
    assert f.read() == b''
    f.reset()

    assert f.read(3) == b'abc'
    assert f.read(3) == b'def'
    assert f.read(3) == b''
    assert f.read() == b''
    f.reset()

    assert f.read(0) == b''
    assert f.read() == b'abcdef'

    f.reset()
    f.reset()
    assert f.read() == b'abcdef'


class make_file_not_seekable(io.BytesIO):
    def tell(self):
        raise io.UnsupportedOperation

    def seek(self, *args):
        raise io.UnsupportedOperation


def test_pfw_not_seekable():
    f = PrefixFileWrapper(b'abc', make_file_not_seekable(b'def'))
    assert f.read() == b'abcdef'
    assert f.read() == b''
    with pytest.raises(io.UnsupportedOperation):
        f.reset()
    assert f.read() == b''

    f = PrefixFileWrapper(b'abc', make_file_not_seekable(b'def'))
    assert f.read(3) == b'abc'
    with pytest.raises(io.UnsupportedOperation):
        f.reset()
    assert f.read() == b'def'


def test_pfw_no_prefix():
    f = PrefixFileWrapper(b'', io.BytesIO(b'abc'))
    assert f.read(1) == b'a'
    assert f.read() == b'bc'
    f.reset()
    assert f.read() == b'abc'