betterfeedparser.py for https://github.com/lemon24/reader/issues/265 and https://github.com/kurtmckee/feedparser/issues/296
""" | |
Working of off feedparser 6.0.8. | |
To do: | |
* [x] don't read entire file in memory | |
* [x] allow fallback to reading entire file in memory | |
* allow using alternative (defusedxml, lxml) sax handlers | |
* [x] fix content order bug | |
* [~] fix loose parsing | |
* still need to document that fallback only works for seekable streams | |
* full-featured parse() | |
""" | |
import io
import codecs
import xml.sax

import feedparser as fp
from feedparser.encodings import convert_to_utf8
from feedparser.sanitizer import replace_doctype
from feedparser.urls import make_safe_absolute_uri
from feedparser.util import FeedParserDict

from prefixfilewrapper import PrefixFileWrapper, StreamFactory, MissingEncoding
def convert_file_prefix_to_utf8(http_headers, file, result):
    # based on https://gist.github.com/lemon24/dbe0f5f0cad3be3e1646c61cb026061d

    prefix_len = 2**12
    prefix = file.read(prefix_len)

    # we call convert_to_utf8() up to 4 times,
    # to make sure we eventually land on a code point boundary
    for _ in range(4):
        fake_result = {}
        converted_prefix = convert_to_utf8(http_headers, prefix, fake_result)

        if not fake_result.get('bozo'):
            break

        # check if the prefix we have is actually the whole thing
        if len(prefix) < prefix_len:
            break

        byte = file.read(1)
        if not byte:
            break

        prefix += byte
        prefix_len += 1

    result.update(fake_result)
    return converted_prefix
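
# Why the retry loop needs to land on a code point boundary: a fixed-size
# byte prefix can end mid-character, which makes decoding (and with it,
# convert_to_utf8()) trip over the truncated sequence. For example:
#
#     >>> 'naïve'.encode('utf-8')[:3].decode('utf-8')
#     Traceback (most recent call last):
#       ...
#     UnicodeDecodeError: 'utf-8' codec can't decode byte 0xc3 in position 2: unexpected end of data
#     >>> 'naïve'.encode('utf-8')[:4].decode('utf-8')
#     'naï'
#
# UTF-8 code points are at most 4 bytes long, hence at most 4 extra bytes.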
OPTIMISTIC_ENCODING_DETECTION = True


def convert_file_to_utf8(http_headers, file, result, optimistic_encoding_detection):
    if optimistic_encoding_detection:
        prefix = convert_file_prefix_to_utf8(http_headers, file, result)
        result['version'], prefix, entities = replace_doctype(prefix)
        file = PrefixFileWrapper(prefix, file)
    else:
        # this shouldn't increase memory usage if file is BytesIO,
        # since BytesIO does copy-on-write; https://bugs.python.org/issue22003
        data = convert_to_utf8(http_headers, file.read(), result)
        result['version'], data, entities = replace_doctype(data)
        # still need to be able to reset() to the "beginning"
        file = PrefixFileWrapper(b'', io.BytesIO(data))

    return StreamFactory(file, result.get('encoding')), entities
def parse(
    file,
    response_headers=None,
    resolve_relative_uris=None,
    sanitize_html=None,
    optimistic_encoding_detection=None,
):
    # similar to the original code
    if sanitize_html is None:
        sanitize_html = fp.SANITIZE_HTML
    if resolve_relative_uris is None:
        resolve_relative_uris = fp.RESOLVE_RELATIVE_URIS
    if optimistic_encoding_detection is None:
        optimistic_encoding_detection = OPTIMISTIC_ENCODING_DETECTION

    result = FeedParserDict(
        bozo=False,
        entries=[],
        feed=FeedParserDict(),
        headers={},
    )
    result['headers'].update(response_headers or {})

    original_file = file
    stream_factory, entities = convert_file_to_utf8(
        result['headers'], file, result, optimistic_encoding_detection
    )
    # at this point, if the original file was seekable,
    # file.reset() will seek the original file to its initial tell();
    # also, file.close() will be ignored
    # (because the sax parser closes the file when done, and we don't want that)
    # TODO: when implementing parse() for real, if the file object is not from the user, close it

    # similar to the original code
    use_strict_parser = result['encoding'] and True or False

    contentloc = result['headers'].get('content-location', '')
    href = result.get('href', '')
    baseuri = make_safe_absolute_uri(href, contentloc) or make_safe_absolute_uri(contentloc) or href

    baselang = result['headers'].get('content-language', None)
    if isinstance(baselang, bytes) and baselang is not None:
        baselang = baselang.decode('utf-8', 'ignore')

    if not fp.api._XML_AVAILABLE:
        use_strict_parser = 0

    if use_strict_parser:
        feedparser = StrictFeedParser(baseuri, baselang, 'utf-8')
        feedparser.resolve_relative_uris = resolve_relative_uris
        feedparser.sanitize_html = sanitize_html

        saxparser = xml.sax.make_parser(fp.api.PREFERRED_XML_PARSERS)
        saxparser.setFeature(xml.sax.handler.feature_namespaces, 1)
        try:
            saxparser.setFeature(xml.sax.handler.feature_external_ges, 0)
        except xml.sax.SAXNotSupportedException:
            pass
        saxparser.setContentHandler(feedparser)
        saxparser.setErrorHandler(feedparser)

        source = xml.sax.xmlreader.InputSource()

        # if an encoding was detected, decode the file on the fly;
        # otherwise, pass it as-is and let the SAX parser deal with it
        try:
            source.setCharacterStream(stream_factory.get_text_file())
        except MissingEncoding:
            source.setByteStream(stream_factory.get_binary_file())

        try:
            saxparser.parse(source)
        except xml.sax.SAXException as e:
            result['bozo'] = 1
            result['bozo_exception'] = feedparser.exc or e
            use_strict_parser = 0

    if not use_strict_parser:
        # falling back to the loose parser only works with seekable files:
        #   io.UnsupportedOperation: underlying stream is not seekable

        # decode the file on the fly;
        # if an encoding was detected, use it;
        # otherwise assume utf-8 and do your best
        data = stream_factory.get_text_file('utf-8', 'replace').read()

        # similar to the original code
        feedparser = LooseFeedParser(baseuri, baselang, 'utf-8', entities)
        feedparser.resolve_relative_uris = resolve_relative_uris
        feedparser.sanitize_html = sanitize_html
        feedparser.feed(data)

    result['feed'] = feedparser.feeddata
    result['entries'] = feedparser.entries
    result['version'] = result['version'] or feedparser.version
    result['namespaces'] = feedparser.namespaces_in_use

    return result
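
# Addresses the "fix content order bug" to-do item: for Atom feeds, <summary>
# is always stored under 'summary' instead of going through upstream's
# _start_summary(), which (depending on what was parsed before it) can route
# a summary through the content handling and scramble entry content
# (see kurtmckee/feedparser#296).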
class _SummaryMixin:
    def _start_summary(self, attrs_d):
        if not self.version.startswith('atom'):
            return super()._start_summary(attrs_d)
        self._summaryKey = 'summary'
        self.push_content(self._summaryKey, attrs_d, 'text/plain', 1)


class StrictFeedParser(_SummaryMixin, fp.api.StrictFeedParser): pass
class LooseFeedParser(_SummaryMixin, fp.api.LooseFeedParser): pass
""" | |
# after passing file as-is to source (no encoding handling, no loose parsing) | |
error: _feeds/https-www-reddit-com-r-oilshell-rss.rss: loose parser not implemented | |
better 11.6 28 | |
feedparser 13.9 56 | |
noop 0.0 18 | |
# after we convert_to_utf8() only the prefix and avoid reading the whole file | |
error: _feeds/https-www-reddit-com-r-oilshell-rss.rss: loose parser not implemented | |
better 10.1 32 | |
feedparser 10.6 60 | |
noop 0.0 19 | |
better results are the same as for feedparser | |
# after PrefixFileWrapper and optimistic_encoding_detection | |
error: _feeds/https-www-reddit-com-r-oilshell-rss.rss: 'PrefixFileWrapper' object has no attribute 'seek' | |
better 10.1 31 | |
feedparser 10.6 60 | |
noop 0.0 19 | |
# after StreamFactory | |
better 10.9 33.2 | |
better_bytes 10.4 33.9 | |
feedparser 10.8 60.6 | |
noop 0.0 20.0 | |
""" | |
if __name__ == "__main__": | |
from textwrap import dedent | |
from pprint import pprint, pformat | |
import sys, feedparser, difflib | |
lines = dedent("""\ | |
<?xml version="1.0" encoding="utf-8"?> | |
<?xml version="1.0" encoding="UTF-8"?> | |
<?xml version="1.0" encoding="UTF-8" ?> | |
<?xml version="1.0" encoding="utf-8" standalone="yes" ?> | |
<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom"> | |
<?xml version="1.0"?> | |
<rss version='2.0'> | |
""").splitlines() | |
""" | |
for line in lines: | |
line = line.encode('gbk') | |
result = {} | |
headers = {'content-type': 'application/xml; charset=ms932'} | |
data = convert_to_utf8(headers, line, result) | |
print(line, data, result, sep='\n', end='\n\n') | |
""" | |
""" | |
for path in sys.stdin: | |
path = path.rstrip() | |
with open(path, 'rb') as f: | |
original = feedparser.parse(f) | |
f.seek(0) | |
try: | |
better = parse(f) | |
except NotImplementedError: | |
continue | |
if original != better: | |
content_equal = [] | |
for eo, eb in zip(original.entries, better.entries): | |
eoa = ([eo.summary] if eo.summary else []) + [c.value for c in eo.content] | |
eba = ([eb.summary] if eb.summary else []) + [c.value for c in eb.content] | |
content_equal.append(set(eoa) == set(eba)) | |
if all(content_equal): | |
continue | |
print('===', path) | |
print(*difflib.ndiff(pformat(original).splitlines(), pformat(better).splitlines()), sep='\n') | |
# the only one that's different is _feeds/https-sobolevn-me-feed-xml.atom | |
# but i checked it by hand and it looks OK (it has both content and summary) | |
""" | |
with open('index.xml', 'rb') as f: | |
pprint(parse(f)) | |
import sys, time, resource, io
import feedparser, atoma, betterfeedparser


def feedparser_parse(path, file):
    return feedparser.parse(
        file,
        resolve_relative_uris=False,
        sanitize_html=False,
    )

def better_parse(path, file):
    return betterfeedparser.parse(
        file,
        resolve_relative_uris=False,
        sanitize_html=False,
    )

def better_bytes_parse(path, file):
    return betterfeedparser.parse(
        io.BytesIO(file.read()),
        resolve_relative_uris=False,
        sanitize_html=False,
    )

def atoma_parse(path, file):
    return getattr(atoma, f'parse_{path.rpartition(".")[2]}_file')(file)

def noop_parse(*_): pass
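
# Usage note: the implementation name comes from argv[1] (one of the *_parse
# functions above) and feed paths come from stdin, e.g. (bench.py is an
# assumed file name, save it as whatever you like):
#
#     find _feeds -type f | python bench.py better
#
# One line is printed at the end: "<impl> <total parse time (s)> <max RSS (MiB)>".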
impl = sys.argv[1]
parse = locals()[f'{impl}_parse']

timings = 0
for line in sys.stdin:
    path = line.rstrip()
    with open(path, 'rb') as file:
        try:
            start = time.perf_counter()
            parse(path, file)
            end = time.perf_counter()
            timings += end - start
        except Exception as e:
            print(f'error: {path}: {e}', file=sys.stderr)

maxrss = (
    resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    # ru_maxrss is in bytes on macOS and in kibibytes on Linux; convert to MiB
    / 2 ** (20 if sys.platform == 'darwin' else 10)
)

print(impl, round(timings, 1), round(maxrss, 1))
import codecs
import io


class PrefixFileWrapper:

    """
    >>> import io
    >>> file = io.StringIO('abcdef')
    >>> file.read(2)
    'ab'
    >>> wrapped = PrefixFileWrapper(file.read(2).upper(), file)
    >>> wrapped.read()
    'CDef'
    >>> wrapped.reset()
    >>> wrapped.read()
    'CDef'

    """

    def __init__(self, prefix, file):
        self.prefix = prefix
        self.file = file
        try:
            self.file_initial_offset = file.tell()
        except OSError:
            self.file_initial_offset = None
        self.offset = 0

    def reset(self):
        # raises io.UnsupportedOperation if the underlying stream is not seekable
        self.file.seek(self.file_initial_offset)
        self.offset = 0
    def read(self, size=-1):
        # empty read, so buffer has the same type as what file yields (bytes or str)
        buffer = self.file.read(0)

        # serve as much as possible from the (remaining) prefix first
        if self.offset < len(self.prefix):
            if size < 0:
                chunk = self.prefix[self.offset:]
            else:
                chunk = self.prefix[self.offset : self.offset + size]
                size -= len(chunk)
            buffer += chunk
            self.offset += len(chunk)

        # then read the rest from the underlying file
        while True:
            chunk = self.file.read(size)
            if not chunk:
                break
            buffer += chunk
            self.offset += len(chunk)
            if size <= 0:
                break
            size -= len(chunk)

        return buffer

    def close(self):
        # the SAX parser closes the file when done; we don't want that,
        # since the loose-parser fallback may still need to re-read it
        pass
class MissingEncoding(io.UnsupportedOperation):
    pass


class StreamFactory:

    def __init__(self, file, encoding=None):
        self.file = file
        self.encoding = encoding
        self.should_reset = False

    def get_text_file(self, fallback_encoding=None, errors='strict'):
        encoding = self.encoding or fallback_encoding
        if encoding is None:
            raise MissingEncoding("cannot create text stream without encoding")
        reader_factory = codecs.getreader(encoding)
        reader = reader_factory(self.file, errors)
        self.reset()
        return reader

    def get_binary_file(self):
        self.reset()
        return self.file

    def reset(self):
        if self.should_reset:
            self.file.reset()
        self.should_reset = True
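
# The factory hands out the stream at its current position on the first
# get_*_file() call, and reset()s the PrefixFileWrapper on every later call,
# which is how the loose-parser fallback in betterfeedparser.parse() gets to
# re-read from the beginning (and why it raises io.UnsupportedOperation for
# non-seekable files). A small sketch of the behavior:
#
#     >>> factory = StreamFactory(PrefixFileWrapper(b'', io.BytesIO(b'<rss/>')), 'utf-8')
#     >>> factory.get_text_file().read()
#     '<rss/>'
#     >>> factory.get_text_file().read()  # implicit reset(), reads again
#     '<rss/>'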
import io

# commented out so it doesn't mess with memory measurements
#import pytest


def make_file_in_the_middle(data):
    prefix = b'zzzzz'
    rv = io.BytesIO(prefix + data)
    rv.seek(len(prefix))
    return rv


class make_file_one_by_one(io.BytesIO):
    def read(self, size=-1):
        if size <= 0:
            return super().read(size)
        return super().read(1)


"""
@pytest.mark.parametrize('make_file', [
    io.BytesIO,
    make_file_in_the_middle,
    make_file_one_by_one,
])
"""
def test_pfw(make_file):
    f = PrefixFileWrapper(b'abc', make_file(b'def'))

    assert f.read() == b'abcdef'
    assert f.read() == b''
    f.reset()

    assert f.read(2) == b'ab'
    assert f.read(2) == b'cd'
    assert f.read(2) == b'ef'
    assert f.read(2) == b''
    assert f.read() == b''
    f.reset()

    assert f.read(3) == b'abc'
    assert f.read(3) == b'def'
    assert f.read(3) == b''
    assert f.read() == b''
    f.reset()

    assert f.read(0) == b''
    assert f.read() == b'abcdef'
    f.reset()

    f.reset()
    assert f.read() == b'abcdef'
class make_file_not_seekable(io.BytesIO):
    def tell(self):
        raise io.UnsupportedOperation
    def seek(self, *args):
        raise io.UnsupportedOperation


def test_pfw_not_seekable():
    f = PrefixFileWrapper(b'abc', make_file_not_seekable(b'def'))

    assert f.read() == b'abcdef'
    assert f.read() == b''

    with pytest.raises(io.UnsupportedOperation):
        f.reset()
    assert f.read() == b''

    f = PrefixFileWrapper(b'abc', make_file_not_seekable(b'def'))

    assert f.read(3) == b'abc'
    with pytest.raises(io.UnsupportedOperation):
        f.reset()
    assert f.read() == b'def'


def test_pfw_no_prefix():
    f = PrefixFileWrapper(b'', io.BytesIO(b'abc'))

    assert f.read(1) == b'a'
    assert f.read() == b'bc'
    f.reset()
    assert f.read() == b'abc'