Created
September 18, 2019 00:00
-
-
Save yak1ex/b67fb1080712cdb09460dc3c24bf0ede to your computer and use it in GitHub Desktop.
eml by futakuro splitter
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re
import sys
from email.parser import BytesParser, Parser
from email.policy import default
from html import escape
from html.parser import HTMLParser
from pathlib import Path
from typing import Optional
from urllib.parse import urlparse
class URLConverter(HTMLParser):
    """Re-serialize an HTML document while rewriting ``src``/``href`` URLs.

    The parser echoes every construct it sees (declarations, tags, text,
    character/entity references, comments, processing instructions) into an
    output buffer.  The only intended change is that a ``src`` or ``href``
    attribute whose value is a key of *mapping* is replaced by the mapped
    value.

    Call ``feed()`` with the document, then ``close()`` so that any text the
    parser is still buffering is flushed, and finally ``get_output()``.
    """

    # Attributes whose values are looked up in the mapping.
    _URL_ATTRS = frozenset(('src', 'href'))

    def __init__(self, mapping):
        """Create a converter.

        Args:
            mapping: dict-like object mapping original URL strings to their
                replacement strings.
        """
        # convert_charrefs=False keeps "&lt;", "&#169;" etc. out of
        # handle_data(); they are delivered to handle_entityref() /
        # handle_charref() and echoed verbatim instead of being decoded and
        # written back as raw "<" or "&".
        super().__init__(convert_charrefs=False)
        self._mapping = mapping
        # Collected output fragments; joined once in get_output().
        self._chunks = []

    def _render_tag(self, tag, attrs, closer):
        """Return the markup for an opening tag terminated by *closer*."""
        pieces = [tag]
        for name, value in attrs:
            if value is None:
                # Valueless attribute such as <input disabled>.
                pieces.append(name)
                continue
            if name in self._URL_ATTRS and value in self._mapping:
                value = self._mapping[value]
            # HTMLParser hands over attribute values already unescaped, so
            # they must be escaped again to stay valid inside double quotes.
            pieces.append(name + '="' + escape(value, quote=True) + '"')
        return '<' + ' '.join(pieces) + closer

    def handle_decl(self, decl):
        """Echo a declaration such as ``<!DOCTYPE html>``."""
        self._chunks.append('<!' + decl + '>')

    def handle_starttag(self, tag, attrs):
        """Echo an opening tag, rewriting mapped ``src``/``href`` values."""
        self._chunks.append(self._render_tag(tag, attrs, '>'))

    def handle_startendtag(self, tag, attrs):
        """Echo a self-closing tag (``<br/>``) without adding an end tag.

        The base class would call handle_starttag() followed by
        handle_endtag(), producing e.g. ``<br></br>``.
        """
        self._chunks.append(self._render_tag(tag, attrs, ' />'))

    def handle_endtag(self, tag):
        """Echo a closing tag."""
        self._chunks.append('</' + tag + '>')

    def handle_data(self, data):
        """Echo text content unchanged."""
        self._chunks.append(data)

    def handle_entityref(self, name):
        """Echo a named reference such as ``&amp;``."""
        self._chunks.append('&' + name + ';')

    def handle_charref(self, name):
        """Echo a numeric reference such as ``&#169;`` or ``&#xA9;``."""
        self._chunks.append('&#' + name + ';')

    def handle_comment(self, data):
        """Echo an HTML comment."""
        self._chunks.append('<!--' + data + '-->')

    def handle_pi(self, data):
        """Echo a processing instruction."""
        self._chunks.append('<?' + data + '>')

    def get_output(self):
        """Return everything serialized so far as a single string."""
        return ''.join(self._chunks)
def make_path(in_path: Optional[str]) -> Optional[Path]:
    """Derive a relative local output path from a URL or URL path.

    If any path segment equals ``'res'`` only the final segment is kept;
    otherwise the final two segments (or the single one, if that is all
    there is) are kept.

    Args:
        in_path: a URL or URL path; may be ``None`` or empty.

    Returns:
        A relative ``Path`` below the current directory, or ``None`` when
        *in_path* is empty, has no usable segment, or would resolve to an
        anchored / parent-escaping path.
    """
    if not in_path:
        return None
    # URL paths are always '/'-separated regardless of the host OS, so split
    # them directly.  Empty, '.' and '..' segments are dropped so that the
    # leading root or a crafted '..' cannot move the result outside the
    # current directory.
    segments = [
        segment
        for segment in urlparse(in_path).path.split('/')
        if segment not in ('', '.', '..')
    ]
    if not segments:
        return None
    kept = segments[-1:] if 'res' in segments else segments[-2:]
    target = Path('.').joinpath(*kept)
    # Last line of defence for platform-specific oddities (e.g. a segment
    # that the local path flavour interprets as a drive or as separators).
    if target.anchor or '..' in target.parts:
        return None
    return target
def process(input: str):
    """Split an .eml/MHTML archive into individual files on disk.

    The message is walked twice: first to build a mapping from each non-text
    part's ``Content-Location`` URL to its local relative path (as computed
    by ``make_path``), then to write every located part to that path.  HTML
    parts are passed through ``URLConverter`` so that ``src``/``href``
    attributes pointing at mapped URLs refer to the extracted files.

    For every part, its content type and output path (or ``None``) are
    printed to stdout.

    Args:
        input: path of the archive to read.  (The name shadows the builtin
            but is kept for interface compatibility.)
    """
    # Parse completely up front so the input handle is released before any
    # output file is written.
    with open(input, 'rb') as fp:
        msg = BytesParser(policy=default).parse(fp)

    # 1st walk for making mapping
    mapping = {}
    for part in msg.walk():
        if part.get_content_maintype() == 'text':
            continue
        url = part.get("Content-Location")
        path = make_path(str(url)) if url else None
        if path is None:
            # No location (or no usable path): nothing to link to.
            continue
        # The value ends up inside an HTML attribute, so always use '/'.
        mapping[str(url)] = path.as_posix()

    # 2nd walk for actual output
    for part in msg.walk():
        url = part.get("Content-Location")
        path = make_path(str(url)) if url else None
        print(part.get_content_type(), path)
        # Multipart containers carry no content of their own to extract.
        if path is None or part.is_multipart():
            continue

        is_text = part.get_content_maintype() == 'text'
        if is_text:
            # Force cp932 decoding regardless of the declared charset.
            # NOTE(review): text is decoded as cp932 but written as UTF-8
            # below; any charset declaration inside the document itself is
            # not updated -- confirm this is intended.
            part.set_charset('cp932')
        content = part.get_content()

        if isinstance(content, bytes):
            data = content
        elif part.get_content_subtype() == 'html':
            parser = URLConverter(mapping)
            parser.feed(content)
            # Flush any text the parser is still holding back.
            parser.close()
            data = parser.get_output().encode('utf-8')
        else:
            # Non-HTML text (CSS, scripts, plain text) must not be run
            # through an HTML parser; write it through unchanged.
            data = content.encode('utf-8')

        path.parent.mkdir(parents=True, exist_ok=True)
        with open(path, 'wb') as out:
            out.write(data)
if __name__ == "__main__":
    # Exit with a usage message rather than an IndexError traceback when the
    # input file argument is missing.
    if len(sys.argv) < 2:
        sys.exit('usage: ' + sys.argv[0] + ' <input.eml>')
    process(sys.argv[1])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment