Skip to content

Instantly share code, notes, and snippets.

@kinow
Last active March 15, 2018 08:08
Show Gist options
  • Save kinow/81d4709554c62fd2d15b3a8029926e08 to your computer and use it in GitHub Desktop.
Save kinow/81d4709554c62fd2d15b3a8029926e08 to your computer and use it in GitHub Desktop.
text2html.py
#!/usr/bin/env python3
import os, sys, binaryornot
from binaryornot.check import is_binary
# Root of the project tree whose text files are exported.
DIR = '/home/kinow/Development/python/workspace/crawlers-noticias'

# Directory names that are pruned from the walk wherever they occur.
IGNORE_DIRS = {
    '.git',
    'data',
    'bower_components',
    'news-crawler-app',
}

# Bare file names (not paths) that are never exported.
IGNORE_FILES = {'datepicker3.css'}

# Destination of the generated HTML document.
OUTPUT = '/tmp/output.html'
import html
from bs4 import BeautifulSoup
from pathlib import Path
# True until the first file has been rendered. Every later file is given the
# 'new-page' CSS class so that it starts on a fresh page when printed.
FIRST = True


def text2html(filepath):
    """Render one text file as an HTML fragment.

    The fragment is a ``<div>`` holding the file's path (relative to the
    parent directory of ``DIR``) in ``<strong>`` followed by the escaped file
    content in ``<pre>``. The first call produces a div with an empty class;
    all later calls use ``class='new-page'``.

    Args:
        filepath: Path of the text file to render.

    Returns:
        The markup as serialized by BeautifulSoup (``formatter='html'``),
        with every newline replaced by ``<br />`` and every tab replaced by
        four ``&nbsp;`` entities.

    Raises:
        OSError: If the file cannot be opened or read.

    Side effects:
        Sets the module-level ``FIRST`` flag to ``False`` on the first call.
    """
    global FIRST

    # Relative to DIR's parent, so the project directory name is part of
    # the heading.
    header_file_name = os.path.relpath(filepath, Path(DIR).parent)

    if FIRST:
        FIRST = False
        css_class = ''
    else:
        css_class = 'new-page'

    # Decode explicitly as UTF-8 and substitute undecodable bytes, so that a
    # single oddly-encoded file does not abort the whole export.
    with open(filepath, 'r', encoding='utf-8', errors='replace') as f:
        content = f.read()

    # Use an explicit opening tag (not "<div .../>") so the heading and the
    # <pre> are nested inside the div regardless of how the parser treats a
    # self-closing slash on a non-void element. The file name is escaped as
    # well, since it may contain '<' or '&'.
    output = "<div class='{}'><strong>{}</strong><pre>{}</pre></div>".format(
        css_class,
        html.escape(header_file_name),
        html.escape(content),
    )

    # formatter='html' makes BeautifulSoup emit named HTML entities for
    # characters that have one.
    # NOTE(review): the lxml parser is expected to wrap the fragment in its
    # own <html><body> element - confirm that nested documents inside the
    # final page are acceptable.
    output = BeautifulSoup(output, "lxml").encode(formatter='html').decode('utf8')
    output = output.replace('\n', '<br />')
    output = output.replace("\t", '&nbsp;&nbsp;&nbsp;&nbsp;')
    return output
def main():
    """Export every text file below ``DIR`` into one HTML page at ``OUTPUT``.

    Writes a fixed HTML/CSS preamble, then walks ``DIR`` and appends the
    fragment returned by ``text2html`` for each file that is not ignored, is
    not a ``.pdf`` and is not reported as binary by ``is_binary``; finally
    closes the ``<body>`` and ``<html>`` elements.

    Directories named in ``IGNORE_DIRS`` are not descended into and file
    names in ``IGNORE_FILES`` are skipped. Directories and files are visited
    in sorted order so that repeated runs produce the same document.

    Raises:
        OSError: If ``OUTPUT`` cannot be written or a source file cannot be
            read.
    """
    # Written as UTF-8 (and declared as such in the page) so the result does
    # not depend on the locale's default encoding.
    with open(OUTPUT, 'w', encoding='utf-8') as f:
        f.write("""
<html><head><meta charset='utf-8'><style type='text/css'>
pre {
width: 100%;
padding: 0;
margin: 0;
overflow: auto;
overflow-y: hidden;
font-size: 12px;
line-height: 20px;
background: #efefef;
border: 1px solid #777;
background: url(lines.png) repeat 0 0;
word-wrap: break-word;
}
pre code {
padding: 10px;
color: #333;
word-wrap: break-word;
}
@media print {
.new-page {
page-break-before: always;
}
pre {
width: 100%;
padding: 0;
margin: 0;
overflow: auto;
overflow-y: hidden;
font-size: 12px;
line-height: 20px;
background: #efefef;
border: 1px solid #777;
background: url(lines.png) repeat 0 0;
}
pre code {
padding: 10px;
color: #333;
}
}
</style>
</head>
<body>
""")
        for root, dirs, files in os.walk(DIR):
            # Assign in place: os.walk only honours pruning done on the very
            # list object it handed out. Sorting fixes the traversal order.
            dirs[:] = sorted(d for d in dirs if d not in IGNORE_DIRS)

            # NOTE(review): this is a literal membership test for a directory
            # named "^.*", not a regular-expression match - confirm intent.
            if "^.*" in dirs:
                continue

            for filename in sorted(files):
                # NOTE(review): literal substring test, not a regex - confirm
                # intent.
                if "^.*" in filename:
                    continue
                if filename in IGNORE_FILES:
                    continue

                filepath = os.path.join(root, filename)
                # Cheap suffix test first, so PDFs are never opened for the
                # binary check.
                if filepath.endswith('.pdf') or is_binary(filepath):
                    continue

                f.write(text2html(filepath))

        f.write("</body></html>")


if __name__ == '__main__':
    main()
    sys.exit(0)
# To convert the generated page to a PDF, run:
#   wkhtmltopdf /tmp/output.html ~/Desktop/output.pdf
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment