kinow · March 15, 2018 08:08
diff --git a/text2html.py b/text2html.py
 #!/usr/bin/env python3

 import os, sys, binaryornot
 from binaryornot.check import is_binary

 # Root of the tree that main() walks; text2html() shows paths relative to
 # this directory's parent.
 DIR = '/home/kinow/Development/python/workspace/crawlers-noticias'

 # Directory names main() prunes from the walk.
 IGNORE_DIRS = {'.git', 'data', 'bower_components', 'news-crawler-app'}
 # File names main() skips.
 IGNORE_FILES = {'datepicker3.css'}
 # File the combined HTML document is written to.
 OUTPUT = '/tmp/output.html'

 import html
 from bs4 import BeautifulSoup
 from pathlib import Path

 # True until text2html() renders its first file. text2html() clears it, so
 # every later fragment gets the 'new-page' class instead of an empty class.
 FIRST = True

 def text2html(filepath):
    """Render one text file as an HTML fragment.

    The fragment is a ``<div>`` holding the file's path (relative to the
    parent of ``DIR``) in ``<strong>``, followed by the HTML-escaped file
    contents in ``<pre>``. The first fragment produced gets an empty class;
    every later one gets ``new-page``. The markup is then re-serialised
    through BeautifulSoup using its ``html`` formatter.

    Args:
        filepath: Path of the text file to render.

    Returns:
        The fragment as a string, with every newline replaced by ``<br />``
        and every tab by four ``&nbsp;`` entities.
    """
    global FIRST
    header_file_name = os.path.relpath(filepath, Path(DIR).parent)
    if FIRST:
        FIRST = False
        output = "<div class=''/>"
    else:
        output = "<div class='new-page'/>"
    # Escape the path as well, so a name containing '<' or '&' cannot be
    # parsed as markup.
    output += "<strong>{}</strong>".format(html.escape(header_file_name))
    # Decode as UTF-8 and replace undecodable bytes, so a single file in
    # another encoding does not abort the whole run.
    with open(filepath, 'r', encoding='utf-8', errors='replace') as f:
        output += "<pre>{}</pre>".format(html.escape(f.read()))
    output += '</div>'
    output = BeautifulSoup(output, "lxml").encode(formatter='html').decode('utf8')
    output = output.replace('\n', '<br />')
    output = output.replace("\t", '&nbsp;&nbsp;&nbsp;&nbsp;')
    return output

 def main():
    """Write every text file found under ``DIR`` into one HTML document.

    The document is written to ``OUTPUT``. It starts with an inline
    stylesheet, then contains one text2html() fragment per accepted file,
    and ends by closing ``<body>`` and ``<html>``. Directories named in
    ``IGNORE_DIRS`` are not descended into; files named in ``IGNORE_FILES``,
    files reported as binary by is_binary(), and paths ending in ``.pdf``
    are skipped.
    """
    with open(OUTPUT, 'w') as out:
        out.write("""
 <html><head><style type='text/css'>
 pre {
 	width: 100%;
 	padding: 0;
 	margin: 0;
 	overflow: auto;
 	overflow-y: hidden;
 	font-size: 12px;
 	line-height: 20px;
 	background: #efefef;
 	border: 1px solid #777;
 	background: url(lines.png) repeat 0 0;
    word-wrap: break-word;
 }
 pre code {
 	padding: 10px;
 	color: #333;
    word-wrap: break-word;
 }
 @media print {
 .new-page {
    page-break-before: always;
 }
 pre {
 	width: 100%;
 	padding: 0;
 	margin: 0;
 	overflow: auto;
 	overflow-y: hidden;
 	font-size: 12px;
 	line-height: 20px;
 	background: #efefef;
 	border: 1px solid #777;
 	background: url(lines.png) repeat 0 0;
 }
 pre code {
 	padding: 10px;
 	color: #333;
 }
 }
 </style>
 </head>
 <body>
 """)
        for current_dir, subdirs, names in os.walk(DIR):
            # Assign through the slice so os.walk sees the pruned list and
            # does not descend into the ignored directories.
            subdirs[:] = [sub for sub in subdirs if sub not in IGNORE_DIRS]

            # NOTE: these are plain membership/substring tests against the
            # literal text "^.*"; no regular expression is evaluated.
            if "^.*" in subdirs:
                continue
            for name in names:
                if "^.*" in name or name in IGNORE_FILES:
                    continue
                path = os.path.join(current_dir, name)
                # is_binary() is evaluated first, as before.
                if is_binary(path) or path.endswith('.pdf'):
                    continue
                out.write(text2html(path))

        out.write("</body></html>")

 if __name__ == '__main__':
    # Build the document, then end the process with exit status 0.
    main()
    raise SystemExit(0)

 # To convert the generated page to PDF, run:
 #   wkhtmltopdf /tmp/output.html ~/Desktop/output.pdf
	#!/usr/bin/env python3

	import os, sys, binaryornot
	from binaryornot.check import is_binary

	# Root of the tree that main() walks; text2html() shows paths relative to
	# this directory's parent.
	DIR = '/home/kinow/Development/python/workspace/crawlers-noticias'

	# Directory names main() prunes from the walk.
	IGNORE_DIRS = {'.git', 'data', 'bower_components', 'news-crawler-app'}
	# File names main() skips.
	IGNORE_FILES = {'datepicker3.css'}
	# File the combined HTML document is written to.
	OUTPUT = '/tmp/output.html'

	import html
	from bs4 import BeautifulSoup
	from pathlib import Path

	# True until text2html() renders its first file. text2html() clears it, so
	# every later fragment gets the 'new-page' class instead of an empty class.
	FIRST = True

	def text2html(filepath):
	    """Render one text file as an HTML fragment.

	    The fragment is a ``<div>`` holding the file's path (relative to the
	    parent of ``DIR``) in ``<strong>``, followed by the HTML-escaped file
	    contents in ``<pre>``. The first fragment produced gets an empty
	    class; every later one gets ``new-page``. The markup is then
	    re-serialised through BeautifulSoup using its ``html`` formatter.

	    Args:
	        filepath: Path of the text file to render.

	    Returns:
	        The fragment as a string, with every newline replaced by
	        ``<br />`` and every tab by four ``&nbsp;`` entities.
	    """
	    global FIRST
	    header_file_name = os.path.relpath(filepath, Path(DIR).parent)
	    if FIRST:
	        FIRST = False
	        output = "<div class=''/>"
	    else:
	        output = "<div class='new-page'/>"
	    # Escape the path as well, so a name containing '<' or '&' cannot be
	    # parsed as markup.
	    output += "<strong>{}</strong>".format(html.escape(header_file_name))
	    # Decode as UTF-8 and replace undecodable bytes, so a single file in
	    # another encoding does not abort the whole run.
	    with open(filepath, 'r', encoding='utf-8', errors='replace') as f:
	        output += "<pre>{}</pre>".format(html.escape(f.read()))
	    output += '</div>'
	    output = BeautifulSoup(output, "lxml").encode(formatter='html').decode('utf8')
	    output = output.replace('\n', '<br />')
	    # Written as explicit entities so the four-column indent is
	    # unambiguous in the generated markup.
	    output = output.replace("\t", '&nbsp;&nbsp;&nbsp;&nbsp;')
	    return output

	def main():
	    """Write every text file found under ``DIR`` into one HTML document.

	    The document is written to ``OUTPUT``. It starts with an inline
	    stylesheet, then contains one text2html() fragment per accepted file,
	    and ends by closing ``<body>`` and ``<html>``. Directories named in
	    ``IGNORE_DIRS`` are not descended into; files named in
	    ``IGNORE_FILES``, files reported as binary by is_binary(), and paths
	    ending in ``.pdf`` are skipped.
	    """
	    with open(OUTPUT, 'w') as f:
	        f.write("""
	<html><head><style type='text/css'>
	pre {
	width: 100%;
	padding: 0;
	margin: 0;
	overflow: auto;
	overflow-y: hidden;
	font-size: 12px;
	line-height: 20px;
	background: #efefef;
	border: 1px solid #777;
	background: url(lines.png) repeat 0 0;
	word-wrap: break-word;
	}
	pre code {
	padding: 10px;
	color: #333;
	word-wrap: break-word;
	}
	@media print {
	.new-page {
	page-break-before: always;
	}
	pre {
	width: 100%;
	padding: 0;
	margin: 0;
	overflow: auto;
	overflow-y: hidden;
	font-size: 12px;
	line-height: 20px;
	background: #efefef;
	border: 1px solid #777;
	background: url(lines.png) repeat 0 0;
	}
	pre code {
	padding: 10px;
	color: #333;
	}
	}
	</style>
	</head>
	<body>
	""")
	        for root, dirs, files in os.walk(DIR):
	            # Assign through the slice so os.walk sees the pruned list
	            # and does not descend into the ignored directories.
	            dirs[:] = [d for d in dirs if d not in IGNORE_DIRS]

	            # NOTE: these are plain membership/substring tests against
	            # the literal text "^.*"; no regular expression is evaluated.
	            if "^.*" in dirs:
	                continue
	            for filename in files:
	                if "^.*" in filename:
	                    continue
	                if filename in IGNORE_FILES:
	                    continue
	                filepath = os.path.join(root, filename)
	                if not is_binary(filepath) and not filepath.endswith('.pdf'):
	                    f.write(text2html(filepath))

	        f.write("</body></html>")

	if __name__ == '__main__':
	    # Build the document, then end the process with exit status 0.
	    main()
	    sys.exit(0)

	# To convert the generated page to PDF, run:
	#   wkhtmltopdf /tmp/output.html ~/Desktop/output.pdf