Last active
August 29, 2015 13:56
-
-
Save pebbie/9262557 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
file: html2wikitext.py | |
author: Peb Ruswono Aryan (28.02.2014) | |
desc: | |
script to convert html extracted by [http://github.com/petrabarus/perundangan](perundangan) | |
into wikitext used in http://hukum.pebbie.org | |
""" | |
import re | |
import sys | |
# Flags for paired-tag patterns: tag names are case-insensitive and an
# element's content may span several lines, so "." must also match newlines.
_PAIRED_TAG_FLAGS = re.I | re.S

# Ordered (compiled pattern, replacement) pairs that strip HTML markup.
# Order matters: the specific rules run first and the catch-all on the last
# line removes whatever tags are left.
#
# Each tag name is followed by \b so that "<p" does not also match "<pre>",
# "<head" does not match "<header>", and so on. Attributes are matched with
# [^>]* so values containing dots (file names, URLs) are accepted.
replacer = [
    (re.compile(r"<html\b[^>]*>(.*?)</html>", _PAIRED_TAG_FLAGS), r"\1"),
    # The document head carries no body text, so it is dropped entirely.
    (re.compile(r"<head\b[^>]*>(.*?)</head>", _PAIRED_TAG_FLAGS), r""),
    (re.compile(r"<body\b[^>]*>(.*?)</body>", _PAIRED_TAG_FLAGS), r"\1"),
    (re.compile(r"<p\b[^>]*>(.*?)</p>", _PAIRED_TAG_FLAGS), r"\1"),
    (re.compile(r"<small\b[^>]*>(.*?)</small>", _PAIRED_TAG_FLAGS), r"\1"),
    (re.compile(r"<tbody\b[^>]*>(.*?)</tbody>", _PAIRED_TAG_FLAGS), r"\1"),
    (re.compile(r"<thead\b[^>]*>(.*?)</thead>", _PAIRED_TAG_FLAGS), r"\1"),
    (re.compile(r"<table\b[^>]*>(.*?)</table>", _PAIRED_TAG_FLAGS), r"\1"),
    (re.compile(r"<td\b[^>]*>(.*?)</td>", _PAIRED_TAG_FLAGS), r"\1"),
    (re.compile(r"<tr\b[^>]*>(.*?)</tr>", _PAIRED_TAG_FLAGS), r"\1"),
    (re.compile(r"<center\b[^>]*>(.*?)</center>", _PAIRED_TAG_FLAGS), r"\n\1\n"),
    (re.compile(r"<div\b[^>]*>(.*?)</div>", _PAIRED_TAG_FLAGS), r"\n\1"),
    (re.compile(r"<font\b[^>]*>(.*?)</font>", _PAIRED_TAG_FLAGS), r"\1"),
    (re.compile(r"<hr\b[^>]*>", re.I), r"\n"),
    (re.compile(r"<img\b[^>]*>", re.I), r""),
    # Accepts <br>, <br/>, <br /> and <br ...attributes>.
    (re.compile(r"<br\b[^>]*>", re.I), r"\n"),
    # Catch-all: remove any remaining single tag (opening, closing, comment
    # or doctype) but keep the text between tags. [^>]* stops at the first
    # ">" so one substitution can never swallow text lying between two tags.
    (re.compile(r"</?[a-z!][^>]*>", re.I), r""),
]
def get_text(src):
    """Return *src* after running every substitution in ``replacer`` over it.

    The (pattern, replacement) pairs of the module-level ``replacer`` list
    are applied one after another, each working on the result of the
    previous one.
    """
    text = src
    for pattern, replacement in replacer:
        text = pattern.sub(replacement, text)
    return text
# Reference kinds recognised by linkify(), most specific first so that
# "Pasal 1 ayat (2) huruf a" becomes one link rather than three.
# Each entry is (regex fragment, %-template for the wikitext link); the
# template may use the fragment's named groups plus "prefix".
_LINK_RULES = [
    (r"Peraturan Pemerintah Nomor (?P<pp_no>\d+) Tahun (?P<pp_year>\d+)",
     "[[PP/%(pp_no)s/%(pp_year)s"
     "|Peraturan Pemerintah Nomor %(pp_no)s Tahun %(pp_year)s]]"),
    (r"Pasal (?P<pah_p>\d+) ayat \((?P<pah_a>\d+)\) huruf (?P<pah_h>\w+)",
     "[[#Pasal%(pah_p)sAyat%(pah_a)s_%(pah_h)s"
     "|Pasal %(pah_p)s ayat (%(pah_a)s) huruf %(pah_h)s]]"),
    (r"Pasal (?P<pa_p>\d+) ayat \((?P<pa_a>\d+)\)",
     "[[#Pasal%(pa_p)sAyat%(pa_a)s|Pasal %(pa_p)s ayat (%(pa_a)s)]]"),
    (r"ayat \((?P<ah_a>\d+)\) huruf (?P<ah_h>\w+)",
     "[[#%(prefix)sAyat%(ah_a)s_%(ah_h)s|ayat (%(ah_a)s) huruf %(ah_h)s]]"),
    (r"ayat \((?P<a_a>\d+)\)",
     "[[#%(prefix)sAyat%(a_a)s|ayat (%(a_a)s)]]"),
    (r"Pasal (?P<p_p>\d+)",
     "[[#Pasal%(p_p)s|Pasal %(p_p)s]]"),
    # NOTE(review): the label adds parentheses around the letter, unlike the
    # combined "ayat ... huruf" rules; kept as found -- confirm it is wanted.
    (r"huruf (?P<h_h>\w+)",
     "[[#%(prefix)s%(h_h)s|huruf (%(h_h)s)]]"),
]

# One alternation over all rules. A reference must be preceded by at least
# one space and followed by a comma, period or space. The follower is only
# looked at, not consumed, so punctuation is preserved verbatim and the space
# after one reference can introduce the next one.
_LINK_PATTERN = re.compile(
    r"[ ]+(?:"
    + "|".join("(?P<rule%d>%s)" % (i, rule[0])
               for i, rule in enumerate(_LINK_RULES))
    + r")(?=[,. ])"
)


def linkify(segment, prefix=""):
    """Turn legal cross-references in *segment* into wikitext links.

    Recognised references are "Peraturan Pemerintah Nomor N Tahun Y",
    "Pasal N", "ayat (N)", "huruf X" and their combinations. References
    without their own "Pasal" number ("ayat (N)", "huruf X") are anchored
    relative to *prefix*.

    All rules are applied in a single left-to-right pass, so text that has
    already been placed inside a link is never scanned again; this is what
    prevents nested "[[...[[...]]...]]" output.

    Parameters:
        segment: the text to scan.
        prefix:  anchor prefix inserted into relative links.

    Returns:
        The text with each recognised reference replaced by a single space
        followed by the link; everything else is left untouched.
    """
    def render(match):
        values = match.groupdict()
        values["prefix"] = prefix
        for i, rule in enumerate(_LINK_RULES):
            # Exactly one "ruleN" wrapper group took part in the match.
            if values["rule%d" % i] is not None:
                return " " + rule[1] % values
        return match.group(0)  # not reachable; keeps the text intact

    return _LINK_PATTERN.sub(render, segment)
# Closer of the per-Pasal <ol> that holds the ayat items. It is pushed before
# any nested closers so that it is popped (and therefore emitted) after them.
_AYAT_LIST_CLOSE = "</ol>\n"


def _flush(stack, output):
    """Move every pending closer from *stack* to *output*, last pushed first."""
    while stack:
        output.append(stack.pop())


def _heading(level, parts):
    """Build a wikitext heading of *level* '=' signs from the title lines."""
    return "=" * level + " <br/>".join(parts) + "=" * level


def _ayat_id(text):
    """Return what stands between a leading "(" and the first ")", else None."""
    if not text.startswith("("):
        return None
    end = text.find(")")
    if end == -1:
        return None
    return text[1:end]


def _open_ayat_list(output, stack, segmentid):
    """Open the <ol> that wraps the ayat items of one Pasal."""
    output.append(
        "<ol style=\"list-style-type:none;margin-left:0;\" id=\"%s\">"
        % segmentid
    )
    stack.append(_AYAT_LIST_CLOSE)


def _emit_ayat(output, stack, line, segmentid, ayatid):
    """Emit the <li> for one ayat and return the new (state, fragment).

    An ayat whose text ends in ":" introduces a nested list: its <li> stays
    open, a nested <ol> is opened and the state becomes "LIST". Otherwise the
    <li> is closed immediately and the state is "AYAT".
    """
    fragment = "%sAyat%s_" % (segmentid, ayatid)
    item = "<li id=\"%sAyat%s\">%s" % (
        segmentid, ayatid, linkify(line, segmentid))
    if line.strip().endswith(":"):
        output.append(item)
        stack.append("</li>")
        output.append("<ol style=\"list-style-type:none;\">")
        stack.append("</ol>")
        return "LIST", fragment
    output.append(item + "</li>")
    return "AYAT", fragment


def wikify(src):
    """Convert the plain text of a regulation into wikitext.

    The text is processed line by line with a small state machine:

    * lines starting with "UNDANG"/"PERATURAN" (upper case) open a level-1
      heading, "BAB" a level-2 heading and "bagian" a level-3 heading;
      following lines are joined into the heading with " <br/>" until a
      blank line (for titles and BAB only while the lines are upper case);
    * lines starting with "pasal" become a level-3 heading and set the
      anchor prefix ("Pasal" + number) for what follows;
    * after a Pasal heading, lines starting with "(n)" become <li> items of
      an <ol>; a line ending in ":" opens a nested list whose items get ids
      derived from the text before the first ".";
    * a line "PENJELASAN", or one starting with "TAMBAHAN", that follows a
      blank line ends the conversion;
    * any other line is copied through stripped.

    Closing tags and unfinished headings wait on a stack and are emitted
    when the next structural element starts, on a blank line, and at the
    end of the input.

    Cross-references inside Pasal text are linked by ``linkify``.

    Parameters:
        src: the text to convert, lines separated by "\\n".

    Returns:
        The wikitext, ending with the "[[Category:Peraturan]]" line.
    """
    output = []
    buffer = []        # lines collected for the heading being built
    stack = []         # pending closers / unfinished heading, popped LIFO
    state = "START"
    nempty = 0         # consecutive blank lines seen
    segmentid = ""     # anchor of the current Pasal, e.g. "Pasal5"
    fragment = ""      # id prefix for items of the currently open list
    title_hints = ["UNDANG", "PERATURAN"]

    for line in src.split("\n"):
        text = line.strip()
        ul = text.upper()
        ll = text.lower()
        ayatid = _ayat_id(text)

        # "hint in line" keeps the check case-sensitive: a mixed-case
        # sentence such as "Undang-Undang ini ..." is not a title.
        if state != "JUDUL" and any(
                hint in line and ul.startswith(hint) for hint in title_hints):
            _flush(stack, output)
            state = "JUDUL"
            buffer = [line]
            stack = [_heading(1, buffer)]
        elif state == "JUDUL" and text == ul and text:
            buffer.append(line)
            stack = [_heading(1, buffer) + "\n"]
        elif state != "BAB" and "BAB" in line and ul.startswith("BAB"):
            _flush(stack, output)
            state = "BAB"
            buffer = [line]
            stack = [_heading(2, buffer)]
        elif state == "BAB" and text == ul and text:
            buffer.append(line)
            stack = [_heading(2, buffer)]
        elif state != "BAGIAN" and ll.startswith("bagian"):
            _flush(stack, output)
            state = "BAGIAN"
            buffer = [line]
            stack = [_heading(3, buffer)]
        elif state == "BAGIAN" and text:
            # Every non-blank line directly after "Bagian ..." joins the
            # heading, whatever its case.
            buffer.append(line)
            stack = [_heading(3, buffer)]
        elif ll.startswith("pasal"):
            # NOTE(review): body text that starts a line with "Pasal ..."
            # is also treated as a heading -- confirm against real input.
            state = "PASAL"
            _flush(stack, output)
            output.append(_heading(3, [line]))
            words = text.split()
            # A bare "Pasal" without a number must not raise.
            segmentid = "Pasal" + (words[1] if len(words) > 1 else "")
        elif not text:
            if state != "EMPTY":
                # Blank lines directly after a Pasal heading or inside a
                # list do not end the current element.
                if state in ("PASAL", "LIST"):
                    continue
                nempty = 1
                _flush(stack, output)
            else:
                nempty += 1
            state = "EMPTY"
            if nempty > 2:
                output.append("\n")
                nempty = 0
        elif state == "PASAL":
            if ayatid is not None:
                _open_ayat_list(output, stack, segmentid)
                state, fragment = _emit_ayat(
                    output, stack, line, segmentid, ayatid)
            else:
                state = "ISI"
                output.append("<span id=\"%s\">" % (segmentid))
                output.append(linkify(text, segmentid))
                stack.append("</span>\n")
                if text.endswith(":"):
                    state = "LIST"
                    fragment = "%s_" % segmentid
                    output.append("<ol style=\"list-style-type:none;\">")
                    stack.append("</ol>")
        elif state == "AYAT" and ayatid is not None:
            state, fragment = _emit_ayat(
                output, stack, line, segmentid, ayatid)
        elif state == "LIST":
            if ayatid is not None:
                # Close only the nested list; the enclosing ayat list stays
                # open so the next <li> is still inside it.
                while stack and stack[-1] != _AYAT_LIST_CLOSE:
                    output.append(stack.pop())
                if not stack:
                    # The list was opened from plain Pasal text, so there is
                    # no ayat list yet to hold this item.
                    _open_ayat_list(output, stack, segmentid)
                state, fragment = _emit_ayat(
                    output, stack, line, segmentid, ayatid)
            else:
                itemid = line.split(".")[0].strip()
                output.append("<li id=\"%s%s\">%s</li>" % (
                    fragment, itemid, linkify(line, fragment)))
        elif state == "EMPTY" and (
                "PENJELASAN" == ul or ul.startswith("TAMBAHAN")):
            break
        else:
            output.append(text)

    # Emit whatever is still pending (open lists, an unfinished heading).
    _flush(stack, output)
    output.append("[[Category:Peraturan]]")
    return "\n".join(output)
def _main(argv):
    """Command-line entry point: html2wikitext.py INPUT [OUTPUT].

    Reads the HTML file INPUT, strips the markup with ``get_text`` and
    converts the result with ``wikify``. The wikitext is written to OUTPUT
    when given, otherwise to standard output so the result is not lost.
    Without arguments a usage line is written to standard error.
    """
    if len(argv) < 2:
        sys.stderr.write("usage: %s INPUT [OUTPUT]\n" % argv[0])
        return
    with open(argv[1]) as source:
        txt = source.read()
    txt = wikify(get_text(txt))
    if len(argv) > 2:
        with open(argv[2], "w") as target:
            target.write(txt)
    else:
        sys.stdout.write(txt + "\n")


if __name__ == "__main__":
    _main(sys.argv)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment