Created
January 10, 2014 22:00
-
-
Save ap-Codkelden/8363529 to your computer and use it in GitHub Desktop.
Cleaning wrong HTML markup.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from os import listdir | |
from os.path import isfile, join | |
#import argparse | |
import re | |
def isinput(filename):
    """Tell whether *filename* is a chapter file this script should process.

    Args:
        filename: bare file name (no directory part).

    Returns:
        A match object (truthy) when the whole name has the form
        ``Chapter<1-3 digits>.xhtml``, otherwise ``None``.
    """
    # Pass 1 of the clean-up used r'Chapter\d{1,3}\.html' here; pass 2 works
    # on the .xhtml files.
    # fullmatch() rather than match(): a name such as "Chapter1.xhtml.bak"
    # must not be accepted, because the caller derives the output name from
    # the first two dot-separated parts and would overwrite the real result.
    return re.fullmatch(r'Chapter\d{1,3}\.xhtml', filename)
def Expressed(bigstring, filename):
    """Rewrite the raw chapter markup into cleaner HTML (clean-up pass 1).

    The steps run in a fixed order because later steps rely on the temporary
    ``OL`` / ``UL`` markers and ``<pre>`` elements produced by earlier ones.

    Args:
        bigstring: the chapter markup as one ``str``.
        filename: name of the source file; used only in the error message.

    Returns:
        The converted markup encoded as UTF-8 ``bytes``, or ``None`` when the
        conversion raised an error (the error is printed, not re-raised).
    """
    try:
        # Point the document at the new stylesheet. A literal replace is used
        # so that the dot in the file name is not treated as "any character".
        s = bigstring.replace('styles.css', 'ruby.css')

        # <div class="titleN"><p>text</p></div>  ->  <hN>text</hN>
        s = re.sub(r'\s+<div class="title(\d)">\s+<p>(.+)</p>\s+</div>',
                   r'<h\g<1>>\g<2></h\g<1>>', s)

        # Emphasis spans become <em>, preceded by a space.
        s = re.sub(r'<span class="emphasis">(.+?)</span>',
                   r' <em>\g<1></em>', s)

        # Numbered list: paragraphs that start with "<digit>." become list
        # items tagged with a temporary "OL" marker ...
        s = re.sub(r'\s+<p>\d\..\s(.+?)</p>\s+?', r'OL<li>\g<1></li>', s)
        # ... the marked items on a line are wrapped in one <ol> ...
        s = re.sub(r'OL(<li>.*</li>)', r'<ol>\g<1></ol>', s)
        # ... and the markers left between neighbouring items are dropped.
        s = s.replace('</li>OL<li>', '</li><li>')

        # Bulleted list: paragraphs that start with "**" become list items
        # tagged with a temporary "UL" marker.
        s = re.sub(r'<p>\*\*[\s<](.+?)</p>', r'UL<li>\g<1></li>', s)
        # Join neighbouring items BEFORE wrapping. Wrapping every item first
        # consumes the markers, so nothing is left to join and each bullet
        # would end up in a <ul> of its own.
        s = s.replace('</li>UL<li>', '</li><li>')
        # Wrap each run of directly adjacent items in a single <ul>.
        s = re.sub(r'UL((?:<li>.+?</li>)+)', r'<ul>\g<1></ul>', s)

        # Remove 'empty-line' placeholder paragraphs.
        s = s.replace('<p class="empty-line"/>', '')

        # Code lines: <p> <code>...</code> </p>  ->  <pre>...</pre>
        s = re.sub(r'\s+<p>\s+?\s+<code>(.+?)</code>\s+</p>',
                   r'<pre>\g<1></pre>', s)

        # Listings: every occurrence of "subtitle" is renamed to "listing".
        s = s.replace('subtitle', 'listing')

        # Consecutive <pre> elements are merged into one, separated by
        # line breaks.
        s = s.replace('</pre><pre>', '\n')

        return s.encode('utf8')
    except Exception as exc:
        # Best effort: report the failing file and let the caller carry on.
        # "except Exception" (not a bare except) so that Ctrl-C still works.
        print("I've got error @ ", filename, exc)
        return None
# One heading element whose closing tag has the same level as its opening tag.
# The lazy body keeps a match inside a single heading even after the
# whitespace between tags has been removed and several headings share a line.
_HEADING = re.compile(r'<h([1-4])>(.+?)</h\1>')

# (target level, rule for the complete heading text).
# The rule bodies are the original patterns. The order reproduces the result
# of the original h1 -> h4 -> h3 -> h2 substitution cascade, in which the
# substitution applied last decided the final level.
# NOTE(review): the separators between the numbers are '.', i.e. "any
# character", exactly as in the original patterns - confirm whether a literal
# dot was meant.
_HEADING_RULES = (
    ('2', re.compile(r'\d{1,2}.\d.{1,2}\ .+')),
    ('3', re.compile(r'\d{1,2}.\d{1,2}.\d{1,2}.\ .+')),
    ('4', re.compile(r'\d{1,2}.\d{1,2}.\d{1,2}.\d{1,2}.\ .+')),
    ('1', re.compile(r'Глава*.+\.\s.+')),
)


def _relevel_heading(match):
    """Return the matched heading re-tagged with the level its text implies."""
    text = match.group(2)
    for level, rule in _HEADING_RULES:
        if rule.fullmatch(text):
            return '<h{0}>{1}</h{0}>'.format(level, text)
    # No rule applies: leave the heading exactly as it was.
    return match.group(0)


def RepairChapter(bigstring, filename):
    """Fix the heading levels of a chapter (clean-up pass 2).

    Whitespace between tags is removed first. Then every ``<h1>``..``<h4>``
    element is re-tagged according to its text: a title starting with
    "Глава" becomes ``<h1>``, and titles starting with a two-, three- or
    four-part number become ``<h2>``, ``<h3>`` or ``<h4>``. Headings that fit
    none of the rules are left unchanged.

    Args:
        bigstring: the chapter markup as one ``str``.
        filename: name of the source file; used only in the error message.

    Returns:
        The repaired markup encoded as UTF-8 ``bytes``, or ``None`` when the
        conversion raised an error (the error is printed, not re-raised).
    """
    try:
        # Drop whitespace (including line breaks) between adjacent tags.
        s = re.sub(r'>\s+?<', '><', bigstring)
        # Handle each heading on its own. A greedy pattern applied to the
        # whole text would stretch from the first opening heading tag to the
        # last closing one on the line and corrupt everything in between.
        s = _HEADING.sub(_relevel_heading, s)
        return s.encode('utf8')
    except Exception as exc:
        # Best effort: report the failing file and let the caller carry on.
        # "except Exception" (not a bare except) so that Ctrl-C still works.
        print("I've got error @ ", filename, exc)
        return None
# TODO
# add links for the words "глава" (chapter), "листинг" (listing), "раздел" (section)
# check every chapter that has an epigraph for duplicated content
if __name__ == '__main__':
    # Collect the names of all regular files in the current directory.
    html = [f for f in listdir('.') if isfile(join('.', f))]
    for x in html:
        if not isinput(x):
            continue
        p = x.split('.')
        # Pass 1 wrote Chapter*.html to Chapter*.xhtml:
        #     filesave = p[0]+'.x'+p[1]
        # Pass 2 writes the result next to the input, prefixed with '_'.
        filesave = '_' + p[0] + '.' + p[1]
        print('Processing file: ', x, ' => ', filesave)
        with open(x, 'r', encoding='utf8') as chapter:
            # Pass 1 additionally normalised the text while reading:
            #     chapter.read().replace("\n","").replace(u'\xa0',' ').replace(u'\u2022','**').replace(u'\x98','Z')
            chapterlines = chapter.read()
        # The file name is passed as the second argument only so that the
        # converter can mention it in its error message.
        # Pass 1 called Expressed(chapterlines, x) here instead.
        repaired = RepairChapter(chapterlines, x)
        if repaired is None:
            # The converter has already reported the failure. Convert before
            # opening the output so that a failed chapter neither crashes the
            # run in write() nor leaves an empty, truncated output file.
            continue
        with open(filesave, 'wb') as newfile:
            newfile.write(repaired)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment