ap-Codkelden · January 10, 2014 22:00
diff --git a/replace.py b/replace.py
 from os import listdir
 from os.path import isfile, join
 #import argparse
 import re

 def isinput(filename):
 	"""Tell whether *filename* is a chapter file this script should process.

 	Returns the regex match object (truthy) when the whole name has the
 	form ``Chapter<1-3 digits>.xhtml``, otherwise ``None``.
 	"""
 	# Pass 1 read the original ``.html`` chapters instead:
 	#   r'Chapter\d{1,3}\.html\Z'
 	# Pass 2 (current) reads the ``.xhtml`` files.
 	#
 	# The pattern is a raw string so the backslash escapes reach the regex
 	# engine untouched, and ``\Z`` anchors the end of the name so that
 	# look-alikes such as ``Chapter1.xhtml.bak`` are rejected (``re.match``
 	# on its own only anchors the start).
 	return re.match(r'Chapter\d{1,3}\.xhtml\Z', filename)

 def Expressed(bigstring, filename):
 	"""Convert the markup of one chapter into plainer HTML (pass 1).

 	The text is rewritten step by step: stylesheet reference, title divs to
 	headings, emphasis spans to <em>, numbered and bulleted paragraphs to
 	<ol>/<ul> lists, code paragraphs to <pre>, and "subtitle" to "listing".

 	Parameters:
 		bigstring -- the chapter text as one str
 		filename  -- name of the chapter file; used only in the error message

 	Returns the converted text encoded as UTF-8 bytes, or None (after
 	printing a message) when the input cannot be processed.
 	"""
 	try:
 		# Point the chapter at the new stylesheet. A literal replace is used
 		# because the dot in the file name must not act as a regex wildcard.
 		s = bigstring.replace('styles.css', 'ruby.css')

 		# <div class="titleN"><p>text</p></div>  ->  <hN>text</hN>
 		# The text group is non-greedy so that several titles sharing one
 		# line are converted one by one instead of as a single huge match.
 		s = re.sub(
 			r'\s+<div class="title(\d)">\s+<p>(.+?)</p>\s+</div>',
 			r'<h\g<1>>\g<2></h\g<1>>',
 			s)

 		# emphasis
 		s = re.sub(r'<span class="emphasis">(.+?)</span>', r' <em>\g<1></em>', s)

 		# Numbered list: every "<p>N.  text</p>" paragraph becomes a one-item
 		# list, then neighbouring lists are joined. Wrapping item by item
 		# keeps ordinary paragraphs that sit between two separate lists
 		# outside of the <ol> element.
 		s = re.sub(r'\s+<p>\d\..\s(.+?)</p>\s+?', r'<ol><li>\g<1></li></ol>', s)
 		s = re.sub(r'</ol>\s*<ol>', '', s)

 		# Bulleted list: "<p>** text</p>" paragraphs, handled the same way.
 		# The lookahead accepts a tag directly after "**" without consuming
 		# its "<", so the tag stays intact inside the list item.
 		s = re.sub(r'<p>\*\*(?:\s|(?=<))(.+?)</p>', r'<ul><li>\g<1></li></ul>', s)
 		s = re.sub(r'</ul>\s*<ul>', '', s)

 		# remove 'empty-line'
 		s = s.replace('<p class="empty-line"/>', '')

 		# Code lines go into <pre>; the paragraph must hold at least two
 		# whitespace characters before <code>.
 		s = re.sub(r'\s+<p>\s{2,}<code>(.+?)</code>\s+</p>', r'<pre>\g<1></pre>', s)

 		# listings
 		s = s.replace('subtitle', 'listing')

 		# Consecutive <pre> blocks become one <pre> with line breaks.
 		s = s.replace('</pre><pre>', '\n')

 		return s.encode('utf8')
 	except (TypeError, AttributeError, UnicodeError) as err:
 		# Best effort: report the offending file and let the caller decide.
 		print("I've got error @ ", filename, repr(err))
 		return None

 def RepairChapter(bigstring, filename):
 	"""Normalise the heading levels of one chapter (pass 2).

 	Whitespace between adjacent tags is removed, then every <h1>..<h4>
 	heading is re-tagged according to its text:

 		"Глава N. Title"    -> <h1>
 		"1.2. Title"        -> <h2>
 		"1.2.3. Title"      -> <h3>
 		"1.2.3.4. Title"    -> <h4>

 	Headings that fit none of these forms are left as they are.

 	Parameters:
 		bigstring -- the chapter text as one str
 		filename  -- name of the chapter file; used only in the error message

 	Returns the repaired text encoded as UTF-8 bytes, or None (after
 	printing a message) when the input cannot be processed.
 	"""
 	# (target level, form the whole heading text must have); first hit wins.
 	# The dots of the section numbers are escaped: left as wildcards they
 	# also accept plain numbers, e.g. "1000 words" would count as "N.N. ".
 	rules = (
 		('1', re.compile(r'Глава*.+\.\s.+\Z')),
 		('4', re.compile(r'\d{1,2}\.\d{1,2}\.\d{1,2}\.\d{1,2}\. .+\Z')),
 		('3', re.compile(r'\d{1,2}\.\d{1,2}\.\d{1,2}\. .+\Z')),
 		('2', re.compile(r'\d{1,2}\.\d{1,2}\. .+\Z')),
 	)

 	def retag(match):
 		# Re-emit one heading with the level its text calls for.
 		text = match.group(1)
 		for level, form in rules:
 			if form.match(text):
 				return '<h' + level + '>' + text + '</h' + level + '>'
 		return match.group(0)

 	try:
 		s = re.sub(r'>\s+<', '><', bigstring)
 		# Headings are taken one at a time (non-greedy), so several headings
 		# on the same line are never swallowed by a single match.
 		s = re.sub(r'<h[1-4]>(.+?)</h[1-4]>', retag, s)
 		return s.encode('utf8')
 	except (TypeError, UnicodeError) as err:
 		# Best effort: report the offending file and let the caller decide.
 		print("I've got error @ ", filename, repr(err))
 		return None


 # TODO
 # ссылки по словам "глава", "листинг", "раздел"
 # проверить каждую главу с эпиграфом на дублирование 

 if __name__ == '__main__':
 	# Collect the names of the regular files in the current directory.
 	html = [f for f in listdir('.') if isfile(join('.', f))]

 	for x in html:
 		if not isinput(x):
 			continue

 		p = x.split('.')
 		# Pass 1 derived the output name like this:
 		#   filesave = p[0]+'.x'+p[1]
 		# Pass 2 (current) prefixes the name with an underscore.
 		filesave = '_' + p[0] + '.' + p[1]
 		print('Processing file: ', x, ' => ', filesave)

 		with open(x, 'r', encoding='utf8') as chapter:
 			# Pass 1 also flattened the text while reading it:
 			#   chapterlines=chapter.read().replace("\n","").replace(u'\xa0',' ').replace(u'\u2022','**').replace(u'\x98','Z')
 			chapterlines = chapter.read()

 		# The file name is passed as the second argument only so that the
 		# converter can name the file in its error message.
 		# Pass 1 called Expressed(chapterlines, x) here instead.
 		converted = RepairChapter(chapterlines, x)

 		# The converter returns None after reporting a failure. Convert first
 		# and open the output only on success, so a failed chapter neither
 		# raises TypeError in write() nor leaves an empty output file behind.
 		if converted is None:
 			print('Skipping file: ', x)
 			continue

 		with open(filesave, 'wb') as newfile:
 			newfile.write(converted)