gartenfeld · August 29, 2015 14:10
diff --git a/citation_block_convert.py b/citation_block_convert.py
 from bs4 import BeautifulSoup
 import re # Regular Expressions
 import collections # Data Types
 import sys # File operations
 import codecs # UniCode support
 import os
 import locale

 def inspect_file(file_name):
 	"""Rewrite the citation blocks of one HTML file as FI/EN pairs, in place.

 	The file at ``source_path + file_name`` is parsed, and every
 	``div.citation-block`` inside the first ``div.entry`` is rebuilt as a
 	sequence of ``div.citation-pair`` elements, each holding one
 	``div.FI-citation`` and one ``div.EN-translation``. The file is then
 	overwritten with ``file_header + entry + file_footer``.

 	Reads the module-level names ``source_path``, ``file_header`` and
 	``file_footer``.

 	Args:
 		file_name: Name of the file, appended directly to ``source_path``.

 	Returns:
 		None. If the file has no ``div.entry`` it is left unchanged and a
 		message is written to stderr.
 	"""

 	raw_file = source_path + file_name

 	# Read bytes so that from_encoding really controls the decoding (it is
 	# ignored for already-decoded text), and close the handle right away.
 	with open(raw_file, 'rb') as f_input:
 		raw_soup = BeautifulSoup(f_input, from_encoding="utf-8")

 	entry = raw_soup.find('div', class_="entry")
 	if entry is None:
 		# Nothing to convert: skip instead of dying with AttributeError
 		sys.stderr.write("No div.entry found in " + file_name + "; file left unchanged.\n")
 		return

 	# Iterate through each citation block
 	for citation_block in entry.find_all('div', class_="citation-block"):

 		# Find all Finnish sentences
 		fi_citations = citation_block.find_all('span', lang="fi")
 		if not fi_citations:
 			# No Finnish spans (e.g. the block was already converted by an
 			# earlier run): keep it rather than replacing it with an empty div.
 			continue

 		# Make new citation-block wrapper; the parsed soup is reused as the
 		# tag factory instead of building a throwaway soup per tag.
 		new_block = raw_soup.new_tag("div", **{'class':'citation-block'})

 		# For each citation pair, anchored by the Finnish citation
 		for fi_citation in fi_citations:

 			# Clean up Finnish text
 			fi_text = "".join(fi_citation.find_all(text=True)).strip()

 			# The English translation is the immediately following sibling tag
 			en_translation = fi_citation.find_next_sibling()
 			if en_translation is None:
 				# Finnish citation with nothing after it: empty translation
 				en_text = ""
 			else:
 				en_text = "".join(en_translation.find_all(text=True)).strip()
 				# Strip a single right-most comma
 				if en_text.endswith(","):
 					en_text = en_text[:-1]
 				en_text = en_text.strip()

 			# Build the pair: FI tag first, then EN tag
 			citation_wrapper = raw_soup.new_tag("div", **{'class':'citation-pair'})

 			fi_tag = raw_soup.new_tag("div", **{'class':'FI-citation'})
 			fi_tag.string = fi_text
 			citation_wrapper.append(fi_tag)

 			en_tag = raw_soup.new_tag("div", **{'class':'EN-translation'})
 			en_tag.string = en_text
 			citation_wrapper.append(en_tag)

 			# Add citation pair to new block
 			new_block.append(citation_wrapper)

 		# After iterating through FI SPANs, replace old block
 		citation_block.replace_with(new_block)

 	# Save changes; explicit UTF-8 matches the charset declared in file_header
 	with open(raw_file, 'w', encoding="utf-8") as f_output:
 		f_output.write(file_header + str(entry) + file_footer)

 	return

 def check_all(files_list, progress_every=6000):
 	"""Run ``inspect_file`` on every file name, printing periodic progress.

 	Args:
 		files_list: Sequence of file names, passed one at a time to
 			``inspect_file``.
 		progress_every: A progress line is printed whenever the index of
 			the file about to be processed is a multiple of this value.
 			Defaults to 6000, the interval that used to be hard-coded.

 	Returns:
 		None.
 	"""
 	total = len(files_list)
 	for i, file_name in enumerate(files_list):
 		if i % progress_every == 0:
 			# Integer division gives the same whole percentage without a float round-trip
 			print("\tProgress: " + str(i * 100 // total) + "%")
 		inspect_file(file_name)
 	return

 def load_directory(source_path):
 	"""Return the names of the ``.html`` files directly inside *source_path*.

 	Also prints the number of files found, with thousands grouping.

 	Args:
 		source_path: Directory to scan; passed to ``os.listdir``.

 	Returns:
 		List of file names (not full paths) ending in ``.html``, in the
 		order ``os.listdir`` yields them.
 	"""
 	# str.endswith cannot raise IndexError, so the former try/except was dead code
 	files_list = [
 		file_name for file_name in os.listdir(source_path)
 		if file_name.endswith(".html")
 	]
 	try:
 		locale.setlocale(locale.LC_ALL, 'en_AU')
 		# locale.format() was removed in Python 3.12; format_string() replaces it
 		nr_loaded = locale.format_string("%d", len(files_list), grouping=True)
 	except locale.Error:
 		# The en_AU locale is not installed everywhere; fall back to plain
 		# comma grouping instead of aborting the whole run.
 		nr_loaded = format(len(files_list), ",")
 	print(nr_loaded + " files loaded.")
 	return files_list

 # Script entry point: define the module-level settings that inspect_file
 # reads, then convert every HTML file in the source directory.
 if __name__ == '__main__':

 	# Directory holding the HTML files. inspect_file joins it to the file
 	# name by plain concatenation, so the trailing slash is required.
 	source_path = "Data-FI-EN/"
 	# Boilerplate that inspect_file writes before the extracted entry
 	file_header = """<html>
 <head>
 <meta charset="utf-8">
 </head>
 <body>
 """
 	# Boilerplate that inspect_file writes after the extracted entry
 	file_footer ="""
 </body>
 </html>"""
 	print("Loading files...")
 	files_list = load_directory(source_path) # Load list of raw files
 	
 	# Rewrites each listed file in place
 	print("Checking all files...")
 	check_all(files_list)

 	print("Valmis!")
	from bs4 import BeautifulSoup
	import re # Regular Expressions
	import collections # Data Types
	import sys # File operations
	import codecs # UniCode support
	import os
	import locale

	def inspect_file(file_name):
		"""Rewrite the citation blocks of one HTML file as FI/EN pairs, in place.

		The file at ``source_path + file_name`` is parsed, and every
		``div.citation-block`` inside the first ``div.entry`` is rebuilt as a
		sequence of ``div.citation-pair`` elements, each holding one
		``div.FI-citation`` and one ``div.EN-translation``. The file is then
		overwritten with ``file_header + entry + file_footer``.

		Reads the module-level names ``source_path``, ``file_header`` and
		``file_footer``.

		Args:
			file_name: Name of the file, appended directly to ``source_path``.

		Returns:
			None. If the file has no ``div.entry`` it is left unchanged and a
			message is written to stderr.
		"""

		raw_file = source_path + file_name

		# Read bytes so that from_encoding really controls the decoding (it is
		# ignored for already-decoded text), and close the handle right away.
		with open(raw_file, 'rb') as f_input:
			raw_soup = BeautifulSoup(f_input, from_encoding="utf-8")

		entry = raw_soup.find('div', class_="entry")
		if entry is None:
			# Nothing to convert: skip instead of dying with AttributeError
			sys.stderr.write("No div.entry found in " + file_name + "; file left unchanged.\n")
			return

		# Iterate through each citation block
		for citation_block in entry.find_all('div', class_="citation-block"):

			# Find all Finnish sentences
			fi_citations = citation_block.find_all('span', lang="fi")
			if not fi_citations:
				# No Finnish spans (e.g. the block was already converted by an
				# earlier run): keep it rather than replacing it with an empty div.
				continue

			# Make new citation-block wrapper; the parsed soup is reused as the
			# tag factory instead of building a throwaway soup per tag.
			new_block = raw_soup.new_tag("div", **{'class':'citation-block'})

			# For each citation pair, anchored by the Finnish citation
			for fi_citation in fi_citations:

				# Clean up Finnish text
				fi_text = "".join(fi_citation.find_all(text=True)).strip()

				# The English translation is the immediately following sibling tag
				en_translation = fi_citation.find_next_sibling()
				if en_translation is None:
					# Finnish citation with nothing after it: empty translation
					en_text = ""
				else:
					en_text = "".join(en_translation.find_all(text=True)).strip()
					# Strip a single right-most comma
					if en_text.endswith(","):
						en_text = en_text[:-1]
					en_text = en_text.strip()

				# Build the pair: FI tag first, then EN tag
				citation_wrapper = raw_soup.new_tag("div", **{'class':'citation-pair'})

				fi_tag = raw_soup.new_tag("div", **{'class':'FI-citation'})
				fi_tag.string = fi_text
				citation_wrapper.append(fi_tag)

				en_tag = raw_soup.new_tag("div", **{'class':'EN-translation'})
				en_tag.string = en_text
				citation_wrapper.append(en_tag)

				# Add citation pair to new block
				new_block.append(citation_wrapper)

			# After iterating through FI SPANs, replace old block
			citation_block.replace_with(new_block)

		# Save changes; explicit UTF-8 matches the charset declared in file_header
		with open(raw_file, 'w', encoding="utf-8") as f_output:
			f_output.write(file_header + str(entry) + file_footer)

		return

	def check_all(files_list, progress_every=6000):
		"""Run ``inspect_file`` on every file name, printing periodic progress.

		Args:
			files_list: Sequence of file names, passed one at a time to
				``inspect_file``.
			progress_every: A progress line is printed whenever the index of
				the file about to be processed is a multiple of this value.
				Defaults to 6000, the interval that used to be hard-coded.

		Returns:
			None.
		"""
		total = len(files_list)
		for i, file_name in enumerate(files_list):
			if i % progress_every == 0:
				# Integer division gives the same whole percentage without a float round-trip
				print("\tProgress: " + str(i * 100 // total) + "%")
			inspect_file(file_name)
		return

	def load_directory(source_path):
		"""Return the names of the ``.html`` files directly inside *source_path*.

		Also prints the number of files found, with thousands grouping.

		Args:
			source_path: Directory to scan; passed to ``os.listdir``.

		Returns:
			List of file names (not full paths) ending in ``.html``, in the
			order ``os.listdir`` yields them.
		"""
		# str.endswith cannot raise IndexError, so the former try/except was dead code
		files_list = [
			file_name for file_name in os.listdir(source_path)
			if file_name.endswith(".html")
		]
		try:
			locale.setlocale(locale.LC_ALL, 'en_AU')
			# locale.format() was removed in Python 3.12; format_string() replaces it
			nr_loaded = locale.format_string("%d", len(files_list), grouping=True)
		except locale.Error:
			# The en_AU locale is not installed everywhere; fall back to plain
			# comma grouping instead of aborting the whole run.
			nr_loaded = format(len(files_list), ",")
		print(nr_loaded + " files loaded.")
		return files_list

	# Script entry point: define the module-level settings that inspect_file
	# reads, then convert every HTML file in the source directory.
	if __name__ == '__main__':

		# Directory holding the HTML files. inspect_file joins it to the file
		# name by plain concatenation, so the trailing slash is required.
		source_path = "Data-FI-EN/"
		# Boilerplate that inspect_file writes before the extracted entry
		file_header = """<html>
	<head>
	<meta charset="utf-8">
	</head>
	<body>
	"""
		# Boilerplate that inspect_file writes after the extracted entry
		file_footer ="""
	</body>
	</html>"""
		print("Loading files...")
		files_list = load_directory(source_path) # Load list of raw files

		# Rewrites each listed file in place
		print("Checking all files...")
		check_all(files_list)

		print("Valmis!")