gartenfeld · August 29, 2015 14:10
diff --git a/gloss_block_convert.py b/gloss_block_convert.py
 from bs4 import BeautifulSoup
 from bs4.element import Tag
 import re # Regular Expressions
 import collections # Data Types
 import sys # File operations
 import codecs # UniCode support
 import os
 import locale

 def is_tag(tag):
 	"""Tell whether ``tag`` is an instance of BeautifulSoup's ``Tag`` class."""
 	node_is_tag = isinstance(tag, Tag)
 	return node_is_tag

 def _joined_text(node):
 	"""Return every text node found under ``node`` joined into one string."""
 	return "".join(node.find_all(text=True))

 def _convert_gloss_block(soup, gloss_block):
 	"""Build the structured replacement for one legacy gloss-block <div>.

 	The new block holds, in order: an optional sense-number, an optional
 	lexical-class, a summary-gloss wrapper with one en-gloss per English
 	span, and a full-gloss made of the text left in ``gloss_block``.
 	``gloss_block`` is modified along the way: the extracted sense number
 	and lexical class are removed from it and its <font> texts are rewritten.

 	``soup`` is only used as a factory for the new tags.
 	"""
 	new_block = soup.new_tag("div", **{'class': 'gloss-block'})

 	# Check for Sense Number: a leading <b> whose text is all digits.
 	# An empty block has no first element at all.
 	first_element = gloss_block.contents[0] if gloss_block.contents else None
 	remove_sn = False
 	if is_tag(first_element) and first_element.name == 'b':
 		sn_text = first_element.string
 		# isdigit has to be *called* (the bare method is always truthy),
 		# and .string is None when the <b> has mixed content.
 		if sn_text is not None and sn_text.strip().isdigit():
 			sense_number = soup.new_tag("div", **{'class': 'sense-number'})
 			sense_number.string = str(sn_text)
 			new_block.append(sense_number)
 			remove_sn = True

 	# Check for Lexical Class: the first <i>, when preceded by <b> or <font>
 	first_i = gloss_block.find('i')
 	if first_i is not None:
 		sib = first_i.find_previous_sibling()
 		if sib is not None and sib.name in ('b', 'font'):
 			lex_class = soup.new_tag("div", **{'class': 'lexical-class'})
 			# Joined text rather than .string, which is None for mixed content
 			lex_class.string = _joined_text(first_i)
 			new_block.append(lex_class)
 			first_i.decompose()

 	# Only now can sense-number be removed: the lexical-class check above
 	# needs it in place as a possible previous sibling of the <i>.
 	if remove_sn:
 		first_element.decompose()

 	# Enclose all domains with a symbol
 	for domain_tag in gloss_block.find_all('font'):
 		domain_texts = domain_tag.find_all(text=True)
 		if not domain_texts:
 			# Nothing to enclose in an empty <font>
 			continue
 		domain_tag.string = "⋅" + "".join(domain_texts).strip() + "⋅"

 	# Extract Summary Gloss: one en-gloss per English span
 	summary_wrapper = soup.new_tag("div", **{'class': 'summary-gloss'})
 	for EN_span in gloss_block.find_all('span', lang="en"):
 		EN_string = _joined_text(EN_span).strip()
 		if EN_string.endswith(","):
 			EN_string = EN_string[:-1]
 		# Exclude redirecting links
 		if EN_string not in ('ks', 'ks.'):
 			en_gloss = soup.new_tag("div", **{'class': 'en-gloss'})
 			en_gloss.string = EN_string
 			summary_wrapper.append(en_gloss)
 	new_block.append(summary_wrapper)

 	# Concatenate the remaining text into full-gloss, collapsing runs of spaces
 	full_gloss = soup.new_tag("div", **{'class': 'full-gloss'})
 	full_gloss.string = re.sub(r' +', ' ', _joined_text(gloss_block).strip())
 	new_block.append(full_gloss)

 	return new_block

 def inspect_file(file_name):
 	"""Rewrite every gloss-block of one entry file into the structured layout.

 	Reads ``file_name`` from the module-level ``source_path``, converts each
 	<div class="gloss-block"> inside the first <div class="entry">, and
 	overwrites the file with ``file_header`` + entry + ``file_footer``.
 	A file without an entry div is reported on stderr and left untouched.
 	"""
 	raw_file = os.path.join(source_path, file_name)

 	# Decode explicitly as UTF-8 instead of relying on the locale default,
 	# and close the handle as soon as the soup is built.
 	with open(raw_file, encoding="utf-8") as f_input:
 		raw_soup = BeautifulSoup(f_input)

 	entry = raw_soup.find('div', class_="entry")
 	if entry is None:
 		sys.stderr.write("No entry found in " + raw_file + ".\n")
 		return

 	for gloss_block in entry.find_all('div', class_="gloss-block"):
 		gloss_block.replace_with(_convert_gloss_block(raw_soup, gloss_block))

 	# Write entry into file, in the charset that file_header declares
 	with open(raw_file, 'w', encoding="utf-8") as f_output:
 		f_output.write(file_header + str(entry) + file_footer)

 def check_all(files_list):
 	"""Run inspect_file on every name in ``files_list``.

 	A percentage progress line is printed for every 5000th file,
 	starting with the very first one.
 	"""
 	total = len(files_list)
 	for position, current_name in enumerate(files_list):
 		at_checkpoint = position % 5000 == 0
 		if at_checkpoint:
 			percent = int(position * 100 / total)
 			print("Progress: " + str(percent) + "%")
 		inspect_file(current_name)

 def load_directory(source_path):
 	"""Return the names of the ``.html`` files directly inside ``source_path``.

 	The result holds bare file names (as yielded by ``os.listdir``), not
 	full paths.  The number of files found is printed with thousands
 	grouping.  The process locale is switched to ``en_AU`` when that
 	locale is installed; otherwise the count is grouped with commas
 	without touching the locale.
 	"""
 	files_list = [
 		file_name
 		for file_name in os.listdir(source_path)
 		if file_name.endswith(".html")
 	]

 	count = len(files_list)
 	try:
 		locale.setlocale(locale.LC_ALL, 'en_AU')
 		# locale.format was removed in Python 3.12; format_string is the
 		# long-standing equivalent.
 		nr_loaded = locale.format_string("%d", count, grouping=True)
 	except locale.Error:
 		# en_AU is not installed on this system
 		nr_loaded = format(count, ",")
 	print(nr_loaded + " files loaded.")
 	return files_list

 # Script entry point: convert, in place, every .html file found in source_path.
 if __name__ == '__main__':

 	# Directory holding the entry files; inspect_file prepends it to each file name.
 	source_path = "Data-FI-EN/"
 	# Markup that inspect_file writes before the converted entry.
 	file_header = """<html>
 <head>
 <meta charset="utf-8">
 </head>
 <body>
 """
 	# Markup that inspect_file writes after the converted entry.
 	file_footer ="""
 </body>
 </html>"""
 	print("Loading files...")
 	files_list = load_directory(source_path) # Load list of raw files
 	
 	print("Checking all files...")
 	check_all(files_list)
 	
 	# "Valmis!" is Finnish for "Done!"
 	print("Valmis!")
	from bs4 import BeautifulSoup
	from bs4.element import Tag
	import re # Regular Expressions
	import collections # Data Types
	import sys # File operations
	import codecs # UniCode support
	import os
	import locale

	def is_tag(tag):
		"""Tell whether ``tag`` is an instance of BeautifulSoup's ``Tag`` class."""
		return isinstance(tag, Tag)

	def _joined_text(node):
		"""Return every text node found under ``node`` joined into one string."""
		return "".join(node.find_all(text=True))

	def _convert_gloss_block(soup, gloss_block):
		"""Build the structured replacement for one legacy gloss-block <div>.

		The new block holds, in order: an optional sense-number, an optional
		lexical-class, a summary-gloss wrapper with one en-gloss per English
		span, and a full-gloss made of the text left in ``gloss_block``.
		``gloss_block`` is modified along the way: the extracted sense number
		and lexical class are removed from it and its <font> texts are rewritten.

		``soup`` is only used as a factory for the new tags.
		"""
		new_block = soup.new_tag("div", **{'class': 'gloss-block'})

		# Check for Sense Number: a leading <b> whose text is all digits.
		# An empty block has no first element at all.
		first_element = gloss_block.contents[0] if gloss_block.contents else None
		remove_sn = False
		if is_tag(first_element) and first_element.name == 'b':
			sn_text = first_element.string
			# isdigit has to be *called* (the bare method is always truthy),
			# and .string is None when the <b> has mixed content.
			if sn_text is not None and sn_text.strip().isdigit():
				sense_number = soup.new_tag("div", **{'class': 'sense-number'})
				sense_number.string = str(sn_text)
				new_block.append(sense_number)
				remove_sn = True

		# Check for Lexical Class: the first <i>, when preceded by <b> or <font>
		first_i = gloss_block.find('i')
		if first_i is not None:
			sib = first_i.find_previous_sibling()
			if sib is not None and sib.name in ('b', 'font'):
				lex_class = soup.new_tag("div", **{'class': 'lexical-class'})
				# Joined text rather than .string, which is None for mixed content
				lex_class.string = _joined_text(first_i)
				new_block.append(lex_class)
				first_i.decompose()

		# Only now can sense-number be removed: the lexical-class check above
		# needs it in place as a possible previous sibling of the <i>.
		if remove_sn:
			first_element.decompose()

		# Enclose all domains with a symbol
		for domain_tag in gloss_block.find_all('font'):
			domain_texts = domain_tag.find_all(text=True)
			if not domain_texts:
				# Nothing to enclose in an empty <font>
				continue
			domain_tag.string = "⋅" + "".join(domain_texts).strip() + "⋅"

		# Extract Summary Gloss: one en-gloss per English span
		summary_wrapper = soup.new_tag("div", **{'class': 'summary-gloss'})
		for EN_span in gloss_block.find_all('span', lang="en"):
			EN_string = _joined_text(EN_span).strip()
			if EN_string.endswith(","):
				EN_string = EN_string[:-1]
			# Exclude redirecting links
			if EN_string not in ('ks', 'ks.'):
				en_gloss = soup.new_tag("div", **{'class': 'en-gloss'})
				en_gloss.string = EN_string
				summary_wrapper.append(en_gloss)
		new_block.append(summary_wrapper)

		# Concatenate the remaining text into full-gloss, collapsing runs of spaces
		full_gloss = soup.new_tag("div", **{'class': 'full-gloss'})
		full_gloss.string = re.sub(r' +', ' ', _joined_text(gloss_block).strip())
		new_block.append(full_gloss)

		return new_block

	def inspect_file(file_name):
		"""Rewrite every gloss-block of one entry file into the structured layout.

		Reads ``file_name`` from the module-level ``source_path``, converts each
		<div class="gloss-block"> inside the first <div class="entry">, and
		overwrites the file with ``file_header`` + entry + ``file_footer``.
		A file without an entry div is reported on stderr and left untouched.
		"""
		raw_file = os.path.join(source_path, file_name)

		# Decode explicitly as UTF-8 instead of relying on the locale default,
		# and close the handle as soon as the soup is built.
		with open(raw_file, encoding="utf-8") as f_input:
			raw_soup = BeautifulSoup(f_input)

		entry = raw_soup.find('div', class_="entry")
		if entry is None:
			sys.stderr.write("No entry found in " + raw_file + ".\n")
			return

		for gloss_block in entry.find_all('div', class_="gloss-block"):
			gloss_block.replace_with(_convert_gloss_block(raw_soup, gloss_block))

		# Write entry into file, in the charset that file_header declares
		with open(raw_file, 'w', encoding="utf-8") as f_output:
			f_output.write(file_header + str(entry) + file_footer)

	def check_all(files_list):
		"""Run inspect_file on every name in ``files_list``.

		A percentage progress line is printed for every 5000th file,
		starting with the very first one.
		"""
		for i, file_name in enumerate(files_list):
			if i % 5000 == 0:
				print("Progress: " + str(int(i * 100 / len(files_list))) + "%")
			inspect_file(file_name)

		return

	def load_directory(source_path):
		"""Return the names of the ``.html`` files directly inside ``source_path``.

		The result holds bare file names (as yielded by ``os.listdir``), not
		full paths.  The number of files found is printed with thousands
		grouping.  The process locale is switched to ``en_AU`` when that
		locale is installed; otherwise the count is grouped with commas
		without touching the locale.
		"""
		files_list = [
			file_name
			for file_name in os.listdir(source_path)
			if file_name.endswith(".html")
		]

		count = len(files_list)
		try:
			locale.setlocale(locale.LC_ALL, 'en_AU')
			# locale.format was removed in Python 3.12; format_string is the
			# long-standing equivalent.
			nr_loaded = locale.format_string("%d", count, grouping=True)
		except locale.Error:
			# en_AU is not installed on this system
			nr_loaded = format(count, ",")
		print(nr_loaded + " files loaded.")
		return files_list

	# Script entry point: convert, in place, every .html file found in source_path.
	if __name__ == '__main__':

		# Directory holding the entry files; inspect_file prepends it to each file name.
		source_path = "Data-FI-EN/"
		# Markup that inspect_file writes before the converted entry.
		file_header = """<html>
	<head>
	<meta charset="utf-8">
	</head>
	<body>
	"""
		# Markup that inspect_file writes after the converted entry.
		file_footer ="""
	</body>
	</html>"""
		print("Loading files...")
		files_list = load_directory(source_path) # Load list of raw files

		print("Checking all files...")
		check_all(files_list)

		# "Valmis!" is Finnish for "Done!"
		print("Valmis!")