Last active
August 29, 2015 14:10
-
-
Save gartenfeld/74f9cfdbc8faf1d98537 to your computer and use it in GitHub Desktop.
Making XML more semantic using BeautifulSoup.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bs4 import BeautifulSoup | |
from bs4.element import Tag | |
import re # Regular Expressions | |
import collections # Data Types | |
import sys # File operations | |
import codecs # UniCode support | |
import os | |
import locale | |
def is_tag(tag):
    """Return True when *tag* is a BeautifulSoup ``Tag`` instance, else False."""
    tag_check = isinstance(tag, Tag)
    return tag_check
def _new_div(soup, css_class):
    """Create a detached ``<div>`` carrying *css_class*, using *soup* as the tag factory."""
    # "class" is a Python keyword, so it has to be passed through dict unpacking.
    return soup.new_tag("div", **{"class": css_class})


def inspect_file(file_name):
    """Rewrite one entry file in place, replacing each gloss block with semantic markup.

    The file ``source_path + file_name`` is parsed, and every
    ``<div class="gloss-block">`` inside ``<div class="entry">`` is replaced
    by a new gloss-block that contains, in order:

    * ``sense-number``  - only when the block starts with a ``<b>`` holding digits;
    * ``lexical-class`` - only when the first ``<i>`` directly follows a
      ``<b>`` or ``<font>`` sibling;
    * ``summary-gloss`` - one ``en-gloss`` div per ``<span lang="en">``
      (redirect markers ``ks`` / ``ks.`` are skipped);
    * ``full-gloss``    - all remaining text, with runs of spaces collapsed.

    The entry is then written back to the same path, wrapped in the
    module-level ``file_header`` and ``file_footer`` strings.

    Args:
        file_name: Name of the file, relative to the module-level ``source_path``.

    Returns:
        None. A file without a ``div.entry`` is reported on stderr and left
        untouched.
    """
    raw_file = os.path.join(source_path, file_name)

    # Binary mode is what makes from_encoding take effect (it is ignored for
    # already-decoded text), and the context manager closes the handle.
    with open(raw_file, "rb") as f_input:
        raw_soup = BeautifulSoup(f_input, from_encoding="utf-8")

    entry = raw_soup.find('div', class_="entry")
    if entry is None:
        sys.stderr.write("No entry found in " + file_name + "; file left unchanged.\n")
        return

    for gloss_block in entry.find_all('div', class_="gloss-block"):
        # Make new gloss-block wrapper
        new_block = _new_div(raw_soup, 'gloss-block')

        # Check for Sense Number: a leading <b> whose text is purely digits.
        first_element = gloss_block.contents[0] if gloss_block.contents else None
        remove_sn = False
        if is_tag(first_element) and first_element.name == 'b':
            number_text = first_element.string  # None when <b> has several children
            if number_text is not None and number_text.strip().isdigit():
                sense_number = _new_div(raw_soup, 'sense-number')
                sense_number.string = number_text
                new_block.append(sense_number)
                remove_sn = True

        # Check for Lexical Class: the first <i>, if it follows a <b> or <font>.
        first_i = gloss_block.find('i')
        if first_i is not None:
            sib = first_i.find_previous_sibling()
            if sib is not None and sib.name in ('b', 'font'):
                lex_class = _new_div(raw_soup, 'lexical-class')
                # get_text() also copes with an <i> that has nested children,
                # where .string would be None.
                lex_class.string = first_i.get_text()
                new_block.append(lex_class)
                first_i.decompose()

        # Only now can sense-number be removed: the sibling check above may
        # need the leading <b> to still be in the tree.
        if remove_sn:
            first_element.decompose()

        # Enclose all domains with a symbol
        for domain_tag in gloss_block.find_all('font'):
            domain_tag.string = "⋅" + domain_tag.get_text().strip() + "⋅"

        # Extract Summary Gloss
        summary_wrapper = _new_div(raw_soup, 'summary-gloss')
        for EN_span in gloss_block.find_all('span', lang="en"):
            EN_string = "".join(EN_span.find_all(text=True)).strip()
            if EN_string[-1:] == ",":
                EN_string = EN_string[:-1]
            # Exclude redirecting links
            if EN_string not in ('ks', 'ks.'):
                en_gloss = _new_div(raw_soup, 'en-gloss')
                en_gloss.string = EN_string
                summary_wrapper.append(en_gloss)
        new_block.append(summary_wrapper)

        # Concatenate the remaining text into full-gloss
        full_gloss = _new_div(raw_soup, 'full-gloss')
        full_string = "".join(gloss_block.find_all(text=True)).strip()
        # Remove redundant spaces
        full_gloss.string = re.sub(r' +', ' ', full_string)
        new_block.append(full_gloss)

        # Replace the block
        gloss_block.replace_with(new_block)

    # Write entry into file. The header declares charset utf-8, so the bytes
    # on disk must be UTF-8 regardless of the platform default encoding.
    with open(raw_file, 'w', encoding='utf-8') as f_output:
        f_output.write(file_header + str(entry) + file_footer)
def check_all(files_list):
    """Run ``inspect_file`` on each name in *files_list*.

    A progress percentage is printed before every 5000th file, starting
    with the first one.
    """
    for position, current_name in enumerate(files_list):
        at_checkpoint = (position % 5000 == 0)
        if at_checkpoint:
            percent_done = int(position * 100 / len(files_list))
            print("Progress: " + str(percent_done) + "%")
        inspect_file(current_name)
def load_directory(source_path):
    """Return the names of the ``.html`` files found directly in *source_path*.

    Also prints how many files were found, with thousands grouping.

    Args:
        source_path: Directory to list.

    Returns:
        A list of file names (not full paths), in ``os.listdir`` order.

    Raises:
        OSError: If *source_path* cannot be listed.
    """
    files_list = [
        file_name
        for file_name in os.listdir(source_path)
        if file_name.endswith(".html")
    ]

    count = len(files_list)
    try:
        locale.setlocale(locale.LC_ALL, 'en_AU')
    except locale.Error:
        # en_AU is not installed on every system; fall back to plain comma
        # grouping, which is what en_AU would produce.
        nr_loaded = "{:,}".format(count)
    else:
        # locale.format() was removed in Python 3.12; format_string() is the
        # supported replacement.
        nr_loaded = locale.format_string("%d", count, grouping=True)

    print(nr_loaded + " files loaded.")
    return files_list
# Settings read as globals by inspect_file(). They are defined at module level
# (not inside the __main__ guard) so that the functions also work when this
# file is imported instead of being run as a script.

# Directory holding the entry files; also the prefix used to build each path.
source_path = "Data-FI-EN/"

# Markup written before the processed entry in every output file.
file_header = """<html>
<head>
<meta charset="utf-8">
</head>
<body>
"""

# Markup written after the processed entry in every output file.
file_footer = """
</body>
</html>"""

if __name__ == '__main__':
    print("Loading files...")
    files_list = load_directory(source_path)  # Load list of raw files
    print("Checking all files...")
    check_all(files_list)
    print("Valmis!")  # Finnish for "Done!"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment