Last active
August 29, 2015 14:10
-
-
Save gartenfeld/687fc4f58a375b2ab340 to your computer and use it in GitHub Desktop.
Extracting and re-formatting XML data using BeautifulSoup.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bs4 import BeautifulSoup | |
import re # Regular Expressions | |
import collections # Data Types | |
import sys # File operations | |
import codecs # UniCode support | |
import os | |
import locale | |
def _clean_text(tag):
    """Return every text node inside *tag* joined and stripped ('' if tag is None)."""
    if tag is None:
        return ""
    return "".join(tag.find_all(text=True)).strip()


def inspect_file(file_name):
    """Rewrite the citation blocks of one HTML file in place.

    The file is read from ``source_path + file_name`` (``source_path``,
    ``file_header`` and ``file_footer`` are module-level globals set by the
    script entry point).  Inside the first ``<div class="entry">`` every
    ``<div class="citation-block">`` is replaced by a new block holding one
    ``<div class="citation-pair">`` per Finnish ``<span lang="fi">``; each
    pair contains a ``FI-citation`` div and an ``EN-translation`` div whose
    text comes from the element immediately following the Finnish span.
    The entry is then written back over the original file, wrapped in
    ``file_header`` / ``file_footer``.

    Args:
        file_name: Name of the file, relative to ``source_path``.

    Returns:
        None.  A file without an entry div is reported on stderr and left
        untouched.
    """
    raw_file = source_path + file_name

    # Read as bytes so that from_encoding="utf-8" actually governs decoding
    # (it is ignored for already-decoded text), and close the handle promptly.
    # No parser is named here, as in the original, so bs4 selects one itself.
    with open(raw_file, "rb") as f_input:
        raw_soup = BeautifulSoup(f_input, from_encoding="utf-8")

    entry = raw_soup.find('div', class_="entry")
    if entry is None:
        # Nothing to rewrite; overwriting the file would destroy its content.
        sys.stderr.write("No entry found in " + file_name + "; file left unchanged.\n")
        return

    # find_all returns a static list, so replacing blocks while looping is safe.
    for citation_block in entry.find_all('div', class_="citation-block"):
        # New wrapper that will take the place of the old citation block
        new_block = raw_soup.new_tag("div", **{'class': 'citation-block'})

        # Each citation pair is anchored by its Finnish sentence
        for fi_citation in citation_block.find_all('span', lang="fi"):
            citation_wrapper = raw_soup.new_tag("div", **{'class': 'citation-pair'})

            fi_text = _clean_text(fi_citation)
            # The translation is the immediate next sibling element; when it is
            # missing the pair is kept with an empty translation.
            en_text = _clean_text(fi_citation.find_next_sibling())
            # Drop a single trailing comma left over from the source markup
            if en_text.endswith(","):
                en_text = en_text[:-1].strip()

            fi_tag = raw_soup.new_tag("div", **{'class': 'FI-citation'})
            fi_tag.string = fi_text
            citation_wrapper.append(fi_tag)

            en_tag = raw_soup.new_tag("div", **{'class': 'EN-translation'})
            en_tag.string = en_text
            citation_wrapper.append(en_tag)

            new_block.append(citation_wrapper)

        # All pairs collected: swap the old block for the rebuilt one
        citation_block.replace_with(new_block)

    # Save changes; the header declares utf-8, so write utf-8 explicitly.
    with open(raw_file, 'w', encoding="utf-8") as f_output:
        f_output.write(file_header + str(entry) + file_footer)
def check_all(files_list, progress_every=6000):
    """Run ``inspect_file`` on every file name in *files_list*.

    Args:
        files_list: Sequence of file names to process, in order.
        progress_every: Print a progress percentage each time this many
            files have been started (the first file always reports 0%).
            A value of 0 or None disables progress output.

    Returns:
        None.
    """
    total = len(files_list)  # loop-invariant, computed once
    for i, file_name in enumerate(files_list):
        if progress_every and i % progress_every == 0:
            # Integer arithmetic gives the same truncated percentage
            print("\tProgress: " + str(i * 100 // total) + "%")
        inspect_file(file_name)
def load_directory(source_path):
    """List the ``.html`` files directly inside *source_path*.

    Prints how many files were found, using thousands grouping, and sets the
    process locale to ``en_AU`` when that locale is available.

    Args:
        source_path: Directory to scan (not recursive).

    Returns:
        A list of file names (without the directory part), in the order
        ``os.listdir`` yields them.

    Raises:
        OSError: If *source_path* cannot be listed.
    """
    files_list = [
        file_name
        for file_name in os.listdir(source_path)
        if file_name.endswith(".html")
    ]

    try:
        locale.setlocale(locale.LC_ALL, 'en_AU')
        # locale.format was removed in Python 3.12; format_string replaces it
        nr_loaded = locale.format_string("%d", len(files_list), grouping=True)
    except locale.Error:
        # Requested locale is not installed: the count is only cosmetic, so
        # fall back to plain comma grouping instead of aborting the run.
        nr_loaded = "{:,}".format(len(files_list))

    print(nr_loaded + " files loaded.")
    return files_list
if __name__ == '__main__':
    # Markup written before and after each rewritten entry; both names are
    # read as module-level globals by inspect_file.
    file_header = """<html>
<head>
<meta charset="utf-8">
</head>
<body>
"""
    file_footer ="""
</body>
</html>"""

    # Directory holding the HTML files; the trailing slash matters because
    # inspect_file builds paths by plain string concatenation.
    source_path = "Data-FI-EN/"

    # Step 1: collect the names of the raw files
    print("Loading files...")
    files_list = load_directory(source_path)

    # Step 2: rewrite every collected file in place
    print("Checking all files...")
    check_all(files_list)

    print("Valmis!")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment