Skip to content

Instantly share code, notes, and snippets.

@gartenfeld
Last active August 29, 2015 14:10
Show Gist options
  • Save gartenfeld/687fc4f58a375b2ab340 to your computer and use it in GitHub Desktop.
Save gartenfeld/687fc4f58a375b2ab340 to your computer and use it in GitHub Desktop.
Extracting and re-formatting XML data using BeautifulSoup.
from bs4 import BeautifulSoup
import re # Regular Expressions
import collections # Data Types
import sys # File operations
import codecs # UniCode support
import os
import locale
def _citation_text(tag):
    """Return every text node inside *tag* joined together and stripped."""
    return "".join(tag.find_all(text=True)).strip()


def inspect_file(file_name):
    """Rewrite the citation blocks of one HTML entry file in place.

    The file is parsed and every ``div.citation-block`` inside the first
    ``div.entry`` is replaced by a new ``div.citation-block`` that holds one
    ``div.citation-pair`` per ``span[lang=fi]`` found in the old block. Each
    pair contains the Finnish text (``div.FI-citation``) and the text of the
    element that immediately follows the Finnish span
    (``div.EN-translation``), with surrounding whitespace and one trailing
    comma removed. The file is then overwritten with
    ``file_header + entry + file_footer``.

    Reads the module-level names ``source_path``, ``file_header`` and
    ``file_footer``.

    Args:
        file_name: Name of the HTML file inside ``source_path``.

    Returns:
        None. A file without a ``div.entry`` is reported on stderr and left
        untouched.
    """
    raw_file = os.path.join(source_path, file_name)

    # Read the raw bytes so that BeautifulSoup does the decoding itself and
    # ``from_encoding`` is honoured; the handle is closed as soon as the
    # document has been parsed.
    # NOTE(review): no parser is named, so bs4 picks whichever is installed;
    # pin one explicitly once the intended parser is confirmed.
    with open(raw_file, 'rb') as source:
        raw_soup = BeautifulSoup(source, from_encoding="utf-8")

    entry = raw_soup.find('div', class_="entry")
    if entry is None:
        # Nothing to rewrite; do not overwrite the file with an empty entry.
        sys.stderr.write(
            "No entry found in " + file_name + "; file left unchanged.\n")
        return

    # NOTE(review): a block that contains no Finnish spans is replaced by an
    # empty block -- confirm this is intended before re-running the script on
    # files that were already converted.
    for citation_block in entry.find_all('div', class_="citation-block"):
        new_block = raw_soup.new_tag("div", **{'class': 'citation-block'})

        # Each pair is anchored by its Finnish citation.
        for fi_citation in citation_block.find_all('span', lang="fi"):
            citation_wrapper = raw_soup.new_tag(
                "div", **{'class': 'citation-pair'})

            fi_text = _citation_text(fi_citation)

            # The translation is the element right after the Finnish span; a
            # span without a following element yields an empty translation
            # instead of crashing.
            en_translation = fi_citation.find_next_sibling()
            en_text = "" if en_translation is None else _citation_text(
                en_translation)
            # Drop a single trailing comma and any whitespace it leaves.
            if en_text.endswith(","):
                en_text = en_text[:-1].strip()

            fi_tag = raw_soup.new_tag("div", **{'class': 'FI-citation'})
            fi_tag.string = fi_text
            citation_wrapper.append(fi_tag)

            en_tag = raw_soup.new_tag("div", **{'class': 'EN-translation'})
            en_tag.string = en_text
            citation_wrapper.append(en_tag)

            new_block.append(citation_wrapper)

        # All pairs collected: swap the old block for the rebuilt one.
        citation_block.replace_with(new_block)

    # Write UTF-8 explicitly to match the charset declared in file_header.
    with open(raw_file, 'w', encoding='utf-8') as f_output:
        f_output.write(file_header + str(entry) + file_footer)
    return
def check_all(files_list):
    """Run ``inspect_file`` on each file name, printing periodic progress.

    Args:
        files_list: Sequence of file names to process, in order.

    Returns:
        None.
    """
    total = len(files_list)
    for index, name in enumerate(files_list):
        # Report progress on the first file and then every 6000th file.
        if index % 6000 == 0:
            percent = int(index * 100 / total)
            print("\tProgress: " + str(percent) + "%")
        inspect_file(name)
    return
def load_directory(source_path):
    """Collect the names of all ``.html`` files directly inside a directory.

    Prints the number of files found, using the thousands grouping of the
    ``en_AU`` locale when that locale is available.

    Args:
        source_path: Directory to list.

    Returns:
        A list of file names (not full paths) ending in ``.html``, in the
        order returned by ``os.listdir``.
    """
    files_list = [
        file_name
        for file_name in os.listdir(source_path)
        if file_name.endswith(".html")
    ]

    # The locale is only needed for the thousands separator in the message,
    # so a system without 'en_AU' should not abort the run.
    try:
        locale.setlocale(locale.LC_ALL, 'en_AU')
    except locale.Error:
        sys.stderr.write(
            "Locale 'en_AU' is unavailable; using the current locale.\n")

    # locale.format() was removed in Python 3.12; format_string() is the
    # supported equivalent.
    nr_loaded = locale.format_string("%d", len(files_list), grouping=True)
    print(nr_loaded + " files loaded.")
    return files_list
# These settings live at module level (not inside the __main__ guard) because
# inspect_file() reads them as globals; defined here, they also exist when the
# module is imported instead of being run as a script.

# Directory containing the HTML entry files to process.
source_path = "Data-FI-EN/"

# Markup written before the entry in every output file.
file_header = """<html>
<head>
<meta charset="utf-8">
</head>
<body>
"""

# Markup written after the entry in every output file.
file_footer = """
</body>
</html>"""

if __name__ == '__main__':
    print("Loading files...")
    files_list = load_directory(source_path)  # Names of the raw HTML files
    print("Checking all files...")
    check_all(files_list)
    print("Valmis!")  # Finnish for "Done!"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment