Created
November 1, 2014 11:11
-
-
Save gartenfeld/bdcc93f166dd1fc60dfb to your computer and use it in GitHub Desktop.
Cleaning up and separating lexical data files.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bs4 import BeautifulSoup | |
import re # Regular Expressions | |
import collections # Data Types | |
import sys # File operations | |
import codecs # UniCode support | |
import os | |
def clear_output_file(out_file):
    """Create (or truncate) *out_file* and write the opening HTML boilerplate.

    Any previous content of the file is discarded. The file is written as
    UTF-8 so that it matches the ``charset`` declared in the header itself.

    Args:
        out_file: Path of the output file to (re)initialise.

    Returns:
        None.
    """
    file_header = """<html>
<head>
<meta charset="utf-8">
</head>
<body>
"""
    # The context manager closes the handle even if the write fails.
    with open(out_file, 'w', encoding="utf-8") as f_output:
        f_output.write(file_header)
def add_footer(out_file):
    """Append the closing ``</body></html>`` boilerplate to *out_file*.

    The file is opened in append mode, so existing content is preserved.
    It is written as UTF-8.

    Args:
        out_file: Path of the output file to finish.

    Returns:
        None.
    """
    file_footer = """
</body>
</html>"""
    # The context manager closes the handle even if the write fails.
    with open(out_file, 'a', encoding="utf-8") as f_output:
        f_output.write(file_footer)
def has_lang(tag):
    """Tell whether *tag* carries a ``lang`` attribute.

    Intended to be passed as a filter function to a tag search, which
    calls it once per candidate tag.
    """
    attribute_name = 'lang'
    return tag.has_attr(attribute_name)
def triage_source_files(files_list):
    """Split the entries of each raw HTML file into FI-EN and EN-FI outputs.

    For every file name in *files_list* the raw file is parsed, each
    ``<div class="entry">`` containing a ``<font class="pieniharmaa">``
    marker is dropped as non-relevant, and the remaining entries are routed
    by the first ``lang`` attribute found among their descendants: ``"fi"``
    goes to the FI-EN directory, anything else to the EN-FI directory.
    Each output file keeps the raw file's name and is written once, as
    header + all of its entries + footer.

    Reads the module-level names ``source_path``, ``out_fi_path`` and
    ``out_en_path``, which must be set before calling.

    Args:
        files_list: Iterable of file names located inside ``source_path``.

    Returns:
        None.
    """
    for file_name in files_list:  # For each raw file
        raw_file = os.path.join(source_path, file_name)

        # Binary mode lets BeautifulSoup apply ``from_encoding`` (it is
        # ignored for already-decoded text); ``with`` closes the handle.
        with open(raw_file, 'rb') as raw_handle:
            raw_soup = BeautifulSoup(raw_handle, from_encoding="utf-8")

        # Keep only the entries without the "pieniharmaa" marker. Filtering
        # directly avoids stringifying and re-parsing the result list.
        lexical_entries = [
            entry
            for entry in raw_soup.find_all('div', class_="entry")
            if entry.find('font', class_="pieniharmaa") is None
        ]

        # Group entries by destination so each output file is written once.
        # Re-initialising the file per entry would leave only the last entry.
        entries_by_out_file = {}
        for entry in lexical_entries:
            # Locate the very first 'lang' attribute inside the entry.
            lang_tag = entry.find(has_lang)
            if lang_tag is None:
                # Without a language the entry cannot be routed; report it
                # instead of failing on a None subscript.
                sys.stderr.write(
                    "No 'lang' attribute found in an entry of "
                    + file_name + "; entry skipped.\n"
                )
                continue

            out_path = out_fi_path if lang_tag['lang'] == "fi" else out_en_path
            out_file = os.path.join(out_path, file_name)
            entries_by_out_file.setdefault(out_file, []).append(str(entry))

        # Files with no valid content produce no output at all.
        for out_file, entry_chunks in entries_by_out_file.items():
            clear_output_file(out_file)
            with open(out_file, 'a', encoding="utf-8") as f_output:
                f_output.write("".join(entry_chunks))
            add_footer(out_file)
def load_directory(source_path):
    """Return the names of all ``.html`` files directly inside *source_path*.

    Only the bare file names are returned (no directory prefix), in the
    order ``os.listdir`` yields them. Sub-directories are not descended into.

    Args:
        source_path: Directory to scan.

    Returns:
        A list of file names ending in ``.html``; empty if there are none.

    Raises:
        OSError: If *source_path* cannot be listed.
    """
    # No IndexError handler: neither str.endswith nor list building raises it.
    return [
        file_name
        for file_name in os.listdir(source_path)
        if file_name.endswith(".html")
    ]
if __name__ == '__main__':
    # Directory layout. These stay module-level names because
    # triage_source_files() reads them as globals.
    source_path, out_fi_path, out_en_path = (
        "Files-Raw/",
        "Data-FI-EN/",
        "Data-EN-FI/",
    )

    # Step 1: collect the names of the raw HTML files.
    print("Loading files...")
    files_list = load_directory(source_path)

    # Step 2: separate the FI-EN entries from the EN-FI entries.
    print("Triaging files...")
    triage_source_files(files_list)

    # "Valmis" is Finnish for "done".
    print("Valmis!")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment