Created
November 23, 2023 16:07
-
-
Save alix-tz/64677e95bb5944e9ce71a1d69ed98941 to your computer and use it in GitHub Desktop.
Python script to explore the lexical variety of the French-language "Recensement du Valais" dataset
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import unicodedata | |
from collections import Counter | |
from spacy.lang.fr import French | |
from tqdm import tqdm | |
import pandas as pd | |
import lxml.etree as ET | |
def check_paths(list_of_paths):
    """Report (via print) every path in *list_of_paths* that does not exist on disk."""
    missing = [p for p in list_of_paths if not os.path.exists(p)]
    for p in missing:
        # Same message format as the rest of the script's console output.
        print("ERROR: ", p, " does not exist")
def load_alto_xml(path: str):
    """Parse an ALTO XML file with an explicit UTF-8 parser.

    Args:
        path: Filesystem path to the XML file.

    Returns:
        The parsed ``lxml.etree`` element tree, or ``None`` when the file
        cannot be read or parsed. (The original returned ``False``, which
        contradicted its ``-> ET.ElementTree`` annotation; ``None`` is the
        conventional "no result" value and is equally falsy for callers.)
    """
    try:
        return ET.parse(path, ET.XMLParser(encoding="utf-8"))
    # Narrowed from a bare `except Exception`: only I/O failures and
    # malformed-XML errors are expected here; anything else is a bug
    # and should surface.
    except (OSError, ET.XMLSyntaxError):
        return None
def get_strings_in_alto(xml: ET.ElementTree) -> list:
    """Return all <String> elements of an ALTO v4 tree, grouped by text line, in document order."""
    alto_ns = {"a": "http://www.loc.gov/standards/alto/ns-v4#"}
    return [
        string_el
        for text_line in xml.xpath("//a:TextLine", namespaces=alto_ns)
        for string_el in text_line.xpath("./a:String", namespaces=alto_ns)
    ]
def spacy_count_tokens(data):
    """Tokenize *data* with spaCy's French tokenizer and return a Counter of token texts."""
    tokenizer = French().tokenizer
    return Counter(tok.text for tok in tokenizer(data))
def snippet_from_counted_tokens(counted_tokens, dataset):
    """Print summary statistics about a token-frequency Counter.

    Args:
        counted_tokens: ``collections.Counter`` mapping token text -> count.
        dataset: Label of the dataset, used verbatim in the printed messages.
    """
    print(f"Found {sum(counted_tokens.values())} tokens in {dataset}")
    # len() of a Counter is already the number of distinct keys; the
    # original's len(set(...keys())) built a redundant set.
    print(f"Found {len(counted_tokens)} different tokens in {dataset}")
    unica = sum(1 for c in counted_tokens.values() if c == 1)
    print(f"Found {unica} unica (token appearing only once) in {dataset}")
    # Guard against an empty Counter: most_common(1)[0] would raise
    # IndexError. Also compute most_common(1) once instead of twice,
    # and fix the "occurences" typo in the message.
    if counted_tokens:
        token, count = counted_tokens.most_common(1)[0]
        print(f"The most frequent token in {dataset} is {token} with {count} occurrences")
def main(dataset):
    """Walk *dataset* for ALTO XML files, aggregate their <String> contents,
    tokenize the text with spaCy's French tokenizer, and print token statistics
    plus a table of all tokens occurring more than once.

    Args:
        dataset: Root directory to search recursively for ``.xml`` files.
    """
    print(f"Processing {dataset}")
    list_xml_files = []
    for root, _dirs, files in os.walk(dataset):
        for file_name in files:
            if file_name.endswith(".xml"):
                list_xml_files.append(os.path.join(root, file_name))
    check_paths(list_xml_files)
    print(f"[DEBUG] Found {len(list_xml_files)} XML files.")
    # Accumulate pieces in a list and join once at the end: the original
    # `data += ...` in a loop is quadratic in the total text size.
    parts = []
    for xml_path in tqdm(list_xml_files, desc="Loading XML files"):
        tree = load_alto_xml(xml_path)
        if not tree:
            # load_alto_xml returns a falsy value on read/parse failure;
            # the original passed it straight to get_strings_in_alto and
            # crashed with AttributeError. Skip the file instead.
            print(f"[WARN] Skipping unreadable/unparsable file: {xml_path}")
            continue
        for string_el in get_strings_in_alto(tree):
            # NFC-normalize so visually identical tokens are counted together.
            parts.append(unicodedata.normalize("NFC", string_el.get("CONTENT", "")).strip())
    data = " ".join(parts)
    counted_tokens = spacy_count_tokens(data)
    snippet_from_counted_tokens(counted_tokens, dataset)
    # Keep only tokens that appear more than once (drop the "unica").
    no_unicas = Counter({tok: cnt for tok, cnt in counted_tokens.items() if cnt > 1})
    df = pd.DataFrame.from_records(no_unicas.most_common(), columns=['token', 'count'])
    print(df.to_string())
main("valais-recensement/data/fr") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment