Created
July 27, 2022 02:26
-
-
Save adigitoleo/856ca22fe9132ede3470b70a73ac9d74 to your computer and use it in GitHub Desktop.
Script to parse IMA mineral list into CSV file
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv
import re

import pdfplumber
# Create a list of minerals as classified by the IMA and save it as CSV.
# Get PDF from http://cnmnc.main.jp/
PDF_PATH = "IMA_MineralList_202207.pdf"
CSV_PATH = "IMA_MineralList_202207.csv"

# First two pages are header stuff.
HEADER_PAGES = 2

# Need larger tolerance to pick up sub-/superscripts.
TABLE_SETTINGS = {"text_y_tolerance": 6}

# Regexp to fix chemical formulae in a single pass: an ion charge such as
# "2+" is captured as ``charge``; any other digit run that directly follows a
# non-digit character is captured as ``count``.  Matching both alternatives in
# one pattern keeps the digits of a multi-digit charge ("10+") from being
# picked up a second time as a count.
FORMULA_TOKEN = re.compile(r"(?P<charge>\d+[+-])|(?<=[^\d{])(?P<count>\d+)")


def clean_cell(cell):
    """Return the text of a table cell as one stripped line.

    Line breaks inside the cell are removed and surrounding whitespace is
    stripped.  ``None`` (how pdfplumber is expected to report an empty
    cell) is returned as the empty string.
    """
    if cell is None:
        return ""
    return cell.replace("\n", "").strip()


def format_formula(formula):
    """Mark up ion charges as ``^{...}`` and atom counts as ``_{...}``.

    For example ``"Fe2+Fe3+2O4"`` becomes ``"Fe^{2+}Fe^{3+}_{2}O_{4}"``.
    A digit run at the very start of the string is left untouched.
    """

    def mark_up(match):
        charge = match.group("charge")
        if charge is not None:
            return "^{" + charge + "}"
        return "_{" + match.group("count") + "}"

    # NOTE(review): the "·" separator is dropped after the markup, exactly as
    # before, so a hydration coefficient ("·2H2O") ends up as a subscript.
    # Confirm that this is the intended output.
    return FORMULA_TOKEN.sub(mark_up, formula).replace("·", "")


def collect_minerals(tables):
    """Merge the per-page tables into one header and a list of data rows.

    Args:
        tables: iterable with one entry per page, each either a table (a
            list of rows, every row a list of cell strings or ``None``) or
            ``None`` for a page on which no table was found.  The first row
            of the first table holds the column names.

    Returns:
        A ``(header, rows)`` tuple: ``header`` is the list of cleaned column
        names and ``rows`` is a list of lists of cleaned cell texts, one per
        mineral, with cells of "formula" columns marked up.

    Raises:
        ValueError: if there is no first table to take column names from.
    """
    tables = list(tables)
    if not tables or not tables[0]:
        raise ValueError("no table with column names found on the first page")

    header = [clean_cell(name) for name in tables[0][0]]
    formula_columns = ["formula" in name for name in header]

    rows = []
    # Only the first table carries the column names; skip that row there.
    for table in [tables[0][1:]] + tables[1:]:
        if table is None:
            continue
        for row in table:
            converted = []
            for position, is_formula in enumerate(formula_columns):
                # Pad short rows so every CSV row has one cell per column.
                raw = row[position] if position < len(row) else None
                text = clean_cell(raw)
                converted.append(format_formula(text) if is_formula else text)
            rows.append(converted)
    return header, rows


def write_csv(path, header, rows):
    """Write *header* followed by *rows* to the CSV file at *path*."""
    # newline="" leaves line endings to the csv module, as its documentation
    # requires; without it rows are separated by blank lines on Windows.
    with open(path, mode="w", encoding="utf8", newline="") as file:
        writer = csv.writer(file)
        writer.writerow(header)
        writer.writerows(rows)


def main():
    """Extract the mineral table from the IMA PDF and write it as CSV."""
    # The context manager closes the PDF once all tables are extracted.
    with pdfplumber.open(PDF_PATH) as pdf:
        tables = [
            page.extract_table(table_settings=TABLE_SETTINGS)
            for page in pdf.pages[HEADER_PAGES:]
        ]
    header, rows = collect_minerals(tables)
    write_csv(CSV_PATH, header, rows)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment