Created
August 23, 2017 07:55
-
-
Save iimog/a6a36a7b03906f18ac490b0a4708224c to your computer and use it in GitHub Desktop.
A Python script (using Beautiful Soup) that extracts bee traits from the HTML pages at http://scales.ckff.si/scaletool/?menu=6&submenu=3 and saves them as FENNEC-compatible TSV files.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8
import glob
import os

from bs4 import BeautifulSoup
# Tables filled while the pages are parsed; every one is keyed by trait type.
trait_types = {}               # trait type -> tooltip definition ("" if none)
trait_values_numeric = {}      # trait type -> list of 5-field records
trait_values_categorical = {}  # trait type -> list of 5-field records
general_citation = "Budrys, E., Budriene., A. and Orlovskyte. S. 2014. Cavity-nesting wasps and bees database."

# Characters that would break the one-record-per-line, tab-separated layout.
_TSV_UNSAFE = str.maketrans({"\t": " ", "\r": " ", "\n": " "})


def _node_text(node):
    """Return a parse-tree node as a plain ``str``.

    Text nodes are copied with ``str()`` so the stored value no longer keeps
    the whole parse tree alive; tags are flattened with ``get_text()``.
    """
    return str(node) if isinstance(node, str) else node.get_text()


def _tooltip_text(cell):
    """Return the tooltip text attached to the second child of *cell*.

    The text is whatever sits between the first pair of single quotes of that
    child's ``onmouseover`` attribute. ``""`` is returned when the cell has no
    second child, the child is a text node, the attribute is missing, or the
    attribute holds no quoted text.
    """
    children = cell.contents
    if len(children) < 2:
        return ""
    hint = children[1]
    # Text nodes have no attributes (and no has_attr method).
    has_attr = getattr(hint, "has_attr", None)
    if has_attr is None or not has_attr("onmouseover"):
        return ""
    pieces = hint["onmouseover"].split("'")
    return pieces[1] if len(pieces) > 1 else ""


def parse_trait_page(soup, origin_url):
    """Add the trait rows of one parsed page to the module-level tables.

    Args:
        soup: parsed page; it must contain an element with id ``content_div``
            holding an ``h2`` heading and a ``table``.
        origin_url: URL stored as the last field of every record.

    Rows with fewer than three ``td`` cells, or with an empty first cell, are
    skipped. A categorical or numeric record is only stored when the
    corresponding cell has content.
    """
    content_div = soup.find(id="content_div")
    classification = content_div.h2.contents
    # Third " / "-separated part of the heading's second child, followed by
    # the text of its first child.
    scientific_name = classification[1].split(" / ")[2] + " " + classification[0].string
    reference = "None"
    bold = content_div.b
    if bold is not None:
        # .string is None when <b> wraps nested markup; fall back to its text.
        reference = bold.string if bold.string is not None else bold.get_text()
    reference = "Source:" + general_citation + ";Reference:" + reference

    for row in content_div.table.find_all('tr'):
        cells = row.find_all('td')
        # Three cells are read below (name, numeric value, categorical value).
        if len(cells) < 3:
            continue
        name_cell, numeric_cell, categorical_cell = cells[0], cells[1], cells[2]
        if not name_cell.contents:
            continue

        trait_type = _node_text(name_cell.contents[0])
        trait_types[trait_type] = _tooltip_text(name_cell)

        if categorical_cell.contents:
            categorical_value = _node_text(categorical_cell.contents[0])
            trait_values_categorical.setdefault(trait_type, []).append(
                [scientific_name, categorical_value,
                 _tooltip_text(categorical_cell), reference, origin_url])

        numeric_value = ""
        if numeric_cell.contents:
            numeric_value = _node_text(numeric_cell.contents[0])
        if numeric_value != "":
            trait_values_numeric.setdefault(trait_type, []).append(
                [scientific_name, numeric_value, '', reference, origin_url])


def tsv_line(fields):
    """Join *fields* into one tab-separated, newline-terminated line.

    Tabs, carriage returns and line feeds inside a field are each replaced by
    a single space so that one record always occupies exactly one line.
    """
    return "\t".join(str(field).translate(_TSV_UNSAFE) for field in fields) + "\n"


def _table_path(out_dir, trait_type, suffix):
    """Return the output path for a per-trait table.

    Path separators in the trait name are replaced by ``_`` so the name cannot
    point into another directory.
    """
    name = str(trait_type)
    for separator in (os.sep, os.altsep):
        if separator:
            name = name.replace(separator, "_")
    return os.path.join(out_dir, name + suffix)


def write_tsv_files(types, numeric_values, categorical_values, out_dir="."):
    """Write the collected tables as UTF-8 TSV files into *out_dir*.

    Args:
        types: mapping of trait type to definition, written to
            ``trait_types.tsv`` (one ``type<TAB>definition`` line each).
        numeric_values: mapping of trait type to records, written to
            ``<trait type>_numeric.tsv``.
        categorical_values: mapping of trait type to records, written to
            ``<trait type>_categorical.tsv``.
        out_dir: target directory; defaults to the current directory.
    """
    with open(os.path.join(out_dir, "trait_types.tsv"), "w", encoding="utf-8") as f:
        for trait_type, definition in types.items():
            f.write(tsv_line([trait_type, definition]))
    for table, suffix in ((numeric_values, "_numeric.tsv"),
                          (categorical_values, "_categorical.tsv")):
        for trait_type, records in table.items():
            path = _table_path(out_dir, trait_type, suffix)
            with open(path, "w", encoding="utf-8") as f:
                f.writelines(tsv_line(record) for record in records)


def main():
    """Parse every ``*.html`` file in the current directory and write the TSVs."""
    # Sorted so the record order in the output is reproducible between runs.
    for file_name in sorted(glob.glob('*.html')):
        # Everything before the first dot of the file name is used as the sid.
        sid = file_name[0:file_name.find('.')]
        origin_url = 'http://scales.ckff.si/scaletool/index.php?menu=6&submenu=3&sid=' + sid
        # Binary mode: BeautifulSoup detects the page encoding itself instead
        # of depending on the platform's default text encoding.
        with open(file_name, "rb") as content:
            soup = BeautifulSoup(content, 'html.parser')
        parse_trait_page(soup, origin_url)
    write_tsv_files(trait_types, trait_values_numeric, trait_values_categorical)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment