Created
June 23, 2011 23:25
-
-
Save hpiwowar/1043882 to your computer and use it in GitHub Desktop.
For stratified sampling of bibtex records
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# Initially written by Heather Piwowar, June 2011
# Public domain: have at it!
# For stratified sampling of bibtex records
import random | |
import math | |
import re | |
from collections import defaultdict | |
from pybtex.database import BibliographyData | |
from pybtex.database.input import bibtex as bibtex_in | |
from pybtex.database.output import bibtex as bibtex_out | |
from operator import itemgetter, attrgetter | |
import pprint | |
import sys | |
def read_bib(filename):
    """Parse the BibTeX file at ``filename`` and return the parser's result.

    Args:
        filename: Path of the ``.bib`` file to read.

    Returns:
        Whatever ``bibtex_in.Parser().parse_file`` returns for that file
        (the rest of this script reads its ``entries`` mapping).
    """
    return bibtex_in.Parser().parse_file(filename)
def meets_inclusion_requirements(bib_data):
    """Tell whether a single bibliography entry is eligible for sampling.

    An entry qualifies when its ``keywords`` field contains ``"Article;"`` or
    ``"Proceedings Paper;"`` and its ``language`` field contains ``"English"``.

    Args:
        bib_data: One entry object exposing a ``fields`` mapping (despite the
            name, this is an entry, not a whole bibliography).

    Returns:
        ``True`` when both conditions hold; ``False`` otherwise, including
        when a required field is missing.
    """
    try:
        fields = bib_data.fields
        keywords = fields["keywords"]
        # Guard clause: wrong document type means the language is never consulted.
        if "Article;" not in keywords and "Proceedings Paper;" not in keywords:
            return False
        return "English" in fields["language"]
    except KeyError:
        # A missing keywords/language field simply makes the entry ineligible.
        return False
def shuffle_entries(bib_keys):
    """Return the given keys in a reproducible pseudo-random order.

    Args:
        bib_keys: Iterable of entry keys. A ``list`` is shuffled in place and
            returned as the same object. Any other iterable (for example a
            ``dict.keys()`` view, which ``random.shuffle`` cannot reorder) is
            copied into a new list first.

    Returns:
        A list containing the same keys in shuffled order.
    """
    if not isinstance(bib_keys, list):
        bib_keys = list(bib_keys)
    # Fixed seed so that repeated runs draw the same sample.
    # Note that this reseeds the module-level random generator.
    random.seed(42)
    random.shuffle(bib_keys)
    return bib_keys
def get_group(num_citations, group_boundary_pairs):
    """Find the half-open range that contains ``num_citations``.

    Args:
        num_citations: The value to classify.
        group_boundary_pairs: Iterable of ``(low, high)`` pairs, scanned in
            order. Pass a list or tuple when calling this repeatedly: a
            one-shot iterator is consumed by the scan.

    Returns:
        The first pair (the very object from ``group_boundary_pairs``) for
        which ``low <= num_citations < high``, or ``None`` when no pair
        matches.
    """
    for pair in group_boundary_pairs:
        low, high = pair
        if low <= num_citations < high:
            return pair
    return None
def filter_and_group(bib_data_orig):
    """Shuffle the entries, keep the eligible ones, and bucket them by citation range.

    Args:
        bib_data_orig: Parsed bibliography exposing an ``entries`` mapping of
            citation key -> entry.

    Returns:
        A ``defaultdict(list)`` keyed by the ``(low, high)`` pair (taken from
        the module-level ``group_boundary_pairs``) that contains the entry's
        ``number_total_citations_to_dataset`` value. Each bucket lists its
        entries in shuffled order. Eligible entries whose count falls outside
        every range are collected under the key ``None``.

    Raises:
        ValueError: If an eligible entry's citation count is not an integer
            string. A missing ``number_total_citations_to_dataset`` field
            propagates whatever the ``fields`` lookup raises.
    """
    grouped = defaultdict(list)
    # Materialise the keys: on Python 3 ``keys()`` is a view, and
    # random.shuffle (used by shuffle_entries) needs a mutable sequence.
    shuffled_keys = shuffle_entries(list(bib_data_orig.entries.keys()))
    for entry_key in shuffled_keys:
        entry = bib_data_orig.entries[entry_key]
        if not meets_inclusion_requirements(entry):
            continue
        num_citations = int(entry.fields["number_total_citations_to_dataset"])
        group = get_group(num_citations, group_boundary_pairs)
        grouped[group].append(entry)
    return grouped
def sample_bib(bib_data_orig):
    """Build a stratified sample, ordered round-robin across citation groups.

    Eligible entries are grouped by ``filter_and_group``. Round ``k`` then
    takes the ``k``-th (0-based) entry of every group that still has one, so
    any prefix of the returned key list draws as evenly as possible from all
    groups.

    Each sampled entry is modified in place (the same entry objects remain
    in ``bib_data_orig``):
      * its title is prefixed with ``^^<id>^^``;
      * ``"; citation group <low>-<high>"`` is appended to its
        ``mendeley-tags`` field, which is created empty when absent.

    Args:
        bib_data_orig: Parsed bibliography exposing an ``entries`` mapping.

    Returns:
        A tuple ``(bib_data_sampled, key_list)``. ``bib_data_sampled`` is a
        ``BibliographyData`` holding every sampled entry under its new id
        ``"<10000 + k>_<low>-<high>"``. ``key_list`` holds those ids in
        sampling order. Both are empty when nothing is eligible.
    """
    grouped = filter_and_group(bib_data_orig)
    # Entries outside every citation range are filed under None; skip that bucket.
    strata = [(group, entries) for group, entries in grouped.items() if group]
    # Guard the empty case: max() of an empty sequence raises ValueError.
    rounds = max(len(entries) for _, entries in strata) if strata else 0

    bib_data_sampled = BibliographyData()
    key_list = []
    for index in range(rounds):
        for group, entries in strata:
            # Use the round number itself as the index for every stratum.
            # Advancing it per stratum would skip each stratum's leading
            # entries and starve the small strata.
            if index >= len(entries):
                continue
            entry = entries[index]
            group_string = "{0}-{1}".format(group[0], group[1])
            id_string = "{0}_{1}".format(10000 + index, group_string)
            key_list.append(id_string)
            entry.fields["title"] = "^^" + id_string + "^^" + entry.fields["title"]
            if "mendeley-tags" not in entry.fields:
                entry.fields["mendeley-tags"] = ""
            entry.fields["mendeley-tags"] += "; citation group " + group_string
            bib_data_sampled.add_entry(id_string, entry)
    return (bib_data_sampled, key_list)
def run_sample(mydir, mymax, out_filename):
    """Sample ``<mydir>_annotated.bib`` and write the first ``mymax`` picks to a file.

    Args:
        mydir: Prefix of the input file; ``mydir + "_annotated.bib"`` is read.
        mymax: Maximum number of sampled entries to write.
        out_filename: Path of the BibTeX file to create or overwrite.

    Returns:
        The list of sampled ids that were written, in sampling order.
    """
    bib_data = read_bib(mydir + "_annotated.bib")
    bib_data_sampled, key_list = sample_bib(bib_data)
    subset_keys = key_list[:mymax]
    writer = bibtex_out.Writer()
    # The context manager closes the file even if writing an entry raises.
    with open(out_filename, "w") as stream:
        for mykey in subset_keys:
            # Each entry is written through its own single-entry bibliography,
            # in the order given by subset_keys.
            this_bib = BibliographyData()
            this_bib.add_entry(mykey, bib_data_sampled.entries[mykey])
            writer.write_stream(this_bib, stream)
    return subset_keys
# Citation-count strata: half-open ranges [low, high) whose edges are
# 0, 1, 3, 10, 30, ..., 10000, 30000.
base = [10 ** i for i in range(0, 5)]
base_list = sorted([0] + base + [3 * i for i in base])
# Materialise as a list: get_group() rescans the pairs for every entry, and a
# bare zip() on Python 3 is a one-shot iterator that would be exhausted after
# the first scan.
group_boundary_pairs = list(zip(base_list[:-1], base_list[1:]))

# Maximum number of sampled records to write.
MAX = 500

# Dataset prefix; the input file is "<mydir>_annotated.bib".
#mydir = "Pangaea"
#mydir = "GEOROC"
#mydir = "GEO"
mydir = "TreeBase"
#subset_keys = run_sample(mydir, MAX, mydir + "_sampled.bib")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment