Open Library data dumps: getting CSV
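Each line of an Open Library dump is tab-separated: record type, key, revision, last-modified timestamp, and the record's JSON. A hypothetical authors-dump line (OLID, dates, and values invented for illustration) looks roughly like:

/type/author    /authors/OL1234567A    3    2011-12-31T00:00:00.000000    {"name": "Jane Example", "key": "/authors/OL1234567A", ...}

The code below leans on that layout: it splits on tabs to pull the OLID out of the key column, and slices from the first "{" to the last "}" to recover the JSON.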
package net.libcode.www.openlibrary;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.json.simple.JSONArray;
import org.json.simple.JSONObject;
import org.json.simple.parser.JSONParser;
import org.json.simple.parser.ParseException;

import com.google.common.io.Files;
/*
 * Script-style, built incrementally, ready for reorganization.
 * Parsing with JSONObject retains a lot of Strings, so run with:
 * -Xmx1024m -XX:MaxPermSize=256m
 */
public class OpenLibraryData {
    public static final JSONParser parser = new JSONParser();
    public static final int MAX_JSON = 50000; // Parse at most this many records per pass (chunking)

    // Per record type: raw dump file, search-terms file, matched raw lines, and final CSV output
    public static final File authors = new File("/home/mcooper/OL-Data/ol_dump_authors_2011-12-31.txt");
    public static final File authors_search = new File("/home/mcooper/OL-Data/authors_search.txt");
    public static final File authors_output = new File("/home/mcooper/OL-Data/authors_output.txt");
    public static final File authors_found = new File("/home/mcooper/OL-Data/authors_found.csv");

    public static final File works = new File("/home/mcooper/OL-Data/ol_dump_works_2011-12-31.txt");
    public static final File works_search = new File("/home/mcooper/OL-Data/works_search.txt");
    public static final File works_output = new File("/home/mcooper/OL-Data/works_output.txt");
    public static final File works_found = new File("/home/mcooper/OL-Data/works_found.csv");

    public static final File editions = new File("/home/mcooper/OL-Data/ol_dump_editions_2011-12-31.txt");
    public static final File editions_search = new File("/home/mcooper/OL-Data/editions_search.txt");
    public static final File editions_output = new File("/home/mcooper/OL-Data/editions_output.txt");
    public static final File editions_found = new File("/home/mcooper/OL-Data/editions_found.csv");

    // URL prefixes and column headers for each output file
    public static final String author_profile_url = "http://openlibrary.org/authors/";
    public static final String author_cover_url = "http://covers.openlibrary.org/a/olid/";
    public static final String author_csv_hdr = "ID\tNAME\tPROFILE\tIMAGE\tBIRTH\tDEATH\tLINKS";
    public static final String works_profile_url = "http://openlibrary.org/works/";
    public static final String works_cover_url = "http://covers.openlibrary.org/w/id/";
    public static final String works_csv_hdr = "ID\tTITLE\tPROFILE\tIMAGE\tAUTHORS\tPUBLISHED\tDESCRIPTION";
    public static final String editions_profile_url = "http://openlibrary.org/books/";
    public static final String editions_cover_url = "http://covers.openlibrary.org/b/id/";
    public static final String editions_csv_hdr = "ID\tTITLE\tWORKS\tAUTHORS\tOCLC\tLCCN\tISBN10\tISBN13\tPLACES\tPUBLISHERS\tDATE\tFORMAT\tPAGES\tPAGINATION\tDIMENSIONS\tSUBJECTS\tSUBJECT PLACES\tSERIES\tCOVERS\tLIBRARYTHING\tGOODREADS";

    public static final String OLID_DELIM = "\t"; // Dump columns are tab-separated
    public static final String OLID_IDENT = "O";  // The OLID begins at the first 'O' of the key column
    public static final String JSON_S = "{";
    public static final String JSON_E = "}";
    public static final String SMALL_JPG = "-S.jpg";
    public static final String MED_JPG = "-M.jpg";
    public static final String CSV_FIELD_DELIM = "\t";
    public static final String CSV_CELL_DELIM = "|";
    public static final String NL = String.format("%n");
    public static void main(String[] args) throws IOException, ParseException {
        // Stage 1: search the authors dump, then convert matches to CSV
        clearFile(authors_found);
        Set<String> searchTerms = readSearchTermsFile(authors_search, 10);
        long startTime = System.currentTimeMillis();
        searchData(authors, authors_output, searchTerms, false, true);
        searchTerms = null;
        Map<String, JSONObject> authorsData = parseData(authors_output, 0, true);
        List<String> authors_csv = new ArrayList<String>(authorsData.size() + 1);
        authors_csv.add(author_csv_hdr);
        System.out.println();
        System.out.println("AUTHORS FOUND #" + authorsData.size());
        System.out.println();
        authors_csv.addAll(getAuthorsCsv(authorsData, CSV_FIELD_DELIM, CSV_CELL_DELIM, true));
        long endTime = System.currentTimeMillis();
        System.out.println("Total execution time: " + (millisToSeconds(endTime - startTime)) + "s");
        System.out.println();
        appendToFile(authors_found, authors_csv, Charset.defaultCharset());
        authors_csv = null;

        // Stage 2: search the works dump file by author OLID
        clearFile(works_found);
        searchTerms = readSearchTermsFile(works_search, 10);
        startTime = System.currentTimeMillis();
        searchData(works, works_output, searchTerms, false, false);
        searchTerms = null;
        Map<String, JSONObject> worksData = parseData(works_output, 0, false);
        List<String> works_csv = new ArrayList<String>(worksData.size() + 1);
        works_csv.add(works_csv_hdr);
        System.out.println();
        System.out.println("WORKS FOUND #" + worksData.size());
        System.out.println();
        works_csv.addAll(getWorksCsv(worksData, CSV_FIELD_DELIM, CSV_CELL_DELIM, false));
        endTime = System.currentTimeMillis();
        System.out.println("Total execution time: " + (millisToSeconds(endTime - startTime)) + "s");
        System.out.println();
        appendToFile(works_found, works_csv, Charset.defaultCharset());
        works_csv = null;

        // Stage 3: search the editions dump ... this one is huge (~25gb), so parse it in chunks
        clearFile(editions_found);
        // searchTerms = readSearchTermsFile(editions_search, 100000);
        // searchDataByOLID(editions, editions_output, searchTerms, true);
        // searchTerms = null;
        startTime = System.currentTimeMillis();
        Map<String, JSONObject> editionsData = parseData(editions_output, 0, true); // First chunk: lines 1-50,000
        List<String> editions_csv = new ArrayList<String>(editionsData.size() + 1);
        editions_csv.add(editions_csv_hdr);
        System.out.println();
        System.out.println("EDITIONS FOUND #" + editionsData.size());
        System.out.println();
        editions_csv.addAll(getEditionsCsv(editionsData, CSV_FIELD_DELIM, CSV_CELL_DELIM, false));
        editionsData = null;
        endTime = System.currentTimeMillis();
        System.out.println("Total execution time: " + (millisToSeconds(endTime - startTime)) + "s");
        System.out.println();
        appendToFile(editions_found, editions_csv, Charset.defaultCharset());
        editions_csv = null;

        // Second chunk: lines 50,001 onward (MAX_JSON records per pass)
        startTime = System.currentTimeMillis();
        editionsData = parseData(editions_output, 50001, true);
        editions_csv = new ArrayList<String>(editionsData.size() + 1);
        editions_csv.add(editions_csv_hdr);
        System.out.println();
        System.out.println("EDITIONS FOUND #" + editionsData.size());
        System.out.println();
        editions_csv.addAll(getEditionsCsv(editionsData, CSV_FIELD_DELIM, CSV_CELL_DELIM, false));
        editionsData = null;
        endTime = System.currentTimeMillis();
        System.out.println("Total execution time: " + (millisToSeconds(endTime - startTime)) + "s");
        System.out.println();
        appendToFile(editions_found, editions_csv, Charset.defaultCharset());
        editions_csv = null;
    }
    // Truncate a file to zero length
    public static void clearFile(File file) throws IOException {
        Files.write(new byte[]{}, file);
    }

    public static void appendToFile(File file, String data, Charset charset) throws IOException {
        Files.append(data + NL, file, charset);
    }

    public static void appendToFile(File file, List<String> data, Charset charset) throws IOException {
        for(String x : data) {
            Files.append(x + NL, file, charset);
        }
    }

    // Read search terms, one per line, into a Set for constant-time membership tests
    public static Set<String> readSearchTermsFile(File file, int capacity) throws IOException {
        Set<String> lines = new HashSet<String>(capacity);
        BufferedReader reader = Files.newReader(file, Charset.defaultCharset());
        String line;
        while((line = reader.readLine()) != null) {
            lines.add(line);
        }
        reader.close();
        return lines;
    }
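    // A sketch of a search-terms file (IDs invented for illustration): one term per line, e.g.
    //   OL1234567A
    //   OL7654321A
    // Any substring works for searchData() below; searchDataByOLID() expects exact OLIDs.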
    // Brute-force search: every dump line is checked against every term until one matches
    public static int searchData(File file, File output_file, Set<String> search, boolean unique, boolean print) throws IOException, ParseException {
        clearFile(output_file);
        BufferedReader reader = Files.newReader(file, Charset.defaultCharset());
        String line;
        String term;
        int count = 0;
        int found = 0;
        while((line = reader.readLine()) != null) {
            count += 1;
            for (Iterator<String> i = search.iterator(); i.hasNext();) {
                term = i.next();
                if(line.contains(term)) {
                    found += 1;
                    appendToFile(output_file, line, Charset.defaultCharset());
                    if(unique) i.remove(); // Stop looking for a term after its first hit
                    if(print) System.out.println(count + ": " + line);
                    break;
                }
            }
        }
        reader.close();
        return found;
    }
    /*
     * Search by OLID. Fast (6 minutes for 100,000 OLID edition searches on the 25gb editions dump)
     */
    public static int searchDataByOLID(File file, File output_file, Set<String> search, boolean print) throws IOException {
        clearFile(output_file);
        BufferedReader reader = Files.newReader(file, Charset.defaultCharset());
        String line;
        String[] parts;
        String id;
        int count = 0;
        int found = 0;
        while((line = reader.readLine()) != null) {
            count += 1;
            parts = line.split(OLID_DELIM);
            int idStart = parts[1].indexOf(OLID_IDENT); // parts[1] is the record key; the OLID starts at its first 'O'
            id = parts[1].substring(idStart, parts[1].length());
            if(search.contains(id)) {
                found += 1;
                appendToFile(output_file, line, Charset.defaultCharset());
                if(print) System.out.println(count + ": " + line);
            }
        }
        reader.close();
        return found;
    }
    /*
     * Return a Map of OLID => JSON data, parsing up to MAX_JSON records starting at line `begin`
     */
    public static Map<String, JSONObject> parseData(File file, int begin, boolean print) throws IOException, ParseException {
        Map<String, JSONObject> data = new HashMap<String, JSONObject>();
        BufferedReader reader = Files.newReader(file, Charset.defaultCharset());
        String line;
        String[] parts;
        String id;
        String json;
        int lineNumber = 0;
        int count = 0;
        while((line = reader.readLine()) != null && count < MAX_JSON) {
            lineNumber += 1;
            if(lineNumber >= begin) {
                count += 1;
                // Get the OLID
                parts = line.split(OLID_DELIM);
                int idStart = parts[1].indexOf(OLID_IDENT);
                id = parts[1].substring(idStart, parts[1].length());
                // Get the JSON: everything from the first '{' to the last '}'
                int jsonStart = line.indexOf(JSON_S);
                int jsonStop = line.lastIndexOf(JSON_E);
                json = line.substring(jsonStart, jsonStop + 1);
                data.put(id, (JSONObject) parser.parse(json));
                if(print) System.out.println(count + ": " + line);
            }
        }
        reader.close();
        return data;
    }
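    // Given the hypothetical dump line sketched above, parseData() yields an entry like
    //   "OL1234567A" => {"name": "Jane Example", ...}
    // keyed by OLID so the CSV builders below can reconstruct profile and cover URLs.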
    public static List<String> getAuthorsCsv(Map<String, JSONObject> authors, String fieldDelimiter, String cellDelimiter, boolean print) {
        List<String> data = new ArrayList<String>(authors.size());
        for(Map.Entry<String, JSONObject> entry : authors.entrySet()) {
            List<String> field = new ArrayList<String>();
            field.add(entry.getKey());
            JSONObject json = entry.getValue();
            field.add(getJsonString(json, "name"));
            field.add(author_profile_url + field.get(0));
            field.add(author_cover_url + field.get(0) + MED_JPG);
            field.add(getJsonString(json, "birth_date"));
            field.add(getJsonString(json, "death_date"));
            field.add(getJsonHashString(json, "links", "url", cellDelimiter, false));
            data.add(join(field, fieldDelimiter));
            if(print) System.out.println(join(field, fieldDelimiter));
        }
        return data;
    }
    public static List<String> getWorksCsv(Map<String, JSONObject> works, String fieldDelimiter, String cellDelimiter, boolean print) {
        List<String> data = new ArrayList<String>(works.size());
        String title;
        String subtitle;
        String desc;
        for(Map.Entry<String, JSONObject> entry : works.entrySet()) {
            List<String> field = new ArrayList<String>();
            field.add(entry.getKey());
            JSONObject json = entry.getValue();
            title = getJsonString(json, "title");
            subtitle = getJsonString(json, "subtitle");
            if(! subtitle.isEmpty()) title = title + " - " + subtitle;
            field.add(title);
            field.add(works_profile_url + field.get(0));
            field.add(works_cover_url + field.get(0) + MED_JPG);
            field.add(getJsonHashString(json, "authors", "author", "key", cellDelimiter, true));
            field.add(getJsonString(json, "first_publish_date"));
            // "description" may be a bare string or an object with a "value" field
            try {
                JSONObject description = (JSONObject) json.get("description");
                desc = getJsonString(description, "value");
            } catch (ClassCastException e) {
                desc = getJsonString(json, "description");
            }
            field.add(removeNewLines(desc));
            data.add(join(field, fieldDelimiter));
            if(print) System.out.println(join(field, fieldDelimiter));
        }
        return data;
    }
    public static List<String> getEditionsCsv(Map<String, JSONObject> editions, String fieldDelimiter, String cellDelimiter, boolean print) {
        List<String> data = new ArrayList<String>(editions.size());
        String title;
        String subtitle;
        String by;
        for(Map.Entry<String, JSONObject> entry : editions.entrySet()) {
            List<String> field = new ArrayList<String>();
            field.add(entry.getKey());
            JSONObject json = entry.getValue();
            title = getJsonString(json, "title");
            subtitle = getJsonString(json, "subtitle");
            by = getJsonString(json, "by_statement");
            if(! subtitle.isEmpty()) title = title + " - " + subtitle;
            if(! by.isEmpty()) title = title + " - " + by;
            field.add(title);
            field.add(getJsonHashString(json, "works", "key", cellDelimiter, true));
            field.add(getJsonHashString(json, "authors", "key", cellDelimiter, true));
            field.add(getJsonArrayString(json, "oclc_numbers", cellDelimiter));
            field.add(getJsonArrayString(json, "lccn", cellDelimiter));
            field.add(getJsonArrayString(json, "isbn_10", cellDelimiter));
            field.add(getJsonArrayString(json, "isbn_13", cellDelimiter));
            field.add(getJsonArrayString(json, "publish_places", cellDelimiter));
            field.add(getJsonArrayString(json, "publishers", cellDelimiter));
            field.add(getJsonString(json, "publish_date"));
            field.add(getJsonString(json, "physical_format")); // underscored key, per the dump JSON
            field.add(getJsonString(json, "number_of_pages"));
            field.add(getJsonString(json, "pagination"));
            field.add(getJsonString(json, "physical_dimensions"));
            field.add(getJsonArrayString(json, "subjects", cellDelimiter));
            field.add(getJsonArrayString(json, "subject_places", cellDelimiter));
            field.add(getJsonArrayString(json, "series", cellDelimiter));
            field.add(getJsonArrayString(json, "covers", cellDelimiter));
            field.add(getJsonHashArrayString(json, "identifiers", "librarything", cellDelimiter));
            field.add(getJsonHashArrayString(json, "identifiers", "goodreads", cellDelimiter));
            data.add(join(field, fieldDelimiter));
            if(print) System.out.println(join(field, fieldDelimiter));
        }
        return data;
    }
    // Fetch a value as a String; non-String values (e.g. number_of_pages is a Long) are stringified
    public static String getJsonString(JSONObject json, String key) {
        String value = "";
        if(json != null) {
            String result = null;
            try {
                result = (String) json.get(key);
            } catch (ClassCastException e) {
                result = String.valueOf(json.get(key));
            }
            if(result != null) value = result;
        }
        return value;
    }
    public static String getJsonHashArrayString(JSONObject json, String json_obj, String key, String delimiter) {
        String value = "";
        if(json != null) {
            JSONObject a = (JSONObject) json.get(json_obj);
            if(a != null) value = getJsonArrayString(a, key, delimiter);
        }
        return value;
    }

    public static String getJsonArrayString(JSONObject json, String key, String delimiter) {
        String value = "";
        if(json != null) {
            JSONArray result = (JSONArray) json.get(key);
            if(result != null) value = join(result, delimiter);
        }
        return value;
    }
    public static String getJsonHashString(JSONObject json, String key, String sub_key, String delimiter, boolean olid) {
        String value = "";
        JSONArray result = (JSONArray) json.get(key);
        if(result != null) {
            String element;
            for(Object x : result) {
                JSONObject a = (JSONObject) x;
                element = getJsonString(a, sub_key);
                if(olid) {
                    // Strip the key prefix down to the bare OLID
                    int idStart = element.indexOf(OLID_IDENT);
                    element = element.substring(idStart, element.length());
                }
                value = value + element + delimiter;
            }
        }
        // Trim the trailing delimiter
        if(! value.isEmpty()) value = value.substring(0, value.length() - delimiter.length());
        return value;
    }

    public static String getJsonHashString(JSONObject json, String key, String sub_key, String sub_sub_key, String delimiter, boolean olid) {
        String value = "";
        JSONArray result = (JSONArray) json.get(key);
        if(result != null) {
            String element;
            for(Object x : result) {
                JSONObject a = (JSONObject) x;
                JSONObject b = (JSONObject) a.get(sub_key);
                element = getJsonString(b, sub_sub_key);
                if(olid) {
                    int idStart = element.indexOf(OLID_IDENT);
                    element = element.substring(idStart, element.length());
                }
                value = value + element + delimiter;
            }
        }
        if(! value.isEmpty()) value = value.substring(0, value.length() - delimiter.length());
        return value;
    }
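    // Example (invented data): for works JSON like
    //   "authors": [{"author": {"key": "/authors/OL1234567A"}}, {"author": {"key": "/authors/OL7654321A"}}]
    // getJsonHashString(json, "authors", "author", "key", "|", true) returns "OL1234567A|OL7654321A".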
    public static String join(List<?> list, String delimiter) {
        StringBuilder l = new StringBuilder();
        for(Object x : list) {
            if(l.length() != 0) l.append(delimiter);
            l.append(x.toString());
        }
        return l.toString();
    }

    public static String removeNewLines(String text) {
        return text.replaceAll("\\r\\n|\\r|\\n", " ");
    }

    public static long millisToSeconds(long milli) {
        return milli / 1000;
    }
}
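To run this (a sketch, assuming json-simple and Guava are on the classpath and the hard-coded dump paths above are adjusted to your machine): compile the class, then launch it with the heap settings from the header comment, e.g. java -Xmx1024m -XX:MaxPermSize=256m net.libcode.www.openlibrary.OpenLibraryData. Note that each *_found.csv is actually tab-delimited, with multi-valued cells joined by "|".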