This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Humans with the most non-deprecated OpenLibrary IDs (merge candidates) | |
SELECT ?item (COUNT(?olid) AS ?olidC) | |
{ | |
VALUES (?ranks) { ( wikibase:PreferredRank ) ( wikibase:NormalRank ) } | |
?item p:P648 [ps:P648 ?olid; | |
wikibase:rank ?ranks; | |
] ; | |
wdt:P31 wd:Q5. | |
# SERVICE wikibase:label { bd:serviceParam wikibase:language "en". } | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.util.Collections; | |
import java.util.EnumSet; | |
import java.util.IntSummaryStatistics; | |
import java.util.Set; | |
import java.util.function.BiConsumer; | |
import java.util.function.BinaryOperator; | |
import java.util.function.Function; | |
import java.util.function.Supplier; | |
import java.util.function.ToIntFunction; | |
import java.util.stream.Collector; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
""" | |
common-crawl-cdx.py | |
A simple example program to analyze the Common Crawl index. | |
This is implemented as a single stream job which accesses S3 via HTTP, | |
so that it can easily be run from any laptop, but it could easily be | |
converted to an EMR job which processed the 300 index files in parallel. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import shutil | |
import urllib2 | |
import platform | |
import tempfile | |
import urllib | |
import os | |
import subprocess | |
import webbrowser | |
import stat |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Scrape BBC Desert Island Discs data including songs, books, and luxury item, if available, for the celebrity "castaways" | |
# based on original work by Francis Irving with the following changes by Tom Morris July 2012: | |
# - updated to current BBC page format | |
# - switched from BeautifulSoup to lxml | |
# - updated deprecated database calls | |
# - restructured to run as a single integrated process and not rescrape data it already extracted | |
import scraperwiki | |
import scraperwiki.apiwrapper | |
import lxml.html |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?xml version='1.0' encoding='utf-8'?> | |
<xsl:stylesheet version='1.0' xmlns:xsl='http://www.w3.org/1999/XSL/Transform'> | |
<!-- | |
Author: Rod Page | |
Source: http://iphylo.blogspot.com/2011/07/correcting-ocr-using-hocr-firefox.html#comment-400434491 | |
--> | |
<xsl:output method='html' version='1.0' encoding='utf-8' indent='yes'/> | |
<xsl:variable name="scale" select="800 div //page/@width" /> |