@dirkgr
dirkgr / map_per_process.py
Last active October 19, 2018 23:43
A map function that runs every iteration in a separate process, in parallel
from typing import *
import multiprocessing as mp
import multiprocessing.connection
def map_per_process(fn, input_sequence: Iterable) -> Iterable:
    pipeno_to_pipe: Dict[int, multiprocessing.connection.Connection] = {}
    pipeno_to_process: Dict[int, mp.Process] = {}

    def process_one_item(send_pipe: multiprocessing.connection.Connection, item):
        try:
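
A minimal sketch of the same idea, not the gist's full implementation: one worker process per input item, each sending its result back over its own Pipe. The name map_per_process_sketch and the eager spawning are illustrative choices, and it assumes the default 'fork' start method, so fn does not have to be picklable.

import multiprocessing as mp
from typing import Callable, Iterable, Iterator

def map_per_process_sketch(fn: Callable, input_sequence: Iterable) -> Iterator:
    def process_one_item(send_pipe, item):
        # Run fn in the child process and ship the result back through the pipe.
        send_pipe.send(fn(item))
        send_pipe.close()

    running = []
    for item in input_sequence:
        receive_pipe, send_pipe = mp.Pipe(duplex=False)
        process = mp.Process(target=process_one_item, args=(send_pipe, item))
        process.start()
        running.append((receive_pipe, process))

    # Everything is spawned up front, so this is only reasonable for short inputs;
    # results come back in input order.
    for receive_pipe, process in running:
        yield receive_pipe.recv()
        process.join()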
@dirkgr
dirkgr / shuffle_with_pool.py
Last active September 9, 2020 23:46
Shuffle the output of an iterator
from typing import *
import random
def shuffle_iterable(i: Iterable, pool_size: int = 1024) -> Iterable:
    import random
    i = iter(i)
    pool = []
    # fill up the pool
    for item in i:
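
The preview stops just as the pool fills; a self-contained sketch of the technique, with the same signature but an illustrative body, looks like this. pool_size trades memory for shuffle quality: each new item evicts and yields a random pool element, and the remainder is shuffled and drained at the end.

import random
from typing import Iterable, Iterator

def shuffle_iterable_sketch(i: Iterable, pool_size: int = 1024) -> Iterator:
    i = iter(i)
    pool = []
    # Fill up the pool.
    for item in i:
        pool.append(item)
        if len(pool) >= pool_size:
            break
    # For every further item, yield a random pool element and put the new item in its place.
    for item in i:
        index = random.randrange(len(pool))
        yield pool[index]
        pool[index] = item
    # Drain whatever is left, in random order.
    random.shuffle(pool)
    yield from pool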
@dirkgr
dirkgr / memoize.py
Last active September 12, 2018 20:46
A Python function decorator that memoizes a function persistently, to disk
import dill
import lmdb
import functools
from typing import *
import io
import mmh3
def memoize(exceptions: Optional[List] = None, version = None):
    if exceptions is None:
        exceptions = []
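
A minimal sketch of persistent memoization along the same lines: hash the dill-pickled arguments with mmh3 and cache the dill-pickled result in an LMDB database. The cache path, map size, and the omission of the gist's exceptions parameter are assumptions, not the gist's actual behavior.

import functools

import dill
import lmdb
import mmh3

def memoize_to_disk_sketch(cache_path: str = "memoize.lmdb", version=None):
    env = lmdb.open(cache_path, map_size=2**30)  # 1 GiB cap, an arbitrary choice

    def decorator(fn):
        @functools.wraps(fn)
        def wrapper(*args, **kwargs):
            # Key: murmur hash of the pickled (version, args, kwargs); assumes dill can pickle them.
            key = mmh3.hash_bytes(dill.dumps((version, args, sorted(kwargs.items()))))
            with env.begin() as txn:
                cached = txn.get(key)
            if cached is not None:
                return dill.loads(cached)
            result = fn(*args, **kwargs)
            with env.begin(write=True) as txn:
                txn.put(key, dill.dumps(result))
            return result
        return wrapper
    return decorator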
@dirkgr
dirkgr / getReferences.java
Created May 30, 2018 20:10
Parsing just references with science-parse
ExtractReferences referenceExtractor;
try(
    final InputStream gazetteerIs = new FileInputStream(Parser.getDefaultGazetteer().toFile());
    final DataInputStream bibModelIs = new DataInputStream(new FileInputStream(Parser.getDefaultBibModel().toFile()))
) {
    referenceExtractor = new ExtractReferences(gazetteerIs, bibModelIs);
}
List<String> referenceLines = Arrays.asList("Reference 1", "Reference 2", "etc.");
@dirkgr
dirkgr / RefCount.scala
Created November 22, 2016 22:36
Reference counting in Scala
case class CountedReference[T <: AutoCloseable](counted: ReferenceCounted[T]) extends AutoCloseable {
  counted.inc()
  override def close() = counted.dec()
  def get = counted.inner
}

case class ReferenceCounted[T <: AutoCloseable](inner: T) {
  private val count = new AtomicInteger(0)
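
For comparison, the same pattern sketched in Python, assuming (as the class names suggest) that the underlying resource should be closed when the count drops back to zero; the Py-prefixed names and the context-manager interface are additions, not part of the Scala gist.

import threading

class PyReferenceCounted:
    def __init__(self, inner):
        self.inner = inner
        self._count = 0
        self._lock = threading.Lock()

    def inc(self):
        with self._lock:
            self._count += 1

    def dec(self):
        with self._lock:
            self._count -= 1
            if self._count == 0:
                self.inner.close()  # last holder closes the underlying resource

class PyCountedReference:
    # One handle on the shared resource, usable as a context manager.
    def __init__(self, counted: PyReferenceCounted):
        self.counted = counted
        counted.inc()

    def get(self):
        return self.counted.inner

    def __enter__(self):
        return self.counted.inner

    def __exit__(self, *exc_info):
        self.counted.dec()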
object SuffixTree extends App {
  case class TreeNode(children: Map[Char, TreeNode] = Map.empty, end: Boolean = false) {
    def addString(string: CharSequence): TreeNode = {
      if(string.length() == 0) {
        this.copy(end = true)
      } else {
        val c = string.charAt(0)
        val existingNode = children.getOrElse(c, TreeNode())
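
The gist's node is an immutable Map-based character trie; a mutable Python sketch of the same structure, plus the suffix-insertion loop the object name implies (the rest of the Scala file is cut off above), might look like this:

class TrieNode:
    def __init__(self):
        self.children = {}  # char -> TrieNode
        self.end = False    # True if some inserted string ends here

    def add_string(self, s: str) -> None:
        node = self
        for c in s:
            node = node.children.setdefault(c, TrieNode())
        node.end = True

    def contains(self, s: str) -> bool:
        node = self
        for c in s:
            node = node.children.get(c)
            if node is None:
                return False
        return node.end

def build_suffix_trie(text: str) -> TrieNode:
    # Inserting every suffix turns the plain character trie into a naive suffix trie.
    root = TrieNode()
    for start in range(len(text)):
        root.add_string(text[start:])
    return root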
@dirkgr
dirkgr / allFilesInDirectory.scala
Created August 11, 2016 19:04
Lists all files under a directory, recursively, and fast, in Scala
logger.info(s"Finding out how many files to process in $inputDirectory")
// work around Java's dumb file IO methods
def forEachFileToProcess(f: Path => Unit): Unit = {
val filenamePattern = "^([a-f0-9]{40})\\.pdf$"r
val visitor = new FileVisitor[Path] {
override def visitFileFailed(file: Path, exc: IOException): FileVisitResult =
FileVisitResult.SKIP_SUBTREE
override def visitFile(file: Path, attrs: BasicFileAttributes): FileVisitResult = {
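
For comparison, the standard-library way to do the same walk in Python: os.walk (backed by scandir, so it stays fast on large trees), skipping unreadable directories and matching the 40-hex-character .pdf pattern from the snippet above. The function and parameter names are illustrative.

import os
import re
from typing import Callable

FILENAME_PATTERN = re.compile(r"^[a-f0-9]{40}\.pdf$")

def for_each_file_to_process(root: str, f: Callable[[str], None]) -> None:
    # onerror swallows unreadable directories instead of aborting the whole walk.
    for dirpath, _dirnames, filenames in os.walk(root, onerror=lambda error: None):
        for name in filenames:
            if FILENAME_PATTERN.match(name):
                f(os.path.join(dirpath, name))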
@dirkgr
dirkgr / withFileFromS3.scala
Created December 9, 2015 02:07
Gets a file from S3, wrapped in a function
import org.allenai.common.Resource
import com.amazonaws.services.s3.AmazonS3Client
import java.net.URI
import java.nio.file.{Path, Files}
object IO {
  private val s3 = new AmazonS3Client()
  private val acceptableUrlSchemes = Set("s3", "s3n", "s3a")
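
A Python/boto3 sketch of the same wrapper (the gist uses the AWS Java SDK): download the object to a temporary file, hand the caller its path, and clean up when the block exits. The function name and the temp-file handling are assumptions, not the gist's behavior.

import tempfile
from contextlib import contextmanager
from urllib.parse import urlparse

import boto3

ACCEPTABLE_URL_SCHEMES = {"s3", "s3n", "s3a"}

@contextmanager
def with_file_from_s3(url: str):
    parsed = urlparse(url)
    if parsed.scheme not in ACCEPTABLE_URL_SCHEMES:
        raise ValueError(f"Not an S3 URL: {url}")
    # The temp file is deleted automatically when the with-block ends.
    with tempfile.NamedTemporaryFile(suffix=".s3download") as f:
        boto3.client("s3").download_fileobj(parsed.netloc, parsed.path.lstrip("/"), f)
        f.flush()
        yield f.name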
@dirkgr
dirkgr / firstSuccessOrFirstFailure.scala
Created December 1, 2015 02:20
Function that returns the first failure or the first success out of an iterator of Try, without exhausting the iterator
/** Returns the first failure or the first success out of an iterator of Try, taking care not to
  * exhaust the iterator when possible.
  *
  * @param tries The iterator of Try. Must not be empty.
  * @param defaultFailure The return value to use if all tries left in the iterator are
  *                       failures. If this is None, use the first failure from the iterator.
  * @return the first success out of the iterator, or the first failure if there is no success
  */
@tailrec
def firstSuccessOrFirstFailure[A](
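
The same idea sketched in Python, where there is no Try: iterate over zero-argument callables, return the first result that does not raise, otherwise the first exception, and stop consuming the iterator as soon as a success is found. The gist's defaultFailure parameter is omitted here.

from typing import Callable, Iterator, TypeVar, Union

A = TypeVar("A")

def first_success_or_first_failure(tries: Iterator[Callable[[], A]]) -> Union[A, Exception]:
    first_failure = None
    for attempt in tries:
        try:
            return attempt()  # stop at the first success, leaving the rest unconsumed
        except Exception as e:
            if first_failure is None:
                first_failure = e
    if first_failure is None:
        raise ValueError("tries must not be empty")
    return first_failure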
@dirkgr
dirkgr / fixJson.sh
Created November 23, 2015 19:36
In a JSON file with run-together brackets ("}{"), this inserts newlines between them, so you get one JSON expression per line.
perl -pe 'BEGIN{ $/="}{" } s/}{/}\n{/g'
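
For reference, the same fix as a short Python filter (reads stdin, writes stdout); the perl one-liner above is the original.

import re
import sys

# Insert a newline between run-together JSON objects: "}{" becomes "}\n{".
sys.stdout.write(re.sub(r"\}\{", "}\n{", sys.stdin.read()))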