Skip to content

Instantly share code, notes, and snippets.

View jbaiter's full-sized avatar

Johannes Baiter jbaiter

View GitHub Profile
Thread 15 (Thread 0x7f16de535700 (LWP 23745)):
#0 pthread_cond_wait@@GLIBC_2.3.2 () at ../sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S:185
#1 0x00007f16e1b6da0b in blas_thread_server ()
from /home/jbaiter/.virtualenvs/dtaalign/local/lib/python2.7/site-packages/scipy/special/../.libs/libopenblasp-r0-39a31c03.2.18.so
#2 0x00007f16fd6826fa in start_thread (arg=0x7f16de535700) at pthread_create.c:333
#3 0x00007f16fd3b8b5d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:109
Thread 14 (Thread 0x7f16ded36700 (LWP 23744)):
#0 pthread_cond_wait@@GLIBC_2.3.2 () at ../sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S:185
@jbaiter
jbaiter / manifest.json
Last active November 9, 2016 08:55
Manifest with absolute links in Metadata (see .metadata[1])
{
"attribution": "Bayerische Staatsbibliothek",
"license": "https://creativecommons.org/licenses/by/3.0",
"logo": "https://www.bsb-muenchen.de/fileadmin/templates/images/bsb_logo.png",
"related": "http://daten.digitale-sammlungen.de/~db/0008/bsb00083115/images",
"seeAlso": [
{
"@id": "http://daten.digitale-sammlungen.de/~db/0008/bsb00083115/images"
},
{
@jbaiter
jbaiter / example.py
Created December 25, 2016 22:19
threaded version
def fetch_from_remote(url, timeout=None):
    """Download *url* with an HTTP GET and return the raw response body.

    The HTTP status code is not inspected, so the body of an error
    response is returned just like the body of a successful one.

    Args:
        url: Address to request.
        timeout: Optional timeout in seconds, forwarded unchanged to
            ``requests.get``. The default ``None`` keeps the previous
            behaviour of not applying any timeout.

    Returns:
        The ``content`` attribute of the response returned by
        ``requests.get``.
    """
    return requests.get(url, timeout=timeout).content
def content_generator(urls, max_workers=4):
    """Fetch every URL in *urls* on a thread pool and yield each result.

    All downloads are submitted up front; results are yielded in the
    order the downloads finish, which is not necessarily the order of
    *urls*. An exception raised by a download is re-raised here when its
    result is retrieved.

    Args:
        urls: Iterable of URLs, each passed to ``fetch_from_remote``.
        max_workers: Size of the thread pool. Defaults to 4, the value
            that was previously hard-coded.

    Yields:
        The return value of ``fetch_from_remote`` for each URL.
    """
    # The class is ``ThreadPoolExecutor``; the earlier misspelling raised
    # AttributeError as soon as the generator was first advanced.
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as pool:
        futs = [pool.submit(fetch_from_remote, url) for url in urls]
        for fut in concurrent.futures.as_completed(futs):
            yield fut.result()
import asyncio
import time
from threading import Thread
import aiohttp
import janus
# Question: do I really have to pass the event loop around from the main thread?
async def fetch(url, loop):
async with aiohttp.ClientSession(loop=loop) as session:
package org.apache.solr.highlight;
import com.google.common.primitives.Longs;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
def align(truth_lines, ocr_lines):
nonaligned = []
aligned = []
align_idx = 0
for truth_line in truth_lines:
best_error = 1.0
best_align = 0
for idx, ocr_line in enumerate(ocr_lines[align_idx+1:],
align_idx+1):
total_error = levenshtein(truth_line, ocr_line)
This file has been truncated, but you can view the full file.
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html>
<head>
<title>OCR Output</title>
<meta http-equiv='content-type' content='text/html; charset=utf-8'>
<meta http-equiv='content-style-type' content='text/css'>
<meta name='ocr-capabilities' content='ocr_page ocr_par ocr_cinfo ocr_line'>
<meta name='ocr-system' content=' '>
<meta name='ocr-number-of-pages' content='518'>
<meta name='DC.title' content='Popular Tales of the West Highlands'>
package main
import (
"archive/tar"
"bytes"
"compress/gzip"
"encoding/json"
"errors"
"flag"
"fmt"
@jbaiter
jbaiter / dta_aligner.py
Last active September 28, 2017 13:51
Dependencies: `pip install click requests editdistance kraken lxml pillow-simd sickle`
from __future__ import division
import json
import logging
import os
import re
from collections import OrderedDict
from io import BytesIO
import click