Skip to content

Instantly share code, notes, and snippets.

View urigoren's full-sized avatar

Uri Goren urigoren

View GitHub Profile
import collections
import wikipedia
from bs4 import BeautifulSoup
def infobox(wiki_page):
"""Returns the infobox of a given wikipedia page"""
if isinstance(wiki_page, str):
wiki_page = wikipedia.page(wiki_page)
try:
soup = BeautifulSoup(wiki_page.html()).find_all("table", {"class": "infobox"})[0]
import numpy as np
import pdfplumber
import itertools, collections, sys, os, re, json
from pprint import pprint as pr
from copy import deepcopy
from operator import itemgetter as at
class CartesianText:
__slots__ = ["text", "x0", "x1", "y0", "y1", "page_height"]
from collections import namedtuple
from datetime import datetime
date_pattern = "%Y-%m-%dT%H:%M:%S.%fZ"
Point = namedtuple("Point", ("x", "y"))
def serialize_datetime(nt):
assert hasattr(nt, '_asdict')
import sys, os
import streamlit as st
def file2page_name(fname):
    """Derive a title-cased page name from a page script's file name.

    The file name is expected to look like ``<prefix>_<name>.py``; the
    prefix (everything up to the first underscore) is dropped and the
    remainder is title-cased, e.g. ``"1_home.py"`` -> ``"Home"``.

    Args:
        fname: File name of the page script.

    Returns:
        The title-cased name. If ``fname`` contains no underscore, the
        whole name (minus a trailing ``.py``) is title-cased instead of
        raising ``IndexError``.
    """
    # Strip only a trailing ".py"; str.replace would also remove ".py"
    # occurring in the middle of the name (e.g. "my.pyramid.py").
    stem = fname[:-3] if fname.endswith('.py') else fname
    # partition() never raises: `sep` is empty when there is no underscore.
    _, sep, rest = stem.partition("_")
    return (rest if sep else stem).title()
sys.path.append("..")
page_files = dict()
@urigoren
urigoren / .htaccess
Last active February 7, 2021 13:00
Call Python from PHP via the command line
Options +SymLinksIfOwnerMatch
RewriteEngine on
RewriteCond %{REQUEST_FILENAME} !-f
RewriteCond %{REQUEST_FILENAME} !-d
RewriteRule ^((?!index\.php).+)$ /index.php?py=$1 [NC,L,QSA]
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
@urigoren
urigoren / config_reader.py
Created May 1, 2021 19:41
A simple cascading config reader
import os, sys, json
from pathlib import Path
class ConfigReader:
def __init__(self, default=None, **kwargs):
self.default=default
self.py_file = Path(os.path.join(os.getcwd(), sys.argv[0])).absolute()
p = self.py_file.parent
found_config_json = []
while p!=Path('/'):
@urigoren
urigoren / ConditionedTextClassifier.py
Last active July 10, 2021 14:05
Bag-of-words baseline for conditional text classification
from copy import deepcopy as clone
from sklearn.base import ClassifierMixin
from sklearn.pipeline import Pipeline
class ConditionedTextClassifier(ClassifierMixin):
def __init__(self, conditions, model, condition_sep=' <s> '):
self.condition_sep=condition_sep
self.conditions = {}
for c in conditions:
self.conditions[c] = clone(model)
@urigoren
urigoren / bgprocess.py
Last active February 3, 2022 09:51
Run a Python process in the background
from pathlib import Path
import subprocess, sys
def bgprocess(p: Path, *args):
    """Start a Python script as a child process without waiting for it.

    The script is launched with the interpreter running the current
    process (``sys.executable``), using the script's own directory as the
    working directory. The child's stdout and stderr are discarded.

    Args:
        p: Path to the script; a ``Path`` or anything ``Path()`` accepts.
        *args: Extra command-line arguments for the script. Values that
            are not ``str``, ``bytes`` or path-like (e.g. an integer port)
            are converted with ``str()`` so ``Popen`` does not reject them.

    Returns:
        The ``subprocess.Popen`` handle of the started process.
    """
    import os  # local import: only needed for the PathLike check below

    script = Path(p).absolute()
    # Popen only accepts str/bytes/PathLike items; stringify anything else.
    extra = [
        a if isinstance(a, (str, bytes, os.PathLike)) else str(a)
        for a in args
    ]
    return subprocess.Popen(
        # Pass just the file name: it is resolved relative to `cwd`.
        [sys.executable, script.name] + extra,
        cwd=str(script.parent),
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )
from collections import defaultdict
from itertools import product
from scipy import sparse
from sklearn.base import TransformerMixin
class InteractionBySplit(TransformerMixin):
"""
Takes a sparse matrix as input, and an index to split by, and returns all possible interactions before and after that index.
"""
def __init__(self, split_index,*args,**kwargs):