Joel Klinger jaklinger

@jaklinger
jaklinger / terminate_running.py
Created May 21, 2018 10:58
Terminate all RUNNING and RUNNABLE AWS Batch jobs with boto3
import time
import boto3

client = boto3.client('batch')
i = 0
for job in client.list_jobs(jobQueue="HighPriority", jobStatus="RUNNABLE")["jobSummaryList"]:
    i += 1
    print(job["jobId"])
    client.terminate_job(jobId=job["jobId"], reason="Too slow")
    time.sleep(0.2)
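The gist title covers both RUNNING and RUNNABLE jobs, while the preview above only loops over RUNNABLE. A minimal sketch of how the same pattern might extend to both statuses, reusing the queue name and sleep interval from the snippet (pagination via nextToken is ignored here):

import time
import boto3

client = boto3.client('batch')
for status in ("RUNNING", "RUNNABLE"):
    # Assumed extension: repeat the listing/termination loop for each status
    for job in client.list_jobs(jobQueue="HighPriority", jobStatus=status)["jobSummaryList"]:
        print(job["jobId"])
        client.terminate_job(jobId=job["jobId"], reason="Too slow")
        time.sleep(0.2)  # Brief pause to avoid hammering the Batch API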
@jaklinger
jaklinger / strapper.py
Created June 15, 2018 11:10
Randomly chunk up an iterable, useful for sampling efficiently
import random
def chunks(whole, n_chunks):
    '''Randomly chunk up an iterable'''
    # Make sure that it makes sense to chunk up the object
    if n_chunks > len(whole) or n_chunks <= 0:
        yield whole
        return
    # Copy the iterable (we'll delete it later anyway) and shuffle it
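The preview is cut off after the shuffle comment. One plausible way the generator could finish, sketched here on the assumption that it shuffles a copy of the input and yields roughly equal slices (the shuffle-and-slice details are guesses, not the original code):

import random

def chunks(whole, n_chunks):
    '''Randomly chunk up an iterable'''
    if n_chunks > len(whole) or n_chunks <= 0:
        yield whole
        return
    # Assumed continuation: shuffle a copy, then yield roughly equal slices
    shuffled = list(whole)
    random.shuffle(shuffled)
    chunk_size = len(shuffled) // n_chunks
    for i in range(n_chunks):
        start = i * chunk_size
        # The last chunk absorbs any remainder
        end = start + chunk_size if i < n_chunks - 1 else len(shuffled)
        yield shuffled[start:end]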
@jaklinger
jaklinger / luigi_ec2_setup
Created June 29, 2018 13:46
Instructions for setting up a Luigi server on AWS EC2
Note: THIS IS NOT A SHELL SCRIPT
cd /dev/shm
wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
bash Miniconda3-latest-Linux-x86_64.sh
<LOG OUT AND IN>
conda create --name py36 python=3.6
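Once luigid is running on the instance, tasks can be scheduled against it from any client. A minimal Python sketch, assuming a hypothetical MyTask, that luigi.build accepts the scheduler_host/scheduler_port overrides from Luigi's core config, and that the scheduler listens on the default port 8082:

import luigi

class MyTask(luigi.Task):  # Hypothetical task, for illustration only
    def run(self):
        pass

if __name__ == "__main__":
    # Point the worker at the central scheduler on the EC2 instance
    luigi.build([MyTask()],
                scheduler_host="ec2-xx-xx-xx-xx.compute.amazonaws.com",  # Placeholder hostname
                scheduler_port=8082)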
@jaklinger
jaklinger / papers_by_fos.py
Created July 5, 2018 09:02
Get papers from the Open Academic Graph by field of study (FOS)
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.sql import text as sql_text
from collections import defaultdict
with open('/Users/jklinger/Nesta-AWS/AWS-RDS-config/open-academic-graph.config') as f:
    host, port, database, user, password = f.read().split(':')
database_uri = 'postgresql://{}:{}@{}/{}'.format(user, password, host, "microsoft_academic_graph")
con = create_engine(database_uri)
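The preview stops once the engine is created. A short sketch of the kind of query the title implies, pulling papers for one field of study into pandas; the table and column names here are assumptions, not taken from the gist:

fos = "machine learning"  # Example field of study
query = sql_text("SELECT * FROM papers WHERE fos = :fos")  # Assumed table/column names
papers = pd.read_sql(query, con, params={"fos": fos})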
@jaklinger
jaklinger / api_hitter.html
Created July 19, 2018 13:12
Reminder to self about how to hit my own Flask API (to clio)
<!DOCTYPE html>
<html>
<body>
<!-- The search bar -->
<input type='text' id="query_data"/>
<button onclick="hitAPI()"> Submit </button>
<br>
<!-- Where the response will end up -->
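The preview stops before the response area and the hitAPI script. The page needs a Flask endpoint to talk to; a minimal sketch of what the server side might look like, where the route, parameter name, and response shape are illustrative assumptions rather than the gist's actual API:

from flask import Flask, jsonify, request

app = Flask(__name__)

@app.route("/search")  # Hypothetical route name
def search():
    query = request.args.get("q", "")  # Hypothetical query parameter
    # Echo the query back with an empty result list as a stand-in response
    return jsonify({"query": query, "results": []})

if __name__ == "__main__":
    app.run(port=5000)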
from itertools import chain, combinations

def all_subsets(n):
    return chain(*map(lambda x: combinations(range(0, n), x), range(2, n+1)))

def subset_matrix(n):
    rows = []
    for subset in all_subsets(n):
        new_row = [0]*n
        for i in subset:
            new_row[i] = 1
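The preview above cuts off mid-function. A sketch of how subset_matrix might finish, assuming each indicator row is simply collected and the list of rows returned:

def subset_matrix(n):
    rows = []
    for subset in all_subsets(n):
        new_row = [0]*n
        for i in subset:
            new_row[i] = 1
        rows.append(new_row)  # Assumed continuation: collect each indicator row
    return rows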
@jaklinger
jaklinger / scan_elasticsearch.py
Created August 28, 2018 15:55
Getting all documents from an Elasticsearch database. Note this method doesn't rank results in any way. For that, you should use the search API, and accept that you will get the top N results.
'''Getting all documents from an Elasticsearch database.
Note this method doesn't rank results in any way. For that, you should
use the search API, and accept that you will get the top N results.'''
from elasticsearch.helpers import scan
from elasticsearch import Elasticsearch
ENDPOINT = "" # <=== Enter an endpoint URI here
es = Elasticsearch(ENDPOINT, index="rwjf", doc_type="world_reporter")
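The preview stops after the client is created. A sketch of how the scan helper is typically used to walk every document; the match_all query and loop body are assumptions, while the index and doc_type values mirror the snippet:

results = scan(es,
               index="rwjf",
               doc_type="world_reporter",
               query={"query": {"match_all": {}}})
for doc in results:
    # Each hit carries its id and the stored document body
    print(doc["_id"], doc["_source"])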
@jaklinger
jaklinger / s3_to_pandas.py
Last active April 7, 2019 03:01
Read a CSV (or JSON, etc.) from AWS S3 into a pandas DataFrame
import boto3
import pandas as pd
from io import BytesIO
bucket, filename = "bucket_name", "filename.csv"
s3 = boto3.resource('s3')
obj = s3.Object(bucket, filename)
with BytesIO(obj.get()['Body'].read()) as bio:
    df = pd.read_csv(bio)
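As the title suggests, the same pattern handles other formats by swapping the reader; a minimal variant for newline-delimited JSON, where the key name is a hypothetical example:

obj = s3.Object(bucket, "filename.json")  # Hypothetical JSON key
with BytesIO(obj.get()['Body'].read()) as bio:
    df = pd.read_json(bio, lines=True)  # lines=True for newline-delimited JSON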
@jaklinger
jaklinger / json_to_s3.py
Created November 7, 2018 13:04
Write JSON to S3
import boto3
import json
from io import StringIO
s3 = boto3.resource("s3")
data = [{"a":1, "c":3},{"b":2}]
s3_obj = s3.Object(<BUCKET NAME>, <FILE NAME>)
s3_obj.put(Body=json.dumps(data))
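To read the data back, the same object can be fetched and decoded; a short sketch using hypothetical bucket and key names in place of the placeholders above:

s3_obj = s3.Object("my-bucket", "data.json")  # Hypothetical bucket/key names
data = json.loads(s3_obj.get()["Body"].read())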
@jaklinger
jaklinger / s3_bucket_to_pandas.py
Created November 7, 2018 14:01
Read the entire contents of an S3 bucket into a pandas DataFrame
import boto3
import pandas as pd
from io import BytesIO
bucket = "innovation-mapping-general"
directory = "nih_all_processed_data/"
s3 = boto3.resource('s3')
dfs = []
for key in s3.Bucket(bucket).objects.all():
    if not key.key.startswith(directory):
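The preview is cut off inside the loop. A sketch of how it plausibly continues: skip keys outside the directory, read each remaining object into a dataframe, and concatenate at the end (the continue/concat details are assumptions, not the original code):

for key in s3.Bucket(bucket).objects.all():
    if not key.key.startswith(directory):
        continue  # Skip objects outside the target directory
    with BytesIO(key.get()['Body'].read()) as bio:
        dfs.append(pd.read_csv(bio))
df = pd.concat(dfs)  # One dataframe for the whole bucket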