Skip to content

Instantly share code, notes, and snippets.

View fozziethebeat's full-sized avatar

Keith Stevens fozziethebeat

View GitHub Profile
@fozziethebeat
fozziethebeat / prefect_caching_tasks.py
Created December 16, 2021 03:05
Same as other simple Prefect sample pipeline but caches output of tasks to local files. Data is stored as CloudPickles.
import numpy as np
import pandas as pd
import prefect
from os import listdir
from os.path import isfile, join
from prefect import Flow, apply_map, case, task
from prefect.engine.results import LocalResult
from prefect.tasks.control_flow import merge
@fozziethebeat
fozziethebeat / save_computation_prefect.py
Created December 15, 2021 07:35
Tries reading some DataFrames and only re-computes new output DataFrames if needed; otherwise copies the old result DataFrames.
import numpy as np
import pandas as pd
import prefect
from os import listdir
from os.path import isfile, join
from prefect import Flow, apply_map, case, task
from prefect.tasks.control_flow import merge
INPUT_BASE_PATH = './data/input'
@fozziethebeat
fozziethebeat / minimal_can_beam_pipeline.py
Created December 3, 2021 05:18
Minimal Beam CAN Pipeline
# Requires
# pip install apache-beam
# pip install apache-beam[dataframe]
#
# Associated documentation
# Beam Dataframe API: https://beam.apache.org/releases/pydoc/2.34.0/apache_beam.dataframe.html
# Beam Dataframe Overview: https://beam.apache.org/documentation/dsls/dataframes/overview/
# Beam Dataframe Differences: https://beam.apache.org/documentation/dsls/dataframes/differences-from-pandas/
@fozziethebeat
fozziethebeat / simple_beam_can_pipeline.py
Created November 30, 2021 01:09
Simple Beam CAN Pipeline
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from libs.datasets import combined_datasets
from pyseir.run import OneRegionPipeline
class ProcessOneRegion(beam.DoFn):
def process(self, one_region):
@fozziethebeat
fozziethebeat / dynamic_can_airflow.py
Created November 30, 2021 01:05
Full Dynamic CAN Airflow
from collections import defaultdict
from airflow.decorators import dag, task
from airflow.utils.dates import days_ago
from pyseir.run import OneRegionPipeline
from libs.datasets import combined_datasets
default_args = {
'owner': 'airflow',
@fozziethebeat
fozziethebeat / simple_dymaic_can_airflow.py
Created November 30, 2021 01:03
Simple Dynamic CAN Airflow
from collections import defaultdict
from airflow import DAG
from airflow.decorators import dag, task
from airflow.utils.dates import days_ago
from airflow.operators.python import PythonOperator
from libs.datasets import combined_datasets
default_args = {
'owner': 'airflow',
@fozziethebeat
fozziethebeat / NamedEntityRecognitionExample
Created August 1, 2012 07:35
A simple example using Stanford's Named Entity Recognizer as a library
import edu.stanford.nlp.ie.crf.CRFClassifier
import edu.stanford.nlp.ling.CoreLabel
import edu.stanford.nlp.ling.Word
import edu.stanford.nlp.util.StringUtils
import edu.stanford.nlp.sequences.PlainTextDocumentReaderAndWriter
import edu.stanford.nlp.sequences.PlainTextDocumentReaderAndWriter.OutputStyle
import scala.collection.JavaConversions.collectionAsScalaIterable
import scala.collection.JavaConversions.seqAsJavaList
import scala.io.Source
@fozziethebeat
fozziethebeat / AdjustedMutualInformation.scaja
Created March 9, 2012 17:04
A sample implementation of Adjusted Mutual Information written in Scala. This is used to test SemEval Word Sense Induction algorithms.
/*
* Copyright (c) 2011, Lawrence Livermore National Security, LLC. Produced at
* the Lawrence Livermore National Laboratory. Written by Keith Stevens,
* [email protected] OCEC-10-073 All rights reserved.
*
* This file is part of the S-Space package and is covered under the terms and
* conditions therein.
*
* The S-Space package is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as published
@fozziethebeat
fozziethebeat / NeighborChainAgglomerativeClustering.scala
Created March 2, 2012 05:14
A Scala-based O(n^2) agglomerative clustering implementation
import scala.collection.mutable.HashMap
import scala.collection.mutable.HashSet
import scala.collection.mutable.Stack
class NeighborChainAgglomerativeClustering {
/**
* Clusters the elements represented as symmetric adjacency matrix. Values in
* {@code adj} represent the similarity between any two points using a
* symmetric similarity metric. This returns sets of points assigned to the
@fozziethebeat
fozziethebeat / TestLttoolbox.scala
Created February 8, 2012 19:43
A sample scala driver for doing morphological analysis with lttoolbox
/**
* Sample code for using the <a
* href="http://wiki.apertium.org/wiki/Lttoolbox-java">lttoolbox-java</a> code
* from within Scala. This morphologically analyzes a simple sentence using
* this <a
* href="http://sourceforge.net/projects/apertium/files/apertium-mk-en/apertium-mk-en-0.1.0.tar.gz/download">english
* dictionary</a>. To run this code, first do two steps:
*
* <ul>
* <li> download and compile lttoolbox-java as per <a href="http://wiki.apertium.org/wiki/Lttoolbox-java">these instructions</a>.</li>