Skip to content

Instantly share code, notes, and snippets.

@a-paxton
a-paxton / text-cleaning+word2vec-gensim.py
Created September 11, 2015 23:31
Cleaning Text Data and Creating 'word2vec' Model with Gensim
# preliminaries
# Gist fragment: setup for cleaning text pulled from MongoDB and training a
# word2vec model with gensim (NLTK supplies stopwords; snowballstemmer stems).
from pymongo import MongoClient
from nltk.corpus import stopwords
from string import ascii_lowercase
import pandas as pd
import gensim, os, re, pymongo, itertools, nltk, snowballstemmer
# set the location where we'll save our model
savefolder = '/data'
// Spark LDA topic-modeling setup (spark-shell style; `sqlContext` is the
// pre-Spark-2.x shell-provided SQLContext).
import org.apache.spark.ml.feature.{CountVectorizer, RegexTokenizer, StopWordsRemover}
import org.apache.spark.mllib.clustering.{LDA, OnlineLDAOptimizer}
import org.apache.spark.mllib.linalg.Vector
import sqlContext.implicits._
// Hyperparameters: topic count, optimizer iterations, and vocabulary cap
// for the CountVectorizer feeding the LDA model.
val numTopics: Int = 100
val maxIterations: Int = 100
val vocabSize: Int = 10000
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
@rbren
rbren / README.md
Created January 25, 2016 23:36
Sync GitHub Issues to Trello

DataFire Daemon: Sync GitHub Issues to Trello

Create a Trello list for every Milestone, and a card for every Issue

View on DataFire

[metadata]: ./ '{"links":[{"connection":"564cb90e2a034b0e0f1255fe","operation":{"method":"get","path":"/members/{idMember}/boards"}},{"connection":"563b9b84ea9ad5f345e97505","operation":{"method":"get","path":"/repos/{ownerId}/{repoId}/issues"}},{"connection":"564cb90e2a034b0e0f1255fe","operation":{"method":"get","path":"/boards/{idBoard}/cards"}},{"connection":"564cb90e2a034b0e0f1255fe","operation":{"method":"get","path":"/boards/{idBoard}/lists"}},{"connection":"563b9b84ea9ad5f345e97505","operation":{"method":"get","path":"/repos/{ownerId}/{repoId}/milestones"}},{"connection":"564cb90e2a034b0e0f1255fe","operation":{"method":"post","path":"/lists"}},{"connection":"564cb90e2a034b0e0f1255fe","operation":{"method":"post","path":"/cards"}},{"connection":"564cb90e2a034b0e0f1255fe","operation":{"method":"put","path":"/cards/{i

@mrahul17
mrahul17 / fimp_xgboost.py
Last active May 31, 2017 22:20
Feature importance in XGBoost
# credits @mmueller https://www.kaggle.com/mmueller/liberty-mutual-group-property-inspection-prediction/xgb-feature-importance-python/code
import pandas as pd
import xgboost as xgb
import operator
from matplotlib import pylab as plt
def ceate_feature_map(features):
    # Write an XGBoost feature-map file ('xgb.fmap') mapping a running index
    # to each feature name, for use with xgb.Booster.get_fscore(fmap=...).
    # NOTE(review): function name is a typo of "create_feature_map"; kept
    # as-is because callers (not visible in this fragment) use this spelling.
    outfile = open('xgb.fmap', 'w')
    i = 0
    # NOTE(review): loop body is truncated in this scrape — the lines that
    # write "i\tfeat\tq\n" entries and close the file are not visible here.
    for feat in features:
# -*- coding: utf-8 -*-
## EXPORTING TO PDF FROM revealjs OR jupyter notebook slides
## using nbconvert and decktape (https://github.com/astefanutti/decktape)
## to export pdf and/or html (revealjs)
## from jupyter notebook / revealjs html
## phantomjs must be included in PATH, and the decktape directory must be placed beside this export_reveal.py file
## for more detail, please check:
## nbconvert - https://github.com/jupyter/nbconvert
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
@duckworth
duckworth / alb.sql
Created December 20, 2016 19:56
AWS ALB Logs Athena RegexSerDe
-- Athena external table over AWS ALB access logs.
-- NOTE(review): statement is truncated in this scrape — the remaining log
-- fields, the RegexSerDe ROW FORMAT clause, and the S3 LOCATION are cut off.
CREATE EXTERNAL TABLE IF NOT EXISTS alb_logs (
  type string,                          -- request type (http/https/h2/ws/wss)
  timestamp string,                     -- request timestamp, kept as string
  elb string,                           -- load balancer resource ID
  client_ip string,
  client_port int,
  target_ip string,
  target_port int,
  request_processing_time double,
  target_processing_time double,
You can use select with varargs including *:
import spark.implicits._
df.select($"*" +: Seq("A", "B", "C").map(c =>
sum(c).over(Window.partitionBy("ID").orderBy("time")).alias(s"cum$c")
): _*)
This:
Maps column names to window expressions with Seq("A", ...).map(...)
# See official docs at https://dash.plotly.com
# pip install dash pandas
# Minimal Dash app preliminaries: import the framework and load the
# Gapminder five-year dataset used by the example callbacks.
from dash import Dash, dcc, html, Input, Output
import plotly.express as px
import pandas as pd

# Fix: the original URL host was scrape-garbled ("raw.githubusercontent.com");
# restored to the canonical raw.githubusercontent.com host so read_csv works.
df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/gapminderDataFiveYear.csv')