Skip to content

Instantly share code, notes, and snippets.

## FUC (Frequently Used Commands)
df.isnull().sum() # num of missing values per column
## USEFUL FUNCTIONS
# Binning
pd.cut
pd.qcut # quantile-based pd.cut
@mieitza
mieitza / Spark Dataframe Cheat Sheet.py
Created January 3, 2018 15:08 — forked from ved93/Spark Dataframe Cheat Sheet.py
Cheat sheet for Spark Dataframes (using Python)
# A simple cheat sheet of Spark Dataframe syntax
# Current for Spark 1.6.1
# import statements
from pyspark.sql import SQLContext
from pyspark.sql.types import *
from pyspark.sql.functions import *
#creating dataframes
df = sqlContext.createDataFrame([(1, 4), (2, 5), (3, 6)], ["A", "B"]) # from manual data
#Read in the log
matches = pd.read_csv('../data/all_but_champ/match_log.csv')
#Add a column for match length
matches["length"] = matches["player1-score"] + matches["player2-score"]
# Get all the records where Trav won
travis_winner = matches[matches["winner"] == "Travis Roberts"]
# Get all the records where Trav lost
travis_loser = matches[matches["loser"] == "Travis Roberts"]
@mieitza
mieitza / deduplication_mongod.py
Created January 3, 2018 15:09 — forked from kireal/deduplication_mongod.py
Run to deduplicate mongodb collection
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Utility for deduplicating a MongoDB collection.
#
# Works only with pymongo >= 3.0
# Kireal
#
# 01.10.2016
# Last change: 04.10.2016
@mieitza
mieitza / mongo_to_csv.py
Created January 3, 2018 15:09 — forked from wixb50/mongo_to_csv.py
Python: export MongoDB data to CSV using pandas.
# @Author: xiewenqian <int>
# @Date: 2016-11-28T20:35:09+08:00
# @Email: [email protected]
# @Last modified by: int
# @Last modified time: 2016-12-01T19:32:48+08:00
import pandas as pd
from pymongo import MongoClient
def reaching_max_messages(current_hour_bitrange):
"""
from https://stackoverflow.com/a/29281409
"""
# https://stackoverflow.com/a/29281409
# https://docs.mongodb.com/manual/reference/operator/query/bitsAllSet/
pipeline = [
{
@mieitza
mieitza / mongo.py
Created January 3, 2018 15:19
parallel insert
import json
import pandas as pd
from pymongo import MongoClient
def mongo_to_pandas(client, database, collection):
""" Reads mongo collection to pandas data frame."""
cur = client[database][collection].find()
df = pd.DataFrame(list(cur))
df["_id"] = df["_id"].apply(lambda x: str(x)) #convert ids to strings
import pandas as pd
import numpy as np
import re, os, time, csv, random, pymongo, math, html, string, html2text
from pymongo import MongoClient
import xml.etree.ElementTree as ET
import pymongo
import sklearn
@mieitza
mieitza / clustering.py
Created January 3, 2018 15:20 — forked from shahradj/clustering.py
Perform clustering on mouse tracking data
import pandas as pd
from scipy.cluster.hierarchy import fclusterdata
from pymongo import MongoClient
import numpy as np
from datetime import *
import json
def saveToRelational(jsonPacket):
"""
save the received json packet to a relational database
import pandas as pd
from pandas.io.json import json_normalize
from pymongo import MongoClient
import matplotlib.pyplot as plt
import re
import time
pd.set_option('display.expand_frame_repr', False)
def _connect_mongo(host, port, db):