Skip to content

Instantly share code, notes, and snippets.

@mieitza
mieitza / import_csv_to_mongo
Created January 3, 2018 15:07 — forked from thangarajan8/import_csv_to_mongo
Store CSV data into mongodb using python pandas
#!/usr/bin/env python
import sys
import pandas as pd
import pymongo
import json
def import_content(filepath):
mng_client = pymongo.MongoClient('localhost', 27017)
@mieitza
mieitza / pandas-cheatsheet.py
Created January 3, 2018 15:08 — forked from spepechen/pandas-cheatsheet.py
handy 🐼 snippets
#### BASIC ########################################################################################################################
# Cheat-sheet snippets; every line assumes an existing pandas DataFrame named `df`.
# cleaning str in the header
df.columns = [x.lower().strip() for x in df.columns] # lower case, trim leading and trailing spaces
df.columns = [x.strip().replace(' ', '_') for x in df.columns] # replace whitespaces b/w words with _
# checking NaN in all df
df.isnull().values.any()
# get column-slices
## FUC (Frequently Used Commands)
df.isnull().sum() # num of missing values per column (isnull is a method: call it before .sum())
## USEFUL FUNCTIONS
# Binning
pd.cut
pd.qcut # quantile-based pd.cut
@mieitza
mieitza / Spark Dataframe Cheat Sheet.py
Created January 3, 2018 15:08 — forked from ved93/Spark Dataframe Cheat Sheet.py
Cheat sheet for Spark Dataframes (using Python)
# A simple cheat sheet of Spark Dataframe syntax
# Current for Spark 1.6.1
# import statements
from pyspark.sql import SQLContext
from pyspark.sql.types import *
from pyspark.sql.functions import *
# creating dataframes
# NOTE(review): `sqlContext` is not defined in the visible snippet — presumably an
# SQLContext instance created beforehand; confirm against the full gist.
df = sqlContext.createDataFrame([(1, 4), (2, 5), (3, 6)], ["A", "B"]) # from manual data
# Read in the match log
# NOTE(review): `pd` is not imported in the visible part of this snippet — presumably
# `import pandas as pd` appears earlier in the original gist; confirm.
matches = pd.read_csv('../data/all_but_champ/match_log.csv')
# Add a column for match length (sum of the two players' scores)
matches["length"] = matches["player1-score"] + matches["player2-score"]
# Get all the records where Trav won
travis_winner = matches[matches["winner"] == "Travis Roberts"]
# Get all the records where Trav lost
travis_loser = matches[matches["loser"] == "Travis Roberts"]
@mieitza
mieitza / deduplication_mongod.py
Created January 3, 2018 15:09 — forked from kireal/deduplication_mongod.py
Run to deduplicate mongodb collection
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Utility for deduplicating a collection in a MongoDB database.
#
# Works only with pymongo >= 3.0
# Kireal
#
# 01.10.2016
# Last change: 04.10.2016
@mieitza
mieitza / mongo_to_csv.py
Created January 3, 2018 15:09 — forked from wixb50/mongo_to_csv.py
Export MongoDB to CSV in Python using pandas.
# @Author: xiewenqian <int>
# @Date: 2016-11-28T20:35:09+08:00
# @Email: [email protected]
# @Last modified by: int
# @Last modified time: 2016-12-01T19:32:48+08:00
import pandas as pd
from pymongo import MongoClient
def reaching_max_messages(current_hour_bitrange):
"""
from https://stackoverflow.com/a/29281409
"""
# https://stackoverflow.com/a/29281409
# https://docs.mongodb.com/manual/reference/operator/query/bitsAllSet/
pipeline = [
{
@mieitza
mieitza / mongo.py
Created January 3, 2018 15:19
parallel insert
import json
import pandas as pd
from pymongo import MongoClient
def mongo_to_pandas(client, database, collection):
""" Reads mongo collection to pandas data frame."""
cur = client[database][collection].find()
df = pd.DataFrame(list(cur))
df["_id"] = df["_id"].apply(lambda x: str(x)) #convert ids to strings
import pandas as pd
import numpy as np
import re, os, time, csv, random, pymongo, math, html, string, html2text
from pymongo import MongoClient
import xml.etree.ElementTree as ET
import pymongo
import sklearn