mieitza’s gists

mieitza / import_csv_to_mongo

Created January 3, 2018 15:07 — forked from thangarajan8/import_csv_to_mongo

Store CSV data into mongodb using python pandas

	#!/usr/bin/env python
	import sys
	import pandas as pd
	import pymongo
	import json



	def import_content(filepath):
	mng_client = pymongo.MongoClient('localhost', 27017)

mieitza / pandas-cheatsheet.py

Created January 3, 2018 15:08 — forked from spepechen/pandas-cheatsheet.py

handy 🐼 snippets

	#### BASIC ########################################################################################################################

	# cleaning str in the header
	df.columns = [x.lower().strip() for x in df.columns] # lower case, trim leading and trailing spaces
	df.columns = [x.strip().replace(' ', '_') for x in df.columns] # replace whitespaces b/w words with _

	# checking NaN in all df
	df.isnull().values.any()

	# get column-slices

mieitza / 00a_quick_ref.py

Created January 3, 2018 15:08 — forked from Yogendra0Sharma/00a_quick_ref.py

pandas

	## FUC (Frequently Used Commands)
	# isnull is a method: call it to get the boolean mask, then sum per column.
	df.isnull().sum() # num of missing values per column


	## USEFUL FUNCTIONS
	# Binning
	pd.cut
	pd.qcut # quantile-based pd.cut

mieitza / Spark Dataframe Cheat Sheet.py

Created January 3, 2018 15:08 — forked from ved93/Spark Dataframe Cheat Sheet.py

Cheat sheet for Spark Dataframes (using Python)

	# A simple cheat sheet of Spark Dataframe syntax
	# Current for Spark 1.6.1

	# import statements
	from pyspark.sql import SQLContext
	from pyspark.sql.types import *
	from pyspark.sql.functions import *

	#creating dataframes
	df = sqlContext.createDataFrame([(1, 4), (2, 5), (3, 6)], ["A", "B"]) # from manual data

mieitza / pandas example.py

Created January 3, 2018 15:08 — forked from jacobSingh/pandas example.py

	# Read in the log
	matches = pd.read_csv('../data/all_but_champ/match_log.csv')

	#Add a column for match length
	matches["length"] = matches["player1-score"] + matches["player2-score"]

	# Get all the records where Trav won
	travis_winner = matches[matches["winner"] == "Travis Roberts"]
	# Get all the records where Trav lost
	travis_loser = matches[matches["loser"] == "Travis Roberts"]

mieitza / deduplication_mongod.py

Created January 3, 2018 15:09 — forked from kireal/deduplication_mongod.py

Run to deduplicate mongodb collection

	#!/usr/bin/env python
	# -*- coding: utf-8 -*-
	#
	# Utility for deduplicating a MongoDB collection.
	#
	# Works only with pymongo >= 3.0
	# Kireal
	#
	# 01.10.2016
	# Last change: 04.10.2016

mieitza / mongo_to_csv.py

Created January 3, 2018 15:09 — forked from wixb50/mongo_to_csv.py

Export MongoDB data to CSV in Python using pandas.

	# @Author: xiewenqian <int>
	# @Date: 2016-11-28T20:35:09+08:00
	# @Email: [email protected]
	# @Last modified by: int
	# @Last modified time: 2016-12-01T19:32:48+08:00


	import pandas as pd
	from pymongo import MongoClient

mieitza / comparing_two_fields.py

Created January 3, 2018 15:10 — forked from VITIMan/comparing_two_fields.py

PyMongo snippets


	def reaching_max_messages(current_hour_bitrange):
	"""
	from https://stackoverflow.com/a/29281409
	"""
	# https://stackoverflow.com/a/29281409
	# https://docs.mongodb.com/manual/reference/operator/query/bitsAllSet/

	pipeline = [
	{

mieitza / mongo.py

Created January 3, 2018 15:19

parallel insert

	import json
	import pandas as pd
	from pymongo import MongoClient


	def mongo_to_pandas(client, database, collection):
	""" Reads mongo collection to pandas data frame."""
	cur = client[database][collection].find()
	df = pd.DataFrame(list(cur))
	df["_id"] = df["_id"].apply(lambda x: str(x)) #convert ids to strings

mieitza / logistic_regression_model (1).py

Created January 3, 2018 15:19 — forked from rohitr360/logistic_regression_model (1).py

	import pandas as pd
	import numpy as np
	import re, os, time, csv, random, pymongo, math, html, string, html2text
	from pymongo import MongoClient
	import xml.etree.ElementTree as ET
	import pymongo
	import sklearn

Mihai mieitza