# Convert an Excel sheet to CSV
in2csv file1.xls > file1.csv
# Convert a fixed-width file to CSV, using a CSV schema describing the columns
in2csv -f fixed -s schema.csv data.fixed > data.csv
# Keep only the rows whose phone_number matches the regex
csvgrep -c phone_number -r "\d{3}-123-\d{4}" data.csv > matching.csv
import pandas as pd
from github import Github

# Note: username/password auth is deprecated by GitHub; a personal
# access token can be passed as the single argument instead.
g = Github("username", "password")
# Collect the URL and name of every repo the user has starred
final = ({'url': r.html_url, 'name': r.name} for r in g.get_user().get_starred())
pd.DataFrame(final).to_excel('Github Stars 20160101.xlsx')  # needs an Excel writer, e.g. openpyxl
# Does each name in `vars` start with `match`? (case-insensitive by default)
starts_with <- function(vars, match, ignore.case = TRUE) {
  if (ignore.case) match <- tolower(match)
  n <- nchar(match)
  if (ignore.case) vars <- tolower(vars)
  substr(vars, 1, n) == match
}
# Presumably mirrors starts_with(): does each name in `vars` end with `match`?
ends_with <- function(vars, match, ignore.case = TRUE) {
  if (ignore.case) match <- tolower(match)
  n <- nchar(match)
  if (ignore.case) vars <- tolower(vars)
  len <- nchar(vars)
  substr(vars, pmax(1, len - n + 1), len) == match
}
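# A quick sanity check of both helpers with hypothetical variable names:
vars <- c("Sepal.Length", "Petal.Width")
starts_with(vars, "petal")  # FALSE  TRUE
ends_with(vars, "width")    # FALSE  TRUE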
# A list of records with inconsistent fields (dput() output)
mylist <- list(structure(list(Hit = "True", Project = "Blue", Year = "2011",
    Rating = "4", Launch = "26 Jan 2012", ID = "19", Dept = "1, 2, 4"), .Names = c("Hit",
    "Project", "Year", "Rating", "Launch", "ID", "Dept")), structure(list(
    Hit = "False", Error = "Record not found"), .Names = c("Hit",
    "Error")), structure(list(Hit = "True", Project = "Green", Year = "2004",
    Rating = "8", Launch = "29 Feb 2004", ID = "183", Dept = "6, 8"), .Names = c("Hit",
    "Project", "Year", "Rating", "Launch", "ID", "Dept")))

# Convert each record to a one-row data frame
dfs <- lapply(mylist, data.frame, stringsAsFactors = FALSE)
library(dplyr)
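# The snippet stops after loading dplyr; presumably the records are then
# stacked into a single data frame. bind_rows() does this and fills fields
# a record lacks (e.g. Error) with NA.
bind_rows(dfs)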
# Min-max scaling: rescale x linearly to the interval [a, b].
#   x: numeric vector of values to be scaled
#   a: desired minimum after scaling takes place
#   b: desired maximum after scaling takes place
# e.g. minmax_scaler(c(1, 2, 3, 4), 1, 17)
# [1]  1.000000  6.333333 11.666667 17.000000
minmax_scaler <- function(x, a, b) {
  (((b - a) * (x - min(x))) / (max(x) - min(x))) + a
}
def select(dataframe, columns, keep_others=True):
    '''Re-order or select columns: if keep_others, move `columns` to the
    front and keep the rest; otherwise return only `columns`.'''
    if keep_others:
        # Preserve the frame's original column order for the leftovers;
        # a plain set difference would return them in arbitrary order.
        others = [c for c in dataframe.columns if c not in set(columns)]
        return dataframe[columns + others]
    else:
        return dataframe[columns]
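# A minimal usage sketch with a hypothetical three-column frame:
import pandas as pd
df = pd.DataFrame({'a': [1], 'b': [2], 'c': [3]})
select(df, ['c']).columns.tolist()                     # ['c', 'a', 'b']
select(df, ['c'], keep_others=False).columns.tolist()  # ['c']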
def search_item(dataframe, name, query, na=False, case=False, regex=True):
    '''Return the rows whose `name` column matches any term in `query`.'''
    idx = pd.Series([False] * len(dataframe), index=dataframe.index)
    # For each term in the query, mark the rows whose text contains it
    for q in query:
        idx |= dataframe[name].str.contains(q, na=na, case=case, regex=regex)
    return dataframe[idx]
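# A hedged usage sketch with hypothetical data; the None row is dropped
# because na=False treats missing text as a non-match:
import pandas as pd
df = pd.DataFrame({'title': ['red apple', 'green pear', None]})
search_item(df, 'title', ['apple', 'pear'])  # returns the first two rows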
--- | |
title: 'Going deeper with dplyr: New features in 0.3 and 0.4'
output: html_document
---

## Introduction

In August 2014, I created a [40-minute video tutorial](https://www.youtube.com/watch?v=jWjqLW-u3hc) introducing the key functionality of the dplyr package in R, using dplyr version 0.2. Since then, there have been two significant updates to dplyr (0.3 and 0.4), introducing a ton of new features.

This document (created in March 2015) covers the most useful new features in 0.3 and 0.4, as well as other functionality that I didn't cover last time (though it is not necessarily new). My [new video tutorial](https://www.youtube.com/watch?v=2mh1PqfsXVI) walks through the code below in detail.
--- | |
title: "Introduction to dplyr for Faster Data Manipulation in R" | |
output: html_document | |
--- | |
Note: There is a 40-minute [video tutorial](https://www.youtube.com/watch?v=jWjqLW-u3hc) on YouTube that walks through this document in detail. | |
## Why do I use dplyr? | |
* Great for data exploration and transformation |