Last active
March 10, 2016 03:20
-
-
Save pkrnjevic/6e98bc711f0f213c59eb to your computer and use it in GitHub Desktop.
One way to concatenate a lot of files quickly (352k in 2m48sec) ... inspired by http://randyzwitch.com/gnu-parallel-medium-data/
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// catfile.go
// One possible way to concatenate files quickly,
// inspired by http://randyzwitch.com/gnu-parallel-medium-data/
// Runtime on a late-2013 15" MacBook Pro: 352k files concatenated in 2m48s.
//
package main | |
import ( | |
"fmt" | |
"io/ioutil" | |
"os" | |
"path/filepath" | |
) | |
// walk returns a filepath.WalkFunc that appends the contents of every
// non-directory entry it visits to out. For each entry it also prints a
// running counter and the entry's path to stdout as a progress indicator.
//
// Any error handed in by filepath.Walk, and any error from reading a file or
// writing to out, is returned to the walker (which stops the walk and reports
// it to the caller) rather than being ignored or raised as a panic.
func walk(out *os.File) filepath.WalkFunc {
	count := 0
	return func(path string, info os.FileInfo, err error) error {
		// filepath.Walk reports lstat/readdir failures through err, and in
		// that case info may be nil, so it must be checked before info is used.
		if err != nil {
			return err
		}
		count++
		// The trailing "\r" rewinds the cursor so progress stays on one line.
		fmt.Printf("%d\t%s\r", count, path)
		if info.IsDir() {
			return nil
		}
		dat, err := ioutil.ReadFile(path)
		if err != nil {
			// The error from ReadFile already names the path.
			return err
		}
		if _, err := out.Write(dat); err != nil {
			return fmt.Errorf("writing contents of %q: %w", path, err)
		}
		return nil
	}
}
func main() { | |
out, err := os.Create("output.txt") | |
defer out.Close() | |
if err != nil { | |
panic(err) | |
} | |
walk := walk(out) | |
err = filepath.Walk("transactions/", walk) | |
if err != nil { | |
panic(err) | |
} | |
fmt.Printf("\ndone\n") | |
} | |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# code from http://randyzwitch.com/gnu-parallel-medium-data/
# used to generate test data files
# Python Code
#
# Generates one space-delimited file per customer under transactions/, each
# holding a random number of fake transactions drawn from groceries.txt.
import random, csv
from faker import Faker
fake = Faker()
from pandas import DataFrame
import pandas as pd

# Number of customers (and therefore of transaction files) to generate.
NUM_CUSTOMERS = 1234567

# Create customer file of 1,234,567 customers with fake data
# Use dataframe index as a way to generate unique customer id
customers = [fake.simple_profile() for x in range(0, NUM_CUSTOMERS)]
customer_df = pd.DataFrame(customers)
customer_df["cust_id"] = customer_df.index

#Read in transactions file from arules package
#with open("grocerydata.txt") as f:
with open("groceries.txt") as f:
    transactions = f.readlines()

#Remove new line character
# rstrip only removes an actual trailing newline, so a final line without one
# does not lose its last character (which x[0:-1] would chop off).
transactions = [x.rstrip("\n") for x in transactions]

#Generate transactions by cust_id
#file format:
#cust_id::int
#store_id::int
#transaction_datetime::string/datetime
#items::string

#for each customer...
for i in range(0, NUM_CUSTOMERS):
    #...create a file...
    # with open('./transactions/custfile_%s' % i, 'w') as csvfile:
    with open('transactions/custfile_%s' % i, 'w') as csvfile:
        trans = csv.writer(csvfile, delimiter=' ', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        #...that contains all of the transactions they've ever made
        # range(1, n) yields n - 1 rows, so a customer gets between 0 and 364 rows.
        for j in range(1, random.randint(1,365)):
            trans.writerow([i, fake.zipcode(), fake.date_time_this_decade(before_now=True, after_now=False), transactions[random.randint(0,len(transactions) - 1)]])
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# code from http://randyzwitch.com/gnu-parallel-medium-data/
# used to generate test data files
# R Code
#
# Loads the Groceries dataset from the arules package and writes it to
# groceries.txt using "," as the separator.
library(arules)
data("Groceries")
write(Groceries, "groceries.txt", sep = ",")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
One possible way to concatenate files quickly.
Inspired by Randy Zwitch's article A Million Text Files And A Single Laptop.
Runtime on (late 2013) mbpro15: 352k files concatenated in 2m48sec.