Skip to content

Instantly share code, notes, and snippets.

@zacharysyoung
Last active August 9, 2023 17:55
Show Gist options
  • Save zacharysyoung/da51b614acc7f8e97b249d1f47302900 to your computer and use it in GitHub Desktop.
Save zacharysyoung/da51b614acc7f8e97b249d1f47302900 to your computer and use it in GitHub Desktop.
Generate CSVs
#!/usr/bin/env python3
import csv
import random
# Test-data generator used to characterize the answer for
# https://stackoverflow.com/questions/75578992
#
# Writes input.csv with a header row followed by TOTAL_ROWS rows of
# (RowNum, ID), where RowNum counts up from 1 and ID is a random integer
# in [0, TOTAL_ROWS].
TOTAL_ROWS = 20_000_000
PROGRESS_STEP = 100_000

with open("input.csv", "w", newline="") as out_file:
    writer = csv.writer(out_file)
    writer.writerow(["RowNum", "ID"])
    for row_num in range(1, TOTAL_ROWS + 1):
        # Report progress once per PROGRESS_STEP rows, just before the row is written.
        if (row_num - 1) % PROGRESS_STEP == 0:
            print(f"to row {row_num}")
        writer.writerow([row_num, random.randint(0, TOTAL_ROWS)])
package main
import (
"encoding/csv"
"fmt"
"os"
"strconv"
"strings"
// random field data
"math/rand"
"time"
)
// Command-line arguments. They are float64 because init fills them with
// strconv.ParseFloat; main converts them to int.
var (
	rowsArg float64 // ROWS argument
	colsArg float64 // COLS argument
)
// init parses the ROWS and COLS command-line arguments into rowsArg and
// colsArg before main runs.
//
// If the first argument is -h it prints the usage line and exits with status
// 0. If fewer than two arguments are given it prints the usage line and exits
// with status 1. Exiting here (rather than returning) keeps main from running
// with zero-valued arguments, and checking for both arguments up front avoids
// indexing past the end of os.Args when only ROWS is supplied.
//
// It panics if either argument is not a valid floating-point literal.
func init() {
	const usage = "genCSV [ROWS] [COLS]"

	if len(os.Args) > 1 && os.Args[1] == "-h" {
		fmt.Println(usage)
		os.Exit(0)
	}
	if len(os.Args) < 3 {
		fmt.Println(usage)
		os.Exit(1)
	}

	var err error
	// Parsing as float so that both of these syntaxes work: 1_000_000, 10e6.
	// bitSize is 64 because the destinations are float64.
	rowsArg, err = strconv.ParseFloat(os.Args[1], 64)
	if err != nil {
		panic(err)
	}
	colsArg, err = strconv.ParseFloat(os.Args[2], 64)
	if err != nil {
		panic(err)
	}
}
// main writes gen_<rows>x<cols>.csv to the current directory, where rows is
// rowsArg truncated to int and cols is colsArg truncated to int plus one for
// the leading ID column.
//
// The file has a header row (ID, Col1, Col2, ...) followed by rows data rows.
// Each data row starts with its zero-based index; every other cell is a single
// random lower-case letter repeated 5, 6, 7 or 8 times.
//
// Any failure to create, write, flush or close the file is reported on
// standard error and the process exits with status 1.
func main() {
	// A locally seeded generator stands in for the deprecated global rand.Seed.
	rng := rand.New(rand.NewSource(time.Now().UnixNano()))

	fail := func(err error) {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}

	rows := int(rowsArg)
	cols := int(colsArg) + 1 // +1 for the ID column
	if rows < 0 || cols < 1 {
		// A negative cols would make the make() call below panic.
		fail(fmt.Errorf("ROWS and COLS must not be negative; got %v and %v", rowsArg, colsArg))
	}

	lens := [...]int{5, 6, 7, 8} // possible lengths of a cell
	row := make([]string, cols)  // reused for the header and every data row

	fOut, err := os.Create(fmt.Sprintf("gen_%dx%d.csv", rows, cols))
	if err != nil {
		fail(err)
	}
	w := csv.NewWriter(fOut)

	// Create and write header.
	row[0] = "ID"
	for i := 1; i < cols; i++ {
		row[i] = fmt.Sprintf("Col%d", i)
	}
	if err := w.Write(row); err != nil {
		fail(err)
	}

	for i := 0; i < rows; i++ {
		row[0] = strconv.Itoa(i)
		for j := 1; j < cols; j++ {
			letter := string(rune('a' + rng.Intn(26))) // pick a lower-case letter at random
			n := lens[rng.Intn(len(lens))]             // pick length from possible lens
			row[j] = strings.Repeat(letter, n)
		}
		if err := w.Write(row); err != nil {
			fail(err)
		}
	}

	// csv.Writer buffers its output, so write errors may only surface here.
	w.Flush()
	if err := w.Error(); err != nil {
		fail(err)
	}
	// Close explicitly (not deferred) so a close error is not lost and so
	// os.Exit in fail never skips a pending defer on the success path.
	if err := fOut.Close(); err != nil {
		fail(err)
	}
}
package main
import (
"encoding/csv"
"fmt"
"math/rand"
"os"
"strconv"
"strings"
)
// sUsage is the one-line synopsis of the command line.
const sUsage = "usage: gen [-h] COL_TYPES NROWS"

// sHelp is the longer description of what the program generates and how
// COL_TYPES is written.
const sHelp = `generate a CSV with NROWS number of rows following the layout of COL_TYPES, a
comma-separated list of types (e.g., 'float64,int,bool,string'). Each field of
a row will be filled with random values of that column's type.`
// Type identifies the kind of value a column holds.
type Type int

// The supported column kinds, numbered from zero in declaration order.
const (
	tF64    = Type(iota) // float64 column
	tInt                 // int column
	tBool                // bool column
	tString              // string column
)
// usage writes the one-line synopsis to standard error and terminates the
// process with exit status 1.
func usage() {
	fmt.Fprintf(os.Stderr, "%s\n", sUsage)
	os.Exit(1)
}
// help writes the synopsis, a blank line, the long description and another
// blank line to standard error, then terminates the process with exit
// status 1.
func help() {
	fmt.Fprintf(os.Stderr, "%s\n\n%s\n\n", sUsage, sHelp)
	os.Exit(1)
}
// main parses "COL_TYPES NROWS" from the command line and writes a CSV to
// standard output: a header row Col0, Col1, ... followed by NROWS rows whose
// fields are produced by genValue for each column's type.
//
// With no arguments, or with only COL_TYPES, it prints the usage line and
// exits; with -h as the first argument it prints the help text and exits.
// Invalid COL_TYPES or NROWS, and any error writing the CSV, are reported on
// standard error with a non-zero exit status.
func main() {
	args := os.Args
	// usage and help both exit, so at most one case ever runs to completion.
	switch {
	case len(args) < 2:
		usage()
	case args[1] == "-h":
		help()
	case len(args) < 3:
		usage()
	}

	types, err := readHeader(args[1])
	if err != nil {
		fmt.Fprintf(os.Stderr, "error: could not read header %q: %v\n", args[1], err)
		usage()
	}

	nrows, err := strconv.Atoi(args[2])
	if err != nil {
		fmt.Fprintf(os.Stderr, "error: could not read number of rows %q: %v\n", args[2], err)
		usage()
	}
	if nrows < 0 {
		fmt.Fprintf(os.Stderr, "error: number of rows %q must not be negative\n", args[2])
		usage()
	}

	w := csv.NewWriter(os.Stdout)
	// write emits one record and stops the program on the first write error
	// instead of continuing to generate rows nobody will receive.
	write := func(rec []string) {
		if err := w.Write(rec); err != nil {
			fmt.Fprintf(os.Stderr, "error: could not write CSV: %v\n", err)
			os.Exit(1)
		}
	}

	// header
	row := make([]string, len(types))
	for i := range row {
		row[i] = fmt.Sprintf("Col%d", i)
	}
	write(row)

	for i := 0; i < nrows; i++ {
		for j, t := range types {
			row[j] = genValue(t)
		}
		write(row)
	}

	// csv.Writer buffers its output, so write errors may only surface here.
	w.Flush()
	if err := w.Error(); err != nil {
		fmt.Fprintf(os.Stderr, "error: could not write CSV: %v\n", err)
		os.Exit(1)
	}
}
// readHeader parses s, a single CSV record of type names, and returns the
// Type of each column in order.
//
// The valid names are float64, int, bool and string; white space around a
// name is ignored, so "int, bool" is accepted. On any error (s is not a
// readable CSV record, or a name is not one of the valid types) the returned
// slice is nil.
func readHeader(s string) ([]Type, error) {
	r := csv.NewReader(strings.NewReader(s))
	colTypes, err := r.Read()
	if err != nil {
		return nil, err
	}

	types := make([]Type, 0, len(colTypes))
	for _, name := range colTypes {
		switch strings.TrimSpace(name) {
		case "float64":
			types = append(types, tF64)
		case "int":
			types = append(types, tInt)
		case "bool":
			types = append(types, tBool)
		case "string":
			types = append(types, tString)
		default:
			// %q makes empty or white-space-only names visible in the message.
			return nil, fmt.Errorf("%q does not match valid types: float64, int, bool, string", name)
		}
	}
	return types, nil
}
// genValue returns a random field value for a column of type t:
//
//   - tF64:    a decimal in [0, 100], always exactly five bytes (e.g. "7.031",
//     "42.19", or "100.0" when a value rounds up)
//   - tInt:    a five-digit integer in [10000, 99999]
//   - tBool:   "true" or "false"
//   - tString: five random upper-case ASCII letters
//
// Any other Type yields the empty string.
func genValue(t Type) string {
	switch t {
	case tF64:
		// 'f' with three decimals always produces at least five bytes
		// ("0.000"), so the slice below cannot go out of range. The 'g'
		// format removes trailing zeros (12.3000 becomes "12.3") and
		// switches to exponent notation for tiny values, which made a
		// fixed [:5] slice either panic or cut off the exponent.
		return strconv.FormatFloat(rand.Float64()*100, 'f', 3, 64)[:5]
	case tInt:
		// Draw directly from the five-digit range instead of slicing the
		// decimal form of rand.Int(), which is too short for values
		// below 10000.
		return strconv.Itoa(10000 + rand.Intn(90000))
	case tBool:
		return strconv.FormatBool(rand.Intn(2) == 1)
	case tString:
		var buf [5]byte
		for i := range buf {
			buf[i] = byte('A' + rand.Intn(26))
		}
		return string(buf[:])
	}
	return ""
}
package main
import (
"encoding/csv"
"flag"
"fmt"
"math"
"os"
"strconv"
)
// desc is the program description, including a sample of the output layout
// for a 10x5 invocation. The flag.Usage function installed in init prints it
// between the usage line and the flag list.
const desc = `Print a ROWSxCOLS CSV of uniformly named and padded fields, e.g., ./prog 10 5:
r01c1,r01c2,...,...,r01c5
r02c1,r02c2,...,...,r02c5
.....,.....,...,...,.....
r10c1,r10c2,...,...,r10c5
`
// Positional command-line arguments: the number of data rows and the number
// of columns to print.
var rows, cols int

// Whole-file padding values: the zero-padded widths used when formatting row
// and column numbers.
var padRows, padCols int
// init installs the usage message, parses the command line, and fills rows,
// cols, padRows and padCols before main runs.
//
// Exactly two positional arguments, ROWS and COLS, are required. Each must be
// a non-negative whole number; float syntax such as 1e3 is accepted as long as
// the value has no fractional part. On any violation the usage message is
// printed and the process exits with status 1.
func init() {
	flag.Usage = func() {
		usage := fmt.Sprintf("Usage of %s: ROWS COLS [Flags]\n", os.Args[0])
		fmt.Fprintf(flag.CommandLine.Output(), "%s\n%s\nFlags:\n\n", usage, desc)
		flag.PrintDefaults()
		os.Exit(1)
	}

	flag.Parse()
	if flag.NArg() != 2 {
		flag.Usage()
	}

	parseArg := func(arg, name string) int {
		// bitSize 64: with 32 the value is rounded to float32 precision, which
		// silently alters whole numbers above 2^24 (16777217 becomes 16777216).
		num, err := strconv.ParseFloat(arg, 64)
		// NaN fails the Trunc comparison; infinities and negatives are
		// rejected explicitly because they cannot be used as a count.
		if err != nil || num != math.Trunc(num) || math.IsInf(num, 0) || num < 0 {
			fmt.Fprintf(
				flag.CommandLine.Output(),
				"expected %s to be a whole number; got %q\n\n", name, arg)
			flag.Usage()
		}
		return int(num)
	}

	rows = parseArg(flag.Arg(0), "ROWS")
	cols = parseArg(flag.Arg(1), "COLS")

	// Width of the largest row and column number in decimal, for padding
	// later. Counting digits avoids depending on floating-point rounding in
	// a logarithm.
	padRows = len(strconv.Itoa(rows))
	padCols = len(strconv.Itoa(cols))
}
// main writes the CSV to standard output: a header row of zero-padded column
// names (Col1, Col2, ...) followed by rows data rows whose fields are named
// r<row>c<col>, with row and column numbers zero-padded to padRows and padCols
// digits.
//
// Any error writing the CSV is reported on standard error and the process
// exits with status 1.
func main() {
	w := csv.NewWriter(os.Stdout)
	row := make([]string, cols) // reused for the header and every data row

	// emit writes the current contents of row and stops the program on the
	// first write error instead of continuing to generate unread output.
	emit := func() {
		if err := w.Write(row); err != nil {
			fmt.Fprintln(os.Stderr, err)
			os.Exit(1)
		}
	}

	for j := range row {
		row[j] = fmt.Sprintf("Col%0*d", padCols, j+1)
	}
	emit()

	for i := 1; i <= rows; i++ {
		for j := range row {
			row[j] = fmt.Sprintf("r%0*dc%0*d", padRows, i, padCols, j+1)
		}
		emit()
	}

	// csv.Writer buffers its output, so write errors may only surface here.
	w.Flush()
	if err := w.Error(); err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment