Created
April 13, 2023 00:26
-
-
Save c4pt0r/74c708ec7831ac349822e528a4a6efa9 to your computer and use it in GitHub Desktop.
A small tool to split a large CSV file into several smaller CSV files
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"bufio" | |
"bytes" | |
"compress/gzip" | |
"encoding/base64" | |
"encoding/csv" | |
"flag" | |
"fmt" | |
"io" | |
"os" | |
"path" | |
"github.com/c4pt0r/log" | |
) | |
// Command-line flags. The returned pointers hold the flag defaults until
// flag.Parse has been called.
var (
	// hasHeader (-has-header) reports whether the first row of the input is a
	// header row.
	hasHeader = flag.Bool("has-header", true, "CSV file has header row")

	// csvFile (-i) is the path of the CSV file to split; it defaults to "".
	csvFile = flag.String("i", "", "CSV file to split")

	// outDir (-o) is the directory that receives the partial CSV files.
	outDir = flag.String("o", "./output", "Output directory")

	// sizeLimit (-s) is the size limit, in bytes, of each partial CSV file.
	sizeLimit = flag.Int64("s", 10000000, "Size limit of each partial CSV file in bytes, default: 10000000 (10MB)")
)
// mustCreateDir makes sure dir exists as a directory, creating it and any
// missing parents with mode 0755. Every failure is reported through
// log.Fatalf.
func mustCreateDir(dir string) {
	// os.MkdirAll already returns nil when dir is an existing directory, so
	// any non-nil error, including an "already exists" error for a path that
	// is not a directory, means dir cannot be used and must be reported
	// rather than ignored.
	if err := os.MkdirAll(dir, 0755); err != nil {
		log.Fatalf("Failed to create directory: %s, error: %v", dir, err)
	}
}
// splitCSVFile splits the specified CSV file into several smaller CSV files with a size limit specified in bytes | |
func splitCSVFile(originalCSVfile string, sizeLimit int64, outDir string, withHeader bool) ([]string, error) { | |
// Open the original CSV file | |
f, err := os.Open(originalCSVfile) | |
if err != nil { | |
return nil, err | |
} | |
defer f.Close() | |
// Create a CSV reader | |
r := csv.NewReader(bufio.NewReader(f)) | |
// Read the header row of the CSV file | |
var header []string | |
if withHeader { | |
header, err = r.Read() | |
if err != nil { | |
return nil, err | |
} | |
} | |
// Create a CSV writer | |
var ( | |
createdFiles []string | |
currentCSVfile *os.File | |
currentCSVwriter *csv.Writer | |
currentCSVfileIndex int | |
currentCSVfileSize int64 | |
rowCount int64 | |
) | |
calcRecordSize := func(row []string) int64 { | |
size := 0 | |
for _, field := range row { | |
size += len(field) | |
} | |
return int64(size) | |
} | |
openNewCSVfile := func() (string, error) { | |
// Close the current CSV file | |
if currentCSVwriter != nil { | |
currentCSVwriter.Flush() | |
currentCSVfile.Close() | |
} | |
// Create a new CSV file | |
baseName := path.Base(originalCSVfile) | |
filename := path.Join(outDir, fmt.Sprintf("%s_%d.csv", baseName, currentCSVfileIndex)) | |
currentCSVfile, err = os.Create(filename) | |
if err != nil { | |
return "", err | |
} | |
// Create a new CSV writer | |
currentCSVwriter = csv.NewWriter(currentCSVfile) | |
if withHeader { | |
// Write the header row of the CSV file | |
err = currentCSVwriter.Write(header) | |
if err != nil { | |
return "", err | |
} | |
} | |
// Update the CSV file index | |
currentCSVfileIndex++ | |
createdFiles = append(createdFiles, filename) | |
return filename, nil | |
} | |
// for the first one | |
_, err = openNewCSVfile() | |
if err != nil { | |
return nil, err | |
} | |
// Split the CSV file | |
for { | |
// Read a record from the CSV file | |
record, err := r.Read() | |
if err == io.EOF { | |
break | |
} else if err != nil { | |
return nil, err | |
} | |
// Open a new CSV file if the current CSV file size exceeds the limit | |
recordSize := calcRecordSize(record) // Calculate the size of the CSV record in bytes | |
if currentCSVfileSize+recordSize > sizeLimit { | |
fn, err := openNewCSVfile() | |
if err != nil { | |
return nil, err | |
} | |
currentCSVfileSize = 0 | |
log.Infof("Created new partial CSV file: %s, started row:%d", fn, rowCount) | |
} | |
// Write the CSV record to the current CSV file | |
err = currentCSVwriter.Write(record) | |
if err != nil { | |
return nil, err | |
} | |
// Update the size of the current CSV file and row count | |
currentCSVfileSize += recordSize | |
rowCount++ | |
} | |
// Close the last CSV file | |
if currentCSVwriter != nil { | |
currentCSVwriter.Flush() | |
currentCSVfile.Close() | |
} | |
return createdFiles, nil | |
} | |
func main() { | |
flag.Parse() | |
// Create the output directory | |
mustCreateDir(*outDir) | |
// Split the CSV file | |
if len(*csvFile) == 0 { | |
log.Fatal("Please specify a CSV file") | |
} | |
files, err := splitCSVFile(*csvFile, *sizeLimit, *outDir, *hasHeader) | |
if err != nil { | |
log.Fatalf("Failed to split CSV file: %s, error: %v", *csvFile, err) | |
} | |
for _, f := range files { | |
fmt.Println(f) | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment