mirror of
https://github.com/cubixle/csv-vs-parquet.git
synced 2026-04-24 14:44:41 +01:00
init
This commit is contained in:
18
README.md
Normal file
18
README.md
Normal file
@@ -0,0 +1,18 @@
|
|||||||
|
# CSV vs Parquet
|
||||||
|
|
||||||
|
This is a simple Go project to test the speed differences between writing to CSV and writing to Parquet.
|
||||||
|
|
||||||
|
The dataset will be N randomly generated entries with 3 columns.
|
||||||
|
|
||||||
|
## Benchmarks
|
||||||
|
|
||||||
|
Using the Hyperfine benchmarking tool, I ran the built binary once in CSV mode and once in Parquet mode. As you can see from the results below, writing 100,000 rows to Parquet is much faster than writing them to CSV.
|
||||||
|
|
||||||
|
$ ./tests.sh
|
||||||
|
Benchmark 1: ./app -type csv -amount 100000
|
||||||
|
Time (mean ± σ): 244.2 ms ± 8.7 ms [User: 22.7 ms, System: 197.6 ms]
|
||||||
|
Range (min … max): 234.4 ms … 261.9 ms 11 runs
|
||||||
|
|
||||||
|
Benchmark 1: ./app -type parquet -amount 100000
|
||||||
|
Time (mean ± σ): 38.3 ms ± 3.8 ms [User: 62.0 ms, System: 9.9 ms]
|
||||||
|
Range (min … max): 26.4 ms … 44.4 ms 69 runs
|
||||||
17
go.mod
Normal file
17
go.mod
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
module github.com/cubixle/csv-vs-parquet
|
||||||
|
|
||||||
|
go 1.22.2
|
||||||
|
|
||||||
|
require (
|
||||||
|
github.com/xitongsys/parquet-go v1.6.2
|
||||||
|
github.com/xitongsys/parquet-go-source v0.0.0-20240122235623-d6294584ab18
|
||||||
|
)
|
||||||
|
|
||||||
|
require (
|
||||||
|
github.com/apache/arrow/go/arrow v0.0.0-20200730104253-651201b0f516 // indirect
|
||||||
|
github.com/apache/thrift v0.14.2 // indirect
|
||||||
|
github.com/golang/snappy v0.0.3 // indirect
|
||||||
|
github.com/klauspost/compress v1.15.9 // indirect
|
||||||
|
github.com/pierrec/lz4/v4 v4.1.8 // indirect
|
||||||
|
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 // indirect
|
||||||
|
)
|
||||||
96
main.go
Normal file
96
main.go
Normal file
@@ -0,0 +1,96 @@
|
|||||||
|
package main
|
||||||
|
|
||||||
|
import (
|
||||||
|
"flag"
|
||||||
|
"fmt"
|
||||||
|
"log"
|
||||||
|
"log/slog"
|
||||||
|
"os"
|
||||||
|
|
||||||
|
"github.com/xitongsys/parquet-go-source/local"
|
||||||
|
"github.com/xitongsys/parquet-go/parquet"
|
||||||
|
"github.com/xitongsys/parquet-go/writer"
|
||||||
|
)
|
||||||
|
|
||||||
|
func main() {
|
||||||
|
t := flag.String("type", "csv", "")
|
||||||
|
rows := flag.Int("amount", 1000, "")
|
||||||
|
|
||||||
|
flag.Parse()
|
||||||
|
|
||||||
|
slog.Info("Running", "type", *t, "rows", *rows)
|
||||||
|
|
||||||
|
switch *t {
|
||||||
|
case "csv":
|
||||||
|
err := writeCSV(*rows)
|
||||||
|
if err != nil {
|
||||||
|
log.Fatal(err)
|
||||||
|
}
|
||||||
|
case "parquet":
|
||||||
|
err := writeParquet(*rows)
|
||||||
|
if err != nil {
|
||||||
|
log.Fatal(err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// writeCSV creates (or truncates) output.csv in the current directory and
// writes rows records to it, one per line, in the form
// "<index>,ice hockey,pizza" with the index running from 0 to rows-1.
//
// It returns an error if the file cannot be created, written, or closed.
func writeCSV(rows int) (err error) {
	const (
		filename = "output.csv"
		// flushSize is the number of pending bytes that triggers a write to
		// the file, so the loop does not issue one write call per row.
		flushSize = 64 * 1024
	)

	f, err := os.Create(filename)
	if err != nil {
		return fmt.Errorf("creating %s: %w", filename, err)
	}
	// Close on every return path, and surface a Close failure when nothing
	// else went wrong.
	defer func() {
		if cerr := f.Close(); cerr != nil && err == nil {
			err = fmt.Errorf("closing %s: %w", filename, cerr)
		}
	}()

	// Slack beyond flushSize leaves room for the record that crosses the
	// threshold without growing the slice.
	buf := make([]byte, 0, flushSize+64)
	for i := 0; i < rows; i++ {
		// The trailing newline terminates the record; without it every row
		// would be concatenated onto a single line.
		buf = fmt.Appendf(buf, "%d,ice hockey,pizza\n", i)
		if len(buf) < flushSize {
			continue
		}
		if _, err = f.Write(buf); err != nil {
			return fmt.Errorf("writing %s: %w", filename, err)
		}
		buf = buf[:0]
	}

	// Write whatever is left over from the final partial batch.
	if len(buf) > 0 {
		if _, err = f.Write(buf); err != nil {
			return fmt.Errorf("writing %s: %w", filename, err)
		}
	}

	return nil
}
|
||||||
|
|
||||||
|
// Row is the record type handed to the Parquet writer in writeParquet. The
// struct tags describe the column each field is written to: its name,
// physical type, and encoding.
type Row struct {
|
||||||
|
// NOTE(review): the Go type is int while the column is tagged INT32 —
// confirm how the library handles values outside the 32-bit range.
ID int `parquet:"name=id, type=INT32, encoding=PLAIN"`
|
||||||
|
Sport string `parquet:"name=sport, type=BYTE_ARRAY, convertedtype=UTF8, encoding=PLAIN_DICTIONARY"`
|
||||||
|
Food string `parquet:"name=food, type=BYTE_ARRAY, convertedtype=UTF8, encoding=PLAIN_DICTIONARY"`
|
||||||
|
}
|
||||||
|
|
||||||
|
func writeParquet(rows int) error {
|
||||||
|
const filename = "output.parquet"
|
||||||
|
|
||||||
|
fw, err := local.NewLocalFileWriter(filename)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("failed to NewLocalFileWriter %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// write
|
||||||
|
pw, err := writer.NewParquetWriter(fw, new(Row), 4)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("failed to NewParquertWriter %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
pw.RowGroupSize = 128 * 1024 * 1024 // 128M
|
||||||
|
pw.CompressionType = parquet.CompressionCodec_SNAPPY
|
||||||
|
|
||||||
|
for i := 0; i < rows; i++ {
|
||||||
|
r := Row{
|
||||||
|
ID: i,
|
||||||
|
Food: "pizza",
|
||||||
|
Sport: "ice hockey",
|
||||||
|
}
|
||||||
|
if err = pw.Write(r); err != nil {
|
||||||
|
return fmt.Errorf("failed to Write %w", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if err = pw.WriteStop(); err != nil {
|
||||||
|
return fmt.Errorf("failed to WriteStop %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
fw.Close()
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
1
output.csv
Normal file
1
output.csv
Normal file
File diff suppressed because one or more lines are too long
BIN
output.parquet
Normal file
BIN
output.parquet
Normal file
Binary file not shown.
Reference in New Issue
Block a user