Documentation
¶
Overview ¶
Package csv reads CSV files and presents the extracted data as records; it also writes records to CSV files.
Example ¶
package main
import (
"bytes"
"fmt"
"log"
"github.com/apache/arrow-go/v18/arrow"
"github.com/apache/arrow-go/v18/arrow/csv"
)
// main parses a small semicolon-separated CSV document held in memory and
// prints every column of every record batch produced by the reader.
func main() {
	// In-memory CSV input: ';' separates the fields and '#' is the comment
	// character, so the first line is not data.
	input := bytes.NewBufferString(`## a simple set of data: int64;float64;string
0;0;str-0
1;1;str-1
2;2;str-2
3;3;str-3
4;4;str-4
5;5;str-5
6;6;str-6
7;7;str-7
8;8;str-8
9;9;str-9
`)

	// One schema field per CSV column, in column order.
	fields := []arrow.Field{
		{Name: "i64", Type: arrow.PrimitiveTypes.Int64},
		{Name: "f64", Type: arrow.PrimitiveTypes.Float64},
		{Name: "str", Type: arrow.BinaryTypes.String},
	}
	schema := arrow.NewSchema(fields, nil)

	reader := csv.NewReader(input, schema, csv.WithComment('#'), csv.WithComma(';'))
	defer reader.Release()

	// Walk the record batches, numbering them from zero.
	for batchIdx := 0; reader.Next(); batchIdx++ {
		batch := reader.RecordBatch()
		for colIdx, col := range batch.Columns() {
			fmt.Printf("rec[%d][%q]: %v\n", batchIdx, batch.ColumnName(colIdx), col)
		}
	}

	// A non-nil error here indicates a problem converting CSV values to the
	// arrow schema types.
	if err := reader.Err(); err != nil {
		log.Fatal(err)
	}
}
Output: rec[0]["i64"]: [0] rec[0]["f64"]: [0] rec[0]["str"]: ["str-0"] rec[1]["i64"]: [1] rec[1]["f64"]: [1] rec[1]["str"]: ["str-1"] rec[2]["i64"]: [2] rec[2]["f64"]: [2] rec[2]["str"]: ["str-2"] rec[3]["i64"]: [3] rec[3]["f64"]: [3] rec[3]["str"]: ["str-3"] rec[4]["i64"]: [4] rec[4]["f64"]: [4] rec[4]["str"]: ["str-4"] rec[5]["i64"]: [5] rec[5]["f64"]: [5] rec[5]["str"]: ["str-5"] rec[6]["i64"]: [6] rec[6]["f64"]: [6] rec[6]["str"]: ["str-6"] rec[7]["i64"]: [7] rec[7]["f64"]: [7] rec[7]["str"]: ["str-7"] rec[8]["i64"]: [8] rec[8]["f64"]: [8] rec[8]["str"]: ["str-8"] rec[9]["i64"]: [9] rec[9]["f64"]: [9] rec[9]["str"]: ["str-9"]
Example (Reader) ¶
package main
import (
"fmt"
"os"
"github.com/apache/arrow-go/v18/arrow"
"github.com/apache/arrow-go/v18/arrow/array"
arrowcsv "github.com/apache/arrow-go/v18/arrow/csv"
)
// main loads a CSV test file into a single record batch and prints its
// dimensions followed by the average of each numeric column.
func main() {
	const csvPath = "../../arrow-testing/data/csv/aggregate_test_100.csv" // Test csv file

	file, openErr := os.Open(csvPath)
	if openErr != nil {
		fmt.Printf("Failed to open file: %v\n", openErr)
		return
	}
	defer file.Close()

	// Schema defined in the csv file: c1 and c13 are strings, c2..c10 are
	// Int64 and c11..c12 are Float64.
	columns := []arrow.Field{
		{Name: "c1", Type: arrow.BinaryTypes.String, Nullable: true},
		{Name: "c2", Type: arrow.PrimitiveTypes.Int64, Nullable: true},
		{Name: "c3", Type: arrow.PrimitiveTypes.Int64, Nullable: true},
		{Name: "c4", Type: arrow.PrimitiveTypes.Int64, Nullable: true},
		{Name: "c5", Type: arrow.PrimitiveTypes.Int64, Nullable: true},
		{Name: "c6", Type: arrow.PrimitiveTypes.Int64, Nullable: true},
		{Name: "c7", Type: arrow.PrimitiveTypes.Int64, Nullable: true},
		{Name: "c8", Type: arrow.PrimitiveTypes.Int64, Nullable: true},
		{Name: "c9", Type: arrow.PrimitiveTypes.Int64, Nullable: true},
		{Name: "c10", Type: arrow.PrimitiveTypes.Int64, Nullable: true},
		{Name: "c11", Type: arrow.PrimitiveTypes.Float64, Nullable: true},
		{Name: "c12", Type: arrow.PrimitiveTypes.Float64, Nullable: true},
		{Name: "c13", Type: arrow.BinaryTypes.String, Nullable: true},
	}
	schema := arrow.NewSchema(columns, nil)

	// A negative chunk size makes the reader load the whole file into one
	// record; the header row is handled via WithHeader(true).
	rdr := arrowcsv.NewReader(file, schema, arrowcsv.WithHeader(true), arrowcsv.WithChunk(-1))
	defer rdr.Release()

	// Advance to the first record; tell a read failure apart from an
	// input that simply produced no record.
	if !rdr.Next() {
		switch readErr := rdr.Err(); {
		case readErr != nil:
			fmt.Printf("Error reading CSV: %v\n", readErr)
		default:
			fmt.Println("No records found")
		}
		return
	}

	// The record is not released here - the reader will handle that.
	batch := rdr.RecordBatch()
	fmt.Printf("Number of rows: %d\n", batch.NumRows())
	fmt.Printf("Number of columns: %d\n", batch.NumCols())
	fmt.Println()
	fmt.Println("Basic statistics for numeric columns:")

	// Column indices 1 through 9 hold c2 through c10 (Int64).
	for idx := 1; idx <= 9; idx++ {
		ints := batch.Column(idx).(*array.Int64)
		rows := ints.Len()
		var total int64
		for row := 0; row < rows; row++ {
			total += ints.Value(row)
		}
		fmt.Printf("Column c%d: Average = %.2f\n", idx+1, float64(total)/float64(rows))
	}

	// Column indices 10 and 11 hold c11 and c12 (Float64).
	for idx := 10; idx <= 11; idx++ {
		floats := batch.Column(idx).(*array.Float64)
		rows := floats.Len()
		var total float64
		for row := 0; row < rows; row++ {
			total += floats.Value(row)
		}
		fmt.Printf("Column c%d: Average = %.4f\n", idx+1, total/float64(rows))
	}
}
Output: Number of rows: 100 Number of columns: 13 Basic statistics for numeric columns: Column c2: Average = 2.85 Column c3: Average = 7.81 Column c4: Average = 2319.97 Column c5: Average = 158626279.61 Column c6: Average = 59276376114661656.00 Column c7: Average = 130.60 Column c8: Average = 30176.41 Column c9: Average = 2220897700.60 Column c10: Average = -86834033398685392.00 Column c11: Average = 0.4793 Column c12: Average = 0.5090
Example (WithChunk) ¶
package main
import (
"bytes"
"fmt"
"github.com/apache/arrow-go/v18/arrow"
"github.com/apache/arrow-go/v18/arrow/csv"
)
// main parses a small semicolon-separated CSV document in chunks of three
// rows and prints every column of every record batch produced by the reader.
func main() {
	// In-memory CSV input: ';' separates the fields and '#' is the comment
	// character, so the first line is not data.
	f := bytes.NewBufferString(`## a simple set of data: int64;float64;string
0;0;str-0
1;1;str-1
2;2;str-2
3;3;str-3
4;4;str-4
5;5;str-5
6;6;str-6
7;7;str-7
8;8;str-8
9;9;str-9
`)

	schema := arrow.NewSchema(
		[]arrow.Field{
			{Name: "i64", Type: arrow.PrimitiveTypes.Int64},
			{Name: "f64", Type: arrow.PrimitiveTypes.Float64},
			{Name: "str", Type: arrow.BinaryTypes.String},
		},
		nil,
	)

	// WithChunk(3) groups the rows into record batches of three rows; the
	// final batch holds whatever rows remain.
	r := csv.NewReader(
		f, schema,
		csv.WithComment('#'), csv.WithComma(';'),
		csv.WithChunk(3),
	)
	defer r.Release()

	n := 0
	for r.Next() {
		rec := r.RecordBatch()
		for i, col := range rec.Columns() {
			fmt.Printf("rec[%d][%q]: %v\n", n, rec.ColumnName(i), col)
		}
		n++
	}

	// check for reader errors indicating issues converting csv values
	// to the arrow schema types; without this check a failure would only
	// show up as silently missing batches
	if err := r.Err(); err != nil {
		fmt.Printf("Error reading CSV: %v\n", err)
	}
}
Output: rec[0]["i64"]: [0 1 2] rec[0]["f64"]: [0 1 2] rec[0]["str"]: ["str-0" "str-1" "str-2"] rec[1]["i64"]: [3 4 5] rec[1]["f64"]: [3 4 5] rec[1]["str"]: ["str-3" "str-4" "str-5"] rec[2]["i64"]: [6 7 8] rec[2]["f64"]: [6 7 8] rec[2]["str"]: ["str-6" "str-7" "str-8"] rec[3]["i64"]: [9] rec[3]["f64"]: [9] rec[3]["str"]: ["str-9"]
Example (Writer) ¶
package main
import (
"bytes"
"fmt"
"log"
"github.com/apache/arrow-go/v18/arrow"
"github.com/apache/arrow-go/v18/arrow/array"
"github.com/apache/arrow-go/v18/arrow/csv"
"github.com/apache/arrow-go/v18/arrow/memory"
)
func main() {
f := new(bytes.Buffer)
pool := memory.NewGoAllocator()
schema := arrow.NewSchema(
[]arrow.Field{
{Name: "i64", Type: arrow.PrimitiveTypes.Int64},
{Name: "f64", Type: arrow.PrimitiveTypes.Float64},
{Name: "str", Type: arrow.BinaryTypes.String},
},
nil,
)
b := array.NewRecordBuilder(pool, schema)
defer b.Release()
b.Field(0).(*array.Int64Builder).AppendValues([]int64{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, nil)
b.Field(1).(*array.Float64Builder).AppendValues([]float64{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, nil)
b.Field(2).(*array.StringBuilder).AppendValues([]string{"str-0", "str-1", "str-2", "str-3", "str-4", "str-5", "str-6", "str-7", "str-8", "str-9"}, nil)
rec := b.NewRecordBatch()
defer rec.Release()
w := csv.NewWriter(f, schema, csv.WithComma(';'))
err := w.Write(rec)
if err != nil {
log.Fatal(err)
}
err = w.Flush()
if err != nil {
log.Fatal(err)
}
err = w.Error()
if err != nil {
log.Fatal(err)
}
r := csv.NewReader(f, schema, csv.WithComment('#'), csv.WithComma(';'))
defer r.Release()
n := 0
for r.Next() {
rec := r.RecordBatch()
for i, col := range rec.Columns() {
fmt.Printf("rec[%d][%q]: %v\n", n, rec.ColumnName(i), col)
}
n++
}
// check for reader errors indicating issues converting csv values
// to the arrow schema types
err = r.Err()
if err != nil {
log.Fatal(err)
}
}
Output: rec[0]["i64"]: [0] rec[0]["f64"]: [0] rec[0]["str"]: ["str-0"] rec[1]["i64"]: [1] rec[1]["f64"]: [1] rec[1]["str"]: ["str-1"] rec[2]["i64"]: [2] rec[2]["f64"]: [2] rec[2]["str"]: ["str-2"] rec[3]["i64"]: [3] rec[3]["f64"]: [3] rec[3]["str"]: ["str-3"] rec[4]["i64"]: [4] rec[4]["f64"]: [4] rec[4]["str"]: ["str-4"] rec[5]["i64"]: [5] rec[5]["f64"]: [5] rec[5]["str"]: ["str-5"] rec[6]["i64"]: [6] rec[6]["f64"]: [6] rec[6]["str"]: ["str-6"] rec[7]["i64"]: [7] rec[7]["f64"]: [7] rec[7]["str"]: ["str-7"] rec[8]["i64"]: [8] rec[8]["f64"]: [8] rec[8]["str"]: ["str-8"] rec[9]["i64"]: [9] rec[9]["f64"]: [9] rec[9]["str"]: ["str-9"]
Index ¶
- Variables
- type Option
- func WithAllocator(mem memory.Allocator) Option
- func WithBoolWriter(fmtr func(bool) string) Option
- func WithCRLF(useCRLF bool) Option
- func WithChunk(n int) Option
- func WithColumnTypes(types map[string]arrow.DataType) Option
- func WithComma(c rune) Option
- func WithComment(c rune) Option
- func WithCustomTypeConverter(...) Option
- func WithHeader(useHeader bool) Option
- func WithIncludeColumns(cols []string) Option
- func WithLazyQuotes(useLazyQuotes bool) Option
- func WithNullReader(stringsCanBeNull bool, nullValues ...string) Option
- func WithNullWriter(null string) Option
- func WithStringsReplacer(replacer *strings.Replacer) Option
- type Reader
- type Writer
Examples ¶
Constants ¶
This section is empty.
Variables ¶
var DefaultNullValues = []string{"", "NULL", "null"}
DefaultNullValues is the set of values considered as NULL values by default when Reader is configured to handle NULL values.
var (
ErrMismatchFields = errors.New("arrow/csv: number of records mismatch")
)
Functions ¶
This section is empty.
Types ¶
type Option ¶
type Option func(config)
Option configures a CSV reader/writer.
func WithAllocator ¶
WithAllocator specifies the Arrow memory allocator used while building records.
func WithBoolWriter ¶
WithBoolWriter overrides the default bool formatter with a function that returns a string representation of bool states, e.g. True, False, 1, 0.
func WithCRLF ¶
WithCRLF specifies the line terminator used while writing CSV files. If useCRLF is true, \r\n is used as the line terminator, otherwise \n is used. The default value is false.
func WithChunk ¶
WithChunk specifies the chunk size used while parsing CSV files.
If n is zero or 1, no chunking will take place and the reader will create one record per row. If n is greater than 1, chunks of n rows will be read. If n is negative, the reader will load the whole CSV file into memory and create one big record with all the rows.
func WithColumnTypes ¶
WithColumnTypes allows specifying optional per-column types (disabling type inference on those columns).
Will panic if used in conjunction with an explicit schema.
func WithComment ¶
WithComment specifies the comment character used while parsing CSV files.
func WithCustomTypeConverter ¶ added in v18.4.0
func WithCustomTypeConverter(converter func(typ arrow.DataType, col arrow.Array) (result []string, handled bool)) Option
WithCustomTypeConverter allows specifying a custom type converter for the CSV writer.
The converter returns a slice of strings holding one entry per value of the given column (the example below allocates col.Len() entries). The second return value is a boolean that indicates whether the conversion was handled; if it is false, the library will attempt to use the default conversion.
There are multiple ways to convert arrow types to strings, and depending on the goal, you may want to use a different one. One clear example is encoding binary types. The default behaviour is to encode them as base64 strings. If you want to customize this behaviour, you can use this option and use any other encoding, such as hex.
csv.WithCustomTypeConverter(func(typ arrow.DataType, col arrow.Array) (result []string, handled bool) {
	// keep the default behavior for every non-binary type
	if typ.ID() != arrow.BINARY {
		return nil, false
	}

	// use hex encoding for binary types: one output string per array
	// element, with invalid (null) elements written as "NULL"
	bin := col.(*array.Binary)
	result = make([]string, col.Len())
	for i := range result {
		if bin.IsValid(i) {
			result[i] = fmt.Sprintf("\\x%x", bin.Value(i))
		} else {
			result[i] = "NULL"
		}
	}
	return result, true
})
func WithHeader ¶
WithHeader enables or disables CSV-header handling.
func WithIncludeColumns ¶
WithIncludeColumns indicates the names of the columns from the CSV file that should actually be read and converted (in the slice's order). If set and non-empty, columns not in this slice will be ignored.
Will panic if used in conjunction with an explicit schema.
func WithLazyQuotes ¶
WithLazyQuotes enables or disables the LazyQuotes option of the CSV parser.
func WithNullReader ¶
WithNullReader sets options for a CSV Reader pertaining to NULL value handling. If stringsCanBeNull is true, then a string that matches one of the nullValues set will be interpreted as NULL. Numeric columns will be checked for nulls in all cases. If no nullValues arguments are passed in, the defaults set in NewReader() will be kept.
When no NULL values are given, the default set is taken from DefaultNullValues.
func WithNullWriter ¶
WithNullWriter sets the null string written for NULL values. The default is set in NewWriter().
func WithStringsReplacer ¶
WithStringsReplacer receives a replacer to be applied in the string fields of the CSV. This is useful to remove unwanted characters from the string.
type Reader ¶
type Reader struct {
// contains filtered or unexported fields
}
Reader wraps encoding/csv.Reader and creates arrow.RecordBatches from a schema.
func NewInferringReader ¶
NewInferringReader creates a CSV reader that attempts to infer the types and column names from the data in the first row of the CSV file.
This can be further customized using the WithColumnTypes and WithIncludeColumns options. For BinaryType the reader will use base64 decoding with padding as per base64.StdEncoding.
func NewReader ¶
NewReader returns a reader that reads from the CSV file and creates arrow.RecordBatches from the given schema.
NewReader panics if the given schema contains fields that have types that are not primitive types.
func (*Reader) Err ¶
Err returns the last error encountered during the iteration over the underlying CSV file.
func (*Reader) Next ¶
Next returns whether a Record could be extracted from the underlying CSV file.
Next panics if the number of records extracted from a CSV row does not match the number of fields of the associated schema. If a parse failure occurs, Next will return true and the Record will contain nulls where failures occurred. Subsequent calls to Next will return false. The user should check Err() after each call to Next to see whether an error took place.
func (*Reader) RecordBatch ¶ added in v18.4.1
func (r *Reader) RecordBatch() arrow.RecordBatch
RecordBatch returns the current record batch that has been extracted from the underlying CSV file. It is valid until the next call to Next.
func (*Reader) Release ¶
func (r *Reader) Release()
Release decreases the reference count by 1. When the reference count goes to zero, the memory is freed. Release may be called simultaneously from multiple goroutines.
type Writer ¶
type Writer struct {
// contains filtered or unexported fields
}
Writer wraps encoding/csv.Writer and writes arrow.RecordBatch based on a schema.
func NewWriter ¶
NewWriter returns a writer that writes arrow.RecordBatches to the CSV file with the given schema.
NewWriter panics if the given schema contains fields that have types that are not primitive types. For BinaryType the writer will use base64 encoding with padding as per base64.StdEncoding.