R: chunker

chunker {chunkR}

R Documentation

chunker

Description

The objects of class "chunker" are the central elements of the chunkR package. These objects can store a data chunk and other information required for the process of reading a file in pieces. A "chunker" object is created with the chunker() function, which requires the path to a file and accepts other arguments, such as the size of the chunk and the data type ("data.frame" or "matrix"). Two basic methods are defined to manipulate the object:

- next_chunk function to read the next chunk

- get_table function to retrieve the data

The functions get_completed and get_colnames return, respectively, the number of rows already read and the column names of the table.

Usage

chunker(path, sep = " ", quoted = FALSE, has_colnames = TRUE,
  has_rownames = TRUE, chunksize = 1000L, data_format = c("data.frame",
  "matrix"), columns_classes = character(0), autodetect = TRUE,
  scan_rows = 10)

Arguments

`path`	Input file path
`sep`	Character separating cells in the input table (default = " ")
`quoted`	Quoted character data? Default FALSE. If TRUE, the program removes quotes.
`has_colnames`	Column names present in the input table? (Logical, default TRUE)
`has_rownames`	Row names present in the input table? (Logical, default TRUE)
`chunksize`	Chunk size (default 1000)
`data_format`	Format of input data: "data.frame" (default) or "matrix".
`columns_classes`	Vector with the class of each column: "character", "numeric" (aka "double"), "integer" or "logical".
`autodetect`	Use auto-detection of columns classes? Default TRUE.
`scan_rows`	How many rows to scan for auto-detection of columns classes. Default is 10. Note that this value should be increased when columns only have NA values in the scanned rows. Columns classes are detected via a call to read.table with the scan_rows value passed to the nrows parameter.

Examples



data(iris)

# Write iris as a space-delimited file: write.table() is called without
# 'sep', so its default separator " " is used, which matches the default
# 'sep' of chunker(). Note that quote is set to FALSE.

tmp_path <- file.path(tempdir(),"iris.txt")
write.table(iris, tmp_path, quote = FALSE)

#-----------------------------------------------------------------#
#--- Reading a data frame with automatic column-type detection ---#
#-----------------------------------------------------------------#

# create a 'chunker' object passing the path of the input file;
# each chunk will hold 30 rows
my_chunker_object <- chunker(tmp_path, chunksize = 30)

# read a chunk
next_chunk(my_chunker_object)

# get the chunk that was just read
 get_table(my_chunker_object)

# read another chunk
next_chunk(my_chunker_object)

# get the number of rows already read
get_completed(my_chunker_object)


#--- read a csv file ---#

tmp_path_csv <- file.path(tempdir(),"iris.csv")

# write iris again, this time using a comma as the field separator
write.table(iris, tmp_path_csv, quote = FALSE, sep = ",")

# read the csv indicating the value of the 'sep' parameter
my_chunker_object2 <- chunker(tmp_path_csv, chunksize = 30, sep = ",")
# the chunker can then be used in the same way as with the default
# (space) separator: next_chunk() to read, get_table() to retrieve

# remove temporary file
file.remove(tmp_path_csv)

#--------------------------------------------------------#
#--- Reading a data frame using column types argument ---#
#--------------------------------------------------------#

## Four types can be passed: "character", "numeric" (aka "double"),
## "integer", "logical"

# create a 'chunker' object passing the path of the input file and one
# class per iris column (four numeric measurements plus Species)
my_chunker_object3 <- chunker(tmp_path, chunksize = 120,
 columns_classes = c("numeric", "numeric", "numeric","numeric", "character"))

# read a chunk
next_chunk(my_chunker_object3)

# get the chunk that was just read
get_table(my_chunker_object3)

# read another chunk
next_chunk(my_chunker_object3)

# get the number of rows already read
get_completed(my_chunker_object3)


#-------------------------#
#--- Reading a matrix  ---#
#-------------------------#

# create a 'chunker' object that stores each chunk as a matrix
my_chunker_object4 <- chunker(tmp_path, chunksize = 30, data_format= "matrix")

# read a chunk: a chunk must be read with next_chunk() before
# get_table() has any data to retrieve
next_chunk(my_chunker_object4)

# store the chunk as a character matrix in R
this_data <- get_table(my_chunker_object4)


# The package provides a fast generic C++ function for conversion from
# matrix (any R type) to data frame
this_data_as_df2 <- matrix2df(this_data)

# remove temporary file
file.remove(tmp_path)

## Not run:  

#----------------------------------#
#--- Example with a big table -----#
#----------------------------------#

### Example with a data frame

# create a large data frame (one million rows, one column per basic
# type), and write it in a temporary directory

tmp_path <- file.path(tempdir(),"big_table.txt")

out <- data.frame(numeric_data = runif(1000000),
                  character_data = sample(c("a", "t", "c", "g"), 1000000, 
                  replace = TRUE),
                  integer_data = sample(1000000),
                  bool_data = sample(c(TRUE, FALSE), 1000000, replace = TRUE))


write.table(out, tmp_path, quote = FALSE)

# create a chunker object, reading in chunks of 10000 lines
my_chunker_object5 <- chunker(tmp_path, chunksize = 10000)

# read the first chunk and retrieve it
next_chunk(my_chunker_object5)
data <- get_table(my_chunker_object5) 

# check the storage type of each column of the retrieved chunk
lapply(data,typeof)

# remove the temporary file
file.remove(tmp_path)


### Example with a matrix

# create a large matrix, and write it to a temporary file
# NOTE(review): only 1000000 values are sampled for a 100000 x 1000
# matrix (1e8 cells), so matrix() recycles the sample to fill it --
# confirm this is intended

my_table <- tempfile()
write.table(matrix(sample(c("a", "t", "c", "g"), 1000000, replace = TRUE), 
100000, 1000), my_table, quote = FALSE)

# create a chunker object, reading in chunks of 10000 lines
my_chunker_object6 <- chunker(my_table, chunksize = 10000, data_format= "matrix")

# create a loop to read the whole file and do something with each chunk;
# the value returned by next_chunk() is used as the loop condition

lines <- 0
while(next_chunk(my_chunker_object6))
{
  # retrieve the chunk that was just read
  data <- get_table(my_chunker_object6) 
  
  # do something with data, e.g., convert to data frame first
  data <- matrix2df(data)
  
  # keep a running total of the rows processed so far
  lines <- lines + nrow(data)
  cat("Processed ", lines, "lines\n")
}

# remove the temporary file
file.remove(my_table)

## End(Not run)

[Package chunkR version 1.1.1 Index]