hashmap {bit64}R Documentation

Hashing for 64bit integers


This is an explicit implementation of hash functionality that underlies matching and other functions in R. Explicit means that you can create, store and use hash functionality directly. One advantage is that you can re-use hashmaps, which avoid re-building hashmaps again and again.


hashfun(x, ...)
## S3 method for class 'integer64'
hashfun(x, minfac=1.41, hashbits=NULL, ...)
hashmap(x, ...)
## S3 method for class 'integer64'
hashmap(x, nunique=NULL, minfac=1.41, hashbits=NULL, cache=NULL, ...)
hashpos(cache, ...)
## S3 method for class 'cache_integer64'
hashpos(cache, x, nomatch = NA_integer_, ...)
hashrev(cache, ...)
## S3 method for class 'cache_integer64'
hashrev(cache, x, nomatch = NA_integer_, ...)
hashfin(cache, ...)
## S3 method for class 'cache_integer64'
hashfin(cache, x, ...)
hashrin(cache, ...)
## S3 method for class 'cache_integer64'
hashrin(cache, x, ...)
hashdup(cache, ...)
## S3 method for class 'cache_integer64'
hashdup(cache, ...)
hashuni(cache, ...)
## S3 method for class 'cache_integer64'
hashuni(cache, keep.order=FALSE, ...)
hashmapuni(x, ...)
## S3 method for class 'integer64'
hashmapuni(x, nunique=NULL, minfac=1.5, hashbits=NULL, ...)
hashupo(cache, ...)
## S3 method for class 'cache_integer64'
hashupo(cache, keep.order=FALSE, ...)
hashmapupo(x, ...)
## S3 method for class 'integer64'
hashmapupo(x, nunique=NULL, minfac=1.5, hashbits=NULL, ...)
hashtab(cache, ...)
## S3 method for class 'cache_integer64'
hashtab(cache, ...)
hashmaptab(x, ...)
## S3 method for class 'integer64'
hashmaptab(x, nunique=NULL, minfac=1.5, hashbits=NULL, ...)



an integer64 vector


an object of class 'hashmap' i.e. here 'cache_integer64'


minimum factor by which the hasmap has more elements compared to the data x, ignored if hashbits is given directly


length of hashmap is 2^hashbits


an optional cache object into which to put the hashmap (by default a new cache is created)


giving correct number of unique elements can help reducing the size of the hashmap


the value to be returned if an element is not found in the hashmap


determines order of results and speed: FALSE (the default) is faster and returns in the (pseudo)random order of the hash function, TRUE returns in the order of first appearance in the original data, but this requires extra work


further arguments, passed from generics, ignored in methods


function see also description
hashfun digest export of the hash function used in hashmap
hashmap match return hashmap
hashpos match return positions of x in hashmap
hashrev match return positions of hashmap in x
hashfin %in%.integer64 return logical whether x is in hashmap
hashrin %in%.integer64 return logical whether hashmap is in x
hashdup duplicated return logical whether hashdat is duplicated using hashmap
hashuni unique return unique values of hashmap
hashmapuni unique return unique values of x
hashupo unique return positions of unique values in hashdat
hashmapupo unique return positions of unique values in x
hashtab table tabulate values of hashdat using hashmap in keep.order=FALSE
hashmaptab table tabulate values of x building hasmap on the fly in keep.order=FALSE


see details


Jens Oehlschlägel <Jens.Oehlschlaegel@truecluster.com>

See Also

match, runif64


x <- as.integer64(sample(c(NA, 0:9)))
y <- as.integer64(sample(c(NA, 1:9), 10, TRUE))
hx <- hashmap(x)
hy <- hashmap(y)
hashpos(hy, x)
hashrev(hx, y)
hashfin(hy, x)
hashrin(hx, y)
hashuni(hy, keep.order=TRUE)
hashupo(hy, keep.order=TRUE)

stopifnot(identical(match(as.integer(x),as.integer(y)),hashpos(hy, x)))
stopifnot(identical(match(as.integer(x),as.integer(y)),hashrev(hx, y)))
stopifnot(identical(as.integer(x) %in% as.integer(y), hashfin(hy, x)))
stopifnot(identical(as.integer(x) %in% as.integer(y), hashrin(hx, y)))
stopifnot(identical(duplicated(as.integer(y)), hashdup(hy)))
stopifnot(identical(as.integer64(unique(as.integer(y))), hashuni(hy, keep.order=TRUE)))
stopifnot(identical(sort(hashuni(hy, keep.order=FALSE)), sort(hashuni(hy, keep.order=TRUE))))
stopifnot(identical(y[hashupo(hy, keep.order=FALSE)], hashuni(hy, keep.order=FALSE)))
stopifnot(identical(y[hashupo(hy, keep.order=TRUE)], hashuni(hy, keep.order=TRUE)))
stopifnot(identical(hashpos(hy, hashuni(hy, keep.order=TRUE)), hashupo(hy, keep.order=TRUE)))
stopifnot(identical(hashpos(hy, hashuni(hy, keep.order=FALSE)), hashupo(hy, keep.order=FALSE)))
stopifnot(identical(hashuni(hy, keep.order=FALSE), hashtab(hy)$values))
stopifnot(identical(as.vector(table(as.integer(y), useNA="ifany"))
, hashtab(hy)$counts[order.integer64(hashtab(hy)$values)]))
stopifnot(identical(hashuni(hy, keep.order=TRUE), hashmapuni(y)))
stopifnot(identical(hashupo(hy, keep.order=TRUE), hashmapupo(y)))
stopifnot(identical(hashtab(hy), hashmaptab(y)))

	## Not run: 
	message("explore speed given size of the hasmap in 2^hashbits and size of the data")
	message("more hashbits means more random access and less collisions")
	message("i.e. more data means less random access and more collisions")
	bits <- 24
	b <- seq(-1, 0, 0.1)
	tim <- matrix(NA, length(b), 2, dimnames=list(b, c("bits","bits+1")))
    for (i in 1:length(b)){
	  n <- as.integer(2^(bits+b[i]))
	  x <- as.integer64(sample(n))
	  tim[i,1] <- repeat.time(hashmap(x, hashbits=bits))[3]
	  tim[i,2] <- repeat.time(hashmap(x, hashbits=bits+1))[3]
      matplot(b, tim)
	message("we conclude that n*sqrt(2) is enough to avoid collisions")
## End(Not run)

[Package bit64 version 4.0.5 Index]