R: Move data points into clusters

cluster_groups {rearrr}

R Documentation

Move data points into clusters

Description

Transform values such that the elements in each group move closer to their centroid.

Usage

cluster_groups(
  data,
  cols,
  group_cols = NULL,
  scale_min_fn = function(x) {
     quantile(x, 0.025)
 },
  scale_max_fn = function(x) {
     quantile(x, 0.975)
 },
  keep_centroids = FALSE,
  multiplier = 0.05,
  suffix = "_clustered",
  keep_original = TRUE,
  overwrite = FALSE
)

Arguments

`data`	`data.frame`. If `group_cols` is `NULL`, it must be grouped with `dplyr::group_by()`.
`cols`	Names of columns in `data` to mutate. Each column is considered a dimension to contract distances in.
`group_cols`	Names of grouping columns in `data`. Must be distinct from the names in `cols`. If `NULL` and `data` is grouped, those groups are used instead.
`scale_min_fn`, `scale_max_fn`	Function to find the minimum/maximum value in the original data when rescaling the contracted data. Input: A `numeric vector`. Output: A `numeric scalar`.
`keep_centroids`	Whether to ensure the clusters have their original centroids. (Logical)
`multiplier`	Numeric constant to multiply the distance to the group centroid by. A smaller value makes the clusters more compact and vice versa.
`suffix`	Suffix to add to the names of the generated columns. Use an empty string (i.e. `""`) to overwrite the original columns.
`keep_original`	Whether to keep the original columns. (Logical) Some columns may have been overwritten, in which case only the newest versions are returned.
`overwrite`	Whether to allow overwriting of existing columns. (Logical)

Details

Contracts the distance from each data point to the centroid of its group.
Performs min-max scaling such that the scale of the data points is similar to that of the original data.
If `keep_centroids` is enabled (not default), the cluster centroids are moved to the original group centroids.

Value

data.frame (tibble) with the clustered columns.

Author(s)

Ludvig Renbo Olsen, r-pkgs@ludvigolsen.dk

Examples

# Load the packages used throughout the examples
library(rearrr)
library(dplyr)
# ggplot2 is optional: `require()` attaches it when it is installed and
# returns a logical flag that guards the plotting code further down
has_ggplot <- require(ggplot2)

# Fix the random number generator so the example data is reproducible
set.seed(2)

# Build the example data: three columns of uniform random values
# (x, y, z) and a grouping column (g) with five groups of ten rows each
df <- data.frame(
  x = runif(50),
  y = runif(50),
  z = runif(50),
  g = rep(c(1, 2, 3, 4, 5), each = 10)
)

# Move the data points into clusters
# The grouping argument is spelled out in full (`group_cols`) instead of
# relying on R's partial argument matching

# Cluster x and y within each group in g, using the default multiplier
cluster_groups(df,
  cols = c("x", "y"),
  group_cols = "g"
)

# Same, but with a larger multiplier, which gives less compact clusters
cluster_groups(df,
  cols = c("x", "y"),
  group_cols = "g",
  multiplier = 0.1
)

# Cluster along a single dimension (x) only
cluster_groups(df,
  cols = c("x"),
  group_cols = "g",
  multiplier = 0.1
)

#
# Plotting clusters
#

# Cluster x and y for each group in g
# With the default suffix, the results are stored in the new columns
# `x_clustered` and `y_clustered`, next to the original `x` and `y`
df_clustered <- cluster_groups(
  data = df,
  cols = c("x", "y"),
  group_cols = "g"
)

# Plot the clusters over the original data points
# As we work with random data, the clusters might overlap
# Only runs when ggplot2 could be attached
if (has_ggplot) {
  ggplot(
    df_clustered,
    aes(x = x_clustered, y = y_clustered, color = factor(g))
  ) +
    # Original data (small, semi-transparent points)
    geom_point(aes(x = x, y = y), alpha = 0.3, size = 0.8) +
    # Clustered data (uses the plot-level aesthetics)
    geom_point() +
    theme_minimal() +
    labs(x = "x", y = "y", color = "g")
}

#
# Maintain original group centroids
#

# Cluster x and y for each group in g, this time asking for the
# clusters to keep their original centroids (`keep_centroids = TRUE`)
df_clustered <- cluster_groups(
  data = df,
  cols = c("x", "y"),
  group_cols = "g",
  keep_centroids = TRUE
)

# Plot the clusters over the original data points
# As we work with random data, the clusters might overlap
# Only runs when ggplot2 could be attached
if (has_ggplot) {
  ggplot(
    df_clustered,
    aes(x = x_clustered, y = y_clustered, color = factor(g))
  ) +
    # Original data (small, semi-transparent points)
    geom_point(aes(x = x, y = y), alpha = 0.3, size = 0.8) +
    # Clustered data (uses the plot-level aesthetics)
    geom_point() +
    theme_minimal() +
    labs(x = "x", y = "y", color = "g")
}

#
# Three dimensions
#

# Cluster in 3d
# Each column in `cols` is one dimension to contract distances in,
# so x, y and z are clustered together here
df_clustered <- cluster_groups(
  data = df,
  cols = c("x", "y", "z"),
  group_cols = "g"
)

## Not run: 
# Plot 3d with plotly
# Uses the clustered columns as coordinates and colors the markers by g
plotly::plot_ly(
  x = df_clustered$x_clustered,
  y = df_clustered$y_clustered,
  z = df_clustered$z_clustered,
  type = "scatter3d",
  mode = "markers",
  color = df_clustered$g
)

## End(Not run)

[Package rearrr version 0.3.4 Index]