link_wf {diyar}  R Documentation 
Record linkage
Description
Deterministic and probabilistic record linkage. Assign unique identifiers to records based on partial, nested or calculated probabilities.
Usage
links_af_probabilistic(
attribute,
blocking_attribute = NULL,
cmp_func = diyar::exact_match,
attr_threshold = 1,
probabilistic = TRUE,
m_probability = 0.95,
u_probability = NULL,
score_threshold = 1,
repeats_allowed = FALSE,
permutations_allowed = FALSE,
data_source = NULL,
ignore_same_source = TRUE,
display = "none"
)
links_wf_probabilistic(
attribute,
blocking_attribute = NULL,
cmp_func = diyar::exact_match,
attr_threshold = 1,
probabilistic = TRUE,
m_probability = 0.95,
u_probability = NULL,
score_threshold = 1,
id_1 = NULL,
id_2 = NULL,
return_weights = FALSE,
...
)
prob_score_range(attribute, m_probability = 0.95, u_probability = NULL)
Arguments
attribute 

blocking_attribute 

cmp_func 

attr_threshold 

probabilistic 

m_probability 

u_probability 

score_threshold 

repeats_allowed 

permutations_allowed 

data_source 

ignore_same_source 

display 

id_1 

id_2 

return_weights 
If 
... 
Arguments passed to links
Details
links_wf_probabilistic()
 A wrapper function of links
with a specific sub_criteria
to achieve probabilistic record linkage.
It excludes functionalities for the nested and multistage linkage.
links_wf_probabilistic()
requires a score_threshold
in advance.
To help with this, prob_score_range()
can be used to return the range of scores attainable for a given set of attribute
, m
and u
probabilities.
Additionally, id_1
and id_2
can be used to link specific record pairs, aiding the review of potential scores.
links_af_probabilistic()
 A simpler version of links
.
It excludes functionalities for the batched, nested and multistage linkage.
links_af_probabilistic()
requires a score_threshold
in advance,
however, since it exports the match weights, the score_threshold
can be changed after the analysis.
Value
pid
; list
References
Fellegi, I. P., & Sunter, A. B. (1969). A Theory for Record Linkage. Journal of the American Statistical Association, 64(328), 1183–1210. https://doi.org/10.1080/01621459.1969.10501049
Asher, J., Resnick, D., Brite, J., Brackbill, R., & Cone, J. (2020). An Introduction to Probabilistic Record Linkage with a Focus on Linkage Processing for WTC Registries. International journal of environmental research and public health, 17(18), 6937. https://doi.org/10.3390/ijerph17186937.
See vignette("links")
for more information.
See Also
Examples
data(patient_records)
# Weighted (probabilistic) comparison of forename, middlename and surname
criteria_1 <- as.list(patient_records[c("forename", "middlename", "surname")])
# Possible scores when m-probability is 0.95
prob_scores <- prob_score_range(attribute = criteria_1,
                                m_probability = 0.95,
                                u_probability = NULL)
## Not run:
# Probabilistic record linkage with 'links_af_probabilistic()'
pids_1a <- links_af_probabilistic(attribute = criteria_1,
                                  cmp_func = exact_match,
                                  attr_threshold = 1,
                                  probabilistic = TRUE,
                                  m_probability = 0.95,
                                  score_threshold = prob_scores$mid_scorce,
                                  display = "stats")
# Equivalent with 'links_wf_probabilistic()'
# (`display`, `recursive` and `check_duplicates` are supplied through `...`)
pids_1b <- links_wf_probabilistic(attribute = criteria_1,
                                  cmp_func = exact_match,
                                  attr_threshold = 1,
                                  probabilistic = TRUE,
                                  m_probability = 0.95,
                                  score_threshold = prob_scores$mid_scorce,
                                  display = "progress",
                                  recursive = TRUE,
                                  check_duplicates = TRUE)
# Less thorough but faster equivalent with `links_wf_probabilistic()`
pids_1c <- links_wf_probabilistic(attribute = criteria_1,
                                  cmp_func = exact_match,
                                  attr_threshold = 1,
                                  probabilistic = TRUE,
                                  m_probability = 0.95,
                                  score_threshold = prob_scores$mid_scorce,
                                  display = "progress",
                                  recursive = FALSE,
                                  check_duplicates = FALSE)
# Each implementation can lead to different results
summary(pids_1a$pid)
summary(pids_1b$pid)
summary(pids_1c$pid)
## End(Not run)
# Weighted (non-probabilistic) comparison of forename, middlename and age difference
criteria_2 <- as.list(patient_records[c("forename", "middlename", "dateofbirth")])
# Comparison function for a pair of dates of birth.
# Returns TRUE when the two values are at most 10 years (365 * 10 days) apart,
# and FALSE otherwise, including when either value is missing.
age_diff <- function(x, y){
  diff <- abs(as.numeric(x) - as.numeric(y))
  # `%in%` never yields NA; the explicit `!is.na()` keeps missing pairs FALSE
  wgt <- diff %in% 0:(365 * 10) & !is.na(diff)
  wgt
}
# One comparison function per attribute: exact matches on the two names and
# the 10-year window on date of birth; records are blocked on surname
pids_2a <- links_af_probabilistic(attribute = criteria_2,
                                  blocking_attribute = patient_records$surname,
                                  cmp_func = c(exact_match, exact_match, age_diff),
                                  score_threshold = number_line(3, 5),
                                  probabilistic = FALSE,
                                  display = "stats")
# Larger weights can be assigned to particular attributes through `cmp_func`
# For example, a smaller age difference can contribute a higher score (e.g. 0 to 3)
# Comparison function returning a graded weight for the age difference.
# Pairs more than 10 years (365 * 10 days) apart, or with a missing value,
# score 0. The remaining pairs are split into 3 equal-width bands of the
# observed differences: the smallest differences score 3, the largest score 1.
age_diff_2 <- function(x, y){
  diff <- as.numeric(abs(x - y))
  wgt <- diff %in% 0:(365 * 10) & !is.na(diff)
  # `cut()` cannot bin an empty vector, so only grade when a pair is in range
  if (any(wgt)) {
    # `cut()` numbers the bands 1..3 from smallest to largest difference;
    # `match(., 3:1)` reverses that so band 1 scores 3
    wgt[wgt] <- match(as.numeric(cut(diff[wgt], 3)), 3:1)
  }
  wgt
}
pids_2b <- links_af_probabilistic(attribute = criteria_2,
                                  blocking_attribute = patient_records$surname,
                                  cmp_func = c(exact_match, exact_match, age_diff_2),
                                  score_threshold = number_line(3, 5),
                                  probabilistic = FALSE,
                                  display = "stats")
# Compare the weights produced by the two age-difference functions
head(pids_2a$pid_weights, 10)
head(pids_2b$pid_weights, 10)