-
Notifications
You must be signed in to change notification settings - Fork 66
Expand file tree
/
Copy pathstringdist_join.Rd
More file actions
86 lines (67 loc) · 2.41 KB
/
stringdist_join.Rd
File metadata and controls
86 lines (67 loc) · 2.41 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/stringdist_join.R
\name{stringdist_join}
\alias{stringdist_join}
\alias{stringdist_inner_join}
\alias{stringdist_left_join}
\alias{stringdist_right_join}
\alias{stringdist_full_join}
\alias{stringdist_semi_join}
\alias{stringdist_anti_join}
\title{Join two tables based on fuzzy string matching of their columns}
\usage{
stringdist_join(
x,
y,
by = NULL,
max_dist = 2,
method = c("osa", "lv", "dl", "hamming", "lcs", "qgram", "cosine", "jaccard", "jw",
"soundex"),
mode = "inner",
ignore_case = FALSE,
distance_col = NULL,
...
)
stringdist_inner_join(x, y, by = NULL, distance_col = NULL, ...)
stringdist_left_join(x, y, by = NULL, distance_col = NULL, ...)
stringdist_right_join(x, y, by = NULL, distance_col = NULL, ...)
stringdist_full_join(x, y, by = NULL, distance_col = NULL, ...)
stringdist_semi_join(x, y, by = NULL, distance_col = NULL, ...)
stringdist_anti_join(x, y, by = NULL, distance_col = NULL, ...)
}
\arguments{
\item{x}{A tbl.}
\item{y}{A tbl.}
\item{by}{Columns by which to join the two tables.}
\item{max_dist}{Maximum distance to use for joining.}
\item{method}{Method for computing string distance, see
\code{stringdist-metrics} in the stringdist package.}
\item{mode}{One of "inner", "left", "right", "full", "semi", or "anti".}
\item{ignore_case}{Whether to be case insensitive (default no).}
\item{distance_col}{If given, will add a column with this name containing the
string distance between the two matched values.}
\item{...}{Arguments passed on to \code{\link[stringdist]{stringdist}}.}
}
\description{
Join two tables based on fuzzy string matching of their columns. This is
useful, for example, in matching free-form inputs in a survey or online form,
where it can catch misspellings and small personal changes.
}
\details{
If \code{method = "soundex"}, the \code{max_dist} is automatically
set to 0.5, since soundex returns either a 0 (match) or a 1 (no match).
}
\examples{
library(dplyr)
library(ggplot2)
data(diamonds)
d <- tibble::tibble(approximate_name = c("Idea", "Premiums", "Premioom",
"VeryGood", "VeryGood", "Faiir"),
type = 1:6)
# no matches when they are inner-joined:
diamonds \%>\%
inner_join(d, by = c(cut = "approximate_name"))
# but we can match when they're fuzzy joined
diamonds \%>\%
stringdist_inner_join(d, by = c(cut = "approximate_name"))
}