before I start with my answer I have a couple of questions:
filter(id == id)
You are suggesting that both dataframes have a common field id
,
however the first_data and second_data do not share such an id! Why
is that?
You are using the Jaro-Winkler similarity with a max distance of 0.5. That value is way too low to match names successfully. Let me explain why:
Let's take an example from your dataset:
> 1-stringdist('Ivan Sabe','Ivan Sabel',method='jw')
[1] 0.9666667
> 1-stringdist('Ivan Sabel','Jame Labes',method='jw')
[1] 0.6666667
As you can see, Ivan Sabel would be matched with James Labes, even though their names are wildly different, just because they have common characters and common name lengths! So using 0.5
is way too low! I would suggest, using 0.9
or even higher
Is there a solution using duckdb and arrow to do this simultaneous fuzzy and exact join?
Yes! You can use this code, which matches both data_frames first based on the id f.id = s.another_id
or the Jaro-Winkler Similiratiy between both fullnames
above 0.95:
library(duckdb)
library(arrow)
library(dplyr)
first_data<- structure(list(user_id = c(441391106, 441514065, 442060539, 442158489,
438197192, 438206034, 438689594, 438881971, 440386286, 440479235
), fullname = c("Siva Kumar", "Ivan Sabe", "James Bigler", "Arthur Stephens",
"guy guy", "Rick Schlieper", "Tony Klemencic", "baiyu xu", "Michael Fritts",
"Daniel Wolf Roemele"), f_prob = c(0, 1, 0.005, 0.006, 0.005,
0.002, 0.011, 0.389, 0.005, 0.004), m_prob = c(1, 0, 0.995, 0.994,
0.995, 0.998, 0.989, 0.611, 0.995, 0.996), white_prob = c(0.021,
0.001, 0.994, 0.792, 0.547, 0.949, 0.948, 0.001, 0.995, 0.795
), black_prob = c(0.013, 0.003, 0.001, 0.198, 0.398, 0.004, 0.003,
0.001, 0.001, 0.097), api_prob = c(0.904, 0.991, 0, 0, 0.001,
0.002, 0.003, 0.994, 0.001, 0.061), hispanic_prob = c(0.005,
0.001, 0.001, 0.002, 0, 0.001, 0.039, 0.001, 0, 0.012), native_prob = c(0.006,
0.002, 0, 0, 0, 0.005, 0, 0, 0, 0.003), multiple_prob = c(0.051,
0.002, 0.004, 0.008, 0.054, 0.039, 0.007, 0.003, 0.003, 0.032
), degree = c("", "", "Bachelor", "", "", "Master", "Associate",
"", "", ""), other_id = c(1212616, 1212616, 1212616, 1212616, 1212991,
1212991, 1212991, 1212991, 1212991, 1212991), id = c(62399,
62399, 62399, 62399, 63907, 63907, 63907, 63907, 63907, 63907
)), row.names = c(NA, 10L), class = "data.frame")
second_data<- structure(list(gvkey = c(12825, 12945, 12945, 12945, 16456, 16456,
16456, 12136, 12136, 17254), another_id = c(7879, 8587, 18070, 40634,
13142, 17440, 41322, 899, 27199, 26604), fname = c("Gerald",
"John", "Dean", "Todd", "Thomas", "Ivan", "Vinit", "Scott", "Jonathan",
"William"), mname = c("B.", "L.", "A.", "P.", "F.", "R.",
"K.", "G.", "I.", "Jensen"), lname = c("Shreiber", "Nussbaum",
"Foate", "Kelsey", "Kirk", "Sabel, CPO", "Asar", "McNealy", "Schwartz",
"Gedwed"), companyname = c(NA, "Plexus Corp.", "Plexus Corp.",
NA, NA, NA, NA, "Oracle America, Inc.", "Oracle America, Inc.",
NA), fullname = c("Gerald Shreiber", "John Nussbaum",
"Dean Foate", "Todd Kelsey", "Thomas Kirk", "Ivan Sabel", "Vinit Asar",
"Scott McNealy", "Jonathan Schwartz", "William Gedwed")), row.names = c(NA,
-10L), class = c("tbl_df", "tbl", "data.frame"))
# use JaroWinkler in DuckDb
con <- dbConnect(duckdb())
dbWriteTable(con, "first_data", first_data)
dbWriteTable(con, "second_data", second_data)
query <- "
SELECT
f.*, s.*,
jaro_similarity(lower(f.fullname), lower(s.fullname)) as name_distance
FROM first_data f
JOIN second_data s ON
f.id = s.another_id
OR (
jaro_similarity(lower(f.fullname), lower(s.fullname)) >= 0.95
)"
result <- dbGetQuery(con, query)
dbDisconnect(con, shutdown = TRUE)