From c70e4f764c1a88f5cced93d00430632fd3575281 Mon Sep 17 00:00:00 2001 From: venom1204 Date: Thu, 18 Jun 2026 20:17:13 +0000 Subject: [PATCH 1/3] added info --- vignettes/datatable-joins.Rmd | 43 ++++++++++++++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/vignettes/datatable-joins.Rmd b/vignettes/datatable-joins.Rmd index d8581eb7a9..32a823d79f 100644 --- a/vignettes/datatable-joins.Rmd +++ b/vignettes/datatable-joins.Rmd @@ -226,7 +226,23 @@ Products[ total_value = price * count) ] ``` +#### 3.1.4. Identifying matches in key-only tables +When joining a table `i` to a "lookup" table `x` that contains only keys (`x[i, on = ...]`), the resulting join column takes its values from `i`. To explicitly check if a match was found in `x`, we can use the `x.` prefix. If `x.col` is `NA`, no match was found. + +```{r} +# Lookup table of authorized IDs +authorized_ids = data.table(user_id = c(1L, 2L, 5L), key = "user_id") +# New login attempts +logins = data.table(user_id = c(1L, 3L, 5L)) + +# By selecting x.user_id, we can identify which logins exist in the authorized table +authorized_ids[logins, on = .(user_id), .(user_id, is_authorized = !is.na(x.user_id))] +# user_id is_authorized +# 1: 1 TRUE +# 2: 3 FALSE +# 3: 5 TRUE +``` ##### Summarizing with `on` in `data.table` @@ -253,7 +269,7 @@ dt2 = ProductReceived[ identical(dt1, dt2) ``` -#### 3.1.4. Joining based on several columns +#### 3.1.5. Joining based on several columns So far we have just joined `data.table`s based on 1 column, but it's important to know that the package can join tables matching several columns. @@ -629,6 +645,31 @@ ProductPriceHistory[ProductSales, j = .(product_id, date, count, price)] ``` +### 5.1. Calculating Staleness (Join Distance) + +In rolling joins, `data.table` matches to the nearest available record. By default, the join column in the result displays the value from the `i` table (the time you "queried"). 
To see the actual time of the record that was found in `x`, use the `x.` prefix. The difference between these two is often called "staleness." + +```{r} +# Prices updated at specific times +prices = data.table( + time = as.ITime(c("10:00:00", "10:05:00", "10:10:00")), + price = c(100, 105, 110), + key = "time" +) + +# A trade happens at 10:07:00 +trade = data.table(time = as.ITime("10:07:00")) + +# Using x.time to see the actual record time found +prices[trade, on = .(time), roll = TRUE, + .(queried_time = time, + actual_time = x.time, + price, + staleness = time - x.time)] +# queried_time actual_time price staleness +# 1: 10:07:00 10:05:00 105 00:02:00 +``` + ## 6. Taking advantage of joining speed ### 6.1. Subsets as joins From 91390220b40d2dc93dafade82af59de9b422eb52 Mon Sep 17 00:00:00 2001 From: venom1204 Date: Fri, 19 Jun 2026 19:29:48 +0000 Subject: [PATCH 2/3] .. --- vignettes/datatable-joins.Rmd | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/vignettes/datatable-joins.Rmd b/vignettes/datatable-joins.Rmd index 32a823d79f..60b0baf200 100644 --- a/vignettes/datatable-joins.Rmd +++ b/vignettes/datatable-joins.Rmd @@ -236,12 +236,10 @@ authorized_ids = data.table(user_id = c(1L, 2L, 5L), key = "user_id") # New login attempts logins = data.table(user_id = c(1L, 3L, 5L)) -# By selecting x.user_id, we can identify which logins exist in the authorized table -authorized_ids[logins, on = .(user_id), .(user_id, is_authorized = !is.na(x.user_id))] -# user_id is_authorized -# 1: 1 TRUE -# 2: 3 FALSE -# 3: 5 TRUE +# Use "user_id" as a string in the 'on' argument +authorized_ids[logins, on = "user_id", + .(user_id = i.user_id, + is_authorized = !is.na(x.user_id))] ``` ##### Summarizing with `on` in `data.table` @@ -661,13 +659,12 @@ prices = data.table( trade = data.table(time = as.ITime("10:07:00")) # Using x.time to see the actual record time found -# Use "time" as 
a string to avoid conflict with base::time +prices[trade, on = "time", roll = TRUE, + .(queried_time = i.time, actual_time = x.time, price, - staleness = time - x.time)] -# queried_time actual_time price staleness -# 1: 10:07:00 10:05:00 105 00:02:00 + staleness = i.time - x.time)] ``` ## 6. Taking advantage of joining speed From a7dd97800f31a87bdcc9b3e139a55b169bcf6a67 Mon Sep 17 00:00:00 2001 From: venom1204 Date: Fri, 19 Jun 2026 19:34:45 +0000 Subject: [PATCH 3/3] .. --- vignettes/datatable-joins.Rmd | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/vignettes/datatable-joins.Rmd b/vignettes/datatable-joins.Rmd index 60b0baf200..14f080b9c8 100644 --- a/vignettes/datatable-joins.Rmd +++ b/vignettes/datatable-joins.Rmd @@ -236,10 +236,8 @@ authorized_ids = data.table(user_id = c(1L, 2L, 5L), key = "user_id") # New login attempts logins = data.table(user_id = c(1L, 3L, 5L)) -# Use "user_id" as a string in the 'on' argument -authorized_ids[logins, on = "user_id", - .(user_id = i.user_id, - is_authorized = !is.na(x.user_id))] +# By selecting x.user_id, we can identify which logins exist in the authorized table +authorized_ids[logins, on = "user_id", .(user_id, is_authorized = !is.na(x.user_id))] ``` ##### Summarizing with `on` in `data.table` @@ -649,6 +647,7 @@ In rolling joins, `data.table` matches to the nearest available record. 
By defau ```{r} # Prices updated at specific times +# Prices updated at specific times prices = data.table( time = as.ITime(c("10:00:00", "10:05:00", "10:10:00")), price = c(100, 105, 110), @@ -659,12 +658,11 @@ prices = data.table( trade = data.table(time = as.ITime("10:07:00")) # Using x.time to see the actual record time found -# Use "time" as a string to avoid conflict with base::time -prices[trade, on = "time", roll = TRUE, - .(queried_time = i.time, +prices[trade, on = .(time), roll = TRUE, + .(queried_time = time, actual_time = x.time, price, - staleness = i.time - x.time)] + staleness = time - x.time)] ``` ## 6. Taking advantage of joining speed