It seems like the key here is proper memory management. @Jon Spring's solution is perfectly fine. I'd just like to propose doing it with a separate file. For even better memory management, you could write the results to a file incrementally instead of keeping them in memory. Since it's millions of rows for each file, the RAM will be filled up easily. So I guess it would be best to read each file and inner joing it with other_dataset
. Then writing it all in one output_file <- "merged_results.txt"
which resides outside R on your local drive.
Your data quality is quite bad, there are many instances where rows don't even have ids
to join with. This will be a big problem, since there will be many to many relationships.
How are the text files structured? Will they have headers? Are they comma separated? Do they all have proper ids?
setwd(dirname(rstudioapi::getSourceEditorContext()$path)) # set the current script's location as working directory
library(dplyr)
library(data.table)
user_1<- structure(list(ID2 = c(3481890, 3500406, 3507786, 3507978, 3512641, 3528872, 3546395, 3546395, 3572638, 3578447, 3581236, 3581236, 3581236, 3581236, 3599403, 3602306, 3603380, 3604665, 3612597, 3623200, 3623200), country = c("India", "India", "India", "Israel", "India", "India", "India", "India", "Belgium", "Israel",
"India", "India", "India", "India", "India", "India", "United States",
"India", "Bulgaria", "India", "India"), id = c(197273, 197273,
197273, 197273, 197273, 197273, 197273, 197273, 197273, 197273,
197273, 197273, 197273, 197273, 197273, 197273, 197273, 197273,
197273, 197273, 197273)), row.names = 2000000:2000020, class = "data.frame")
user_250<- structure(list(ID2 = c(1000003, 1000004, 1000004, 1000011,
1000012, 1000012, 1000013, 1000013, 1000014, 1000017, 1000019,
1000025, 1000042, 1000042, 1000043, 1000043, 1000044, 1000046,
1000048, 1000049), country = c("India", "United States", "United States",
"China", "Argentina", "Argentina", "United States", "United States",
"United States", "Netherlands", "Chile", "India", "Russia", "",
"Chile", "Chile", "United States", "United States", "Italy",
"United States"), id = c(NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_)), row.names = c(NA, 20L), class = "data.frame")
other_dataset<- structure(list(id = c(197273, 197273,
197273, 197273, 197273, 197273, 197273, 708822, 708822, 708822, 708822, 708822,
708822, NA_real_, NA_real_, NA_real_, NA_real_, 708822, 708822,
708822), year = c(1951, 1951L, 1951, 1951, 1951,
1951, 1951, 1951, 1951, 1951, 1951, 1951, 1951, 1951,
1951, 1951, 1951, 1951, 1951, 1951)), row.names = c(NA,
20L), class = "data.frame")
# save two examples in texts folder
write.csv(user_1, "texts/user_1.txt")
write.csv(user_250, "texts/user_250.txt")
# Path to your folder
folder_path <- "texts"
output_file <- "merged_results.txt"
# Get list of files
file_list <- list.files(folder_path, pattern = "user_.*\\.txt$", full.names = TRUE)
# Process first file separately to create the output file
first_data <- read.delim(file_list[1], header = TRUE, sep = ",") # assuming the text files have comma separation and headers!
merged_data <- inner_join(first_data, other_dataset, by = "id")
fwrite(merged_data, output_file)
rm(first_data, merged_data)
gc()
# Process remaining files
for (file in file_list[-1]) {
# Read and merge current file
current_data <- read.delim(file_list[1], header = TRUE, sep = ",")
merged_data <- inner_join(current_data, other_dataset, by = "id")
# Append to output file
fwrite(merged_data, output_file, append = TRUE)
# Print progress
cat("Processed:", basename(file), "\n")
# Clean up
rm(current_data, merged_data)
gc()
}