79305435

Date: 2024-12-24 11:22:49
Score: 1
Natty:
Report link

It seems like the key here is proper memory management. @Jon Spring's solution is perfectly fine. I'd just like to propose doing it with a separate file. For even better memory management, you could write the results to a file incrementally instead of keeping them in memory. Since it's millions of rows for each file, the RAM will be filled up easily. So I guess it would be best to read each file and inner joing it with other_dataset. Then writing it all in one output_file <- "merged_results.txt" which resides outside R on your local drive.

Data quality

Your data quality is quite bad, there are many instances where rows don't even have ids to join with. This will be a big problem, since there will be many to many relationships.

text file structure

How are the text files structured? Will they have headers? Are they comma separated? Do they all have proper ids?

Code

setwd(dirname(rstudioapi::getSourceEditorContext()$path)) # set the current script's location as working directory
library(dplyr)
library(data.table)

user_1<- structure(list(ID2 = c(3481890, 3500406, 3507786, 3507978, 3512641, 3528872, 3546395, 3546395, 3572638, 3578447, 3581236, 3581236, 3581236, 3581236, 3599403, 3602306, 3603380, 3604665, 3612597, 3623200, 3623200), country = c("India", "India", "India", "Israel", "India", "India", "India", "India", "Belgium", "Israel", 
                                                                                                                                                                                                                                          "India", "India", "India", "India", "India", "India", "United States", 
                                                                                                                                                                                                                                          "India", "Bulgaria", "India", "India"), id = c(197273, 197273, 
                                                                                                                                                                                                                                                                                         197273, 197273, 197273, 197273, 197273, 197273, 197273, 197273, 
                                                                                                                                                                                                                                                                                         197273, 197273, 197273, 197273, 197273, 197273, 197273, 197273, 
                                                                                                                                                                                                                                                                                         197273, 197273, 197273)), row.names = 2000000:2000020, class = "data.frame")

user_250<- structure(list(ID2 = c(1000003, 1000004, 1000004, 1000011, 
                                  1000012, 1000012, 1000013, 1000013, 1000014, 1000017, 1000019, 
                                  1000025, 1000042, 1000042, 1000043, 1000043, 1000044, 1000046, 
                                  1000048, 1000049), country = c("India", "United States", "United States", 
                                                                 "China", "Argentina", "Argentina", "United States", "United States", 
                                                                 "United States", "Netherlands", "Chile", "India", "Russia", "", 
                                                                 "Chile", "Chile", "United States", "United States", "Italy", 
                                                                 "United States"), id = c(NA_real_, NA_real_, NA_real_, NA_real_, 
                                                                                          NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
                                                                                          NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
                                                                                          NA_real_, NA_real_)), row.names = c(NA, 20L), class = "data.frame")

other_dataset<- structure(list(id = c(197273, 197273, 
                                      197273, 197273, 197273, 197273, 197273, 708822, 708822, 708822, 708822, 708822, 
                                      708822, NA_real_, NA_real_, NA_real_, NA_real_, 708822, 708822, 
                                      708822), year = c(1951, 1951L, 1951, 1951, 1951, 
                                                        1951, 1951, 1951, 1951, 1951, 1951, 1951, 1951, 1951, 
                                                        1951, 1951, 1951, 1951, 1951, 1951)), row.names = c(NA, 
                                                                                                            20L), class = "data.frame")

# save two examples in texts folder
write.csv(user_1, "texts/user_1.txt")
write.csv(user_250, "texts/user_250.txt")

# Path to your folder
folder_path <- "texts"
output_file <- "merged_results.txt"

# Get list of files
file_list <- list.files(folder_path, pattern = "user_.*\\.txt$", full.names = TRUE)

# Process first file separately to create the output file
first_data <- read.delim(file_list[1], header = TRUE, sep = ",") # assuming the text files have comma separation and headers!
merged_data <- inner_join(first_data, other_dataset, by = "id")
fwrite(merged_data, output_file)
rm(first_data, merged_data)
gc()

# Process remaining files
for (file in file_list[-1]) {
  # Read and merge current file
  current_data <- read.delim(file_list[1], header = TRUE, sep = ",")
  merged_data <- inner_join(current_data, other_dataset, by = "id")
  
  # Append to output file
  fwrite(merged_data, output_file, append = TRUE)
  
  # Print progress
  cat("Processed:", basename(file), "\n")
  
  # Clean up
  rm(current_data, merged_data)
  gc()
}
Reasons:
  • Whitelisted phrase (-1): solution is
  • Long answer (-1):
  • Has code block (-0.5):
  • Ends in question mark (2):
  • User mentioned (1): @Jon
  • Low reputation (0.5):
Posted by: G-Man