import os
import argparse
import re
import logging
from collections import defaultdict
from pathlib import Path
import datetime
# Logger will be configured in main() after parsing arguments
logger = logging.getLogger(__name__)
def find_filename_duplicates(directory):
    """Collect files that share a basename anywhere under *directory*.

    Walks the tree and groups full paths by filename, then returns a dict
    mapping each duplicated name to its list of paths (in os.walk order).
    Names that occur only once are omitted.
    """
    name_to_paths = defaultdict(list)
    for dirpath, _, names in os.walk(directory):
        for name in names:
            name_to_paths[name].append(os.path.join(dirpath, name))
    # Only names seen more than once count as duplicates.
    return {name: locations
            for name, locations in name_to_paths.items()
            if len(locations) > 1}
def extract_number_pattern(filename):
    """Split a filename of the form V1_<YYYYMMDD>_<BB>_<CCCC..>_<DDDD>_<EEE>_<FF>__<rest>.

    The fields are: date (8 digits), B (2 digits), C (4-6 digits),
    D (4 digits), E (3 digits), and FF — the two-digit counter that
    callers increment when renaming.

    Returns (prefix, FF-as-int, suffix) on a match, otherwise None.
    """
    m = re.match(r'(V1_\d{8}_\d{2}_\d{4,6}_\d{4}_\d{3}_)(\d{2})(__.*)', filename)
    if m is None:
        return None
    # prefix = everything before FF, suffix = the "__..." tail after it.
    return m.group(1), int(m.group(2)), m.group(3)
def increment_number_in_filename(filepath):
    """Return a Path whose two-digit FF counter is one higher than in *filepath*.

    When the basename does not follow the expected V1_... pattern, a warning
    is logged and *filepath* is returned unchanged (same object, same type) —
    callers treat that identity as the "skip this file" signal.

    NOTE(review): an FF of 99 increments to "100", a 3-digit field that no
    longer matches the pattern — confirm 99 never occurs in practice.
    """
    original = Path(filepath)
    parsed = extract_number_pattern(original.name)
    if parsed is None:
        logger.warning(f"File {filepath} doesn't match the expected pattern. Skipping.")
        return filepath
    head, counter, tail = parsed
    # :02d keeps the leading zero for counters below 10.
    return original.with_name(f"{head}{counter + 1:02d}{tail}")
def format_size(size):
    """Render a byte count as a human-readable string, e.g. '2.50 MB'."""
    units = ('B', 'KB', 'MB', 'GB')
    idx = 0
    # Scale down by 1024 until the value fits the current unit.
    while idx < len(units) and size >= 1024.0:
        size /= 1024.0
        idx += 1
    if idx == len(units):
        return f"{size:.2f} TB"
    return f"{size:.2f} {units[idx]}"
def rename_duplicates(directory, dry_run=False):
    """Rename duplicate-named files under *directory*; return the rename count.

    For every basename that occurs more than once and matches the expected
    V1_... pattern, the first occurrence (os.walk order) is kept and each
    remaining file is renamed by bumping the two-digit FF counter. When
    *dry_run* is True the planned renames are logged but not performed.
    """
    duplicates = find_filename_duplicates(directory)
    if not duplicates:
        logger.info("No files with duplicate names found.")
        return 0

    total_duplicates = sum(len(paths) - 1 for paths in duplicates.values())
    logger.info(f"Found {len(duplicates)} duplicate filenames ({total_duplicates} duplicates in total)")

    rename_count = 0
    for filename, paths in duplicates.items():
        # FIX: these two messages previously logged the literal text
        # '(unknown)' instead of the duplicated filename, making the
        # log useless for tracing which files were affected.
        logger.info(f"\nDuplicate filename: '{filename}'")

        # Skip whole groups whose name can't be renamed with this scheme.
        if not extract_number_pattern(filename):
            logger.warning(f"  Filename '{filename}' doesn't match the expected pattern V1_YYYYMMDD_BB_CCCCCC_DDDD_EEE_FF__*")
            logger.warning("  Skipping this group of files")
            continue

        # Keep the first file as is, rename the rest.
        for i, filepath in enumerate(paths):
            rel_path = os.path.relpath(filepath, directory)
            if i == 0:
                logger.info(f"  Keeping: {rel_path}")
                continue

            new_path = increment_number_in_filename(filepath)
            # increment_number_in_filename() hands back the input unchanged
            # when the pattern doesn't match; nothing to do in that case.
            if new_path == filepath:
                continue

            # Bump the counter until a free name is found so we never
            # clobber an existing file. Skipped in dry-run mode, so a dry
            # run may report the same target name for several sources.
            while os.path.exists(new_path) and not dry_run:
                new_path = increment_number_in_filename(new_path)

            if dry_run:
                logger.info(f"  [DRY RUN] Would rename: {rel_path} → {os.path.basename(new_path)}")
            else:
                logger.info(f"  Renaming: {rel_path} → {os.path.basename(new_path)}")
                os.rename(filepath, new_path)
            rename_count += 1

    return rename_count
def main() -> None:
    """CLI entry point: parse arguments, configure logging, then loop rename passes.

    Repeatedly calls rename_duplicates() until a pass performs no renames,
    or --max-iterations is reached. A dry run stops after a single pass
    because nothing on disk changes between passes.
    """
    parser = argparse.ArgumentParser(
        description='Find and rename files with duplicate names using pattern increment',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Example filename pattern: V1_YYYYMMDD_BB_CCCCCC_DDDD_EEE_FF__*
where:
- YYYYMMDD is a date
- BB is 2 digits
- CCCCCC is 4-6 digits
- DDDD is 4 digits
- EEE is 3 digits
- FF is 2 digits (this is the part that will be incremented)
"""
    )
    parser.add_argument('directory', help='Directory to scan for filename duplicates')
    parser.add_argument('--max-iterations', type=int, default=10,
                        help='Maximum number of renaming iterations to perform')
    parser.add_argument('--log-dir', default=os.getcwd(),
                        help='Directory where log file should be saved (default: current directory)')
    parser.add_argument('--log-name',
                        help='Custom log filename (default: rename_log_YYYYMMDD_HHMMSS.txt)')
    parser.add_argument('--dry-run', action='store_true',
                        help='Perform a dry run without actually renaming files')
    args = parser.parse_args()

    # Configure logging based on arguments.
    # Logging is set up only after parsing so --log-dir/--log-name can
    # decide where the file handler writes.
    log_dir = os.path.abspath(args.log_dir)
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)
    if args.log_name:
        log_filename = os.path.join(log_dir, args.log_name)
    else:
        # Timestamped default keeps successive runs from overwriting logs.
        timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
        log_filename = os.path.join(log_dir, f"rename_log_{timestamp}.txt")
    # Two handlers: persistent file log plus console echo.
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=[
            logging.FileHandler(log_filename),
            logging.StreamHandler()
        ]
    )

    directory = os.path.abspath(args.directory)
    if not os.path.isdir(directory):
        logger.error(f"Error: Directory '{directory}' does not exist.")
        return

    logger.info(f"Starting duplicate filename resolution in: {directory}")
    logger.info(f"Log file: {os.path.abspath(log_filename)}")
    if args.dry_run:
        logger.info("DRY RUN MODE: No files will actually be renamed")

    iteration = 1
    total_renames = 0
    # Each rename can itself create a new duplicate, so keep passing over
    # the tree until a pass renames nothing (bounded by --max-iterations).
    while iteration <= args.max_iterations:
        logger.info(f"\n=== Iteration {iteration} ===")
        renames = rename_duplicates(directory, args.dry_run)
        total_renames += renames
        if renames == 0:
            logger.info(f"\nProcess completed successfully after {iteration} iterations.")
            logger.info(f"Total files {'that would be' if args.dry_run else ''} renamed: {total_renames}")
            break
        logger.info(f"{'Would rename' if args.dry_run else 'Renamed'} {renames} files in this iteration. Checking for remaining duplicates...")
        if args.dry_run:
            # In dry run mode, we only do one iteration since we're not actually changing anything
            logger.info("\nDry run complete. Exiting after one iteration.")
            break
        iteration += 1
    else:
        # while-else: runs only when the loop exhausted --max-iterations
        # without breaking, i.e. duplicates may remain.
        logger.warning(f"\nReached maximum iterations ({args.max_iterations}). Process may not be complete.")
        logger.info(f"Total files renamed so far: {total_renames}")

    logger.info(f"\nDetailed log saved to: {os.path.abspath(log_filename)}")


if __name__ == "__main__":
    main()
md file
# Cascade Chat Conversation
Note: _This is purely the output of the chat conversation and does not contain any raw data, codebase snippets, etc. used to generate the output._
### User Input
file duplicates viewer and changer
### Planner Response
I'll help you create a Python script to find and manage duplicate files in a directory. This tool will identify files with identical content and provide options to delete or keep one copy.
Let me create a Python script with a simple command-line interface:
*Edited relevant file*
### Planner Response
I've created a Python script called [duplicate_finder.py](cci:7://file:///c:/Users/rsrivaishnavi/CascadeProjects/windsurf-project-2/duplicate_finder.py:0:0-0:0) that helps you find and manage duplicate files. Here's what it does:
### Features:
- Scans a directory and its subdirectories for duplicate files
- Identifies duplicates by both file size and content hash
- Shows detailed information about found duplicates
- Provides an option to automatically delete duplicates (keeping one copy)
- Displays the total space that can be recovered
### How to use:
1. **Basic usage** (just find duplicates):
```bash
python duplicate_finder.py "path/to/your/directory"
python duplicate_finder.py "path/to/your/directory" --delete
Scanning for duplicate files in: C:\path\to\your\directory
Found 3 groups of duplicate files (5 duplicates in total):
Group 1 - 2.50 MB each:
1. folder1\image1.jpg
2. folder2\image_copy.jpg
3. backup\image1(1).jpg
Group 2 - 150.00 KB each:
1. docs\report.pdf
2. archive\report.pdf
Group 3 - 5.00 MB each:
1. videos\clip.mp4
2. temp\clip.mp4
Total space wasted by duplicates: 12.80 MB
With `--delete`, the script keeps the first file in each group and removes the duplicates. Would you like me to run it on a specific directory for you?
*Viewed duplicate_finder.py *
file name duplicates viewer and changer
I see you're looking for a tool to find and manage files with duplicate names rather than duplicate content. Let me create a script that focuses on filename duplicates instead.
Edited relevant file
I've created a new script called filename_duplicate_finder.py that specifically finds and manages files with duplicate names (rather than duplicate content).
python filename_duplicate_finder.py "path/to/your/directory"
python filename_duplicate_finder.py "path/to/your/directory" --rename
python filename_duplicate_finder.py "path/to/your/directory" --rename --interactive
Scanning for files with duplicate names in: C:\path\to\your\directory
Found 2 duplicate filenames (3 duplicates in total):
Duplicate #1: 'report.pdf'
1. docs\report.pdf (150.00 KB)
2. archive\report.pdf (150.00 KB)
3. old\report.pdf (120.00 KB)
Duplicate #2: 'config.json'
1. settings\config.json (2.50 KB)
2. backup\config.json (2.30 KB)
With --rename: Automatically renames all duplicates except the first one in each group
`report.pdf` → `report_1.pdf`, `report_2.pdf`

With `--rename --interactive`: gives you options for each group of duplicates:
Would you like me to make any adjustments to this script or would you like to see it in action on a specific directory?
filename must be renamed but not at the end, the file will have a pattern like .dd__text. - the dd must be incremented by 1. also, the filename should be checked across a directory having many folders. after the renaming, the directory should be again checked for duplicates. this process should repeat until there is no filename duplicates found in the directory. Also provide a detailed log step by step -of the duplicate filenames found and renamed filenames
I understand you need a more specific filename renaming pattern. You want to:
.*dd__text.* where dd is a number that should be incrementedLet me create a new script that implements this logic:
Edited relevant file
I've created a new script called pattern_filename_renamer.py that implements your specific requirements for finding and renaming duplicate filenames with a pattern-based approach.
- Matches filenames of the form `.*dd__text.*`, where `dd` is a two-digit number
- Increments that number (`dd`) by 1 when renaming files

python pattern_filename_renamer.py "path/to/your/directory"
You can also specify a maximum number of iterations:
python pattern_filename_renamer.py "path/to/your/directory" --max-iterations 20
Pattern Recognition: The script looks for filenames with the pattern .*dd__text.* where:
- `.*` is any prefix
- `dd` is a two-digit number (like 01, 02, etc.)
- `__text` is any text following double underscores
- `.*` is any suffix, including the file extension

Renaming Logic:
Iteration Process:
Logging:
(e.g. `rename_log_20251104_202500.txt`)

2025-11-04 20:25:00 - INFO - Starting duplicate filename resolution in: C:\path\to\directory
2025-11-04 20:25:00 - INFO - Log file: C:\path\to\rename_log_20251104_202500.txt
2025-11-04 20:25:00 - INFO - === Iteration 1 ===
2025-11-04 20:25:00 - INFO - Found 2 duplicate filenames (3 duplicates in total)
2025-11-04 20:25:00 - INFO - Duplicate filename: 'report01__final.pdf'
2025-11-04 20:25:00 - INFO - Keeping: docs\report01__final.pdf
2025-11-04 20:25:00 - INFO - Renaming: archive\report01__final.pdf → report02__final.pdf
2025-11-04 20:25:00 - INFO - Duplicate filename: 'data05__processed.csv'
2025-11-04 20:25:00 - INFO - Keeping: data\data05__processed.csv
2025-11-04 20:25:00 - INFO - Renaming: backup\data05__processed.csv → data06__processed.csv
2025-11-04 20:25:00 - INFO - Renamed 2 files in this iteration. Checking for remaining duplicates...
2025-11-04 20:25:01 - INFO - === Iteration 2 ===
2025-11-04 20:25:01 - INFO - No files with duplicate names found.
2025-11-04 20:25:01 - INFO - Process completed successfully after 2 iterations.
2025-11-04 20:25:01 - INFO - Total files renamed: 2
2025-11-04 20:25:01 - INFO - Detailed log saved to: C:\path\to\rename_log_20251104_202500.txt
Would you like me to make any adjustments to this script or would you like to see it in action on a specific directory?
keep the directory and log file location as variable inputs
I'll update the script to make both the directory and log file location configurable as command-line arguments. This will give you more flexibility when using the tool.
Edited relevant file
I've updated the script to make both the directory and log file location configurable as command-line arguments. Here are the changes:
--log-dir: Specifies the directory where the log file should be saved
--log-name: Allows you to specify a custom log filename
- Default log name: `rename_log_YYYYMMDD_HHMMSS.txt`, using the current timestamp

python pattern_filename_renamer.py "path/to/your/directory"
python pattern_filename_renamer.py "path/to/your/directory" --log-dir "path/to/logs"
python pattern_filename_renamer.py "path/to/your/directory" --log-name "my_custom_log.txt"
python pattern_filename_renamer.py "path/to/your/directory" --log-dir "path/to/logs" --log-name "my_custom_log.txt"
python pattern_filename_renamer.py "path/to/your/directory" --max-iterations 15 --log-dir "path/to/logs" --log-name "my_custom_log.txt"
These changes make the script more flexible and allow you to specify exactly where you want the logs to be stored, which is especially useful for automation or when running the script on multiple directories.
The exact filename regex would be - V1_A_B_C_D_E_F__.* -where A would be date format in YYYYMMDD and B would be digits of length 2 and C would be digits of length between 4 to 6 , D would be digits of length 4, E would be digits of length 3 and this part F would be 2 digit number and the rest .* would be combination of alphabets, underscore and digits. I want you to change only the 2 digit number F, by incrementing the number by 1. Make the changes accordingly to match this requirement
I'll update the script to match your specific filename pattern and ensure it only increments the 2-digit number F in the pattern V1_A_B_C_D_E_F__.* where:
Edited relevant file
Let me fix that and try again:
Edited relevant file
I apologize for the errors. I'm having trouble with the replace_file_content tool when trying to update the script to match your specific filename pattern. The error indicates there's an issue with how I'm formatting the JSON for the replacement chunks.
The specific error is about invalid character 'd' in string escape code, which suggests there might be an issue with how backslashes are being handled in the regular expression patterns.
How would you like to proceed? I could:
What would you prefer?
1
I'll create a new file from scratch with the updated pattern requirements to match your specific filename format.
Edited relevant file
I've created a new script called filename_pattern_renamer.py that specifically handles your filename pattern requirements. Here's what it does:
Specific Pattern Matching: Identifies files with the pattern V1_A_B_C_D_E_F__.* where:
Targeted Renaming: Only increments the 2-digit F part of the filename
Duplicate Resolution: Iteratively scans and renames files until no duplicates remain
Detailed Logging: Creates a comprehensive log of all actions taken
Configurable Options:
python filename_pattern_renamer.py "path/to/your/directory"
python filename_pattern_renamer.py "path/to/your/directory" --dry-run
python filename_pattern_renamer.py "path/to/your/directory" --log-dir "path/to/logs" --log-name "my_rename_log.txt"
2025-11-04 22:42:00 - INFO - Starting duplicate filename resolution in: C:\path\to\directory
2025-11-04 22:42:00 - INFO - Log file: C:\path\to\rename_log_20251104_224200.txt
2025-11-04 22:42:00 - INFO - === Iteration 1 ===
2025-11-04 22:42:00 - INFO - Found 2 duplicate filenames (3 duplicates in total)
2025-11-04 22:42:00 - INFO - Duplicate filename: 'V1_20251104_01_12345_6789_123_01__data.txt'
2025-11-04 22:42:00 - INFO - Keeping: folder1\V1_20251104_01_12345_6789_123_01__data.txt
2025-11-04 22:42:00 - INFO - Renaming: folder2\V1_20251104_01_12345_6789_123_01__data.txt → V1_20251104_01_12345_6789_123_02__data.txt
2025-11-04 22:42:00 - INFO - Duplicate filename: 'V1_20251103_02_1234_5678_321_05__report.csv'
2025-11-04 22:42:00 - INFO - Keeping: reports\V1_20251103_02_1234_5678_321_05__report.csv
2025-11-04 22:42:00 - INFO - Renaming: archive\V1_20251103_02_1234_5678_321_05__report.csv → V1_20251103_02_1234_5678_321_06__report.csv
2025-11-04 22:42:00 - INFO - Renaming: backup\V1_20251103_02_1234_5678_321_05__report.csv → V1_20251103_02_1234_5678_321_06__report.csv
2025-11-04 22:42:00 - INFO - Renamed 3 files in this iteration. Checking for remaining duplicates...
2025-11-04 22:42:01 - INFO - === Iteration 2 ===
2025-11-04 22:42:01 - INFO - Found 1 duplicate filenames (1 duplicates in total)
2025-11-04 22:42:01 - INFO - Duplicate filename: 'V1_20251103_02_1234_5678_321_06__report.csv'
2025-11-04 22:42:01 - INFO - Keeping: archive\V1_20251103_02_1234_5678_321_06__report.csv
2025-11-04 22:42:01 - INFO - Renaming: backup\V1_20251103_02_1234_5678_321_06__report.csv → V1_20251103_02_1234_5678_321_07__report.csv
2025-11-04 22:42:01 - INFO - Renamed 1 files in this iteration. Checking for remaining duplicates...
2025-11-04 22:42:02 - INFO - === Iteration 3 ===
2025-11-04 22:42:02 - INFO - No files with duplicate names found.
2025-11-04 22:42:02 - INFO - Process completed successfully after 3 iterations.
2025-11-04 22:42:02 - INFO - Total files renamed: 4
2025-11-04 22:42:02 - INFO - Detailed log saved to: C:\path\to\rename_log_20251104_224200.txt
The script now precisely matches your requirements for finding and renaming duplicate files with the specific pattern format you described. Would you like me to make any adjustments to this script?