git_cleaner.py: Recursively run git gc on all your repositories
This script, git_cleaner.py, is designed to recursively search through a specified directory (or the current directory by default) for Git repositories and run git gc on each one. It captures the size and file count of the .git directory before and after running git gc, allowing you to see how much space was reclaimed and how many files were removed. The script also provides a final summary of all repositories processed.
git_cleaner.py
#!/usr/bin/env python3
import os
import subprocess
import argparse
import sys
def get_dir_stats(path):
    """Return (total_size_bytes, total_file_count) for the tree rooted at *path*.

    Symlinks are skipped so targets outside the tree are not counted.
    Files that vanish or become unreadable between the walk and the stat
    (possible while git is repacking) are ignored instead of aborting
    the whole scan.
    """
    total_size = 0
    file_count = 0
    for dirpath, _, filenames in os.walk(path):
        for name in filenames:
            fp = os.path.join(dirpath, name)
            if os.path.islink(fp):
                continue
            try:
                total_size += os.path.getsize(fp)
            except OSError:
                # File disappeared or is unreadable; skip it.
                continue
            file_count += 1
    return total_size, file_count
def format_size(bytes_size):
    """Render a byte count as a human-readable string, e.g. '23.10 KB'.

    Scales through B/KB/MB/GB (1024 steps); anything larger is shown as TB.
    """
    size = bytes_size
    for unit in ('B', 'KB', 'MB', 'GB'):
        if abs(size) < 1024:
            return f"{size:.2f} {unit}"
        size /= 1024
    return f"{size:.2f} TB"
def main():
    """Walk a directory tree, run 'git gc' in every repository found, and
    report per-repo and total space/file savings.

    Unknown command-line flags (e.g. --aggressive, --prune=now) are
    forwarded verbatim to 'git gc'. Exits with status 1 if the target
    path is not a directory.
    """
    parser = argparse.ArgumentParser(
        description="Recursively run 'git gc', reporting space and files saved."
    )
    parser.add_argument(
        "target_dir",
        nargs="?",
        default=".",
        help="Directory to search (default: '.')"
    )
    # parse_known_args so unrecognized flags pass through to git gc.
    args, unknown_args = parser.parse_known_args()
    target_path = os.path.abspath(args.target_dir)
    if not os.path.isdir(target_path):
        print(f"Error: {target_path} is not a directory.")
        sys.exit(1)

    total_saved_bytes = 0
    total_files_removed = 0
    repos_processed = 0
    print(f"--- Scanning: {target_path} ---")
    for root, dirs, _ in os.walk(target_path):
        if ".git" not in dirs:
            continue
        repos_processed += 1
        print(f"\n[{repos_processed}] Optimizing: {root}")
        saved_bytes, files_removed = _gc_repo(root, unknown_args)
        total_saved_bytes += saved_bytes
        total_files_removed += files_removed
        # Efficiency: don't walk into the .git folder we just processed.
        dirs.remove(".git")

    _print_summary(repos_processed, total_saved_bytes, total_files_removed)


def _gc_repo(root, extra_args):
    """Run 'git gc' in the repository at *root* and print its before/after stats.

    Returns (bytes_saved, files_removed), each clamped to >= 0 so a repo
    that grew (e.g. gc wrote new packfiles) never subtracts from the totals.
    Returns (0, 0) when git fails or the git binary is missing.
    """
    git_dir = os.path.join(root, ".git")
    # 1. Capture stats before
    size_before, files_before = get_dir_stats(git_dir)
    try:
        # 2. Run git gc (-C runs git in the specific directory)
        cmd = ["git", "-C", root, "gc"] + extra_args
        subprocess.run(cmd, check=True, capture_output=True)
    except subprocess.CalledProcessError as e:
        print(f"    [!] Error: {e.stderr.decode().strip()}")
        return 0, 0
    except FileNotFoundError:
        # git binary not on PATH — previously an unhandled crash.
        print("    [!] Error: 'git' executable not found on PATH.")
        return 0, 0
    # 3. Capture stats after
    size_after, files_after = get_dir_stats(git_dir)
    saved_size = size_before - size_after
    removed_files = files_before - files_after
    print(f"    Files: {files_before} -> {files_after} ({removed_files} removed)")
    print(f"    Size: {format_size(size_before)} -> {format_size(size_after)} ({format_size(saved_size)} saved)")
    return max(0, saved_size), max(0, removed_files)


def _print_summary(repos_processed, total_saved_bytes, total_files_removed):
    """Print the closing summary table for the whole run."""
    print("\n" + "=" * 45)
    print(f"{'FINAL SUMMARY':^45}")
    print("-" * 45)
    print(f"  Repositories Processed : {repos_processed}")
    print(f"  Total Space Reclaimed : {format_size(total_saved_bytes)}")
    print(f"  Total Files Removed : {total_files_removed}")
    print("=" * 45)
if __name__ == "__main__":
    main()

Example output
example.txt
[...]
[433] Optimizing: /home/uli/dev/FlareDNS
Files: 73 -> 33 (40 removed)
Size: 70.14 KB -> 47.03 KB (23.10 KB saved)
=============================================
FINAL SUMMARY
---------------------------------------------
Repositories Processed : 433
Total Space Reclaimed : 238.14 MB
Total Files Removed : 21612
=============================================

If this post helped you, please consider buying me a coffee or donating via PayPal to support research & publishing of new posts on TechOverflow.