Skip to content

Instantly share code, notes, and snippets.

@janttsu
Last active March 16, 2026 17:21
Show Gist options
  • Select an option

  • Save janttsu/12cad59e30f6cf43aa31d587c9167a18 to your computer and use it in GitHub Desktop.

Select an option

Save janttsu/12cad59e30f6cf43aa31d587c9167a18 to your computer and use it in GitHub Desktop.
Git-annex duplicate finder
#!/bin/bash
# =============================================================================
# find_annexed.sh - Find and reinject known git-annex files across multiple repos
#
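# Run from the "upper" directory that contains all your annex repos:
#   ./find_annexed.sh /path/to/files/to/scan
#   ./find_annexed.sh [/path/to/files/to/scan] --file /path/to/one/file
#
# With --file, only that one file is checked and reported; nothing is
# reinjected in that mode.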
#
# The script automatically finds EVERY git-annex repo under the current directory.
# =============================================================================
set -euo pipefail
# -------------------------- Argument parsing ----------------------------------
# Accepted invocations:
#   script [DIR]              scan DIR (defaults to ".")
#   script DIR --file FILE    report on the single FILE
#   script --file FILE        report on the single FILE
# Sets DIR_TO_SCAN, SPECIFIC_FILE (empty unless --file was given) and
# SEARCH_ROOT (where annex repos are looked for; always the current directory).
DIR_TO_SCAN="${1:-.}"
SPECIFIC_FILE=""
if [[ "${2:-}" == "--file" && -n "${3:-}" ]]; then
  DIR_TO_SCAN="$1"
  SPECIFIC_FILE="$3"
elif [[ "${1:-}" == "--file" && -n "${2:-}" ]]; then
  SPECIFIC_FILE="$2"
  DIR_TO_SCAN="."
elif [[ "${1:-}" == "--file" || "${2:-}" == "--file" ]]; then
  # --file without a path used to fall through to scan mode (scanning a
  # directory literally named "--file", or ignoring the flag). Fail loudly.
  echo "Error: --file requires a path argument" >&2
  echo "Usage: ${0##*/} [DIR] [--file FILE]" >&2
  exit 2
fi
SEARCH_ROOT="."
# -------------------------- Discover all annex repos --------------------------
# Walk SEARCH_ROOT and collect every directory that contains a .git/annex
# subdirectory into the annex_repos array. Results are read NUL-delimited so
# directory names with whitespace or newlines are kept intact. If nothing is
# found the script ends here with status 0.
echo "Discovering git-annex repositories under $SEARCH_ROOT ..." >&2
annex_repos=()
while IFS= read -r -d '' candidate; do
  annex_repos[${#annex_repos[@]}]="$candidate"
done < <(
  find "$SEARCH_ROOT" -type d -exec test -d '{}/.git/annex' ';' -print0 2>/dev/null
)
echo "Found ${#annex_repos[@]} git-annex repository(ies)." >&2
if [[ ${#annex_repos[@]} -eq 0 ]]; then
  echo "No annex repos found." >&2
  exit 0
fi
# -------------------------- Build maps for ALL repos --------------------------
# For every repo in annex_repos, list the files in HEAD, ask git-annex for the
# key of each one, and index the SHA256E keys:
#   maps             "repo_path::size" → "hash1 hash2 ..." (each hash followed
#                    by one space)
#   all_known_sizes  size → 1 for every size seen in any repo, so later code
#                    can skip hashing files whose size matches nothing.
declare -A maps # "repo_path::size" → "hash1 hash2 ..."
declare -A all_known_sizes
echo "Collecting known keys from all annex repos..." >&2
for repo in "${annex_repos[@]}"; do
  echo " → $repo" >&2
  while IFS= read -r key; do
    # Only keys shaped like SHA256E-s<size>--<64 hex digits>[.ext] are indexed;
    # any other line from lookupkey is skipped.
    if [[ -n "$key" && $key =~ ^SHA256E-s([0-9]+)--([0-9a-f]{64})(\.[^[:space:]]*)?$ ]]; then
      size="${BASH_REMATCH[1]}"
      hashh="${BASH_REMATCH[2]}"
      all_known_sizes["$size"]=1
      maps["${repo}::${size}"]+="${hashh} "
    fi
  done < <(
    # core.quotePath=false stops ls-tree from C-quoting path names that
    # contain non-ASCII bytes; quoted names would not be recognised by
    # lookupkey and their keys would be silently missing from the maps.
    git -C "$repo" -c core.quotePath=false ls-tree -r --name-only HEAD \
      | git -C "$repo" annex lookupkey --batch 2>/dev/null
  )
done
# -------------------------- Specific file mode -------------------------------
# When SPECIFIC_FILE is set: report which repos know its content (same size
# and same SHA-256), which only have size-matching keys, or that no repo
# knows it, then exit. This mode only reports; it never reinjects.
if [[ -n "$SPECIFIC_FILE" ]]; then
  if [[ ! -f "$SPECIFIC_FILE" ]]; then
    echo "Error: $SPECIFIC_FILE not found" >&2
    exit 1
  fi

  # Judge stat by its exit status instead of a sentinel size of 0, so a
  # genuinely empty file is not misreported as unreadable.
  if ! size=$(stat -L -c %s -- "$SPECIFIC_FILE" 2>/dev/null) || [[ -z "$size" ]]; then
    echo "Error: Could not read size of $SPECIFIC_FILE" >&2
    exit 1
  fi

  # Cheap pre-filter: no key of this size anywhere means no need to hash.
  if [[ -z "${all_known_sizes[$size]+x}" ]]; then
    echo "$SPECIFIC_FILE (size $size) → not known in any annex repo"
    exit 0
  fi

  # Hash via stdin: when given a file name containing a backslash or newline,
  # sha256sum prefixes its output line with "\", which would corrupt the
  # extracted hash; a name starting with "-" would be parsed as an option.
  computed_hash=$(sha256sum <"$SPECIFIC_FILE" | cut -d' ' -f1)

  matching=()
  mismatching=()
  for repo in "${annex_repos[@]}"; do
    mapkey="${repo}::${size}"
    if [[ -n "${maps[$mapkey]+x}" ]]; then
      known_hashes="${maps[$mapkey]}"
      # Space-padded literal match so only a whole hash can match.
      if [[ " ${known_hashes} " =~ " ${computed_hash} " ]]; then
        matching+=("$repo")
      else
        mismatching+=("$repo")
      fi
    fi
  done

  if [[ ${#matching[@]} -gt 0 ]]; then
    echo "$SPECIFIC_FILE (size $size) → known in repo(s): ${matching[*]}"
    if [[ ${#mismatching[@]} -gt 0 ]]; then
      echo " (also size-matched but hash-mismatched in: ${mismatching[*]})"
    fi
  elif [[ ${#mismatching[@]} -gt 0 ]]; then
    echo "$SPECIFIC_FILE (size $size) → size match but hash mismatch in repo(s): ${mismatching[*]}"
  else
    echo "$SPECIFIC_FILE (size $size) → not known in any annex repo"
  fi
  exit 0
fi
# -------------------------- Normal scan mode ---------------------------------
# Walk DIR_TO_SCAN (following symlinks) for regular files matched by
# `-size +1M`. A file is hashed only if some repo has a key of the same size;
# on a hash match it is reported on stdout and handed to
# `git annex reinject --known --force` in each matching repo.
echo "Scanning $DIR_TO_SCAN for known files (size > 1 MiB) ..." >&2
find -L "$DIR_TO_SCAN" -type f -size +1M -print0 2>/dev/null | while IFS= read -r -d '' file; do
  # `continue` has to run in the loop's own shell: placed inside $(...) it
  # executes in the substitution's subshell and cannot skip this iteration,
  # leaving size empty for the array lookup below.
  size=$(stat -L -c %s "$file" 2>/dev/null) || continue
  [[ -n "$size" ]] || continue
  [[ -z "${all_known_sizes[$size]+x}" ]] && continue

  # Hash via stdin so sha256sum never sees the file name (names containing a
  # backslash or newline make it prefix the output line with "\").
  computed_hash=$(sha256sum <"$file" | cut -d' ' -f1)

  # `git -C "$repo"` changes directory before the path argument is resolved,
  # so a path relative to our working directory must be made absolute first.
  if [[ "$file" == /* ]]; then
    abs_file="$file"
  else
    abs_file="$PWD/$file"
  fi

  for repo in "${annex_repos[@]}"; do
    mapkey="${repo}::${size}"
    if [[ -n "${maps[$mapkey]+x}" ]]; then
      known_hashes="${maps[$mapkey]}"
      if [[ " ${known_hashes} " =~ " ${computed_hash} " ]]; then
        echo "✓ $file (size $size) → known in $repo"
        # stdin is redirected so the command cannot consume the file list
        # that the enclosing `while read` loop is reading from find.
        git -C "$repo" annex reinject --known --force "$abs_file" </dev/null
        # If the file is no longer at its path after the reinject, further
        # repos cannot use it; stop instead of failing on a missing file.
        [[ -e "$abs_file" ]] || break
      fi
    fi
  done
done
echo "Scan finished." >&2
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment