#!/usr/bin/env bash
# LOC-based author exclusivity between Bitcoin Knots and Bitcoin Core
# + per-author context for Knots-only contributors (top dirs/files/commits by LOC).
#
# Counting rules:
#   - Merges excluded
#   - LOC = adds + deletes (from `git log --numstat`)
#   - Binary-only rows ("-\t-\tfile") ignored
# Identity:
#   - Author key uses mailmapped fields: %aN (name), %aE (email)
#   - Displayed as: "Name <email>" if email exists, else "Name"
#
# Exclusivity (strict-zero):
#   - Knots-only: >THRESHOLD LOC in Knots AND exactly 0 LOC in Core
#   - Core-only:  >THRESHOLD LOC in Core AND exactly 0 LOC in Knots
#
# Scope:
#   - Default is current branch (HEAD) history on each repo.
#   - You can constrain via RANGE (e.g., "v28.0..HEAD") or SINCE (e.g., "2023-01-01").
#
# Tested on a recent macOS and Ubuntu 22.04.

# Strict mode: stop on command failure (-e), on use of an unset variable (-u),
# and when any stage of a pipeline fails (pipefail).
set -euo pipefail

# ---- Config (override via env) ----
# WORKDIR holds both clones; KNOTS_DIR / CORE_DIR are derived from it.
WORKDIR="${WORKDIR:-bitcoin-loc-compare}"
KNOTS_REPO="${KNOTS_REPO:-https://github.com/bitcoinknots/bitcoin.git}"
CORE_REPO="${CORE_REPO:-https://github.com/bitcoin/bitcoin.git}"
KNOTS_DIR="$WORKDIR/knots"
CORE_DIR="$WORKDIR/core"

# "more than a single line" => > 1 (adds+deletes)
THRESHOLD="${THRESHOLD:-1}"

# Optional scope limiters for BOTH repos (leave empty for full HEAD history):
#   RANGE="v28.0..HEAD"   or   SINCE="2023-01-01"
RANGE="${RANGE:-}"
SINCE="${SINCE:-}"

# Context sizes
TOP_DIRS="${TOP_DIRS:-5}"
TOP_FILES="${TOP_FILES:-5}"
TOP_COMMITS="${TOP_COMMITS:-3}"

# These settings are consumed as numbers further down. Reject anything that is
# not a plain non-negative integer here: awk would otherwise silently fall back
# to a string comparison for a value such as THRESHOLD=abc.
for cfg_name in THRESHOLD TOP_DIRS TOP_FILES TOP_COMMITS; do
  case "${!cfg_name}" in
    '' | *[!0-9]*)
      echo "Error: $cfg_name must be a non-negative integer (got '${!cfg_name}')." >&2
      exit 2
      ;;
  esac
done
unset cfg_name

# Quiet & deterministic git output
export GIT_TERMINAL_PROMPT=0
export GIT_PAGER=cat
# Per-invocation git settings: print paths unescaped and emit log text as UTF-8.
GIT_CFG=(-c core.quotepath=off -c i18n.logOutputEncoding=UTF-8)

# ---- Dependency checks ----
# need TOOL
#   Exit the script with status 127 and a message on stderr unless TOOL can be
#   resolved by `command -v`. Returns 0 (silently) when the tool is available.
need() {
  if ! command -v "$1" >/dev/null 2>&1; then
    echo "Error: '$1' not found on PATH." >&2
    exit 127
  fi
}

# Check the external tools up front so a missing one fails immediately rather
# than partway through a long run. xargs, rm and mkdir are invoked elsewhere in
# this script and were previously missing from this list.
for tool in git awk sort join comm grep cut head wc mktemp xargs rm mkdir; do
  need "$tool"
done

# ---- workspace ----
mkdir -p "$WORKDIR"

# ---- Prep (quiet; FULL clones to avoid lazy blob fetch spam) ----

# ensure_repo LABEL URL DIR
#   Clone URL into DIR unless DIR/.git already exists. git's stdout is
#   discarded; its stderr is left visible so a failure explains itself.
#   Returns non-zero (after printing an error) when the clone fails.
ensure_repo() {
  local label=$1 url=$2 dir=$3

  if [ -d "$dir/.git" ]; then
    echo "Using existing $label repo at $dir"
    return 0
  fi

  echo "Cloning $label (quiet)…"
  if ! git "${GIT_CFG[@]}" clone -q --no-progress "$url" "$dir" >/dev/null; then
    echo "Error: failed to clone $label from $url into $dir." >&2
    return 1
  fi
}

# refresh_repo LABEL DIR
#   Fetch all remotes and tags into DIR, pruning deleted remote refs.
#   NOTE: `git fetch` only updates remote-tracking refs and tags; it does not
#   move the checked-out branch, so HEAD of a pre-existing clone stays where
#   it was.
#   Returns non-zero (after printing an error) when the fetch fails.
refresh_repo() {
  local label=$1 dir=$2

  if ! git -C "$dir" "${GIT_CFG[@]}" fetch -q --no-progress --all --tags --prune >/dev/null; then
    echo "Error: failed to fetch updates for $label in $dir." >&2
    return 1
  fi
}

ensure_repo "Knots" "$KNOTS_REPO" "$KNOTS_DIR"
ensure_repo "Core" "$CORE_REPO" "$CORE_DIR"

echo "Fetching latest (quiet)…"
refresh_repo "Knots" "$KNOTS_DIR"
refresh_repo "Core" "$CORE_DIR"

# ---- Build git log arg sets ----
# Each commit is introduced by a header line "--<name>|<email>" (mailmapped
# author fields), followed by its --numstat rows.
LOG_ARGS_BASE=(--no-merges --numstat '--format=--%aN|%aE' --date=short)
if [ -n "${SINCE:-}" ]; then
  LOG_ARGS_BASE=(--since="$SINCE" "${LOG_ARGS_BASE[@]}")
fi

# Range vs path selection (HEAD by default)
LOG_ARGS_REPO_PATH=("${LOG_ARGS_BASE[@]}" -- .)

if [ -n "${RANGE:-}" ]; then
  # Split RANGE on whitespace into separate revision arguments, but without
  # the pathname (glob) expansion an unquoted $RANGE would also undergo.
  RANGE_REVS=()
  read -r -a RANGE_REVS <<<"$RANGE"
  # The ${arr[@]+...} form keeps `set -u` happy on older bash when RANGE was
  # whitespace only and the array is therefore empty.
  LOG_ARGS_REPO_RANGE=("${LOG_ARGS_BASE[@]}" ${RANGE_REVS[@]+"${RANGE_REVS[@]}"} -- .)
fi

# ---- calc_loc: build "author\tLOC" list for a repo (mailmap-applied) ----
# Usage:   calc_loc REPO_DIR OUT_FILE
# Globals: GIT_CFG, RANGE, THRESHOLD, and LOG_ARGS_REPO_RANGE (when RANGE is
#          non-empty) or LOG_ARGS_REPO_PATH (otherwise) -- all read only.
# Output:  OUT_FILE is overwritten with one "<author key><TAB><LOC>" line per
#          author whose summed adds+deletes exceed THRESHOLD, byte-sorted.
#          The author key is "Name <email>", or just "Name" without an email.
# Returns: non-zero, after a message on stderr, when the pipeline fails
#          (a git failure is only detected while `pipefail` is in effect).
calc_loc() {
  local repo_dir=$1 out_file=$2
  local -a log_args

  if [ -n "$RANGE" ]; then
    log_args=("${LOG_ARGS_REPO_RANGE[@]}")
  else
    log_args=("${LOG_ARGS_REPO_PATH[@]}")
  fi

  # git's stderr is deliberately left visible: a bad RANGE or a broken repo
  # should explain itself instead of ending the run silently.
  if ! git -C "$repo_dir" "${GIT_CFG[@]}" log "${log_args[@]}" \
    | awk -v THRESHOLD="$THRESHOLD" -v OFS='\t' '
        BEGIN { FS = "\t" }

        # Header lines look like: --Name|Email
        /^--/ {
          header = substr($0, 3)
          # Split at the LAST "|" so that a "|" inside the author name stays
          # part of the name (assumes the email itself contains no "|").
          if (match(header, /\|[^|]*$/)) {
            author = substr(header, 1, RSTART - 1)
            email = substr(header, RSTART + 1)
          } else {
            author = header
            email = ""
          }
          key = (email != "" ? author " <" email ">" : author)
          next
        }

        # numstat rows: "<added>\t<deleted>\t<path>"; binary rows carry "-"
        # in the count columns and are skipped by the digit test.
        NF == 3 {
          if ($1 ~ /^[0-9]+$/ && $2 ~ /^[0-9]+$/) loc[key] += $1 + $2
          next
        }

        END {
          # "+ 0" forces a numeric comparison whatever THRESHOLD looks like.
          for (k in loc) if (loc[k] > THRESHOLD + 0) print k, loc[k]
        }
      ' \
    | LC_ALL=C sort -u >"$out_file"; then
    echo "Error: could not compute per-author LOC for '$repo_dir'." >&2
    return 1
  fi
}

# ---- author_context_knots_raw: raw context lines from Knots repo ----
# Usage:   author_context_knots_raw AUTHOR_KEY
#            AUTHOR_KEY is "Name <email>" or "Name", compared exactly.
# Globals: KNOTS_DIR, GIT_CFG, RANGE, SINCE (read only).
# Scope:   honours both SINCE and RANGE, so the figures cover the same commits
#          as the per-author totals.
# Emits unsorted lines on stdout, tagged as:
#   DIR<TAB><dir><TAB><loc>          (<dir> = first path component; files in
#                                     the repository root are grouped as ".")
#   FILE<TAB><path><TAB><loc>
#   COMM<TAB><date><TAB><sha><TAB><loc><TAB><subject>
# Note: every call walks the whole selected history of the Knots repo.
author_context_knots_raw() {
  local author_key=$1
  local repo_dir=$KNOTS_DIR
  # Header fields are separated by the ASCII unit separator (%x1f) because
  # both author names and commit subjects may legitimately contain "|".
  local -a log_args=(--no-merges --numstat
    '--format=@@@%aN%x1f%aE%x1f%H%x1f%ad%x1f%s' --date=short)
  local -a range_revs=()

  if [ -n "${SINCE:-}" ]; then
    log_args+=(--since="$SINCE")
  fi
  if [ -n "${RANGE:-}" ]; then
    # Whitespace-split RANGE into revisions without glob expansion.
    read -r -a range_revs <<<"$RANGE"
    log_args+=(${range_revs[@]+"${range_revs[@]}"})
  fi
  log_args+=(-- .)

  # The key travels through the environment, not `awk -v`, because -v would
  # interpret backslash escapes inside the author name.
  git -C "$repo_dir" "${GIT_CFG[@]}" log "${log_args[@]}" \
    | AUTHOR_KEY="$author_key" awk -v OFS='\t' '
        BEGIN { FS = "\t"; KEY = ENVIRON["AUTHOR_KEY"]; in_author = 0 }

        # header: @@@name<US>email<US>sha<US>date<US>subject
        /^@@@/ {
          split(substr($0, 4), h, "\037")
          name = h[1]; email = h[2]; sha = h[3]; date = h[4]; subj = h[5]
          gsub(/\t/, " ", subj)   # keep the emitted COMM row at five columns
          key = (email != "" ? name " <" email ">" : name)
          in_author = ((key "") == (KEY ""))   # force a string comparison
          if (in_author) {
            curr_sha = sha
            commit_loc[sha] = 0
            # Recorded here (not per numstat row) so commits that only touch
            # binary files still carry their date and subject.
            commit_date[sha] = date
            commit_subj[sha] = subj
          }
          next
        }

        in_author && NF == 3 {
          if ($1 ~ /^[0-9]+$/ && $2 ~ /^[0-9]+$/) {
            loc = $1 + $2
            path = $3
            if (index(path, "/") > 0) {
              dir = path
              sub(/\/.*/, "", dir)
            } else {
              dir = "."
            }
            dir_loc[dir] += loc
            file_loc[path] += loc
            commit_loc[curr_sha] += loc
          }
          next
        }

        END {
          for (k in dir_loc) print "DIR", k, dir_loc[k]
          for (k in file_loc) print "FILE", k, file_loc[k]
          for (k in commit_loc) print "COMM", commit_date[k], k, commit_loc[k], commit_subj[k]
        }
      '
}

# ---- Compute LOC per author for each repo ----
KNOTS_TSV="$WORKDIR/knots_authors_loc.tsv"
CORE_TSV="$WORKDIR/core_authors_loc.tsv"
# Unfiltered companions: every author with at least 1 LOC in the repo.
KNOTS_ANY_TSV="$WORKDIR/knots_authors_loc_any.tsv"
CORE_ANY_TSV="$WORKDIR/core_authors_loc_any.tsv"

# calc_loc keeps only authors above $THRESHOLD. The strict-zero rule, however,
# needs to know about ANY contribution to the other repo, so the raw lists are
# collected with the threshold temporarily lowered to 0 and the real threshold
# is applied afterwards. Filtering both sides before the set difference would
# report an author with, say, 1 LOC in Core as "0 LOC in Core".
requested_threshold="$THRESHOLD"
THRESHOLD=0
echo "Computing LOC by author (Knots)…"
calc_loc "$KNOTS_DIR" "$KNOTS_ANY_TSV"
echo "Computing LOC by author (Core)…"
calc_loc "$CORE_DIR" "$CORE_ANY_TSV"
THRESHOLD="$requested_threshold"
unset requested_threshold

# rows_above_threshold TSV_FILE
#   Print the "author<TAB>LOC" rows of TSV_FILE whose LOC exceeds $THRESHOLD,
#   byte-sorted on the author column (the order `join` needs below).
rows_above_threshold() {
  awk -F'\t' -v min="$THRESHOLD" '($2 + 0) > (min + 0)' "$1" \
    | LC_ALL=C sort -t $'\t' -k1,1
}

rows_above_threshold "$KNOTS_ANY_TSV" > "$KNOTS_TSV"
rows_above_threshold "$CORE_ANY_TSV" > "$CORE_TSV"

# ---- Build name-only sets (sorted) ----
# *.names      : authors above the threshold (candidates for the report)
# *_any.names  : authors with any LOC at all (used for the exclusion side)
cut -f1 "$KNOTS_TSV" | LC_ALL=C sort -u > "$WORKDIR/knots.names"
cut -f1 "$CORE_TSV" | LC_ALL=C sort -u > "$WORKDIR/core.names"
cut -f1 "$KNOTS_ANY_TSV" | LC_ALL=C sort -u > "$WORKDIR/knots_any.names"
cut -f1 "$CORE_ANY_TSV" | LC_ALL=C sort -u > "$WORKDIR/core_any.names"

# ---- Set differences (strict-zero exclusivity) ----
# comm -23 prints the lines that occur only in the first file.
LC_ALL=C comm -23 "$WORKDIR/knots.names" "$WORKDIR/core_any.names" > "$WORKDIR/knots_only.names"
LC_ALL=C comm -23 "$WORKDIR/core.names" "$WORKDIR/knots_any.names" > "$WORKDIR/core_only.names"

# ---- Attach LOC counts (join) ----
# Both inputs are already byte-sorted on the join field; join must run under
# the same C locale, otherwise it may judge them out of order and drop rows.
LC_ALL=C join -t $'\t' -1 1 -2 1 "$WORKDIR/knots_only.names" "$KNOTS_TSV" > "$WORKDIR/knots_only_with_loc.tsv"
LC_ALL=C join -t $'\t' -1 1 -2 1 "$WORKDIR/core_only.names" "$CORE_TSV" > "$WORKDIR/core_only_with_loc.tsv"

# ---- Output: summaries ----
# print_exclusive_summary SUBJECT OTHER TSV_FILE
#   Print a blank line, a heading naming the two sides, then either one
#   "author (LOC: n)" line per row of TSV_FILE (byte-sorted) followed by a
#   "(total: N)" line, or "(none)" when TSV_FILE is missing or empty.
#   TSV_FILE rows are "author<TAB>LOC". Reads the global THRESHOLD.
print_exclusive_summary() {
  local subject=$1 other=$2 tsv_file=$3

  echo
  echo "Developers with >$THRESHOLD LOC in $subject and 0 LOC in $other:"
  if [ -s "$tsv_file" ]; then
    awk -F'\t' '{printf "%s (LOC: %s)\n",$1,$2}' "$tsv_file" | LC_ALL=C sort
    # xargs strips the padding some wc implementations put around the count.
    echo "(total: $(wc -l < "$tsv_file" | xargs))"
  else
    echo "(none)"
  fi
}

print_exclusive_summary "Knots" "Core" "$WORKDIR/knots_only_with_loc.tsv"
print_exclusive_summary "Core" "Knots" "$WORKDIR/core_only_with_loc.tsv"

# ---- Detailed context for Knots-only contributors ----
# cleanup_tmp
#   Delete the scratch file named by the global _CTX_TMP, if that variable is
#   set and non-empty. Always returns 0 so it is safe to run from the EXIT trap.
cleanup_tmp() {
  if [ -n "${_CTX_TMP:-}" ]; then
    rm -f -- "$_CTX_TMP"
  fi
  return 0
}
# Run on every exit path so an aborted run does not leave the scratch file behind.
trap cleanup_tmp EXIT

# print_top TAG SORT_FIELD LIMIT CTX_FILE
#   Print the LIMIT rows of CTX_FILE tagged TAG (DIR, FILE or COMM) that have
#   the largest numeric value in tab-separated column SORT_FIELD, formatted for
#   the report, or " (none)" when CTX_FILE has no row with that tag.
#   The row limit is applied by awk, which reads its whole input: cutting the
#   stream with `head` could kill `sort` with SIGPIPE on large inputs, and
#   under `set -o pipefail` + `set -e` that would abort the entire script.
print_top() {
  local tag=$1 sort_field=$2 limit=$3 ctx_file=$4

  if ! grep -q "^${tag}"$'\t' "$ctx_file"; then
    echo " (none)"
    return 0
  fi

  grep "^${tag}"$'\t' "$ctx_file" \
    | LC_ALL=C sort -t $'\t' -k "${sort_field},${sort_field}nr" \
    | awk -F'\t' -v tag="$tag" -v limit="$limit" '
        NR > limit + 0 { next }
        # DIR <dir> <loc>
        tag == "DIR"  { printf " - %-20s (LOC: %s)\n",$2,$3 }
        # FILE <path> <loc>
        tag == "FILE" { printf " - %s (LOC: %s)\n",$2,$3 }
        # COMM <date> <sha> <loc> <subject>; the sha is shortened to 12 chars
        tag == "COMM" { printf " - %s %s (LOC: %s) — %s\n",$2,substr($3,1,12),$4,$5 }
      '
}

if [ -s "$WORKDIR/knots_only_with_loc.tsv" ]; then
  echo
  echo "===== Knots-only contributor context ====="
  # One section per "author<TAB>LOC" row of the Knots-only list.
  while IFS=$'\t' read -r author_key loc; do
    echo
    echo "$author_key — total LOC in Knots: $loc"

    # _CTX_TMP is global on purpose: the EXIT trap removes the file it names
    # if the script stops in the middle of an iteration.
    _CTX_TMP="$(mktemp -t knotsctx.XXXXXX)"
    author_context_knots_raw "$author_key" > "$_CTX_TMP"

    # Top directories
    echo " Top directories:"
    print_top DIR 3 "$TOP_DIRS" "$_CTX_TMP"

    # Top files
    echo " Top files:"
    print_top FILE 3 "$TOP_FILES" "$_CTX_TMP"

    # Top commits (by LOC)
    echo " Top commits:"
    print_top COMM 4 "$TOP_COMMITS" "$_CTX_TMP"

    rm -f "$_CTX_TMP"
    _CTX_TMP=""
  done < "$WORKDIR/knots_only_with_loc.tsv"
fi

# Closing report: the result files left in $WORKDIR, one per line, each with a
# short description of its contents.
echo
printf '%s\n' \
  "Files written:" \
  " $KNOTS_TSV # Knots authors with LOC (> $THRESHOLD)" \
  " $CORE_TSV # Core authors with LOC (> $THRESHOLD)" \
  " $WORKDIR/knots_only_with_loc.tsv # Knots-only authors + LOC" \
  " $WORKDIR/core_only_with_loc.tsv # Core-only authors + LOC"