#!/usr/bin/env python3
"""
Git Metrics Analyzer - Derive commit statistics and developer activity metrics from git history
Similar to GitClear's research on developer commit patterns and annual days active
"""

import subprocess
import json
import argparse
import shlex
from datetime import datetime, timedelta
from collections import defaultdict
import statistics
import sys
from pathlib import Path

# ASCII "unit separator": used between git log fields because, unlike '|',
# it does not realistically occur inside author names or e-mail addresses.
_FIELD_SEPARATOR = "\x1f"


class GitMetricsAnalyzer:
    """Collect per-developer commit and active-day statistics from a git repository."""

    def __init__(self, repo_path="."):
        """Remember the repository location; no git command is run here."""
        self.repo_path = Path(repo_path)

    def run_git_command(self, cmd):
        """Execute a git command and return its stripped stdout.

        Args:
            cmd: Either a sequence of arguments (preferred) or a single
                command string. A string is tokenised with ``shlex.split``
                and executed WITHOUT a shell, so shell features such as
                pipes or globbing are not interpreted.

        Returns:
            The command's standard output with surrounding whitespace
            removed, or ``None`` if the command could not be started or
            exited with a non-zero status (the failure is reported on
            stderr).
        """
        args = shlex.split(cmd) if isinstance(cmd, str) else list(cmd)
        try:
            result = subprocess.run(
                args,
                cwd=self.repo_path,
                capture_output=True,
                text=True,
                # Decode explicitly so a non-UTF-8 locale or a stray byte in
                # an author name cannot abort the whole analysis.
                encoding="utf-8",
                errors="replace",
                check=True,
            )
        except subprocess.CalledProcessError as e:
            detail = (e.stderr or "").strip()
            message = f"Git command failed: {e}"
            if detail:
                message += f"\n{detail}"
            print(message, file=sys.stderr)
            return None
        except OSError as e:
            # Raised e.g. when the git executable or repo_path does not exist.
            print(f"Git command failed: {e}", file=sys.stderr)
            return None
        return result.stdout.strip()

    def get_commit_data(self, since_date=None, until_date=None):
        """Extract commit data from ``git log --all``.

        Args:
            since_date: Optional value forwarded to ``--since``.
            until_date: Optional value forwarded to ``--until``.

        Returns:
            A list of dicts with the keys ``hash``, ``author_email``,
            ``author_name``, ``date`` (author date as printed by ``%ai``)
            and ``timestamp`` (author date as an int UNIX timestamp).
            Returns an empty list when git fails or prints nothing.
        """
        # Fields: hash, author_email, author_name, date, timestamp,
        # separated by %x1f (git expands it to the 0x1f byte).
        format_string = "%x1f".join(["%H", "%ae", "%an", "%ai", "%at"])
        cmd = ["git", "log", "--all", f"--format={format_string}"]
        # Passing each option as its own argv entry keeps user-supplied
        # dates away from any shell parsing.
        if since_date:
            cmd.append(f"--since={since_date}")
        if until_date:
            cmd.append(f"--until={until_date}")

        output = self.run_git_command(cmd)
        if not output:
            return []

        commits = []
        for line in output.split("\n"):
            if not line:
                continue
            parts = line.split(_FIELD_SEPARATOR)
            if len(parts) != 5:
                continue
            try:
                timestamp = int(parts[4])
            except ValueError:
                # Skip a malformed record instead of aborting the analysis.
                continue
            commits.append({
                "hash": parts[0],
                "author_email": parts[1],
                "author_name": parts[2],
                "date": parts[3],
                "timestamp": timestamp,
            })
        return commits

    def calculate_developer_metrics(self, commits):
        """Group commits by author e-mail and track each author's activity.

        Args:
            commits: Iterable of commit dicts as produced by
                ``get_commit_data`` (``author_email`` and ``date`` are read).

        Returns:
            A mapping of author e-mail to a dict with ``commits`` (list of
            that author's commit dicts), ``active_dates`` (set of
            ``datetime.date``), and ``first_commit`` / ``last_commit``
            (earliest / latest commit day as a midnight ``datetime``).
        """
        developer_data = defaultdict(lambda: {
            "commits": [],
            "active_dates": set(),
            "first_commit": None,
            "last_commit": None,
        })

        for commit in commits:
            entry = developer_data[commit["author_email"]]
            # Only the calendar-day part of the author date is used.
            commit_date = datetime.fromisoformat(commit["date"].split()[0])

            entry["commits"].append(commit)
            entry["active_dates"].add(commit_date.date())

            # git log lists newest commits first, so encounter order cannot
            # be used to decide which commit is first or last; compare.
            if entry["first_commit"] is None or commit_date < entry["first_commit"]:
                entry["first_commit"] = commit_date
            if entry["last_commit"] is None or commit_date > entry["last_commit"]:
                entry["last_commit"] = commit_date

        return developer_data

    def calculate_annual_metrics(self, developer_data):
        """Calculate per-year commit and active-day counts for each developer.

        Args:
            developer_data: Mapping as returned by
                ``calculate_developer_metrics`` (only ``commits`` is read).

        Returns:
            ``{author: {year: {'commits': int, 'active_days_count': int}}}``
            built from plain dicts.
        """
        annual_metrics = {}

        for author, data in developer_data.items():
            per_year = defaultdict(lambda: {"commits": 0, "active_days": set()})
            for commit in data["commits"]:
                day = datetime.fromisoformat(commit["date"].split()[0]).date()
                bucket = per_year[day.year]
                bucket["commits"] += 1
                bucket["active_days"].add(day)

            # Replace each set of days by its size so the result is
            # JSON-serialisable.
            annual_metrics[author] = {
                year: {
                    "commits": bucket["commits"],
                    "active_days_count": len(bucket["active_days"]),
                }
                for year, bucket in per_year.items()
            }

        return annual_metrics

    def calculate_percentiles(self, values, percentiles=(10, 25, 50, 75, 90, 95, 99)):
        """Calculate percentiles for a list of values.

        For each percentile ``p`` the element at index
        ``int(len(values) * p / 100)`` of the sorted values is returned,
        clamped to the last element. No interpolation is performed.

        Args:
            values: Iterable of comparable values.
            percentiles: Percentile ranks to evaluate.

        Returns:
            A dict mapping ``'p<rank>'`` to the selected value, or an empty
            dict when ``values`` is empty.
        """
        if not values:
            return {}

        sorted_values = sorted(values)
        last_index = len(sorted_values) - 1

        return {
            f"p{p}": sorted_values[min(int(len(sorted_values) * p / 100), last_index)]
            for p in percentiles
        }

    def analyze_repository(self, since_date=None, until_date=None):
        """Run the full analysis and return the aggregated results.

        Args:
            since_date: Optional lower date bound forwarded to git.
            until_date: Optional upper date bound forwarded to git.

        Returns:
            A dict with ``summary``, the three percentile tables and
            ``developer_metrics``; ``None`` when no commits were found.
        """
        print(f"Analyzing repository: {self.repo_path}")
        print("-" * 50)

        # Get all commits
        commits = self.get_commit_data(since_date, until_date)
        if not commits:
            print("No commits found in the repository.")
            return None

        print(f"Total commits analyzed: {len(commits)}")

        # Calculate developer metrics
        developer_data = self.calculate_developer_metrics(commits)
        print(f"Total unique contributors: {len(developer_data)}")

        # Calculate annual metrics
        annual_metrics = self.calculate_annual_metrics(developer_data)

        # Aggregate statistics
        all_commit_counts = [len(data["commits"]) for data in developer_data.values()]
        all_active_days = [len(data["active_dates"]) for data in developer_data.values()]
        annual_active_days = [
            year_stats["active_days_count"]
            for per_year in annual_metrics.values()
            for year_stats in per_year.values()
        ]

        # Order by UNIX timestamp: the textual dates carry differing UTC
        # offsets, so comparing them as strings can pick the wrong commit.
        earliest = min(commits, key=lambda c: c["timestamp"])
        latest = max(commits, key=lambda c: c["timestamp"])

        return {
            "summary": {
                "total_commits": len(commits),
                "unique_contributors": len(developer_data),
                "date_range": {
                    "first_commit": earliest["date"],
                    "last_commit": latest["date"],
                },
            },
            "commit_count_percentiles": self.calculate_percentiles(all_commit_counts),
            "total_active_days_percentiles": self.calculate_percentiles(all_active_days),
            "annual_active_days_percentiles": self.calculate_percentiles(annual_active_days),
            "developer_metrics": {
                author: {
                    "total_commits": len(data["commits"]),
                    "total_active_days": len(data["active_dates"]),
                    "annual_breakdown": annual_metrics.get(author, {}),
                }
                for author, data in developer_data.items()
            },
        }

    def print_results(self, results, top=10):
        """Pretty print the analysis results.

        Args:
            results: Dict as returned by ``analyze_repository``; nothing is
                printed when it is empty or ``None``.
            top: How many contributors to list in the ranking (negative
                values are treated as 0).
        """
        if not results:
            return

        top = max(top, 0)
        summary = results["summary"]
        first_day = summary["date_range"]["first_commit"][:10]
        last_day = summary["date_range"]["last_commit"][:10]

        print("\n" + "=" * 60)
        print("GIT METRICS ANALYSIS RESULTS")
        print("=" * 60)

        # Summary
        print("\nRepository Summary:")
        print(f"  Total Commits: {summary['total_commits']:,}")
        print(f"  Unique Contributors: {summary['unique_contributors']}")
        print(f"  Date Range: {first_day} to {last_day}")

        # Commit Count Percentiles
        print("\nCommit Count Percentiles (per developer):")
        for percentile, value in results["commit_count_percentiles"].items():
            print(f"  {percentile}: {value} commits")

        # Active Days Percentiles
        print("\nTotal Active Days Percentiles (per developer):")
        for percentile, value in results["total_active_days_percentiles"].items():
            print(f"  {percentile}: {value} days")

        # Annual Active Days Percentiles
        print("\nAnnual Active Days Percentiles (developer-years):")
        for percentile, value in results["annual_active_days_percentiles"].items():
            print(f"  {percentile}: {value} days/year")

        # Top contributors
        print(f"\nTop {top} Contributors by Commit Count:")
        sorted_devs = sorted(
            results["developer_metrics"].items(),
            key=lambda item: item[1]["total_commits"],
            reverse=True,
        )[:top]

        for i, (author, metrics) in enumerate(sorted_devs, 1):
            print(
                f"  {i}. {author}: {metrics['total_commits']} commits, "
                f"{metrics['total_active_days']} active days"
            )

    def export_results(self, results, output_file):
        """Export results to a JSON file (written as UTF-8).

        Values that ``json`` cannot serialise natively are written via
        ``str()``.
        """
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(results, f, indent=2, default=str)
        print(f"\nResults exported to: {output_file}")


def _is_git_repository(repo_path):
    """Return True when repo_path has a .git entry or git recognises it as a repository."""
    if Path(repo_path).joinpath(".git").exists():
        return True
    # Fall back to git itself so bare repositories and sub-directories of a
    # work tree are accepted as well.
    try:
        subprocess.run(
            ["git", "rev-parse", "--git-dir"],
            cwd=repo_path,
            capture_output=True,
            check=True,
        )
    except (subprocess.CalledProcessError, OSError):
        return False
    return True


def main():
    """Parse command-line arguments, run the analysis, and print/export the results."""
    parser = argparse.ArgumentParser(
        description='Analyze git repository metrics similar to GitClear research'
    )
    parser.add_argument(
        'repo_path',
        nargs='?',
        default='.',
        help='Path to git repository (default: current directory)'
    )
    parser.add_argument(
        '--since',
        help='Analyze commits since this date (YYYY-MM-DD)'
    )
    parser.add_argument(
        '--until',
        help='Analyze commits until this date (YYYY-MM-DD)'
    )
    parser.add_argument(
        '--export',
        help='Export results to JSON file'
    )
    parser.add_argument(
        '--top',
        type=int,
        default=10,
        help='Number of top contributors to show (default: 10)'
    )

    args = parser.parse_args()

    # Verify git repository
    if not _is_git_repository(args.repo_path):
        print(f"Error: {args.repo_path} is not a git repository", file=sys.stderr)
        sys.exit(1)

    # Run analysis
    analyzer = GitMetricsAnalyzer(args.repo_path)
    results = analyzer.analyze_repository(args.since, args.until)

    # Display results
    if results:
        analyzer.print_results(results, top=args.top)

        # Export if requested
        if args.export:
            analyzer.export_results(results, args.export)


if __name__ == "__main__":
    main()