Skip to content

Instantly share code, notes, and snippets.

@danmoseley
Last active March 15, 2026 02:38
Show Gist options
  • Select an option

  • Save danmoseley/ecfdccef799ade09f53ebfaa1ef9b46e to your computer and use it in GitHub Desktop.

Select an option

Save danmoseley/ecfdccef799ade09f53ebfaa1ef9b46e to your computer and use it in GitHub Desktop.
PR Readiness Score: Empirical Weight Calibration (980 PRs, 11 dotnet repos)
"""
Dual-score analysis with linked issue data.
Score 1: Closeness to Merge (PR mechanical state)
Score 2: Deserves Attention (impact + urgency + effort-at-risk)
"""
import json
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
import warnings
# Silence sklearn/statsmodels convergence and deprecation chatter so the
# printed report stays readable.
warnings.filterwarnings('ignore')
# Input paths. linked_issues.json is written by the fetch script later in
# this file; the other two presumably come from earlier fetch steps (not
# shown here) — confirm against the data-collection scripts.
DATA_FILE = r"C:\git\pr_data\merged_pr_features.json"
MAINTAINERS_FILE = r"C:\git\pr_data\inferred_maintainers.json"
ISSUES_FILE = r"C:\git\pr_data\linked_issues.json"
def load():
    """Load PR features, maintainer inference, and linked-issue data.

    Reads the three JSON inputs, derives the dashboard sub-score columns
    (f_ci, f_approval, ...) and the per-PR linked-issue features, and
    returns a single pandas DataFrame with one row per PR.
    """
    with open(DATA_FILE, "r", encoding="utf-8") as f:
        features = json.load(f)
    with open(MAINTAINERS_FILE, "r", encoding="utf-8") as f:
        maint_data = json.load(f)
    with open(ISSUES_FILE, "r", encoding="utf-8") as f:
        issues_data = json.load(f)
    # Maintainers inferred from merge history: repo -> set of logins.
    repo_maint = {k: set(v) for k, v in maint_data["repo_maintainers"].items()}
    df = pd.DataFrame(features)
    # Community = author not in the repo's inferred maintainer set.
    df['is_community_inferred'] = df.apply(
        lambda r: r['author'].lower() not in repo_maint.get(r['repo'], set()), axis=1)
    # log1p keeps age usable as a regression outcome despite heavy skew.
    df['log_age'] = np.log1p(df['age_days'])
    _add_dashboard_subscores(df)
    issue_df = _build_issue_features(df, issues_data)
    df = pd.concat([df.reset_index(drop=True), issue_df], axis=1)
    return df


def _add_dashboard_subscores(df):
    """Add the dashboard sub-score columns (each in [0, 1]) to df in place."""
    # CI: Build Analysis conclusion; unknown states count as neutral 0.5.
    df['f_ci'] = df['build_analysis_conclusion'].map({
        'SUCCESS': 1.0, 'ABSENT': 0.5, 'IN_PROGRESS': 0.5, 'FAILURE': 0.0
    }).fillna(0.5)
    # Approval: owner approval plus a second approval is the strongest state.
    df['f_approval'] = df.apply(lambda r:
        1.0 if (r['approval_count'] >= 2 and r['has_owner_approval']) else
        0.75 if r['has_owner_approval'] else
        0.5 if r['approval_count'] >= 1 else 0.0, axis=1)
    # Maintainer engagement: any review = 0.5, owner approval = 1.0.
    df['f_maint'] = 0.0
    df.loc[df['has_any_review'] & ~df['has_owner_approval'], 'f_maint'] = 0.5
    df.loc[df['has_owner_approval'], 'f_maint'] = 1.0
    # Feedback: any unresolved review thread halves the score.
    df['f_feedback'] = df['unresolved_threads'].apply(lambda x: 1.0 if x == 0 else 0.5)
    # Size buckets: small (<=5 files, <=200 lines), medium (<=20, <=500), large.
    df['f_size'] = df.apply(
        lambda r: 1.0 if (r['changed_files'] <= 5 and r['total_lines'] <= 200)
        else (0.5 if (r['changed_files'] <= 20 and r['total_lines'] <= 500) else 0.0),
        axis=1)
    df['f_community'] = df['is_community_inferred'].map({True: 0.5, False: 1.0})
    # Alignment: untriaged or missing area label scores 0.
    df['f_align'] = df.apply(
        lambda r: 0.0 if (r['is_untriaged'] or not r['has_area_label']) else 1.0, axis=1)
    # Discussion buckets by review-thread and distinct-commenter counts.
    df['f_discussion'] = df.apply(lambda r:
        1.0 if (r['total_threads'] <= 5 and r['distinct_commenters'] <= 2) else
        0.5 if (r['total_threads'] <= 15 and r['distinct_commenters'] <= 5) else 0.0, axis=1)


def _build_issue_features(df, issues_data):
    """Return a DataFrame of linked-issue features aligned row-for-row with df.

    issues_data maps "repo#number" -> summary dict produced by the fetch
    script (issues, pr_labels, has_milestone). PRs absent from the map get
    all-zero / False features.
    """
    regression_labels = {"regression-from-last-release", "regression",
                         "regression-from-previous-release", "Regression"}
    security_labels = {"security", "Security", "area-Security"}
    bug_labels = {"bug", "Bug"}
    issue_features = []
    for _, row in df.iterrows():
        key = f"{row['repo']}#{row['number']}"
        idata = issues_data.get(key, {})
        issues = idata.get("issues", [])
        # Pool PR labels with every linked issue's labels for urgency checks.
        all_labels = set(idata.get("pr_labels", []))
        for issue in issues:
            all_labels.update(issue.get("labels", []))
        issue_features.append({
            'has_linked_issue': len(issues) > 0,
            'linked_issue_count': len(issues),
            'total_issue_reactions': sum(i.get("reaction_count", 0) for i in issues),
            'total_issue_thumbsup': sum(i.get("thumbs_up", 0) for i in issues),
            'max_issue_reactions': max((i.get("reaction_count", 0) for i in issues), default=0),
            'max_issue_thumbsup': max((i.get("thumbs_up", 0) for i in issues), default=0),
            'total_issue_comments': sum(i.get("comment_count", 0) for i in issues),
            'total_cross_refs': sum(i.get("cross_ref_count", 0) for i in issues),
            'is_regression': bool(all_labels & regression_labels),
            'is_security': bool(all_labels & security_labels),
            'is_bug': bool(all_labels & bug_labels),
            'has_milestone': idata.get("has_milestone", False),
        })
    return pd.DataFrame(issue_features)
def descriptive_stats(df):
    """Print an overview of linked-issue coverage, plus per-repo rates."""
    banner = "=" * 70
    print(banner)
    print("LINKED ISSUE DATA OVERVIEW")
    print(banner)
    print(f"\n Total PRs: {len(df)}")
    print(f" PRs with linked issues: {df['has_linked_issue'].sum()} ({df['has_linked_issue'].mean()*100:.0f}%)")
    print(f" PRs with milestones: {df['has_milestone'].sum()} ({df['has_milestone'].mean()*100:.0f}%)")
    print(f" Regression-labeled: {df['is_regression'].sum()}")
    print(f" Security-labeled: {df['is_security'].sum()}")
    print(f" Bug-labeled: {df['is_bug'].sum()} ({df['is_bug'].mean()*100:.0f}%)")
    # Engagement averages only make sense over PRs that have linked issues.
    linked = df[df['has_linked_issue']]
    if not linked.empty:
        print(f"\n Among PRs WITH linked issues (n={len(linked)}):")
        print(f" Mean issue reactions: {linked['total_issue_reactions'].mean():.1f}")
        print(f" Mean issue thumbsup: {linked['total_issue_thumbsup'].mean():.1f}")
        print(f" Mean issue comments: {linked['total_issue_comments'].mean():.1f}")
        print(f" Mean cross-references: {linked['total_cross_refs'].mean():.1f}")
        print(f" Max reactions on any linked issue: {linked['max_issue_reactions'].max()}")
    print(f"\n Linked issue rate by repo:")
    for repo in sorted(df['repo'].unique()):
        sub = df[df['repo'] == repo]
        issue_pct = sub['has_linked_issue'].mean() * 100
        bug_pct = sub['is_bug'].mean() * 100
        milestone_pct = sub['has_milestone'].mean() * 100
        print(f" {repo}: issues={issue_pct:.0f}%, bugs={bug_pct:.0f}%, milestones={milestone_pct:.0f}%")
def test_issue_features_merge(df):
    """Do linked issue features predict merge speed?"""

    def stars(p):
        # Significance markers shared by both printed tables.
        for threshold, mark in ((0.001, '***'), (0.01, '**'), (0.05, '*')):
            if p < threshold:
                return mark
        return ''

    print("\n" + "="*70)
    print("DO ISSUE FEATURES PREDICT MERGE SPEED?")
    print("="*70)
    outcome = df['log_age']
    # Candidate single predictors: count-like features are log-transformed.
    candidates = {
        'has_linked_issue': df['has_linked_issue'].astype(float),
        'log_issue_reactions': np.log1p(df['total_issue_reactions']),
        'log_issue_thumbsup': np.log1p(df['total_issue_thumbsup']),
        'log_issue_comments': np.log1p(df['total_issue_comments']),
        'is_bug': df['is_bug'].astype(float),
        'has_milestone': df['has_milestone'].astype(float),
        'log_cross_refs': np.log1p(df['total_cross_refs']),
    }
    print(f"\n {'Feature':25s} {'R²':>6s} {'coef':>8s} {'t':>8s} {'p':>8s}")
    for name, predictor in candidates.items():
        # Univariate OLS: log_age ~ const + predictor.
        model = sm.OLS(outcome, sm.add_constant(predictor)).fit()
        pval = model.pvalues.iloc[1]
        print(f" {name:25s} {model.rsquared:>6.3f} {model.params.iloc[1]:>+8.3f} {model.tvalues.iloc[1]:>8.2f} {pval:>8.4f} {stars(pval)}")
    print(f"\n Do issue features add predictive power BEYOND dashboard sub-scores?")
    dash_features = ['f_ci', 'f_approval', 'f_maint', 'f_feedback',
                     'f_size', 'f_community', 'f_align', 'f_discussion']
    m_base = sm.OLS(outcome, sm.add_constant(df[dash_features])).fit()
    # Float copies of the issue features, stored as columns so they can be
    # combined with the dashboard sub-scores in a single design matrix.
    df['log_issue_reactions_f'] = np.log1p(df['total_issue_reactions'].astype(float))
    df['has_linked_issue_f'] = df['has_linked_issue'].astype(float)
    df['is_bug_f'] = df['is_bug'].astype(float)
    df['has_milestone_f'] = df['has_milestone'].astype(float)
    issue_cols = ['has_linked_issue_f', 'log_issue_reactions_f', 'is_bug_f', 'has_milestone_f']
    m_full = sm.OLS(outcome, sm.add_constant(df[dash_features + issue_cols])).fit()
    print(f" Dashboard only: R² = {m_base.rsquared:.3f}")
    print(f" Dashboard + issue features: R² = {m_full.rsquared:.3f}")
    print(f" Incremental R²: {m_full.rsquared - m_base.rsquared:.3f}")
    print(f"\n Issue feature coefficients in combined model:")
    for col in issue_cols:
        if col in m_full.params:
            pval = m_full.pvalues[col]
            print(f" {col:25s}: coef={m_full.params[col]:+.3f}, t={m_full.tvalues[col]:.2f}, p={pval:.4f} {stars(pval)}")
def analyze_dual_scores(df):
    """Build and compare the two scores.

    Adds 'score_merge' (mechanical closeness to merge, 0-10) and
    'score_attention' (maintainer priority, 0-10) columns to df in place,
    then prints their correlation, a quadrant breakdown at the medians, and
    a feature-direction comparison between the two scores.
    """
    print("\n" + "="*70)
    print("DUAL SCORE ANALYSIS")
    print("="*70)
    # SCORE 1: CLOSENESS TO MERGE
    def merge_readiness(row):
        # Weighted sum of the dashboard sub-scores; the weights total 15.0,
        # so the result is rescaled to a 0-10 range.
        s = 0
        s += row['f_ci'] * 2.5
        s += row['f_approval'] * 2.5
        s += row['f_maint'] * 1.5
        s += row['f_feedback'] * 2.5
        s += row['f_size'] * 2.0
        s += row['f_community'] * 1.0
        s += row['f_discussion'] * 2.5
        s += row['f_align'] * 0.5
        return round(s / 15.0 * 10, 1)
    df['score_merge'] = df.apply(merge_readiness, axis=1)
    # SCORE 2: DESERVES ATTENTION
    def attention_score(row):
        # Additive points across urgency, community demand, effort-at-risk,
        # and blockers; capped at 10.
        s = 0
        # Urgency
        if row['is_regression']: s += 4.0
        if row['is_security']: s += 4.0
        if row['is_bug']: s += 1.0
        if row['has_milestone']: s += 1.0
        # Community demand
        if row['total_issue_thumbsup'] >= 10: s += 2.0
        elif row['total_issue_thumbsup'] >= 3: s += 1.0
        elif row['total_issue_reactions'] >= 5: s += 0.5
        if row['total_issue_comments'] >= 20: s += 1.5
        elif row['total_issue_comments'] >= 5: s += 0.5
        if row['total_cross_refs'] >= 3: s += 1.0
        elif row['total_cross_refs'] >= 1: s += 0.5
        # Effort at risk
        if row['is_community_inferred']: s += 2.0
        if row['has_any_review'] and row['f_approval'] < 0.75: s += 1.0
        if row['total_lines'] > 200: s += 0.5
        # Blockers
        if row['f_ci'] == 0: s += 1.0
        if row['unresolved_threads'] > 0: s += 1.0
        elif row['changes_requested_count'] > 0: s += 0.5
        if row['f_approval'] == 0: s += 1.5
        return round(min(s, 10.0), 1)
    df['score_attention'] = df.apply(attention_score, axis=1)
    # Negative correlation means the two scores surface different PRs.
    corr = df['score_merge'].corr(df['score_attention'])
    print(f"\n Correlation between scores: {corr:.3f}")
    print(f" ({'They surface different PRs' if corr < 0 else 'Some overlap expected'})")
    # Quadrant analysis: 2x2 split at the median of each score.
    merge_med = df['score_merge'].median()
    attn_med = df['score_attention'].median()
    q1 = df[(df['score_merge'] >= merge_med) & (df['score_attention'] >= attn_med)]
    q2 = df[(df['score_merge'] < merge_med) & (df['score_attention'] >= attn_med)]
    q3 = df[(df['score_merge'] >= merge_med) & (df['score_attention'] < attn_med)]
    q4 = df[(df['score_merge'] < merge_med) & (df['score_attention'] < attn_med)]
    # NOTE(review): any empty quadrant makes .median() print NaN here.
    print(f"""
QUADRANT ANALYSIS (split at median of each score):
┌──────────────────────────────┬──────────────────────────────┐
│ Q1: HIGH merge + HIGH attn │ Q2: LOW merge + HIGH attn │
│ "Help across finish line" │ "Invest review time" │
│ n={len(q1):>4d} │ n={len(q2):>4d} │
│ median age: {q1['age_days'].median():>6.1f}d │ median age: {q2['age_days'].median():>6.1f}d │
│ community: {q1['is_community_inferred'].mean()*100:>5.0f}% │ community: {q2['is_community_inferred'].mean()*100:>5.0f}% │
│ median lines: {q1['total_lines'].median():>6.0f} │ median lines: {q2['total_lines'].median():>6.0f} │
│ has_linked_issue: {q1['has_linked_issue'].mean()*100:>3.0f}% │ has_linked_issue: {q2['has_linked_issue'].mean()*100:>3.0f}% │
│ has_milestone: {q1['has_milestone'].mean()*100:>3.0f}% │ has_milestone: {q2['has_milestone'].mean()*100:>3.0f}% │
│ is_bug: {q1['is_bug'].mean()*100:>3.0f}% │ is_bug: {q2['is_bug'].mean()*100:>3.0f}% │
├──────────────────────────────┼──────────────────────────────┤
│ Q3: HIGH merge + LOW attn │ Q4: LOW merge + LOW attn │
│ "Will merge on its own" │ "Deprioritize / close" │
│ n={len(q3):>4d} │ n={len(q4):>4d} │
│ median age: {q3['age_days'].median():>6.1f}d │ median age: {q4['age_days'].median():>6.1f}d │
│ community: {q3['is_community_inferred'].mean()*100:>5.0f}% │ community: {q4['is_community_inferred'].mean()*100:>5.0f}% │
│ median lines: {q3['total_lines'].median():>6.0f} │ median lines: {q4['total_lines'].median():>6.0f} │
│ has_linked_issue: {q3['has_linked_issue'].mean()*100:>3.0f}% │ has_linked_issue: {q4['has_linked_issue'].mean()*100:>3.0f}% │
│ has_milestone: {q3['has_milestone'].mean()*100:>3.0f}% │ has_milestone: {q4['has_milestone'].mean()*100:>3.0f}% │
│ is_bug: {q3['is_bug'].mean()*100:>3.0f}% │ is_bug: {q4['is_bug'].mean()*100:>3.0f}% │
└──────────────────────────────┴──────────────────────────────┘
Q1 = Priority: close to merge AND needs help -> review/merge now
Q2 = Investment: far from merge but deserving -> schedule review time
Q3 = Autopilot: close to merge, self-service -> let it flow
Q4 = Triage: far from merge, low priority -> close or deprioritize
""")
    # Feature direction comparison: hand-curated summary of how each input
    # pulls the two scores (rows below are static text, not computed).
    print(" FEATURE DIRECTION IN EACH SCORE:")
    print(f" {'Feature':22s} {'Merge Score':>14s} {'Attention Score':>16s} {'Tension':>10s}")
    print(" " + "-"*65)
    comparisons = [
        ("CI passing", "HIGH readiness", "failing=needs help", "Opposite"),
        ("Has approval", "HIGH readiness", "missing=needs review","Opposite"),
        ("Maintainer review", "HIGH readiness", "moderate signal", "Weak"),
        ("Unresolved threads", "LOW readiness", "needs response", "Aligned"),
        ("Small size", "HIGH readiness", "large=significant", "CONFLICT"),
        ("Internal author", "HIGH readiness", "community=waiting", "CONFLICT"),
        ("Few commenters", "HIGH readiness", "many=important", "CONFLICT"),
        ("Area labeled", "HIGH readiness", "triaged=good", "Aligned"),
        ("Issue reactions", "(not in score)", "HIGH=community need", "Attn only"),
        ("Bug/regression", "(not in score)", "HIGH urgency", "Attn only"),
        ("Milestone", "(not in score)", "has deadline", "Attn only"),
        ("Cross-references", "(not in score)", "broad impact", "Attn only"),
    ]
    for feat, merge, attn, tension in comparisons:
        print(f" {feat:22s} {merge:>14s} {attn:>16s} {tension:>10s}")
    print(f"""
PROPOSED ATTENTION SCORE COMPONENTS:
URGENCY (0-4 pts): regression +4, security +4, bug +1, milestone +1
COMMUNITY DEMAND (0-3): issue thumbsup, comments, cross-references
EFFORT-AT-RISK (0-3): community author +2, has reviews but no approval +1,
significant change +0.5
BLOCKED (0-2): CI failing +1, unresolved feedback +1, no approval +1.5
Key differences from merge score:
- Community PRs score HIGHER (they're waiting on us)
- Large/complex PRs score HIGHER (significant work at stake)
- Issue engagement is a NEW signal (not in merge score)
- CI failing scores HIGHER (needs help, not just "not ready")
- No penalty for many commenters (avoids death spiral)
""")
def main():
    """Load the merged dataset and run each analysis stage in order."""
    df = load()
    # Order matters: the later stages reuse columns the earlier ones rely on.
    for stage in (descriptive_stats, test_issue_features_merge, analyze_dual_scores):
        stage(df)
# Script entry point: run the full dual-score analysis.
if __name__ == "__main__":
    main()
"""
Fetch linked issue metadata for existing PR dataset.
For each PR, get closingIssuesReferences with reactions, labels, comments, age.
"""
import subprocess
import json
import time
import os
import sys
from datetime import datetime, timezone
# Source PR dataset to enumerate; OUTPUT_FILE doubles as a resume cache
# (PRs already present in it are skipped on rerun).
DATA_FILE = r"C:\git\pr_data\merged_pr_features.json"
OUTPUT_FILE = r"C:\git\pr_data\linked_issues.json"
def gh_graphql(query, retries=2):
    """Run a GraphQL query through the `gh` CLI and return the parsed JSON.

    Retries with linear backoff (5s, 10s, ...) on gateway errors (502/504)
    and on timeouts, up to `retries` extra attempts. Returns the decoded
    response dict, or None on any failure — callers treat None as a failed
    batch and move on.
    """
    cmd = ["gh", "api", "graphql", "-f", f"query={query}"]
    for attempt in range(retries + 1):
        try:
            result = subprocess.run(cmd, capture_output=True, timeout=120,
                                    encoding="utf-8", errors="replace")
        except subprocess.TimeoutExpired:
            # A hung gh invocation previously crashed the whole fetch run;
            # treat it like a transient gateway error instead.
            if attempt < retries:
                time.sleep(5 * (attempt + 1))
                continue
            return None
        if result.returncode != 0:
            err = (result.stderr or "")[:300]
            # GitHub gateway errors are transient; back off and retry.
            if ("504" in err or "502" in err) and attempt < retries:
                time.sleep(5 * (attempt + 1))
                continue
            return None
        if not result.stdout:
            return None
        try:
            return json.loads(result.stdout)
        except json.JSONDecodeError:
            # Truncated or garbled output: report failure rather than raise.
            return None
    return None
def fetch_issue_data_batch(owner, repo, pr_numbers):
    """Fetch linked issue data for a batch of PRs.

    Builds one GraphQL query that aliases each PR as pr0, pr1, ... so the
    whole batch costs a single gh invocation. Returns whatever gh_graphql
    returns (parsed response dict, or None on failure).
    """
    # Build individual PR queries
    # ({{ }} render as literal braces inside the f-string; GraphQL ignores
    # the flattened indentation of the query text.)
    fragments = []
    for i, num in enumerate(pr_numbers):
        fragments.append(f"""
pr{i}: pullRequest(number: {num}) {{
number
labels(first: 20) {{ nodes {{ name }} }}
milestone {{ title dueOn }}
closingIssuesReferences(first: 10) {{
nodes {{
number
title
createdAt
comments {{ totalCount }}
reactions {{ totalCount }}
reactionGroups {{
content
reactors {{ totalCount }}
}}
labels(first: 15) {{ nodes {{ name }} }}
milestone {{ title }}
timelineItems(itemTypes: [CROSS_REFERENCED_EVENT], first: 5) {{
totalCount
}}
}}
}}
}}
""")
    query = f"""
{{
repository(owner: "{owner}", name: "{repo}") {{
{"".join(fragments)}
}}
}}
"""
    return gh_graphql(query)
def main():
    """Fetch linked-issue metadata for every PR in DATA_FILE.

    Resumable: PRs already present in OUTPUT_FILE are skipped, and results
    are re-written to disk after every batch. Ends with summary stats over
    everything collected so far.
    """
    with open(DATA_FILE, "r", encoding="utf-8") as f:
        features = json.load(f)
    # Load existing results if resuming
    existing = {}
    if os.path.exists(OUTPUT_FILE):
        with open(OUTPUT_FILE, "r", encoding="utf-8") as f:
            existing = json.load(f)
    # Group PRs by repo
    repo_prs = {}
    for feat in features:
        repo = feat['repo']
        num = feat['number']
        key = f"{repo}#{num}"
        # Only queue PRs we haven't fetched in a previous run.
        if key not in existing:
            repo_prs.setdefault(repo, []).append(num)
    total_needed = sum(len(nums) for nums in repo_prs.values())
    print(f"Need to fetch issue data for {total_needed} PRs across {len(repo_prs)} repos")
    print(f"Already have {len(existing)} cached results")
    for repo_full, numbers in sorted(repo_prs.items()):
        owner, repo = repo_full.split("/")
        print(f"\n{repo_full}: {len(numbers)} PRs to fetch")
        # Batch in groups of 15 (GraphQL complexity limits)
        for batch_start in range(0, len(numbers), 15):
            batch = numbers[batch_start:batch_start + 15]
            data = fetch_issue_data_batch(owner, repo, batch)
            if not data or "data" not in data:
                # Failed batches are not retried here; rerunning the script
                # picks them up, since only successes land in `existing`.
                print(f" Failed batch starting at {batch_start}")
                time.sleep(2)
                continue
            repo_data = data["data"]["repository"]
            for i, num in enumerate(batch):
                # The batched query aliases each PR as pr0, pr1, ...
                pr_key = f"pr{i}"
                if pr_key in repo_data and repo_data[pr_key]:
                    pr_data = repo_data[pr_key]
                    key = f"{repo_full}#{num}"
                    # Extract issue summary
                    issues = pr_data.get("closingIssuesReferences", {}).get("nodes", [])
                    pr_labels = [l["name"] for l in pr_data.get("labels", {}).get("nodes", [])]
                    milestone = pr_data.get("milestone")
                    issue_summary = {
                        "pr_number": num,
                        "repo": repo_full,
                        "pr_labels": pr_labels,
                        "has_milestone": milestone is not None,
                        "milestone_title": milestone["title"] if milestone else None,
                        "linked_issue_count": len(issues),
                        "issues": []
                    }
                    for issue in issues:
                        issue_labels = [l["name"] for l in issue.get("labels", {}).get("nodes", [])]
                        reactions = issue.get("reactions", {}).get("totalCount", 0)
                        # Get thumbs-up specifically
                        thumbs_up = 0
                        for rg in issue.get("reactionGroups", []):
                            if rg["content"] == "THUMBS_UP":
                                thumbs_up = rg["reactors"]["totalCount"]
                        cross_refs = issue.get("timelineItems", {}).get("totalCount", 0)
                        issue_summary["issues"].append({
                            "number": issue["number"],
                            "title": issue.get("title", ""),
                            "created_at": issue.get("createdAt"),
                            "comment_count": issue.get("comments", {}).get("totalCount", 0),
                            "reaction_count": reactions,
                            "thumbs_up": thumbs_up,
                            "labels": issue_labels,
                            "cross_ref_count": cross_refs,
                            "has_milestone": issue.get("milestone") is not None,
                        })
                    existing[key] = issue_summary
            # Progress line every 4 batches (60 PRs).
            if (batch_start + 15) % 60 == 0:
                print(f" Fetched {min(batch_start + 15, len(numbers))}/{len(numbers)}")
            time.sleep(0.5)
            # Save incrementally
            with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
                json.dump(existing, f, indent=2)
    print(f"\nTotal: {len(existing)} PR issue records saved to {OUTPUT_FILE}")
    # Quick stats
    # NOTE(review): assumes at least one record exists — the percentage
    # prints below divide by len(existing).
    has_issues = sum(1 for v in existing.values() if v["linked_issue_count"] > 0)
    has_milestone = sum(1 for v in existing.values() if v["has_milestone"])
    total_reactions = sum(
        sum(i["reaction_count"] for i in v["issues"])
        for v in existing.values()
    )
    print(f"\nQuick stats:")
    print(f" PRs with linked issues: {has_issues} ({has_issues/len(existing)*100:.0f}%)")
    print(f" PRs with milestones: {has_milestone} ({has_milestone/len(existing)*100:.0f}%)")
    print(f" Total issue reactions: {total_reactions}")
    # Regression/security label check
    regression_labels = ["regression-from-last-release", "regression",
                         "regression-from-previous-release", "Regression"]
    security_labels = ["security", "Security", "area-Security"]
    has_regression = 0
    has_security = 0
    for v in existing.values():
        # Pool PR labels with all linked-issue labels, mirroring the
        # analysis script's urgency check.
        all_labels = set(v["pr_labels"])
        for issue in v["issues"]:
            all_labels.update(issue["labels"])
        if any(l in all_labels for l in regression_labels):
            has_regression += 1
        if any(l in all_labels for l in security_labels):
            has_security += 1
    print(f" Regression-labeled: {has_regression}")
    print(f" Security-labeled: {has_security}")
# Script entry point: fetch linked-issue metadata for the PR dataset.
if __name__ == "__main__":
    main()

PR Readiness Score: Empirical Weight Calibration

Summary

The PR dashboard uses a weighted composite of ~12 features to score open PRs for merge readiness. The weights were originally hypothesized. This experiment uses data from 980 recently merged PRs across 11 dotnet repos to empirically calibrate them.

TL;DR: Discussion complexity is massively underweighted (1.5 should be ~2.5-4.5). CI and maintainer-review are overweighted. Size and community matter more than expected. But the strongest statistical predictor (number of distinct commenters) creates a death-spiral problem if used naively in the score. A dual-score system (merge readiness + deserves attention) may be more useful than a single composite score.

Method

  • Fetched 980 merged PRs via GitHub GraphQL across 11 repos (runtime, aspnetcore, roslyn, sdk, maui, msbuild, extensions, machinelearning, aspire, winforms, wpf)
  • Extracted: reviews, check runs (Build Analysis specifically), review threads, labels, size, author classification
  • Inferred per-repo maintainers from mergedBy data (much more accurate than a static list)
  • Fetched linked issue metadata (reactions, labels, comments, milestones, cross-references)
  • Ran OLS regression, logistic regression, Random Forest, Gradient Boosting, Lasso, and Ridge
  • Bootstrap stability analysis (500 resamples)
  • Event-gap analysis (time from each event to merge)
  • Per-repo breakdowns
  • Dual-score analysis (merge readiness vs. deserves attention)

Dataset Overview

Stat Value
Total PRs 980
Repos 11
Median merge time 1.0 days
Merged within 1 day 51%
Merged within 7 days 82%
Community PRs (inferred) 34%
Has owner approval 50%
PRs with linked issues 15%
PRs with milestones 29%

Fastest repos: aspire (0.2d), winforms (0.2d). Slowest: maui (6.2d median, 76.5d mean).

Key Findings

1. Discussion/Complexity is the Dominant Predictor

The number of distinct commenters and review threads is by far the strongest predictor of merge speed, significant in 7 of 11 repos. The current weight of 1.5 dramatically understates its importance.

Decomposition of the discussion signal:

Component R-squared alone Interpretation
distinct_commenters 0.228 More stakeholders = slower
total_comments 0.234 Raw engagement volume
total_threads 0.118 Review thread count
changes_requested 0.117 Explicit review feedback
unresolved_threads 0.036 Active blockers
resolution_rate 0.018 Weak signal

Discussion adds 16.7% R-squared beyond all other features combined. It is not just a proxy for PR size (correlation with size = 0.36).

2. CI is a Gate, Not a Gradient

CI (Build Analysis specifically) doesn't appear predictive in naive regression (p=0.77 with overall CI status). But:

  • Using Build Analysis specifically (matching dashboard behavior) makes it significant (p=0.008 in BA-present repos)
  • Event-gap analysis shows BA pass is the last gate before merge 70% of the time
  • Median time from BA pass to merge: 0.6 hours (53% merge within 1h of CI passing)
  • The regression underestimates CI because all merged PRs eventually pass it (survivor bias)

Important: Build Analysis is absent in ~40% of repos (extensions, msbuild, winforms, wpf don't have it). And in maui, BA is red 78% of the time.

3. Maintainer Classification Matters

The static maintainers.json classified 78% of PR authors as "community." Inferring maintainers from mergedBy data (anyone who merged >=2 PRs) reduced this to 34% and made the community signal significant (p=0.002).

Per-repo maintainer-vs-community merge speed gaps:

  • runtime: 3.9d vs 1.2d (3.3x slower for community)
  • machinelearning: 3.9d vs 0.8d (4.9x slower)
  • roslyn: 0.1d vs 1.1d (community faster -- they tend to submit small PRs)

4. The Death Spiral Problem

Using raw thread count in the score creates a death spiral: more discussion on a PR lowers its score, reducing attention, making it staler. The data confirms high-discussion PRs are genuinely the most significant work:

Thread count Median lines Median files Median commenters Median age
0-5 47 3 1 0.9d
6-15 364 7 4 4.6d
>15 814 18 4 13.2d

Alternative metrics tested:

Metric R-squared (full model) Death spiral?
A. Current (raw count) 0.292 YES
D. Commenters only 0.315 Partial
E. Hybrid (unresolved + commenters) 0.219 No
C. Unresolved only 0.132 No

distinct_commenters alone actually predicts better than the current metric.

5. Per-Repo Dynamics Vary Significantly

Repo R-squared Top Predictors Median Age
sdk 0.61 discussion 1.0d
maui 0.58 discussion, size, community, align 5.5d
winforms 0.44 discussion, approval, size 0.2d
extensions 0.41 discussion 1.5d
aspnetcore 0.41 discussion 0.7d
aspire 0.36 discussion, size 0.2d
runtime 0.33 discussion, community 2.2d
roslyn 0.33 approval, size 0.8d
msbuild 0.26 approval 1.9d
machinelearning 0.20 size 1.7d
wpf 0.05 (none) 1.0d
  • msbuild & roslyn: Approval is the key gate (compiler teams need specific reviewers)
  • maui: Most complex dynamics; many factors matter
  • wpf: Essentially unpredictable from these features

6. Linked Issue Data and the Dual-Score Concept

We fetched closingIssuesReferences for all 980 PRs to test whether linked issue engagement (reactions, comments, cross-references, labels) adds predictive value.

Issue features predict merge speed -- but in the WRONG direction:

Feature R-squared Coefficient p-value
has_linked_issue 0.141 +1.321 <0.001
log_cross_refs 0.143 +1.072 <0.001
log_issue_comments 0.119 +0.819 <0.001
log_issue_reactions 0.070 +1.056 <0.001
is_bug 0.014 +1.230 <0.001
has_milestone 0.000 -0.009 0.915

Positive coefficients mean PRs with more issue engagement take LONGER to merge. This confirms issue engagement signals importance/complexity, not readiness. Adding issue features beyond dashboard sub-scores: R-squared goes from 0.292 to 0.340 (+0.048, i.e. 4.8 percentage points of variance explained).

This motivates a dual-score system:

Score 1: Merge Readiness -- how mechanically close is this PR to merging?

  • Inputs: CI, approvals, conflicts, size, resolved threads, alignment

Score 2: Deserves Attention -- how much should a maintainer prioritize this PR?

  • Inputs: urgency labels, issue engagement, community demand, effort-at-risk, blockers

The correlation between the two scores is -0.63 -- they genuinely surface different PRs. The negative correlation means PRs that "deserve attention" are typically NOT close to merging.

Quadrant Analysis (980 PRs)

HIGH Merge Readiness LOW Merge Readiness
HIGH Attention Q1: "Help across finish line" (n=176): median 0.9d, 72% community, 14 lines, 20% linked issues Q2: "Invest review time" (n=355): median 2.5d, 59% community, 173 lines, 26% linked issues
LOW Attention Q3: "Will merge on its own" (n=337): median 0.5d, 0% community, 44 lines, 3% linked issues Q4: "Deprioritize" (n=112): median 1.8d, 0% community, 358 lines, 4% linked issues
  • Q1 -- Community PRs that are small and nearly ready. Quickest wins for maintainer time.
  • Q2 -- Community PRs with complexity, far from merge. Need investment to unblock.
  • Q3 -- Internal small PRs, merge quickly without help. Autopilot.
  • Q4 -- Internal large PRs, taking time. Lower priority for active triage.

Feature Direction Tension

Several features point in OPPOSITE directions for the two scores:

Feature Merge Readiness Deserves Attention Tension
CI passing HIGH = ready Failing = needs help Opposite
Has approval HIGH = ready Missing = needs review Opposite
Small size HIGH = ready Large = significant work CONFLICT
Internal author HIGH = ready Community = waiting on us CONFLICT
Few commenters HIGH = ready Many = important to community CONFLICT
Issue reactions (not used) HIGH = community demand Attention only
Bug/regression label (not used) HIGH = urgency Attention only
Milestone (not used) Has deadline Attention only
Cross-references (not used) Broad impact Attention only

Proposed Attention Score Components

URGENCY (0-4 pts):       regression +4, security +4, bug +1, milestone +1
COMMUNITY DEMAND (0-3):  issue thumbsup (>=10: +2, >=3: +1), comments (>=20: +1.5),
                         cross-references (>=3: +1)
EFFORT-AT-RISK (0-3):    community author +2, has reviews but no approval +1,
                         large change (>200 lines) +0.5
BLOCKED (0-2):           CI failing +1, unresolved feedback +1, no approval +1.5

Key differences from merge readiness:

  • Community PRs score HIGHER (they're waiting on maintainer action)
  • Large/complex PRs score HIGHER (significant work at stake)
  • Issue engagement is a NEW signal (not in merge score)
  • CI failing scores HIGHER (needs help, not just "not ready")
  • No penalty for many commenters (avoids death spiral)

Recommended Weights (Single-Score System)

Feature Current Recommended Change Confidence Rationale
ciScore 3.0 2.5 -0.5 Moderate Gate (last gate 70%); but BA absent in many repos
conflictScore 3.0 3.0 0.0 N/A Hard gate; can't measure historically
approvalScore 2.0 2.5 +0.5 Moderate Gate (40% merge within 1h of approval)
maintScore 3.0 1.5 -1.5 Lower Overlaps approval; Lasso drops it; unstable
feedbackScore 2.0 2.5 +0.5 High Redesign: unresolved threads + changes_requested
discussionScore 1.5 2.5 +1.0 Very High Redesign: based on distinct_commenters; cap at 0.5 min
sizeScore 1.0 2.0 +1.0 High Significant in 6/11 repos
communityScore 0.5 1.0 +0.5 High Significant with inferred maintainers
stalenessScore 1.5 1.0 -0.5 Low Can't validate from post-merge data
freshScore 1.0 0.7 -0.3 Low Overlaps staleness
alignScore 1.0 0.5 -0.5 Lower Weak predictor; only 2/11 repos
velocityScore 0.5 0.3 -0.2 Low Can't validate
TOTAL 20.0 20.0

Design Recommendations

  1. Consider a dual-score system: A single score conflates "close to merge" with "needs attention." The dual-score analysis shows these are anti-correlated (r=-0.63). Display both, or let users toggle sort mode between "ready to merge" and "needs review."

  2. Split discussion into feedback + engagement: Separate "unresolved blocking feedback" (actionable, in feedbackScore) from "stakeholder complexity" (informational, in engagementScore). This avoids the death spiral.

  3. Cap engagement penalty: distinct_commenters should reduce score to 0.5 at worst, never 0.0. Complex PRs need attention, not burial.

  4. Use Build Analysis specifically for CI: Overall CI status is noise in repos like runtime where some leg is always red.

  5. Infer maintainers from merge history: mergedBy data is far more accurate than a static list.

  6. Consider showing complexity separately: Display thread/commenter count as a separate column to set expectations about timeline, rather than penalizing in the sort score.

  7. Incorporate linked issue engagement for attention: Issue reactions, cross-references, and bug/regression labels are strong signals for "deserves attention" even though they predict SLOWER merges.

Methodology Notes

What Worked Well

  • Multiple model types agree (OLS, RF, GB, Lasso, Logistic all give consistent rankings)
  • Bootstrap analysis confirms the discussion estimate is very stable (CV 8%); the size estimate is moderately stable (CV 36%)
  • Event-gap analysis captures gate behavior that regression misses
  • Robust across outcome definitions (merge within 1d, 7d, 30d all show same feature ranking)

Limitations

  • Survivor bias: Only merged PRs analyzed. Abandoned PRs would better show CI/conflict as blockers.
  • Snapshot, not trajectory: We see cumulative state at merge time, not the journey. A PR that went through 10 review rounds looks the same as one that was clean from the start.
  • ~65% unexplained: Reviewer timezone, release schedule, PR priority, and dependency chains are likely the dominant factors but aren't measurable from API data.
  • Temporal features untestable: staleness/freshness/velocity (3.0 combined current weight) can't be validated from post-merge data since all merged PRs are "fresh" at merge time.
  • Conflict untestable: Historical mergeability state not available. The 3.0 weight is an assumption.

Would More Data Help?

  • Global weights: Modestly. 980 PRs gives stable estimates for the top features.
  • Per-repo weights: Yes, especially for wpf (R-squared=0.05). 200+ per repo would be more reliable.
  • Temporal features: Needs a fundamentally different approach (time-series snapshots of open PRs).
  • Gate features: Need to include abandoned/closed-without-merge PRs.

Analysis scripts and data collection code available on request.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment