Skip to content

Instantly share code, notes, and snippets.

@danmoseley
Last active March 15, 2026 02:38
Show Gist options
  • Select an option

  • Save danmoseley/ecfdccef799ade09f53ebfaa1ef9b46e to your computer and use it in GitHub Desktop.

Select an option

Save danmoseley/ecfdccef799ade09f53ebfaa1ef9b46e to your computer and use it in GitHub Desktop.
PR Readiness Score: Empirical Weight Calibration (980 PRs, 11 dotnet repos)
"""
Dual-score analysis with linked issue data.
Score 1: Closeness to Merge (PR mechanical state)
Score 2: Deserves Attention (impact + urgency + effort-at-risk)
"""
import json
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
import warnings
# Silence sklearn/statsmodels convergence and deprecation chatter so the
# printed report stays readable.
warnings.filterwarnings('ignore')
# Input paths. linked_issues.json is written by the fetch script later in
# this file; the other two presumably come from earlier fetch steps (not
# shown here) — confirm against the data-collection scripts.
DATA_FILE = r"C:\git\pr_data\merged_pr_features.json"
MAINTAINERS_FILE = r"C:\git\pr_data\inferred_maintainers.json"
ISSUES_FILE = r"C:\git\pr_data\linked_issues.json"
def load():
    """Load PR features, maintainer inference, and linked-issue data.

    Reads the three JSON inputs, derives the dashboard sub-score columns
    (f_ci, f_approval, ...) and the per-PR linked-issue features, and
    returns a single pandas DataFrame with one row per PR.
    """
    with open(DATA_FILE, "r", encoding="utf-8") as f:
        features = json.load(f)
    with open(MAINTAINERS_FILE, "r", encoding="utf-8") as f:
        maint_data = json.load(f)
    with open(ISSUES_FILE, "r", encoding="utf-8") as f:
        issues_data = json.load(f)
    # Maintainers inferred from merge history: repo -> set of logins.
    repo_maint = {k: set(v) for k, v in maint_data["repo_maintainers"].items()}
    df = pd.DataFrame(features)
    # Community = author not in the repo's inferred maintainer set.
    df['is_community_inferred'] = df.apply(
        lambda r: r['author'].lower() not in repo_maint.get(r['repo'], set()), axis=1)
    # log1p keeps age usable as a regression outcome despite heavy skew.
    df['log_age'] = np.log1p(df['age_days'])
    _add_dashboard_subscores(df)
    issue_df = _build_issue_features(df, issues_data)
    df = pd.concat([df.reset_index(drop=True), issue_df], axis=1)
    return df


def _add_dashboard_subscores(df):
    """Add the dashboard sub-score columns (each in [0, 1]) to df in place."""
    # CI: Build Analysis conclusion; unknown states count as neutral 0.5.
    df['f_ci'] = df['build_analysis_conclusion'].map({
        'SUCCESS': 1.0, 'ABSENT': 0.5, 'IN_PROGRESS': 0.5, 'FAILURE': 0.0
    }).fillna(0.5)
    # Approval: owner approval plus a second approval is the strongest state.
    df['f_approval'] = df.apply(lambda r:
        1.0 if (r['approval_count'] >= 2 and r['has_owner_approval']) else
        0.75 if r['has_owner_approval'] else
        0.5 if r['approval_count'] >= 1 else 0.0, axis=1)
    # Maintainer engagement: any review = 0.5, owner approval = 1.0.
    df['f_maint'] = 0.0
    df.loc[df['has_any_review'] & ~df['has_owner_approval'], 'f_maint'] = 0.5
    df.loc[df['has_owner_approval'], 'f_maint'] = 1.0
    # Feedback: any unresolved review thread halves the score.
    df['f_feedback'] = df['unresolved_threads'].apply(lambda x: 1.0 if x == 0 else 0.5)
    # Size buckets: small (<=5 files, <=200 lines), medium (<=20, <=500), large.
    df['f_size'] = df.apply(
        lambda r: 1.0 if (r['changed_files'] <= 5 and r['total_lines'] <= 200)
        else (0.5 if (r['changed_files'] <= 20 and r['total_lines'] <= 500) else 0.0),
        axis=1)
    df['f_community'] = df['is_community_inferred'].map({True: 0.5, False: 1.0})
    # Alignment: untriaged or missing area label scores 0.
    df['f_align'] = df.apply(
        lambda r: 0.0 if (r['is_untriaged'] or not r['has_area_label']) else 1.0, axis=1)
    # Discussion buckets by review-thread and distinct-commenter counts.
    df['f_discussion'] = df.apply(lambda r:
        1.0 if (r['total_threads'] <= 5 and r['distinct_commenters'] <= 2) else
        0.5 if (r['total_threads'] <= 15 and r['distinct_commenters'] <= 5) else 0.0, axis=1)


def _build_issue_features(df, issues_data):
    """Return a DataFrame of linked-issue features aligned row-for-row with df.

    issues_data maps "repo#number" -> summary dict produced by the fetch
    script (issues, pr_labels, has_milestone). PRs absent from the map get
    all-zero / False features.
    """
    regression_labels = {"regression-from-last-release", "regression",
                         "regression-from-previous-release", "Regression"}
    security_labels = {"security", "Security", "area-Security"}
    bug_labels = {"bug", "Bug"}
    issue_features = []
    for _, row in df.iterrows():
        key = f"{row['repo']}#{row['number']}"
        idata = issues_data.get(key, {})
        issues = idata.get("issues", [])
        # Pool PR labels with every linked issue's labels for urgency checks.
        all_labels = set(idata.get("pr_labels", []))
        for issue in issues:
            all_labels.update(issue.get("labels", []))
        issue_features.append({
            'has_linked_issue': len(issues) > 0,
            'linked_issue_count': len(issues),
            'total_issue_reactions': sum(i.get("reaction_count", 0) for i in issues),
            'total_issue_thumbsup': sum(i.get("thumbs_up", 0) for i in issues),
            'max_issue_reactions': max((i.get("reaction_count", 0) for i in issues), default=0),
            'max_issue_thumbsup': max((i.get("thumbs_up", 0) for i in issues), default=0),
            'total_issue_comments': sum(i.get("comment_count", 0) for i in issues),
            'total_cross_refs': sum(i.get("cross_ref_count", 0) for i in issues),
            'is_regression': bool(all_labels & regression_labels),
            'is_security': bool(all_labels & security_labels),
            'is_bug': bool(all_labels & bug_labels),
            'has_milestone': idata.get("has_milestone", False),
        })
    return pd.DataFrame(issue_features)
def descriptive_stats(df):
    """Print an overview of linked-issue coverage, plus per-repo rates."""
    banner = "=" * 70
    print(banner)
    print("LINKED ISSUE DATA OVERVIEW")
    print(banner)
    print(f"\n Total PRs: {len(df)}")
    print(f" PRs with linked issues: {df['has_linked_issue'].sum()} ({df['has_linked_issue'].mean()*100:.0f}%)")
    print(f" PRs with milestones: {df['has_milestone'].sum()} ({df['has_milestone'].mean()*100:.0f}%)")
    print(f" Regression-labeled: {df['is_regression'].sum()}")
    print(f" Security-labeled: {df['is_security'].sum()}")
    print(f" Bug-labeled: {df['is_bug'].sum()} ({df['is_bug'].mean()*100:.0f}%)")
    # Engagement averages only make sense over PRs that have linked issues.
    linked = df[df['has_linked_issue']]
    if not linked.empty:
        print(f"\n Among PRs WITH linked issues (n={len(linked)}):")
        print(f" Mean issue reactions: {linked['total_issue_reactions'].mean():.1f}")
        print(f" Mean issue thumbsup: {linked['total_issue_thumbsup'].mean():.1f}")
        print(f" Mean issue comments: {linked['total_issue_comments'].mean():.1f}")
        print(f" Mean cross-references: {linked['total_cross_refs'].mean():.1f}")
        print(f" Max reactions on any linked issue: {linked['max_issue_reactions'].max()}")
    print(f"\n Linked issue rate by repo:")
    for repo in sorted(df['repo'].unique()):
        sub = df[df['repo'] == repo]
        issue_pct = sub['has_linked_issue'].mean() * 100
        bug_pct = sub['is_bug'].mean() * 100
        milestone_pct = sub['has_milestone'].mean() * 100
        print(f" {repo}: issues={issue_pct:.0f}%, bugs={bug_pct:.0f}%, milestones={milestone_pct:.0f}%")
def test_issue_features_merge(df):
    """Do linked issue features predict merge speed?"""

    def stars(p):
        # Significance markers shared by both printed tables.
        for threshold, mark in ((0.001, '***'), (0.01, '**'), (0.05, '*')):
            if p < threshold:
                return mark
        return ''

    print("\n" + "="*70)
    print("DO ISSUE FEATURES PREDICT MERGE SPEED?")
    print("="*70)
    outcome = df['log_age']
    # Candidate single predictors: count-like features are log-transformed.
    candidates = {
        'has_linked_issue': df['has_linked_issue'].astype(float),
        'log_issue_reactions': np.log1p(df['total_issue_reactions']),
        'log_issue_thumbsup': np.log1p(df['total_issue_thumbsup']),
        'log_issue_comments': np.log1p(df['total_issue_comments']),
        'is_bug': df['is_bug'].astype(float),
        'has_milestone': df['has_milestone'].astype(float),
        'log_cross_refs': np.log1p(df['total_cross_refs']),
    }
    print(f"\n {'Feature':25s} {'R²':>6s} {'coef':>8s} {'t':>8s} {'p':>8s}")
    for name, predictor in candidates.items():
        # Univariate OLS: log_age ~ const + predictor.
        model = sm.OLS(outcome, sm.add_constant(predictor)).fit()
        pval = model.pvalues.iloc[1]
        print(f" {name:25s} {model.rsquared:>6.3f} {model.params.iloc[1]:>+8.3f} {model.tvalues.iloc[1]:>8.2f} {pval:>8.4f} {stars(pval)}")
    print(f"\n Do issue features add predictive power BEYOND dashboard sub-scores?")
    dash_features = ['f_ci', 'f_approval', 'f_maint', 'f_feedback',
                     'f_size', 'f_community', 'f_align', 'f_discussion']
    m_base = sm.OLS(outcome, sm.add_constant(df[dash_features])).fit()
    # Float copies of the issue features, stored as columns so they can be
    # combined with the dashboard sub-scores in a single design matrix.
    df['log_issue_reactions_f'] = np.log1p(df['total_issue_reactions'].astype(float))
    df['has_linked_issue_f'] = df['has_linked_issue'].astype(float)
    df['is_bug_f'] = df['is_bug'].astype(float)
    df['has_milestone_f'] = df['has_milestone'].astype(float)
    issue_cols = ['has_linked_issue_f', 'log_issue_reactions_f', 'is_bug_f', 'has_milestone_f']
    m_full = sm.OLS(outcome, sm.add_constant(df[dash_features + issue_cols])).fit()
    print(f" Dashboard only: R² = {m_base.rsquared:.3f}")
    print(f" Dashboard + issue features: R² = {m_full.rsquared:.3f}")
    print(f" Incremental R²: {m_full.rsquared - m_base.rsquared:.3f}")
    print(f"\n Issue feature coefficients in combined model:")
    for col in issue_cols:
        if col in m_full.params:
            pval = m_full.pvalues[col]
            print(f" {col:25s}: coef={m_full.params[col]:+.3f}, t={m_full.tvalues[col]:.2f}, p={pval:.4f} {stars(pval)}")
def analyze_dual_scores(df):
    """Build and compare the two scores.

    Adds 'score_merge' (mechanical closeness to merge, 0-10) and
    'score_attention' (maintainer priority, 0-10) columns to df in place,
    then prints their correlation, a quadrant breakdown at the medians, and
    a feature-direction comparison between the two scores.
    """
    print("\n" + "="*70)
    print("DUAL SCORE ANALYSIS")
    print("="*70)
    # SCORE 1: CLOSENESS TO MERGE
    def merge_readiness(row):
        # Weighted sum of the dashboard sub-scores; the weights total 15.0,
        # so the result is rescaled to a 0-10 range.
        s = 0
        s += row['f_ci'] * 2.5
        s += row['f_approval'] * 2.5
        s += row['f_maint'] * 1.5
        s += row['f_feedback'] * 2.5
        s += row['f_size'] * 2.0
        s += row['f_community'] * 1.0
        s += row['f_discussion'] * 2.5
        s += row['f_align'] * 0.5
        return round(s / 15.0 * 10, 1)
    df['score_merge'] = df.apply(merge_readiness, axis=1)
    # SCORE 2: DESERVES ATTENTION
    def attention_score(row):
        # Additive points across urgency, community demand, effort-at-risk,
        # and blockers; capped at 10.
        s = 0
        # Urgency
        if row['is_regression']: s += 4.0
        if row['is_security']: s += 4.0
        if row['is_bug']: s += 1.0
        if row['has_milestone']: s += 1.0
        # Community demand
        if row['total_issue_thumbsup'] >= 10: s += 2.0
        elif row['total_issue_thumbsup'] >= 3: s += 1.0
        elif row['total_issue_reactions'] >= 5: s += 0.5
        if row['total_issue_comments'] >= 20: s += 1.5
        elif row['total_issue_comments'] >= 5: s += 0.5
        if row['total_cross_refs'] >= 3: s += 1.0
        elif row['total_cross_refs'] >= 1: s += 0.5
        # Effort at risk
        if row['is_community_inferred']: s += 2.0
        if row['has_any_review'] and row['f_approval'] < 0.75: s += 1.0
        if row['total_lines'] > 200: s += 0.5
        # Blockers
        if row['f_ci'] == 0: s += 1.0
        if row['unresolved_threads'] > 0: s += 1.0
        elif row['changes_requested_count'] > 0: s += 0.5
        if row['f_approval'] == 0: s += 1.5
        return round(min(s, 10.0), 1)
    df['score_attention'] = df.apply(attention_score, axis=1)
    # Negative correlation means the two scores surface different PRs.
    corr = df['score_merge'].corr(df['score_attention'])
    print(f"\n Correlation between scores: {corr:.3f}")
    print(f" ({'They surface different PRs' if corr < 0 else 'Some overlap expected'})")
    # Quadrant analysis: 2x2 split at the median of each score.
    merge_med = df['score_merge'].median()
    attn_med = df['score_attention'].median()
    q1 = df[(df['score_merge'] >= merge_med) & (df['score_attention'] >= attn_med)]
    q2 = df[(df['score_merge'] < merge_med) & (df['score_attention'] >= attn_med)]
    q3 = df[(df['score_merge'] >= merge_med) & (df['score_attention'] < attn_med)]
    q4 = df[(df['score_merge'] < merge_med) & (df['score_attention'] < attn_med)]
    # NOTE(review): any empty quadrant makes .median() print NaN here.
    print(f"""
QUADRANT ANALYSIS (split at median of each score):
┌──────────────────────────────┬──────────────────────────────┐
│ Q1: HIGH merge + HIGH attn │ Q2: LOW merge + HIGH attn │
│ "Help across finish line" │ "Invest review time" │
│ n={len(q1):>4d} │ n={len(q2):>4d} │
│ median age: {q1['age_days'].median():>6.1f}d │ median age: {q2['age_days'].median():>6.1f}d │
│ community: {q1['is_community_inferred'].mean()*100:>5.0f}% │ community: {q2['is_community_inferred'].mean()*100:>5.0f}% │
│ median lines: {q1['total_lines'].median():>6.0f} │ median lines: {q2['total_lines'].median():>6.0f} │
│ has_linked_issue: {q1['has_linked_issue'].mean()*100:>3.0f}% │ has_linked_issue: {q2['has_linked_issue'].mean()*100:>3.0f}% │
│ has_milestone: {q1['has_milestone'].mean()*100:>3.0f}% │ has_milestone: {q2['has_milestone'].mean()*100:>3.0f}% │
│ is_bug: {q1['is_bug'].mean()*100:>3.0f}% │ is_bug: {q2['is_bug'].mean()*100:>3.0f}% │
├──────────────────────────────┼──────────────────────────────┤
│ Q3: HIGH merge + LOW attn │ Q4: LOW merge + LOW attn │
│ "Will merge on its own" │ "Deprioritize / close" │
│ n={len(q3):>4d} │ n={len(q4):>4d} │
│ median age: {q3['age_days'].median():>6.1f}d │ median age: {q4['age_days'].median():>6.1f}d │
│ community: {q3['is_community_inferred'].mean()*100:>5.0f}% │ community: {q4['is_community_inferred'].mean()*100:>5.0f}% │
│ median lines: {q3['total_lines'].median():>6.0f} │ median lines: {q4['total_lines'].median():>6.0f} │
│ has_linked_issue: {q3['has_linked_issue'].mean()*100:>3.0f}% │ has_linked_issue: {q4['has_linked_issue'].mean()*100:>3.0f}% │
│ has_milestone: {q3['has_milestone'].mean()*100:>3.0f}% │ has_milestone: {q4['has_milestone'].mean()*100:>3.0f}% │
│ is_bug: {q3['is_bug'].mean()*100:>3.0f}% │ is_bug: {q4['is_bug'].mean()*100:>3.0f}% │
└──────────────────────────────┴──────────────────────────────┘
Q1 = Priority: close to merge AND needs help -> review/merge now
Q2 = Investment: far from merge but deserving -> schedule review time
Q3 = Autopilot: close to merge, self-service -> let it flow
Q4 = Triage: far from merge, low priority -> close or deprioritize
""")
    # Feature direction comparison: hand-curated summary of how each input
    # pulls the two scores (rows below are static text, not computed).
    print(" FEATURE DIRECTION IN EACH SCORE:")
    print(f" {'Feature':22s} {'Merge Score':>14s} {'Attention Score':>16s} {'Tension':>10s}")
    print(" " + "-"*65)
    comparisons = [
        ("CI passing", "HIGH readiness", "failing=needs help", "Opposite"),
        ("Has approval", "HIGH readiness", "missing=needs review","Opposite"),
        ("Maintainer review", "HIGH readiness", "moderate signal", "Weak"),
        ("Unresolved threads", "LOW readiness", "needs response", "Aligned"),
        ("Small size", "HIGH readiness", "large=significant", "CONFLICT"),
        ("Internal author", "HIGH readiness", "community=waiting", "CONFLICT"),
        ("Few commenters", "HIGH readiness", "many=important", "CONFLICT"),
        ("Area labeled", "HIGH readiness", "triaged=good", "Aligned"),
        ("Issue reactions", "(not in score)", "HIGH=community need", "Attn only"),
        ("Bug/regression", "(not in score)", "HIGH urgency", "Attn only"),
        ("Milestone", "(not in score)", "has deadline", "Attn only"),
        ("Cross-references", "(not in score)", "broad impact", "Attn only"),
    ]
    for feat, merge, attn, tension in comparisons:
        print(f" {feat:22s} {merge:>14s} {attn:>16s} {tension:>10s}")
    print(f"""
PROPOSED ATTENTION SCORE COMPONENTS:
URGENCY (0-4 pts): regression +4, security +4, bug +1, milestone +1
COMMUNITY DEMAND (0-3): issue thumbsup, comments, cross-references
EFFORT-AT-RISK (0-3): community author +2, has reviews but no approval +1,
significant change +0.5
BLOCKED (0-2): CI failing +1, unresolved feedback +1, no approval +1.5
Key differences from merge score:
- Community PRs score HIGHER (they're waiting on us)
- Large/complex PRs score HIGHER (significant work at stake)
- Issue engagement is a NEW signal (not in merge score)
- CI failing scores HIGHER (needs help, not just "not ready")
- No penalty for many commenters (avoids death spiral)
""")
def main():
    """Load the merged dataset and run each analysis stage in order."""
    df = load()
    # Order matters: the later stages reuse columns the earlier ones rely on.
    for stage in (descriptive_stats, test_issue_features_merge, analyze_dual_scores):
        stage(df)
# Script entry point: run the full dual-score analysis.
if __name__ == "__main__":
    main()
"""
Fetch linked issue metadata for existing PR dataset.
For each PR, get closingIssuesReferences with reactions, labels, comments, age.
"""
import subprocess
import json
import time
import os
import sys
from datetime import datetime, timezone
# Source PR dataset to enumerate; OUTPUT_FILE doubles as a resume cache
# (PRs already present in it are skipped on rerun).
DATA_FILE = r"C:\git\pr_data\merged_pr_features.json"
OUTPUT_FILE = r"C:\git\pr_data\linked_issues.json"
def gh_graphql(query, retries=2):
    """Run a GraphQL query through the `gh` CLI and return the parsed JSON.

    Retries with linear backoff (5s, 10s, ...) on gateway errors (502/504)
    and on timeouts, up to `retries` extra attempts. Returns the decoded
    response dict, or None on any failure — callers treat None as a failed
    batch and move on.
    """
    cmd = ["gh", "api", "graphql", "-f", f"query={query}"]
    for attempt in range(retries + 1):
        try:
            result = subprocess.run(cmd, capture_output=True, timeout=120,
                                    encoding="utf-8", errors="replace")
        except subprocess.TimeoutExpired:
            # A hung gh invocation previously crashed the whole fetch run;
            # treat it like a transient gateway error instead.
            if attempt < retries:
                time.sleep(5 * (attempt + 1))
                continue
            return None
        if result.returncode != 0:
            err = (result.stderr or "")[:300]
            # GitHub gateway errors are transient; back off and retry.
            if ("504" in err or "502" in err) and attempt < retries:
                time.sleep(5 * (attempt + 1))
                continue
            return None
        if not result.stdout:
            return None
        try:
            return json.loads(result.stdout)
        except json.JSONDecodeError:
            # Truncated or garbled output: report failure rather than raise.
            return None
    return None
def fetch_issue_data_batch(owner, repo, pr_numbers):
    """Fetch linked issue data for a batch of PRs.

    Builds one GraphQL query that aliases each PR as pr0, pr1, ... so the
    whole batch costs a single gh invocation. Returns whatever gh_graphql
    returns (parsed response dict, or None on failure).
    """
    # Build individual PR queries
    # ({{ }} render as literal braces inside the f-string; GraphQL ignores
    # the flattened indentation of the query text.)
    fragments = []
    for i, num in enumerate(pr_numbers):
        fragments.append(f"""
pr{i}: pullRequest(number: {num}) {{
number
labels(first: 20) {{ nodes {{ name }} }}
milestone {{ title dueOn }}
closingIssuesReferences(first: 10) {{
nodes {{
number
title
createdAt
comments {{ totalCount }}
reactions {{ totalCount }}
reactionGroups {{
content
reactors {{ totalCount }}
}}
labels(first: 15) {{ nodes {{ name }} }}
milestone {{ title }}
timelineItems(itemTypes: [CROSS_REFERENCED_EVENT], first: 5) {{
totalCount
}}
}}
}}
}}
""")
    query = f"""
{{
repository(owner: "{owner}", name: "{repo}") {{
{"".join(fragments)}
}}
}}
"""
    return gh_graphql(query)
def main():
    """Fetch linked-issue metadata for every PR in DATA_FILE.

    Resumable: PRs already present in OUTPUT_FILE are skipped, and results
    are re-written to disk after every batch. Ends with summary stats over
    everything collected so far.
    """
    with open(DATA_FILE, "r", encoding="utf-8") as f:
        features = json.load(f)
    # Load existing results if resuming
    existing = {}
    if os.path.exists(OUTPUT_FILE):
        with open(OUTPUT_FILE, "r", encoding="utf-8") as f:
            existing = json.load(f)
    # Group PRs by repo
    repo_prs = {}
    for feat in features:
        repo = feat['repo']
        num = feat['number']
        key = f"{repo}#{num}"
        # Only queue PRs we haven't fetched in a previous run.
        if key not in existing:
            repo_prs.setdefault(repo, []).append(num)
    total_needed = sum(len(nums) for nums in repo_prs.values())
    print(f"Need to fetch issue data for {total_needed} PRs across {len(repo_prs)} repos")
    print(f"Already have {len(existing)} cached results")
    for repo_full, numbers in sorted(repo_prs.items()):
        owner, repo = repo_full.split("/")
        print(f"\n{repo_full}: {len(numbers)} PRs to fetch")
        # Batch in groups of 15 (GraphQL complexity limits)
        for batch_start in range(0, len(numbers), 15):
            batch = numbers[batch_start:batch_start + 15]
            data = fetch_issue_data_batch(owner, repo, batch)
            if not data or "data" not in data:
                # Failed batches are not retried here; rerunning the script
                # picks them up, since only successes land in `existing`.
                print(f" Failed batch starting at {batch_start}")
                time.sleep(2)
                continue
            repo_data = data["data"]["repository"]
            for i, num in enumerate(batch):
                # The batched query aliases each PR as pr0, pr1, ...
                pr_key = f"pr{i}"
                if pr_key in repo_data and repo_data[pr_key]:
                    pr_data = repo_data[pr_key]
                    key = f"{repo_full}#{num}"
                    # Extract issue summary
                    issues = pr_data.get("closingIssuesReferences", {}).get("nodes", [])
                    pr_labels = [l["name"] for l in pr_data.get("labels", {}).get("nodes", [])]
                    milestone = pr_data.get("milestone")
                    issue_summary = {
                        "pr_number": num,
                        "repo": repo_full,
                        "pr_labels": pr_labels,
                        "has_milestone": milestone is not None,
                        "milestone_title": milestone["title"] if milestone else None,
                        "linked_issue_count": len(issues),
                        "issues": []
                    }
                    for issue in issues:
                        issue_labels = [l["name"] for l in issue.get("labels", {}).get("nodes", [])]
                        reactions = issue.get("reactions", {}).get("totalCount", 0)
                        # Get thumbs-up specifically
                        thumbs_up = 0
                        for rg in issue.get("reactionGroups", []):
                            if rg["content"] == "THUMBS_UP":
                                thumbs_up = rg["reactors"]["totalCount"]
                        cross_refs = issue.get("timelineItems", {}).get("totalCount", 0)
                        issue_summary["issues"].append({
                            "number": issue["number"],
                            "title": issue.get("title", ""),
                            "created_at": issue.get("createdAt"),
                            "comment_count": issue.get("comments", {}).get("totalCount", 0),
                            "reaction_count": reactions,
                            "thumbs_up": thumbs_up,
                            "labels": issue_labels,
                            "cross_ref_count": cross_refs,
                            "has_milestone": issue.get("milestone") is not None,
                        })
                    existing[key] = issue_summary
            # Progress line every 4 batches (60 PRs).
            if (batch_start + 15) % 60 == 0:
                print(f" Fetched {min(batch_start + 15, len(numbers))}/{len(numbers)}")
            time.sleep(0.5)
            # Save incrementally
            with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
                json.dump(existing, f, indent=2)
    print(f"\nTotal: {len(existing)} PR issue records saved to {OUTPUT_FILE}")
    # Quick stats
    # NOTE(review): assumes at least one record exists — the percentage
    # prints below divide by len(existing).
    has_issues = sum(1 for v in existing.values() if v["linked_issue_count"] > 0)
    has_milestone = sum(1 for v in existing.values() if v["has_milestone"])
    total_reactions = sum(
        sum(i["reaction_count"] for i in v["issues"])
        for v in existing.values()
    )
    print(f"\nQuick stats:")
    print(f" PRs with linked issues: {has_issues} ({has_issues/len(existing)*100:.0f}%)")
    print(f" PRs with milestones: {has_milestone} ({has_milestone/len(existing)*100:.0f}%)")
    print(f" Total issue reactions: {total_reactions}")
    # Regression/security label check
    regression_labels = ["regression-from-last-release", "regression",
                         "regression-from-previous-release", "Regression"]
    security_labels = ["security", "Security", "area-Security"]
    has_regression = 0
    has_security = 0
    for v in existing.values():
        # Pool PR labels with all linked-issue labels, mirroring the
        # analysis script's urgency check.
        all_labels = set(v["pr_labels"])
        for issue in v["issues"]:
            all_labels.update(issue["labels"])
        if any(l in all_labels for l in regression_labels):
            has_regression += 1
        if any(l in all_labels for l in security_labels):
            has_security += 1
    print(f" Regression-labeled: {has_regression}")
    print(f" Security-labeled: {has_security}")
# Script entry point: fetch linked-issue metadata for the PR dataset.
if __name__ == "__main__":
    main()

PR Readiness Score: Empirical Weight Calibration

Summary

The PR dashboard uses a weighted composite of ~12 features to score open PRs for merge readiness. The weights were originally hypothesized. This experiment uses data from 980 recently merged PRs across 11 dotnet repos to empirically calibrate them.

TL;DR: Discussion complexity is massively underweighted (1.5 should be ~2.5-4.5). CI and maintainer-review are overweighted. Size and community matter more than expected. But the strongest statistical predictor (number of distinct commenters) creates a death-spiral problem if used naively in the score. A dual-score system (merge readiness + deserves attention) may be more useful than a single composite score.

Method

  • Fetched 980 merged PRs via GitHub GraphQL across 11 repos (runtime, aspnetcore, roslyn, sdk, maui, msbuild, extensions, machinelearning, aspire, winforms, wpf)
  • Extracted: reviews, check runs (Build Analysis specifically), review threads, labels, size, author classification
  • Inferred per-repo maintainers from mergedBy data (much more accurate than a static list)
  • Fetched linked issue metadata (reactions, labels, comments, milestones, cross-references)
  • Ran OLS regression, logistic regression, Random Forest, Gradient Boosting, Lasso, and Ridge
  • Bootstrap stability analysis (500 resamples)
  • Event-gap analysis (time from each event to merge)
  • Per-repo breakdowns
  • Dual-score analysis (merge readiness vs. deserves attention)

Dataset Overview

Stat Value
Total PRs 980
Repos 11
Median merge time 1.0 days
Merged within 1 day 51%
Merged within 7 days 82%
Community PRs (inferred) 34%
Has owner approval 50%
PRs with linked issues 15%
PRs with milestones 29%

Fastest repos: aspire (0.2d), winforms (0.2d). Slowest: maui (6.2d median, 76.5d mean).

Key Findings

1. Discussion/Complexity is the Dominant Predictor

The number of distinct commenters and review threads is by far the strongest predictor of merge speed, significant in 7 of 11 repos. The current weight of 1.5 dramatically understates its importance.

Decomposition of the discussion signal:

Component R-squared alone Interpretation
distinct_commenters 0.228 More stakeholders = slower
total_comments 0.234 Raw engagement volume
total_threads 0.118 Review thread count
changes_requested 0.117 Explicit review feedback
unresolved_threads 0.036 Active blockers
resolution_rate 0.018 Weak signal

Discussion adds 16.7% R-squared beyond all other features combined. It is not just a proxy for PR size (correlation with size = 0.36).

2. CI is a Gate, Not a Gradient

CI (Build Analysis specifically) doesn't appear predictive in naive regression (p=0.77 with overall CI status). But:

  • Using Build Analysis specifically (matching dashboard behavior) makes it significant (p=0.008 in BA-present repos)
  • Event-gap analysis shows BA pass is the last gate before merge 70% of the time
  • Median time from BA pass to merge: 0.6 hours (53% merge within 1h of CI passing)
  • The regression underestimates CI because all merged PRs eventually pass it (survivor bias)

Important: Build Analysis is absent in ~40% of repos (extensions, msbuild, winforms, wpf don't have it). And in maui, BA is red 78% of the time.

3. Maintainer Classification Matters

The static maintainers.json classified 78% of PR authors as "community." Inferring maintainers from mergedBy data (anyone who merged >=2 PRs) reduced this to 34% and made the community signal significant (p=0.002).

Per-repo maintainer-vs-community merge speed gaps:

  • runtime: 3.9d vs 1.2d (3.3x slower for community)
  • machinelearning: 3.9d vs 0.8d (4.9x slower)
  • roslyn: 0.1d vs 1.1d (community faster -- they tend to submit small PRs)

4. The Death Spiral Problem

Using raw thread count in the score creates a death spiral: more discussion on a PR lowers its score, reducing attention, making it staler. The data confirms high-discussion PRs are genuinely the most significant work:

Thread count Median lines Median files Median commenters Median age
0-5 47 3 1 0.9d
6-15 364 7 4 4.6d
>15 814 18 4 13.2d

Alternative metrics tested:

Metric R-squared (full model) Death spiral?
A. Current (raw count) 0.292 YES
D. Commenters only 0.315 Partial
E. Hybrid (unresolved + commenters) 0.219 No
C. Unresolved only 0.132 No

distinct_commenters alone actually predicts better than the current metric.

5. Per-Repo Dynamics Vary Significantly

Repo R-squared Top Predictors Median Age
sdk 0.61 discussion 1.0d
maui 0.58 discussion, size, community, align 5.5d
winforms 0.44 discussion, approval, size 0.2d
extensions 0.41 discussion 1.5d
aspnetcore 0.41 discussion 0.7d
aspire 0.36 discussion, size 0.2d
runtime 0.33 discussion, community 2.2d
roslyn 0.33 approval, size 0.8d
msbuild 0.26 approval 1.9d
machinelearning 0.20 size 1.7d
wpf 0.05 (none) 1.0d
  • msbuild & roslyn: Approval is the key gate (compiler teams need specific reviewers)
  • maui: Most complex dynamics; many factors matter
  • wpf: Essentially unpredictable from these features

6. Linked Issue Data and the Dual-Score Concept

We fetched closingIssuesReferences for all 980 PRs to test whether linked issue engagement (reactions, comments, cross-references, labels) adds predictive value.

Issue features predict merge speed -- but in the WRONG direction:

Feature R-squared Coefficient p-value
has_linked_issue 0.141 +1.321 <0.001
log_cross_refs 0.143 +1.072 <0.001
log_issue_comments 0.119 +0.819 <0.001
log_issue_reactions 0.070 +1.056 <0.001
is_bug 0.014 +1.230 <0.001
has_milestone 0.000 -0.009 0.915

Positive coefficients mean PRs with more issue engagement take LONGER to merge. This confirms issue engagement signals importance/complexity, not readiness. Adding issue features beyond dashboard sub-scores: R-squared goes from 0.292 to 0.340 (+0.048, i.e. 4.8 percentage points of variance explained).

This motivates a dual-score system:

Score 1: Merge Readiness -- how mechanically close is this PR to merging?

  • Inputs: CI, approvals, conflicts, size, resolved threads, alignment

Score 2: Deserves Attention -- how much should a maintainer prioritize this PR?

  • Inputs: urgency labels, issue engagement, community demand, effort-at-risk, blockers

The correlation between the two scores is -0.63 -- they genuinely surface different PRs. The negative correlation means PRs that "deserve attention" are typically NOT close to merging.

Quadrant Analysis (980 PRs)

HIGH Merge Readiness LOW Merge Readiness
HIGH Attention Q1: "Help across finish line" (n=176): median 0.9d, 72% community, 14 lines, 20% linked issues Q2: "Invest review time" (n=355): median 2.5d, 59% community, 173 lines, 26% linked issues
LOW Attention Q3: "Will merge on its own" (n=337): median 0.5d, 0% community, 44 lines, 3% linked issues Q4: "Deprioritize" (n=112): median 1.8d, 0% community, 358 lines, 4% linked issues
  • Q1 -- Community PRs that are small and nearly ready. Quickest wins for maintainer time.
  • Q2 -- Community PRs with complexity, far from merge. Need investment to unblock.
  • Q3 -- Internal small PRs, merge quickly without help. Autopilot.
  • Q4 -- Internal large PRs, taking time. Lower priority for active triage.

Feature Direction Tension

Several features point in OPPOSITE directions for the two scores:

Feature Merge Readiness Deserves Attention Tension
CI passing HIGH = ready Failing = needs help Opposite
Has approval HIGH = ready Missing = needs review Opposite
Small size HIGH = ready Large = significant work CONFLICT
Internal author HIGH = ready Community = waiting on us CONFLICT
Few commenters HIGH = ready Many = important to community CONFLICT
Issue reactions (not used) HIGH = community demand Attention only
Bug/regression label (not used) HIGH = urgency Attention only
Milestone (not used) Has deadline Attention only
Cross-references (not used) Broad impact Attention only

Proposed Attention Score Components

URGENCY (0-4 pts):       regression +4, security +4, bug +1, milestone +1
COMMUNITY DEMAND (0-3):  issue thumbsup (>=10: +2, >=3: +1), comments (>=20: +1.5),
                         cross-references (>=3: +1)
EFFORT-AT-RISK (0-3):    community author +2, has reviews but no approval +1,
                         large change (>200 lines) +0.5
BLOCKED (0-2):           CI failing +1, unresolved feedback +1, no approval +1.5

Key differences from merge readiness:

  • Community PRs score HIGHER (they're waiting on maintainer action)
  • Large/complex PRs score HIGHER (significant work at stake)
  • Issue engagement is a NEW signal (not in merge score)
  • CI failing scores HIGHER (needs help, not just "not ready")
  • No penalty for many commenters (avoids death spiral)

Recommended Weights (Single-Score System)

Feature Current Recommended Change Confidence Rationale
ciScore 3.0 2.5 -0.5 Moderate Gate (last gate 70%); but BA absent in many repos
conflictScore 3.0 3.0 0.0 N/A Hard gate; can't measure historically
approvalScore 2.0 2.5 +0.5 Moderate Gate (40% merge within 1h of approval)
maintScore 3.0 1.5 -1.5 Lower Overlaps approval; Lasso drops it; unstable
feedbackScore 2.0 2.5 +0.5 High Redesign: unresolved threads + changes_requested
discussionScore 1.5 2.5 +1.0 Very High Redesign: based on distinct_commenters; cap at 0.5 min
sizeScore 1.0 2.0 +1.0 High Significant in 6/11 repos
communityScore 0.5 1.0 +0.5 High Significant with inferred maintainers
stalenessScore 1.5 1.0 -0.5 Low Can't validate from post-merge data
freshScore 1.0 0.7 -0.3 Low Overlaps staleness
alignScore 1.0 0.5 -0.5 Lower Weak predictor; only 2/11 repos
velocityScore 0.5 0.3 -0.2 Low Can't validate
TOTAL 20.0 20.0

Design Recommendations

  1. Consider a dual-score system: A single score conflates "close to merge" with "needs attention." The dual-score analysis shows these are anti-correlated (r=-0.63). Display both, or let users toggle sort mode between "ready to merge" and "needs review."

  2. Split discussion into feedback + engagement: Separate "unresolved blocking feedback" (actionable, in feedbackScore) from "stakeholder complexity" (informational, in engagementScore). This avoids the death spiral.

  3. Cap engagement penalty: distinct_commenters should reduce score to 0.5 at worst, never 0.0. Complex PRs need attention, not burial.

  4. Use Build Analysis specifically for CI: Overall CI status is noise in repos like runtime where some leg is always red.

  5. Infer maintainers from merge history: mergedBy data is far more accurate than a static list.

  6. Consider showing complexity separately: Display thread/commenter count as a separate column to set expectations about timeline, rather than penalizing in the sort score.

  7. Incorporate linked issue engagement for attention: Issue reactions, cross-references, and bug/regression labels are strong signals for "deserves attention" even though they predict SLOWER merges.

Methodology Notes

What Worked Well

  • Multiple model types agree (OLS, RF, GB, Lasso, Logistic all give consistent rankings)
  • Bootstrap analysis confirms the discussion estimate is very stable (CV 8%); the size estimate is moderately stable (CV 36%)
  • Event-gap analysis captures gate behavior that regression misses
  • Robust across outcome definitions (merge within 1d, 7d, 30d all show same feature ranking)

Limitations

  • Survivor bias: Only merged PRs analyzed. Abandoned PRs would better show CI/conflict as blockers.
  • Snapshot, not trajectory: We see cumulative state at merge time, not the journey. A PR that went through 10 review rounds looks the same as one that was clean from the start.
  • ~65% unexplained: Reviewer timezone, release schedule, PR priority, and dependency chains are likely the dominant factors but aren't measurable from API data.
  • Temporal features untestable: staleness/freshness/velocity (3.0 combined current weight) can't be validated from post-merge data since all merged PRs are "fresh" at merge time.
  • Conflict untestable: Historical mergeability state not available. The 3.0 weight is an assumption.

Would More Data Help?

  • Global weights: Modestly. 980 PRs gives stable estimates for the top features.
  • Per-repo weights: Yes, especially for wpf (R-squared=0.05). 200+ per repo would be more reliable.
  • Temporal features: Needs a fundamentally different approach (time-series snapshots of open PRs).
  • Gate features: Need to include abandoned/closed-without-merge PRs.

Analysis scripts and data collection code available on request.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment