Skip to content

Instantly share code, notes, and snippets.

@wassname
Last active July 26, 2025 02:08
Show Gist options
  • Select an option

  • Save wassname/04f0c50a68054f0323f62b0da418daec to your computer and use it in GitHub Desktop.

Select an option

Save wassname/04f0c50a68054f0323f62b0da418daec to your computer and use it in GitHub Desktop.

Revisions

  1. wassname revised this gist Jul 26, 2025. 1 changed file with 31 additions and 17 deletions.
    48 changes: 31 additions & 17 deletions simple_pref_eval.py
    Original file line number Diff line number Diff line change
    @@ -7,7 +7,11 @@
    @url: https://gist.github.com/wassname/04f0c50a68054f0323f62b0da418daec
    """
    import torch
    import copy
    from tqdm.auto import tqdm
    from torch.nn import CrossEntropyLoss

    from transformers import DynamicCache, PreTrainedModel, PreTrainedTokenizerBase
    from datasets import Dataset

    @@ -26,7 +30,7 @@ def get_output_ppx(output, input):
    nll = (loss * shift_masks)
    count = shift_masks.sum().item()
    return {
    'ppx': np.exp(nll.sum().item() / count),
    'ppx': torch.exp(nll.sum().item() / count),
    # 'nll': nll.sum().item(),
    'nll_mean': nll.sum().item() / count,
    # 'count': count,
    @@ -63,21 +67,31 @@ def eval_pref_ds_ppx(model: PreTrainedModel, tokenizer: PreTrainedTokenizerBase,
    res[f"{p}_{k}"] = ppx[k]
    results.append(res)

    df = pd.DataFrame(results)
    df['ppx_ratio'] = (df.chosen_ppx/df.rejected_ppx)
    # df = pd.DataFrame(results)
    # df['ppx_ratio'] = (df.chosen_ppx/df.rejected_ppx)
    # df['ppx_ratio'] = (df.chosen_nll-df.rejected_nll)
    return df
    return (df.chosen_ppx/df.rejected_ppx)

    ds_pref = load_dataset("wassname/genies_preferences", name="illegal_dont_help", split="train")
    ds_pref = ds_pref.select(range(0, 1000))
    df_results = eval_pref_ds_ppx(model, tokenizer, ds_pref, batch_size, max_new_tokens)
    # print(df_results.head(1).to_markdown())
    s = df_results['ppx_ratio'].mean()
    print(f"mean_ppx_ratio: {s:2.2f}")
    # np.float64(0.36348262129569164)
    """
    | | rejected_ppx | rejected_nll_mean | chosen_ppx | chosen_nll_mean | ppx_ratio |
    |---:|---------------:|--------------------:|-------------:|------------------:|------------:|
    | 0 | 12.5819 | 2.53226 | 4.59144 | 1.52419 | 0.364925 |
    | 1 | 16.0257 | 2.77419 | 4.59144 | 1.52419 | 0.286505 |
    """
    if __name__ == "__main__":
    from datasets import load_dataset
    max_new_tokens = 128
    batch_size = 2
    from transformers import AutoModelForCausalLM, AutoTokenizer
    model_name = "unsloth/Qwen3-4B-unsloth-bnb-4bit"
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype=torch.float16)
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False, padding_side="left")
    model.eval()
    ds_pref = load_dataset("wassname/genies_preferences", name="illegal_dont_help", split="train")
    ds_pref = ds_pref.select(range(0, 1000))
    df_results = eval_pref_ds_ppx(model, tokenizer, ds_pref, batch_size, max_new_tokens)
    # print(df_results.head(1)to_markdown())
    s = df_results['ppx_ratio'].mean()
    print(f"mean_ppx_ratio: {s:2.2f}")
    # np.float64(0.36348262129569164)
    """
    | | rejected_ppx | rejected_nll_mean | chosen_ppx | chosen_nll_mean | ppx_ratio |
    |---:|---------------:|--------------------:|-------------:|------------------:|------------:|
    | 0 | 12.5819 | 2.53226 | 4.59144 | 1.52419 | 0.364925 |
    | 1 | 16.0257 | 2.77419 | 4.59144 | 1.52419 | 0.286505 |
    """
  2. wassname revised this gist Mar 20, 2025. 1 changed file with 38 additions and 9 deletions.
    47 changes: 38 additions & 9 deletions simple_pref_eval.py
    Original file line number Diff line number Diff line change
    @@ -1,3 +1,12 @@
    """
    This is a simple way to evaluate if a model prefers the accepted or rejected completions of a prompt.
    We look at the perplexity of the chosen and rejected completions of a prompt.
    Example dataset: https://huggingface.co/datasets/wassname/genies_preferences/viewer/illegal_dont_help?views[]=illegal_dont_help_train&views[]=illegal_dont_help_test
    @url: https://gist.github.com/wassname/04f0c50a68054f0323f62b0da418daec
    """
    import copy
    from transformers import DynamicCache, PreTrainedModel, PreTrainedTokenizerBase
    from datasets import Dataset
    @@ -9,12 +18,20 @@ def get_output_ppx(output, input):
    shift_logits = output.logits[:, :-1].contiguous()
    shift_labels = input.input_ids[:, 1:].contiguous()
    loss = loss_fn(shift_logits.transpose(1, 2), shift_labels)
    shift_masks = input.attention_mask[:, 1:].contiguous() # target_masks[:, 1:].contiguous() *
    nll = (loss * shift_masks).sum().item()

    # crop the attention mask to just the provided input
    attention_mask = input.attention_mask[:, :input.input_ids.size(1)].contiguous()
    # input.attention_mask
    shift_masks = attention_mask[:, 1:].contiguous()
    nll = (loss * shift_masks)
    count = shift_masks.sum().item()
    return np.exp(nll / count)
    return {
    'ppx': np.exp(nll.sum().item() / count),
    # 'nll': nll.sum().item(),
    'nll_mean': nll.sum().item() / count,
    # 'count': count,
    }

    # I could get the logprobs of each yep


    @torch.no_grad()
    @@ -42,13 +59,25 @@ def eval_pref_ds_ppx(model: PreTrainedModel, tokenizer: PreTrainedTokenizerBase,
    kv_cache2 = copy.deepcopy(kv_cache)
    output = model.forward(**input, past_key_values=kv_cache2)
    ppx = get_output_ppx(output, input)
    res[p] = ppx
    for k in ppx:
    res[f"{p}_{k}"] = ppx[k]
    results.append(res)

    df_results = pd.DataFrame(results)
    return df_results
    df = pd.DataFrame(results)
    df['ppx_ratio'] = (df.chosen_ppx/df.rejected_ppx)
    # df['ppx_ratio'] = (df.chosen_nll-df.rejected_nll)
    return df

    ds_pref = load_dataset("wassname/genies_preferences", name="illegal_dont_help", split="train")
    ds_pref = ds_pref.select(range(0, 100))
    ds_pref = ds_pref.select(range(0, 1000))
    df_results = eval_pref_ds_ppx(model, tokenizer, ds_pref, batch_size, max_new_tokens)
    df_results.describe()
    # print(df_results.head(1).to_markdown())
    s = df_results['ppx_ratio'].mean()
    print(f"mean_ppx_ratio: {s:2.2f}")
    # np.float64(0.36348262129569164)
    """
    | | rejected_ppx | rejected_nll_mean | chosen_ppx | chosen_nll_mean | ppx_ratio |
    |---:|---------------:|--------------------:|-------------:|------------------:|------------:|
    | 0 | 12.5819 | 2.53226 | 4.59144 | 1.52419 | 0.364925 |
    | 1 | 16.0257 | 2.77419 | 4.59144 | 1.52419 | 0.286505 |
    """
  3. wassname created this gist Mar 20, 2025.
    54 changes: 54 additions & 0 deletions simple_pref_eval.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,54 @@
    import copy
    from transformers import DynamicCache, PreTrainedModel, PreTrainedTokenizerBase
    from datasets import Dataset

    # how to eval: I could look at perplexity on chosen vs rejected in the context of the prompt

    def get_output_ppx(output, input):
        """Compute the masked perplexity of ``input.input_ids`` under ``output.logits``.

        Parameters
        ----------
        output : object with ``.logits`` of shape (batch, seq, vocab).
        input : object with ``.input_ids`` (batch, seq) and ``.attention_mask``.
            The mask may be LONGER than ``input_ids`` — the caller concatenates
            the cached prompt's mask onto it — so it is cropped to the input
            length before use (this mismatch crashed the original).

        Returns
        -------
        float
            exp(mean NLL over unmasked target positions), i.e. perplexity.
        """
        import math  # local import keeps this helper self-contained

        loss_fn = CrossEntropyLoss(reduction="none")
        # Standard causal-LM shift: token t is predicted from logits at t-1.
        shift_logits = output.logits[:, :-1].contiguous()
        shift_labels = input.input_ids[:, 1:].contiguous()
        # CrossEntropyLoss wants the class dim second: (batch, vocab, seq).
        loss = loss_fn(shift_logits.transpose(1, 2), shift_labels)
        # Crop the mask to the current input: the caller may pass a mask that
        # also covers the KV-cached prompt tokens.
        attention_mask = input.attention_mask[:, :input.input_ids.size(1)].contiguous()
        shift_masks = attention_mask[:, 1:].contiguous()
        nll = (loss * shift_masks).sum().item()
        count = shift_masks.sum().item()
        # math.exp on the scalar mean NLL; the original used `np`, which is
        # never imported in this revision.
        return math.exp(nll / count)

    # I could get the logprobs of each yep


    @torch.no_grad()
    def eval_pref_ds_ppx(model: PreTrainedModel, tokenizer: PreTrainedTokenizerBase, ds_pref: Dataset, batch_size: int=2, max_new_tokens: int=128):
        """
        Evaluate on a preference dataset.
        The relative perplexity of the chosen and rejected completions of a prompt.

        Each batch's prompt is forwarded once to fill a KV cache; the
        'rejected' and 'chosen' completions are then each scored against a
        copy of that cache, so the prompt is only encoded once per batch.
        Returns a DataFrame with one row per batch and columns
        'rejected' / 'chosen' holding whatever get_output_ppx returns.
        """
        results = []
        for batch in tqdm(ds_pref.batch(batch_size), unit="batch"):
            # first we cache the prompt
            kv_cache = DynamicCache()
            # NOTE(review): prompts AND completions are each truncated to
            # max_new_tokens//2 tokens — confirm that budget is intended.
            inputs1 = tokenizer(batch['prompt'], return_tensors="pt", padding=True, truncation=True, max_length=max_new_tokens//2, return_token_type_ids=False, return_attention_mask=True)
            # forward pass only to populate kv_cache; the logits are discarded
            model.forward(**inputs1, past_key_values=kv_cache)

            # then we evaluate the perplexity of the accepted and rejected completion
            res = {}
            for p in ['rejected', 'chosen']:
                input = tokenizer(batch[p], return_tensors="pt", padding=True, truncation=True, max_length=max_new_tokens//2, return_token_type_ids=False, return_attention_mask=True)

                # we need to update the attention mask to match the kv_cache:
                # it must cover the cached prompt tokens plus the new completion
                input['attention_mask'] = torch.cat([inputs1['attention_mask'], input['attention_mask']], dim=1)

                # deepcopy so each completion is scored against a pristine
                # prompt cache (model.forward mutates the cache it is given)
                kv_cache2 = copy.deepcopy(kv_cache)
                output = model.forward(**input, past_key_values=kv_cache2)
                # NOTE(review): this revision's get_output_ppx slices the full
                # (prompt+completion) mask against completion-length logits —
                # verify the shapes line up (a later revision crops the mask).
                ppx = get_output_ppx(output, input)
                res[p] = ppx
            results.append(res)

        df_results = pd.DataFrame(results)
        return df_results

    # Example usage. NOTE(review): `model`, `tokenizer`, `batch_size` and
    # `max_new_tokens` are not defined anywhere in this revision — this was
    # presumably run in a notebook session where they already existed.
    ds_pref = load_dataset("wassname/genies_preferences", name="illegal_dont_help", split="train")
    ds_pref = ds_pref.select(range(0, 100))  # small slice for a quick check
    df_results = eval_pref_ds_ppx(model, tokenizer, ds_pref, batch_size, max_new_tokens)
    # summary stats of per-batch chosen/rejected perplexities
    df_results.describe()