Skip to content

Instantly share code, notes, and snippets.

@wassname
Last active July 26, 2025 02:08
Show Gist options
  • Select an option

  • Save wassname/04f0c50a68054f0323f62b0da418daec to your computer and use it in GitHub Desktop.

Select an option

Save wassname/04f0c50a68054f0323f62b0da418daec to your computer and use it in GitHub Desktop.

Revisions

  1. wassname revised this gist Jul 26, 2025. 1 changed file with 31 additions and 17 deletions.
    48 changes: 31 additions & 17 deletions simple_pref_eval.py
    Original file line number Diff line number Diff line change
    @@ -7,7 +7,11 @@
    @url: https://gist.github.com/wassname/04f0c50a68054f0323f62b0da418daec
    """
    import torch
    import copy
    from tqdm.auto import tqdm
    from torch.nn import CrossEntropyLoss

    from transformers import DynamicCache, PreTrainedModel, PreTrainedTokenizerBase
    from datasets import Dataset

    @@ -26,7 +30,7 @@ def get_output_ppx(output, input):
    nll = (loss * shift_masks)
    count = shift_masks.sum().item()
    return {
    'ppx': np.exp(nll.sum().item() / count),
    'ppx': torch.exp(nll.sum().item() / count),
    # 'nll': nll.sum().item(),
    'nll_mean': nll.sum().item() / count,
    # 'count': count,
    @@ -63,21 +67,31 @@ def eval_pref_ds_ppx(model: PreTrainedModel, tokenizer: PreTrainedTokenizerBase,
    res[f"{p}_{k}"] = ppx[k]
    results.append(res)

    df = pd.DataFrame(results)
    df['ppx_ratio'] = (df.chosen_ppx/df.rejected_ppx)
    # df = pd.DataFrame(results)
    # df['ppx_ratio'] = (df.chosen_ppx/df.rejected_ppx)
    # df['ppx_ratio'] = (df.chosen_nll-df.rejected_nll)
    return df
    return (df.chosen_ppx/df.rejected_ppx)

    ds_pref = load_dataset("wassname/genies_preferences", name="illegal_dont_help", split="train")
    ds_pref = ds_pref.select(range(0, 1000))
    df_results = eval_pref_ds_ppx(model, tokenizer, ds_pref, batch_size, max_new_tokens)
    # print(df_results.head(1).to_markdown())
    s = df_results['ppx_ratio'].mean()
    print(f"mean_ppx_ratio: {s:2.2f}")
    # np.float64(0.36348262129569164)
    """
    | | rejected_ppx | rejected_nll_mean | chosen_ppx | chosen_nll_mean | ppx_ratio |
    |---:|---------------:|--------------------:|-------------:|------------------:|------------:|
    | 0 | 12.5819 | 2.53226 | 4.59144 | 1.52419 | 0.364925 |
    | 1 | 16.0257 | 2.77419 | 4.59144 | 1.52419 | 0.286505 |
    """
    if __name__ == "__main__":
    from datasets import load_dataset
    max_new_tokens = 128
    batch_size = 2
    from transformers import AutoModelForCausalLM, AutoTokenizer
    model_name = "unsloth/Qwen3-4B-unsloth-bnb-4bit"
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype=torch.float16)
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False, padding_side="left")
    model.eval()
    ds_pref = load_dataset("wassname/genies_preferences", name="illegal_dont_help", split="train")
    ds_pref = ds_pref.select(range(0, 1000))
    df_results = eval_pref_ds_ppx(model, tokenizer, ds_pref, batch_size, max_new_tokens)
    # print(df_results.head(1)to_markdown())
    s = df_results['ppx_ratio'].mean()
    print(f"mean_ppx_ratio: {s:2.2f}")
    # np.float64(0.36348262129569164)
    """
    | | rejected_ppx | rejected_nll_mean | chosen_ppx | chosen_nll_mean | ppx_ratio |
    |---:|---------------:|--------------------:|-------------:|------------------:|------------:|
    | 0 | 12.5819 | 2.53226 | 4.59144 | 1.52419 | 0.364925 |
    | 1 | 16.0257 | 2.77419 | 4.59144 | 1.52419 | 0.286505 |
    """
  2. wassname revised this gist Mar 20, 2025. 1 changed file with 38 additions and 9 deletions.
    47 changes: 38 additions & 9 deletions simple_pref_eval.py
    Original file line number Diff line number Diff line change
    @@ -1,3 +1,12 @@
    """
    This is a simple way to evaluate if a model prefers the accepted or rejected completions of a prompt.
    We look at the perplexity of the chosen and rejected completions of a prompt.
    Example dataset: https://huggingface.co/datasets/wassname/genies_preferences/viewer/illegal_dont_help?views[]=illegal_dont_help_train&views[]=illegal_dont_help_test
    @url: https://gist.github.com/wassname/04f0c50a68054f0323f62b0da418daec
    """
    import copy
    from transformers import DynamicCache, PreTrainedModel, PreTrainedTokenizerBase
    from datasets import Dataset
    @@ -9,12 +18,20 @@ def get_output_ppx(output, input):
    shift_logits = output.logits[:, :-1].contiguous()
    shift_labels = input.input_ids[:, 1:].contiguous()
    loss = loss_fn(shift_logits.transpose(1, 2), shift_labels)
    shift_masks = input.attention_mask[:, 1:].contiguous() # target_masks[:, 1:].contiguous() *
    nll = (loss * shift_masks).sum().item()

    # crop the attention mask to just the provided input
    attention_mask = input.attention_mask[:, :input.input_ids.size(1)].contiguous()
    # input.attention_mask
    shift_masks = attention_mask[:, 1:].contiguous()
    nll = (loss * shift_masks)
    count = shift_masks.sum().item()
    return np.exp(nll / count)
    return {
    'ppx': np.exp(nll.sum().item() / count),
    # 'nll': nll.sum().item(),
    'nll_mean': nll.sum().item() / count,
    # 'count': count,
    }

    # I could get the logprobs of each yep


    @torch.no_grad()
    @@ -42,13 +59,25 @@ def eval_pref_ds_ppx(model: PreTrainedModel, tokenizer: PreTrainedTokenizerBase,
    kv_cache2 = copy.deepcopy(kv_cache)
    output = model.forward(**input, past_key_values=kv_cache2)
    ppx = get_output_ppx(output, input)
    res[p] = ppx
    for k in ppx:
    res[f"{p}_{k}"] = ppx[k]
    results.append(res)

    df_results = pd.DataFrame(results)
    return df_results
    df = pd.DataFrame(results)
    df['ppx_ratio'] = (df.chosen_ppx/df.rejected_ppx)
    # df['ppx_ratio'] = (df.chosen_nll-df.rejected_nll)
    return df

    ds_pref = load_dataset("wassname/genies_preferences", name="illegal_dont_help", split="train")
    ds_pref = ds_pref.select(range(0, 100))
    ds_pref = ds_pref.select(range(0, 1000))
    df_results = eval_pref_ds_ppx(model, tokenizer, ds_pref, batch_size, max_new_tokens)
    df_results.describe()
    # print(df_results.head(1).to_markdown())
    s = df_results['ppx_ratio'].mean()
    print(f"mean_ppx_ratio: {s:2.2f}")
    # np.float64(0.36348262129569164)
    """
    | | rejected_ppx | rejected_nll_mean | chosen_ppx | chosen_nll_mean | ppx_ratio |
    |---:|---------------:|--------------------:|-------------:|------------------:|------------:|
    | 0 | 12.5819 | 2.53226 | 4.59144 | 1.52419 | 0.364925 |
    | 1 | 16.0257 | 2.77419 | 4.59144 | 1.52419 | 0.286505 |
    """
  3. wassname created this gist Mar 20, 2025.
    54 changes: 54 additions & 0 deletions simple_pref_eval.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,54 @@
    import copy
    from transformers import DynamicCache, PreTrainedModel, PreTrainedTokenizerBase
    from datasets import Dataset

    # how to eval: I could look at perplexity on chosen vs rejected in the context of the prompt

    def get_output_ppx(output, input):
        """Compute the masked perplexity of ``input.input_ids`` under ``output.logits``.

        Parameters
        ----------
        output : object with ``.logits`` of shape (batch, seq, vocab).
        input : object with ``.input_ids`` (batch, seq) and ``.attention_mask``.
            The mask may be LONGER than ``input_ids`` — the caller concatenates
            the cached prompt's mask onto it — so it is cropped to the input
            length before use (this mismatch crashed the original).

        Returns
        -------
        float
            exp(mean NLL over unmasked target positions), i.e. perplexity.
        """
        import math  # local import keeps this helper self-contained

        loss_fn = CrossEntropyLoss(reduction="none")
        # Standard causal-LM shift: token t is predicted from logits at t-1.
        shift_logits = output.logits[:, :-1].contiguous()
        shift_labels = input.input_ids[:, 1:].contiguous()
        # CrossEntropyLoss wants the class dim second: (batch, vocab, seq).
        loss = loss_fn(shift_logits.transpose(1, 2), shift_labels)
        # Crop the mask to the current input: the caller may pass a mask that
        # also covers the KV-cached prompt tokens.
        attention_mask = input.attention_mask[:, :input.input_ids.size(1)].contiguous()
        shift_masks = attention_mask[:, 1:].contiguous()
        nll = (loss * shift_masks).sum().item()
        count = shift_masks.sum().item()
        # math.exp on the scalar mean NLL; the original used `np`, which is
        # never imported in this revision.
        return math.exp(nll / count)

    # I could get the logprobs of each yep


    @torch.no_grad()
    def eval_pref_ds_ppx(model: PreTrainedModel, tokenizer: PreTrainedTokenizerBase, ds_pref: Dataset, batch_size: int=2, max_new_tokens: int=128):
        """
        Evaluate on a preference dataset.
        The relative perplexity of the chosen and rejected completions of a prompt.

        Each batch's prompt is forwarded once to fill a KV cache; the
        'rejected' and 'chosen' completions are then each scored against a
        copy of that cache, so the prompt is only encoded once per batch.
        Returns a DataFrame with one row per batch and columns
        'rejected' / 'chosen' holding whatever get_output_ppx returns.
        """
        results = []
        for batch in tqdm(ds_pref.batch(batch_size), unit="batch"):
            # first we cache the prompt
            kv_cache = DynamicCache()
            # NOTE(review): prompts AND completions are each truncated to
            # max_new_tokens//2 tokens — confirm that budget is intended.
            inputs1 = tokenizer(batch['prompt'], return_tensors="pt", padding=True, truncation=True, max_length=max_new_tokens//2, return_token_type_ids=False, return_attention_mask=True)
            # forward pass only to populate kv_cache; the logits are discarded
            model.forward(**inputs1, past_key_values=kv_cache)

            # then we evaluate the perplexity of the accepted and rejected completion
            res = {}
            for p in ['rejected', 'chosen']:
                input = tokenizer(batch[p], return_tensors="pt", padding=True, truncation=True, max_length=max_new_tokens//2, return_token_type_ids=False, return_attention_mask=True)

                # we need to update the attention mask to match the kv_cache:
                # it must cover the cached prompt tokens plus the new completion
                input['attention_mask'] = torch.cat([inputs1['attention_mask'], input['attention_mask']], dim=1)

                # deepcopy so each completion is scored against a pristine
                # prompt cache (model.forward mutates the cache it is given)
                kv_cache2 = copy.deepcopy(kv_cache)
                output = model.forward(**input, past_key_values=kv_cache2)
                # NOTE(review): this revision's get_output_ppx slices the full
                # (prompt+completion) mask against completion-length logits —
                # verify the shapes line up (a later revision crops the mask).
                ppx = get_output_ppx(output, input)
                res[p] = ppx
            results.append(res)

        df_results = pd.DataFrame(results)
        return df_results

    # Example usage. NOTE(review): `model`, `tokenizer`, `batch_size` and
    # `max_new_tokens` are not defined anywhere in this revision — this was
    # presumably run in a notebook session where they already existed.
    ds_pref = load_dataset("wassname/genies_preferences", name="illegal_dont_help", split="train")
    ds_pref = ds_pref.select(range(0, 100))  # small slice for a quick check
    df_results = eval_pref_ds_ppx(model, tokenizer, ds_pref, batch_size, max_new_tokens)
    # summary stats of per-batch chosen/rejected perplexities
    df_results.describe()