Last active
July 26, 2025 02:08
-
-
Save wassname/04f0c50a68054f0323f62b0da418daec to your computer and use it in GitHub Desktop.
Revisions
-
wassname revised this gist
Jul 26, 2025 . 1 changed file with 31 additions and 17 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -7,7 +7,11 @@ @url: https://gist.github.com/wassname/04f0c50a68054f0323f62b0da418daec """ import torch import copy from tqdm.auto import tqdm from torch.nn import CrossEntropyLoss from transformers import DynamicCache, PreTrainedModel, PreTrainedTokenizerBase from datasets import Dataset @@ -26,7 +30,7 @@ def get_output_ppx(output, input): nll = (loss * shift_masks) count = shift_masks.sum().item() return { 'ppx': torch.exp(nll.sum().item() / count), # 'nll': nll.sum().item(), 'nll_mean': nll.sum().item() / count, # 'count': count, @@ -63,21 +67,31 @@ def eval_pref_ds_ppx(model: PreTrainedModel, tokenizer: PreTrainedTokenizerBase, res[f"{p}_{k}"] = ppx[k] results.append(res) # df = pd.DataFrame(results) # df['ppx_ratio'] = (df.chosen_ppx/df.rejected_ppx) # df['ppx_ratio'] = (df.chosen_nll-df.rejected_nll) return (df.chosen_ppx/df.rejected_ppx) if __name__ == "__main__": from datasets import load_dataset max_new_tokens = 128 batch_size = 2 from transformers import AutoModelForCausalLM, AutoTokenizer model_name = "unsloth/Qwen3-4B-unsloth-bnb-4bit" device = "cuda" if torch.cuda.is_available() else "cpu" model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype=torch.float16) tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False, padding_side="left") model.eval() ds_pref = load_dataset("wassname/genies_preferences", name="illegal_dont_help", split="train") ds_pref = ds_pref.select(range(0, 1000)) df_results = eval_pref_ds_ppx(model, tokenizer, ds_pref, batch_size, max_new_tokens) # print(df_results.head(1)to_markdown()) s = df_results['ppx_ratio'].mean() print(f"mean_ppx_ratio: {s:2.2f}") # np.float64(0.36348262129569164) """ | | rejected_ppx | rejected_nll_mean | chosen_ppx | chosen_nll_mean | ppx_ratio | |---:|---------------:|--------------------:|-------------:|------------------:|------------:| | 0 | 12.5819 | 2.53226 | 4.59144 | 1.52419 | 0.364925 | | 1 | 16.0257 | 2.77419 | 4.59144 | 1.52419 | 0.286505 | """ -
wassname revised this gist
Mar 20, 2025 . 1 changed file with 38 additions and 9 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1,3 +1,12 @@ """ This is a simple way to evaluate if a model prefers the accepted or rejected completions of a prompt. We look at the perplexity of the chosen and rejected completions of a prompt. Example dataset: https://huggingface.co/datasets/wassname/genies_preferences/viewer/illegal_dont_help?views[]=illegal_dont_help_train&views[]=illegal_dont_help_test @url: https://gist.github.com/wassname/04f0c50a68054f0323f62b0da418daec """ import copy from transformers import DynamicCache, PreTrainedModel, PreTrainedTokenizerBase from datasets import Dataset @@ -9,12 +18,20 @@ def get_output_ppx(output, input): shift_logits = output.logits[:, :-1].contiguous() shift_labels = input.input_ids[:, 1:].contiguous() loss = loss_fn(shift_logits.transpose(1, 2), shift_labels) # crop the attention mask to just the provided input attention_mask = input.attention_mask[:, :input.input_ids.size(1)].contiguous() # input.attention_mask shift_masks = attention_mask[:, 1:].contiguous() nll = (loss * shift_masks) count = shift_masks.sum().item() return { 'ppx': np.exp(nll.sum().item() / count), # 'nll': nll.sum().item(), 'nll_mean': nll.sum().item() / count, # 'count': count, } @torch.no_grad() @@ -42,13 +59,25 @@ def eval_pref_ds_ppx(model: PreTrainedModel, tokenizer: PreTrainedTokenizerBase, kv_cache2 = copy.deepcopy(kv_cache) output = model.forward(**input, past_key_values=kv_cache2) ppx = get_output_ppx(output, input) for k in ppx: res[f"{p}_{k}"] = ppx[k] results.append(res) df = pd.DataFrame(results) df['ppx_ratio'] = (df.chosen_ppx/df.rejected_ppx) # df['ppx_ratio'] = (df.chosen_nll-df.rejected_nll) return df ds_pref = load_dataset("wassname/genies_preferences", name="illegal_dont_help", split="train") ds_pref = ds_pref.select(range(0, 1000)) df_results = eval_pref_ds_ppx(model, tokenizer, ds_pref, batch_size, max_new_tokens) # print(df_results.head(1)to_markdown()) s = df_results['ppx_ratio'].mean() print(f"mean_ppx_ratio: {s:2.2f}") # np.float64(0.36348262129569164) """ | | rejected_ppx | rejected_nll_mean | chosen_ppx | chosen_nll_mean | ppx_ratio | |---:|---------------:|--------------------:|-------------:|------------------:|------------:| | 0 | 12.5819 | 2.53226 | 4.59144 | 1.52419 | 0.364925 | | 1 | 16.0257 | 2.77419 | 4.59144 | 1.52419 | 0.286505 | """ -
wassname created this gist
Mar 20, 2025 .There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,54 @@ import copy from transformers import DynamicCache, PreTrainedModel, PreTrainedTokenizerBase from datasets import Dataset # how to eval, I couldlook at perplexity on chosen vs rejected in the context of prompt def get_output_ppx(output, input): loss_fn = CrossEntropyLoss(reduction="none") shift_logits = output.logits[:, :-1].contiguous() shift_labels = input.input_ids[:, 1:].contiguous() loss = loss_fn(shift_logits.transpose(1, 2), shift_labels) shift_masks = input.attention_mask[:, 1:].contiguous() # target_masks[:, 1:].contiguous() * nll = (loss * shift_masks).sum().item() count = shift_masks.sum().item() return np.exp(nll / count) # I could get the logprobs of each yep @torch.no_grad() def eval_pref_ds_ppx(model: PreTrainedModel, tokenizer: PreTrainedTokenizerBase, ds_pref: Dataset, batch_size: int=2, max_new_tokens: int=128): """ Evaluate on a preference dataset. The relative perplexity of the chosen and rejected completions of a prompt. """ results = [] for batch in tqdm(ds_pref.batch(batch_size), unit="batch"): # first we cache the prompt kv_cache = DynamicCache() inputs1 = tokenizer(batch['prompt'], return_tensors="pt", padding=True, truncation=True, max_length=max_new_tokens//2, return_token_type_ids=False, return_attention_mask=True) model.forward(**inputs1, past_key_values=kv_cache) # then we evaluate the perplexity of the accepted and rejected completion res = {} for p in ['rejected', 'chosen']: input = tokenizer(batch[p], return_tensors="pt", padding=True, truncation=True, max_length=max_new_tokens//2, return_token_type_ids=False, return_attention_mask=True) # we need to update the attention mask to match the kv_cache input['attention_mask'] = torch.cat([inputs1['attention_mask'], input['attention_mask']], dim=1) kv_cache2 = copy.deepcopy(kv_cache) output = model.forward(**input, past_key_values=kv_cache2) ppx = get_output_ppx(output, input) res[p] = ppx results.append(res) df_results = pd.DataFrame(results) return df_results ds_pref = load_dataset("wassname/genies_preferences", name="illegal_dont_help", split="train") ds_pref = ds_pref.select(range(0, 100)) df_results = eval_pref_ds_ppx(model, tokenizer, ds_pref, batch_size, max_new_tokens) df_results.describe()