pegasus_fine_tune.py, forked by @mbuotidem from jiahao87/pegasus_fine_tune.py, created October 28, 2021 03:58
Revisions

  1. @jiahao87 revised this gist Jul 2, 2021. 1 changed file with 1 addition and 1 deletion.

     pegasus_fine_tune.py: 1 addition & 1 deletion.

     @@ -29,7 +29,7 @@ def __getitem__(self, idx):
              item['labels'] = torch.tensor(self.labels['input_ids'][idx]) # torch.tensor(self.labels[idx])
              return item
          def __len__(self):
     -        return len(self.labels['input_ids'])
     +        return len(self.labels['input_ids']) # len(self.labels)


      def prepare_data(model_name,
  2. @jiahao87 revised this gist Jul 2, 2021. 1 changed file with 1 addition and 1 deletion.

     pegasus_fine_tune.py: 1 addition & 1 deletion.

     @@ -29,7 +29,7 @@ def __getitem__(self, idx):
              item['labels'] = torch.tensor(self.labels['input_ids'][idx]) # torch.tensor(self.labels[idx])
              return item
          def __len__(self):
     -        return len(self.labels)
     +        return len(self.labels['input_ids'])


      def prepare_data(model_name,
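
     Why this __len__ fix matters: self.labels is a tokenizer output (a BatchEncoding), and len() on it counts its fields rather than the number of examples. A minimal illustration, not part of the gist (the example strings below are made up):

     # Illustrative sketch: len() on a BatchEncoding counts fields, not examples.
     from transformers import PegasusTokenizer

     tok = PegasusTokenizer.from_pretrained('google/pegasus-large')
     labels = tok(["summary one", "summary two", "summary three"], truncation=True, padding=True)

     print(len(labels))               # 2: the fields 'input_ids' and 'attention_mask'
     print(len(labels['input_ids']))  # 3: the actual number of examples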
  3. @jiahao87 revised this gist May 11, 2021. 1 changed file with 9 additions and 7 deletions.

     pegasus_fine_tune.py: 9 additions & 7 deletions.

     @@ -7,8 +7,8 @@
      # use Pegasus Large model as base for fine-tuning
      model_name = 'google/pegasus-large'
     -train_dataset, _, _ = prepare_data(model_name, train_texts, train_labels)
     -trainer = prepare_fine_tuning(model_name, train_dataset)
     +train_dataset, _, _, tokenizer = prepare_data(model_name, train_texts, train_labels)
     +trainer = prepare_fine_tuning(model_name, tokenizer, train_dataset)
      trainer.train()
      Reference:

     @@ -54,10 +54,10 @@ def tokenize_data(texts, labels):
          val_dataset = tokenize_data(val_texts, val_labels) if prepare_val else None
          test_dataset = tokenize_data(test_texts, test_labels) if prepare_test else None

     -    return train_dataset, val_dataset, test_dataset
     +    return train_dataset, val_dataset, test_dataset, tokenizer


     -def prepare_fine_tuning(model_name, train_dataset, val_dataset=None, freeze_encoder=False, output_dir='./results'):
     +def prepare_fine_tuning(model_name, tokenizer, train_dataset, val_dataset=None, freeze_encoder=False, output_dir='./results'):
          """
          Prepare configurations and base model for fine-tuning
          """

     @@ -88,7 +88,8 @@ def prepare_fine_tuning(model_name, train_dataset, val_dataset=None, freeze_encoder=False, output_dir='./results'):
              model=model, # the instantiated 🤗 Transformers model to be trained
              args=training_args, # training arguments, defined above
              train_dataset=train_dataset, # training dataset
     -        eval_dataset=val_dataset # evaluation dataset
     +        eval_dataset=val_dataset, # evaluation dataset
     +        tokenizer=tokenizer
          )

     @@ -108,6 +109,7 @@ def prepare_fine_tuning(model_name, train_dataset, val_dataset=None, freeze_encoder=False, output_dir='./results'):
              model=model, # the instantiated 🤗 Transformers model to be trained
              args=training_args, # training arguments, defined above
              train_dataset=train_dataset, # training dataset
     +        tokenizer=tokenizer
          )

          return trainer

     @@ -121,6 +123,6 @@ def prepare_fine_tuning(model_name, train_dataset, val_dataset=None, freeze_encoder=False, output_dir='./results'):
          # use Pegasus Large model as base for fine-tuning
          model_name = 'google/pegasus-large'
     -    train_dataset, _, _ = prepare_data(model_name, train_texts, train_labels)
     -    trainer = prepare_fine_tuning(model_name, train_dataset)
     +    train_dataset, _, _, tokenizer = prepare_data(model_name, train_texts, train_labels)
     +    trainer = prepare_fine_tuning(model_name, tokenizer, train_dataset)
          trainer.train()
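
     A side effect of passing the tokenizer into Trainer, as this revision does, is that the tokenizer is saved alongside the model whenever the trainer writes a checkpoint, so a fine-tuned checkpoint can be reloaded together with its tokenizer. A hedged sketch of that workflow; the output path is a placeholder, and trainer and tokenizer are assumed to come from the revised functions above:

     # Sketch, not part of the gist: save and reload model plus tokenizer from one directory.
     trainer.save_model('./results/final')  # also writes tokenizer files when tokenizer was passed to Trainer
     tokenizer = PegasusTokenizer.from_pretrained('./results/final')
     model = PegasusForConditionalGeneration.from_pretrained('./results/final')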
  4. @jiahao87 revised this gist Feb 2, 2021. 1 changed file with 3 additions and 3 deletions.

     pegasus_fine_tune.py: 3 additions & 3 deletions.

     @@ -72,8 +72,8 @@ def prepare_fine_tuning(model_name, train_dataset, val_dataset=None, freeze_encoder=False, output_dir='./results'):
          training_args = TrainingArguments(
              output_dir=output_dir, # output directory
              num_train_epochs=2000, # total number of training epochs
     -        per_device_train_batch_size=1, # batch size per device during training
     -        per_device_eval_batch_size=1, # batch size for evaluation
     +        per_device_train_batch_size=1, # batch size per device during training, can increase if memory allows
     +        per_device_eval_batch_size=1, # batch size for evaluation, can increase if memory allows
              save_steps=500, # number of updates steps before checkpoint saves
              save_total_limit=5, # limit the total amount of checkpoints and deletes the older checkpoints
              evaluation_strategy='steps', # evaluation strategy to adopt during training

     @@ -95,7 +95,7 @@ def prepare_fine_tuning(model_name, train_dataset, val_dataset=None, freeze_encoder=False, output_dir='./results'):
          training_args = TrainingArguments(
              output_dir=output_dir, # output directory
              num_train_epochs=2000, # total number of training epochs
     -        per_device_train_batch_size=1, # batch size per device during training
     +        per_device_train_batch_size=1, # batch size per device during training, can increase if memory allows
              save_steps=500, # number of updates steps before checkpoint saves
              save_total_limit=5, # limit the total amount of checkpoints and deletes the older checkpoints
              warmup_steps=500, # number of warmup steps for learning rate scheduler
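
     The revised comments point out that the batch sizes can be raised if memory allows. When memory is the limiting factor, gradient accumulation (not used in the gist) is another way to raise the effective batch size; a hedged sketch with a made-up accumulation value:

     # Sketch, not part of the gist: keep the per-device batch size at 1 but accumulate
     # gradients over 8 steps, giving an effective training batch size of 8.
     training_args = TrainingArguments(
         output_dir='./results',
         num_train_epochs=2000,
         per_device_train_batch_size=1,
         gradient_accumulation_steps=8,  # hypothetical value; tune to available memory
         save_steps=500,
         save_total_limit=5,
         warmup_steps=500,
         weight_decay=0.01,
         logging_dir='./logs',
         logging_steps=10,
     )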
  5. @jiahao87 created this gist Feb 1, 2021. 1 file added: pegasus_fine_tune.py, 126 additions.

     """Script for fine-tuning Pegasus
     Example usage:
     # use XSum dataset as example, with first 1000 docs as training data
     from datasets import load_dataset
     dataset = load_dataset("xsum")
     train_texts, train_labels = dataset['train']['document'][:1000], dataset['train']['summary'][:1000]
     # use Pegasus Large model as base for fine-tuning
     model_name = 'google/pegasus-large'
     train_dataset, _, _ = prepare_data(model_name, train_texts, train_labels)
     trainer = prepare_fine_tuning(model_name, train_dataset)
     trainer.train()
     Reference:
     https://huggingface.co/transformers/master/custom_datasets.html
     """

     from transformers import PegasusForConditionalGeneration, PegasusTokenizer, Trainer, TrainingArguments
     import torch


     class PegasusDataset(torch.utils.data.Dataset):
         def __init__(self, encodings, labels):
             self.encodings = encodings
             self.labels = labels

         def __getitem__(self, idx):
             item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
             item['labels'] = torch.tensor(self.labels['input_ids'][idx])  # torch.tensor(self.labels[idx])
             return item

         def __len__(self):
             return len(self.labels)


     def prepare_data(model_name,
                      train_texts, train_labels,
                      val_texts=None, val_labels=None,
                      test_texts=None, test_labels=None):
         """
         Prepare input data for model fine-tuning
         """
         tokenizer = PegasusTokenizer.from_pretrained(model_name)

         prepare_val = False if val_texts is None or val_labels is None else True
         prepare_test = False if test_texts is None or test_labels is None else True

         def tokenize_data(texts, labels):
             encodings = tokenizer(texts, truncation=True, padding=True)
             decodings = tokenizer(labels, truncation=True, padding=True)
             dataset_tokenized = PegasusDataset(encodings, decodings)
             return dataset_tokenized

         train_dataset = tokenize_data(train_texts, train_labels)
         val_dataset = tokenize_data(val_texts, val_labels) if prepare_val else None
         test_dataset = tokenize_data(test_texts, test_labels) if prepare_test else None

         return train_dataset, val_dataset, test_dataset


     def prepare_fine_tuning(model_name, train_dataset, val_dataset=None, freeze_encoder=False, output_dir='./results'):
         """
         Prepare configurations and base model for fine-tuning
         """
         torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
         model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)

         if freeze_encoder:
             for param in model.model.encoder.parameters():
                 param.requires_grad = False

         if val_dataset is not None:
             training_args = TrainingArguments(
                 output_dir=output_dir,  # output directory
                 num_train_epochs=2000,  # total number of training epochs
                 per_device_train_batch_size=1,  # batch size per device during training
                 per_device_eval_batch_size=1,  # batch size for evaluation
                 save_steps=500,  # number of update steps before checkpoint saves
                 save_total_limit=5,  # limit the total number of checkpoints; older checkpoints are deleted
                 evaluation_strategy='steps',  # evaluation strategy to adopt during training
                 eval_steps=100,  # number of update steps before evaluation
                 warmup_steps=500,  # number of warmup steps for learning rate scheduler
                 weight_decay=0.01,  # strength of weight decay
                 logging_dir='./logs',  # directory for storing logs
                 logging_steps=10,
             )

             trainer = Trainer(
                 model=model,  # the instantiated 🤗 Transformers model to be trained
                 args=training_args,  # training arguments, defined above
                 train_dataset=train_dataset,  # training dataset
                 eval_dataset=val_dataset  # evaluation dataset
             )

         else:
             training_args = TrainingArguments(
                 output_dir=output_dir,  # output directory
                 num_train_epochs=2000,  # total number of training epochs
                 per_device_train_batch_size=1,  # batch size per device during training
                 save_steps=500,  # number of update steps before checkpoint saves
                 save_total_limit=5,  # limit the total number of checkpoints; older checkpoints are deleted
                 warmup_steps=500,  # number of warmup steps for learning rate scheduler
                 weight_decay=0.01,  # strength of weight decay
                 logging_dir='./logs',  # directory for storing logs
                 logging_steps=10,
             )

             trainer = Trainer(
                 model=model,  # the instantiated 🤗 Transformers model to be trained
                 args=training_args,  # training arguments, defined above
                 train_dataset=train_dataset,  # training dataset
             )

         return trainer


     if __name__ == '__main__':
         # use XSum dataset as example, with first 1000 docs as training data
         from datasets import load_dataset
         dataset = load_dataset("xsum")
         train_texts, train_labels = dataset['train']['document'][:1000], dataset['train']['summary'][:1000]

         # use Pegasus Large model as base for fine-tuning
         model_name = 'google/pegasus-large'
         train_dataset, _, _ = prepare_data(model_name, train_texts, train_labels)
         trainer = prepare_fine_tuning(model_name, train_dataset)
         trainer.train()
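
     After trainer.train() has run, checkpoints are written under output_dir ('./results' by default). A hedged usage sketch for summarising a new document with one of those checkpoints; the checkpoint directory name below is hypothetical, so substitute one that actually exists:

     # Sketch, not part of the gist: load a fine-tuned checkpoint and generate a summary.
     from transformers import PegasusForConditionalGeneration, PegasusTokenizer
     import torch

     checkpoint_dir = './results/checkpoint-500'  # hypothetical; pick a real checkpoint directory
     device = 'cuda' if torch.cuda.is_available() else 'cpu'

     tokenizer = PegasusTokenizer.from_pretrained('google/pegasus-large')
     model = PegasusForConditionalGeneration.from_pretrained(checkpoint_dir).to(device)

     text = "Paste the document to summarise here."
     batch = tokenizer(text, truncation=True, padding='longest', return_tensors='pt').to(device)
     summary_ids = model.generate(**batch)
     print(tokenizer.batch_decode(summary_ids, skip_special_tokens=True)[0])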