Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Select an option

  • Save suakow/0837abd971cb811ee46ccfae1606c577 to your computer and use it in GitHub Desktop.

Select an option

Save suakow/0837abd971cb811ee46ccfae1606c577 to your computer and use it in GitHub Desktop.
# Script metadata: author attribution used by nothing else in this file.
__author__ = "Puri Phakmongkol"
__author_email__ = "me@puri.in.th"
# Banner string (not a module docstring — it follows code, so it is a bare
# string expression kept purely for documentation / the launch command).
"""
* Thesis
*
* Created date : 15/06/2021
*
+ o + o
+ o + +
o +
o + + +
+ o o + o
-_-_-_-_-_-_-_,------, o
_-_-_-_-_-_-_-| /\_/\
-_-_-_-_-_-_-~|__( ^ .^) + +
_-_-_-_-_-_-_-"" ""
+ o o + o
+ +
o o _-_-_-_- NSC Baseline v1
o +
+ + o o +
$ srun -v --gres=gpu:1 --pty python nsc_private-baseline-v1.py
"""
print('----- Starting script -----')
# Colab-style parameter cell. The trailing `#@param {...}` comments are
# functional in Colab notebooks — do not edit them.
#@title Param
param_training_name = "nsc_private-t5_b-qa-step2-v1" #@param {type:"string"}
param_description = "t5-v1:NSC-private" #@param {type:"string"}
# Batch size of 1: this script only runs beam-search evaluation, one example at a time.
param_batch_size = 1#@param {type:"integer"}
param_max_length = 512#@param {type:"integer"}
param_doc_stride = 128#@param {type:"integer"}
param_model_name = 'google/mt5-base'
#@markdown -----
#@markdown Pretraining Parameters
# NOTE(review): lr/epoch/weight-decay are only used by the commented-out
# Trainer block further down — unused in this evaluation-only run.
param_pretrain_lr = 5e-6#@param {type:"number"}
param_pretrain_epoch = 25#@param {type:"integer"}
param_weight_decay = 0.01#@param {type:"number"}
#@markdown -----
#@markdown Wandb
param_wandb_project = "thaiqa-semi-v9" #@param {type:"string"}
param_tags = ['t5_b-qa', 'nsc_span'] #@param {type:"raw"}
param_wandb_api_key = "xxxxxxx" #@param {type:"string"}
#@markdown -----
#@markdown Colab
# Base directory for checkpoints, trainer state and result files.
param_notebook_path = "/data/users/ppuri/thesis/thaiqa-semi/finetune/semi-v10/" #@param {type:"string"}
# Third-party imports: transformers / datasets / thai2transformers stack.
import transformers
import numpy as np
from tqdm.auto import tqdm
import torch
#datasets
from datasets import load_dataset
#transformers
from transformers import (
CamembertTokenizerFast,
TrainingArguments,
Trainer,
T5Tokenizer,
MT5ForConditionalGeneration
)
#thai2transformers
import thai2transformers
from thai2transformers.preprocess import process_transformers
from thai2transformers.metrics import (
classification_metrics,
multilabel_classification_metrics,
)
from thai2transformers.tokenizers import (
ThaiRobertaTokenizer,
ThaiWordsNewmmTokenizer,
ThaiWordsSyllableTokenizer,
FakeSefrCutTokenizer,
SEFR_SPLIT_TOKEN
)
import os
# import wandb
from datasets import load_dataset, load_metric, Dataset, DatasetDict
import functools
import random
# Fixed seed for reproducibility of any randomized steps downstream.
random.seed(5555)
"""
* Wamdb Configuration
"""
# print('Configuration Wandb...')
# os.environ['WANDB_PROJECT'] = param_wandb_project
# os.environ["WANDB_API_KEY"] = param_wandb_api_key
# wandb.init(project=param_wandb_project,
# name=param_training_name,
# tags=param_tags,
# group='wangchanberta')
# param_config = {
# 'batchsize' : param_batch_size,
# 'max_length' : param_max_length,
# 'doc_stride' : param_doc_stride,
# 'learning_rate' : param_pretrain_lr,
# 'epoch' : param_pretrain_epoch,
# 'weight_decay' : param_weight_decay,
# }
# wandb.config.update(param_config)
# wandb.log({'run_name': param_training_name, 'description': param_description})
# wandb.log({
# 'params' : param_config
# })
batch_size = param_batch_size
# tokenizer = CamembertTokenizerFast.from_pretrained('airesearch/wangchanberta-base-att-spm-uncased', revision='main', model_max_length=416)
tokenizer = T5Tokenizer.from_pretrained(param_model_name, model_max_length=512)
import json
print('Strat import dataset...')
train_datasets = DatasetDict.load_from_disk('/data/users/ppuri/thesis/thaiqa-semi/data/content/nsc_private/')
"""
* Preprocessing
"""
print('Strat preprocessing...')
import pythainlp
def preprocessing_normalization(example):
    """Run pythainlp text normalization over the question and context fields.

    Mutates and returns `example` (datasets.map style).
    """
    for field in ('question', 'context'):
        example[field] = pythainlp.util.normalize(example[field])
    return example
def lowercase_example(example):
    """Lower-case the question and context fields.

    Mutates and returns `example` (datasets.map style).
    """
    for field in ('question', 'context'):
        example[field] = example[field].lower()
    return example
def preprocessing_NSC(example):
    """Replace non-breaking spaces (U+00A0) with plain spaces.

    Applies to the question, the context, and only the FIRST gold answer
    string (the downstream T5 feature builder only uses answer[0]).
    Mutates and returns `example` (datasets.map style).
    """
    nbsp = '\xa0'
    example['question'] = example['question'].replace(nbsp, ' ')
    example['context'] = example['context'].replace(nbsp, ' ')
    answer_list = example['answers']['answer']
    answer_list[0] = answer_list[0].replace(nbsp, ' ')
    return example
# Apply the text-cleanup passes in order: lower-casing, pythainlp
# normalization, then NSC-specific non-breaking-space replacement.
train_datasets = train_datasets.map(lowercase_example)
train_datasets = train_datasets.map(preprocessing_normalization)
train_datasets = train_datasets.map(preprocessing_NSC)
def preprocess_t5_feature(example):
    """Build text-to-text fields for T5-style QA.

    input_text:  'q: <question> c: <context> </s>'
    target_text: '<first gold answer> </s>'
    Mutates and returns `example` (datasets.map style).
    """
    question = example['question']
    context = example['context']
    first_answer = example['answers']['answer'][0]
    example['input_text'] = f'q: {question} c: {context} </s>'
    example['target_text'] = f'{first_answer} </s>'
    return example
# Materialize input_text / target_text columns for every split.
train_datasets = train_datasets.map(preprocess_t5_feature)
def preprocess_convert_feature(example):
    """Tokenize the text-to-text fields into model features.

    Encodes `input_text` (padded/truncated to 512 tokens) into input_ids and
    attention_mask, and `target_text` (padded/truncated to 30 tokens) into
    labels. Relies on the module-level `tokenizer`.
    Mutates and returns `example` (datasets.map style).
    """
    input_encodings = tokenizer(example['input_text'], max_length=512, padding='max_length', truncation=True)
    output_encodings = tokenizer(example['target_text'], max_length=30, padding='max_length', truncation=True)
    example['input_ids'] = input_encodings['input_ids']
    example['attention_mask'] = input_encodings['attention_mask']
    # FIX: labels must come from the TARGET encoding. The original assigned
    # input_encodings['input_ids'] here and silently discarded output_encodings,
    # which would make any seq2seq training on these features learn to copy
    # the input. (Harmless in this generate-only script, but wrong.)
    example['labels'] = output_encodings['input_ids']
    return example
# Tokenize every split, then drop the raw text columns so only tensor-ready
# features remain.
train_tokenized_datasets = train_datasets.map(preprocess_convert_feature)
train_tokenized_datasets = train_tokenized_datasets.remove_columns(['answers', 'context', 'input_text', 'question', 'target_text'])
# NOTE(review): max_length / doc_stride / pad_on_right are only consumed by
# the commented-out span-extraction code below — dead in this run.
max_length = param_max_length
doc_stride = param_doc_stride
pad_on_right = tokenizer.padding_side == "right"
print(train_tokenized_datasets)
train_tokenized_datasets.set_format(type='pt')
# Materialize the test split as a list of dicts so DataLoader can batch it
# (batch_size=1 to match the one-example-at-a-time generation loop below).
test_dataset_pt = [ _ for _ in train_tokenized_datasets['test']]
test_dataloader = torch.utils.data.DataLoader(test_dataset_pt, batch_size=1)
# SQuAD-metric style gold references, built from the UN-tokenized test split.
references = [{"id": str(ex["question_id"]),
"answers": {'text': ex['answers']['answer'],
'answer_start':ex['answers']['answer_begin_position']}} for ex in train_datasets["test"]]
print(references)
# def prepare_train_features(examples):
# # Tokenize our examples with truncation and padding, but keep the overflows using a stride. This results
# # in one example possible giving several features when a context is long, each of those features having a
# # context that overlaps a bit the context of the previous feature.
# tokenized_examples = tokenizer(
# examples["question" if pad_on_right else "context"],
# examples["context" if pad_on_right else "question"],
# truncation="only_second" if pad_on_right else "only_first",
# max_length=max_length,
# stride=doc_stride,
# return_overflowing_tokens=True,
# return_offsets_mapping=True,
# padding="max_length",
# )
# # Since one example might give us several features if it has a long context, we need a map from a feature to
# # its corresponding example. This key gives us just that.
# sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
# # The offset mappings will give us a map from token to character position in the original context. This will
# # help us compute the start_positions and end_positions.
# offset_mapping = tokenized_examples.pop("offset_mapping")
# # Let's label those examples!
# tokenized_examples["start_positions"] = []
# tokenized_examples["end_positions"] = []
# for i, offsets in enumerate(offset_mapping):
# # We will label impossible answers with the index of the <s> token.
# input_ids = tokenized_examples["input_ids"][i]
# cls_index = input_ids.index(tokenizer.cls_token_id)
# # Grab the sequence corresponding to that example (to know what is the context and what is the question).
# sequence_ids = tokenized_examples.sequence_ids(i)
# # One example can give several spans, this is the index of the example containing this span of text.
# sample_index = sample_mapping[i]
# answers = examples["answers"][sample_index]
# # If no answers are given, set the cls_index as answer.
# if len(answers["answer_begin_position"]) == 0:
# tokenized_examples["start_positions"].append(cls_index)
# tokenized_examples["end_positions"].append(cls_index)
# else:
# # Start/end character index of the answer in the text.
# start_char = answers["answer_begin_position"][0]
# end_char = start_char + len(answers["answer"][0]) + 1
# # Start token index of the current span in the text.
# token_start_index = 0
# while sequence_ids[token_start_index] != (1 if pad_on_right else 0):
# token_start_index += 1
# # End token index of the current span in the text.
# token_end_index = len(input_ids) - 1
# while sequence_ids[token_end_index] != (1 if pad_on_right else 0):
# token_end_index -= 1
# # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
# if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
# tokenized_examples["start_positions"].append(cls_index)
# tokenized_examples["end_positions"].append(cls_index)
# else:
# # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
# # Note: we could go after the last offset if the answer is the last word (edge case).
# while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
# token_start_index += 1
# tokenized_examples["start_positions"].append(token_start_index - 1)
# while offsets[token_end_index][1] >= end_char:
# token_end_index -= 1
# tokenized_examples["end_positions"].append(token_end_index + 1)
# return tokenized_examples
# train_tokenized_datasets = train_datasets.map(prepare_train_features, batched=True, remove_columns=train_datasets["train"].column_names)
# train_tokenized_datasets = train_tokenized_datasets.filter(lambda _: _['start_positions'] != 0 and _['end_positions'] != 0)
"""
* Load Best Model
"""
training_states = json.loads(open(f"{param_notebook_path}{param_training_name}/trainer_state.json", 'r').read())
all_training_states = [ _ for _ in training_states['log_history'] if _.get('eval_loss') != None ]
best_state = sorted(all_training_states, key=lambda k: k['eval_loss'])[0]
print(best_state)
"""
* Define model
"""
print('Defind model...')
torch.cuda.empty_cache()
from transformers import TrainingArguments, Trainer
# model = AutoModelForQuestionAnswering.from_pretrained('airesearch/wangchanberta-base-att-spm-uncased', revision='main')
model = MT5ForConditionalGeneration.from_pretrained(f'{param_notebook_path}{param_training_name}/checkpoint-%s/'%(best_state['step']))
model.to('cuda')
model.eval()
# args = TrainingArguments(
# param_training_name,
# evaluation_strategy = "epoch",
# save_strategy = 'epoch',
# learning_rate = param_pretrain_lr,
# per_device_train_batch_size = batch_size,
# per_device_eval_batch_size = batch_size,
# num_train_epochs = param_pretrain_epoch,
# weight_decay = param_weight_decay,
# report_to = 'wandb',
# run_name = param_training_name,
# logging_dir=f'{param_notebook_path}{param_training_name}/logs',
# logging_strategy='epoch',
# )
# from transformers import default_data_collator
# data_collator = default_data_collator
# trainer = Trainer(
# model,
# args,
# train_dataset=train_tokenized_datasets["train"],
# eval_dataset=train_tokenized_datasets["validation"],
# data_collator=data_collator,
# tokenizer=tokenizer,
# )
# y_pred = trainer.predict(train_tokenized_datasets['test'])
import re
import pandas as pd
import collections
from tqdm.auto import tqdm
test_result = {}
# Custom SQuAD-style metric using newmm Thai word segmentation.
thai_metric = load_metric('/data/users/ppuri/thesis/thaiqa_squad_metric/thai_squad_newmm.py')
# Post-processing pattern: strip any character that is not '.', Thai
# (U+0E00-U+0E7F), ASCII alphanumeric, apostrophe or space; also strip a
# leading/trailing apostrophe and doubled apostrophes.
pattern = re.compile(r"[^.\u0E00-\u0E7F0-9a-zA-Z' ]|^'|'$|''")
final_predictions = collections.OrderedDict()
# Beam-search decode each test example (batch_size=1) and collect the
# predicted answer text keyed by question_id.
for batch in tqdm(test_dataloader):
    beam_outputs = model.generate(
        input_ids=batch['input_ids'].to('cuda'),
        attention_mask=batch['attention_mask'].to('cuda'),
        max_length=50,
        num_beams=5,
        no_repeat_ngram_size=2,
        early_stopping=True
    )
    text = tokenizer.decode(beam_outputs[0])
    # Strip sentencepiece/special tags like </s> or <pad>.
    # FIX: raw string for the regex — bare '\w' in a normal string relies on
    # a deprecated invalid-escape fallback.
    text = re.sub(r'</?\w*>', '', text).strip()
    final_predictions[batch['question_id'].item()] = text
# Second prediction set with character-level post-processing applied
# (strip non-Thai/non-alnum characters via `pattern`).
final_predictions_test_2 = [ (qid, re.sub(pattern, '', pred).strip()) for qid, pred in final_predictions.items() ]
# Score the raw predictions against the gold references built earlier.
formatted_predictions = [{"id": str(k), "prediction_text": v} for k, v in final_predictions.items()]
e = thai_metric.compute(predictions=formatted_predictions, references=references)
# Score the post-processed predictions; the references are rebuilt
# identically to the earlier construction (same test split, same fields).
formatted_predictions = [{"id": str(k), "prediction_text": v} for k, v in final_predictions_test_2]
references = [{"id": str(ex["question_id"]),
               "answers": {'text': ex['answers']['answer'],
                           'answer_start': ex['answers']['answer_begin_position']}} for ex in train_datasets["test"]]
e2 = thai_metric.compute(predictions=formatted_predictions, references=references)
best_model_path = f'{param_notebook_path}{param_training_name}/checkpoint-%s/'%(best_state['step'])
test_result['best_epoch'] = {
    'wo_post' : e,      # scores without post-processing
    'w_post' : e2,      # scores with post-processing
    'best_state_path' : best_model_path,
    'best_state_detail' : best_state
}
print(test_result)
# FIX: use a context manager — the original open(...).write() leaked the handle
# and never guaranteed a flush/close.
with open(f'{param_notebook_path}{param_training_name}/test_result.json', 'w') as result_file:
    result_file.write(json.dumps(test_result))
# Per-example error analysis: EM/F1 per prediction plus positional
# diagnostics (where in the context the true/predicted answer sits).
pred_ans = []
test_split = train_datasets['test']
for idx in range(len(test_split)):
    example = test_split[idx]
    context = example['context']
    context_len = len(context)
    pred_text = formatted_predictions[idx]['prediction_text']
    true_text = references[idx]['answers']['text'][0]
    true_start = references[idx]['answers']['answer_start'][0]
    # Hoisted: the original recomputed context.find(pred_text) up to three
    # times and re-indexed train_datasets['test'][_] six times per row.
    pred_start = context.find(pred_text)
    metric_compute = thai_metric.compute(
        predictions=[{'id': '1', 'prediction_text': pred_text},],
        references=[{'answers': {'answer_start': [1], 'text': [true_text]}, 'id': '1'},])
    pred_ans.append({
        'id' : formatted_predictions[idx]['id'],
        'y_pred' : pred_text,
        'y_true_text' : true_text,
        'em' : metric_compute['exact_match'],
        'f1' : metric_compute['f1'],
        'y_true_start' : true_start,
        'y_pred_start' : pred_start,            # -1 when not found verbatim
        'question' : example['question'],
        'context' : context,
        'len_context' : context_len,
        # Relative positions in [0, 1); pred_pos falls back to 0 when the
        # prediction does not occur verbatim in the context.
        'true_pos' : true_start / context_len,
        'pred_pos' : pred_start / context_len if pred_start != -1 else 0
    })
pred_df = pd.DataFrame(pred_ans)
print(pred_df.head())
pred_df.to_csv(f'{param_notebook_path}{param_training_name}/pred.csv')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment