Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Select an option

  • Save suakow/0837abd971cb811ee46ccfae1606c577 to your computer and use it in GitHub Desktop.

Select an option

Save suakow/0837abd971cb811ee46ccfae1606c577 to your computer and use it in GitHub Desktop.
# Script metadata: author attribution used by nothing else in this file.
__author__ = "Puri Phakmongkol"
__author_email__ = "me@puri.in.th"
# Banner string (not a module docstring — it follows code, so it is a bare
# string expression kept purely for documentation / the launch command).
"""
* Thesis
*
* Created date : 15/06/2021
*
+ o + o
+ o + +
o +
o + + +
+ o o + o
-_-_-_-_-_-_-_,------, o
_-_-_-_-_-_-_-| /\_/\
-_-_-_-_-_-_-~|__( ^ .^) + +
_-_-_-_-_-_-_-"" ""
+ o o + o
+ +
o o _-_-_-_- NSC Baseline v1
o +
+ + o o +
$ srun -v --gres=gpu:1 --pty python nsc_private-baseline-v1.py
"""
print('----- Starting script -----')
# Colab-style parameter cell. The trailing `#@param {...}` comments are
# functional in Colab notebooks — do not edit them.
#@title Param
param_training_name = "nsc_private-t5_b-qa-step2-v1" #@param {type:"string"}
param_description = "t5-v1:NSC-private" #@param {type:"string"}
# Batch size of 1: this script only runs beam-search evaluation, one example at a time.
param_batch_size = 1#@param {type:"integer"}
param_max_length = 512#@param {type:"integer"}
param_doc_stride = 128#@param {type:"integer"}
param_model_name = 'google/mt5-base'
#@markdown -----
#@markdown Pretraining Parameters
# NOTE(review): lr/epoch/weight-decay are only used by the commented-out
# Trainer block further down — unused in this evaluation-only run.
param_pretrain_lr = 5e-6#@param {type:"number"}
param_pretrain_epoch = 25#@param {type:"integer"}
param_weight_decay = 0.01#@param {type:"number"}
#@markdown -----
#@markdown Wandb
param_wandb_project = "thaiqa-semi-v9" #@param {type:"string"}
param_tags = ['t5_b-qa', 'nsc_span'] #@param {type:"raw"}
param_wandb_api_key = "xxxxxxx" #@param {type:"string"}
#@markdown -----
#@markdown Colab
# Base directory for checkpoints, trainer state and result files.
param_notebook_path = "/data/users/ppuri/thesis/thaiqa-semi/finetune/semi-v10/" #@param {type:"string"}
# Third-party imports: transformers / datasets / thai2transformers stack.
import transformers
import numpy as np
from tqdm.auto import tqdm
import torch
#datasets
from datasets import load_dataset
#transformers
from transformers import (
CamembertTokenizerFast,
TrainingArguments,
Trainer,
T5Tokenizer,
MT5ForConditionalGeneration
)
#thai2transformers
import thai2transformers
from thai2transformers.preprocess import process_transformers
from thai2transformers.metrics import (
classification_metrics,
multilabel_classification_metrics,
)
from thai2transformers.tokenizers import (
ThaiRobertaTokenizer,
ThaiWordsNewmmTokenizer,
ThaiWordsSyllableTokenizer,
FakeSefrCutTokenizer,
SEFR_SPLIT_TOKEN
)
import os
# import wandb
from datasets import load_dataset, load_metric, Dataset, DatasetDict
import functools
import random
# Fixed seed for reproducibility of any randomized steps downstream.
random.seed(5555)
"""
* Wamdb Configuration
"""
# print('Configuration Wandb...')
# os.environ['WANDB_PROJECT'] = param_wandb_project
# os.environ["WANDB_API_KEY"] = param_wandb_api_key
# wandb.init(project=param_wandb_project,
# name=param_training_name,
# tags=param_tags,
# group='wangchanberta')
# param_config = {
# 'batchsize' : param_batch_size,
# 'max_length' : param_max_length,
# 'doc_stride' : param_doc_stride,
# 'learning_rate' : param_pretrain_lr,
# 'epoch' : param_pretrain_epoch,
# 'weight_decay' : param_weight_decay,
# }
# wandb.config.update(param_config)
# wandb.log({'run_name': param_training_name, 'description': param_description})
# wandb.log({
# 'params' : param_config
# })
batch_size = param_batch_size
# tokenizer = CamembertTokenizerFast.from_pretrained('airesearch/wangchanberta-base-att-spm-uncased', revision='main', model_max_length=416)
tokenizer = T5Tokenizer.from_pretrained(param_model_name, model_max_length=512)
import json
print('Strat import dataset...')
train_datasets = DatasetDict.load_from_disk('/data/users/ppuri/thesis/thaiqa-semi/data/content/nsc_private/')
"""
* Preprocessing
"""
print('Strat preprocessing...')
import pythainlp
def preprocessing_normalization(example):
    """Run pythainlp text normalization over the question and context fields.

    Mutates and returns `example` (datasets.map style).
    """
    for field in ('question', 'context'):
        example[field] = pythainlp.util.normalize(example[field])
    return example
def lowercase_example(example):
    """Lower-case the question and context fields.

    Mutates and returns `example` (datasets.map style).
    """
    for field in ('question', 'context'):
        example[field] = example[field].lower()
    return example
def preprocessing_NSC(example):
    """Replace non-breaking spaces (U+00A0) with plain spaces.

    Applies to the question, the context, and only the FIRST gold answer
    string (the downstream T5 feature builder only uses answer[0]).
    Mutates and returns `example` (datasets.map style).
    """
    nbsp = '\xa0'
    example['question'] = example['question'].replace(nbsp, ' ')
    example['context'] = example['context'].replace(nbsp, ' ')
    answer_list = example['answers']['answer']
    answer_list[0] = answer_list[0].replace(nbsp, ' ')
    return example
# Apply the text-cleanup passes in order: lower-casing, pythainlp
# normalization, then NSC-specific non-breaking-space replacement.
train_datasets = train_datasets.map(lowercase_example)
train_datasets = train_datasets.map(preprocessing_normalization)
train_datasets = train_datasets.map(preprocessing_NSC)
def preprocess_t5_feature(example):
    """Build text-to-text fields for T5-style QA.

    input_text:  'q: <question> c: <context> </s>'
    target_text: '<first gold answer> </s>'
    Mutates and returns `example` (datasets.map style).
    """
    question = example['question']
    context = example['context']
    first_answer = example['answers']['answer'][0]
    example['input_text'] = f'q: {question} c: {context} </s>'
    example['target_text'] = f'{first_answer} </s>'
    return example
# Materialize input_text / target_text columns for every split.
train_datasets = train_datasets.map(preprocess_t5_feature)
def preprocess_convert_feature(example):
    """Tokenize the text-to-text fields into model features.

    Encodes `input_text` (padded/truncated to 512 tokens) into input_ids and
    attention_mask, and `target_text` (padded/truncated to 30 tokens) into
    labels. Relies on the module-level `tokenizer`.
    Mutates and returns `example` (datasets.map style).
    """
    input_encodings = tokenizer(example['input_text'], max_length=512, padding='max_length', truncation=True)
    output_encodings = tokenizer(example['target_text'], max_length=30, padding='max_length', truncation=True)
    example['input_ids'] = input_encodings['input_ids']
    example['attention_mask'] = input_encodings['attention_mask']
    # FIX: labels must come from the TARGET encoding. The original assigned
    # input_encodings['input_ids'] here and silently discarded output_encodings,
    # which would make any seq2seq training on these features learn to copy
    # the input. (Harmless in this generate-only script, but wrong.)
    example['labels'] = output_encodings['input_ids']
    return example
# Tokenize every split, then drop the raw text columns so only tensor-ready
# features remain.
train_tokenized_datasets = train_datasets.map(preprocess_convert_feature)
train_tokenized_datasets = train_tokenized_datasets.remove_columns(['answers', 'context', 'input_text', 'question', 'target_text'])
# NOTE(review): max_length / doc_stride / pad_on_right are only consumed by
# the commented-out span-extraction code below — dead in this run.
max_length = param_max_length
doc_stride = param_doc_stride
pad_on_right = tokenizer.padding_side == "right"
print(train_tokenized_datasets)
train_tokenized_datasets.set_format(type='pt')
# Materialize the test split as a list of dicts so DataLoader can batch it
# (batch_size=1 to match the one-example-at-a-time generation loop below).
test_dataset_pt = [ _ for _ in train_tokenized_datasets['test']]
test_dataloader = torch.utils.data.DataLoader(test_dataset_pt, batch_size=1)
# SQuAD-metric style gold references, built from the UN-tokenized test split.
references = [{"id": str(ex["question_id"]),
"answers": {'text': ex['answers']['answer'],
'answer_start':ex['answers']['answer_begin_position']}} for ex in train_datasets["test"]]
print(references)
# def prepare_train_features(examples):
# # Tokenize our examples with truncation and padding, but keep the overflows using a stride. This results
# # in one example possible giving several features when a context is long, each of those features having a
# # context that overlaps a bit the context of the previous feature.
# tokenized_examples = tokenizer(
# examples["question" if pad_on_right else "context"],
# examples["context" if pad_on_right else "question"],
# truncation="only_second" if pad_on_right else "only_first",
# max_length=max_length,
# stride=doc_stride,
# return_overflowing_tokens=True,
# return_offsets_mapping=True,
# padding="max_length",
# )
# # Since one example might give us several features if it has a long context, we need a map from a feature to
# # its corresponding example. This key gives us just that.
# sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
# # The offset mappings will give us a map from token to character position in the original context. This will
# # help us compute the start_positions and end_positions.
# offset_mapping = tokenized_examples.pop("offset_mapping")
# # Let's label those examples!
# tokenized_examples["start_positions"] = []
# tokenized_examples["end_positions"] = []
# for i, offsets in enumerate(offset_mapping):
# # We will label impossible answers with the index of the <s> token.
# input_ids = tokenized_examples["input_ids"][i]
# cls_index = input_ids.index(tokenizer.cls_token_id)
# # Grab the sequence corresponding to that example (to know what is the context and what is the question).
# sequence_ids = tokenized_examples.sequence_ids(i)
# # One example can give several spans, this is the index of the example containing this span of text.
# sample_index = sample_mapping[i]
# answers = examples["answers"][sample_index]
# # If no answers are given, set the cls_index as answer.
# if len(answers["answer_begin_position"]) == 0:
# tokenized_examples["start_positions"].append(cls_index)
# tokenized_examples["end_positions"].append(cls_index)
# else:
# # Start/end character index of the answer in the text.
# start_char = answers["answer_begin_position"][0]
# end_char = start_char + len(answers["answer"][0]) + 1
# # Start token index of the current span in the text.
# token_start_index = 0
# while sequence_ids[token_start_index] != (1 if pad_on_right else 0):
# token_start_index += 1
# # End token index of the current span in the text.
# token_end_index = len(input_ids) - 1
# while sequence_ids[token_end_index] != (1 if pad_on_right else 0):
# token_end_index -= 1
# # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
# if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
# tokenized_examples["start_positions"].append(cls_index)
# tokenized_examples["end_positions"].append(cls_index)
# else:
# # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
# # Note: we could go after the last offset if the answer is the last word (edge case).
# while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
# token_start_index += 1
# tokenized_examples["start_positions"].append(token_start_index - 1)
# while offsets[token_end_index][1] >= end_char:
# token_end_index -= 1
# tokenized_examples["end_positions"].append(token_end_index + 1)
# return tokenized_examples
# train_tokenized_datasets = train_datasets.map(prepare_train_features, batched=True, remove_columns=train_datasets["train"].column_names)
# train_tokenized_datasets = train_tokenized_datasets.filter(lambda _: _['start_positions'] != 0 and _['end_positions'] != 0)
"""
* Load Best Model
"""
training_states = json.loads(open(f"{param_notebook_path}{param_training_name}/trainer_state.json", 'r').read())
all_training_states = [ _ for _ in training_states['log_history'] if _.get('eval_loss') != None ]
best_state = sorted(all_training_states, key=lambda k: k['eval_loss'])[0]
print(best_state)
"""
* Define model
"""
print('Defind model...')
torch.cuda.empty_cache()
from transformers import TrainingArguments, Trainer
# model = AutoModelForQuestionAnswering.from_pretrained('airesearch/wangchanberta-base-att-spm-uncased', revision='main')
model = MT5ForConditionalGeneration.from_pretrained(f'{param_notebook_path}{param_training_name}/checkpoint-%s/'%(best_state['step']))
model.to('cuda')
model.eval()
# args = TrainingArguments(
# param_training_name,
# evaluation_strategy = "epoch",
# save_strategy = 'epoch',
# learning_rate = param_pretrain_lr,
# per_device_train_batch_size = batch_size,
# per_device_eval_batch_size = batch_size,
# num_train_epochs = param_pretrain_epoch,
# weight_decay = param_weight_decay,
# report_to = 'wandb',
# run_name = param_training_name,
# logging_dir=f'{param_notebook_path}{param_training_name}/logs',
# logging_strategy='epoch',
# )
# from transformers import default_data_collator
# data_collator = default_data_collator
# trainer = Trainer(
# model,
# args,
# train_dataset=train_tokenized_datasets["train"],
# eval_dataset=train_tokenized_datasets["validation"],
# data_collator=data_collator,
# tokenizer=tokenizer,
# )
# y_pred = trainer.predict(train_tokenized_datasets['test'])
import re
import pandas as pd
import collections
from tqdm.auto import tqdm
test_result = {}
# Custom SQuAD-style metric using newmm Thai word segmentation.
thai_metric = load_metric('/data/users/ppuri/thesis/thaiqa_squad_metric/thai_squad_newmm.py')
# Post-processing pattern: strip any character that is not '.', Thai
# (U+0E00-U+0E7F), ASCII alphanumeric, apostrophe or space; also strip a
# leading/trailing apostrophe and doubled apostrophes.
pattern = re.compile(r"[^.\u0E00-\u0E7F0-9a-zA-Z' ]|^'|'$|''")
final_predictions = collections.OrderedDict()
# Beam-search decode each test example (batch_size=1) and collect the
# predicted answer text keyed by question_id.
for batch in tqdm(test_dataloader):
    beam_outputs = model.generate(
        input_ids=batch['input_ids'].to('cuda'),
        attention_mask=batch['attention_mask'].to('cuda'),
        max_length=50,
        num_beams=5,
        no_repeat_ngram_size=2,
        early_stopping=True
    )
    text = tokenizer.decode(beam_outputs[0])
    # Strip sentencepiece/special tags like </s> or <pad>.
    # FIX: raw string for the regex — bare '\w' in a normal string relies on
    # a deprecated invalid-escape fallback.
    text = re.sub(r'</?\w*>', '', text).strip()
    final_predictions[batch['question_id'].item()] = text
# Second prediction set with character-level post-processing applied
# (strip non-Thai/non-alnum characters via `pattern`).
final_predictions_test_2 = [ (qid, re.sub(pattern, '', pred).strip()) for qid, pred in final_predictions.items() ]
# Score the raw predictions against the gold references built earlier.
formatted_predictions = [{"id": str(k), "prediction_text": v} for k, v in final_predictions.items()]
e = thai_metric.compute(predictions=formatted_predictions, references=references)
# Score the post-processed predictions; the references are rebuilt
# identically to the earlier construction (same test split, same fields).
formatted_predictions = [{"id": str(k), "prediction_text": v} for k, v in final_predictions_test_2]
references = [{"id": str(ex["question_id"]),
               "answers": {'text': ex['answers']['answer'],
                           'answer_start': ex['answers']['answer_begin_position']}} for ex in train_datasets["test"]]
e2 = thai_metric.compute(predictions=formatted_predictions, references=references)
best_model_path = f'{param_notebook_path}{param_training_name}/checkpoint-%s/'%(best_state['step'])
test_result['best_epoch'] = {
    'wo_post' : e,      # scores without post-processing
    'w_post' : e2,      # scores with post-processing
    'best_state_path' : best_model_path,
    'best_state_detail' : best_state
}
print(test_result)
# FIX: use a context manager — the original open(...).write() leaked the handle
# and never guaranteed a flush/close.
with open(f'{param_notebook_path}{param_training_name}/test_result.json', 'w') as result_file:
    result_file.write(json.dumps(test_result))
# Per-example error analysis: EM/F1 per prediction plus positional
# diagnostics (where in the context the true/predicted answer sits).
pred_ans = []
test_split = train_datasets['test']
for idx in range(len(test_split)):
    example = test_split[idx]
    context = example['context']
    context_len = len(context)
    pred_text = formatted_predictions[idx]['prediction_text']
    true_text = references[idx]['answers']['text'][0]
    true_start = references[idx]['answers']['answer_start'][0]
    # Hoisted: the original recomputed context.find(pred_text) up to three
    # times and re-indexed train_datasets['test'][_] six times per row.
    pred_start = context.find(pred_text)
    metric_compute = thai_metric.compute(
        predictions=[{'id': '1', 'prediction_text': pred_text},],
        references=[{'answers': {'answer_start': [1], 'text': [true_text]}, 'id': '1'},])
    pred_ans.append({
        'id' : formatted_predictions[idx]['id'],
        'y_pred' : pred_text,
        'y_true_text' : true_text,
        'em' : metric_compute['exact_match'],
        'f1' : metric_compute['f1'],
        'y_true_start' : true_start,
        'y_pred_start' : pred_start,            # -1 when not found verbatim
        'question' : example['question'],
        'context' : context,
        'len_context' : context_len,
        # Relative positions in [0, 1); pred_pos falls back to 0 when the
        # prediction does not occur verbatim in the context.
        'true_pos' : true_start / context_len,
        'pred_pos' : pred_start / context_len if pred_start != -1 else 0
    })
pred_df = pd.DataFrame(pred_ans)
print(pred_df.head())
pred_df.to_csv(f'{param_notebook_path}{param_training_name}/pred.csv')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment