suakow · July 8, 2022 16:19
diff --git a/nsc_private-wc-qa-full-step2-v1.py b/nsc_private-wc-qa-full-step2-v1.py
 __author__ = "Puri Phakmongkol"
 __author_email__ = "me@puri.in.th"

 """
 * Thesis
 *
 * Created date : 15/06/2021
 *
 +      o     +              o
    +             o     +       +
 o          +
    o  +           +        +
 +        o     o       +        o
 -_-_-_-_-_-_-_,------,      o
 _-_-_-_-_-_-_-|   /\_/\
 -_-_-_-_-_-_-~|__( ^ .^)  +     +
 _-_-_-_-_-_-_-""  ""
 +      o         o   +       o
    +         +
 o      o  _-_-_-_- NSC QA Full Dataset - WangchanBERTa Step 2
    o           +
 +      +     o        o      +
 $ srun -v --gres=gpu:1 --pty python nsc_private-wc-qa-full-step2-v1.py
 """

 print('----- Starting script -----')

 # Run configuration for this fine-tuning step. The `#@title` / `#@param` / `#@markdown`
 # trailers look like Colab form annotations (presumably left over from a notebook
 # export); Python treats them as ordinary comments.
 #@title Param
 # Name of this run: used as the Trainer output directory, the wandb run name and the
 # sub-directory under `param_notebook_path` where the model and results are stored.
 param_training_name = "nsc_private-wc-qa-full-step2-v1" #@param {type:"string"}
 # Name of the earlier run whose `trained_model` directory is loaded as the starting weights.
 param_step1_model_name = "nsc_private-wc-qa-full-step1-v1"
 param_description = "baseline-v1:NSC-private" #@param {type:"string"}
 param_batch_size =  12#@param {type:"integer"}
 # Maximum number of tokens per feature (question + context window).
 param_max_length =  416#@param {type:"integer"}
 # Stride passed to the tokenizer when a long context is split into overlapping windows.
 param_doc_stride =  128#@param {type:"integer"}
 #@markdown -----
 #@markdown Pretraining Parameters
 param_pretrain_lr =  5e-6#@param {type:"number"}
 param_pretrain_epoch = 25#@param {type:"integer"}
 param_weight_decay =  0.01#@param {type:"number"}
 #@markdown -----
 #@markdown Wandb
 param_wandb_project = "thaiqa-semi-v10" #@param {type:"string"}
 param_tags = ['baseline', 'nsc_span'] #@param {type:"raw"}
 # NOTE(review): credential placeholder kept in source; a real key should not be committed here.
 param_wandb_api_key = "xxxxxxxxxxxxx" #@param {type:"string"}
 #@markdown -----
 #@markdown Colab
 # Root directory under which step-1 weights are read and this run's model, logs and results are written.
 param_notebook_path = "/data/users/ppuri/thesis/thaiqa-semi/finetune/semi-v10/" #@param {type:"string"}

 import transformers

 import numpy as np
 from tqdm.auto import tqdm
 import torch

 #datasets
 from datasets import load_dataset

 #transformers
 from transformers import (
    CamembertTokenizerFast,
    TrainingArguments,
    Trainer,
 )

 #thai2transformers
 import thai2transformers
 from thai2transformers.preprocess import process_transformers
 from thai2transformers.metrics import (
    classification_metrics, 
    multilabel_classification_metrics,
 )

 from thai2transformers.tokenizers import (
    ThaiRobertaTokenizer,
    ThaiWordsNewmmTokenizer,
    ThaiWordsSyllableTokenizer,
    FakeSefrCutTokenizer,
    SEFR_SPLIT_TOKEN
 )

 import os
 import wandb

 from datasets import load_dataset, load_metric, Dataset, DatasetDict

 import functools
 import random
 random.seed(5555)

 """
 * Wandb Configuration
 """
 # Configure Weights & Biases: export project and API key through the environment,
 # open the run, then record the hyper-parameters both as run config and as a logged entry.
 print('Configuration Wandb...')
 os.environ['WANDB_PROJECT'] = param_wandb_project
 os.environ["WANDB_API_KEY"] = param_wandb_api_key

 wandb.init(project=param_wandb_project, 
            name=param_training_name,
            tags=param_tags,
            group='wangchanberta')

 # Hyper-parameters of this run, mirrored from the `param_*` values above.
 param_config = {
    'batchsize' : param_batch_size,
    'max_length' : param_max_length,
    'doc_stride' : param_doc_stride,
    'learning_rate' : param_pretrain_lr,
    'epoch' : param_pretrain_epoch,
    'weight_decay' : param_weight_decay,
 }
 wandb.config.update(param_config)

 # The run name/description and the parameter dict are additionally sent through wandb.log.
 wandb.log({'run_name': param_training_name, 'description': param_description})
 wandb.log({
    'params' : param_config
 })

 batch_size = param_batch_size

 # Load the WangchanBERTa tokenizer. Its length limit is taken from `param_max_length`
 # (instead of a second hard-coded 416) so the tokenizer and the feature windows built
 # later cannot drift apart when the parameter is changed.
 tokenizer = CamembertTokenizerFast.from_pretrained(
     'airesearch/wangchanberta-base-att-spm-uncased',
     revision='main',
     model_max_length=param_max_length,
 )

 import json

 # Load the NSC dataset from a DatasetDict previously saved to disk; the code below
 # reads its 'train', 'validation' and 'test' splits.
 print('Strat import dataset...')
 base_datasets = DatasetDict.load_from_disk('/data/users/ppuri/thesis/thaiqa-semi/data/content/nsc_private/')
 # all_new_dataset_list = json.loads(open('/data/users/ppuri/thesis/thaiqa-semi/finetune/semi-v10/nsc-t5_l-question-gen-v1/gen_question-rev1.json', 'r').read())
 # print(len(all_new_dataset_list))

 # selected_id_list = json.loads(open('/data/users/ppuri/thesis/thaiqa-semi/finetune/semi-v10/nsc-baseline-v1/f1_100_id.csv', 'r').read())
 # print(len(selected_id_list))

 # new_dataset_list = [ _ for _ in all_new_dataset_list if _['question_id'] in selected_id_list ]

 # print(len(new_dataset_list))

 # new_context_list = [ _['context'] for _ in new_dataset_list ]
 # new_question_id_list = [ _['question_id'] for _ in new_dataset_list ]
 # new_question_list = [ _['question'] for _ in new_dataset_list ]
 # new_article_id_list = [ _['article_id'] for _ in new_dataset_list ]
 # new_answers_list = [ _['answers'] for _ in new_dataset_list ]

 # document_dict = Dataset.from_dict({
 #     'context' : new_context_list,
 #     'question_id' : new_question_id_list,
 #     'question' : new_question_list,
 #     'article_id' : new_article_id_list,
 #     'answers' : new_answers_list
 # })

 # Re-wrap the three splits of the loaded data in a fresh DatasetDict; all later
 # preprocessing is applied to this object rather than to `base_datasets`.
 train_datasets = DatasetDict(
     {split_name: base_datasets[split_name] for split_name in ('train', 'validation', 'test')}
 )

 print(train_datasets)

 """
 * Preprocessing
 """
 print('Strat preprocessing...')

 import pythainlp

 def preprocessing_normalization(example) :
     """Run pythainlp's text normalizer over the question and the context.

     The example is updated in place and returned; its answers are not touched.
     """
     for field in ('question', 'context'):
         example[field] = pythainlp.util.normalize(example[field])
     return example

 def lowercase_example(example):
     """Lower-case the question and the context of one example.

     The example is updated in place and returned; its answers are not touched.
     """
     for field in ('question', 'context'):
         example[field] = example[field].lower()
     return example

 def preprocessing_NSC(example) :
     """Replace non-breaking spaces (U+00A0) with plain spaces in one example.

     The question, the context and every answer string are cleaned. Cleaning the
     whole answer list (rather than only its first entry) keeps additional answers
     consistent with the cleaned context and avoids an IndexError when an example
     carries an empty answer list.

     Args:
         example: mapping with 'question', 'context' and 'answers' (whose 'answer'
             entry is a list of strings).

     Returns:
         The same example object, updated in place.
     """
     nbsp = '\xa0'
     example['question'] = example['question'].replace(nbsp, ' ')
     example['context'] = example['context'].replace(nbsp, ' ')

     answers = example['answers']
     answers['answer'] = [text.replace(nbsp, ' ') for text in answers['answer']]

     return example

 def preprocessing_answer(example) : 
     """Wrap the answer text and its begin position in one-element lists.

     The example is updated in place and returned.
     """
     answers = example['answers']
     answers['answer'] = [answers['answer']]
     answers['answer_begin_position'] = [answers['answer_begin_position']]
     return example

 # train_datasets['train'] = train_datasets['train'].map(preprocessing_answer)

 # Drop examples with a missing context, question or answer list, then lower-case,
 # normalize and strip non-breaking spaces (in that order) on every split.
 # NOTE(review): the normalizer may change string lengths, which would shift the stored
 # answer_begin_position values relative to the context - confirm on the data.
 train_datasets = train_datasets.filter(
     lambda ex: ex['context'] is not None
     and ex['question'] is not None
     and ex['answers']['answer'] is not None
 )
 train_datasets = train_datasets.map(lowercase_example)
 train_datasets = train_datasets.map(preprocessing_normalization)
 train_datasets = train_datasets.map(preprocessing_NSC)

 # Tokenization settings shared by the training and validation feature builders.
 max_length = param_max_length
 doc_stride = param_doc_stride
 pad_on_right = tokenizer.padding_side == "right"

 def prepare_train_features(examples):
    """Tokenize a batch of examples and label each feature with answer token positions.

    Reads the module-level `tokenizer`, `pad_on_right`, `max_length` and `doc_stride`.
    A long context is split into overlapping windows, so one example may yield several
    features. Only the first answer of each example is used for labelling.

    Args:
        examples: batch (dict of lists) with "question", "context" and "answers";
            each answers entry provides "answer" and "answer_begin_position" lists.

    Returns:
        The tokenizer output without the overflow/offset mappings, extended with
        "start_positions" and "end_positions" (one token index per feature). Features
        without an answer, or whose window does not contain the answer, get the CLS
        token index for both.
    """
    # Tokenize our examples with truncation and padding, but keep the overflows using a stride. This results
    # in one example possible giving several features when a context is long, each of those features having a
    # context that overlaps a bit the context of the previous feature.
    tokenized_examples = tokenizer(
        examples["question" if pad_on_right else "context"],
        examples["context" if pad_on_right else "question"],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Since one example might give us several features if it has a long context, we need a map from a feature to
    # its corresponding example. This key gives us just that.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # The offset mappings will give us a map from token to character position in the original context. This will
    # help us compute the start_positions and end_positions.
    offset_mapping = tokenized_examples.pop("offset_mapping")

    # Let's label those examples!
    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        # We will label impossible answers with the index of the <s> token.
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        # Grab the sequence corresponding to that example (to know what is the context and what is the question).
        sequence_ids = tokenized_examples.sequence_ids(i)

        # One example can give several spans, this is the index of the example containing this span of text.
        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]
        # If no answers are given, set the cls_index as answer.
        if len(answers["answer_begin_position"]) == 0:
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            # Start/end character index of the answer in the text.
            # NOTE(review): the end is taken one character past start + answer length;
            # presumably this compensates for how answer_begin_position is indexed in
            # this dataset - confirm against the data.
            start_char = answers["answer_begin_position"][0] 
            end_char = start_char + len(answers["answer"][0]) + 1

            # Start token index of the current span in the text.
            token_start_index = 0
            while sequence_ids[token_start_index] != (1 if pad_on_right else 0):
                token_start_index += 1

            # End token index of the current span in the text.
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != (1 if pad_on_right else 0):
                token_end_index -= 1

            # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
                # Note: we could go after the last offset if the answer is the last word (edge case).
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                tokenized_examples["start_positions"].append(token_start_index - 1)
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                tokenized_examples["end_positions"].append(token_end_index + 1)

    return tokenized_examples

 # Build labelled features for every split, replacing the raw columns.
 train_tokenized_datasets = train_datasets.map(
     prepare_train_features,
     batched=True,
     remove_columns=train_datasets["train"].column_names,
 )
 # Keep only features whose start and end labels are both non-zero, i.e. drop the
 # windows that were labelled with token index 0.
 train_tokenized_datasets = train_tokenized_datasets.filter(
     lambda feature: not (feature['start_positions'] == 0 or feature['end_positions'] == 0)
 )

 """
 * Define model
 """
 print('Defind model...')

 from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer

 # Start from the weights saved by the step-1 run (its `trained_model` directory).
 model = AutoModelForQuestionAnswering.from_pretrained(f'{param_notebook_path}{param_step1_model_name}/trained_model')

 # Evaluate, checkpoint and log once per epoch. The first positional argument is the
 # run name, used here as a relative output directory, so checkpoints are placed
 # relative to the current working directory.
 args = TrainingArguments(
    param_training_name,
    evaluation_strategy = "epoch",
    save_strategy = 'epoch',
    learning_rate = param_pretrain_lr,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size,
    num_train_epochs = param_pretrain_epoch,
    weight_decay = param_weight_decay,
    report_to = 'wandb',
    run_name = param_training_name,
    logging_dir=f'{param_notebook_path}{param_training_name}/logs',
    logging_strategy='epoch',
 )

 from transformers import default_data_collator

 data_collator = default_data_collator

 # Train on the labelled 'train' features and evaluate on the 'validation' features.
 trainer = Trainer(
    model,
    args,
    train_dataset=train_tokenized_datasets["train"],
    eval_dataset=train_tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
 )

 """
 * Start Training
 """
 print('Starting training...')
 trainer.train()

 """
 * Save Model
 """
 trainer.save_model(f"{param_notebook_path}{param_training_name}/trained_model")
 trainer.save_state()
 # os.mkdir(f"{param_notebook_path}{param_training_name}/train_steps")
 # os.system(f'mv -r {param_training_name}/ {param_notebook_path}{param_training_name}/pretrain_steps/')
 # `save_state()` writes trainer_state.json into the Trainer's output directory, so read
 # it back from there; building the path from `param_notebook_path` only works when the
 # script happens to be started from that directory.
 with open(os.path.join(args.output_dir, 'trainer_state.json'), 'r') as state_file:
     training_states = json.load(state_file)
 # Keep only the log entries produced by an evaluation pass (they carry 'eval_loss').
 all_training_states = [ _ for _ in training_states['log_history'] if _.get('eval_loss') is not None ]
 # Best epoch = lowest validation loss (the earliest entry wins ties).
 best_state = min(all_training_states, key=lambda k: k['eval_loss'])

 """
 * Evaluation on Last Epoch
 """
 # Answer-decoding limits: number of top start/end candidates considered and the
 # maximum answer length in tokens. NOTE: the calls to postprocess_qa_predictions
 # below do not pass these; that function declares the same values as its defaults.
 n_best_size = 20
 max_answer_length = 30

 def prepare_validation_features(examples):
     """Tokenize a batch for inference, keeping what is needed to decode answers.

     Reads the module-level `tokenizer`, `pad_on_right`, `max_length` and `doc_stride`.
     Long contexts are split into overlapping windows, so one example may yield
     several features.

     Returns:
         The tokenizer output with an added "example_id" list (the question_id of the
         example each feature came from) and with "offset_mapping" entries set to
         None for every token that is not part of the context.
     """
     if pad_on_right:
         first_field, second_field = "question", "context"
         truncation_mode, context_index = "only_second", 1
     else:
         first_field, second_field = "context", "question"
         truncation_mode, context_index = "only_first", 0

     encoded = tokenizer(
         examples[first_field],
         examples[second_field],
         truncation=truncation_mode,
         max_length=max_length,
         stride=doc_stride,
         return_overflowing_tokens=True,
         return_offsets_mapping=True,
         padding="max_length",
     )

     # Maps every produced feature back to the index of its source example.
     feature_to_example = encoded.pop("overflow_to_sample_mapping")

     encoded["example_id"] = []
     for feature_idx, example_idx in enumerate(feature_to_example):
         # Sequence ids tell question tokens and context tokens apart.
         seq_ids = encoded.sequence_ids(feature_idx)
         encoded["example_id"].append(examples["question_id"][example_idx])

         # Blank out offsets of non-context tokens so decoding can skip them easily.
         masked_offsets = []
         for token_idx, span in enumerate(encoded["offset_mapping"][feature_idx]):
             masked_offsets.append(span if seq_ids[token_idx] == context_index else None)
         encoded["offset_mapping"][feature_idx] = masked_offsets

     return encoded

 # Build inference features for the test split and run the trained model on them.
 validation_features = train_datasets["test"].map(
    prepare_validation_features,
    batched=True,
    remove_columns=train_datasets["test"].column_names
 )
 raw_predictions = trainer.predict(validation_features)
 # Re-expose every column of the feature set (same format type, all feature keys).
 validation_features.set_format(type=validation_features.format["type"], columns=list(validation_features.features.keys()))

 import collections

 examples = train_datasets["test"]
 features = validation_features

 # Index from example position to the positions of its features.
 # NOTE(review): postprocess_qa_predictions rebuilds the same index internally and the
 # rest of the visible script does not read these module-level variables.
 example_id_to_index = {k: i for i, k in enumerate(examples["question_id"])}
 features_per_example = collections.defaultdict(list)
 for i, feature in enumerate(features):
    features_per_example[example_id_to_index[feature["example_id"]]].append(i)

 from tqdm.auto import tqdm

 def postprocess_qa_predictions(examples, features, raw_predictions, n_best_size = 20, max_answer_length = 30):
     """Turn raw start/end logits into one predicted answer string per example.

     Reads the module-level `tokenizer` (for the CLS token id).

     Args:
         examples: dataset of examples; iterated row by row ("question_id",
             "context") and indexed by column as `examples["question_id"]`.
         features: tokenized features providing "example_id", "offset_mapping"
             and "input_ids"; indexable by feature position.
         raw_predictions: pair `(all_start_logits, all_end_logits)`, each indexed
             by feature position.
         n_best_size: number of highest-scoring start and end candidates combined
             per feature.
         max_answer_length: maximum answer length in tokens.

     Returns:
         OrderedDict mapping question_id to the predicted text. The text is ""
         when the null (CLS) score is at least as high as the best span score, or
         when no valid span exists.
     """
     all_start_logits, all_end_logits = raw_predictions
     # Build a map example to its corresponding features.
     example_id_to_index = {k: i for i, k in enumerate(examples["question_id"])}
     features_per_example = collections.defaultdict(list)
     for i, feature in enumerate(features):
         features_per_example[example_id_to_index[feature["example_id"]]].append(i)

     # The dictionaries we have to fill.
     predictions = collections.OrderedDict()

     # Logging.
     print(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")

     # Let's loop over all the examples!
     for example_index, example in enumerate(tqdm(examples)):
         # Those are the indices of the features associated to the current example.
         feature_indices = features_per_example[example_index]

         # Despite its name this ends up holding the highest null (CLS) score seen
         # over the example's features; it stays None if the example has no features.
         min_null_score = None
         valid_answers = []

         context = example["context"]
         # Looping through all the features associated to the current example.
         for feature_index in feature_indices:
             # We grab the predictions of the model for this feature.
             start_logits = all_start_logits[feature_index]
             end_logits = all_end_logits[feature_index]
             # Fetch the feature row once instead of once per field.
             feature = features[feature_index]
             # This is what will allow us to map some the positions in our logits to span of texts in the
             # original context.
             offset_mapping = feature["offset_mapping"]

             # Update the null prediction score.
             cls_index = feature["input_ids"].index(tokenizer.cls_token_id)
             feature_null_score = start_logits[cls_index] + end_logits[cls_index]
             if min_null_score is None or min_null_score < feature_null_score:
                 min_null_score = feature_null_score

             # Go through all possibilities for the `n_best_size` greater start and end logits.
             start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
             end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
             for start_index in start_indexes:
                 for end_index in end_indexes:
                     # Don't consider out-of-scope answers, either because the indices are out of bounds or
                     # correspond to part of the input_ids that are not in the context.
                     if (
                         start_index >= len(offset_mapping)
                         or end_index >= len(offset_mapping)
                         or offset_mapping[start_index] is None
                         or offset_mapping[end_index] is None
                     ):
                         continue
                     # Don't consider answers with a length that is either < 0 or > max_answer_length.
                     if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                         continue

                     start_char = offset_mapping[start_index][0]
                     end_char = offset_mapping[end_index][1]
                     valid_answers.append(
                         {
                             "score": start_logits[start_index] + end_logits[end_index],
                             "text": context[start_char: end_char]
                         }
                     )

         if valid_answers:
             # max() returns the first highest-scoring candidate, the same one a stable
             # descending sort would put first.
             best_answer = max(valid_answers, key=lambda x: x["score"])
         else:
             # In the very rare edge case we have not a single non-null prediction, we create a fake
             # prediction to avoid failure.
             best_answer = {"text": "", "score": 0.0}

         # Pick the final answer: the best span, or "" when the null score wins. An example
         # without any feature has no null score; comparing against None would raise a
         # TypeError, so the (empty) fallback answer is used directly in that case.
         if min_null_score is None or best_answer["score"] > min_null_score:
             answer = best_answer["text"]
         else:
             answer = ""
         predictions[example["question_id"]] = answer

     return predictions

 # Decode the last-epoch predictions for the test split.
 final_predictions = postprocess_qa_predictions(
     train_datasets["test"],
     validation_features,
     raw_predictions.predictions,
 )

 thai_metric = load_metric('/data/users/ppuri/thesis/thaiqa_squad_metric/thai_squad_newmm.py')

 test_result = {}

 import re
 # Matches every character outside the kept set (dot, Thai block, digits, ASCII letters,
 # apostrophe, space) plus leading, trailing and doubled apostrophes.
 pattern = re.compile(r"[^.\u0E00-\u0E7F0-9a-zA-Z' ]|^'|'$|''")

 # Post-processed variant of the predictions: matched characters removed, then stripped.
 final_predictions_test_2 = [
     (question_id, pattern.sub('', text).strip())
     for question_id, text in final_predictions.items()
 ]

 # Score the untouched predictions.
 formatted_predictions = [
     {"id": str(question_id), "prediction_text": text}
     for question_id, text in final_predictions.items()
 ]
 references = [
     {
         "id": str(ex["question_id"]),
         "answers": {
             'text': ex['answers']['answer'],
             'answer_start': ex['answers']['answer_begin_position'],
         },
     }
     for ex in train_datasets["test"]
 ]

 e = thai_metric.compute(predictions=formatted_predictions, references=references)

 # Score the post-processed predictions against freshly built references.
 formatted_predictions = [
     {"id": str(question_id), "prediction_text": text}
     for question_id, text in final_predictions_test_2
 ]
 references = [
     {
         "id": str(ex["question_id"]),
         "answers": {
             'text': ex['answers']['answer'],
             'answer_start': ex['answers']['answer_begin_position'],
         },
     }
     for ex in train_datasets["test"]
 ]

 e2 = thai_metric.compute(predictions=formatted_predictions, references=references)

 test_result['last_epoch'] = {'wo_post': e, 'w_post': e2}

 """
 * Evaluate on Best Epoch
 """

 # The Trainer writes its checkpoints below its own output directory as
 # `checkpoint-<global step>`. Resolve the best checkpoint from there instead of from
 # `param_notebook_path`, which only matched when the script was started in that directory.
 best_model_path = os.path.join(os.path.abspath(args.output_dir), f"checkpoint-{best_state['step']}")

 # Reload the best checkpoint and wrap it in a fresh Trainer used only for prediction.
 model = AutoModelForQuestionAnswering.from_pretrained(best_model_path)
 args = TrainingArguments(
    param_training_name,
    evaluation_strategy = "epoch",
    save_strategy = 'epoch',
    learning_rate = param_pretrain_lr,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size,
    num_train_epochs = param_pretrain_epoch,
    weight_decay = param_weight_decay,
    report_to = 'wandb',
    run_name = param_training_name,
 )
 data_collator = default_data_collator
 trainer = Trainer(
    model,
    args,
    train_dataset=train_tokenized_datasets["train"],
    eval_dataset=train_tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
 )

 raw_predictions = trainer.predict(validation_features)
 final_predictions = postprocess_qa_predictions(train_datasets["test"], validation_features, raw_predictions.predictions)
 # Post-processed variant: characters matched by `pattern` removed, then stripped.
 final_predictions_test_2 = [ (_[0], re.sub(pattern, '', _[1]).strip()) for _ in list(final_predictions.items()) ]

 # Score the untouched predictions.
 formatted_predictions = [{"id": str(k), "prediction_text": v} for k, v in final_predictions.items()]
 references = [{"id": str(ex["question_id"]), 
               "answers": {'text': ex['answers']['answer'],
                           'answer_start':ex['answers']['answer_begin_position']}} for ex in train_datasets["test"]]

 e = thai_metric.compute(predictions=formatted_predictions, references=references)

 # Score the post-processed predictions.
 formatted_predictions = [{"id": str(k), "prediction_text": v} for k, v in final_predictions_test_2]
 references = [{"id": str(ex["question_id"]), 
               "answers": {'text': ex['answers']['answer'],
                           'answer_start':ex['answers']['answer_begin_position']}} for ex in train_datasets["test"]]

 e2 = thai_metric.compute(predictions=formatted_predictions, references=references)

 test_result['best_epoch'] = {
    'wo_post' : e,
    'w_post' : e2,
    'best_state_path' : best_model_path,
    'best_state_detail' : best_state
 }

 print(test_result)
 # Store the metrics next to the trained model; the context manager makes sure the
 # file is flushed and closed.
 with open(f'{param_notebook_path}{param_training_name}/test_result.json', 'w') as result_file:
     result_file.write(json.dumps(test_result))

 # Mirror the metrics into the shared result tree. makedirs also creates a missing
 # project-level parent directory, which a plain mkdir would fail on.
 # NOTE(review): as before, an existing run directory is left untouched, so a re-run
 # does not refresh the mirrored copy - confirm this is intended.
 shared_result_dir = f'/data/users/ppuri/thesis/thaiqa-semi/result/{param_wandb_project}/{param_training_name}/'
 if not os.path.exists(shared_result_dir) :
     os.makedirs(shared_result_dir, exist_ok=True)
     with open(f'{shared_result_dir}test_result.json', 'w') as result_file:
         result_file.write(json.dumps(test_result))
	__author__ = "Puri Phakmongkol"
	__author_email__ = "me@puri.in.th"

	"""
	* Thesis
	*
	* Created date : 15/06/2021
	*
	+ o + o
	+ o + +
	o +
	o + + +
	+ o o + o
	-_-_-_-_-_-_-_,------, o
	_-_-_-_-_-_-_-\| /\_/\
	-_-_-_-_-_-_-~\|__( ^ .^) + +
	_-_-_-_-_-_-_-"" ""
	+ o o + o
	+ +
	o o _-_-_-_- NSC QA Full Dataset - WangchanBERTa Step 2
	o +
	+ + o o +
	$ srun -v --gres=gpu:1 --pty python nsc_private-wc-qa-full-step2-v1.py
	"""

	print('----- Starting script -----')

	#@title Param
	param_training_name = "nsc_private-wc-qa-full-step2-v1" #@param {type:"string"}
	param_step1_model_name = "nsc_private-wc-qa-full-step1-v1"
	param_description = "baseline-v1:NSC-private" #@param {type:"string"}
	param_batch_size = 12#@param {type:"integer"}
	param_max_length = 416#@param {type:"integer"}
	param_doc_stride = 128#@param {type:"integer"}
	#@markdown -----
	#@markdown Pretraining Parameters
	param_pretrain_lr = 5e-6#@param {type:"number"}
	param_pretrain_epoch = 25#@param {type:"integer"}
	param_weight_decay = 0.01#@param {type:"number"}
	#@markdown -----
	#@markdown Wandb
	param_wandb_project = "thaiqa-semi-v10" #@param {type:"string"}
	param_tags = ['baseline', 'nsc_span'] #@param {type:"raw"}
	param_wandb_api_key = "xxxxxxxxxxxxx" #@param {type:"string"}
	#@markdown -----
	#@markdown Colab
	param_notebook_path = "/data/users/ppuri/thesis/thaiqa-semi/finetune/semi-v10/" #@param {type:"string"}

	import transformers

	import numpy as np
	from tqdm.auto import tqdm
	import torch

	#datasets
	from datasets import load_dataset

	#transformers
	from transformers import (
	CamembertTokenizerFast,
	TrainingArguments,
	Trainer,
	)

	#thai2transformers
	import thai2transformers
	from thai2transformers.preprocess import process_transformers
	from thai2transformers.metrics import (
	classification_metrics,
	multilabel_classification_metrics,
	)

	from thai2transformers.tokenizers import (
	ThaiRobertaTokenizer,
	ThaiWordsNewmmTokenizer,
	ThaiWordsSyllableTokenizer,
	FakeSefrCutTokenizer,
	SEFR_SPLIT_TOKEN
	)

	import os
	import wandb

	from datasets import load_dataset, load_metric, Dataset, DatasetDict

	import functools
	import random
	random.seed(5555)

	"""
	* Wandb Configuration
	"""
	print('Configuration Wandb...')
	os.environ['WANDB_PROJECT'] = param_wandb_project
	os.environ["WANDB_API_KEY"] = param_wandb_api_key

	wandb.init(project=param_wandb_project,
	name=param_training_name,
	tags=param_tags,
	group='wangchanberta')

	param_config = {
	'batchsize' : param_batch_size,
	'max_length' : param_max_length,
	'doc_stride' : param_doc_stride,
	'learning_rate' : param_pretrain_lr,
	'epoch' : param_pretrain_epoch,
	'weight_decay' : param_weight_decay,
	}
	wandb.config.update(param_config)

	wandb.log({'run_name': param_training_name, 'description': param_description})
	wandb.log({
	'params' : param_config
	})

	batch_size = param_batch_size

	tokenizer = CamembertTokenizerFast.from_pretrained('airesearch/wangchanberta-base-att-spm-uncased', revision='main', model_max_length=416)

	import json

	print('Strat import dataset...')
	base_datasets = DatasetDict.load_from_disk('/data/users/ppuri/thesis/thaiqa-semi/data/content/nsc_private/')
	# all_new_dataset_list = json.loads(open('/data/users/ppuri/thesis/thaiqa-semi/finetune/semi-v10/nsc-t5_l-question-gen-v1/gen_question-rev1.json', 'r').read())
	# print(len(all_new_dataset_list))

	# selected_id_list = json.loads(open('/data/users/ppuri/thesis/thaiqa-semi/finetune/semi-v10/nsc-baseline-v1/f1_100_id.csv', 'r').read())
	# print(len(selected_id_list))

	# new_dataset_list = [ _ for _ in all_new_dataset_list if _['question_id'] in selected_id_list ]

	# print(len(new_dataset_list))

	# new_context_list = [ _['context'] for _ in new_dataset_list ]
	# new_question_id_list = [ _['question_id'] for _ in new_dataset_list ]
	# new_question_list = [ _['question'] for _ in new_dataset_list ]
	# new_article_id_list = [ _['article_id'] for _ in new_dataset_list ]
	# new_answers_list = [ _['answers'] for _ in new_dataset_list ]

	# document_dict = Dataset.from_dict({
	# 'context' : new_context_list,
	# 'question_id' : new_question_id_list,
	# 'question' : new_question_list,
	# 'article_id' : new_article_id_list,
	# 'answers' : new_answers_list
	# })

	train_datasets = DatasetDict({
	'train' : base_datasets['train'],
	'validation' : base_datasets['validation'],
	'test' : base_datasets['test'],
	})

	print(train_datasets)

	"""
	* Preprocessing
	"""
	print('Strat preprocessing...')

	import pythainlp

	def preprocessing_normalization(example) :
	    """Run pythainlp's text normalizer over the question and the context.

	    The example is updated in place and returned; its answers are not touched.
	    """
	    example['question'] = pythainlp.util.normalize(example['question'])
	    example['context'] = pythainlp.util.normalize(example['context'])
	    return example

	def lowercase_example(example):
	    """Lower-case the question and the context of one example.

	    The example is updated in place and returned; its answers are not touched.
	    """
	    example['question'] = example['question'].lower()
	    example['context'] = example['context'].lower()
	    return example

	def preprocessing_NSC(example) :
	    """Replace non-breaking spaces (U+00A0) with plain spaces in one example.

	    The question, the context and every answer string are cleaned; cleaning the
	    whole answer list also avoids an IndexError on an empty answer list.
	    The example is updated in place and returned.
	    """
	    example['question'] = example['question'].replace('\xa0', ' ')
	    example['context'] = example['context'].replace('\xa0', ' ')
	    answers = example['answers']
	    answers['answer'] = [text.replace('\xa0', ' ') for text in answers['answer']]
	    return example

	def preprocessing_answer(example) :
	    """Wrap the answer text and its begin position in one-element lists.

	    The example is updated in place and returned.
	    """
	    answers = example['answers']
	    answers['answer'] = [answers['answer']]
	    answers['answer_begin_position'] = [answers['answer_begin_position']]
	    return example

	# train_datasets['train'] = train_datasets['train'].map(preprocessing_answer)
	train_datasets = train_datasets.filter(lambda _: _['context'] != None and _['question'] != None and _['answers']['answer'] != None)
	train_datasets = train_datasets.map(lowercase_example)
	train_datasets = train_datasets.map(preprocessing_normalization)
	train_datasets = train_datasets.map(preprocessing_NSC)

	max_length = param_max_length
	doc_stride = param_doc_stride
	pad_on_right = tokenizer.padding_side == "right"

	def prepare_train_features(examples):
	    """Tokenize a batch of examples and label each feature with answer token positions.

	    Reads the module-level `tokenizer`, `pad_on_right`, `max_length` and `doc_stride`.
	    A long context is split into overlapping windows, so one example may yield several
	    features. Only the first answer of each example is used. Features without an
	    answer, or whose window does not contain the answer, are labelled with the CLS
	    token index for both start and end.
	    """
	    # Tokenize with truncation and padding, keeping the overflows using a stride.
	    tokenized_examples = tokenizer(
	        examples["question" if pad_on_right else "context"],
	        examples["context" if pad_on_right else "question"],
	        truncation="only_second" if pad_on_right else "only_first",
	        max_length=max_length,
	        stride=doc_stride,
	        return_overflowing_tokens=True,
	        return_offsets_mapping=True,
	        padding="max_length",
	    )

	    # Map from a feature to its corresponding example.
	    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
	    # Map from token to character position in the original context.
	    offset_mapping = tokenized_examples.pop("offset_mapping")

	    # Let's label those examples!
	    tokenized_examples["start_positions"] = []
	    tokenized_examples["end_positions"] = []

	    for i, offsets in enumerate(offset_mapping):
	        # We will label impossible answers with the index of the <s> token.
	        input_ids = tokenized_examples["input_ids"][i]
	        cls_index = input_ids.index(tokenizer.cls_token_id)

	        # Sequence ids tell which tokens belong to the context and which to the question.
	        sequence_ids = tokenized_examples.sequence_ids(i)

	        # Index of the example containing this span of text.
	        sample_index = sample_mapping[i]
	        answers = examples["answers"][sample_index]
	        # If no answers are given, set the cls_index as answer.
	        if len(answers["answer_begin_position"]) == 0:
	            tokenized_examples["start_positions"].append(cls_index)
	            tokenized_examples["end_positions"].append(cls_index)
	        else:
	            # Start/end character index of the answer in the text.
	            # NOTE(review): the end is taken one character past start + answer length;
	            # presumably this matches how answer_begin_position is indexed - confirm.
	            start_char = answers["answer_begin_position"][0]
	            end_char = start_char + len(answers["answer"][0]) + 1

	            # Start token index of the current span in the text.
	            token_start_index = 0
	            while sequence_ids[token_start_index] != (1 if pad_on_right else 0):
	                token_start_index += 1

	            # End token index of the current span in the text.
	            token_end_index = len(input_ids) - 1
	            while sequence_ids[token_end_index] != (1 if pad_on_right else 0):
	                token_end_index -= 1

	            # Detect if the answer is out of the span (then label with the CLS index).
	            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
	                tokenized_examples["start_positions"].append(cls_index)
	                tokenized_examples["end_positions"].append(cls_index)
	            else:
	                # Otherwise move both indices to the two ends of the answer.
	                # Note: we could go after the last offset if the answer is the last word (edge case).
	                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
	                    token_start_index += 1
	                tokenized_examples["start_positions"].append(token_start_index - 1)
	                while offsets[token_end_index][1] >= end_char:
	                    token_end_index -= 1
	                tokenized_examples["end_positions"].append(token_end_index + 1)

	    return tokenized_examples

	# Run the training feature builder over every split in batches; the columns
	# of the original "train" split are removed from the result.
	train_tokenized_datasets = train_datasets.map(
		prepare_train_features,
		batched=True,
		remove_columns=train_datasets["train"].column_names,
	)
	# Drop every feature whose start or end label is 0
	# (presumably the <s>/CLS "answer not in this window" label — confirm).
	train_tokenized_datasets = train_tokenized_datasets.filter(
		lambda feature: not (feature['start_positions'] == 0 or feature['end_positions'] == 0)
	)

	"""
	* Define model
	"""
	print('Defind model...')

	from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer
	from transformers import default_data_collator

	# Start from the weights written by the step-1 run.
	model = AutoModelForQuestionAnswering.from_pretrained(
		f'{param_notebook_path}{param_step1_model_name}/trained_model'
	)

	# Evaluation, checkpointing and logging all happen once per epoch;
	# metrics are reported to wandb under the training name.
	args = TrainingArguments(
		output_dir=param_training_name,
		run_name=param_training_name,
		report_to='wandb',
		evaluation_strategy="epoch",
		save_strategy='epoch',
		logging_strategy='epoch',
		logging_dir=f'{param_notebook_path}{param_training_name}/logs',
		learning_rate=param_pretrain_lr,
		weight_decay=param_weight_decay,
		num_train_epochs=param_pretrain_epoch,
		per_device_train_batch_size=batch_size,
		per_device_eval_batch_size=batch_size,
	)

	data_collator = default_data_collator

	trainer = Trainer(
		model=model,
		args=args,
		data_collator=data_collator,
		tokenizer=tokenizer,
		train_dataset=train_tokenized_datasets["train"],
		eval_dataset=train_tokenized_datasets["validation"],
	)

	"""
	* Start Training
	"""
	print('Starting training...')
	trainer.train()

	"""
	* Save Model
	"""
	trainer.save_model(f"{param_notebook_path}{param_training_name}/trained_model")
	trainer.save_state()
	# NOTE(review): save_state() is called without a path, yet the state file is
	# read back from under param_notebook_path. Presumably the script is launched
	# with param_notebook_path as the working directory — confirm.
	# Read the state through a context manager so the handle is always closed.
	with open(f"{param_notebook_path}{param_training_name}/trainer_state.json", 'r') as state_file:
		training_states = json.loads(state_file.read())
	# Only log entries that carry an eval_loss are candidates for "best epoch".
	all_training_states = [
		entry for entry in training_states['log_history']
		if entry.get('eval_loss') is not None
	]
	# Lowest eval_loss wins; on ties the earliest entry is kept.
	best_state = min(all_training_states, key=lambda k: k['eval_loss'])

	"""
	* Evaluation on Last Epoch
	"""
	# Module-level post-processing limits. The postprocess_qa_predictions calls
	# visible in this file do not pass these names; they use the function's own
	# defaults, which carry the same values (20 and 30).
	n_best_size, max_answer_length = 20, 30

	def prepare_validation_features(examples):
		"""Tokenize a batch of examples into fixed-length evaluation features.

		A long context is split into several overlapping windows (stride
		``doc_stride``, length ``max_length``), so one example may yield several
		features.

		Args:
			examples: batch exposing the "question", "context" and "question_id"
				columns.

		Returns:
			The tokenizer output with an added "example_id" entry (the
			question_id each feature came from) and with every
			"offset_mapping" position outside the context replaced by None.
		"""
		# Which column goes first depends on the padding side of the tokenizer.
		if pad_on_right:
			first_column, second_column = "question", "context"
		else:
			first_column, second_column = "context", "question"

		tokenized_examples = tokenizer(
			examples[first_column],
			examples[second_column],
			truncation="only_second" if pad_on_right else "only_first",
			max_length=max_length,
			stride=doc_stride,
			return_overflowing_tokens=True,
			return_offsets_mapping=True,
			padding="max_length",
		)

		# Feature position -> index of the example it was cut from.
		sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

		# Remember, for each feature, the id of the example that produced it.
		question_ids = examples["question_id"]
		tokenized_examples["example_id"] = [
			question_ids[sample_index] for sample_index in sample_mapping
		]

		# Sequence id that marks context tokens (as opposed to question tokens).
		context_index = 1 if pad_on_right else 0

		# Blank out the offsets of non-context tokens so later code can tell
		# whether a token position belongs to the context.
		all_offsets = tokenized_examples["offset_mapping"]
		for feature_idx, offsets in enumerate(all_offsets):
			sequence_ids = tokenized_examples.sequence_ids(feature_idx)
			all_offsets[feature_idx] = [
				(span if sequence_ids[token_idx] == context_index else None)
				for token_idx, span in enumerate(offsets)
			]

		return tokenized_examples

	import collections
	from tqdm.auto import tqdm

	# The raw test split is the set of examples the predictions are scored on.
	examples = train_datasets["test"]

	# Build evaluation features from the test split, dropping its raw columns.
	validation_features = examples.map(
		prepare_validation_features,
		batched=True,
		remove_columns=examples.column_names,
	)
	raw_predictions = trainer.predict(validation_features)
	# Re-apply the current format type with every column selected.
	validation_features.set_format(
		type=validation_features.format["type"],
		columns=list(validation_features.features),
	)

	features = validation_features

	# Example position -> positions of the features cut from that example.
	# NOTE(review): these module-level maps look unused in the visible remainder
	# of the script (postprocess_qa_predictions builds its own) — confirm.
	example_id_to_index = {
		question_id: position
		for position, question_id in enumerate(examples["question_id"])
	}
	features_per_example = collections.defaultdict(list)
	for i, feature in enumerate(features):
		features_per_example[example_id_to_index[feature["example_id"]]].append(i)

	def postprocess_qa_predictions(examples, features, raw_predictions, n_best_size = 20, max_answer_length = 30):
		"""Convert raw start/end logits into one answer string per example.

		Args:
			examples: dataset of original examples; must support
				``examples["question_id"]`` and iteration over rows holding
				"question_id" and "context".
			features: tokenized features; each row holds "example_id",
				"input_ids" and "offset_mapping" (None outside the context).
			raw_predictions: pair ``(all_start_logits, all_end_logits)``,
				both indexed by feature position.
			n_best_size: number of top start and top end logits to combine.
			max_answer_length: maximum answer length, counted in tokens.

		Returns:
			collections.OrderedDict mapping question_id to the predicted text.
			The text is "" when the null (CLS) score is at least as high as
			the best span score or when no valid span exists.
		"""
		all_start_logits, all_end_logits = raw_predictions

		# Map every example position to the positions of its features.
		example_id_to_index = {qid: idx for idx, qid in enumerate(examples["question_id"])}
		features_per_example = collections.defaultdict(list)
		for feature_index, feature in enumerate(features):
			features_per_example[example_id_to_index[feature["example_id"]]].append(feature_index)

		predictions = collections.OrderedDict()

		print(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")

		for example_index, example in enumerate(tqdm(examples)):
			context = example["context"]
			min_null_score = None
			valid_answers = []

			for feature_index in features_per_example[example_index]:
				start_logits = all_start_logits[feature_index]
				end_logits = all_end_logits[feature_index]
				# Fetch the feature row once instead of once per column.
				feature = features[feature_index]
				# Maps token positions back to character spans of the context.
				offset_mapping = feature["offset_mapping"]

				# The null score of an example is the SMALLEST CLS score over
				# its windows: a window that merely lacks the answer must not
				# be able to veto a window that contains it.
				cls_index = feature["input_ids"].index(tokenizer.cls_token_id)
				feature_null_score = start_logits[cls_index] + end_logits[cls_index]
				if min_null_score is None or feature_null_score < min_null_score:
					min_null_score = feature_null_score

				# Combine the n_best_size highest start logits with the
				# n_best_size highest end logits.
				start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
				end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
				for start_index in start_indexes:
					for end_index in end_indexes:
						# Skip positions that are out of range or fall outside
						# the context (offset set to None).
						if (
							start_index >= len(offset_mapping)
							or end_index >= len(offset_mapping)
							or offset_mapping[start_index] is None
							or offset_mapping[end_index] is None
						):
							continue
						# Skip reversed spans and spans longer than allowed.
						if end_index < start_index or end_index - start_index + 1 > max_answer_length:
							continue

						start_char = offset_mapping[start_index][0]
						end_char = offset_mapping[end_index][1]
						valid_answers.append(
							{
								"score": start_logits[start_index] + end_logits[end_index],
								"text": context[start_char: end_char],
							}
						)

			if valid_answers:
				# On ties the first highest-scoring candidate is kept.
				best_answer = max(valid_answers, key=lambda candidate: candidate["score"])
			else:
				# No usable span at all: fall back to an empty prediction.
				best_answer = {"text": "", "score": 0.0}

			# Keep the span only if it beats the null score. An example with no
			# features has no null score; use the fallback instead of failing
			# on a comparison with None.
			if min_null_score is None or best_answer["score"] > min_null_score:
				answer = best_answer["text"]
			else:
				answer = ""
			predictions[example["question_id"]] = answer

		return predictions

	final_predictions = postprocess_qa_predictions(train_datasets["test"], validation_features, raw_predictions.predictions)

	thai_metric = load_metric('/data/users/ppuri/thesis/thaiqa_squad_metric/thai_squad_newmm.py')

	test_result = {}

	import re
	# Cleanup applied to the "w_post" predictions. It removes every character
	# that is not Thai, ASCII alphanumeric, '.', "'" or space, a leading or
	# trailing "'", and any "''" pair. The alternatives must be joined with a
	# bare "|": in Python's re syntax "\|" is a literal pipe, so the escaped
	# form never matched and the cleanup did nothing.
	pattern = re.compile(r"[^.\u0E00-\u0E7F0-9a-zA-Z' ]|^'|'$|''")

	final_predictions_test_2 = [
		(question_id, pattern.sub('', text).strip())
		for question_id, text in final_predictions.items()
	]

	# The gold answers are the same for both scorings, so build them once.
	references = [{"id": str(ex["question_id"]),
		"answers": {'text': ex['answers']['answer'],
		'answer_start': ex['answers']['answer_begin_position']}} for ex in train_datasets["test"]]

	# Score the raw predictions.
	formatted_predictions = [{"id": str(k), "prediction_text": v} for k, v in final_predictions.items()]
	e = thai_metric.compute(predictions=formatted_predictions, references=references)

	# Score the cleaned-up predictions.
	formatted_predictions = [{"id": str(k), "prediction_text": v} for k, v in final_predictions_test_2]
	e2 = thai_metric.compute(predictions=formatted_predictions, references=references)

	test_result['last_epoch'] = {
		'wo_post' : e,
		'w_post' : e2
	}

	"""
	* Evaluate on Best Epoch
	"""

	# Checkpoint directory written at the step that had the lowest eval_loss.
	# A single f-string is used so a literal "%" in the path can never be
	# misread as a format directive.
	best_model_path = f"{param_notebook_path}{param_training_name}/checkpoint-{best_state['step']}/"

	# Rebuild the trainer around the best checkpoint; it is only used for
	# prediction below.
	model = AutoModelForQuestionAnswering.from_pretrained(best_model_path)
	args = TrainingArguments(
		output_dir=param_training_name,
		evaluation_strategy="epoch",
		save_strategy='epoch',
		learning_rate=param_pretrain_lr,
		per_device_train_batch_size=batch_size,
		per_device_eval_batch_size=batch_size,
		num_train_epochs=param_pretrain_epoch,
		weight_decay=param_weight_decay,
		report_to='wandb',
		run_name=param_training_name,
	)
	data_collator = default_data_collator
	trainer = Trainer(
		model=model,
		args=args,
		train_dataset=train_tokenized_datasets["train"],
		eval_dataset=train_tokenized_datasets["validation"],
		data_collator=data_collator,
		tokenizer=tokenizer,
	)

	raw_predictions = trainer.predict(validation_features)
	final_predictions = postprocess_qa_predictions(train_datasets["test"], validation_features, raw_predictions.predictions)
	# Same text cleanup as for the last-epoch evaluation.
	final_predictions_test_2 = [
		(question_id, re.sub(pattern, '', text).strip())
		for question_id, text in final_predictions.items()
	]

	# The gold answers are the same for both scorings, so build them once.
	references = [{"id": str(ex["question_id"]),
		"answers": {'text': ex['answers']['answer'],
		'answer_start': ex['answers']['answer_begin_position']}} for ex in train_datasets["test"]]

	# Score the raw predictions.
	formatted_predictions = [{"id": str(k), "prediction_text": v} for k, v in final_predictions.items()]
	e = thai_metric.compute(predictions=formatted_predictions, references=references)

	# Score the cleaned-up predictions.
	formatted_predictions = [{"id": str(k), "prediction_text": v} for k, v in final_predictions_test_2]
	e2 = thai_metric.compute(predictions=formatted_predictions, references=references)

	test_result['best_epoch'] = {
		'wo_post' : e,
		'w_post' : e2,
		'best_state_path' : best_model_path,
		'best_state_detail' : best_state
	}

	print(test_result)
	# Serialize once and write through context managers so both handles are
	# flushed and closed.
	test_result_json = json.dumps(test_result)
	with open(f'{param_notebook_path}{param_training_name}/test_result.json', 'w') as result_file:
		result_file.write(test_result_json)

	# Second copy under the shared result tree. makedirs also creates the
	# missing project-level parent and tolerates an existing directory.
	result_dir = f'/data/users/ppuri/thesis/thaiqa-semi/result/{param_wandb_project}/{param_training_name}'
	os.makedirs(result_dir, exist_ok=True)
	with open(f'{result_dir}/test_result.json', 'w') as result_file:
		result_file.write(test_result_json)
No results found