@joydeb28
Last active May 6, 2020 17:33
Revisions

  1. joydeb28 revised this gist May 6, 2020. 1 changed file, bert_intent_prediction_bert_layer.py, with 28 additions and 22 deletions. The revision renames create_single_input and create_input_array to get_input_data and get_input_array, and splits the one-line padding expressions into named intermediates. The file after this revision:

     # Dependencies used by the gist: bert-for-tf2, tensorflow_hub, numpy, tqdm.
     import bert
     import numpy as np
     import tensorflow_hub as hub
     from tqdm import tqdm

     class BertModel(object):

         def __init__(self):
             self.max_len = 128
             bert_path = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1"
             FullTokenizer = bert.bert_tokenization.FullTokenizer
             self.bert_module = hub.KerasLayer(bert_path, trainable=True)
             # Read the vocab file and lowercasing flag out of the hub module
             # so the tokenizer matches the pretrained checkpoint.
             self.vocab_file = self.bert_module.resolved_object.vocab_file.asset_path.numpy()
             self.do_lower_case = self.bert_module.resolved_object.do_lower_case.numpy()
             self.tokenizer = FullTokenizer(self.vocab_file, self.do_lower_case)

         def get_masks(self, tokens, max_seq_length):
             # 1 for each real token, 0 for each padding position.
             mask_data = [1] * len(tokens) + [0] * (max_seq_length - len(tokens))
             return mask_data

         def get_segments(self, tokens, max_seq_length):
             '''Segments: 0 for the first sequence, 1 for the second.'''
             segments = []
             segment_id = 0
             for token in tokens:
                 segments.append(segment_id)
                 if token == "[SEP]":
                     segment_id = 1
             # Remaining positions are padded with 0.
             remaining_segment = [0] * (max_seq_length - len(tokens))
             segment_data = segments + remaining_segment
             return segment_data

         def get_ids(self, tokens, tokenizer, max_seq_length):
             '''Token ids from the tokenizer vocab, zero-padded to max_seq_length.'''
             token_ids = tokenizer.convert_tokens_to_ids(tokens)
             remaining_ids = [0] * (max_seq_length - len(token_ids))
             input_ids = token_ids + remaining_ids
             return input_ids

         def get_input_data(self, sentence, maxlen):
             sent_token = self.tokenizer.tokenize(sentence)
             sent_token = sent_token[:maxlen]
             sent_token = ["[CLS]"] + sent_token + ["[SEP]"]
             ids = self.get_ids(sent_token, self.tokenizer, self.max_len)
             mask = self.get_masks(sent_token, self.max_len)
             segment = self.get_segments(sent_token, self.max_len)
             input_data = [ids, mask, segment]
             return input_data

         def get_input_array(self, sentences):
             input_ids, input_masks, input_segments = [], [], []
             for sentence in tqdm(sentences, position=0, leave=True):
                 # max_len - 2 leaves room for [CLS] and [SEP].
                 ids, masks, segments = self.get_input_data(sentence, self.max_len - 2)
                 input_ids.append(ids)
                 input_masks.append(masks)
                 input_segments.append(segments)
             input_array = [np.asarray(input_ids, dtype=np.int32),
                            np.asarray(input_masks, dtype=np.int32),
                            np.asarray(input_segments, dtype=np.int32)]
             return input_array
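     Not part of the gist, but for context: a minimal sketch of how the arrays produced by get_input_array might feed the loaded BERT layer. Calling version 1 of this TF Hub SavedModel with [ids, mask, segments] and unpacking (pooled_output, sequence_output) is that module's documented convention; the sentences, the softmax head, and num_intents below are illustrative assumptions.

     import tensorflow as tf

     bert_model = BertModel()  # downloads the TF Hub module on first use
     input_word_ids, input_mask, segment_ids = bert_model.get_input_array(
         ["book a flight from delhi to mumbai", "play some jazz"])

     # The SavedModel returns (pooled_output, sequence_output) when called
     # with [ids, mask, segments]; pooled_output is one vector per sentence.
     pooled_output, sequence_output = bert_model.bert_module(
         [input_word_ids, input_mask, segment_ids])

     # Hypothetical intent head; num_intents is an arbitrary example value.
     num_intents = 7
     probs = tf.keras.layers.Dense(num_intents, activation="softmax")(pooled_output)
     print(probs.shape)  # (2, num_intents)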
  2. joydeb28 created this gist May 5, 2020. 1 changed file, bert_intent_prediction_bert_layer.py, with 63 additions. The original file:

     class BertModel(object):

         def __init__(self):
             self.max_len = 128
             bert_path = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1"
             FullTokenizer = bert.bert_tokenization.FullTokenizer
             self.bert_module = hub.KerasLayer(bert_path, trainable=True)
             self.vocab_file = self.bert_module.resolved_object.vocab_file.asset_path.numpy()
             self.do_lower_case = self.bert_module.resolved_object.do_lower_case.numpy()
             self.tokenizer = FullTokenizer(self.vocab_file, self.do_lower_case)

         def get_masks(self, tokens, max_seq_length):
             return [1] * len(tokens) + [0] * (max_seq_length - len(tokens))

         def get_segments(self, tokens, max_seq_length):
             """Segments: 0 for the first sequence, 1 for the second"""
             segments = []
             current_segment_id = 0
             for token in tokens:
                 segments.append(current_segment_id)
                 if token == "[SEP]":
                     current_segment_id = 1
             return segments + [0] * (max_seq_length - len(tokens))

         def get_ids(self, tokens, tokenizer, max_seq_length):
             """Token ids from Tokenizer vocab"""
             token_ids = tokenizer.convert_tokens_to_ids(tokens)
             input_ids = token_ids + [0] * (max_seq_length - len(token_ids))
             return input_ids

         def create_single_input(self, sentence, maxlen):
             stokens = self.tokenizer.tokenize(sentence)
             stokens = stokens[:maxlen]
             stokens = ["[CLS]"] + stokens + ["[SEP]"]
             ids = self.get_ids(stokens, self.tokenizer, self.max_len)
             masks = self.get_masks(stokens, self.max_len)
             segments = self.get_segments(stokens, self.max_len)
             return ids, masks, segments

         def create_input_array(self, sentences):
             input_ids, input_masks, input_segments = [], [], []
             for sentence in tqdm(sentences, position=0, leave=True):
                 ids, masks, segments = self.create_single_input(sentence, self.max_len - 2)
                 input_ids.append(ids)
                 input_masks.append(masks)
                 input_segments.append(segments)
             tensor = [np.asarray(input_ids, dtype=np.int32),
                       np.asarray(input_masks, dtype=np.int32),
                       np.asarray(input_segments, dtype=np.int32)]
             return tensor
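     To make the mask and segment logic concrete, the following standalone check mirrors get_masks and get_segments on a toy token list. It is written inline so it runs without downloading the TF Hub module; the tokens are made up, and only "[SEP]" matters to the logic.

     tokens = ["[CLS]", "play", "jazz", "[SEP]"]
     max_seq_length = 8

     # Mask: 1 per real token, then zero padding.
     print([1] * len(tokens) + [0] * (max_seq_length - len(tokens)))
     # -> [1, 1, 1, 1, 0, 0, 0, 0]

     # Segments: everything up to and including the first [SEP] is segment 0;
     # with a single sentence there is no segment 1, and padding is also 0.
     segments, segment_id = [], 0
     for token in tokens:
         segments.append(segment_id)
         if token == "[SEP]":
             segment_id = 1
     print(segments + [0] * (max_seq_length - len(tokens)))
     # -> [0, 0, 0, 0, 0, 0, 0, 0]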