Last active
August 1, 2025 16:40
-
-
Save syllog1sm/10343947 to your computer and use it in GitHub Desktop.
Revisions
-
syllog1sm revised this gist
Apr 10, 2014 — 1 changed file with 3 additions and 3 deletions. There are no files selected for viewing.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1,8 +1,11 @@ """A simple implementation of a greedy transition-based parser. Released under BSD license.""" from os import path import os import sys from collections import defaultdict import random import time import pickle SHIFT = 0; RIGHT = 1; LEFT = 2; MOVES = (SHIFT, RIGHT, LEFT) @@ -513,7 +516,4 @@ def main(model_dir, train_loc, heldout_in, heldout_gold): if __name__ == '__main__': main(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4]) -
syllog1sm revised this gist
Apr 10, 2014 — no changes. There are no files selected for viewing.
-
syllog1sm revised this gist
Apr 10, 2014 — 1 changed file with 1 addition and 1 deletion. There are no files selected for viewing.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1,4 +1,4 @@ """A simple implementation of a greedy transition-based parser. Released under BSD license.""" from os import path import os from collections import defaultdict -
syllog1sm created this gist
Apr 10, 2014. There are no files selected for viewing.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,519 @@ """A simple implementation of a greedy transition-based parser""" from os import path import os from collections import defaultdict import random SHIFT = 0; RIGHT = 1; LEFT = 2; MOVES = (SHIFT, RIGHT, LEFT) START = ['-START-', '-START2-'] END = ['-END-', '-END2-'] class DefaultList(list): """A list that returns a default value if index out of bounds.""" def __init__(self, default=None): self.default = default list.__init__(self) def __getitem__(self, index): try: return list.__getitem__(self, index) except IndexError: return self.default class Parse(object): def __init__(self, n): self.n = n self.heads = [None] * (n-1) self.labels = [None] * (n-1) self.lefts = [] self.rights = [] for i in range(n+1): self.lefts.append(DefaultList(0)) self.rights.append(DefaultList(0)) def add(self, head, child, label=None): self.heads[child] = head self.labels[child] = label if child < head: self.lefts[head].append(child) else: self.rights[head].append(child) class Parser(object): def __init__(self, load=True): model_dir = os.path.dirname(__file__) self.model = Perceptron(MOVES) if load: self.model.load(path.join(model_dir, 'parser.pickle')) self.tagger = PerceptronTagger(load=load) self.confusion_matrix = defaultdict(lambda: defaultdict(int)) def save(self): self.model.save(path.join(os.path.dirname(__file__), 'parser.pickle')) self.tagger.save() def parse(self, words): n = len(words) i = 2; stack = [1]; parse = Parse(n) tags = self.tagger.tag(words) while stack or (i+1) < n: features = extract_features(words, tags, i, n, stack, parse) scores = self.model.score(features) valid_moves = get_valid_moves(i, n, len(stack)) guess = max(valid_moves, key=lambda 
move: scores[move]) i = transition(guess, i, stack, parse) return tags, parse.heads def train_one(self, itn, words, gold_tags, gold_heads): n = len(words) i = 2; stack = [1]; parse = Parse(n) tags = self.tagger.tag(words) while stack or (i + 1) < n: features = extract_features(words, tags, i, n, stack, parse) scores = self.model.score(features) valid_moves = get_valid_moves(i, n, len(stack)) gold_moves = get_gold_moves(i, n, stack, parse.heads, gold_heads) guess = max(valid_moves, key=lambda move: scores[move]) assert gold_moves best = max(gold_moves, key=lambda move: scores[move]) self.model.update(best, guess, features) i = transition(guess, i, stack, parse) self.confusion_matrix[best][guess] += 1 return len([i for i in range(n-1) if parse.heads[i] == gold_heads[i]]) def transition(move, i, stack, parse): if move == SHIFT: stack.append(i) return i + 1 elif move == RIGHT: parse.add(stack[-2], stack.pop()) return i elif move == LEFT: parse.add(i, stack.pop()) return i assert move in MOVES def get_valid_moves(i, n, stack_depth): moves = [] if (i+1) < n: moves.append(SHIFT) if stack_depth >= 2: moves.append(RIGHT) if stack_depth >= 1: moves.append(LEFT) return moves def get_gold_moves(n0, n, stack, heads, gold): def deps_between(target, others, gold): for word in others: if gold[word] == target or gold[target] == word: return True return False valid = get_valid_moves(n0, n, len(stack)) if not stack or (SHIFT in valid and gold[n0] == stack[-1]): return [SHIFT] if gold[stack[-1]] == n0: return [LEFT] costly = set([m for m in MOVES if m not in valid]) # If the word behind s0 is its gold head, Left is incorrect if len(stack) >= 2 and gold[stack[-1]] == stack[-2]: costly.add(LEFT) # If there are any dependencies between n0 and the stack, # pushing n0 will lose them. if SHIFT not in costly and deps_between(n0, stack, gold): costly.add(SHIFT) # If there are any dependencies between s0 and the buffer, popping # s0 will lose them. 
if deps_between(stack[-1], range(n0+1, n-1), gold): costly.add(LEFT) costly.add(RIGHT) return [m for m in MOVES if m not in costly] def extract_features(words, tags, n0, n, stack, parse): def get_stack_context(depth, stack, data): if depth >= 3: return data[stack[-1]], data[stack[-2]], data[stack[-3]] elif depth >= 2: return data[stack[-1]], data[stack[-2]], '' elif depth == 1: return data[stack[-1]], '', '' else: return '', '', '' def get_buffer_context(i, n, data): if i + 1 >= n: return data[i], '', '' elif i + 2 >= n: return data[i], data[i + 1], '' else: return data[i], data[i + 1], data[i + 2] def get_parse_context(word, deps, data): if word == -1: return 0, '', '' deps = deps[word] valency = len(deps) if not valency: return 0, '', '' elif valency == 1: return 1, data[deps[-1]], '' else: return valency, data[deps[-1]], data[deps[-2]] features = {} # Set up the context pieces --- the word (W) and tag (T) of: # S0-2: Top three words on the stack # N0-2: First three words of the buffer # n0b1, n0b2: Two leftmost children of the first word of the buffer # s0b1, s0b2: Two leftmost children of the top word of the stack # s0f1, s0f2: Two rightmost children of the top word of the stack depth = len(stack) s0 = stack[-1] if depth else -1 Ws0, Ws1, Ws2 = get_stack_context(depth, stack, words) Ts0, Ts1, Ts2 = get_stack_context(depth, stack, tags) Wn0, Wn1, Wn2 = get_buffer_context(n0, n, words) Tn0, Tn1, Tn2 = get_buffer_context(n0, n, tags) Vn0b, Wn0b1, Wn0b2 = get_parse_context(n0, parse.lefts, words) Vn0b, Tn0b1, Tn0b2 = get_parse_context(n0, parse.lefts, tags) Vn0f, Wn0f1, Wn0f2 = get_parse_context(n0, parse.rights, words) _, Tn0f1, Tn0f2 = get_parse_context(n0, parse.rights, tags) Vs0b, Ws0b1, Ws0b2 = get_parse_context(s0, parse.lefts, words) _, Ts0b1, Ts0b2 = get_parse_context(s0, parse.lefts, tags) Vs0f, Ws0f1, Ws0f2 = get_parse_context(s0, parse.rights, words) _, Ts0f1, Ts0f2 = get_parse_context(s0, parse.rights, tags) # Cap numeric features at 5? 
# String-distance Ds0n0 = min((n0 - s0, 5)) if s0 != 0 else 0 features['bias'] = 1 # Add word and tag unigrams for w in (Wn0, Wn1, Wn2, Ws0, Ws1, Ws2, Wn0b1, Wn0b2, Ws0b1, Ws0b2, Ws0f1, Ws0f2): if w: features['w=%s' % w] = 1 for t in (Tn0, Tn1, Tn2, Ts0, Ts1, Ts2, Tn0b1, Tn0b2, Ts0b1, Ts0b2, Ts0f1, Ts0f2): if t: features['t=%s' % t] = 1 # Add word/tag pairs for i, (w, t) in enumerate(((Wn0, Tn0), (Wn1, Tn1), (Wn2, Tn2), (Ws0, Ts0))): if w or t: features['%d w=%s, t=%s' % (i, w, t)] = 1 # Add some bigrams features['s0w=%s, n0w=%s' % (Ws0, Wn0)] = 1 features['wn0tn0-ws0 %s/%s %s' % (Wn0, Tn0, Ws0)] = 1 features['wn0tn0-ts0 %s/%s %s' % (Wn0, Tn0, Ts0)] = 1 features['ws0ts0-wn0 %s/%s %s' % (Ws0, Ts0, Wn0)] = 1 features['ws0-ts0 tn0 %s/%s %s' % (Ws0, Ts0, Tn0)] = 1 features['wt-wt %s/%s %s/%s' % (Ws0, Ts0, Wn0, Tn0)] = 1 features['tt s0=%s n0=%s' % (Ts0, Tn0)] = 1 features['tt n0=%s n1=%s' % (Tn0, Tn1)] = 1 # Add some tag trigrams trigrams = ((Tn0, Tn1, Tn2), (Ts0, Tn0, Tn1), (Ts0, Ts1, Tn0), (Ts0, Ts0f1, Tn0), (Ts0, Ts0f1, Tn0), (Ts0, Tn0, Tn0b1), (Ts0, Ts0b1, Ts0b2), (Ts0, Ts0f1, Ts0f2), (Tn0, Tn0b1, Tn0b2), (Ts0, Ts1, Ts1)) for i, (t1, t2, t3) in enumerate(trigrams): if t1 or t2 or t3: features['ttt-%d %s %s %s' % (i, t1, t2, t3)] = 1 # Add some valency and distance features vw = ((Ws0, Vs0f), (Ws0, Vs0b), (Wn0, Vn0b)) vt = ((Ts0, Vs0f), (Ts0, Vs0b), (Tn0, Vn0b)) d = ((Ws0, Ds0n0), (Wn0, Ds0n0), (Ts0, Ds0n0), (Tn0, Ds0n0), ('t' + Tn0+Ts0, Ds0n0), ('w' + Wn0+Ws0, Ds0n0)) for i, (w_t, v_d) in enumerate(vw + vt + d): if w_t or v_d: features['val/d-%d %s %d' % (i, w_t, v_d)] = 1 return features class Perceptron(object): def __init__(self, classes=None): # Each feature gets its own weight vector, so weights is a dict-of-arrays self.classes = classes self.weights = {} # The accumulated values, for the averaging. These will be keyed by # feature/clas tuples self._totals = defaultdict(int) # The last time the feature was changed, for the averaging. 
Also # keyed by feature/clas tuples # (tstamps is short for timestamps) self._tstamps = defaultdict(int) # Number of instances seen self.i = 0 def predict(self, features): '''Dot-product the features and current weights and return the best class.''' scores = self.score(features) # Do a secondary alphabetic sort, for stability return max(self.classes, key=lambda clas: (scores[clas], clas)) def score(self, features): all_weights = self.weights scores = dict((clas, 0) for clas in self.classes) for feat, value in features.items(): if value == 0: continue if feat not in all_weights: continue weights = all_weights[feat] for clas, weight in weights.items(): scores[clas] += value * weight return scores def update(self, truth, guess, features): def upd_feat(c, f, w, v): param = (f, c) self._totals[param] += (self.i - self._tstamps[param]) * w self._tstamps[param] = self.i self.weights[f][c] = w + v self.i += 1 if truth == guess: return None for f in features: weights = self.weights.setdefault(f, {}) upd_feat(truth, f, weights.get(truth, 0.0), 1.0) upd_feat(guess, f, weights.get(guess, 0.0), -1.0) def average_weights(self): for feat, weights in self.weights.items(): new_feat_weights = {} for clas, weight in weights.items(): param = (feat, clas) total = self._totals[param] total += (self.i - self._tstamps[param]) * weight averaged = round(total / float(self.i), 3) if averaged: new_feat_weights[clas] = averaged self.weights[feat] = new_feat_weights def save(self, path): print "Saving model to %s" % path pickle.dump(self.weights, open(path, 'w')) def load(self, path): self.weights = pickle.load(open(path)) class PerceptronTagger(object): '''Greedy Averaged Perceptron tagger''' model_loc = os.path.join(os.path.dirname(__file__), 'tagger.pickle') def __init__(self, classes=None, load=True): self.tagdict = {} if classes: self.classes = classes else: self.classes = set() self.model = Perceptron(self.classes) if load: self.load(PerceptronTagger.model_loc) def tag(self, words, 
tokenize=True): prev, prev2 = START tags = DefaultList('') context = START + [self._normalize(w) for w in words] + END for i, word in enumerate(words): tag = self.tagdict.get(word) if not tag: features = self._get_features(i, word, context, prev, prev2) tag = self.model.predict(features) tags.append(tag) prev2 = prev; prev = tag return tags def start_training(self, sentences): self._make_tagdict(sentences) self.model = Perceptron(self.classes) def train(self, sentences, save_loc=None, nr_iter=5): '''Train a model from sentences, and save it at save_loc. nr_iter controls the number of Perceptron training iterations.''' self.start_training(sentences) for iter_ in range(nr_iter): for words, tags in sentences: self.train_one(words, tags) random.shuffle(sentences) self.end_training(save_loc) def save(self): # Pickle as a binary file pickle.dump((self.model.weights, self.tagdict, self.classes), open(PerceptronTagger.model_loc, 'wb'), -1) def train_one(self, words, tags): prev, prev2 = START context = START + [self._normalize(w) for w in words] + END for i, word in enumerate(words): guess = self.tagdict.get(word) if not guess: feats = self._get_features(i, word, context, prev, prev2) guess = self.model.predict(feats) self.model.update(tags[i], guess, feats) prev2 = prev; prev = guess def load(self, loc): w_td_c = pickle.load(open(loc, 'rb')) self.model.weights, self.tagdict, self.classes = w_td_c self.model.classes = self.classes def _normalize(self, word): if '-' in word and word[0] != '-': return '!HYPHEN' elif word.isdigit() and len(word) == 4: return '!YEAR' elif word[0].isdigit(): return '!DIGITS' else: return word.lower() def _get_features(self, i, word, context, prev, prev2): '''Map tokens into a feature representation, implemented as a {hashable: float} dict. 
If the features change, a new model must be trained.''' def add(name, *args): features[' '.join((name,) + tuple(args))] += 1 i += len(START) features = defaultdict(int) # It's useful to have a constant feature, which acts sort of like a prior add('bias') add('i suffix', word[-3:]) add('i pref1', word[0]) add('i-1 tag', prev) add('i-2 tag', prev2) add('i tag+i-2 tag', prev, prev2) add('i word', context[i]) add('i-1 tag+i word', prev, context[i]) add('i-1 word', context[i-1]) add('i-1 suffix', context[i-1][-3:]) add('i-2 word', context[i-2]) add('i+1 word', context[i+1]) add('i+1 suffix', context[i+1][-3:]) add('i+2 word', context[i+2]) return features def _make_tagdict(self, sentences): '''Make a tag dictionary for single-tag words.''' counts = defaultdict(lambda: defaultdict(int)) for sent in sentences: for word, tag in zip(sent[0], sent[1]): counts[word][tag] += 1 self.classes.add(tag) freq_thresh = 20 ambiguity_thresh = 0.97 for word, tag_freqs in counts.items(): tag, mode = max(tag_freqs.items(), key=lambda item: item[1]) n = sum(tag_freqs.values()) # Don't add rare words to the tag dictionary # Only add quite unambiguous words if n >= freq_thresh and (float(mode) / n) >= ambiguity_thresh: self.tagdict[word] = tag def _pc(n, d): return (float(n) / d) * 100 def train(parser, sentences, nr_iter): parser.tagger.start_training(sentences) for itn in range(nr_iter): corr = 0; total = 0 random.shuffle(sentences) for words, gold_tags, gold_parse, gold_label in sentences: corr += parser.train_one(itn, words, gold_tags, gold_parse) if itn < 5: parser.tagger.train_one(words, gold_tags) total += len(words) print itn, '%.3f' % (float(corr) / float(total)) if itn == 4: parser.tagger.model.average_weights() print 'Averaging weights' parser.model.average_weights() def read_pos(loc): for line in open(loc): if not line.strip(): continue words = DefaultList('') tags = DefaultList('') for token in line.split(): if not token: continue word, tag = token.rsplit('/', 1) 
#words.append(normalize(word)) words.append(word) tags.append(tag) pad_tokens(words); pad_tokens(tags) yield words, tags def read_conll(loc): for sent_str in open(loc).read().strip().split('\n\n'): lines = [line.split() for line in sent_str.split('\n')] words = DefaultList(''); tags = DefaultList('') heads = [None]; labels = [None] for i, (word, pos, head, label) in enumerate(lines): words.append(intern(word)) #words.append(intern(normalize(word))) tags.append(intern(pos)) heads.append(int(head) + 1 if head != '-1' else len(lines) + 1) labels.append(label) pad_tokens(words); pad_tokens(tags) yield words, tags, heads, labels def pad_tokens(tokens): tokens.insert(0, '<start>') tokens.append('ROOT') def main(model_dir, train_loc, heldout_in, heldout_gold): if not os.path.exists(model_dir): os.mkdir(model_dir) input_sents = list(read_pos(heldout_in)) parser = Parser(load=False) sentences = list(read_conll(train_loc)) train(parser, sentences, nr_iter=15) parser.save() c = 0 t = 0 gold_sents = list(read_conll(heldout_gold)) t1 = time.time() for (words, tags), (_, _, gold_heads, gold_labels) in zip(input_sents, gold_sents): _, heads = parser.parse(words) for i, w in list(enumerate(words))[1:-1]: if gold_labels[i] in ('P', 'punct'): continue if heads[i] == gold_heads[i]: c += 1 t += 1 t2 = time.time() print 'Parsing took %0.3f ms' % ((t2-t1)*1000.0) print c, t, float(c)/t if __name__ == '__main__': import sys import cProfile import pstats main(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4])