""" Example of a Streamlit app for an interactive Prodigy dataset viewer that also lets you run simple training experiments for NER and text classification. Requires the Prodigy annotation tool to be installed: https://prodi.gy See here for details on Streamlit: https://streamlit.io. """ import streamlit as st from prodigy.components.db import connect from prodigy.models.ner import EntityRecognizer, merge_spans, guess_batch_size from prodigy.models.textcat import TextClassifier from prodigy.util import split_evals import pandas as pd import spacy from spacy import displacy from spacy.util import filter_spans, minibatch import random SPACY_MODEL_NAMES = ["en_core_web_sm"] EXC_FIELDS = ["meta", "priority", "score"] HTML_WRAPPER = """

{}

""" COLOR_ACCEPT = "#93eaa1" COLOR_REJECT = "#ff8f8e" def guess_dataset_type(first_eg): if "image" in first_eg: return "image" if "arc" in first_eg: return "dep" if "options" in first_eg or "label" in first_eg: return "textcat" if "spans" in first_eg: return "ner" return "other" def get_answer_counts(examples): result = {"accept": 0, "reject": 0, "ignore": 0} for eg in examples: answer = eg.get("answer") if answer: result[answer] += 1 return result def format_label(label, answer="accept"): # Hack to use different colors for the label (by adding zero-width space) return f"{label}\u200B" if answer == "reject" else label st.sidebar.title("Prodigy Data Explorer") db = connect() db_sets = db.datasets placeholder = "Select dataset..." dataset = st.sidebar.selectbox(f"Datasets ({len(db_sets)})", [placeholder] + db_sets) if dataset != placeholder: examples = db.get_dataset(dataset) st.header(f"{dataset} ({len(examples)})") if not len(examples): st.markdown("_Empty dataset._") else: counts = get_answer_counts(examples) st.markdown(", ".join(f"**{c}** {a}" for a, c in counts.items())) dataset_types = ["ner", "textcat", "dep", "pos", "image", "other"] guessed_index = dataset_types.index(guess_dataset_type(examples[0])) set_type = st.sidebar.selectbox("Dataset type", dataset_types, guessed_index) fields = list(examples[0].keys()) default_fields = [f for f in fields if f[0] != "_" and f not in EXC_FIELDS] task_fields = st.sidebar.multiselect("Visible fields", fields, default_fields) st.dataframe(pd.DataFrame(examples).filter(task_fields), height=500) if set_type in ["ner", "textcat"]: st.sidebar.header("Viewer options") purpose = "tokenization & training" if set_type == "ner" else "training" spacy_model_title = f"spaCy model for {purpose}" spacy_model = st.sidebar.selectbox(spacy_model_title, SPACY_MODEL_NAMES) st.sidebar.subheader("Training configuration") n_iter = st.sidebar.slider("Number of iterations", 1, 100, 5, 1) dropout = st.sidebar.slider("Dropout rate", 0.0, 1.0, 0.2, 0.05) eval_split_label = "% of examples held back for evaluation" eval_split = st.sidebar.slider(eval_split_label, 0.0, 1.0, 0.2, 0.05) if set_type == "ner": st.subheader("Named entity viewer") nlp = spacy.load(spacy_model) merged_examples = merge_spans(list(examples)) all_labels = set() for eg in merged_examples: for span in eg["spans"]: all_labels.add(span["label"]) colors = {} for label in all_labels: colors[label] = COLOR_ACCEPT colors[format_label(label, "reject")] = COLOR_REJECT ner_example_i = st.selectbox( f"Merged examples ({len(merged_examples)})", range(len(merged_examples)), format_func=lambda i: merged_examples[int(i)]["text"][:400], ) ner_example = merged_examples[int(ner_example_i)] doc = nlp.make_doc(ner_example["text"]) ents = [] for span in ner_example.get("spans", []): label = format_label(span["label"], span["answer"]) ents.append(doc.char_span(span["start"], span["end"], label=label)) doc.ents = filter_spans(ents) html = displacy.render(doc, style="ent", options={"colors": colors}) html = html.replace("\n", " ") # Newlines seem to mess with the rendering st.write(HTML_WRAPPER.format(html), unsafe_allow_html=True) show_ner_example_json = st.checkbox("Show JSON example") if show_ner_example_json: st.json(ner_example) st.subheader("Train a model (experimental)") no_missing = st.checkbox( "Data is gold-standard and contains no missing values", False ) start_blank = st.checkbox("Start with blank NER model", True) if st.button("🚀 Start training"): if start_blank: ner = nlp.create_pipe("ner") if "ner" in nlp.pipe_names: nlp.replace_pipe("ner", ner) else: nlp.add_pipe(ner) ner.begin_training([]) else: ner = nlp.get_pipe("ner") for label in all_labels: ner.add_label(label) random.shuffle(examples) train_examples, evals, eval_split = split_evals( merged_examples, eval_split ) st.success( f"✅ Using **{len(train_examples)}** training examples " f"and **{len(evals)}** evaluation examples with " f"**{len(all_labels)}** label(s)" ) annot_model = EntityRecognizer( nlp, label=all_labels, no_missing=no_missing ) batch_size = guess_batch_size(len(train_examples)) baseline = annot_model.evaluate(evals) st.info( f"ℹ️ **Baseline**\n**{baseline['right']:.0f}** right " f"entities, **{baseline['wrong']:.0f}** wrong entities, " f"**{baseline['unk']:.0f}** unkown entities, " f"**{baseline['ents']:.0f}** total predicted, " f"**{baseline['acc']:.2f}** accuracy" ) progress = st.progress(0) results = [] result_table = st.empty() best_acc = 0.0 for i in range(n_iter): random.shuffle(train_examples) losses = annot_model.batch_train( train_examples, batch_size=batch_size, drop=dropout, beam_width=16, ) stats = annot_model.evaluate(evals) stats = { "Right": stats["right"], "Wrong": stats["wrong"], "Unknown": stats["unk"], "Predicted Ents": stats["ents"], "Loss": losses["ner"], "Accuracy": round(stats["acc"], 3), } best_acc = ( stats["Accuracy"] if stats["Accuracy"] > best_acc else best_acc ) def highlight(v): is_best = v != 0 and v == best_acc return f"background: {'yellow' if is_best else 'white'}" results.append(stats) results_df = pd.DataFrame(results, dtype="float") result_table.dataframe(results_df.style.applymap(highlight)) progress.progress(int((i + 1) / n_iter * 100)) elif set_type == "textcat": st.subheader("Train a model (experimental)") exclusive = st.checkbox("Labels are mututally exclusive", False) if st.button("🚀 Start training"): nlp = spacy.load(spacy_model) examples = list(examples) all_labels = set() for eg in examples: all_labels.update(eg.get("accelt", [])) if "label" in eg: all_labels.add(eg["label"]) textcat = nlp.create_pipe("textcat") for label in all_labels: textcat.add_label(label) textcat.begin_training() nlp.add_pipe(textcat) random.shuffle(examples) train_examples, evals, eval_split = split_evals(examples, eval_split) st.success( f"✅ Using **{len(train_examples)}** training examples " f"and **{len(evals)}** evaluation examples with " f"**{len(all_labels)}** label(s)" ) annot_model = TextClassifier( nlp, all_labels, low_data=len(train_examples) < 1000, exclusive_classes=exclusive, ) progress = st.progress(0) results = [] result_table = st.empty() best_acc = 0.0 for i in range(n_iter): loss = 0.0 random.shuffle(train_examples) for batch in minibatch(train_examples, size=10): batch = list(batch) loss += annot_model.update(batch, revise=False, drop=dropout) with nlp.use_params(annot_model.optimizer.averages): stats = annot_model.evaluate(evals) stats = { "Loss": loss, "F-Score": stats["fscore"], "Accuracy": round(stats["accuracy"], 3), } best_acc = ( stats["Accuracy"] if stats["Accuracy"] > best_acc else best_acc ) def highlight(v): is_best = v != 0 and v == best_acc return f"background: {'yellow' if is_best else 'white'}" results.append(stats) results_df = pd.DataFrame(results, dtype="float").round(3) result_table.dataframe(results_df.style.applymap(highlight)) progress.progress(int((i + 1) / n_iter * 100))