Skip to content

Instantly share code, notes, and snippets.

@thiborose
Last active June 16, 2021 13:06
Show Gist options
  • Select an option

  • Save thiborose/2530336055e711ff451162663d585bc2 to your computer and use it in GitHub Desktop.

Select an option

Save thiborose/2530336055e711ff451162663d585bc2 to your computer and use it in GitHub Desktop.
A customized spaCy pipeline for characterizing texts, including an improved sentence segmenter.
import spacy
from spacy import Language
from spacy.tokens import Doc
import pysbd # improved sentencizer
# Setting up spacy
@Language.component("pysbd_sentence_boundaries")
def pysbd_sentence_boundaries(doc):
    """Set sentence boundaries on *doc* using the pysbd segmenter.

    Runs pysbd over the raw text, maps each sentence's character span back
    onto spaCy tokens, and marks the first token of every sentence with
    ``is_sent_start = True`` (all other tokens get ``False``).

    Parameters
    ----------
    doc : spacy.tokens.Doc
        The document being processed by the pipeline.

    Returns
    -------
    spacy.tokens.Doc
        The same doc, with sentence boundaries set.
    """
    seg = pysbd.Segmenter(language="fr", clean=False, char_span=True)  # Specify language here
    sents_char_spans = seg.segment(doc.text)
    # alignment_mode="contract" snaps each char span to whole tokens;
    # it returns None when no token lies fully inside the span.
    char_spans = [
        doc.char_span(sent_span.start, sent_span.end, alignment_mode="contract")
        for sent_span in sents_char_spans
    ]
    # Character offsets of each sentence's first token; a set gives O(1)
    # membership tests instead of the O(n) scan a list would cost per token.
    start_char_offsets = {span[0].idx for span in char_spans if span is not None}
    for token in doc:
        token.is_sent_start = token.idx in start_char_offsets
    return doc
# Build the pipeline: load the pretrained French model, then insert the
# pysbd-based sentence segmenter ahead of the dependency parser so the
# parser works with the corrected sentence boundaries.
nlp = spacy.load("fr_core_news_md") # Select base model here
nlp.add_pipe("pysbd_sentence_boundaries", before="parser")
def get_adj_incidence(doc):
    """Proportion of tokens tagged ADJ; 0.0 for an empty doc."""
    return sum(tok.pos_ == "ADJ" for tok in doc) / len(doc) if len(doc) else 0.0

def get_verb_incidence(doc):
    """Proportion of tokens tagged VERB; 0.0 for an empty doc."""
    return sum(tok.pos_ == "VERB" for tok in doc) / len(doc) if len(doc) else 0.0

def get_noun_incidence(doc):
    """Proportion of tokens tagged NOUN; 0.0 for an empty doc."""
    return sum(tok.pos_ == "NOUN" for tok in doc) / len(doc) if len(doc) else 0.0

def get_avg_sent_len(doc):
    """Mean sentence length in tokens; 0.0 when the doc has no sentences."""
    sents = [len(list(sent)) for sent in doc.sents]
    return sum(sents) / len(sents) if sents else 0.0

def get_vocab(doc):
    """Set of distinct token surface forms in the doc."""
    return {tok.text for tok in doc}
# Register the metrics as computed Doc attributes (doc._.adj_incidence, ...).
# force=True makes registration idempotent, so re-running this script (e.g.
# in a notebook) no longer raises "Extension 'X' already exists on Doc".
Doc.set_extension("adj_incidence", getter=get_adj_incidence, force=True)
Doc.set_extension("verb_incidence", getter=get_verb_incidence, force=True)
Doc.set_extension("noun_incidence", getter=get_noun_incidence, force=True)
Doc.set_extension("avg_sent_len", getter=get_avg_sent_len, force=True)
Doc.set_extension("vocab", getter=get_vocab, force=True)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment