Last active
June 16, 2021 13:06
-
-
Save thiborose/2530336055e711ff451162663d585bc2 to your computer and use it in GitHub Desktop.
A customized spaCy pipeline for characterizing texts, featuring an improved sentencizer.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import spacy | |
| from spacy import Language | |
| from spacy.tokens import Doc | |
| import pysbd # improved sentencizer | |
| # Setting up spacy | |
@Language.component("pysbd_sentence_boundaries")
def pysbd_sentence_boundaries(doc):
    """Set sentence boundaries on ``doc`` from pySBD's segmentation.

    pySBD splits ``doc.text`` into character spans. Each span is mapped back
    onto the doc's tokens, and the first token of every mapped span is flagged
    as a sentence start. All other tokens are explicitly flagged as not
    starting a sentence.

    Args:
        doc: The spaCy ``Doc`` handed over by the pipeline.

    Returns:
        The same ``doc``, with ``is_sent_start`` assigned on every token.
    """
    # Specify language here.
    seg = pysbd.Segmenter(language="fr", clean=False, char_span=True)

    # Map pySBD's character offsets onto token spans. Spans that cannot be
    # aligned come back as None and are skipped below.
    token_spans = (
        doc.char_span(sent.start, sent.end, alignment_mode="contract")
        for sent in seg.segment(doc.text)
    )

    # A set keeps the per-token membership test O(1); the lookup runs once
    # for every token in the doc.
    start_offsets = {span[0].idx for span in token_spans if span is not None}

    for token in doc:
        token.is_sent_start = token.idx in start_offsets
    return doc
# Select base model here.
nlp = spacy.load(name="fr_core_news_md")

# Register the pySBD component ahead of the parser, so the sentence-start
# flags are already set when the parser runs.
nlp.add_pipe(factory_name="pysbd_sentence_boundaries", before="parser")
def _pos_incidence(doc, pos):
    """Return the fraction of tokens in ``doc`` whose ``pos_`` equals ``pos``.

    Returns ``0.0`` for an empty ``doc`` instead of dividing by zero.
    """
    total = len(doc)
    if not total:
        return 0.0
    return sum(1 for tok in doc if tok.pos_ == pos) / total


def get_adj_incidence(doc):
    """Return the proportion of tokens tagged ``ADJ`` (``0.0`` if ``doc`` is empty)."""
    return _pos_incidence(doc, "ADJ")


def get_verb_incidence(doc):
    """Return the proportion of tokens tagged ``VERB`` (``0.0`` if ``doc`` is empty)."""
    return _pos_incidence(doc, "VERB")


def get_noun_incidence(doc):
    """Return the proportion of tokens tagged ``NOUN`` (``0.0`` if ``doc`` is empty)."""
    return _pos_incidence(doc, "NOUN")


def get_avg_sent_len(doc):
    """Return the mean number of tokens per sentence in ``doc``.

    Returns ``0.0`` when ``doc.sents`` yields no sentence.
    """
    # Walk doc.sents once and keep only the lengths.
    sent_lengths = [len(sent) for sent in doc.sents]
    if not sent_lengths:
        return 0.0
    return sum(sent_lengths) / len(sent_lengths)


def get_vocab(doc):
    """Return the set of distinct token texts found in ``doc``."""
    return {tok.text for tok in doc}
# Register each metric getter as a custom Doc extension under its public name.
for _ext_name, _ext_getter in (
    ("adj_incidence", get_adj_incidence),
    ("verb_incidence", get_verb_incidence),
    ("noun_incidence", get_noun_incidence),
    ("avg_sent_len", get_avg_sent_len),
    ("vocab", get_vocab),
):
    Doc.set_extension(_ext_name, getter=_ext_getter)
# Drop the loop variables so the module namespace matches the unrolled form.
del _ext_name, _ext_getter
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment