pip install spacy
python -m spacy download ja_core_news_sm
Last active
May 22, 2023 06:22
-
-
Save bhanpuramufaddal/ab7c974c2f7b3d96d8244fda779135fc to your computer and use it in GitHub Desktop.
Split Text into Sentences
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from langchain.text_splitter import CharacterTextSplitter | |
| import tiktoken | |
| import spacy | |
| from langchain.text_splitter import SpacyTextSplitter | |
| import sys | |
# Shared BPE tokenizer; 'cl100k_base' is the tiktoken encoding name.
tokenizer = tiktoken.get_encoding('cl100k_base')


def tiktoken_len(text):
    """Return the number of cl100k_base tokens in *text*.

    disallowed_special=() means no special tokens are rejected, so any
    input string can be measured.
    """
    return len(tokenizer.encode(text, disallowed_special=()))
def utf8len(s):
    """Return the length of *s* in bytes after UTF-8 encoding.

    Differs from len(s) for any non-ASCII text (e.g. Japanese characters
    are typically 3 bytes each).
    """
    encoded = s.encode('utf-8')
    return len(encoded)
# Coarse first-pass splitter: breaks text on newlines into chunks whose
# size is measured in UTF-8 *bytes* (via utf8len), not characters.
# NOTE(review): a comment further down says spaCy handles at most ~4000
# bytes per string, yet chunk_size here is 20000 — confirm the intended
# limit.
primary_splitter = CharacterTextSplitter(
    separator = "\n",
    chunk_size = 20000,    # max chunk size, in UTF-8 bytes
    chunk_overlap = 100,   # overlap between consecutive chunks, in bytes
    length_function = utf8len,
)
# Japanese spaCy pipeline. NOTE(review): `nlp` is not referenced anywhere
# in this snippet — SpacyTextSplitter loads its own pipeline from the name
# below — so this load looks redundant here; confirm before removing.
nlp = spacy.load("ja_core_news_sm")

# Second-pass splitter: sentence-aware splitting of each coarse chunk,
# with chunk size measured in cl100k_base tokens (via tiktoken_len).
spacy_splitter = SpacyTextSplitter(
    pipeline = "ja_core_news_sm",
    chunk_size = 1000,     # max chunk size, in tiktoken tokens
    chunk_overlap = 100,   # overlap between consecutive chunks, in tokens
    length_function = tiktoken_len,
)
def chunk_by_spacy(text):
    """Split *text* into token-bounded chunks and return a flat list.

    Two passes:
      1. a coarse byte-based newline split (the original comment notes
         spaCy's tokenizer can only handle a few thousand bytes per string),
      2. a spaCy sentence-aware split of each coarse chunk, bounded by
         tiktoken token count.
    """
    from itertools import chain

    coarse = primary_splitter.split_text(text)
    # NOTE(review): writing a private attribute of the splitter; kept from
    # the original, which used it to force a newline separator.
    spacy_splitter._separator = '\n'
    # Flatten with chain.from_iterable instead of sum(lists, []) — the
    # latter re-copies the accumulator and is quadratic in chunk count.
    return list(chain.from_iterable(
        spacy_splitter.split_text(chunk) for chunk in coarse
    ))
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from langchain.text_splitter import CharacterTextSplitter | |
| import tiktoken | |
| import spacy | |
| from langchain.text_splitter import SpacyTextSplitter | |
| import sys | |
# BPE tokenizer shared by the length function below.
tokenizer = tiktoken.get_encoding('cl100k_base')


def tiktoken_len(text):
    """Count the cl100k_base tokens in *text*.

    Passing disallowed_special=() disables the special-token check, so
    arbitrary input strings are accepted.
    """
    token_ids = tokenizer.encode(text, disallowed_special=())
    return len(token_ids)
def utf8len(s):
    """Return the UTF-8 encoded byte length of string *s*."""
    return len(bytes(s, 'utf-8'))
# Coarse first-pass splitter: newline-separated chunks sized in UTF-8
# *bytes* (via utf8len), used to keep each piece small enough for spaCy.
primary_splitter = CharacterTextSplitter(
    separator = "\n",
    chunk_size = 20000,    # max chunk size, in UTF-8 bytes
    chunk_overlap = 100,   # overlap between consecutive chunks, in bytes
    length_function = utf8len,
)
# Japanese spaCy pipeline used for sentence segmentation below.
nlp = spacy.load("ja_core_news_sm")


def chunk_by_spacy(text):
    """Split *text* into spaCy sentence spans and return a flat list.

    First splits on newlines into byte-bounded coarse chunks (so each
    string stays small enough for spaCy's tokenizer), then runs the
    Japanese pipeline on each chunk and collects its sentences.

    Returns spaCy Span objects, not strings — call str() on each element
    to get the sentence text.
    """
    from itertools import chain

    coarse_chunks = primary_splitter.split_text(text)
    # chain.from_iterable flattens the per-chunk sentence iterables in one
    # linear pass, avoiding the quadratic sum(list_of_lists, []) idiom.
    return list(chain.from_iterable(
        nlp(chunk).sents for chunk in coarse_chunks
    ))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment