Skip to content

Instantly share code, notes, and snippets.

@bhanpuramufaddal
Last active May 22, 2023 06:22
Show Gist options
  • Select an option

  • Save bhanpuramufaddal/ab7c974c2f7b3d96d8244fda779135fc to your computer and use it in GitHub Desktop.

Select an option

Save bhanpuramufaddal/ab7c974c2f7b3d96d8244fda779135fc to your computer and use it in GitHub Desktop.
Split Text into Sentences
from langchain.text_splitter import CharacterTextSplitter
import tiktoken
import spacy
from langchain.text_splitter import SpacyTextSplitter
import sys
# Shared BPE tokenizer (the encoding used by recent OpenAI chat models).
tokenizer = tiktoken.get_encoding('cl100k_base')
# create the length function for tiktoken
def tiktoken_len(text):
    """Return the number of cl100k_base tokens in *text*."""
    return len(tokenizer.encode(text, disallowed_special=()))
# create a function to find number of bytes in string
def utf8len(s):
    """Return the size of string *s* in bytes when UTF-8 encoded."""
    return len(s.encode('utf-8'))
# Coarse pre-splitter: breaks text on newlines into pieces measured by
# UTF-8 byte length, so downstream tools never see an oversized input.
primary_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=20000,
    chunk_overlap=100,
    length_function=utf8len,
)
# Japanese spaCy pipeline (small model). NOTE(review): this `nlp` object is
# not used by chunk_by_spacy below, which relies on spacy_splitter instead.
nlp = spacy.load("ja_core_news_sm")
# Sentence-level splitter backed by the Japanese spaCy pipeline; chunk
# sizes are measured in cl100k_base tokens rather than characters.
spacy_splitter = SpacyTextSplitter(
    pipeline="ja_core_news_sm",
    chunk_size=1000,
    chunk_overlap=100,
    length_function=tiktoken_len,
)
def chunk_by_spacy(text):
    """Split *text* into sentence-sized chunks.

    The text is first coarsely split on newlines by UTF-8 byte length
    (the spaCy tokenizer can only handle a limited input size), then each
    coarse piece is sentence-split with the token-sized spaCy splitter.

    NOTE(review): the original comment claims a 4000-byte spaCy limit, but
    primary_splitter uses chunk_size=20000 bytes — confirm the intended cap.
    """
    coarse_chunks = primary_splitter.split_text(text)
    spacy_splitter._separator = '\n'  # split primarily on newlines
    # Flatten in a single pass; sum(lists, []) is quadratic in chunk count.
    chunks = [piece
              for chunk in coarse_chunks
              for piece in spacy_splitter.split_text(chunk)]
    # To chunk on an additional delimiter, set
    # spacy_splitter._separator = '。' and run a second split pass
    # over `chunks` before returning.
    return chunks
from langchain.text_splitter import CharacterTextSplitter
import tiktoken
import spacy
from langchain.text_splitter import SpacyTextSplitter
import sys
# BPE tokenizer used to measure chunk sizes in model tokens.
tokenizer = tiktoken.get_encoding('cl100k_base')
# create the length function
def tiktoken_len(text):
    """Count the cl100k_base tokens contained in *text*."""
    encoded = tokenizer.encode(text, disallowed_special=())
    return len(encoded)
def utf8len(s):
    """Byte length of *s* under UTF-8 encoding."""
    encoded = s.encode('utf-8')
    return len(encoded)
# Newline-based pre-splitter sized by UTF-8 byte length, keeping each
# piece small enough for the spaCy pipeline to process.
primary_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=20000,
    chunk_overlap=100,
    length_function=utf8len,
)
# Small Japanese spaCy model used below for sentence segmentation.
nlp = spacy.load("ja_core_news_sm")
def chunk_by_spacy(text):
    """Split *text* into spaCy sentence spans.

    The text is first coarsely chunked by UTF-8 byte length so each piece
    fits the spaCy pipeline, then every coarse chunk is sentence-segmented
    with the Japanese model.

    Returns a flat list of spaCy sentence Span objects — callers needing
    plain text should convert with ``str(span)``.
    """
    primary_chunks = primary_splitter.split_text(text)
    # Flatten in one pass; sum(lists, []) re-copies the accumulator on every
    # step and is quadratic in the number of chunks.
    total_chunks = [sent
                    for chunk in primary_chunks
                    for sent in nlp(chunk).sents]
    return total_chunks
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment