"""Split Japanese text into token-sized chunks.

Two-stage pipeline: a cheap newline/byte-size pre-split
(CharacterTextSplitter) to keep each piece small enough for spaCy,
followed by a sentence-aware spaCy split measured in tiktoken tokens.
"""
from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import SpacyTextSplitter
import tiktoken
import spacy
import sys  # NOTE(review): unused in this chunk; kept in case other code relies on it

# cl100k_base is the tokenizer used by OpenAI's gpt-3.5/gpt-4 family models.
tokenizer = tiktoken.get_encoding('cl100k_base')


def tiktoken_len(text):
    """Return the number of cl100k_base tokens in *text*.

    ``disallowed_special=()`` allows text that happens to contain
    special-token strings (e.g. "<|endoftext|>") to be encoded as plain
    text instead of raising.
    """
    tokens = tokenizer.encode(
        text,
        disallowed_special=()
    )
    return len(tokens)


def utf8len(s):
    """Return the UTF-8 byte length of *s* (not the character count)."""
    return len(s.encode('utf-8'))


# Stage 1: split on newlines into chunks of at most ~20000 UTF-8 bytes,
# measured by utf8len, so each piece fits through the spaCy pipeline.
primary_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=20000,
    chunk_overlap=100,
    length_function=utf8len,
)

# NOTE(review): `nlp` is never used below — SpacyTextSplitter loads its own
# pipeline from the name. Kept because other modules may import it from here;
# confirm before removing (loading the model twice wastes time and memory).
nlp = spacy.load("ja_core_news_sm")

# Stage 2: sentence-aware split using the Japanese spaCy model, with chunk
# size measured in tiktoken tokens (suitable for LLM context budgeting).
spacy_splitter = SpacyTextSplitter(
    pipeline="ja_core_news_sm",
    chunk_size=1000,
    chunk_overlap=100,
    length_function=tiktoken_len,
)


def chunk_by_spacy(text):
    """Split *text* into token-bounded chunks.

    Args:
        text: The full document string (Japanese text expected, given the
            ja_core_news_sm pipeline).

    Returns:
        A flat list of chunk strings, each at most ~1000 cl100k_base tokens
        (plus overlap), produced by the byte-size pre-split followed by the
        spaCy sentence-aware split.
    """
    # Pre-split by byte size so spaCy never sees an oversized string.
    # NOTE(review): the original comment claimed spaCy handles "a max of
    # 4000 bytes", but primary_splitter allows chunks up to 20000 bytes —
    # confirm which limit actually applies (spaCy's default nlp.max_length
    # is 1,000,000 characters).
    pieces = primary_splitter.split_text(text)

    # HACK: SpacyTextSplitter exposes no public way to set the join
    # separator, so we poke the private attribute, as the original did.
    spacy_splitter._separator = '\n'

    # Flatten in a single pass; sum(list_of_lists, []) is O(n^2) because it
    # re-copies the accumulator on every addition.
    chunks = [
        chunk
        for piece in pieces
        for chunk in spacy_splitter.split_text(piece)
    ]

    # To chunk with an additional delimiter (e.g. the Japanese full stop),
    # repeat the pass:
    #   spacy_splitter._separator = '。'
    #   chunks = [c for ch in chunks for c in spacy_splitter.split_text(ch)]
    return chunks