capableguptadotcom · January 27, 2025 07:05
diff --git a/summarize_and_split.py b/summarize_and_split.py
 Step-by-Step Instructions for Summarizing PDF Chunks for Improved Citation Retrieval and Query Similarity

 Objective:
 Create concise summaries of large PDF text chunks that retain critical information for both human readability and effective similarity matching during query retrieval.

 Optimize Summaries for Query Similarity
 Why: Ensure summaries retain terms/queries users might search for.
 Steps:

 Keyword Extraction:

 Use RAKE, YAKE, or SpaCy’s Noun Chunks to identify key terms (e.g., entities, domain-specific jargon).

 Explicitly include these keywords in summaries.

 Query-Focused Prompts:

 For abstractive models, add instructions to the summarization prompt:

 Example prompt:
 "Summarize this text for a technical audience, emphasizing [topic/keywords]."  
 Embedding Tests:

 Generate embeddings for summaries and original chunks using models like Sentence-BERT or OpenAI Embeddings.

 Compare cosine similarity between original and summary embeddings to ensure critical information is preserved.


 Validate Summary Quality
 Checklist for Each Summary:

 ✅ Retains key entities (names, dates, concepts).

 ✅ Preserves relationships (e.g., causality, comparisons).

 ✅ Matches the semantic meaning of the original chunk (test with embedding similarity).

 ✅ Avoids hallucinations (common in abstractive methods).


 ######################
 Here's a Python code template to preprocess Markdown text chunks for summarization and similarity tasks. The code handles Markdown syntax removal, text cleaning, and sentence splitting:


 import re
 import spacy
 from unidecode import unidecode

 # Load SpaCy model for sentence splitting (run `python -m spacy download en_core_web_sm` first)
 nlp = spacy.load("en_core_web_sm")

 def preprocess_markdown_chunk(text_chunk):
    """Clean one Markdown chunk and split the result into sentences.

    The chunk is passed, in order, through ``strip_markdown``,
    ``remove_urls``, ``normalize_whitespace`` and ``fix_encoding``; the
    resulting text is then handed to ``split_sentences``.

    Returns:
        A dict with two keys: ``"cleaned_text"`` (the cleaned string) and
        ``"sentences"`` (the value returned by ``split_sentences`` for it).
    """
    # Each step takes a string and returns a string, so they chain directly.
    pipeline = (strip_markdown, remove_urls, normalize_whitespace, fix_encoding)

    text = text_chunk
    for step in pipeline:
        text = step(text)

    return {"cleaned_text": text, "sentences": split_sentences(text)}

 def strip_markdown(text):
    """Remove common Markdown formatting from ``text`` and return plain text.

    Handles images, links, fenced code blocks, inline code, ``#`` headers,
    blockquotes, bullet/numbered list markers and ``*``/``_`` emphasis.
    Images and links are reduced to their alt/link text, fenced code blocks
    are dropped entirely, and inline code keeps its contents.

    Args:
        text: Markdown source string.

    Returns:
        The string with the formatting markers removed.
    """
    # Images go before links: an image is a link with a leading "!".
    text = re.sub(r'!\[(.*?)\]\(.*?\)', r'\1', text)
    text = re.sub(r'\[(.*?)\]\(.*?\)', r'\1', text)
    # Fenced blocks go before inline code so that the fence backticks are
    # not picked up by the inline-code pattern.
    text = re.sub(r'```.*?```', '', text, flags=re.DOTALL)
    text = re.sub(r'`([^`]+)`', r'\1', text)
    # Line-leading markers (headers, blockquotes, lists) are stripped BEFORE
    # emphasis. Otherwise the "*" of a bullet such as "* item with *emph*"
    # is consumed as an emphasis opener and a stray "*" is left behind.
    text = re.sub(r'^#+\s+', '', text, flags=re.MULTILINE)
    text = re.sub(r'^>\s+', '', text, flags=re.MULTILINE)
    text = re.sub(r'^[\*\-+]\s+', '', text, flags=re.MULTILINE)
    text = re.sub(r'^\d+\.\s+', '', text, flags=re.MULTILINE)
    # Emphasis. Underscores only count when they are not inside a word, so
    # snake_case identifiers (my_var, fraud_detection_model) are preserved.
    text = re.sub(r'\*{1,2}(.*?)\*{1,2}', r'\1', text)
    text = re.sub(r'(?<!\w)_{1,2}(.*?)_{1,2}(?!\w)', r'\1', text)
    return text

 def remove_urls(text):
    """Delete bare URLs that survived Markdown stripping.

    A URL is anything starting with ``http://``, ``https://`` or ``www.``
    and running up to the next whitespace character. Each match is replaced
    with an empty string; the surrounding whitespace is left as it is.
    """
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub('', text)

 def normalize_whitespace(text):
    """Collapse whitespace runs and tidy the spacing before punctuation.

    Every run of whitespace (spaces, tabs, newlines) becomes a single
    space, a whitespace character directly in front of ``.``, ``,``, ``!``
    or ``?`` is dropped, and the result is trimmed at both ends.
    """
    single_spaced = re.sub(r'\s+', ' ', text)
    tightened = re.sub(r'\s([.,!?])', r'\1', single_spaced)
    return tightened.strip()

 def fix_encoding(text):
    """Return ``text`` after passing it through ``unidecode``.

    NOTE(review): ``unidecode`` is expected to transliterate non-ASCII
    characters to ASCII approximations - confirm this lossy step is wanted
    for every downstream consumer of the cleaned text.
    """
    ascii_text = unidecode(text)
    return ascii_text

 def split_sentences(text):
    """Segment ``text`` into sentences with the module-level spaCy pipeline.

    Returns:
        A list holding the stripped text of each span in ``doc.sents``.
    """
    sentences = []
    for span in nlp(text).sents:
        sentences.append(span.text.strip())
    return sentences

 # Example usage
 if __name__ == "__main__":
    # Local import keeps the demo self-contained.
    import textwrap

    # The literal below is indented to line up with this block. Dedent it so
    # headers, list items and blockquotes start at column 0, which is what
    # the "^"-anchored patterns in strip_markdown need in order to match.
    sample_markdown = textwrap.dedent("""
    # Machine Learning Applications

    **Machine learning** (ML) has revolutionized _many industries_. Here are key applications:

    - Healthcare: Predictive analytics for [patient outcomes](https://example.com)
    - Finance: `fraud_detection.py` systems using neural networks
    - Climate Science: ```python
    print("CO2 emission prediction models")
    ```

    > "ML will transform how we approach complex problems." - John Doe

    ![ML Diagram](image.png) See more at www.mlexamples.com
    """)

    processed = preprocess_markdown_chunk(sample_markdown)

    # Show the fully cleaned text first, then the numbered sentences.
    print("=== Cleaned Text ===")
    print(processed["cleaned_text"])

    print("\n=== Sentences ===")
    for i, sent in enumerate(processed["sentences"], 1):
        print(f"{i}. {sent}")
        
        
        
        
 Here's a solution using LangChain's JSONLoader and RecursiveCharacterTextSplitter, configured for Markdown content. The code splits the `content` field of each JSON object into chunks bounded by token count, while preserving the object's other fields as metadata:
 import json
 from langchain.document_loaders import JSONLoader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 import tiktoken  # For accurate token counting

 # 1. Configure token counting function
 def count_tokens(text: str) -> int:
    """Return how many tokens ``text`` encodes to under ``cl100k_base``."""
    enc = tiktoken.get_encoding("cl100k_base")  # GPT-4/3.5 tokenizer
    return len(enc.encode(text))

 # 2. Set up JSON loader with content field extraction
 # LangChain exposes this loader as ``JSONLoader`` (there is no ``JsonLoader``),
 # and it calls ``metadata_func(record, default_metadata)`` with two arguments.
 # The loader-supplied default metadata is deliberately ignored so that each
 # document carries exactly the record's own fields other than "content".
 loader = JSONLoader(
    file_path="input.json",
    jq_schema=".[]",  # Process array of JSON objects
    content_key="content",  # Field containing markdown content
    metadata_func=lambda record, _default_metadata: {
        k: v for k, v in record.items() if k != "content"
    },
 )

 # 3. Initialize splitter with Markdown-aware parameters
 splitter = RecursiveCharacterTextSplitter(
    # Separator candidates: level-2 and level-3 Markdown headings, blank
    # lines, single newlines, spaces, and finally the empty string.
    separators=["\n\n## ", "\n\n### ", "\n\n", "\n", " ", ""],
    # Lengths are measured with count_tokens, so the sizes below are tokens.
    length_function=count_tokens,
    chunk_size=512,  # Token limit per chunk
    chunk_overlap=64,  # Context overlap between chunks
 )

 # 4. Process and split documents
 def process_documents():
    """Load documents, split their content, and write every chunk to disk.

    Documents come from the module-level ``loader``. Each document's
    ``page_content`` is split with the module-level ``splitter``, and every
    chunk becomes a dict holding the chunk under ``"content"`` followed by
    the document's metadata entries. The dicts are written as a single JSON
    array to ``split_output.json``.
    """
    split_docs = [
        {"content": piece, **source_doc.metadata}
        for source_doc in loader.load()
        for piece in splitter.split_text(source_doc.page_content)
    ]

    # All chunks go into one output file.
    with open("split_output.json", "w") as out_file:
        json.dump(split_docs, out_file, indent=2)

 if __name__ == "__main__":
    process_documents()
 


 RecursiveCharacterTextSplitter(
    chunk_size=25,  # Max tokens per chunk
    chunk_overlap=5,
    # Without an explicit length_function the sizes above are not measured
    # with count_tokens; pass it so they are token counts, as stated.
    length_function=count_tokens,
    separators=["\n\n## ", "\n\n### ", "\n\n", "\n", " "]
 )
	Step-by-Step Instructions for Summarizing PDF Chunks for Improved Citation Retrieval and Query Similarity

	Objective:
	Create concise summaries of large PDF text chunks that retain critical information for both human readability and effective similarity matching during query retrieval.

	Optimize Summaries for Query Similarity
	Why: Ensure summaries retain terms/queries users might search for.
	Steps:

	Keyword Extraction:

	Use RAKE, YAKE, or SpaCy’s Noun Chunks to identify key terms (e.g., entities, domain-specific jargon).

	Explicitly include these keywords in summaries.

	Query-Focused Prompts:

	For abstractive models, add instructions to the summarization prompt:

	Example prompt:
	"Summarize this text for a technical audience, emphasizing [topic/keywords]."
	Embedding Tests:

	Generate embeddings for summaries and original chunks using models like Sentence-BERT or OpenAI Embeddings.

	Compare cosine similarity between original and summary embeddings to ensure critical information is preserved.


	Validate Summary Quality
	Checklist for Each Summary:

	✅ Retains key entities (names, dates, concepts).

	✅ Preserves relationships (e.g., causality, comparisons).

	✅ Matches the semantic meaning of the original chunk (test with embedding similarity).

	✅ Avoids hallucinations (common in abstractive methods).


	######################
	Here's a Python code template to preprocess Markdown text chunks for summarization and similarity tasks. The code handles Markdown syntax removal, text cleaning, and sentence splitting:


	import re
	import spacy
	from unidecode import unidecode

	# Load SpaCy model for sentence splitting (run `python -m spacy download en_core_web_sm` first)
	nlp = spacy.load("en_core_web_sm")

	def preprocess_markdown_chunk(text_chunk):
	    """Clean one Markdown chunk and split the result into sentences.

	    The chunk is passed, in order, through ``strip_markdown``,
	    ``remove_urls``, ``normalize_whitespace`` and ``fix_encoding``; the
	    resulting text is then handed to ``split_sentences``.

	    Returns:
	        A dict with two keys: ``"cleaned_text"`` (the cleaned string) and
	        ``"sentences"`` (the value returned by ``split_sentences`` for it).
	    """
	    # Remove Markdown syntax
	    cleaned = strip_markdown(text_chunk)

	    # Additional cleaning
	    cleaned = remove_urls(cleaned)
	    cleaned = normalize_whitespace(cleaned)
	    cleaned = fix_encoding(cleaned)

	    # Split sentences
	    sentences = split_sentences(cleaned)

	    return {
	        "cleaned_text": cleaned,
	        "sentences": sentences,
	    }

	def strip_markdown(text):
	    """Remove common Markdown formatting from ``text`` and return plain text.

	    Handles images, links, fenced code blocks, inline code, ``#`` headers,
	    blockquotes, bullet/numbered list markers and ``*``/``_`` emphasis.
	    Images and links are reduced to their alt/link text, fenced code blocks
	    are dropped entirely, and inline code keeps its contents.

	    Args:
	        text: Markdown source string.

	    Returns:
	        The string with the formatting markers removed.
	    """
	    # Images go before links: an image is a link with a leading "!".
	    # The groups use ".*?" (not ".?") so alt/link text and targets of any
	    # length are matched.
	    text = re.sub(r'!\[(.*?)\]\(.*?\)', r'\1', text)
	    text = re.sub(r'\[(.*?)\]\(.*?\)', r'\1', text)
	    # Fenced blocks go before inline code so that the fence backticks are
	    # not picked up by the inline-code pattern.
	    text = re.sub(r'```.*?```', '', text, flags=re.DOTALL)
	    text = re.sub(r'`([^`]+)`', r'\1', text)
	    # Line-leading markers (headers, blockquotes, lists) are stripped BEFORE
	    # emphasis. Otherwise the "*" of a bullet such as "* item with *emph*"
	    # is consumed as an emphasis opener and a stray "*" is left behind.
	    text = re.sub(r'^#+\s+', '', text, flags=re.MULTILINE)
	    text = re.sub(r'^>\s+', '', text, flags=re.MULTILINE)
	    text = re.sub(r'^[\*\-+]\s+', '', text, flags=re.MULTILINE)
	    text = re.sub(r'^\d+\.\s+', '', text, flags=re.MULTILINE)
	    # Emphasis. Underscores only count when they are not inside a word, so
	    # snake_case identifiers (my_var, fraud_detection_model) are preserved.
	    text = re.sub(r'\*{1,2}(.*?)\*{1,2}', r'\1', text)
	    text = re.sub(r'(?<!\w)_{1,2}(.*?)_{1,2}(?!\w)', r'\1', text)
	    return text

	def remove_urls(text):
	    """Delete bare URLs that survived Markdown stripping.

	    A URL is anything starting with ``http://``, ``https://`` or ``www.``
	    and running up to the next whitespace character. Each match is replaced
	    with an empty string; the surrounding whitespace is left as it is.
	    """
	    # The "|" must be unescaped: it is the alternation between the two URL
	    # forms, not a literal pipe character.
	    return re.sub(r'https?://\S+|www\.\S+', '', text)

	def normalize_whitespace(text):
	    """Collapse whitespace runs and tidy the spacing before punctuation.

	    Every run of whitespace (spaces, tabs, newlines) becomes a single
	    space, a whitespace character directly in front of ``.``, ``,``, ``!``
	    or ``?`` is dropped, and the result is trimmed at both ends.
	    """
	    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces/newlines
	    text = re.sub(r'\s([.,!?])', r'\1', text)  # Remove space before punctuation
	    return text.strip()

	def fix_encoding(text):
	    """Return ``text`` after passing it through ``unidecode``.

	    NOTE(review): ``unidecode`` is expected to transliterate non-ASCII
	    characters to ASCII approximations - confirm this lossy step is wanted
	    for every downstream consumer of the cleaned text.
	    """
	    return unidecode(text)

	def split_sentences(text):
	    """Segment ``text`` into sentences with the module-level spaCy pipeline.

	    Returns:
	        A list holding the stripped text of each span in ``doc.sents``.
	    """
	    doc = nlp(text)
	    return [sent.text.strip() for sent in doc.sents]

	# Example usage
	if __name__ == "__main__":
	    # Local import keeps the demo self-contained.
	    import textwrap

	    # The literal below is indented to line up with this block. Dedent it so
	    # headers, list items and blockquotes start at column 0, which is what
	    # the "^"-anchored patterns in strip_markdown need in order to match.
	    # The "**" bold markers around "Machine learning" are restored so the
	    # sample exercises asterisk emphasis as well as underscore emphasis.
	    sample_markdown = textwrap.dedent("""
	    # Machine Learning Applications

	    **Machine learning** (ML) has revolutionized _many industries_. Here are key applications:

	    - Healthcare: Predictive analytics for [patient outcomes](https://example.com)
	    - Finance: `fraud_detection.py` systems using neural networks
	    - Climate Science: ```python
	    print("CO2 emission prediction models")
	    ```

	    > "ML will transform how we approach complex problems." - John Doe

	    ![ML Diagram](image.png) See more at www.mlexamples.com
	    """)

	    processed = preprocess_markdown_chunk(sample_markdown)

	    # Show the fully cleaned text first, then the numbered sentences.
	    print("=== Cleaned Text ===")
	    print(processed["cleaned_text"])

	    print("\n=== Sentences ===")
	    for i, sent in enumerate(processed["sentences"], 1):
	        print(f"{i}. {sent}")




	Here's a solution using LangChain's JSONLoader and RecursiveCharacterTextSplitter, configured for Markdown content. The code splits the `content` field of each JSON object into chunks bounded by token count, while preserving the object's other fields as metadata:
	import json
	from langchain.document_loaders import JSONLoader
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	import tiktoken  # For accurate token counting

	# 1. Configure token counting function
	def count_tokens(text: str) -> int:
	    """Return how many tokens ``text`` encodes to under ``cl100k_base``."""
	    enc = tiktoken.get_encoding("cl100k_base")  # GPT-4/3.5 tokenizer
	    return len(enc.encode(text))

	# 2. Set up JSON loader with content field extraction
	# LangChain exposes this loader as ``JSONLoader`` (there is no ``JsonLoader``),
	# and it calls ``metadata_func(record, default_metadata)`` with two arguments.
	# The loader-supplied default metadata is deliberately ignored so that each
	# document carries exactly the record's own fields other than "content".
	loader = JSONLoader(
	    file_path="input.json",
	    jq_schema=".[]",  # Process array of JSON objects
	    content_key="content",  # Field containing markdown content
	    metadata_func=lambda record, _default_metadata: {
	        k: v for k, v in record.items() if k != "content"
	    },
	)

	# 3. Initialize splitter with Markdown-aware parameters
	splitter = RecursiveCharacterTextSplitter(
	    # Separator candidates: level-2 and level-3 Markdown headings, blank
	    # lines, single newlines, spaces, and finally the empty string.
	    separators=["\n\n## ", "\n\n### ", "\n\n", "\n", " ", ""],
	    # Lengths are measured with count_tokens, so the sizes below are tokens.
	    length_function=count_tokens,
	    chunk_size=512,  # Token limit per chunk
	    chunk_overlap=64,  # Context overlap between chunks
	)

	# 4. Process and split documents
	def process_documents():
	    """Load documents, split their content, and write every chunk to disk.

	    Documents come from the module-level ``loader``. Each document's
	    ``page_content`` is split with the module-level ``splitter``, and every
	    chunk becomes a dict holding the chunk under ``"content"`` followed by
	    the document's metadata entries. The dicts are written as a single JSON
	    array to ``split_output.json``.
	    """
	    original_docs = loader.load()
	    split_docs = []

	    for doc in original_docs:
	        # Split the content field
	        chunks = splitter.split_text(doc.page_content)

	        # Create new JSON objects for each chunk
	        for chunk in chunks:
	            new_doc = {
	                "content": chunk,
	                **doc.metadata,  # Preserve original metadata
	            }
	            split_docs.append(new_doc)

	    # 5. Save all chunks to single file
	    with open("split_output.json", "w") as f:
	        json.dump(split_docs, f, indent=2)

	if __name__ == "__main__":
	    process_documents()



	RecursiveCharacterTextSplitter(
	    chunk_size=25,  # Max tokens per chunk
	    chunk_overlap=5,
	    # Without an explicit length_function the sizes above are not measured
	    # with count_tokens; pass it so they are token counts, as stated.
	    length_function=count_tokens,
	    separators=["\n\n## ", "\n\n### ", "\n\n", "\n", " "]
	)
No results found