Created
January 27, 2025 07:05
-
-
Save capableguptadotcom/6ac76d07f22610c35ad17d1ba4cb6b5e to your computer and use it in GitHub Desktop.
Generate Summaries and Splitting File
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Step-by-Step Instructions for Summarizing PDF Chunks for Improved Citation Retrieval and Query Similarity

Objective: Create concise summaries of large PDF text chunks that retain critical information for both human readability and effective similarity matching during query retrieval.
| Optimize Summaries for Query Similarity | |
| Why: Ensure summaries retain terms/queries users might search for. | |
| Steps: | |
| Keyword Extraction: | |
| Use RAKE, YAKE, or SpaCy’s Noun Chunks to identify key terms (e.g., entities, domain-specific jargon). | |
| Explicitly include these keywords in summaries. | |
| Query-Focused Prompts: | |
| For abstractive models, add instructions to the summarization prompt: | |
| Copy | |
| "Summarize this text for a technical audience, emphasizing [topic/keywords]." | |
| Embedding Tests: | |
| Generate embeddings for summaries and original chunks using models like Sentence-BERT or OpenAI Embeddings. | |
| Compare cosine similarity between original and summary embeddings to ensure critical information is preserved. | |
| Validate Summary Quality | |
| Checklist for Each Summary: | |
| ✅ Retains key entities (names, dates, concepts). | |
| ✅ Preserves relationships (e.g., causality, comparisons). | |
| ✅ Matches the semantic meaning of the original chunk (test with embedding similarity). | |
| ✅ Avoids hallucinations (common in abstractive methods). | |
----------------------------------------

Below is a Python code template to preprocess Markdown text chunks for summarization and similarity tasks. The code handles Markdown syntax removal, text cleaning, and sentence splitting:
| import re | |
| import spacy | |
| from unidecode import unidecode | |
# Load SpaCy model for sentence splitting (run `python -m spacy download en_core_web_sm` first).
# Loaded once at import time; split_sentences() below reuses this pipeline.
nlp = spacy.load("en_core_web_sm")
def preprocess_markdown_chunk(text_chunk):
    """
    Clean a Markdown chunk and segment it into sentences.

    Pipeline: strip Markdown syntax, drop leftover URLs, collapse
    whitespace, transliterate to ASCII, then sentence-split with SpaCy.

    Returns a dict with keys "cleaned_text" (str) and "sentences" (list[str]).
    """
    # Run the cleaning stages in order; each stage takes and returns a str.
    cleaned = text_chunk
    for stage in (strip_markdown, remove_urls, normalize_whitespace, fix_encoding):
        cleaned = stage(cleaned)

    return {
        "cleaned_text": cleaned,
        "sentences": split_sentences(cleaned),
    }
def strip_markdown(text):
    """Strip common Markdown formatting, keeping the human-readable text."""
    # (pattern, replacement, flags), applied in order. Order matters:
    # images before links (they share the bracket syntax), fenced code
    # blocks before inline backticks.
    rules = (
        (r'!\[(.*?)\]\(.*?\)', r'\1', 0),            # images -> alt text
        (r'\[(.*?)\]\(.*?\)', r'\1', 0),             # links -> anchor text
        (r'```.*?```', '', re.DOTALL),               # fenced code blocks
        (r'`([^`]+)`', r'\1', 0),                    # inline code
        (r'^#+\s+', '', re.MULTILINE),               # headers
        (r'\*{1,2}(.*?)\*{1,2}', r'\1', 0),          # bold/italic with *
        (r'_{1,2}(.*?)_{1,2}', r'\1', 0),            # bold/italic with _
        (r'^>\s+', '', re.MULTILINE),                # blockquote markers
        (r'^[\*\-+]\s+', '', re.MULTILINE),          # bullet list markers
        (r'^\d+\.\s+', '', re.MULTILINE),            # numbered list markers
    )
    for pattern, repl, flags in rules:
        text = re.sub(pattern, repl, text, flags=flags)
    return text
def remove_urls(text):
    """Delete bare http(s)/www URLs left behind after Markdown stripping."""
    url_pattern = r'https?://\S+|www\.\S+'
    return re.sub(url_pattern, '', text)
def normalize_whitespace(text):
    """Collapse whitespace runs and tidy spacing around punctuation."""
    # Newlines, tabs, and repeated spaces all become a single space.
    collapsed = re.sub(r'\s+', ' ', text)
    # "word ." -> "word." for the common sentence punctuation marks.
    tightened = re.sub(r'\s([.,!?])', r'\1', collapsed)
    return tightened.strip()
def fix_encoding(text):
    """Transliterate accented/special characters to their ASCII equivalents."""
    # Delegates entirely to unidecode (imported at module top).
    ascii_text = unidecode(text)
    return ascii_text
def split_sentences(text):
    """Return the list of sentences detected by the module-level SpaCy pipeline."""
    return [sentence.text.strip() for sentence in nlp(text).sents]
# Example usage / smoke test: only runs when the file is executed directly.
if __name__ == "__main__":
    # Sample chunk exercising every Markdown feature the cleaner handles:
    # header, emphasis, list items, link, inline code, fenced code block,
    # blockquote, and a bare URL.
    sample_markdown = """
# Machine Learning Applications
**Machine learning** (ML) has revolutionized _many industries_. Here are key applications:
- Healthcare: Predictive analytics for [patient outcomes](https://example.com)
- Finance: `fraud_detection.py` systems using neural networks
- Climate Science: ```python
print("CO2 emission prediction models")
```
> "ML will transform how we approach complex problems." - John Doe
 See more at www.mlexamples.com
"""
    processed = preprocess_markdown_chunk(sample_markdown)
    print("=== Cleaned Text ===")
    print(processed["cleaned_text"])
    print("\n=== Sentences ===")
    # Numbered sentence listing, starting at 1 for readability.
    for i, sent in enumerate(processed["sentences"], 1):
        print(f"{i}. {sent}")
| Here's a solution using LangChain's JsonLoader and RecursiveCharacterTextSplitter optimized for Markdown content. This code splits JSON objects based on token length while preserving metadata: | |
| import json | |
| from langchain.document_loaders import JsonLoader | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| import tiktoken # For accurate token counting | |
# 1. Configure token counting function
from functools import lru_cache

@lru_cache(maxsize=1)
def _get_encoder():
    """Build and cache the cl100k_base (GPT-4/3.5) tiktoken encoder."""
    return tiktoken.get_encoding("cl100k_base")

def count_tokens(text: str) -> int:
    """Return the number of cl100k_base tokens in `text`.

    Used as the splitter's `length_function`, so it runs once per candidate
    chunk — the encoder lookup is hoisted into a cached helper instead of
    being performed on every call.
    """
    return len(_get_encoder().encode(text))
# 2. Set up JSON loader with content field extraction.
# NOTE(review): released LangChain versions expose this class as `JSONLoader`
# (now under `langchain_community.document_loaders`) — verify the `JsonLoader`
# import name against the installed version before running.
loader = JsonLoader(
    file_path="input.json",
    jq_schema=".[]",  # Iterate over a top-level JSON array of objects
    content_key="content",  # Field whose value becomes the document text
    # Everything except "content" is carried along as per-document metadata.
    metadata_func=lambda record: {k: v for k, v in record.items() if k != "content"}
)
# 3. Initialize splitter with Markdown-aware parameters.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=512,  # Token limit per chunk (measured by count_tokens, not characters)
    chunk_overlap=64,  # Context overlap between chunks
    length_function=count_tokens,
    # Separators are tried in order: prefer breaking at Markdown headings,
    # then paragraphs, then lines, then words, then anywhere.
    separators=[
        "\n\n## ",  # Markdown headings
        "\n\n### ",
        "\n\n",
        "\n",
        " ",
        ""
    ]
)
# 4. Process and split documents
def process_documents():
    """Load input.json via `loader`, split each document's content into
    token-bounded chunks with `splitter`, and write every chunk to
    split_output.json.

    Each output object keeps the source document's metadata and replaces
    "content" with a single chunk of the original markdown.

    Returns the list of chunk dicts (also written to disk), which makes the
    pipeline inspectable and testable; callers that ignored the previous
    None return are unaffected.
    """
    split_docs = []
    for doc in loader.load():
        # Split only the markdown content; metadata is copied onto every chunk.
        for chunk in splitter.split_text(doc.page_content):
            split_docs.append({
                "content": chunk,
                **doc.metadata  # Preserve original metadata
            })

    # 5. Save all chunks to a single file. Explicit utf-8 + ensure_ascii=False
    # keeps non-ASCII markdown characters readable instead of \uXXXX-escaped.
    with open("split_output.json", "w", encoding="utf-8") as f:
        json.dump(split_docs, f, indent=2, ensure_ascii=False)

    return split_docs
if __name__ == "__main__":
    # Run the split pipeline only when executed as a script (not on import).
    process_documents()
# NOTE(review): this expression constructs a second splitter and discards the
# result — it appears to be an illustrative alternative configuration (tiny
# 25-token chunks), not live pipeline code. Assign it to a name or remove it
# if the small-chunk variant is actually needed.
RecursiveCharacterTextSplitter(
    chunk_size=25,  # Max tokens per chunk
    chunk_overlap=5,
    separators=["\n\n## ", "\n\n### ", "\n\n", "\n", " "]
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment