Last active
March 21, 2026 12:59
-
-
Save kenny-kvibe/b380c9c45c94130459958652aea12d79 to your computer and use it in GitHub Desktop.
Extract text from a document
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import os | |
def _join_nonblank(chunks, separator: str) -> str:
    """Strip every chunk, drop the ones left empty, and join the rest with *separator*."""
    return separator.join(filter(None, (chunk.strip() for chunk in chunks)))


def extract_document_text(file_path: str, *, line_separator: str = '\n') -> str:
    """Extract the text content of a document, chosen by its file extension.

    Supported file types: `.txt`, `.md`, `.rst`, `.csv`, `.tsv`, `.log`, `.py`,
    `.rs`, `.json`, `.htm`, `.html`, `.xhtml`, `.xml`, `.pdf`, `.docx`, `.xlsx`,
    `.odt`, `.ods`

    The extension is matched case-insensitively. Third-party parsers
    (`xmltodict`, `bs4`, `PyPDF2`, `docx`, `openpyxl`, `odf`) are imported only
    when the matching file type is requested, so a missing package only
    affects its own format.

    Args:
        file_path: Path of the document to read.
        line_separator: String placed between extracted lines, paragraphs,
            pages or rows (keyword-only).

    Returns:
        The extracted text. `.json` and `.xml` documents are returned as a
        single-line JSON dump of the parsed content.

    Raises:
        FileNotFoundError: If `file_path` does not exist.
        ValueError: If the file exists but its extension is not supported.

    Errors raised while opening or parsing the file propagate unchanged.
    """
    ext = os.path.splitext(file_path)[-1].lower()

    # Guard clauses: report a missing file and an unsupported type as two
    # distinct errors instead of folding both into one.
    if not os.path.exists(file_path):
        raise FileNotFoundError(file_path)

    if ext in ('.txt', '.md', '.rst', '.csv', '.tsv', '.log', '.py', '.rs'):
        with open(file_path, 'rb') as fp:
            # 'utf-8-sig' decodes plain UTF-8 as well, but also removes a
            # leading byte-order mark that str.strip() would leave in place.
            content = fp.read().decode('utf-8-sig')
        return _join_nonblank(content.split('\n'), line_separator)

    if ext == '.json':
        import json
        with open(file_path, 'rb') as fp:
            data = json.load(fp)
        return json.dumps(data, indent=None, ensure_ascii=False, default=str)

    if ext == '.xml':
        import json
        import xmltodict
        with open(file_path, 'rb') as fp:
            data = xmltodict.parse(fp)
        return json.dumps(data, indent=None, ensure_ascii=False, default=str)

    if ext in ('.htm', '.html', '.xhtml'):
        import bs4
        with open(file_path, 'rb') as fp:
            # Only the <body> subtree is parsed. SoupStrainer is taken from the
            # package top level so this does not depend on the `bs4.filter`
            # submodule being present.
            soup = bs4.BeautifulSoup(fp, 'html.parser', parse_only=bs4.SoupStrainer('body'))
        return _join_nonblank(soup.get_text(separator='').split('\n'), line_separator)

    if ext == '.pdf':
        import PyPDF2
        with open(file_path, 'rb') as fp:
            # Pages are consumed while the file is still open; a falsy
            # extract_text() result is treated as an empty page.
            return _join_nonblank(
                ((page.extract_text() or '') for page in PyPDF2.PdfReader(fp).pages),
                line_separator,
            )

    if ext == '.docx':
        import docx
        with open(file_path, 'rb') as fp:
            return _join_nonblank(
                (paragraph.text for paragraph in docx.Document(fp).paragraphs),
                line_separator,
            )

    if ext == '.xlsx':
        import openpyxl
        with open(file_path, 'rb') as fp:
            sheet = openpyxl.load_workbook(fp).active
            if sheet is None:
                return ''
            # One output line per row of the active sheet; empty cells are
            # kept as empty strings so the cell positions stay aligned.
            return line_separator.join(
                ', '.join(
                    str(cell.value) if cell.value is not None else ''
                    for cell in row
                )
                for row in sheet.iter_rows()
            )

    if ext == '.odt':
        from odf import opendocument as odf_doc, teletype as odf_ttype, text as odf_text
        with open(file_path, 'rb') as fp:
            return _join_nonblank(
                (odf_ttype.extractText(p) for p in odf_doc.load(fp).getElementsByType(odf_text.P)),
                line_separator,
            )

    if ext == '.ods':
        from odf import opendocument as odf_doc, table as odf_table, teletype as odf_ttype
        with open(file_path, 'rb') as fp:
            tables = odf_doc.load(fp).getElementsByType(odf_table.Table)
            if not tables:
                # No table in the spreadsheet: nothing to extract.
                return ''
            # Only the first table is read; empty cells are dropped per row.
            return line_separator.join(
                _join_nonblank(
                    (odf_ttype.extractText(cell) for cell in row.getElementsByType(odf_table.TableCell)),
                    ', ',
                )
                for row in tables[0].getElementsByType(odf_table.TableRow)
            )

    raise ValueError(f'Unsupported file type "{ext}"')
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| odfpy | |
| PyPDF2 | |
| python-docx | |
| openpyxl | |
| beautifulsoup4 | |
| lxml | |
| xmltodict |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment