Last active
March 24, 2026 20:01
-
-
Save kenny-kvibe/b380c9c45c94130459958652aea12d79 to your computer and use it in GitHub Desktop.
Extract text from a document
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import os | |
def extract_document_text(file_path: str, *, line_separator: str = '\n') -> str:
    """Extract the textual content of a document as a single string.

    The format is chosen from the (case-insensitive) file extension:

    * `.json`, `.yaml`, `.yml`, `.xml` are parsed and re-serialised as
      single-line JSON; values JSON cannot encode are passed through `str`.
    * `.htm`, `.html`, `.xhtml` yield the text of the `<body>` element.
    * `.pdf`, `.docx`, `.odt` yield one chunk per page / paragraph.
    * `.xlsx` yields the rows of the active sheet and `.ods` the rows of the
      first table, cells joined with `', '`.
    * Every other file is read as UTF-8 plain text (a leading BOM is dropped).

    Except for spreadsheet rows, chunks are stripped and empty ones are
    discarded before joining. The third-party parsers are imported lazily, so
    only the package for the requested format has to be installed.

    Args:
        file_path: Path to the document; resolved with `os.path.realpath`.
        line_separator: String placed between extracted lines / rows.

    Returns:
        The extracted text.

    Raises:
        FileNotFoundError: If the resolved path does not exist.
    """
    file_path = os.path.realpath(file_path)
    if not os.path.exists(file_path):
        raise FileNotFoundError(file_path)

    def join_nonempty(chunks, separator=line_separator):
        # Strip every chunk, drop the ones left empty, join the remainder.
        return separator.join(filter(bool, (chunk.strip() for chunk in chunks)))

    def as_json(data):
        # Compact JSON; default=str stringifies values JSON cannot encode
        # (e.g. dates produced by the YAML loader).
        import json
        return json.dumps(data, indent=None, ensure_ascii=False, default=str)

    ext = os.path.splitext(file_path)[-1].lower()

    if ext == '.json':
        import json
        with open(file_path, 'rb') as fp:
            data = json.load(fp)
        return as_json(data)

    if ext in ('.yaml', '.yml'):
        import yaml
        with open(file_path, 'rb') as fp:
            data = yaml.safe_load(fp)
        return as_json(data)

    if ext == '.xml':
        import xmltodict
        with open(file_path, 'rb') as fp:
            data = xmltodict.parse(fp)
        return as_json(data)

    if ext in ('.htm', '.html', '.xhtml'):
        import bs4
        with open(file_path, 'rb') as fp:
            # Restrict parsing to <body> so head/script metadata is skipped.
            soup = bs4.BeautifulSoup(fp, 'html.parser', parse_only=bs4.SoupStrainer('body'))
        return join_nonempty(soup.get_text(separator='').split('\n'))

    if ext == '.pdf':
        import PyPDF2
        with open(file_path, 'rb') as fp:
            # Pages are read lazily from fp, so join while the file is open.
            return join_nonempty(page.extract_text() for page in PyPDF2.PdfReader(fp).pages)

    if ext == '.docx':
        import docx
        with open(file_path, 'rb') as fp:
            return join_nonempty(p.text for p in docx.Document(fp).paragraphs)

    if ext == '.xlsx':
        import openpyxl
        with open(file_path, 'rb') as fp:
            sheet = openpyxl.load_workbook(fp).active
            if sheet is None:
                return ''
            # Empty cells are kept (as '') so columns stay aligned.
            return line_separator.join(
                ', '.join('' if cell.value is None else str(cell.value) for cell in row)
                for row in sheet.iter_rows()
            )

    if ext == '.odt':
        import odf.opendocument as odf_doc
        import odf.teletype as odf_ttype
        import odf.text as odf_text
        with open(file_path, 'rb') as fp:
            return join_nonempty(
                odf_ttype.extractText(p)
                for p in odf_doc.load(fp).getElementsByType(odf_text.P)
            )

    if ext == '.ods':
        import odf.opendocument as odf_doc
        import odf.table as odf_table
        import odf.teletype as odf_ttype
        with open(file_path, 'rb') as fp:
            tables = odf_doc.load(fp).getElementsByType(odf_table.Table)
            if not tables:
                # No table to read: mirror the .xlsx branch instead of raising IndexError.
                return ''
            return line_separator.join(
                join_nonempty(
                    (odf_ttype.extractText(cell)
                     for cell in row.getElementsByType(odf_table.TableCell)),
                    ', ',
                )
                for row in tables[0].getElementsByType(odf_table.TableRow)
            )

    # Fallback: plain text. 'utf-8-sig' decodes like UTF-8 but swallows a
    # leading byte-order mark, which str.strip() would otherwise leave behind.
    with open(file_path, 'rb') as fp:
        raw = fp.read()
    return join_nonempty(raw.decode('utf-8-sig').split('\n'))
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| odfpy | |
| PyPDF2 | |
| python-docx | |
| openpyxl | |
| beautifulsoup4 | |
| lxml | |
| xmltodict | |
| PyYAML |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment