Skip to content

Instantly share code, notes, and snippets.

@kenny-kvibe
Last active March 21, 2026 12:59
Show Gist options
  • Select an option

  • Save kenny-kvibe/b380c9c45c94130459958652aea12d79 to your computer and use it in GitHub Desktop.

Select an option

Save kenny-kvibe/b380c9c45c94130459958652aea12d79 to your computer and use it in GitHub Desktop.
Extract text from a document
import os
def extract_document_text(file_path: str, *, line_separator: str = '\n') -> str:
    """Extract the textual content of a document as a single string.

    Supported file types: `.txt`, `.md`, `.rst`, `.csv`, `.tsv`, `.log`, `.py`,
    `.rs`, `.json`, `.htm`, `.html`, `.xhtml`, `.xml`, `.pdf`, `.docx`, `.xlsx`,
    `.odt`, `.ods`

    The parser is selected from the case-insensitive file extension. Third-party
    parsers are imported lazily inside their own branch, so only the package
    needed for the requested file type has to be installed.

    Args:
        file_path: Path of the document to read.
        line_separator: String placed between the extracted lines, paragraphs,
            pages or rows. It is not used for `.json` and `.xml`, which are
            returned as one compact JSON string.

    Returns:
        The extracted text. Spreadsheet rows are rendered as cells joined
        by `', '`.

    Raises:
        FileNotFoundError: If `file_path` does not exist.
        ValueError: If the file extension is not one of the supported types.
    """
    # Guard clauses: report a missing file before looking at its type, so each
    # failure mode raises the exception that actually describes it.
    if not os.path.exists(file_path):
        raise FileNotFoundError(file_path)

    ext = os.path.splitext(file_path)[-1].lower()

    if ext in ('.txt', '.md', '.rst', '.csv', '.tsv', '.log', '.py', '.rs'):
        with open(file_path, 'rb') as fp:
            # 'utf-8-sig' decodes plain UTF-8 and also drops a leading BOM,
            # which `str.strip` would otherwise leave in the first line.
            lines = fp.read().decode('utf-8-sig').split('\n')
        # Stripping each line also removes the '\r' of CRLF line endings;
        # lines that end up empty are dropped.
        return line_separator.join(filter(None, map(str.strip, lines)))

    if ext == '.json':
        import json
        with open(file_path, 'rb') as fp:
            data = json.load(fp)
        # Re-serialise compactly; `default=str` stringifies anything that is
        # not natively JSON-serialisable instead of failing.
        return json.dumps(data, indent=None, ensure_ascii=False, default=str)

    if ext == '.xml':
        import json
        import xmltodict
        with open(file_path, 'rb') as fp:
            data = xmltodict.parse(fp)
        return json.dumps(data, indent=None, ensure_ascii=False, default=str)

    if ext in ('.htm', '.html', '.xhtml'):
        import bs4
        with open(file_path, 'rb') as fp:
            # Restrict parsing to <body>. `SoupStrainer` is taken from the
            # package top level rather than the `bs4.filter` submodule.
            soup = bs4.BeautifulSoup(
                fp, 'html.parser', parse_only=bs4.SoupStrainer('body')
            )
        lines = soup.get_text(separator='').split('\n')
        return line_separator.join(filter(None, map(str.strip, lines)))

    if ext == '.pdf':
        import PyPDF2
        with open(file_path, 'rb') as fp:
            # Pages are extracted while the file is still open.
            # `or ''` guards against a page yielding no text object.
            return line_separator.join(filter(None, (
                (page.extract_text() or '').strip()
                for page in PyPDF2.PdfReader(fp).pages
            )))

    if ext == '.docx':
        import docx
        with open(file_path, 'rb') as fp:
            paragraphs = docx.Document(fp).paragraphs
            return line_separator.join(filter(None, (
                paragraph.text.strip() for paragraph in paragraphs
            )))

    if ext == '.xlsx':
        import openpyxl
        with open(file_path, 'rb') as fp:
            sheet = openpyxl.load_workbook(fp).active
            if sheet is None:
                return ''
            # Empty cells are kept as '' so the columns of every row line up.
            return line_separator.join(
                ', '.join(
                    '' if cell.value is None else str(cell.value)
                    for cell in row
                )
                for row in sheet.iter_rows()
            )

    if ext == '.odt':
        from odf import opendocument as odf_doc, teletype as odf_ttype, text as odf_text
        with open(file_path, 'rb') as fp:
            paragraphs = odf_doc.load(fp).getElementsByType(odf_text.P)
            return line_separator.join(filter(None, (
                odf_ttype.extractText(paragraph).strip()
                for paragraph in paragraphs
            )))

    if ext == '.ods':
        from odf import opendocument as odf_doc, table as odf_table, teletype as odf_ttype
        with open(file_path, 'rb') as fp:
            tables = odf_doc.load(fp).getElementsByType(odf_table.Table)
            # A document without any table yields no text, mirroring the
            # "no active sheet" case of `.xlsx`, instead of an IndexError.
            if not tables:
                return ''
            # Only the first table is read; empty cells are skipped.
            return line_separator.join(
                ', '.join(filter(None, (
                    odf_ttype.extractText(cell).strip()
                    for cell in row.getElementsByType(odf_table.TableCell)
                )))
                for row in tables[0].getElementsByType(odf_table.TableRow)
            )

    raise ValueError(f'Unsupported file type "{ext}"')
odfpy
PyPDF2
python-docx
openpyxl
beautifulsoup4
lxml
xmltodict
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment