kenny-kvibe · March 24, 2026 20:01
diff --git a/document_text_extractor.py b/document_text_extractor.py
 import os


 def extract_document_text(file_path: str, *, line_separator: str = '\n') -> str:
 	"""Extract the textual content of a document as a single string.

 	The parser is selected from the lower-cased file extension:

 	- `.json`, `.yaml`, `.yml`, `.xml`: parsed, then re-serialized as compact JSON.
 	- `.htm`, `.html`, `.xhtml`: text of the `<body>` element, non-blank lines only.
 	- `.pdf`: stripped text of every page, empty pages skipped.
 	- `.docx`, `.odt`: stripped text of every paragraph, empty paragraphs skipped.
 	- `.xlsx`: rows of the active sheet, cells joined with `', '` (empty cells become `''`).
 	- `.ods`: rows of the first table, non-empty cells joined with `', '`.
 	- anything else: decoded as UTF-8 plain text (a leading BOM is discarded),
 	  each line stripped and blank lines dropped.

 	Parser modules are imported lazily inside their branch, so only the package
 	needed for the requested file type has to be importable.

 	Args:
 		file_path: Path of the document; it is resolved with `os.path.realpath` first.
 		line_separator: String placed between extracted lines, pages, paragraphs
 			or rows. It is not used for the JSON-like formats.

 	Returns:
 		The extracted text; `''` for a workbook without an active sheet or a
 		spreadsheet without any table.

 	Raises:
 		FileNotFoundError: If the resolved path does not exist.
 		UnicodeDecodeError: If a plain-text file is not valid UTF-8.
 	"""
 	file_path = os.path.realpath(file_path)
 	if not os.path.exists(file_path):
 		raise FileNotFoundError(file_path)

 	def to_compact_json(data) -> str:
 		# Serialize parsed data on one line; `default=str` stringifies any value
 		# the JSON encoder cannot serialize natively.
 		import json
 		return json.dumps(data, indent=None, ensure_ascii=False, default=str)

 	def join_non_empty(chunks) -> str:
 		# Join the truthy (non-empty) strings of `chunks` with the line separator.
 		return line_separator.join(filter(bool, chunks))

 	ext = os.path.splitext(file_path)[-1].lower()

 	# Every file is opened in binary mode so that each parser does its own decoding.
 	if ext == '.json':
 		import json
 		with open(file_path, 'rb') as fp:
 			return to_compact_json(json.load(fp))

 	if ext in ('.yaml', '.yml'):
 		import yaml
 		with open(file_path, 'rb') as fp:
 			return to_compact_json(yaml.safe_load(fp))

 	if ext == '.xml':
 		import xmltodict
 		with open(file_path, 'rb') as fp:
 			return to_compact_json(xmltodict.parse(fp))

 	if ext in ('.htm', '.html', '.xhtml'):
 		import bs4
 		import bs4.filter as bs4_filter
 		with open(file_path, 'rb') as fp:
 			# The strainer restricts parsing to the <body> element.
 			soup = bs4.BeautifulSoup(fp, 'html.parser', None, bs4_filter.SoupStrainer('body'))
 			return join_non_empty(map(str.strip, soup.get_text(separator='').split('\n')))

 	if ext == '.pdf':
 		import PyPDF2
 		with open(file_path, 'rb') as fp:
 			return join_non_empty(
 				page.extract_text().strip()
 				for page in PyPDF2.PdfReader(fp).pages
 			)

 	if ext == '.docx':
 		import docx
 		with open(file_path, 'rb') as fp:
 			return join_non_empty(
 				paragraph.text.strip()
 				for paragraph in docx.Document(fp).paragraphs
 			)

 	if ext == '.xlsx':
 		import openpyxl
 		with open(file_path, 'rb') as fp:
 			sheet = openpyxl.load_workbook(fp).active
 			# A workbook without an active sheet yields no text.
 			if sheet is None:
 				return ''
 			return line_separator.join(
 				', '.join('' if cell.value is None else str(cell.value) for cell in row)
 				for row in sheet.iter_rows()
 			)

 	if ext == '.odt':
 		import odf.opendocument as odf_doc
 		import odf.teletype as odf_ttype
 		import odf.text as odf_text
 		with open(file_path, 'rb') as fp:
 			return join_non_empty(
 				odf_ttype.extractText(paragraph).strip()
 				for paragraph in odf_doc.load(fp).getElementsByType(odf_text.P)
 			)

 	if ext == '.ods':
 		import odf.opendocument as odf_doc
 		import odf.table as odf_table
 		import odf.teletype as odf_ttype
 		with open(file_path, 'rb') as fp:
 			tables = odf_doc.load(fp).getElementsByType(odf_table.Table)
 			# Mirror the `.xlsx` branch: no table means no text, not an IndexError.
 			if not tables:
 				return ''
 			# Only the first table is read; empty cells are skipped within a row.
 			return line_separator.join(
 				', '.join(filter(bool, (
 					odf_ttype.extractText(cell).strip()
 					for cell in row.getElementsByType(odf_table.TableCell)
 				)))
 				for row in tables[0].getElementsByType(odf_table.TableRow)
 			)

 	with open(file_path, 'rb') as fp:
 		# 'utf-8-sig' decodes like UTF-8 but discards a leading byte-order mark,
 		# which would otherwise remain glued to the first extracted line.
 		content = fp.read().decode('utf-8-sig')
 	return join_non_empty(map(str.strip, content.split('\n')))
diff --git a/requirements.txt b/requirements.txt
 odfpy
 PyPDF2
 python-docx
 openpyxl
 beautifulsoup4
 lxml
 xmltodict
	import os


	def extract_document_text(file_path: str, *, line_separator: str = '\n') -> str:
		"""Extract the textual content of a document as a single string.

		The parser is selected from the lower-cased file extension:

		- `.json`, `.yaml`, `.yml`, `.xml`: parsed, then re-serialized as compact JSON.
		- `.htm`, `.html`, `.xhtml`: text of the `<body>` element, non-blank lines only.
		- `.pdf`: stripped text of every page, empty pages skipped.
		- `.docx`, `.odt`: stripped text of every paragraph, empty paragraphs skipped.
		- `.xlsx`: rows of the active sheet, cells joined with `', '` (empty cells become `''`).
		- `.ods`: rows of the first table, non-empty cells joined with `', '`.
		- anything else: decoded as UTF-8 plain text (a leading BOM is discarded),
		  each line stripped and blank lines dropped.

		Parser modules are imported lazily inside their branch, so only the package
		needed for the requested file type has to be importable.

		Args:
			file_path: Path of the document; it is resolved with `os.path.realpath` first.
			line_separator: String placed between extracted lines, pages, paragraphs
				or rows. It is not used for the JSON-like formats.

		Returns:
			The extracted text; `''` for a workbook without an active sheet or a
			spreadsheet without any table.

		Raises:
			FileNotFoundError: If the resolved path does not exist.
			UnicodeDecodeError: If a plain-text file is not valid UTF-8.
		"""
		file_path = os.path.realpath(file_path)
		if not os.path.exists(file_path):
			raise FileNotFoundError(file_path)

		def to_compact_json(data) -> str:
			# Serialize parsed data on one line; `default=str` stringifies any value
			# the JSON encoder cannot serialize natively.
			import json
			return json.dumps(data, indent=None, ensure_ascii=False, default=str)

		def join_non_empty(chunks) -> str:
			# Join the truthy (non-empty) strings of `chunks` with the line separator.
			return line_separator.join(filter(bool, chunks))

		ext = os.path.splitext(file_path)[-1].lower()

		# Every file is opened in binary mode so that each parser does its own decoding.
		if ext == '.json':
			import json
			with open(file_path, 'rb') as fp:
				return to_compact_json(json.load(fp))

		if ext in ('.yaml', '.yml'):
			import yaml
			with open(file_path, 'rb') as fp:
				return to_compact_json(yaml.safe_load(fp))

		if ext == '.xml':
			import xmltodict
			with open(file_path, 'rb') as fp:
				return to_compact_json(xmltodict.parse(fp))

		if ext in ('.htm', '.html', '.xhtml'):
			import bs4
			import bs4.filter as bs4_filter
			with open(file_path, 'rb') as fp:
				# The strainer restricts parsing to the <body> element.
				soup = bs4.BeautifulSoup(fp, 'html.parser', None, bs4_filter.SoupStrainer('body'))
				return join_non_empty(map(str.strip, soup.get_text(separator='').split('\n')))

		if ext == '.pdf':
			import PyPDF2
			with open(file_path, 'rb') as fp:
				return join_non_empty(
					page.extract_text().strip()
					for page in PyPDF2.PdfReader(fp).pages
				)

		if ext == '.docx':
			import docx
			with open(file_path, 'rb') as fp:
				return join_non_empty(
					paragraph.text.strip()
					for paragraph in docx.Document(fp).paragraphs
				)

		if ext == '.xlsx':
			import openpyxl
			with open(file_path, 'rb') as fp:
				sheet = openpyxl.load_workbook(fp).active
				# A workbook without an active sheet yields no text.
				if sheet is None:
					return ''
				return line_separator.join(
					', '.join('' if cell.value is None else str(cell.value) for cell in row)
					for row in sheet.iter_rows()
				)

		if ext == '.odt':
			import odf.opendocument as odf_doc
			import odf.teletype as odf_ttype
			import odf.text as odf_text
			with open(file_path, 'rb') as fp:
				return join_non_empty(
					odf_ttype.extractText(paragraph).strip()
					for paragraph in odf_doc.load(fp).getElementsByType(odf_text.P)
				)

		if ext == '.ods':
			import odf.opendocument as odf_doc
			import odf.table as odf_table
			import odf.teletype as odf_ttype
			with open(file_path, 'rb') as fp:
				tables = odf_doc.load(fp).getElementsByType(odf_table.Table)
				# Mirror the `.xlsx` branch: no table means no text, not an IndexError.
				if not tables:
					return ''
				# Only the first table is read; empty cells are skipped within a row.
				return line_separator.join(
					', '.join(filter(bool, (
						odf_ttype.extractText(cell).strip()
						for cell in row.getElementsByType(odf_table.TableCell)
					)))
					for row in tables[0].getElementsByType(odf_table.TableRow)
				)

		with open(file_path, 'rb') as fp:
			# 'utf-8-sig' decodes like UTF-8 but discards a leading byte-order mark,
			# which would otherwise remain glued to the first extracted line.
			content = fp.read().decode('utf-8-sig')
		return join_non_empty(map(str.strip, content.split('\n')))
No results found