Last active
May 29, 2021 21:35
-
-
Save michalsieron/c8f30e2fd874b8a60f871143d1c2dca0 to your computer and use it in GitHub Desktop.
Transform Adobe Acrobat Reader comment xfdf file to txt
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Transform an Adobe Acrobat Reader comment xfdf file to txt.

Usage: python3 xfdf.py <file_path>

<file_path> may be given with or without its extension and may contain
directories; only the final extension is replaced. The annotations are
read from ``<base>.xfdf`` and a UTF-8 text file ``<base>.txt`` is
created next to the original xfdf file.
"""
import os
import sys
from xml.dom.minidom import parse


def _format_date(raw: str) -> str:
    """Return the ``date`` attribute value *raw* as ``DD.MM.YYYY``.

    The value is expected to look like ``D:YYYYMMDD...``; the ``D:``
    prefix is skipped when present. No validation is performed: the
    result is built purely by slicing, so a missing or short value
    yields empty components.
    """
    digits = raw[2:] if raw.startswith("D:") else raw
    year, month, day = digits[0:4], digits[4:6], digits[6:8]
    return f"{day}.{month}.{year}"


def _span_text(span):
    """Return the normalized text of a ``span`` element, or ``None``.

    ``None`` is returned when the span is empty or its first child is
    not a text node, because only text nodes provide ``wholeText``.
    """
    first = span.firstChild
    if first is None:
        return None
    if first.nodeType not in (first.TEXT_NODE, first.CDATA_SECTION_NODE):
        return None
    # Collapse doubled carriage returns first, then turn every remaining
    # carriage return into a line feed.
    text = first.wholeText.replace("\r\r", "\r").replace("\r", "\n")
    return text.strip()


def convert(xfdf_path: str, txt_path: str) -> None:
    """Write the comments found in *xfdf_path* to *txt_path*.

    For every element directly below the first ``annots`` element one
    header line with its ``page``, ``title`` (author) and ``date``
    attributes is written, followed by the text of each ``span``
    element inside it. A file without an ``annots`` element produces an
    empty output file.

    Errors raised while opening or parsing *xfdf_path* are not caught.
    """
    dom = parse(xfdf_path)
    containers = dom.getElementsByTagName("annots")
    children = containers[0].childNodes if containers else []
    # Whitespace and comment nodes between annotations have no
    # attributes, so only element nodes are processed.
    annotations = [n for n in children if n.nodeType == n.ELEMENT_NODE]

    # Explicit UTF-8 so non-ASCII comment text does not depend on the
    # platform's default encoding.
    with open(txt_path, "w", encoding="utf-8") as fp:
        for annotation in annotations:
            page = annotation.getAttribute("page")
            author = annotation.getAttribute("title")
            date = _format_date(annotation.getAttribute("date"))
            fp.write(f"page: {page}, author: {author}, date: {date}\n")
            for span in annotation.getElementsByTagName("span"):
                text = _span_text(span)
                if text is not None:
                    fp.write(text + "\n\n")


def main(argv=None) -> int:
    """Run the command-line interface and return the process exit code.

    *argv* is the list of arguments without the program name; when it
    is ``None`` the arguments are taken from ``sys.argv``. Returns 1
    when no file name was given, otherwise 0.
    """
    args = sys.argv[1:] if argv is None else argv
    if not args:
        print("You must provide file name!", file=sys.stderr)
        return 1
    # splitext only removes the final extension of the last path
    # component, so dots in directory or file names are preserved.
    base = os.path.splitext(args[0])[0]
    convert(base + ".xfdf", base + ".txt")
    return 0


if __name__ == "__main__":
    sys.exit(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment