Skip to content

Instantly share code, notes, and snippets.

@evdcush
Forked from Arnie97/remove_watermark.py
Created June 29, 2022 11:04
Show Gist options
  • Select an option

  • Save evdcush/505862396d48607a0f9090903c6ee90b to your computer and use it in GitHub Desktop.

Select an option

Save evdcush/505862396d48607a0f9090903c6ee90b to your computer and use it in GitHub Desktop.

Revisions

  1. @Arnie97 Arnie97 revised this gist Aug 2, 2015. 1 changed file with 44 additions and 37 deletions.
    81 changes: 44 additions & 37 deletions remove_watermark.py
    Original file line number Diff line number Diff line change
    @@ -1,24 +1,21 @@
    #!/usr/bin/env python3

    import sys
    import re
    import shutil
    import argparse
    import binascii
    from os import path
    from sys import stderr

    #
    # Author: Daxda
    # Date: 02.04.2014
    # WTF: This is a quick tool I've hacked together to easily remove the meta
    # information as well as the annoying link on each page of eBooks down-
    # loaded from it-ebooks.info. The modified file will hold the original
    # file name and the original file will be renamed to 'original.pdf.OLD'
    #
    #

    # 'pattern' is the regex pattern which is used to remove the annotation elements,
    # the rough structure of it looks like this:
    # information as well as the annoying link on each page of eBooks
    # downloaded from it-ebooks.info. The modified file will hold the
    # original file name, and the original file will be renamed to
    # 'original.pdf.old'. 'pattern' is the regex pattern which is used to
    # remove the annotation elements, the rough structure of it looks
    # like this:
    #
    # obj
    # <<
    @@ -35,19 +32,22 @@
    # endobj
    #

    pattern = b"""0a2f54797065202f416e6e6f740a2f53756274797065202f4c696e6b0a2f526563
    74205b20.*?205d0a2f426f7264657220.*?\n0a2f41203c3c0a2f54797065202f416374696f6e0
    a2f53202f5552490a2f5552492028687474703a2f2f7777772e69742d65626f6f6b732e696e666f
    2f290a3e3e""".replace(b"\n", b"").strip()
    pattern = b'''0a2f54797065202f416e6e6f740a2f53756274797065202f4c696e6b0a2f52656
    374205b20.*?205d0a2f426f7264657220.*?\n0a2f41203c3c0a2f54797065202f416374696f6e
    0a2f53202f5552490a2f5552492028687474703a2f2f7777772e69742d65626f6f6b732e696e666
    f2f290a3e3e'''.replace(b'\n', b'').strip()

    def remove_evil_links(pdf_data):
    """ Removes all it-ebook's links and metadata from the passed PDF data. """
    'Removes all it-ebook links and metadata from the passed PDF data.'
    pdf_data = binascii.hexlify(pdf_data)
    # Remove each annotation element inside the PDF file (This removes the
    # "clickable" it-ebooks.info links)
    new_data = re.sub(pattern, b"", pdf_data)
    # Remove the actual links (link elements which are assigned to the annotations)
    new_data = new_data.replace(binascii.hexlify(b"www.it-ebooks.info"), b"")

    # Remove each annotation element inside the PDF file
    # (This removes the "clickable" it-ebooks.info links)
    new_data = re.sub(pattern, b'', pdf_data)

    # Remove the actual links
    # (link elements which are assigned to the annotations)
    new_data = new_data.replace(binascii.hexlify(b'www.it-ebooks.info'), b'')
    return binascii.unhexlify(new_data)

    def main(args):
    @@ -57,41 +57,48 @@ def main(args):
    if not file_path:
    continue
    if args.verbose:
    print("Processing: {0}".format(file_path))
    print('Processing: {0}'.format(file_path))
    try:
    with open(file_path, "rb") as input_file:
    with open(file_path, 'rb') as input_file:
    pdf_data = input_file.read()
    except IOError as e:
    stderr.write("{0}: {1}\n".format(file_path, e.strerror))
    stderr.flush()
    sys.stderr.write('{0}: {1}\n'.format(file_path, e.strerror))
    sys.stderr.flush()
    continue

    # Backup the file with a different name
    if not args.no_backup:
    if args.verbose:
    print("Creating backup: {0}.OLD".format(file_path))
    shutil.move(file_path, "{0}.OLD".format(file_path))
    print('Creating backup: {0}.old'.format(file_path))
    shutil.move(file_path, '{0}.old'.format(file_path))

    # Modify the PDF file
    new_pdf_data = remove_evil_links(pdf_data)
    # Save the new file
    with open(file_path, "wb") as out_file:
    with open(file_path, 'wb') as out_file:
    out_file.write(new_pdf_data)
    if args.verbose:
    print("Saving modified file: {0}".format(file_path))
    print('Saving modified file: {0}'.format(file_path))
    except KeyboardInterrupt:
    pass

    if __name__ == "__main__":
    if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("-f", "--files",
    help="One or more PDF files to remove it-ebook's watermarks.",
    nargs="*", required=True)
    parser.add_argument("--no-backup",
    help="Disables the creating of backups for the files which"+\
    " are being processed. ",
    action="store_true")
    parser.add_argument("-v", "--verbose", action="store_true")
    parser.add_argument(
    '-f', '--files',
    help='One or more PDF files to remove it-ebook watermarks.',
    nargs='*', required=True
    )
    parser.add_argument(
    '-n', '--no-backup',
    help='Disables the creating of backups for the files ' +
    'which are being processed.',
    action='store_true'
    )
    parser.add_argument(
    '-v', '--verbose',
    action='store_true'
    )

    args = parser.parse_args()
    main(args)
  2. @Arnie97 Arnie97 revised this gist Aug 2, 2015. 1 changed file with 8 additions and 7 deletions.
    15 changes: 8 additions & 7 deletions remove_watermark.py
    Original file line number Diff line number Diff line change
    @@ -1,8 +1,9 @@
    #!/usr/bin/env python
    #!/usr/bin/env python3

    import re
    import shutil
    import argparse
    import binascii
    from os import path
    from sys import stderr

    @@ -34,20 +35,20 @@
    # endobj
    #

    pattern = """0a2f54797065202f416e6e6f740a2f53756274797065202f4c696e6b0a2f526563
    pattern = b"""0a2f54797065202f416e6e6f740a2f53756274797065202f4c696e6b0a2f526563
    74205b20.*?205d0a2f426f7264657220.*?\n0a2f41203c3c0a2f54797065202f416374696f6e0
    a2f53202f5552490a2f5552492028687474703a2f2f7777772e69742d65626f6f6b732e696e666f
    2f290a3e3e""".replace("\n", "").strip()
    2f290a3e3e""".replace(b"\n", b"").strip()

    def remove_evil_links(pdf_data):
    """ Removes all it-ebook's links and metadata from the passed PDF data. """
    pdf_data = pdf_data.encode("hex")
    pdf_data = binascii.hexlify(pdf_data)
    # Remove each annotation element inside the PDF file (This removes the
    # "clickable" it-ebooks.info links)
    new_data = re.sub(pattern, "", pdf_data)
    new_data = re.sub(pattern, b"", pdf_data)
    # Remove the actual links (link elements which are assigned to the annotations)
    new_data = new_data.replace("www.it-ebooks.info".encode("hex"), "")
    return new_data.decode("hex")
    new_data = new_data.replace(binascii.hexlify(b"www.it-ebooks.info"), b"")
    return binascii.unhexlify(new_data)

    def main(args):
    try:
  3. @Arnie97 Arnie97 revised this gist Aug 2, 2015. 1 changed file with 3 additions and 1 deletion.
    4 changes: 3 additions & 1 deletion remove_watermark.py
    100644 → 100755
    Original file line number Diff line number Diff line change
    @@ -1,3 +1,5 @@
    #!/usr/bin/env python

    import re
    import shutil
    import argparse
    @@ -91,4 +93,4 @@ def main(args):
    parser.add_argument("-v", "--verbose", action="store_true")

    args = parser.parse_args()
    main(args)
    main(args)
  4. @Arnie97 Arnie97 renamed this gist Aug 2, 2015. 1 changed file with 0 additions and 0 deletions.
    File renamed without changes.
  5. @Daapii Daapii created this gist Apr 2, 2014.
    94 changes: 94 additions & 0 deletions uNoWatermark.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,94 @@
    import re
    import shutil
    import argparse
    from os import path
    from sys import stderr

    #
    # Author: Daxda
    # Date: 02.04.2014
    # WTF: This is a quick tool I've hacked together to easily remove the meta
    # information as well as the annoying link on each page of eBooks down-
    # loaded from it-ebooks.info. The modified file will hold the original
    # file name and the original file will be renamed to 'original.pdf.OLD'
    #
    #

    # 'pattern' is the regex pattern which is used to remove the annotation elements,
    # the rough structure of it looks like this:
    #
    # obj
    # <<
    # /Type /Annot
    # /Subtype /Link
    # /Rect [ 264 91 348 79 ] # The digits on this line will differ
    # /Border [ 0 0 0 ] # The same goes for the digits on this line
    # /A <<
    # /Type /Action
    # /S /URI
    # /URI (http://www.it-ebooks.info/)
    # >>
    # >>
    # endobj
    #

    # Legacy hex-text form of the annotation regex: literal hex runs with two
    # ".*?" gaps (the /Rect and /Border values). remove_evil_links() below no
    # longer uses it; the name is kept so that anything importing `pattern`
    # keeps working.
    pattern = (
        "0a2f54797065202f416e6e6f740a2f53756274797065202f4c696e6b0a2f526563"
        "74205b20.*?205d0a2f426f7264657220.*?"
        "0a2f41203c3c0a2f54797065202f416374696f6e0"
        "a2f53202f5552490a2f5552492028687474703a2f2f7777772e69742d65626f6f6b732e696e666f"
        "2f290a3e3e"
    )

    # The same annotation body, matched directly on the raw PDF bytes.
    # Working on bytes instead of a hex dump means a match can never start in
    # the middle of a byte, and "[^\n]*?" keeps each gap on its own line so a
    # match cannot begin at some other link annotation and run on until the
    # next it-ebooks URI.
    _LINK_ANNOTATION_RE = re.compile(
        br"\n/Type /Annot"
        br"\n/Subtype /Link"
        br"\n/Rect \[ [^\n]*? \]"
        br"\n/Border [^\n]*?"
        br"\n/A <<"
        br"\n/Type /Action"
        br"\n/S /URI"
        br"\n/URI \(http://www\.it-ebooks\.info/\)"
        br"\n>>"
    )

    # Host name that is removed wherever it still occurs after the annotations
    # themselves are gone.
    _WATERMARK_HOST = b"www.it-ebooks.info"


    def remove_evil_links(pdf_data):
        """Return ``pdf_data`` without the it-ebooks.info links.

        Two passes are made over the data:

        1. every link annotation body whose URI is
           ``http://www.it-ebooks.info/`` is deleted;
        2. every remaining occurrence of ``www.it-ebooks.info`` is deleted.

        Args:
            pdf_data: The raw content of the PDF as a byte string (``bytes``
                on Python 3, ``str`` on Python 2).

        Returns:
            A new byte string; the input is not modified.

        Raises:
            TypeError: On Python 3, if ``pdf_data`` is text rather than bytes.
        """
        without_annotations = _LINK_ANNOTATION_RE.sub(b"", pdf_data)
        return without_annotations.replace(_WATERMARK_HOST, b"")

    def main(args):
        """Clean every PDF named in ``args.files`` in place.

        ``args`` is the parsed command line. This function reads its ``files``,
        ``verbose`` and ``no_backup`` attributes and overwrites ``files`` with
        a de-duplicated list.

        For each non-empty path the file is read, optionally renamed to
        ``<path>.OLD`` as a backup, passed through remove_evil_links() and
        written back under its original name. A file that cannot be read is
        reported on stderr and skipped. A keyboard interrupt ends the run
        without a traceback.
        """
        try:
            # Going through a set drops repeated paths (and the given order).
            args.files = list(set(args.files))

            for file_path in args.files:
                if not file_path:
                    # Nothing to do for an empty argument.
                    continue

                if args.verbose:
                    print("Processing: {0}".format(file_path))

                # Load the whole document; report and skip it when unreadable.
                try:
                    with open(file_path, "rb") as source:
                        original_bytes = source.read()
                except IOError as read_error:
                    stderr.write(
                        "{0}: {1}\n".format(file_path, read_error.strerror)
                    )
                    stderr.flush()
                    continue

                # Keep the untouched file next to the cleaned one unless the
                # user opted out.
                keep_backup = not args.no_backup
                if keep_backup:
                    if args.verbose:
                        print("Creating backup: {0}.OLD".format(file_path))
                    shutil.move(file_path, "{0}.OLD".format(file_path))

                # Strip the watermark and store the result under the old name.
                cleaned_bytes = remove_evil_links(original_bytes)
                with open(file_path, "wb") as target:
                    target.write(cleaned_bytes)

                if args.verbose:
                    print("Saving modified file: {0}".format(file_path))
        except KeyboardInterrupt:
            # Ctrl-C: stop quietly.
            pass

    if __name__ == "__main__":
        # Command line: -f/--files is mandatory and takes any number of paths;
        # --no-backup and -v/--verbose are plain on/off switches.
        parser = argparse.ArgumentParser()
        parser.add_argument(
            "-f",
            "--files",
            nargs="*",
            required=True,
            help="One or more PDF files to remove it-ebook's watermarks.",
        )
        parser.add_argument(
            "--no-backup",
            action="store_true",
            help=(
                "Disables the creating of backups for the files which"
                " are being processed. "
            ),
        )
        parser.add_argument("-v", "--verbose", action="store_true")

        args = parser.parse_args()
        main(args)