-
-
Save Rourke101/cc98a0dc37780710dc893de6c5c67858 to your computer and use it in GitHub Desktop.
Remove metadata from a PDF file, using either pdftk (PHP script) or exiftool and qpdf (shell script). Note that embedded objects may still contain metadata.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
// Blank out the Info metadata values of a PDF in place, using pdftk.
//
// Steps: dump the current metadata with `pdftk dump_data`, replace every
// "InfoValue:" line with an empty value, feed the result back through
// `pdftk update_info` into a temporary PDF, then move that PDF over the
// original. An Exception is thrown as soon as any step fails; temporary
// files are removed whether the run succeeds or not.

$file = 'example.pdf';

// get the current metadata
$command = sprintf('pdftk %s dump_data', escapeshellarg($file));
$output = [];
$return = null;
exec($command, $output, $return);
//print_r($output);

// Compare against 0 explicitly: if exec() could not run the command at all,
// $return stays null and must not be mistaken for success.
if ($return !== 0) {
    throw new Exception('There was an error reading metadata from the PDF file');
}

// set any metadata values to empty; the InfoKey lines are left untouched
foreach ($output as $index => $line) {
    if (strpos($line, 'InfoValue:') === 0) {
        $output[$index] = 'InfoValue:';
    }
}

// reserve the two temporary files (tempnam() returns false on failure)
$metadataFile = tempnam(sys_get_temp_dir(), 'pdf-meta-');
if ($metadataFile === false) {
    throw new Exception('Unable to create a temporary metadata file');
}

$tmpFile = tempnam(sys_get_temp_dir(), 'pdf-tmp-');
if ($tmpFile === false) {
    unlink($metadataFile);
    throw new Exception('Unable to create a temporary PDF file');
}

try {
    // write the updated metadata to a file
    if (file_put_contents($metadataFile, implode("\n", $output)) === false) {
        throw new Exception('Unable to write the temporary metadata file');
    }

    // create a new PDF using the updated metadata
    $command = sprintf(
        'pdftk %s update_info %s output %s',
        escapeshellarg($file),
        escapeshellarg($metadataFile),
        escapeshellarg($tmpFile)
    );
    $output = [];
    $return = null;
    exec($command, $output, $return);

    if ($return !== 0) {
        throw new Exception('There was an error writing metadata to the PDF file');
    }

    // replace the original PDF with the rewritten one
    if (!rename($tmpFile, $file)) {
        throw new Exception('Unable to replace the original PDF file');
    }
} finally {
    // clean up the temporary files; after a successful rename() the
    // temporary PDF no longer exists, so only leftovers are removed here
    clearstatcache();
    foreach ([$metadataFile, $tmpFile] as $path) {
        if (is_file($path)) {
            unlink($path);
        }
    }
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Remove metadata from a PDF in place: exiftool deletes the tags, then qpdf
# rewrites the file so the data exiftool left behind is dropped.
# Note that embedded objects may still contain metadata.

FILE=example.pdf

# read tags from the original PDF
#exiftool -all:all "$FILE"

# remove tags (XMP + metadata) from the PDF
# NOTE: by default exiftool keeps the untouched input as "${FILE}_original";
# delete that backup (or pass -overwrite_original) if it must not survive.
exiftool -all:all= "$FILE" || exit $?

# linearize the file to remove orphan data
# qpdf needs an explicit output target; --replace-input rewrites "$FILE"
# itself (requires a qpdf release that supports this option).
qpdf --linearize --replace-input "$FILE" || exit $?

# read XMP from the modified PDF
#exiftool -all:all "$FILE"

# read all strings from the modified PDF
#strings "$FILE" > "$FILE.txt"

# read XMP from embedded objects in the modified PDF
#exiftool -extractEmbedded -all:all "$FILE"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment