Rourke101 · February 2, 2018 19:45 · Jul 25, 2013 · Jul 25, 2013 · Jul 25, 2013 · Jul 25, 2013
diff --git a/README.md b/README.md
@@ -12,7 +12,9 @@ Metadata in PDF files can be stored in at least two places:
 A PDF file contains a) objects and b) pointers to those objects.
 
 When information is added to a PDF file, it is appended to the end of the file and a pointer is added.
+
 When information is removed from a PDF file, the pointer is removed, but the actual data may not be removed.
+
 To remove previously-deleted data, the PDF file must be rebuilt.
 
 ## pdftk

diff --git a/README.md b/README.md
@@ -19,7 +19,7 @@ To remove previously-deleted data, the PDF file must be rebuilt.
 
 [pdftk][pdftk] can be used to update the Info Dictionary of a PDF file. See `pdftk-unset-info-dictionary-values.php` below for an example. As noted in [the pdftk documentation](http://www.pdflabs.com/docs/pdftk-man-page/), though, pdftk does not alter XMP metadata.
 
-[pdftk] http://www.pdflabs.com/tools/pdftk-the-pdf-toolkit/
+[pdftk]: http://www.pdflabs.com/tools/pdftk-the-pdf-toolkit/
 
 ## exiftool
 
@@ -30,15 +30,15 @@ To remove previously-deleted data, the PDF file must be rebuilt.
 
 `exiftool -all:all=` also removes the pointer to the Info Dictionary, but does not completely remove the dictionary data itself.
 
-[exiftool] http://www.sno.phy.queensu.ca/~phil/exiftool/
+[exiftool]: http://www.sno.phy.queensu.ca/~phil/exiftool/
 
 ## qpdf
 
 [qpdf][qpdf] can be used to linearize PDF files (`qpdf --linearize $FILE`), which optimises them for fast web loading and removes any orphan data.
 
+[qpdf]: http://qpdf.sourceforge.net/
+
 ## Embedded objects
 
 After running qpdf, there may be new XMP metadata, as it extracts metadata from any embedded objects. To read the XMP tags of 
 embedded objects, use `exiftool -extractEmbedded -all:all $FILE`.
-
-[qpdf] http://qpdf.sourceforge.net/
diff --git a/README.md b/README.md
@@ -1,2 +1,44 @@
 # Anonymising PDFs
 
+## PDF metadata
+
+Metadata in PDF files can be stored in at least two places:
+
+  * the Info Dictionary, a limited set of key/value pairs
+  * XMP packets, which contain RDF statements expressed as XML
+
+## PDF files
+
+A PDF file contains a) objects and b) pointers to those objects.
+
+When information is added to a PDF file, it is appended to the end of the file and a pointer is added.
+When information is removed from a PDF file, the pointer is removed, but the actual data may not be removed.
+To remove previously-deleted data, the PDF file must be rebuilt.
+
+## pdftk
+
+[pdftk][pdftk] can be used to update the Info Dictionary of a PDF file. See `pdftk-unset-info-dictionary-values.php` below for an example. As noted in [the pdftk documentation](http://www.pdflabs.com/docs/pdftk-man-page/), though, pdftk does not alter XMP metadata.
+
+[pdftk] http://www.pdflabs.com/tools/pdftk-the-pdf-toolkit/
+
+## exiftool
+
+[exiftool][exiftool] can be used to read/write XMP metadata from/to PDF files.
+
+  * `exiftool -all:all` => read all the tags. 
+  * `exiftool -all:all=` => remove all the tags.
+
+`exiftool -all:all=` also removes the pointer to the Info Dictionary, but does not completely remove the dictionary data itself.
+
+[exiftool] http://www.sno.phy.queensu.ca/~phil/exiftool/
+
+## qpdf
+
+[qpdf][qpdf] can be used to linearize PDF files (`qpdf --linearize $FILE`), which optimises them for fast web loading and removes any orphan data.
+
+## Embedded objects
+
+After running qpdf, there may be new XMP metadata, as it extracts metadata from any embedded objects. To read the XMP tags of 
+embedded objects, use `exiftool -extractEmbedded -all:all $FILE`.
+
+[qpdf] http://qpdf.sourceforge.net/
diff --git a/pdftk-remove-info-dictionary.php → pdftk-unset-info-dictionary-values.php b/pdftk-remove-info-dictionary.php → pdftk-unset-info-dictionary-values.php
diff --git a/remove-pdf-metadata.sh b/remove-pdf-metadata.sh
@@ -18,4 +18,4 @@ qpdf --linearize $FILE
 #cat $FILE | strings > $FILE.txt
 
 # read XMP from embedded objects in the modified PDF
-#exiftool -extractEmbedded-all:all $FILE
+#exiftool -extractEmbedded -all:all $FILE
diff --git a/README.md b/README.md
@@ -0,0 +1,2 @@
+# Anonymising PDFs
+
diff --git a/pdftk-remove-info-dictionary.php b/pdftk-remove-info-dictionary.php
@@ -0,0 +1,38 @@
+<?php
+// Example: blank every Info Dictionary value of a PDF, in place, using pdftk.
+$file = 'example.pdf';
+
+// dump the PDF's current metadata (pdftk dump_data); exec() collects it line by line in $output
+$command = sprintf('pdftk %s dump_data', escapeshellarg($file));
+$output = array(); $return = null; exec($command, $output, $return);
+
+//print_r($output);
+
+if ($return) { // non-zero exit status from pdftk
+    throw new Exception('There was an error reading metadata from the PDF file');
+}
+
+// blank every value: each line that starts with "InfoValue:" becomes a bare "InfoValue:" (keys are left untouched)
+foreach ($output as $index => $line) {
+    if (strpos($line, 'InfoValue:') === 0) {
+        $output[$index] = 'InfoValue:';
+    }
+}
+
+// write the edited metadata lines, newline-joined, to a temporary file
+$metadataFile = tempnam(sys_get_temp_dir(), 'pdf-meta-');
+file_put_contents($metadataFile, implode("\n", $output));
+
+// have pdftk apply the edited metadata (update_info) and write the result to a second temporary file
+$tmpFile = tempnam(sys_get_temp_dir(), 'pdf-tmp-');
+$command = sprintf('pdftk %s update_info %s output %s',
+    escapeshellarg($file), escapeshellarg($metadataFile), escapeshellarg($tmpFile));
+$output = array(); $return = null; exec($command, $output, $return);
+
+if ($return) { // non-zero exit status from pdftk; NOTE(review): both temporary files are left behind on this path
+    throw new Exception('There was an error writing metadata to the PDF file');
+}
+
+// replace the original PDF with the rewritten one, then delete the temporary metadata file
+rename($tmpFile, $file);
+unlink($metadataFile);
diff --git a/remove-pdf-metadata.sh b/remove-pdf-metadata.sh
@@ -16,3 +16,6 @@ qpdf --linearize $FILE
 
 # read all strings from the modified PDF
 #cat $FILE | strings > $FILE.txt
+
+# read XMP from embedded objects in the modified PDF
+#exiftool -extractEmbedded-all:all $FILE
diff --git a/remove-pdf-metadata.sh b/remove-pdf-metadata.sh
@@ -15,4 +15,4 @@ qpdf --linearize $FILE
 #exiftool -all:all $FILE
 
 # read all strings from the modified PDF
-#cat example.pdf | strings > $FILE.txt
+#cat $FILE | strings > $FILE.txt
diff --git a/remove-pdf-metadata.sh b/remove-pdf-metadata.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+FILE=example.pdf
+
+# read tags from the original PDF
+#exiftool -all:all $FILE
+
+# remove tags (XMP + metadata) from the PDF
+exiftool -all:all= $FILE
+
+# linearize the file to remove orphan data
+qpdf --linearize $FILE
+
+# read XMP from the modified PDF
+#exiftool -all:all $FILE
+
+# read all strings from the modified PDF
+#cat example.pdf | strings > $FILE.txt
diff --git a/remove-xmp-metadata.sh b/remove-xmp-metadata.sh
@@ -1,19 +0,0 @@
-#!/bin/bash
-
-FILE=example.pdf
-
-# read XMP from the original PDF
-#rm $FILE.before.xmp 
-#exiftool -tagsfromfile $FILE $FILE.before.xmp
-#cat $FILE.before.xmp 
-
-# remove XMP from the PDF
-exiftool -all:all= $FILE
-
-# read XMP from the modified PDF
-#rm $FILE.after.xmp
-#exiftool -tagsfromfile $FILE $FILE.after.xmp
-#cat $FILE.after.xmp
-
-# read all strings from the modified PDF
-#cat example.pdf | strings > $FILE.txt

diff --git a/remove-xmp-metadata.sh b/remove-xmp-metadata.sh
@@ -4,13 +4,16 @@ FILE=example.pdf
 
 # read XMP from the original PDF
 #rm $FILE.before.xmp 
-#exiftool -tagsfromfile $FILE $FILE.before.xmp; 
+#exiftool -tagsfromfile $FILE $FILE.before.xmp
 #cat $FILE.before.xmp 
 
 # remove XMP from the PDF
 exiftool -all:all= $FILE
 
 # read XMP from the modified PDF
 #rm $FILE.after.xmp
-#exiftool -tagsfromfile $FILE $FILE.after.xmp; 
+#exiftool -tagsfromfile $FILE $FILE.after.xmp
 #cat $FILE.after.xmp
+
+# read all strings from the modified PDF
+#cat example.pdf | strings > $FILE.txt
diff --git a/remove-xmp-metadata.sh b/remove-xmp-metadata.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+
+FILE=example.pdf
+
+# read XMP from the original PDF
+#rm $FILE.before.xmp 
+#exiftool -tagsfromfile $FILE $FILE.before.xmp; 
+#cat $FILE.before.xmp 
+
+# remove XMP from the PDF
+exiftool -all:all= $FILE
+
+# read XMP from the modified PDF
+#rm $FILE.after.xmp
+#exiftool -tagsfromfile $FILE $FILE.after.xmp; 
+#cat $FILE.after.xmp
No results found