pdfcrop generates larger file?
Here is my version of an improved pdfcrop.
Default operation is to remove white margins from the PDF input, optionally leaving a user-defined extra margin (option -m ...).
Alternative operation is to trim the page edges by user-defined amounts (option -t ...).
pdfcrop.sh uses gs (Ghostscript) for page-wise determination of the tightly enclosing bounding box, pdftk for uncompressing/compressing the PDF files and getting the order of pages (which doesn't need to be linear), and perl for replacing the original page dimensions with the tight bounding boxes found.
Unlike the original pdfcrop, the bash script below preserves the original interactive parts of the PDF (links, annotations etc.). The output file size is about the same as before.
Update: Option -two added for two-sided page layout.
Usage examples:
#getting help
pdfcrop.sh -help
#default operation
pdfcrop.sh orig.pdf cropped.pdf
pdfcrop.sh -m 10 orig.pdf cropped.pdf
pdfcrop.sh -hires orig.pdf cropped.pdf
#trimming pages
pdfcrop.sh -t "10 20 30 40" orig.pdf trimmed.pdf
#same for two-sided layout
pdfcrop.sh -t "10 20 30 40" -two orig.pdf trimmed.pdf
Content of pdfcrop.sh
:
#!/bin/bash
function usage () {
  # Print the command-line help text to stdout; callers redirect it to
  # stderr where needed. A single unquoted here-document replaces the chain
  # of echo calls: $(...) is still expanded, double quotes are literal.
  cat <<EOF
Usage: $(basename $0) [Options] <input.pdf> [<output.pdf>]

 * Removes white margins from every page in the file. (Default operation)
 * Trims page edges by given amounts. (Alternative operation)

If only <input.pdf> is given, it is overwritten with the cropped output.

Options:

 -m "<left> [<bottom> [<right> <top>]]"
 adds extra margins in default operation mode. Unit is bp. A single number
 is used for all margins, two numbers "<left> <bottom>" are applied to the
 right and top margins alike.

 -t "<left> [<bottom> [<right> <top>]]"
 trims outer page edges by the given amounts. Unit is bp. A single number
 is used for all trims, two numbers "<left> <bottom>" are applied to the
 right and top trims alike.

 -two
 to be used for documents with two-sided page layout; the meaning of <left>
 and <right> changes to <inner> and <outer> for options -m and -t

 -hires
 %%HiResBoundingBox is used in default operation mode.

 -help
 prints this message.
EOF
}
# Operation mode: 0 = crop to the detected bounding box (default),
#                 1 = trim the page edges by fixed amounts (-t).
c=0
# Extra margins (-m) and trims (-t) as (left bottom right top), in bp.
mar=(0 0 0 0); tri=(0 0 0 0)
# Which Ghostscript bbox comment is used: BoundingBox or HiResBoundingBox.
bbtype=BoundingBox
# 1 when -two (two-sided layout) was given.
two=0
# getopts only knows single-letter options, so the long-looking switches are
# parsed as a letter plus its argument:
#   -two => -t "wo",  -hires => -h "ires",  -help => -h "elp"
while getopts m:t:h: opt
do
  case $opt
  in
    m)
      # Split the argument on whitespace with read instead of eval, so the
      # option value is never executed as shell code.
      read -r -a mar <<< "$OPTARG"
      # One number: use it for all sides. Two numbers: reuse left/bottom
      # for right/top.
      [[ -z "${mar[1]}" ]] && mar[1]=${mar[0]}
      [[ -z "${mar[2]}" || -z "${mar[3]}" ]] && mar[2]=${mar[0]} && mar[3]=${mar[1]}
      c=0
      ;;
    t)
      if [[ "$OPTARG" == "wo" ]]
      then
        two=1
      else
        # Same whitespace split as for -m, again without eval.
        read -r -a tri <<< "$OPTARG"
        [[ -z "${tri[1]}" ]] && tri[1]=${tri[0]}
        [[ -z "${tri[2]}" || -z "${tri[3]}" ]] && tri[2]=${tri[0]} && tri[3]=${tri[1]}
        c=1
      fi
      ;;
    h)
      if [[ "$OPTARG" == "ires" ]]
      then
        bbtype=HiResBoundingBox
      else
        # Anything else after -h (including -help) prints the help text.
        usage 1>&2; exit 0
      fi
      ;;
    \?)
      usage 1>&2; exit 1
      ;;
  esac
done
shift $((OPTIND-1))
[[ -z "$1" ]] && echo "`basename $0`: missing filename" 1>&2 && usage 1>&2 && exit 1
# Without a second filename the input file is overwritten.
input=$1;output=$1;shift;
[[ -n "$1" ]] && output=$1 && shift;
(
  # In crop mode, emit one "%%<bbtype>: ..." line per page first ...
  [[ "$c" -eq 0 ]] && gs -dNOPAUSE -q -dBATCH -sDEVICE=bbox "$input" 2>&1 | grep "%%$bbtype"
  # ... followed by the uncompressed PDF, so perl sees the boxes before the pages.
  pdftk "$input" output - uncompress
) | perl -w -n -s -e '
  # -s turns the switches after "--" into $mar, $tri, $c and $two.
  # @m / @t: margins and trims as (left bottom right top).
  # @mb:     MediaBox of the page object currently being read (empty if not seen yet).
  # $p:      zero-based index of that page, -1 while not seen yet.
  BEGIN {@m=split /\s+/, $mar; @t=split /\s+/, $tri; @mb=(); $p=-1;}
  # Print the replacement /MediaBox line for page $p, using @mb and, in crop
  # mode, the bounding box collected for that page.
  sub fixMB {
    if($c){
      # Trim mode: shrink the original MediaBox by the trim amounts.
      if($two && $p%2) {
        # Odd $p with -two: left and right amounts are swapped.
        $mb[0]+=$t[2];$mb[1]+=$t[1];$mb[2]-=$t[0];$mb[3]-=$t[3];
      }
      else {
        $mb[0]+=$t[0];$mb[1]+=$t[1];$mb[2]-=$t[2];$mb[3]-=$t[3];
      }
      print "/MediaBox [", join(" ", @mb), "]\n";
    } else {
      # Crop mode: shift the bounding box by the MediaBox origin, then
      # enlarge it by the requested margins.
      @bb=split /\s+/, $bbox[$p];
      if($two && $p%2) {
        $bb[0]+=$mb[0];$bb[1]+=$mb[1];$bb[2]+=$mb[0];$bb[3]+=$mb[1];
        $bb[0]-=$m[2];$bb[1]-=$m[1];$bb[2]+=$m[0];$bb[3]+=$m[3];
      }
      else {
        $bb[0]+=$mb[0];$bb[1]+=$mb[1];$bb[2]+=$mb[0];$bb[3]+=$mb[1];
        $bb[0]-=$m[0];$bb[1]-=$m[1];$bb[2]+=$m[2];$bb[3]+=$m[3];
      }
      print "/MediaBox [", join(" ", @bb), "]\n";
    }
  }
  # Bounding box lines are stored in input order and not passed through.
  if (/BoundingBox:\s+([\d\.\s]+\d)/) { push @bbox, $1; next;}
  # /MediaBox and pdftk_PageNum can arrive in either order; whichever comes
  # second triggers fixMB, and the original line is dropped in both cases.
  elsif (/\/MediaBox\s+\[([\d\.\s]+\d)\]/) {
    @mb=split /\s+/, $1; next if($p<0);
    fixMB; @mb=(); $p=-1; next;
  }
  elsif (/pdftk_PageNum\s+(\d+)/) {
    $p=$1-1; next unless(@mb);
    fixMB; @mb=(); $p=-1; next;
  }
  # Every other line is copied unchanged.
  print;
' -- -mar="${mar[*]}" -tri="${tri[*]}" -c="$c" -two="$two" | pdftk - output "$output" compress
I use a python script found here: http://www.mobileread.com/forums/showthread.php?t=25565 with following features:
- output has reasonable size as you requested
- supports absolute cropping (in case automatically computed bounding box is not useful when you have horizontal footer or header bars)
- it is extremely fast: 200 pages in less than a second!
Of course you need to have pyPdf installed in advance. Since the link may go dead, I paste the source code here:
#! /usr/bin/python
import getopt, sys
from pyPdf import PdfFileWriter, PdfFileReader
def usage(exit_code=0):
    """Print the command-line help text and terminate the script.

    The long-option example now uses ``--input``/``--output``; with a single
    dash getopt would read ``-input`` as ``-i nput``.

    Args:
        exit_code: Process exit status passed to ``sys.exit``. Defaults to 0,
            so a plain ``usage()`` call behaves as before.

    Raises:
        SystemExit: Always, with ``exit_code`` as its status.
    """
    print("""sjvr767's PDF Cropping Script.
Example:
my_pdf_crop.py -s -p 0.5 -i input.pdf -o output.pdf
my_pdf_crop.py --skip --percent 0.5 --input input.pdf --output output.pdf
\n
REQUIRED OPTIONS:
-p\t--percent
The factor by which to crop. Must be positive and less than or equal to 1.
-i\t--input
The path to the file to be cropped.
\n
OPTIONAL:
-s\t--skip
Skip the first page. Output file will not contain the first page of the input file.
-o\t--output
Specify the name and path of the output file. If none specified, the script appends '_cropped' to the file name.
-m\t--margin
Specify additional absolute cropping, for fine tuning results.
\t-m "left top right bottom"
""")
    sys.exit(exit_code)
def cut_length(dictionary, key, factor):
    """Return a quarter of the share of ``dictionary[key]`` removed by *factor*.

    Args:
        dictionary: Mapping that holds the length under *key*.
        key: Key of the length to base the cut on.
        factor: Fraction of the length to keep; ``1 - factor`` is removed.

    Returns:
        ``float(dictionary[key]) * (1 - factor) / 4`` as a float.
    """
    removed = float(dictionary[key]) * (1 - factor)
    return removed / 4
def new_coords(dictionary, key, cut, margin, code="tl"):
    """Return one coordinate of a page corner after cropping.

    Args:
        dictionary: Mapping with the corner's original coordinates.
        key: ``"x"`` selects the horizontal coordinate; any other key is
            treated as the vertical one.
        cut: Amount removed on every side.
        margin: Mapping with the extra per-side amounts under
            ``"l"``, ``"t"``, ``"r"`` and ``"b"``.
        code: Corner selector: ``"tl"``, ``"tr"`` or ``"bl"``; any other value
            is handled as the bottom-right corner.

    Returns:
        The absolute value of the shifted coordinate, as a float.
    """
    # (side used for x, side used for y) for each named corner.
    sides_by_corner = {"tl": ("l", "t"), "tr": ("r", "t"), "bl": ("l", "b")}
    x_side, y_side = sides_by_corner.get(code, ("r", "b"))
    side = x_side if key == "x" else y_side

    start = float(dictionary[key])
    shift = cut + margin[side]
    # Left and bottom edges move in the positive direction, right and top
    # edges in the negative one.
    if side in ("l", "b"):
        return abs(start + shift)
    return abs(start - shift)
def _fail(message, show_usage=False):
    """Print *message*, optionally the usage text, and exit with status 2."""
    print(message)
    if show_usage:
        try:
            usage()
        except SystemExit:
            # usage() ends the process itself; a usage *error* must still
            # exit non-zero, so its exit is intercepted here.
            pass
    sys.exit(2)


def _parse_margin(text):
    """Parse the ``-m`` value ``"left top right bottom"`` into a margin dict."""
    values = text.strip("\"").split()
    if len(values) != 4:
        _fail("Margin must be given as \"left top right bottom\".")
    try:
        left, top, right, bottom = [float(value) for value in values]
    except ValueError:
        _fail("Margin values must be numbers.")
    return {"l": left, "t": top, "r": right, "b": bottom}


def _parse_options(argv):
    """Parse the command line (without the program name).

    Returns:
        Tuple ``(skipone, factor, input_file, output_file, margin)``.

    Raises:
        SystemExit: With status 2 on any invalid or missing option.
    """
    try:
        opts, _ = getopt.getopt(
            argv, "sp:i:o:m:",
            ["skip", "percent=", "input=", "output=", "margin="])
    except getopt.GetoptError as err:
        # str(err) reads like "option -a not recognized".
        _fail(str(err), show_usage=True)

    skipone = 0
    factor = 0.8  # default scaling factor
    input_file = None  # no default input file
    output_file = None  # derived from input_file when not given
    margin = {"l": 0, "t": 0, "r": 0, "b": 0}

    # When an option is repeated, the last occurrence wins.
    for name, value in opts:
        if name in ("-s", "--skip"):
            skipone = 1
        elif name in ("-p", "--percent"):
            # The long form registered with getopt is --percent.
            try:
                factor = float(value)
            except ValueError:
                _fail("Factor must be a number.")
        elif name in ("-i", "--input"):
            if not value.endswith(".pdf"):
                _fail("Input file must be a PDF.")
            input_file = value
        elif name in ("-o", "--output"):
            if value.endswith(".pdf"):
                output_file = value
            else:
                # Not fatal: the previous/default output name stays in use.
                print("Output file must be a PDF.")
        elif name in ("-m", "--margin"):
            margin = _parse_margin(value)

    if input_file is None:
        _fail("Please specify an input file.")
    if output_file is None:
        output_file = "%s_cropped.pdf" % input_file[:-4]  # default output
    return skipone, factor, input_file, output_file, margin


def main(argv=None):
    """Crop every page of the input PDF and write the result.

    Args:
        argv: Command-line arguments without the program name; defaults to
            ``sys.argv[1:]``.
    """
    if argv is None:
        argv = sys.argv[1:]
    skipone, factor, input_file, output_file, margin = _parse_options(argv)

    with open(input_file, "rb") as inputstream:
        input1 = PdfFileReader(inputstream)
        output = PdfFileWriter()
        pages = input1.getNumPages()
        if pages == 0:
            _fail("Input file has no pages.")

        # The second page supplies the reference dimensions, as before;
        # a single-page document falls back to its only page.
        box = input1.getPage(1 if pages > 1 else 0).mediaBox
        top_right = {'x': box.getUpperRight_x(), 'y': box.getUpperRight_y()}
        top_left = {'x': box.getUpperLeft_x(), 'y': box.getUpperLeft_y()}
        bottom_right = {'x': box.getLowerRight_x(), 'y': box.getLowerRight_y()}
        bottom_left = {'x': box.getLowerLeft_x(), 'y': box.getLowerLeft_y()}
        print('Page dim.\t%f by %f' % (top_right['x'], top_right['y']))

        # One cut, derived from the reference x extent, is applied to all sides.
        cut = cut_length(top_right, 'x', factor)
        new_tr = (new_coords(top_right, 'x', cut, margin, code="tr"),
                  new_coords(top_right, 'y', cut, margin, code="tr"))
        new_br = (new_coords(bottom_right, 'x', cut, margin, code="br"),
                  new_coords(bottom_right, 'y', cut, margin, code="br"))
        new_tl = (new_coords(top_left, 'x', cut, margin, code="tl"),
                  new_coords(top_left, 'y', cut, margin, code="tl"))
        new_bl = (new_coords(bottom_left, 'x', cut, margin, code="bl"),
                  new_coords(bottom_left, 'y', cut, margin, code="bl"))

        # skipone is 0 or 1, so it doubles as the index of the first page kept.
        for i in range(skipone, pages):
            page = input1.getPage(i)
            # Assignment order kept from the original script.
            # NOTE(review): corners presumably share coordinates in pyPdf's
            # rectangle object, so order may matter - confirm before changing.
            page.mediaBox.upperLeft = new_tl
            page.mediaBox.upperRight = new_tr
            page.mediaBox.lowerLeft = new_bl
            page.mediaBox.lowerRight = new_br
            output.addPage(page)

        # The output is opened only now, so a failure above no longer leaves
        # a truncated file behind; the input stays open while writing.
        with open(output_file, "wb") as outputstream:
            output.write(outputstream)


if __name__ == "__main__":
    main()
I really like the script by Alexander Grahn but I am missing a feature to still allow a small margin. I made a small modification to the script to allow this margin like the original PDF crop does.
Since I'm new to this part of Stack Exchange I can't comment, so I'll post the entire script here. Unfortunately I suck at bash, so I wasted some time trying to make the margin optional but gave up eventually. I kept the margin declaration outside the Perl script, so with a little bit more bash-fu it should be doable.
#!/bin/bash
# Usage: <script> <input.pdf> <output.pdf> [margin]
# Replaces the MediaBox of every page with the Ghostscript bounding box,
# enlarged on all four sides by a margin.
if [[ -z "$1" || -z "$2" ]]; then
  echo "Usage: $(basename "$0") <input.pdf> <output.pdf> [margin]" 1>&2
  exit 1
fi
# The margin is an optional third argument; 10 stays the default.
MARGIN=${3:-10}
if ! [[ "$MARGIN" =~ ^-?[0-9]+([.][0-9]+)?$ ]]; then
  echo "$(basename "$0"): margin must be a number" 1>&2
  exit 1
fi
(
  # One "%%BoundingBox: ..." line per page first, then the uncompressed PDF.
  gs -dNOPAUSE -q -dBATCH -sDEVICE=bbox "$1" 2>&1 | grep '%%BoundingBox'
  pdftk "$1" output - uncompress
) | perl -w -n -s -e '
  # $margin is set by the -margin switch (perl -s), not spliced into this code.
  if (/BoundingBox:\s+(\d+\s+\d+\s+\d+\s+\d+)/) {
    push @bbox, $1; next;
  }
  elsif (/pdftk_PageNum\s+(\d+)/) {
    # Split the sizes
    @sizes = split(/ /, $bbox[$1-1]);
    # Add or subtract the margin size: the first two values (lower-left
    # corner) move down/left, the last two (upper-right corner) move up/right
    $j = 0;
    foreach(@sizes) {
      if($j < 2) {
        $_ = $_ - $margin;
      } else {
        $_ = $_ + $margin;
      }
      $j++;
    }
    # Print the box; there is no next here, so the pdftk_PageNum line
    # itself is still printed below
    print "/MediaBox [" .join(" ", @sizes) . "]\n";
  }
  elsif (/MediaBox/) {
    # Drop the original MediaBox lines
    next;
  }
  print;
' -- -margin="$MARGIN" | pdftk - output "$2" compress