Reducir el tamaño de múltiples PDF desde un script de bash

A veces las editoriales suben PDF en alta resolución y es necesario reducir su tamaño para poder usarlos. Acá les dejo un script que me ayudó a automatizar esta tarea.

Bueno, este es mi código:

#!/bin/bash
# Compress every PDF found under the directories listed in
# /etc/hmhbaja/paths (one directory per line, without trailing slash).
# For each listed directory DIR:
#   1. DIR/ is mirrored into DIR_low/ (files already present are kept),
#   2. the PDFs in DIR_low/ are handed to /root/pdf2low.pl one by one,
#   3. DIR_low/.marca is touched so that the next run only picks up PDFs
#      whose status changed after this run.

lockfile=/tmp/hmhbaja.lock

# Make sure no other instance is running.  With noclobber the redirection
# fails when the file already exists, so checking for the lock and taking
# it happen in a single step.
if ( set -o noclobber; : > "$lockfile" ) 2>/dev/null; then
    # Release the lock when the script exits, including after an error, so
    # that an aborted run does not block every later run.
    trap 'rm -f "$lockfile"' EXIT

    # Paths are read line by line on descriptor 3: this keeps paths with
    # spaces intact and leaves stdin alone for the commands in the loop.
    while IFS= read -r i <&3 || [ -n "$i" ]; do
        [ -n "$i" ] || continue          # skip empty lines
        path1="$i/"
        path2="${i}_low/"
        rsync -r --ignore-existing "$path1" "$path2"
        if [ -f "$path2.marca" ]; then
            # Only PDFs changed since the previous run.
            find "$path2" -iname '*.pdf' -cnewer "$path2.marca" -execdir /root/pdf2low.pl {} \;
        else
            # First run for this directory: there is no marker to compare
            # against, so every PDF is processed.
            find "$path2" -iname '*.pdf' -execdir /root/pdf2low.pl {} \;
        fi
        touch "$path2.marca"
    done 3< /etc/hmhbaja/paths
fi

Tiene un archivo de configuración con los paths:

/var/ftp/virtual1/hmhreadingse/Grado2/TE_Reprints/07-Digitals
/var/ftp/virtual1/hmhreadingse/Grado3/TE_Reprints/07-Digitals

Y el archivo que hace la magia, o sea, el que reduce el tamaño del PDF. El script es de Wolfgang Dobler; para que funcione, obviamente, necesita pdftk (además de pdftops, gs y pdfopt, que el script también invoca).

#!/bin/sh
#  -*-Perl-*-
# ====================================================================== #
# Shell preamble: the file starts life as a /bin/sh script whose only job
# is to locate a perl binary and re-run this same file with it.
# Run the right perl version:
if [ -x /usr/local/bin/perl ]; then
  # Prefer a locally installed perl ...
  perl=/usr/local/bin/perl
elif [ -x /usr/bin/perl ]; then
  # ... then the system one ...
  perl=/usr/bin/perl
else
  # ... and finally whatever `which' reports; the sed strips the
  # "... aliased to " prefix that some shells' `which' prints.
  perl=`which perl| sed 's/.*aliased to *//'`
fi

# Replace this shell with perl, forwarding all command-line arguments.
# -x: perl skips everything up to the first `#!' line that mentions perl,
#     i.e. the shell code above is never parsed as Perl.
# -S: look the script name up via PATH.
exec $perl -x -S $0 "$@"     # -x: start from the following line
# ====================================================================== #
#! /Good_Path/perl -w
# line 17

# Name:   compress-newsletter
# Author: wd (Wolfgang.Dobler@ucalgary.ca)
# Date:   03-Oct-2005
# Description:
#   Use ghostscript's pdfwrite device (à la ps2pdf) to reduce the
#   Newsletter's PDF file size, and add meta information like author,
#   date, etc.
#   The preferred route is currently:
#                 [scribus>=1.2.3]
#                        |
#                    file.pdf
#                        |
#                 [pdftops>=3.00]
#                        |
#                     file.ps
#                        |
#            [pstopdf14 (gs-gnu-8.16 or higher)]
#                        |
#                        V
#                    final.pdf
# Usage:
#   compress-newsletter [-i col:gray:mono] Newsletter_big.pdf
# Options:
#   -i col:gray:mono
#   --imgres=col:gray:mono   Set resolution for downsampling color,
#                            grayscale and black-and-white images
#                            (default is 144:300:300)
#   --debug                  Be verbose and keep temporary files around
use strict;
use File::Temp qw/ :mktemp /;

use Getopt::Long;
# Allow for `-Plp' as equivalent to `-P lp' etc:
Getopt::Long::config("bundling");

my (%opts);			# Options hash for GetOptions
my $doll='\$';			# Need this to trick CVS

## Process command line
# Short and long spellings are stored under separate keys of %opts (e.g.
# `i' and `imgres'), which is why both are consulted further down.
GetOptions(\%opts,
	   qw( -h   --help
	       -i=s --imgres=s
	            --debug
	       -q   --quiet
               -v   --version ));

my $debug = ($opts{'debug'} ? 1 : 0 ); # be verbose, see header
if ($debug) {
    printopts(\%opts);
    print "\@ARGV = `@ARGV'\n";
}

# Help and version texts are emitted through die(), i.e. on STDERR and
# with a non-zero exit status.
if ($opts{'h'} || $opts{'help'})    { die usage();   }
if ($opts{'v'} || $opts{'version'}) { die version(); }

my $quiet  = ($opts{'q'} || $opts{'quiet'}  || ''           );
my $imgres = ($opts{'i'} || $opts{'imgres'} || '144:300:300');

# External programs and their argument lists (filled in step by step below)
my ($gs,      @gsargs     ) = ('gs'     );
my ($pdftops, @pdftopsargs) = ('pdftops');
my ($pdfopt,  @pdfoptargs ) = ('pdfopt' );

## Derive the working file names from the input file name
my $infile = shift or die usage();
# The extension is matched case-insensitively: names such as `REPORT.PDF'
# do reach this script (e.g. when the caller selects files with
# `find -iname').
(my $root=$infile) =~ s/\.(pdf|ps).*//i;
(my $outfile=$infile) =~ s/(.*)(\.(pdf|ps))/${1}_new${2}/i;
# If no known extension was found the substitution above did nothing and
# $outfile would be the input file itself; the final clean-up would then
# destroy the only copy.  Always use a distinct name.
$outfile = "${infile}_new" if ($outfile eq $infile);
my $tmpfile = mktemp("${root}.tmp_XXXXXX");

## 0. Extract all sorts of information

# Extract Scribus version, creation date, bookmarks from original PDF:
print "Running pdftk ...\n";
print STDERR "pdftk $infile dump_data output\n" if ($debug);
# Read pdftk's output through a list-form pipe: the file name is passed to
# pdftk as one argument and never seen by a shell, so names containing
# spaces or shell metacharacters are handled correctly.
my $meta = '';
if (open(my $pdftk_out, '-|', 'pdftk', $infile, 'dump_data', 'output', '-')) {
    local $/;			# slurp the whole listing at once
    my $dump = <$pdftk_out>;
    $meta = $dump if defined($dump);
    close($pdftk_out);
} else {
    # Missing meta data is not fatal: the defaults below are used instead.
    warn "Cannot run pdftk: $!\n";
}
my ($creator) = ( $meta =~
		  m{InfoKey: Creator\s+InfoValue:\s*(.+)$}m
		);
$creator = 'Scribus 1.2.3' unless defined($creator);
my $datestring = extract_CreationDate($meta);
my @bookmarks = extract_bookmarks($meta);

# Extract desired image resolutions
my ($colres,$grayres,$monores) = ($imgres =~ /([0-9]+):([0-9]+):([0-9]+)/);
die "Image resolution must be of form `col:gray:mono'\n"
    unless defined($monores);

## 1. Run pdftops
# Convert the input PDF to a temporary PostScript file next to it.
push @pdftopsargs, "-level3";
my $psfile = mktemp("${root}.ps_XXXXXX");
push @pdftopsargs, $infile, $psfile;
print "Running pdftops ...\n";
print STDERR "$pdftops @pdftopsargs\n" if ($debug);
# Stop here if the conversion failed: carrying on would end with the
# input file being replaced by a broken or missing result.
system($pdftops,@pdftopsargs) == 0
    or die "$pdftops failed (status $?); leaving $infile untouched\n";

## 2. Run gs
# a) Prepare options
push @gsargs, qw{-q -dNOPAUSE -dBATCH};
push @gsargs, '-sDEVICE=pdfwrite';
push @gsargs, '-dCompatibilityLevel=1.3';
# One of /printer, /screen, /prepress, /ebook, /default; see Ps2pdf.htm:
push @gsargs, '-dPDFSETTINGS=/screen';
push @gsargs, '-dEmbedAllFonts=true';
push @gsargs, '-dSubsetFonts=true';
push @gsargs, '-dColorImageDownsampleType=/Bicubic';
push @gsargs, "-dColorImageResolution=$colres";
push @gsargs, '-dGrayImageDownsampleType=/Bicubic';
push @gsargs, "-dGrayImageResolution=$grayres";
push @gsargs, '-dMonoImageDownsampleType=/Bicubic';
push @gsargs, "-dMonoImageResolution=$monores";
push @gsargs, "-sOutputFile=$tmpfile";
# NOTE(review): this reaches gs as ONE argument containing a space, not as
# `-c' followed by `.setpdfwrite'.  Deliberately left as it was: whether
# the installed gs still provides .setpdfwrite needs to be confirmed
# before splitting it into two arguments.
push @gsargs, "-c .setpdfwrite";

# b) Write meta information to temporary file
#my $metafile = mktemp("metainfo.tmp_XXXXXX");
my $metafile = "${root}.meta";
# The creator string comes from the input PDF.  Parentheses delimit
# PostScript strings and backslash is the escape character, so escape both
# to keep the pdfmark below well-formed.
(my $creator_ps = $creator) =~ s/([\\()])/\\$1/g;
open(my $meta_fh, '>', $metafile)
    or die "Cannot write $metafile: $!\n";
print $meta_fh <<"DEAD_PARROT";
% Document information
[%
 /CreationDate (D:$datestring)
 /ModDate (D:$datestring)
 /Creator ($creator_ps)
 /Title ([Insert your document title here])
 /Subject ([Insert the Subject here])
 /Keywords ([Insert key words here])
 /Author ([Insert author' nsme here])
 /DOCINFO pdfmark

% Initial view on opening the document
[/View [/Fit] % Fit page in window
 /Page 1
 % /PageMode /UseOutlines % /UseNone /UserOutlines /UseThumbs /FullScreen
 /DOCVIEW pdfmark

DEAD_PARROT
# Close before gs runs so the file is complete on disk when gs reads it.
close($meta_fh)
    or die "Cannot finish writing $metafile: $!\n";

## Bookmarks. [Commented out for acroread 7.0 has problems] Currently at
## the mercy of the original bookmarks (and Scribus 1.2.2 does not allow
## to edit the bookmark names) and the encoding that pdftk understands
## (most quotation marks get mapped to `?').
## Ideally, one would write out the meta information file with
## `compress-newsletter -m CC.pdf' and use it then with
## `compress-newsletter CC.pdf'.
## % Bookmarks: @bookmarks

# c) Distill the PostScript file plus the pdfmark file into $tmpfile
push @gsargs, '-f', $psfile, $metafile;
print "Running gs ...\n";
print STDERR "$gs @gsargs\n" if ($debug);
# Stop on failure so that the input file is never replaced by a broken or
# missing result.
system($gs,@gsargs) == 0
    or die "$gs failed (status $?); leaving $infile untouched\n";

## 3. Run pdfopt
print "Running pdfopt ...\n";
print STDERR "$pdfopt @pdfoptargs $tmpfile $outfile\n" if ($debug);
my $result = $outfile;
if (system($pdfopt,@pdfoptargs,$tmpfile,$outfile) != 0 || ! -s $outfile) {
    # pdfopt only post-processes the file gs has already written, so when
    # it is unavailable or fails the gs output is still a usable result.
    warn "$pdfopt did not produce $outfile (status $?); "
	. "using the gs output as is\n";
    # Drop a partial output file -- but never the input itself.
    unlink $outfile if (-e $outfile && $outfile ne $infile);
    $result = $tmpfile;
}

## 4. Replace the original with the compressed version
# Only touch the input once a non-empty result really exists; otherwise a
# failure in any earlier step would cost the only copy of the document.
die "No usable output was produced; leaving $infile untouched\n"
    unless (-s $result);
if ($result ne $infile) {
    # rename() swaps the new file in under the old name in one step.
    rename($result, $infile)
	or die "Cannot replace $infile with $result: $!\n";
}

# Remove intermediate files; --debug keeps them around as documented.
unless ($debug) {
    unlink grep { defined($_) && -e $_ } ($psfile, $tmpfile, $metafile);
}

END {
    # Runs on every exit path, errors included: remove the intermediate
    # PostScript and gs output files unless --debug asked to keep them.
    if (!$debug) {
        my @leftovers = grep { defined($_) && -f $_ } ($psfile, $tmpfile);
        for my $leftover (@leftovers) {
            unlink $leftover;
        }
    }
}

# ---------------------------------------------------------------------- #
sub extract_CreationDate {
# Return the document's creation date as a PDF date string WITHOUT the
# leading `D:' (the caller adds it), e.g. "20051003120000-06'00'".
#   $meta -- text listing as printed by `pdftk ... dump_data'
# The date is taken from the CreationDate entry of $meta; when there is
# none, the current local time is used instead.

    use POSIX qw(strftime);

    my $meta = shift;
    $meta = '' unless defined($meta);

    my ($cdate) = ( $meta =~
		    m{InfoKey: CreationDate\s+InfoValue:\s*(.+)$}m
		  );
    # Time string: need to splice in "'" after hours and minutes of time zone
    # definition. To me this looks like the technical documentation was taken
    # too literally and now applications (and Acroread 7) insist on these
    # stupid markers.
    my $datestring;
    if (defined($cdate)
	&& $cdate =~ m{(?:D:)?([0-9]{14})(?:([+-][0-9]{2})'?([0-9]{2})'?|(Z))?}) {
	# Managed to extract CreationDate from $meta.  The value may already
	# carry a `D:' prefix and a time zone; pick the parts apart instead
	# of blindly appending to it, which would double both.
	my ($stamp,$tzhour,$tzmin,$utc) = ($1,$2,$3,$4);
	my $tz;
	if (defined($tzhour)) {
	    $tz = "$tzhour'$tzmin'";	# keep the document's own time zone
	} elsif (defined($utc)) {
	    $tz = "+00'00'";
	} else {
	    $tz = "-06'00'";		# historical default of this script
	}
	$datestring = "$stamp$tz";
    } else {		         # Creation date unknown -- use current date
	my $tz = strftime "%z", localtime();
	$tz =~ s/([0-9][0-9])([0-9][0-9])/$1'$2'/;
	$datestring = strftime "%Y%m%d%H%M%S$tz", localtime();
    }

    return $datestring;
}
# ---------------------------------------------------------------------- #
sub extract_bookmarks {
# Turn the bookmark records found in a `pdftk ... dump_data' listing into
# pdfmark outline entries.
#   $meta   -- text listing as printed by pdftk
#   returns -- list of strings "[/Title (...) /Page N /OUT pdfmark\n",
#              one per bookmark, in document order (empty list if none)
# The bookmark level is matched but not used: all entries are emitted as
# top-level outline items.

    my $meta = shift;
    $meta = '' unless defined($meta);

    my @bm;

    while ($meta =~ /^BookmarkTitle:      \s* (.*) \n
                      BookmarkLevel:      \s* (.*) \n
                      BookmarkPageNumber: \s* (.*) /xmg) {
	my ($title,$page) = ($1,$3);
	# Parentheses delimit PostScript strings; escape them (and the
	# escape character itself) so any title yields a valid pdfmark.
	$title =~ s/([\\()])/\\$1/g;
	push @bm, "[/Title ($title) /Page $page /OUT pdfmark\n";
    }

    return @bm;
}
# ---------------------------------------------------------------------- #
sub printopts {
# Debugging aid: dump every parsed command line option to STDERR, one
# "$opts{name} = `value'" line per option.
#   argument -- reference to the options hash filled by GetOptions
    my ($given) = @_;
    my %seen = %{$given};
    for my $name (keys %seen) {
	my $value = $seen{$name};
	print STDERR "\$opts{$name} = `$value'\n";
    }
}
# ---------------------------------------------------------------------- #
sub usage {
# Extract description and usage information from this file's header.
# Returns the first paragraph of this file that contains a `Description:'
# or `Usage:' comment line, with everything before that line and the
# leading comment signs removed; returns "\n" if there is no such
# paragraph.  Dies if the file cannot be opened.
    my $thisfile = __FILE__;
    local $/ = '';              # Read paragraphs
    open(my $self_fh, '<', $thisfile) or die "Cannot open $thisfile\n";
    my $text;
    while (my $para = <$self_fh>) {
	# Paragraph _must_ contain `Description:' or `Usage:'
        next unless $para =~ /^\s*\#\s*(Description|Usage):/m;
        # Drop `Author:', etc. (anything before `Description:' or `Usage:')
        $para =~ s/.*?\n(\s*\#\s*(Description|Usage):\s*\n.*)/$1/s;
        # Don't print comment sign:
        $para =~ s/^\s*# ?//mg;
        $text = $para;
        last;                        # ignore body
    }
    close($self_fh);
    return ($text ? $text : "\n");
}
# ---------------------------------------------------------------------- #
sub version {
# Return CVS data and version info.
# Result: "<script name> version <revision> (<date>)\n", where the script
# name is the last path component of $0 and revision/date are cut out of
# the CVS keyword strings below.
    my $doll='\$';		# Need this to trick CVS
    my @path_parts = split('/', $0);
    my $cmdname = $path_parts[-1];
    # First non-blank word after each keyword is the value we want.
    my ($rev)  = ('$Revision: 1.8 $' =~ /${doll}Revision:\s*(\S+)/);
    my ($date) = ('$Date: 2006/02/02 09:38:52 $' =~ /${doll}Date:\s*(\S+)/);
    return join('', $cmdname, ' version ', $rev, ' (', $date, ")\n");
}
# ---------------------------------------------------------------------- #

# End of file compress-newsletter

Espero que les sirva 😉

Tags: , , ,

Comments are closed.