#! /usr/bin/python3
# vim: set filetype=python:

# opt-pdf: recompress PDF files using Ghostscript or Poppler

# Copyright (C) 2025-2026 by Brian Lindholm.  This file is part of the
# littleutils utility set.
#
# The opt-pdf utility is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 3, or (at your option) any later version.
#
# The opt-pdf utility is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
# more details.
#
# You should have received a copy of the GNU General Public License along with
# the littleutils.  If not, see <https://www.gnu.org/licenses/>.

import concurrent.futures, getopt, os, signal, subprocess, sys, tempfile

### PREP SIGNAL HANDLER ###
# becomes True once any of the trapped signals has been delivered
interrupted = False


def handler(signum, frame):
    """Signal callback that only records that a signal arrived.

    Both arguments are supplied by the signal module and are ignored; the
    sole effect is setting the module-level `interrupted` flag to True.
    """
    global interrupted
    interrupted = True


# signals that are routed to handler() instead of their default action
_TRAPPED_SIGNALS = (
    signal.SIGHUP,
    signal.SIGINT,
    signal.SIGPIPE,
    signal.SIGQUIT,
    signal.SIGTERM,
)
for signal_VAL in _TRAPPED_SIGNALS:
    signal.signal(signal_VAL, handler)

### GET INPUT ARGUMENTS ###
# print online help
def usage(rc: int) -> None:
    """Print the version banner and option summary to stdout, then exit.

    rc is passed straight to sys.exit() and so becomes the exit status.
    """
    help_lines = (
        'opt-pdf 1.4.0',
        'usage: opt-pdf [-b(ackup)] [-c(olor) DPI] [-f filelist] [-F(orce_overwrite)]',
        '         [-g(rayscale) DPI] [-h(elp)] [-m(onochrome) DPI] [-o(ptimize_for_web)]',
        '         [-p(ipe)] [-q(uiet)] [-t(ouch)] [-T threads] [-v(erbose)]',
        '         [-(e)X(tra_pass)] filename ...',
    )
    for text in help_lines:
        print(text)
    sys.exit(rc)
# load list of files
def load_list_from_file() -> None:
    """Append the names listed in the file `opt_f` to the global `filelist`.

    The list file is read as text and split into lines, one name per line.
    If the path is not a regular file, or it cannot be opened, an error is
    printed to stderr and the program exits with status 1.
    """
    if not os.path.isfile(opt_f):  # abort if file does not exist
        print('opt-pdf error: file list %s does not exist' % opt_f, file=sys.stderr)
        sys.exit(1)
    try:
        list_file = open(opt_f, 'r')
    except OSError:  # abort if file cannot be opened for read
        print('opt-pdf error: file list %s cannot be opened' % opt_f, file=sys.stderr)
        sys.exit(1)
    # the with-block closes the handle even if reading fails part-way
    with list_file:
        filelist.extend(list_file.read().splitlines())
# load list of files from stdin
def load_list_from_stdin() -> None:
    """Append every line read from standard input to the global `filelist`.

    Standard input is read to end-of-file, split into lines, and then closed.
    """
    stream = sys.stdin
    names = stream.read().splitlines()
    filelist.extend(names)
    stream.close()
# set defaults
filelist = []
opt_b = False  # create a backup file
opt_c = None   # gs arguments that downsample color bitmaps to the given DPI
opt_f = None   # file containing list of files to process
opt_F = False  # force PDF processing, even if file is larger
opt_g = None   # gs arguments that downsample grayscale bitmaps to the given DPI
opt_m = None   # gs arguments that downsample strictly black-and-white bitmaps
opt_o = False  # optimize for web (which makes files a little larger)
opt_p = False  # read list of files to process from stdin
opt_q = False  # be quiet
opt_t = False  # "touch" re-written files to preserve timestamps
opt_T = None   # requested thread-count
opt_v = False  # be verbose
opt_X = False  # try extra hard
# translate a DPI option value into the matching gs downsampling arguments
def _downsample_args(kind: str, flag: str, value: str) -> tuple:
    """Return the gs arguments that downsample `kind` images to `value` DPI.

    kind is the image class as spelled inside the gs switches ('Color',
    'Gray' or 'Mono'); flag is the command-line option being parsed and is
    only used in the error message.  A value that is not a finite number of
    at least 1 is reported on stderr and ends the program with status 1
    instead of raising a traceback.
    """
    try:
        dpi = float(value)
    except ValueError:
        dpi = 0.0  # rejected by the range check below
    if not (1 <= dpi < float('inf')):  # also false for NaN
        print('opt-pdf error: invalid DPI value %s for option %s' % (value, flag), file=sys.stderr)
        sys.exit(1)
    return ('-dAutoFilter%sImages=false' % kind, '-d%sImageResolution=%.0f' % (kind, dpi),
            '-d%sImageDownsampleType=/Bicubic' % kind)
# get command-line options
try:
    opts, filelist = getopt.getopt(sys.argv[1:], 'bc:f:Fg:hm:opqtT:vX', ['help'])
except getopt.error as msg:
    # print help if bad opts used, then quit
    print(msg)
    usage(1)
# parse options
for o, v in opts:
    if o in ('-h', '--help'): usage(0)
    elif o == '-b': opt_b = True
    elif o == '-c': opt_c = _downsample_args('Color', o, v)
    elif o == '-f': opt_f = str(v)
    elif o == '-F': opt_F = True
    elif o == '-g': opt_g = _downsample_args('Gray', o, v)
    elif o == '-m': opt_m = _downsample_args('Mono', o, v)
    elif o == '-o': opt_o = True
    elif o == '-p': opt_p = True
    elif o == '-q': opt_q = True
    elif o == '-t': opt_t = True
    elif o == '-T':
        # a non-integer thread count is a user error, not a crash
        try:
            opt_T = int(v)
        except ValueError:
            print('opt-pdf error: invalid thread count %s' % v, file=sys.stderr)
            sys.exit(1)
    elif o == '-v': opt_v = True
    elif o == '-X': opt_X = True
# load file list from file and/or stdin if requested
if opt_f is not None: load_list_from_file()
if opt_p: load_list_from_stdin()
# make sure we have at least one file to process
if not filelist:
    # no names at all is a usage error unless a (possibly empty) list was
    # supplied via -f or -p, in which case there is simply nothing to do
    if (not opt_f) and (not opt_p): usage(1)
    sys.exit(0)
# remove leading './' and trim list to unique items
filelist = [x.removeprefix('./') for x in filelist]
# dict keys are unique and keep insertion order, so the first occurrence wins
unique_filelist = list(dict.fromkeys(filelist))
# pick a reasonable default if thread-count is unspecified: half the CPUs,
# but never more threads than files and never fewer than one
# (os.cpu_count() returns None when the count cannot be determined)
if opt_T is None: opt_T = max(1, min((os.cpu_count() or 1) // 2, len(unique_filelist)))
# establish base gs and cairo arguments
gs_args = ['gs', '-dSAFER', '-dBATCH', '-dNOPAUSE', '-sDEVICE=pdfwrite', '-dCompatibilityLevel=1.7', \
        '-dDetectDuplicateImages=true', '-sBandListStorage=memory', '-dSubsetFonts=true', '-dCompressFonts=true']
if not opt_v: gs_args.insert(1, '-q')
if opt_c is not None: gs_args.extend(opt_c)
if opt_g is not None: gs_args.extend(opt_g)
if opt_m is not None: gs_args.extend(opt_m)
cairo_args = ['pdftocairo', '-pdf']
if not opt_v: cairo_args.insert(1, '-q')
# determine gs version
def _gs_version_number(text: str) -> int:
    """Convert a 'MAJOR.MINOR[.PATCH]' version string to MAJOR * 100 + MINOR.

    For example '9.50' -> 950, '9.55.0' -> 955 and '10.02.1' -> 1002, which
    is the scale the thresholds below are written in.  Raises ValueError or
    IndexError when the text does not have at least two numeric components.
    """
    fields = text.strip().split('.')
    return int(fields[0]) * 100 + int(fields[1])
try:
    GSVERSION = subprocess.run(['gs', '--version'], capture_output=True, text=True)
except OSError:  # gs is missing or cannot be executed
    print('opt-pdf error: unable to run gs', file=sys.stderr)
    sys.exit(1)
try:
    gs_ver = _gs_version_number(GSVERSION.stdout)
except (ValueError, IndexError):
    print('opt-pdf error: unable to determine gs version', file=sys.stderr)
    sys.exit(1)
# pick the trailing gs arguments that suit this gs version
if opt_o:
    if gs_ver >= 950: gs_opt = ('-dFastWebView=true', '-c', '33550336', 'setvmthreshold')
    elif gs_ver >= 907: gs_opt = ('-dFastWebView=true', '-c', '.setpdfwrite')
    else: gs_opt = ('-c', '.setpdfwrite', '-f', 'pdfopt.ps')
else:
    if gs_ver >= 950: gs_opt = ('-c', '33550336', 'setvmthreshold')
    else: gs_opt = ('-c', '.setpdfwrite')
# create a working directory
TMPDIR = tempfile.TemporaryDirectory()
# point each of the usual temp-dir environment variables at it
# (presumably so that the external tools put their scratch files there too)
for _tmp_var in ('TEMP', 'TMP', 'TMPDIR'):
    os.environ[_tmp_var] = TMPDIR.name

### MAIN PROGRAM ###
# delete a scratch file, tolerating one that is already gone
def _remove_quietly(path: str) -> None:
    """Remove path; a file that no longer exists is not treated as an error."""
    try:
        os.remove(path)
    except FileNotFoundError:
        pass


# build the Ghostscript command line for one conversion
def _gs_command(source: str, target: str) -> list:
    """Return the gs argument list that rewrites source into target."""
    return gs_args + ['-sOutputFile=%s' % target] + list(gs_opt) + ['-f', source]


# build the pdftocairo command line for one conversion
def _cairo_command(source: str, target: str) -> list:
    """Return the pdftocairo argument list that rewrites source into target."""
    return cairo_args + [source, target]


# optimize the PDF file
def process_file(filename: str) -> None:
    """Recompress one PDF in place, keeping whichever trial result is smallest.

    Steps:
      1. Skip the file if an interrupt was seen, it is not a regular file,
         or it is empty.
      2. Ask pdfinfo for the Producer; unless -X is active, files already
         produced by Ghostscript or cairo are skipped.
      3. Run the applicable trials (gs, pdftocairo and, with -X on files
         produced by neither tool, gs-of-cairo and cairo-of-gs), each into
         its own scratch file inside TMPDIR.  An empty or missing result
         counts as a failure and abandons the file.
      4. Overwrite the original with the smallest result if it is smaller
         than the original (or always, with -F), honouring -b (keep a .bak
         copy) and -t (restore the modification time).

    Scratch files are removed on every exit path.  Nothing is returned;
    progress and errors are printed (errors go to stderr, most are
    suppressed by -q).
    """
    if interrupted: return
    # abort if file does not exist
    if not os.path.isfile(filename):
        if not opt_q: print('opt-pdf error: %s is not a file' % filename, file=sys.stderr)
        return
    # skip zero-length files
    origsize = os.path.getsize(filename)
    if origsize == 0: return
    # grab initial timestamp if required
    timestamp = os.path.getmtime(filename) if opt_t else None
    # see if we've already been processed by gs or poppler/cairo
    PDFINFO = subprocess.run(['pdfinfo', filename], capture_output=True, text=True)
    processed_by = 'neither'
    for line in PDFINFO.stdout.splitlines():
        if 'Producer' not in line: continue
        if 'Ghostscript' in line: processed_by = 'gs'
        elif 'cairo' in line: processed_by = 'cairo'
    if (not opt_X) and (processed_by != 'neither'):
        if not opt_q:
            label = 'ghost' if processed_by == 'gs' else 'cairo'
            print('opt-pdf message: skipping %s-processed %s' % (label, filename), file=sys.stderr)
        return
    # plan the trials: (label, command builder, label of the trial whose
    # output is the input, or None to start from the original file)
    trials = []
    if processed_by != 'gs': trials.append(('ghost', _gs_command, None))
    if processed_by != 'cairo': trials.append(('cairo', _cairo_command, None))
    if opt_X and (processed_by == 'neither'):
        trials.append(('cairo-ghost', _gs_command, 'cairo'))
        trials.append(('ghost-cairo', _cairo_command, 'ghost'))
    # run the trials, collecting each result in memory
    tmpname = {}
    PDFDATA = {}
    try:
        for idx, build_command, source_idx in trials:
            # reserve a scratch name; the tool then writes to that path
            fd, tmpname[idx] = tempfile.mkstemp(suffix='.pdf', dir=TMPDIR.name)
            os.close(fd)
            source = filename if source_idx is None else tmpname[source_idx]
            # NOTE(review): the tool's exit status is not inspected; only an
            # empty result is treated as a failure
            subprocess.run(build_command(source, tmpname[idx]), capture_output=True, text=True)
            if interrupted: return
            try:
                with open(tmpname[idx], 'rb') as result:
                    PDFDATA[idx] = result.read()
            except OSError:  # the tool removed the scratch file
                PDFDATA[idx] = b''
            if not PDFDATA[idx]:
                if not opt_q: print('opt-pdf error: failed %s processing of %s' % (idx, filename), file=sys.stderr)
                return
    finally:
        # scratch files are no longer needed once their bytes are in memory,
        # and must not pile up when a trial fails or is interrupted
        for path in tmpname.values(): _remove_quietly(path)
    if interrupted: return
    # find smallest file of results (the earliest trial wins a tie)
    newsize = {idx: len(data) for idx, data in PDFDATA.items()}
    best_idx = min(newsize, key=newsize.get)
    # use best result if smaller
    if opt_F or (newsize[best_idx] < origsize):
        if opt_b: os.rename(filename, filename + '.bak')
        try:
            outfile = open(filename, 'wb')
        except OSError:  # abort if file cannot be opened for write
            print('opt-pdf error: %s cannot be opened for write' % filename, file=sys.stderr)
            if opt_b: os.rename(filename + '.bak', filename)
            return
        with outfile:
            outfile.write(PDFDATA[best_idx])
        if not opt_q: print('%s: [%s] %d vs. %d' % (filename, best_idx, origsize, newsize[best_idx]))
        if opt_t: os.utime(filename, (timestamp, timestamp))
    elif not opt_q: print('%s: unchanged' % filename)

# process files
if opt_T < 2:
    # serial mode: stop handing out files once an interrupt has been seen
    for filename in unique_filelist:
        if interrupted: break
        process_file(filename)
else:
    with concurrent.futures.ThreadPoolExecutor(max_workers=opt_T) as executor:
        # submit every file, remembering which future belongs to which name
        pending = {executor.submit(process_file, name): name for name in unique_filelist}
        # collect each outcome so that an exception raised inside a worker
        # thread is reported instead of being silently discarded
        for finished in concurrent.futures.as_completed(pending):
            failure = finished.exception()
            if failure is not None:
                print('opt-pdf error: processing of %s failed: %s' % (pending[finished], failure), file=sys.stderr)
