#! /bin/bash
# vim: set filetype=bash:

# pdfidiff: create a new PDF file showing the differences between two specified
# PDF files; relies on ghostscript or pdftocairo, imagdiff, img2pdf, opt-png,
# pngrecolor, pngstrip, realpath, tempname, and (optionally) GNU parallel.

# Copyright (C) 2018-2026 by Brian Lindholm.  This file is part of the
# littleutils utility set.
#
# The pdfidiff utility is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 3, or (at your option) any later version.
#
# The pdfidiff utility is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
# more details.
#
# You should have received a copy of the GNU General Public License along with
# the littleutils.  If not, see <https://www.gnu.org/licenses/>.

# get command-line options
DEVICE='pnggray'
declare -i DPI=300
ENGINE='poppler'
declare -i FUZZ=15
declare -i MODE=2
OPTIMIZE='n'
USE_PARALLEL='y'
QUIET='n'

# print the version banner and usage summary on stdout
usage() {
  echo 'pdfidiff 1.4.0'
  echo 'usage: pdfidiff [-h(elp)] [-g(hostscript)] [-f fuzz_dist] [-m mode]'
  echo '         [-q(uiet)] [-r resolution_DPI] [-o(ptimize)] [-s(ingle_threaded)]'
  echo '         old.pdf new.pdf diff.pdf'
  echo '       mode 1 = gray, 2 = light gray, 3 = dark gray, 4 = color,'
  echo '            5 = stretched color'
  echo 'defaults: pdfidiff -f 15 -m 2 -r 300'
}

# succeed only when $1 is a plain decimal integer of one to nine digits
is_number() {
  [[ $1 =~ ^[0-9]{1,9}$ ]]
}

# complain about the unusable value $2 given to option -$1, show the usage
# summary, and quit with status 1
bad_number() {
  echo "pdfidiff error: option -${1} needs a non-negative integer, not '${2}'"
  usage
  exit 1
}

# parse the given command-line arguments into the option variables above;
# leaves OPTIND pointing at the first non-option argument, exits 0 after -h,
# and exits 1 on an unknown option or a malformed numeric argument
parse_options() {
  local opts
  while getopts f:ghm:oqr:sv opts ; do
    case $opts in
      # numeric arguments are vetted before assignment: FUZZ, MODE, and DPI are
      # "declare -i" variables, so arbitrary text would otherwise be evaluated
      # as an arithmetic expression; the 10# prefix keeps values with leading
      # zeros from being read as octal
      f) is_number "$OPTARG" || bad_number f "$OPTARG"
         FUZZ=$((10#$OPTARG)) ;;
      g) ENGINE='ghostscript' ;;
      h) usage
         exit 0 ;;
      m) is_number "$OPTARG" || bad_number m "$OPTARG"
         MODE=$((10#$OPTARG)) ;;
      o) OPTIMIZE='y' ;;
      q) QUIET='y' ;;
      r) is_number "$OPTARG" || bad_number r "$OPTARG"
         DPI=$((10#$OPTARG)) ;;
      s) USE_PARALLEL='n' ;;
      v) QUIET='n' ;;
      *) usage
         exit 1 ;;
    esac
  done
}

parse_options "$@"
shift $((OPTIND-1))

# double-check parameters
# (arithmetic tests treat an unset or empty value as 0, so such values are
# caught by the range checks as well)
if (( MODE < 1 || MODE > 5 )); then
  MODE=2
fi
# fall back to serial processing when GNU parallel is not installed
if ! command -v parallel > /dev/null; then
  USE_PARALLEL='n'
fi

# compute ancillary parameters
# a zero or negative resolution used to be passed on unchanged and left DPM
# unset, although the later "pngstrip -r" call needs it; fall back to the
# default resolution instead
if (( DPI <= 0 )); then
  echo "pdfidiff warning: invalid resolution ${DPI}, using 300 DPI instead"
  DPI=300
elif (( DPI > 2400 )); then
  DPI=2400
fi
# DPI divided by 0.0254 (metres per inch), in integer arithmetic
DPM=$((DPI*10000/254))
# modes 1-3 are the gray modes, 4-5 the color modes (see the usage text);
# DEVICE is the ghostscript output device, CAIROOPT the matching pdftocairo flag
case $MODE in
  1|2|3) DEVICE='pnggray' ; CAIROOPT='-gray' ;;
  4|5)   DEVICE='png16m'  ; CAIROOPT='' ;;
esac

# make sure that input files and targets are specified
if [ ! -r "$1" ]; then
  echo "pdfidiff error: old input file ${1} is missing"
  exit 1
fi
if [ ! -r "$2" ]; then
  echo "pdfidiff error: new input file ${2} is missing"
  exit 1
fi
if [ -z "$3" ]; then
  echo 'pdfidiff error: output file is unspecified'
  exit 1
fi

# resolve everything to absolute paths, because the remaining work is done from
# inside a temporary folder; realpath can fail (for instance when the folder
# that should hold the output file does not exist), which used to leave an
# empty path behind and only surfaced after all pages had been rendered
if ! OLDFILE=$(realpath "$1"); then
  echo "pdfidiff error: cannot resolve path of old input file ${1}"
  exit 1
fi
if ! NEWFILE=$(realpath "$2"); then
  echo "pdfidiff error: cannot resolve path of new input file ${2}"
  exit 1
fi
if ! OUTPUTFILE=$(realpath "$3"); then
  echo "pdfidiff error: cannot resolve path of output file ${3}"
  exit 1
fi

# set up traps, create and change to temporary folder
# (the trap body is single-quoted, so TMPFOLDER is expanded when a signal
# arrives rather than now, while it is still unset)
trap 'rm -rf "$TMPFOLDER" ; exit 1' HUP INT QUIT PIPE TERM
# NOTE(review): tempname -D is presumably the littleutils way to create a
# temporary folder and print its name -- confirm against its documentation
TMPFOLDER=$(tempname -D pdfidiff_$$) || exit 99
# without this check a failed pushd would leave every later step running in,
# and littering, the caller's current folder
if ! pushd "$TMPFOLDER" > /dev/null; then
  echo "pdfidiff error: cannot change to temporary folder ${TMPFOLDER}"
  rm -rf "$TMPFOLDER"
  exit 99
fi

# explode original files into individual pages

# rename every old-N.png and new-N.png in the current folder so that N is a
# fixed-width, five-digit page number (e.g. new-07.png becomes new-00007.png);
# files whose N is not purely numeric are left alone
normalize_page_names() {
  local page prefix number padded
  for page in old-*.png new-*.png ; do
    [ -e "$page" ] || continue    # the glob matched nothing
    prefix=${page%%-*}
    number=${page#*-}
    number=${number%.png}
    [[ $number =~ ^[0-9]+$ ]] || continue
    # 10# forces base 10, so zero-padded numbers are not taken as octal
    printf -v padded '%s-%05d.png' "$prefix" "$((10#$number))"
    if [ "$page" != "$padded" ]; then
      mv "$page" "$padded"
    fi
  done
}

if [ "$QUIET" = 'n' ]; then
  echo 'pdfidiff message:  -BEGIN-'
  echo "pdfidiff message: splitting (old) $1 and (new) $2 into pages..."
fi
if [ "$ENGINE" = 'ghostscript' ]; then
  if [ "$USE_PARALLEL" = 'n' ]; then
    gs -q -dBATCH -dSAFER -dNOPAUSE -sDEVICE=$DEVICE -r${DPI} -sBandListStorage=memory -sOutputFile=old-%05d.png -f "$OLDFILE"
    gs -q -dBATCH -dSAFER -dNOPAUSE -sDEVICE=$DEVICE -r${DPI} -sBandListStorage=memory -sOutputFile=new-%05d.png -f "$NEWFILE"
  else
    # ":::+" links the two argument lists, so old/new is paired with its file
    parallel -s 10000 "gs -q -dBATCH -dSAFER -dNOPAUSE -sDEVICE=${DEVICE} -r${DPI} -sBandListStorage=memory -sOutputFile={1}-%05d.png -f {2}" \
      ::: 'old' 'new' :::+ "$OLDFILE" "$NEWFILE"
  fi
else
  # CAIROOPT is deliberately unquoted: when empty it must vanish entirely
  if [ "$USE_PARALLEL" = 'n' ]; then
    pdftocairo -png $CAIROOPT -r $DPI "$OLDFILE" old
    pdftocairo -png $CAIROOPT -r $DPI "$NEWFILE" new
  else
    parallel -s 10000 "pdftocairo -png $CAIROOPT -r $DPI {1} {2}" ::: "$OLDFILE" "$NEWFILE" :::+ 'old' 'new'
  fi
fi

# pdftocairo zero-pads page numbers to the width of each document's own page
# count, so a 9-page and a 10-page document would yield old-1.png and
# new-01.png, which the later steps (pairing pages by identical suffix) would
# never match up; bring both sets to the five-digit form that the ghostscript
# branch already produces
normalize_page_names

# count image files

# give every page seen in either document an (initially empty) diff-*.png
# placeholder in the current folder, and warn on stdout about pages that exist
# in only one of the two documents; reads OLDFILE and NEWFILE for the messages
mark_diff_pages() {
  local page
  for page in old-*.png ; do
    # an unmatched glob stays a literal pattern; without this guard a file
    # literally named "diff-*.png" would be created and a bogus warning shown
    [ -e "$page" ] || continue
    touch "diff${page#old}"
    if [ ! -r "new${page#old}" ]; then
      echo "pdfidiff warning: $page from $OLDFILE has no counterpart"
    fi
  done
  for page in new-*.png ; do
    [ -e "$page" ] || continue
    if [ ! -r "diff${page#new}" ]; then
      touch "diff${page#new}"
    fi
    if [ ! -r "old${page#new}" ]; then
      echo "pdfidiff warning: $page from $NEWFILE has no counterpart"
    fi
  done
}

mark_diff_pages

# create delta image files
if [ "$QUIET" = 'n' ]; then
  echo 'pdfidiff message: calculating difference between old and new pages...'
fi
case "$USE_PARALLEL" in
  n)
    # serial: one imagdiff run per placeholder; the old and new page names
    # are derived by swapping the leading "diff" of the placeholder name
    for INDEX in diff-*.png ; do
      imagdiff -f $FUZZ -m $MODE "${INDEX/#diff/old}" "${INDEX/#diff/new}" "$INDEX"
    done
    ;;
  *)
    # with --plus, {#diff} is the input name with its leading "diff" removed
    parallel -s 10000 --plus "imagdiff -f $FUZZ -m $MODE old{#diff} new{#diff} {}" ::: diff-*.png
    ;;
esac

# take some steps to reduce file size
if [ "$OPTIMIZE" != 'y' ]; then
  # default: attempt colorspace reduction, then alpha channel stripping
  if [ "$QUIET" = 'n' ]; then
    echo 'pdfidiff message: attempting colorspace reduction on delta pages...'
  fi
  case "$USE_PARALLEL" in
    n)
      for INDEX in diff-*.png ; do
        pngrecolor -q -n "$INDEX" "${INDEX/#diff/rc}"
        # replace the page only when a non-empty recolored copy was produced
        if [ -s "${INDEX/#diff/rc}" ]; then
          mv "${INDEX/#diff/rc}" "$INDEX"
        fi
      done
      ;;
    *)
      parallel -s 10000 --plus 'pngrecolor -q -n {} rc{#diff} ; if [ -s rc{#diff} ]; then mv rc{#diff} {} ; fi' ::: diff-*.png
      ;;
  esac
  if [ "$QUIET" = 'n' ]; then
    echo 'pdfidiff message: stripping alpha channel and setting DPI on delta pages...'
  fi
  case "$USE_PARALLEL" in
    n)
      for INDEX in diff-*.png ; do
        pngstrip -a -r $DPM "$INDEX" "${INDEX/#diff/st}"
        # replace the page only when a non-empty stripped copy was produced
        if [ -s "${INDEX/#diff/st}" ]; then
          mv "${INDEX/#diff/st}" "$INDEX"
        fi
      done
      ;;
    *)
      parallel -s 10000 --plus "pngstrip -a -r $DPM {} st{#diff} ; if [ -s st{#diff} ]; then mv st{#diff} {} ; fi" ::: diff-*.png
      ;;
  esac
else
  # full-blown PNG image optimization, since it was requested with -o
  if [ "$QUIET" = 'n' ]; then
    echo 'pdfidiff message: attempting filesize optimization on delta pages...'
  fi
  case "$USE_PARALLEL" in
    n) opt-png -q -r $DPI diff-*.png ;;
    *) parallel -s 10000 opt-png -q -r $DPI ::: diff-*.png ;;
  esac
fi

# convert delta image files to PDF
if [ "$QUIET" = 'n' ]; then
  echo "pdfidiff message: combining delta pages files into (output) $3..."
fi
img2pdf -D --engine=internal -o "$OUTPUTFILE" diff-*.png
# remember the outcome: the cleanup below would otherwise mask a failure and
# the script would report success without having written the output file
STATUS=$?
if [ "$STATUS" -ne 0 ]; then
  echo "pdfidiff error: img2pdf failed to create ${3} (exit status ${STATUS})"
fi

# clean up afterwards
if [ "$QUIET" = 'n' ]; then
  echo 'pdfidiff message:  -END-'
fi
# leave the temporary folder before deleting it
popd > /dev/null
rm -rf "$TMPFOLDER"
if [ "$STATUS" -ne 0 ]; then
  exit "$STATUS"
fi
