#! /bin/bash
# vim: set filetype=bash:

# pdfbw4: reduce the file size of PDFs composed of scanned images by
# extracting the images, quantizing them, and creating a new PDF; relies on
# img2pdf, pdfimages, pdfseparate, pnginfo, and netpbm (and optionally parallel).

# Copyright (C) 2025-2026 by Brian Lindholm.  This file is part of the
# littleutils utility set.
#
# The pdfbw4 utility is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 3, or (at your option) any later version.
#
# The pdfbw4 utility is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
# more details.
#
# You should have received a copy of the GNU General Public License along with
# the littleutils.  If not, see <https://www.gnu.org/licenses/>.

# defaults for every setting that the command-line options can override
AUTORESOLUTION='y'
QUIET='n'
SUFFIX='-g4'
THRESHOLD=''
USE_PARALLEL='y'
declare -i DPI DPM
declare -i RESOLUTION=300

# process the options, then drop them so that only the PDF file names remain
# in the positional parameters
while getopts hqr:st: opts ; do
  case "$opts" in
    q) QUIET='y' ;;
    s) USE_PARALLEL='n' ;;
    t) THRESHOLD="-simple -threshold=$OPTARG" ;;
    r) AUTORESOLUTION='n'
       RESOLUTION="$OPTARG" ;;
    h) printf '%s\n' \
         'pdfbw4 1.4.0' \
         'usage: pdfbw4 [-h(elp)] [-q(uiet)] [-r(esolution) DPI ] [-s(ingle_threaded)]' \
         '         [-t threshold] pdf_file...'
       exit 0 ;;
    *) printf '%s\n' \
         'pdfbw4 1.4.0' \
         'usage: pdfbw4 [-h(elp)] [-q(uiet)] [-r(esolution) DPI ] [-s(ingle_threaded)]' \
         '         [-t threshold] pdf_file...'
       exit 1 ;;
  esac
done
shift $((OPTIND - 1))

# fall back to single-threaded processing when no "parallel" command is
# available on this system
command -v parallel > /dev/null || USE_PARALLEL='n'

# on HUP, INT, QUIT, PIPE, or TERM: remove the temporary folder (with anything
# inside it) and exit with status 1
trap 'rm -rf "$TMPFOLDER" ; exit 1' HUP INT QUIT PIPE TERM

# ask tempname for the temporary folder; give up with status 99 when it fails
# NOTE(review): presumably "tempname -D" creates the folder and prints its
# path -- confirm against the littleutils tempname documentation
if ! TMPFOLDER=$(tempname -D pdfbw4_$$) ; then
  exit 99
fi

# print the resolution (in DPI) to record for PNG file $1
#   - with AUTORESOLUTION='y', the number on the "Resolution:" line of the
#     pnginfo output (pixels per meter) is converted to DPI; a result below
#     75 DPI is treated as missing and replaced by $RESOLUTION
#   - otherwise $RESOLUTION is printed as is
pdfbw4_png_dpi () {
  local -i dpm=0 dpi=0
  if [ "$AUTORESOLUTION" = 'y' ]; then
    # "sed -n" plus the p flag emits only the number captured from the
    # Resolution line, so no other pnginfo line can reach the arithmetic
    dpm=$(pnginfo "$1" | sed -n -e 's/^ *Resolution: \([0-9][0-9]*\),.*$/\1/p' | head -n 1)
    dpi=$(( (dpm + 4) * 254 / 10000 ))
  fi
  if [ "$dpi" -lt 75 ]; then
    dpi=$RESOLUTION
  fi
  echo "$dpi"
}

# run through files
while [ $# -gt 0 ]; do

  # skip arguments that are not readable regular files
  if [[ ! -f "$1" || ! -r "$1" ]]; then
    echo "pdfbw4 error: input file ${1} is not a readable file"
    shift; continue
  fi
  INPUTFILE=$(realpath "$1")

  # work inside the temporary folder; stop if it cannot be entered, because
  # the "rm -f page*" cleanup below must never run in the caller's directory
  pushd "$TMPFOLDER" > /dev/null || exit 99

  # explode original file into individual pages
  if [ "$QUIET" = 'n' ]; then
    echo 'pdfbw4 message:  -BEGIN-'
    echo "pdfbw4 message: splitting $1 into pages..."
  fi
  pdfseparate "$INPUTFILE" page-%06d.pdf
  PAGES=(page*.pdf)
  if [ ! -f "${PAGES[0]}" ]; then
    # nothing was extracted: the glob stayed literal, so skip this file
    echo "pdfbw4 error: no pages could be extracted from ${1}"
    rm -f page*
    popd > /dev/null
    shift; continue
  fi

  # convert pages into PNG images
  if [ "$QUIET" = 'n' ]; then
    echo 'pdfbw4 message: converting pages to PNG...'
  fi
  if [ "$USE_PARALLEL" = 'n' ]; then
    for PAGE in "${PAGES[@]}" ; do
      pdfimages -png "$PAGE" "${PAGE%.pdf}"
    done
  else
    parallel -s 10000 --plus 'pdfimages -png {} {%.pdf}' ::: "${PAGES[@]}"
  fi

  # create quantized versions of PNG images
  if [ "$QUIET" = 'n' ]; then
    echo 'pdfbw4 message: converting PNG pages to G4 TIFF...'
  fi
  for PNG in page*.png ; do
    # an unmatched glob is left as the literal pattern; do not process it
    [ -f "$PNG" ] || continue
    DPI=$(pdfbw4_png_dpi "$PNG")
    TIFF="${PNG%.png}${SUFFIX}.tiff"
    # THRESHOLD is deliberately unquoted: it holds zero or two separate options
    # shellcheck disable=SC2086
    pngtopam "$PNG" | ppmtopgm | pamthreshold -quiet $THRESHOLD \
      | pamtotiff -miniswhite -xresolution="$DPI" -yresolution="$DPI" -resolutionunit=inch -g4 > "$TIFF"
  done

  # reassemble TIFF images into new PDF
  if [ "$QUIET" = 'n' ]; then
    echo 'pdfbw4 message: converting G4 TIFF pages to PDF...'
  fi
  TIFFS=(page*"${SUFFIX}".tiff)
  if [ -f "${TIFFS[0]}" ]; then
    img2pdf -D --engine=internal -o "${INPUTFILE%.pdf}${SUFFIX}.pdf" "${TIFFS[@]}"
  else
    echo "pdfbw4 error: no images were found in ${1}"
  fi

  # clean up afterwards
  if [ "$QUIET" = 'n' ]; then
    echo 'pdfbw4 message:  -END-'
    echo ''
  fi
  rm -f page*
  popd > /dev/null
  shift

done
rmdir "$TMPFOLDER"
