#! /usr/bin/python3
# vim: set filetype=python:

# filehash: print various hash digests and filesizes for specified files

# Copyright (C) 2004-2026 by Brian Lindholm.  This file is part of the
# littleutils utility set.
#
# The filehash utility is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 3, or (at your option) any later version.
#
# The filehash utility is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
# more details.
#
# You should have received a copy of the GNU General Public License along with
# the littleutils.  If not, see <https://www.gnu.org/licenses/>.

import base64, concurrent.futures, getopt, hashlib, os, signal, subprocess, sys

### PREP SIGNAL HANDLER ###
# flag that the rest of the script polls; it flips to True once any of the
# signals registered below has been delivered
interrupted = False


def handler(signum, frame):
    """Note that a signal arrived by setting the module-level flag.

    Both arguments are the ones the signal module passes to a handler; neither
    is used here.
    """
    global interrupted
    interrupted = True


# route each of these signals through handler() instead of the default action
for signal_VAL in (
    signal.SIGHUP,
    signal.SIGINT,
    signal.SIGPIPE,
    signal.SIGQUIT,
    signal.SIGTERM,
):
    signal.signal(signal_VAL, handler)

### GET INPUT ARGUMENTS ###
# print online help
def usage(rc: int) -> None:
    """Print the version and option summary on stdout, then exit.

    rc is handed to sys.exit() as the process exit status, so this function
    never returns to its caller.
    """
    print('filehash 1.4.0')
    print('usage: filehash [-1(MD5)] [-2(SHA1)] [-3(SHA224)] [-4(SHA256)]')
    print('         [-5(SHA384)] [-6(SHA512)] [-7(BLAKE2B_256)] [-8(BLAKE2B_512)]')
    print('         [-b(ase64url)] [-B(ase64)] [-c(lassic)] [-C(lassic_BSD)]')
    print('         [-f filelist] [-h(elp)] [-n byte_count] [-o offset] [-p(ipe)]')
    print('         [-q(uiet)] [-s(ize)] [-T threads] [-v(erbose)] filename ...')
    sys.exit(rc)
# load list of files
def load_list_from_file() -> None:
    """Append every line of the list file named by opt_f to filelist.

    Each line of the file is taken as one filename (line endings removed).
    Prints an error on stderr and exits with status 1 when opt_f does not name
    an existing regular file or when that file cannot be opened.
    """
    if not os.path.isfile(opt_f):  # abort if file does not exist
        print('filehash error: file list %s does not exist' % opt_f, file=sys.stderr)
        sys.exit(1)
    try:
        FILE = open(opt_f, 'r')
    except OSError:  # abort if file cannot be opened for read
        print('filehash error: file list %s cannot be opened' % opt_f, file=sys.stderr)
        sys.exit(1)
    # the with-block closes the list file even if reading it fails
    with FILE:
        filelist.extend(FILE.read().splitlines())
# load list of files from stdin
def load_list_from_stdin() -> None:
    """Read all of standard input and append each line to filelist.

    Standard input is closed once it has been consumed.
    """
    piped_names = sys.stdin.read().splitlines()
    filelist.extend(piped_names)
    sys.stdin.close()
# set defaults
filelist = []
requested_digests = set()
output_format = 'standard'
digest_format = 'hexadecimal'
verbosity = 'default'
opt_f = None   # file containing list of files to process
opt_p = False  # read list of files to process from stdin
opt_T = 1      # requested thread-count
opt_n = None   # read only the number of bytes from the specified offset
opt_o = 0      # start reading bytes at the specified offset
opt_s = False  # print size
opt_q = False  # be quiet
opt_v = False  # be verbose
# convert the argument of a numeric option, exiting with a message (rather
# than a traceback) when it is not an integer or is below the allowed minimum
def _int_option(flag: str, text: str, minimum=None) -> int:
    try:
        value = int(text)
    except ValueError:
        print('filehash error: option %s needs an integer argument, not "%s"' % (flag, text), file=sys.stderr)
        sys.exit(1)
    if (minimum is not None) and (value < minimum):
        print('filehash error: option %s must be at least %d' % (flag, minimum), file=sys.stderr)
        sys.exit(1)
    return value
# get command-line options ('q' is part of the option string so that the
# documented -q switch is accepted)
try:
    opts, filelist = getopt.getopt(sys.argv[1:], '12345678bBcCf:hn:o:pqsT:v', ['help'])
except getopt.error as msg:
    # print help if bad opts used, then quit
    print(msg)
    usage(1)
# parse options
for o, v in opts:
    if o in ('-h', '--help'): usage(0)
    elif o in ('-1', '-2', '-3', '-4', '-5', '-6', '-7', '-8'):
        # the digit doubles as the index into the digest table
        requested_digests.add(int(o[1]))
    elif o == '-b': digest_format = 'base64url'
    elif o == '-B': digest_format = 'base64'
    elif o == '-c': output_format = 'classic'
    elif o == '-C': output_format = 'BSD-style'
    elif o == '-f': opt_f = str(v)
    elif o == '-n': opt_n = _int_option(o, v, 0)
    elif o == '-o': opt_o = _int_option(o, v, 0)
    elif o == '-p': opt_p = True
    elif o == '-q':
        # keep the opt_q/opt_v flags in step with verbosity; the last of
        # -q / -v on the command line wins
        verbosity = 'quiet'
        opt_q, opt_v = True, False
    elif o == '-s': opt_s = True
    elif o == '-T': opt_T = _int_option(o, v)
    elif o == '-v':
        verbosity = 'verbose'
        opt_q, opt_v = False, True
# use SHA256 if no digest is specified
if not requested_digests: requested_digests.add(4)
# load file list from file and/or stdin if requested
if opt_f is not None: load_list_from_file()
if opt_p: load_list_from_stdin()
# make sure we have at least one file to process
if not filelist:
    if (not opt_f) and (not opt_p): usage(1)
    sys.exit(0)
# trim list to unique items, keeping the first occurrence of each name
unique_filelist = list(dict.fromkeys(filelist))
# set verbosity if necessary
if verbosity == 'default':
    verbosity = 'quiet' if len(filelist) == 1 else 'verbose'

### MAIN PROGRAM ###
# digest names indexed by digest number; slot 0 is a placeholder so that the
# numbers 1-8 can be used directly as indices
digest_name = (
    'none',
    'md5',
    'sha1',
    'sha224',
    'sha256',
    'sha384',
    'sha512',
    'blake2b-256',
    'blake2b',
)
# number of bytes requested per read while hashing (1 MiB)
bufsize = 1 << 20
# output lines per filename, filled in by process_file()
results = dict()
# precompute digests for zero-length files
zero_digest = dict()
def compute_zero_byte_digests() -> None:
    """Store the digest of empty input in zero_digest for each requested digest.

    Digest numbers 7 and 8 are BLAKE2b with 32-byte and 64-byte output; every
    other number is created through hashlib.new() using its digest_name entry.
    """
    for idx in requested_digests:
        if idx in (7, 8):
            hasher = hashlib.blake2b(digest_size=32 if idx == 7 else 64)
        else:
            hasher = hashlib.new(digest_name[idx])
        # no update() call: this is the digest of zero bytes
        zero_digest[idx] = hasher.digest()
# convert a digest to ASCII
def convert_to_ascii(digest: bytes) -> str:
    """Render a raw digest as text according to digest_format.

    'base64url' and 'base64' give the matching base64 alphabet with trailing
    '=' padding removed; any other setting gives lowercase hexadecimal.
    """
    encoders = {
        'base64url': base64.urlsafe_b64encode,
        'base64': base64.b64encode,
    }
    encode = encoders.get(digest_format)
    if encode is None:
        return digest.hex()
    return encode(digest).decode().rstrip('=')
# compute digests for the files
def process_file(filename: str) -> None:
    """Hash one file and record its formatted output lines in results.

    Reads at most opt_n bytes (or everything up to the end of the file when
    opt_n is None) starting at offset opt_o, feeds them to every requested
    digest, and stores the output lines under results[filename] in the layout
    selected by output_format.  The lines are also printed right away when
    running single-threaded (opt_T < 2).

    Nothing is recorded when the interrupted flag is set, when filename is not
    a regular file, when the file cannot be opened, or when the file turns out
    to be shorter than the size measured before reading.
    """
    if interrupted: return
    # abort if file does not exist
    if not os.path.isfile(filename):
        if not opt_q: print('filehash message: skipping non-file %s' % filename, file=sys.stderr)
        return
    # grab filesize
    filesize = os.path.getsize(filename)
    # compute the digests
    file_digest = {}
    if (filesize != 0) and (opt_o < filesize):
        # open the file
        try:
            FILE = open(filename, 'rb', buffering=0)
        except OSError:
            print('filehash error: %s cannot be opened for read' % filename, file=sys.stderr)
            return
        # the with-block closes the file on every exit path, including the
        # early returns below
        with FILE:
            bytes_to_read = (filesize - opt_o) if opt_n is None else min(opt_n, filesize - opt_o)
            # advisory hint to the kernel about the byte range; not every
            # platform provides posix_fadvise, so only call it when present
            if hasattr(os, 'posix_fadvise'):
                os.posix_fadvise(FILE.fileno(), opt_o, bytes_to_read, os.POSIX_FADV_DONTNEED)
            # initialize digest objects
            hashers = {}
            for digest_idx in requested_digests:
                if digest_idx == 8: hashers[digest_idx] = hashlib.blake2b(digest_size=64)
                elif digest_idx == 7: hashers[digest_idx] = hashlib.blake2b(digest_size=32)
                else: hashers[digest_idx] = hashlib.new(digest_name[digest_idx])
            # feed the requested byte range to the digests, one buffer at a time
            FILE.seek(opt_o)
            while bytes_to_read > 0:
                data = FILE.read(min(bufsize, bytes_to_read))
                if not data:
                    # end of file came earlier than the measured size allows:
                    # give up on this file instead of looping forever
                    print('filehash error: %s shrank while being read' % filename, file=sys.stderr)
                    return
                for hasher in hashers.values(): hasher.update(data)
                bytes_to_read -= len(data)
                if interrupted: return
        for digest_idx, hasher in hashers.items(): file_digest[digest_idx] = hasher.digest()
    else:
        # empty file, or offset at/after the end: use the precomputed digests
        for digest_idx in requested_digests: file_digest[digest_idx] = zero_digest[digest_idx]
    # gather the digests in ascending digest-number order
    ordered = [idx for idx in range(1, len(digest_name)) if idx in requested_digests]
    lines = []
    if output_format == 'standard':
        # one tab-separated line: [filename] [size] digest ...
        fields = [filename] if verbosity == 'verbose' else []
        if opt_s: fields.append(str(filesize))
        fields.extend(convert_to_ascii(file_digest[idx]) for idx in ordered)
        lines.append('\t'.join(fields))
    elif output_format == 'classic':
        if opt_s: lines.append(str(filesize) + '  ' + filename)
        for idx in ordered:
            lines.append(convert_to_ascii(file_digest[idx]) + '  ' + filename)
    else:  # assume BSD-style
        if opt_s: lines.append('SIZE (' + filename + ') = ' + str(filesize))
        for idx in ordered:
            digest_BSD = digest_name[idx].upper().replace('BLAKE2B', 'BLAKE2b')
            lines.append(digest_BSD + ' (' + filename + ') = ' + convert_to_ascii(file_digest[idx]))
    results[filename] = lines
    # print the results here if single-threaded
    if opt_T < 2:
        for line in lines: print(line)

# process files
compute_zero_byte_digests()
if opt_T < 2:
    # single-threaded: process_file() prints each file's lines itself
    for filename in unique_filelist:
        if interrupted: break
        process_file(filename)
else:
    # one task per unique file; leaving the with-block waits for all of them
    with concurrent.futures.ThreadPoolExecutor(max_workers=opt_T) as executor:
        pending = [(filename, executor.submit(process_file, filename)) for filename in unique_filelist]
    # print the results here if multi-threaded (to avoid stdout overlap);
    # walking the unique list keeps a name given twice from being printed twice
    for filename, future in pending:
        error = future.exception()
        if error is not None:
            # report a failed worker instead of silently dropping its file
            print('filehash error: %s could not be processed: %s' % (filename, error), file=sys.stderr)
            continue
        for line in results.get(filename, ()): print(line)
