PK!ܔGwwdiff_pdf_visually/__init__.pyfrom typing import List __all__ = [] # type: List[str] from .diff import pdfdiff from .diff import pdftopng, imgdiff PK! ? diff_pdf_visually/__main__.pyimport argparse, sys from . import pdfdiff from .constants import DEFAULT_THRESHOLD, DEFAULT_VERBOSITY, DEFAULT_DPI from .constants import MAX_VERBOSITY, VERB_WARN_SANITY def main(): description = """ Compare two PDFs visually. The exit code is 0 if they are the same, and 2 if there are significant differences. """.strip() parser = argparse.ArgumentParser(description=description) verbosity = DEFAULT_VERBOSITY def more_silent(): assert verbosity <= DEFAULT_VERBOSITY, "cannot be both silent and verbose" verbosity -= 1 def more_verbose(): assert verbosity >= DEFAULT_VERBOSITY, "cannot be both silent and verbose" verbosity += 1 parser.add_argument('a', metavar='a.pdf') parser.add_argument('b', metavar='b.pdf') parser.add_argument('--silent', '-q', action='count', default=0, help="silence output (can be used only once); the result can be found " "in the exit code") assert DEFAULT_VERBOSITY==1 parser.add_argument('--verbose', '-v', action='count', default=0, help="show more information (can be used {} times)".format( MAX_VERBOSITY - DEFAULT_VERBOSITY)) parser.add_argument('--threshold', default=DEFAULT_THRESHOLD, type=float, help="PSNR threshold to consider a change significant, " "higher is more sensitive (default: %(default)s)") parser.add_argument('--dpi', default=DEFAULT_DPI, type=int, help="resolution for the rasterised files (default: %(default)s)") parser.add_argument('--time', default=0, type=int, help="number of seconds to wait before discarding temporary files, " "or 0 to immediately discard (hint: use -v)") args = parser.parse_args() assert args.silent == 0 or args.verbose == 0, "cannot be silent and verbose" assert 1 <= args.dpi verbosity = DEFAULT_VERBOSITY + args.verbose - args.silent if verbosity >= VERB_WARN_SANITY: if not args.a[-4:].lower() == ".pdf": print("Warning: {!r} does not end in .pdf.".format(args.a)) if not args.b[-4:].lower() == ".pdf": print("Warning: {!r} does not end in .pdf.".format(args.b)) if pdfdiff(args.a, args.b, verbosity=verbosity, threshold=args.threshold, dpi=args.dpi, time_to_inspect=args.time): sys.exit(0) else: sys.exit(2) if __name__ == '__main__': main() PK![33diff_pdf_visually/constants.pyINFINITY = float('inf') # Default threshold: lower means to ignore more. DEFAULT_THRESHOLD = 100 # Default verbosity. Zero means quiet. DEFAULT_VERBOSITY = 1 MAX_VERBOSITY = 3 # Resolution (in dots per inch) in which to render pages DEFAULT_DPI = 50 # Default number of parallel threads DEFAULT_NUM_THREADS = 2 # Minimum verbosity for printing what the result is, and why VERB_PRINT_REASON=1 # Minimum verbosity for doing sanity warnings VERB_WARN_SANITY=1 # Minimum verbosity for printing what the temporary directory is VERB_PRINT_TMPDIR=1 # Minimum verbosity for printing rough progress VERB_ROUGH_PROGRESS=1 # Minimum verbosity for printing the significance for each page VERB_PERPAGE=2 # Minimum verbosity for printing commands VERB_PRINT_CMD=3 # Maximum number of differing pages to report MAX_REPORT_PAGENOS=5PK!1<<  diff_pdf_visually/diff.py#!/usr/bin/env python3 """ Test if there is a significant difference between two PDFs using ImageMagick and pdftocairo. """ INFINITY = float('inf') import os.path, pathlib, subprocess, sys, tempfile, time from concurrent.futures import ThreadPoolExecutor from .constants import DEFAULT_THRESHOLD, DEFAULT_VERBOSITY, DEFAULT_DPI from .constants import VERB_PRINT_REASON, VERB_PRINT_TMPDIR from .constants import VERB_PERPAGE, VERB_PRINT_CMD, VERB_ROUGH_PROGRESS from .constants import DEFAULT_NUM_THREADS, MAX_REPORT_PAGENOS def pdftopng(sourcepath, destdir, basename, verbosity, dpi): """ Invoke pdftocairo to convert the given PDF path to a PNG per page. Return a list of page numbers (as strings). """ if [] != list(destdir.glob(basename + '*')): raise ValueError("destdir not clean: " + repr(destdir)) verbose_run((verbosity > VERB_PRINT_CMD), [ 'pdftocairo', '-png', '-r', str(dpi), str(sourcepath), str(destdir / basename) ], stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True, ) # list of strings with decimals numbers = sorted(path.name for path in destdir.glob(basename + '*' + '.png')) return [s[len(basename)+1:-4] for s in numbers] # returns a float, which can be inf def imgdiff(a, b, diff, log, print_cmds): assert a.is_file() assert b.is_file() assert not diff.exists() assert not log.exists() with log.open('wb') as f: cmdresult = verbose_run(print_cmds, [ 'compare', '-verbose', '-metric', 'PSNR', str(a), str(b), str(diff), ], stdout=f, stderr=subprocess.STDOUT, ) if cmdresult.returncode > 1: raise ValueError("compare crashed, status="+str(cmdresult.returncode)) with log.open('r') as f: lines = f.readlines() if any('image widths or heights differ' in l for l in lines): raise ValueError("image widths or heights differ") PREF=' all: ' all_line = [l for l in lines if l.startswith(PREF)] assert len(all_line) == 1 all_str = all_line[0][len(PREF):].strip() all_num = INFINITY if all_str == '0' else float(all_str) return all_num def pdfdiff(a, b, threshold=DEFAULT_THRESHOLD, verbosity=DEFAULT_VERBOSITY, dpi=DEFAULT_DPI, time_to_inspect=0, num_threads=DEFAULT_NUM_THREADS, max_report_pagenos=MAX_REPORT_PAGENOS): """ Return True if the PDFs are sufficiently similar. The name of this function is slightly confusing: it returns whether the PDFs are *not* different. """ assert os.path.isfile(a), "file {} must exist".format(a) assert os.path.isfile(b), "file {} must exist".format(b) with tempfile.TemporaryDirectory(prefix="diffpdf") as d: p = pathlib.Path(d) if verbosity >= VERB_PRINT_TMPDIR: print(" Temporary directory: {}".format(p)) if verbosity >= VERB_ROUGH_PROGRESS: print(" Converting each page of the PDFs to an image...") # expand pdfs to pngs with ThreadPoolExecutor(max_workers=num_threads) as pool: a_i_ = pool.submit(pdftopng, a, p, "a", verbosity=verbosity, dpi=dpi) b_i_ = pool.submit(pdftopng, b, p, "b", verbosity=verbosity, dpi=dpi) # Wait for results a_i = a_i_.result() b_i = b_i_.result() if a_i != b_i: assert len(a_i) != len(b_i), "mishap with weird page numbers: {} vs {}".format(a_i, b_i) if verbosity >= VERB_PRINT_REASON: print("Different number of pages: {} vs {}".format(len(a_i), len(b_i))) return False assert len(a_i) > 0 if verbosity >= VERB_ROUGH_PROGRESS: print(" PDFs have same number of pages. Checking each pair of converted images...") significances = [] for pageno in a_i: # remember pageno is a string pageapath = p / "a-{}.png".format(pageno) pagebpath = p / "b-{}.png".format(pageno) diffpath = p / "diff-{}.png".format(pageno) logpath = p / "log-{}.txt".format(pageno) s = imgdiff(pageapath, pagebpath, diffpath, logpath, (verbosity > VERB_PRINT_CMD)) if verbosity >= VERB_PERPAGE: print("- Page {}: significance={}".format(pageno, s)) significances.append(s) min_significance = min(significances, default=INFINITY) significant = (min_significance <= threshold) largest_significances = sorted( (sgf, pageno_minus_one+1) for (pageno_minus_one, sgf) in enumerate(significances) if sgf < INFINITY ) if verbosity >= VERB_PRINT_REASON: freetext = "different" if significant else "the same" print("Min sig = {}, significant?={}. The PDFs are {}.{}".format( min_significance, significant, freetext, '' if largest_significances == [] else " The most different pages are: {}.".format(', '.join( "page {} (sgf. {})".format(pageno, sgf) for (sgf, pageno) in largest_significances[:max_report_pagenos] )) ) ) if time_to_inspect > 0: print( "Waiting for {} seconds before removing temporary directory..." .format(time_to_inspect), end='', flush=True ) time.sleep(time_to_inspect) print(" done.") return not significant def verbose_run(print_cmd, args, *restargs, **kw): if print_cmd: print(" Running: {}".format(' '.join(args)), file=sys.stderr) return subprocess.run(args, *restargs, **kw) PK!H+;E2diff_pdf_visually-1.3.1.dist-info/entry_points.txtN+I/N.,()JLK-HI-,.MɩEa"V񹉙yz PK!HW"TT'diff_pdf_visually-1.3.1.dist-info/WHEEL A н#J."jm)Afb~ ڡ5 G7hiޅF4+-3ڦ/̖?XPK!HA\51*diff_pdf_visually-1.3.1.dist-info/METADATAXr6ϧxw&G;h{l4it:D$j`P6}>E_ls@JTd7ө~4wsLzӦQJj&2qZNV6mz\bۼki3D8].bS*_*+K ƢѹNeYT7F\=ss)u- R:&ښ$znjXͦB[%BYLBq8z=|iLZ|K'WfbWr%zu*l q|T ?5W9Ob|Eg-\3 #{$<#TEJ%Lm(Mܺ>3/m^HZgbl$;?WEb -Uz\0T% G!$bnfQKA8Lo k:dp 9gg`cUvvܼ[yQZ}HnDk&M=0V·ɃLyA;DMANx*;^ma߀$/ց$Y Б4 ieL/+Nîo_zI AuTMƮZKt%-wXo%{1D-ҋl`~עk[e57 & yp~5Zol#M|~)@Ǐv8owЇEɣ]E@ бyE"=^]vnGlbob,(#*{*Qk\l#{) @8Hœ/e#Hl@uMtxnK;lfLHTLz V <1Ԍ0XO]ֿ7wqba;ELVho<= O&dew}TUtU;.ǔ#Q{`sanِco]E3`rsK '*X{%)Bݼ6=DS0 N3v@E(A: $rqѺ;`Vw ^mlErCr|ǃO=@.ՖZ߅žL{zinDr dzilW"H$F;wL B@TuDOCLKpp% bFB+&Ep _7k2t j7XqzJ@5NC7A@0"/PK!H(diff_pdf_visually-1.3.1.dist-info/RECORD=s@> ˇ@q $"YY_&E2"[\=o,j,0:?G&E334G6Xb) í5noDV*W5<@+tׂ{֗M>E;kËǒw!N;n1$>RjB;D:m`W{}^6F\RxCYOXrۨrmQ~\yI"h Rf+@*IQ" ,L;~NIQSn>.N +V(7˓8m,Jׯ4>|kbO|IdyL,>n}zgU$0պ~e\vˮSnVHl+/T3I~<PK!ܔGwwdiff_pdf_visually/__init__.pyPK! ? diff_pdf_visually/__main__.pyPK![33 diff_pdf_visually/constants.pyPK!1<<  Qdiff_pdf_visually/diff.pyPK!H+;E2%diff_pdf_visually-1.3.1.dist-info/entry_points.txtPK!HW"TT'&diff_pdf_visually-1.3.1.dist-info/WHEELPK!HA\51*&diff_pdf_visually-1.3.1.dist-info/METADATAPK!H(//diff_pdf_visually-1.3.1.dist-info/RECORDPK 1