PK!WSioLICENSECopyright (c) 2019 Mark Gemmill. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PK!nj README.md### pdftxt The goal of this project is to provide an api to extract text from specific regions of a pdf document/page and a cli to assist identifying the location of text within a document. ### Installation ... pip install pdftxt ### Basic Command Line Usage Let's say we have a PDF file (PDF-DOC.pdf) that looks like this: ![Source File Image](https://bytebucket.org/mgemmill/pdftxt/raw/36ef6c80f953ac5d4eae712d5c7943c23e8914bc/assets/readme_src_doc_.jpg) The `pdftxt` command: ... pdftxt PDF-DOC.pdf Will output a visual layout of the pdf document's pages and text elements to an html page: ![Output File Image](https://bytebucket.org/mgemmill/pdftxt/raw/36ef6c80f953ac5d4eae712d5c7943c23e8914bc/assets/readme_output_doc_.jpg) ### API Usage from pathlib import Path from pdftxt import api filepath = 'tests/Word_PDF.pdf' with api.PdfTxtContext(filepath) as pdf: for page in pdf: # To fetch text objects from specific region # of the page, first define the region: region = api.Region(400, 300, 512, 317) # Initialize layout parameters: params = api.PdfTxtParams() # Then analyze that area of the page for text objects: text = page.analyze(region, params) # Do whatever it is we need to do with the results: for txt in text: print(txt.text) PK!ތm22pdftxt/__init__.py__version__ = "0.3.2" __author__ = "Mark Gemmill" PK!dg pdftxt/api.pyfrom pathlib import Path from .miner import PdfTxtParams, PdfTxtDocument, PdfTxtLayoutAnalyzer from .region import Region __all__ = ["PdfTxtParams", "PdfTxtText", "Region"] class PDFObject: """ Base PDF object wrapper exposes common object position info. """ def __init__(self, obj): self.obj = obj def __getattr__(self, name): return getattr(self.obj, name) @property def media_box(self): return self.obj.bbox class PdfTxtText(PDFObject): """ Wraps up LTTextBox object. """ def __init__(self, parent, text_obj): super(PdfTxtText, self).__init__(text_obj) self.page = parent if isinstance(parent, PdfTxtPage) else PdfTxtPage(parent) self._reset = 0 @property def sort_key(self): return (self.y0, self.x0, self.y1, self.x1) @property def text(self): return self.obj.get_text().strip() def __str__(self): return self.text def __repr__(self): return f"<{self.__class__.__name__} ({self.y0:03.2f}, {self.x0:03.2f}) '{self.text}'>" class PdfTxtPage(PDFObject): """ Wraps up Page object. """ def __init__(self, page_obj): super(PdfTxtPage, self).__init__(page_obj) self.text = [] def add(self, text_obj): self.text.append(PdfTxtText(self, text_obj)) def resort_text(self): for x in self.text: x.reverse_y_axis() self.text.sort(key=lambda x: x.sort_key) class PdfTxtContext: """ Wraps PdfMiner in a context manager """ def __init__(self, pdf_doc, pdf_pwd=""): self.pdf_doc = Path(pdf_doc) self.pdf_pwd = pdf_pwd def __enter__(self): self.fp = fp = self.pdf_doc.open(mode="rb") # pylint: disable=W0201 self.doc = PdfTxtDocument(fp, passwd=self.pdf_pwd) # pylint: disable=W0201 self.doc.assert_extractable() return iter(self._parse_pages()) def _parse_pages(self): device = PdfTxtLayoutAnalyzer() interpreter = device.create_interpreter() for page in self.doc.create_pages(): interpreter.process_page(page) yield device.get_result() def __exit__(self, _type, value, traceback): self.fp.close() PK!; pdftxt/cli.py""" pdftxt Usage: pdftxt [--region=] [--pages=] [--analyze-grid | --analyze-rows] [--char-margin=] [--line-margin=] [--line-overlap=] [--word-margin=] [--box-flow=] [--collapse-lines] [--column-boundaries=] [--open-output] [--debug] [] Options: --line-overlap= Line Overlap [Default: 0.5] --line-margin= Line Margin. [Default: 0.5] --char-margin= Character Margin. [Default: 2.0] --word-margin= Word Margin. [Default: 0.1] --box-flow= Box Flow. [Default: 0.5] --collapse-lines Analyze lines of text as if they had no height. --region= Region to assess. [Default: (0,0,0,0)] --pages= Which pages to analyze. [Default: 1+] --analyze-grid Parse page region as a grid. --column-boundaries= Pass a list of custom column boundaries to use for grid analysis. --analyze-rows Parse page region as a rows. --open-output Open the html file after it is generated. --debug Run in debug mode. -h --help Show this screen. --version Show version. """ import sys from datetime import datetime from pathlib import Path from docopt import docopt from .region import fetch_region from .api import PdfTxtParams, PdfTxtContext from .util import parse_float_list, parse_page_notation from . import __version__ def fetch_args(): # pylint: disable=R0902,R0903 _arg = docopt(__doc__, argv=sys.argv[1:], version=f"pdft v{__version__}") class Namespace: def get(self, name): return self.args.get(name) args = Namespace() args.debug = _arg.get("--debug", False) if args.debug is True: print(_arg) args.args = _arg args.pdf_doc = Path(_arg[""]) args.output_doc = output_doc = _arg.get("", None) args.point_measure = "point" args.accept_page = parse_page_notation(_arg.get("--pages", "1+")) args.region = _arg.get("--region") args.analyze_grid = _arg.get("--analyze-grid", False) args.analyze_rows = _arg.get("--analyze-rows", False) args.expand_bbox = int(_arg.get("--expand-bbox", 0)) args.line_overlap = float(_arg["--line-overlap"]) args.char_margin = float(_arg["--char-margin"]) args.line_margin = float(_arg["--line-margin"]) args.word_margin = float(_arg["--word-margin"]) args.boxes_flow = float(_arg["--box-flow"]) args.collapse_lines = _arg.get("--collapse-lines", False) args.column_boundaries = parse_float_list(_arg.get("--column-boundaries")) if output_doc: args.output_file = Path(output_doc) else: args.output_file = generate_default_html_filename(args) return args def generate_default_html_filename(args): identifier = "" timestamp = f"-{datetime.now():%Y-%m-%d-%H%M%S}" if args.analyze_grid: identifier = "-grid" if args.analyze_rows: identifier = "-rows" file_name = args.pdf_doc.stem + identifier + timestamp + ".html" return args.pdf_doc.parent / file_name def html(args): from .html import HTMLDocument accept_page = args.accept_page with PdfTxtContext(args.pdf_doc) as pdf: html_doc = HTMLDocument(args.pdf_doc.name) for page in pdf: if not accept_page(page.pageid): continue region = fetch_region(page, args.region) if args.debug: print(f"region: {region}") analyze = page.analyze if args.analyze_grid: analyze = page.analyze_grid elif args.analyze_rows: analyze = page.analyze_rows params = PdfTxtParams( line_overlap=args.line_overlap, char_margin=args.char_margin, line_margin=args.line_margin, word_margin=args.word_margin, boxes_flow=args.boxes_flow, collapse_lines=args.collapse_lines, column_boundaries=args.column_boundaries, ) text = analyze(region=region, layout_params=params) html_doc.start_page(page, region, params) for txt in text: html_doc.add_text_block(txt) html_doc.end_page() html_doc.save_to(args.output_file) if args.get("--open-output"): import subprocess cmd = "open" subprocess.call([cmd, str(args.output_file)]) def main(): args = fetch_args() html(args) PK!epdftxt/grid.py""" grid.py provides functions used to analyze PDF characters grid arrangement. """ from math import floor, ceil from statistics import median from functools import partial from .util import character_in_region def iter_lanes(points): """ Loop through collection of numbers (points) and yield the sequential groupings. Any gap in the sequence will produce a new grouping for example this sequence: [1, 2, 3, 7, 8, 9, 12, 13, 14] with yield: ((1, 2, 3), (7, 8, 9), (12, 13, 14)) """ last = -5 lane = [] for pt in points: if pt == last + 1: lane.append(pt) else: if lane: yield lane lane = [pt] last = pt yield lane def get_attr(attr_name): def _getattr(obj): return getattr(obj, attr_name) return _getattr pt_x0 = get_attr("x0") pt_x1 = get_attr("x1") pt_y0 = get_attr("y0") pt_y1 = get_attr("y1") def fetch_cell_boundaries( characters, start_outer_bound, end_outer_bound, fetch_char_start=None, fetch_char_end=None, ): # collect all points along the range all_points = set(range(floor(start_outer_bound), ceil(end_outer_bound) + 1)) char_points = set() # fetch all horizontal (x) points occupied by charcters: for c in characters: for pt in range(floor(fetch_char_start(c)), ceil(fetch_char_end(c) + 1)): char_points.add(pt) # fetch all horizontal (x) points NOT occupied by characters: boundary_points = all_points - char_points # find the center of each column: return sorted([median(c) for c in iter_lanes(sorted(boundary_points))]) fetch_column_boundaries = partial( fetch_cell_boundaries, fetch_char_start=pt_x0, fetch_char_end=pt_x1 ) fetch_row_boundaries = partial( fetch_cell_boundaries, fetch_char_start=pt_y0, fetch_char_end=pt_y1 ) def iter_cell_bounds(grid_boundaries): for i in range(0, len(grid_boundaries) - 1): yield (grid_boundaries[i], grid_boundaries[i + 1]) class CollapsedCharacter: """ When rows of text are too close together, or their bounding boxes overlap it is not possible, with either PdfMiner's row grouping, or by our row boundary logic, to easily separate those rows. This proxy class of PdfMiner's Char object reduces the height of a row down to middle pt of the characters height. This will allow us to effectively isolate rows in these instances. """ def __init__(self, character): self._char = character # colapse the x and y coordinates of character to the # center of the character height and width. Essentially # the character will now be a point 1 pt. x_mid = (character.x1 - character.x0) / 2 self._char_x0 = character.x0 + floor(x_mid) self._char_x1 = character.x0 + ceil(x_mid) y_mid = (character.y1 - character.y0) / 2 self._char_y0 = character.y0 + floor(y_mid) self._char_y1 = character.y0 + ceil(y_mid) def __getattr__(self, name): return getattr(self._char, name) @property def uncollapsed(self): return self._char @property def x0(self): return self._char_x0 @property def x1(self): return self._char_x1 @property def y0(self): return self._char_y0 @property def y1(self): return self._char_y1 def __repr__(self): return f'' @staticmethod def collapse(characters): for i in range(0, len(characters)): # pylint: disable=consider-using-enumerate characters[i] = CollapsedCharacter(characters[i]) @staticmethod def uncollapse(characters): for i in range(0, len(characters)): # pylint: disable=consider-using-enumerate characters[i] = characters[i].uncollapsed @staticmethod def get_uncollapsed(characters): new = [] for c in characters: if isinstance(c, CollapsedCharacter): new.append(c.uncollapsed) else: new.append(c) return new class Cell: def __init__(self, x0, y0, x1, y1): self.x0 = x0 self.y0 = y0 self.x1 = x1 self.y1 = y1 self.characters = [] self._text = "" @property def region(self): return (self.x0, self.y0, self.x1, self.y1) def __contains__(self, c): return character_in_region(c, self.x0, self.y0, self.x1, self.y1) def chars_to_str(self): return "".join([c.get_text() for c in self.characters]) @property def text(self): return "".join([t.get_text() for t in self._text]) @text.setter def text(self, value): self._text = value def get_text(self): return self.text @property def width(self): return self.x1 - self.x0 @property def height(self): return self.y1 - self.y0 def __str__(self): return self.text def __repr__(self): txt = self.text if len(txt) > 20: txt = txt[:20] + "..." return f'' class Table: def __init__(self): self.rows = [] self.current_row = None def add_row(self): self.current_row = new_row = [] self.rows.append(new_row) return new_row def add_cell(self, cell): self.current_row.append(cell) def __getitem__(self, key): return self.rows[key] def __iter__(self): for row in self.rows: for cell in row: yield cell def fetch_table_cells(row_boundaries, column_boundaries): """ Returns a list of row/column bounding boxes (x0, y0, x1, y1). """ table = Table() for y0, y1 in iter_cell_bounds(row_boundaries): table.add_row() for x0, x1 in iter_cell_bounds(column_boundaries): table.add_cell(Cell(x0, y0, x1, y1)) return table def fetch_row_cells(row_boundaries, x0, x1): rows = [] for y0, y1 in iter_cell_bounds(row_boundaries): rows.append(Cell(x0, y0, x1, y1)) return rows def table_cells_to_regions(table): """ Transform from a table of Cell objects to a table of cell region coordinates. This is primarily for testing purposes. """ _table = [] for row in table.rows: _row = [] for cell in row: _row.append(cell.region) _table.append(_row) return _table def allocate_characters_to_table(characters, table): """ Apply characters to individual Cell objects they fall within on the page. """ for char in characters: for cell in table: if char in cell: cell.characters.append(char) PK!lpdftxt/html.pyHTML_HEADER = """ PDF Text Outline
""" HTML_FOOTER = """
""" HTML_TABLE = """
Page Width: {page_width: 7.2f}pt Line Overlap: {line_overlap: 7.2f}
Page Height: {page_height: 7.2f}pt Character Margin: {char_margin: 7.2f}
Selected Region Line Margin: {line_margin: 7.2f}
x0: {region.x0: 7.2f}pt Word Margin: {word_margin: 7.2f}
y0: {region.y0: 7.2f}pt Box Flow: {boxes_flow: 7.2f}
x1: {region.x1: 7.2f}pt Collapse Lines: {collapse_lines}
y1: {region.y1: 7.2f}pt Column Boundaries: {column_boundaries}
""" class HTMLDocument: def __init__(self, doc_name): self.doc_name = doc_name self.elements = [] self.text_count = 0 self.page_height = 0 def start_page( self, pdf_page, region, params ): div = ( '
' '
'
            "{doc}
" "Page: {page_cnt}" "
" "{table}" "
" '
' ) self.page_height = pdf_page.height table = HTML_TABLE.format( region=region, page_width=pdf_page.width, page_height=pdf_page.height, line_overlap=params.line_overlap, char_margin=params.char_margin, line_margin=params.line_margin, word_margin=params.word_margin, boxes_flow=params.boxes_flow, collapse_lines=str(params.collapse_lines).lower(), column_boundaries=','.join([str(c) for c in params.column_boundaries]), ) self.elements.append( div.format( doc=self.doc_name, page_cnt=pdf_page.pageid, width=pdf_page.width, height=pdf_page.height, table=table, ) ) def end_page(self): self.elements.append("
") def add_text_block(self, txt): div = ( '
' '{text}' "{title}" "
" ) self.text_count += 1 text_text = txt.get_text() title_text = ( '' f'{text_text}
' "
"
            f"x0: {txt.x0:7.2f}   x1: {txt.x1:7.2f}  width:  {txt.width:7.2f}
" f"y0: {txt.y0:7.2f} y1: {txt.y1:7.2f} height: {txt.height:7.2f}" "
" "
" ) self.elements.append( div.format( text_cnt=self.text_count, width=txt.width, height=txt.height, top=self.page_height - (txt.y0 + txt.height), left=txt.x0, title=title_text, text=text_text, ) ) def _inner_html(self): return "\n".join(self.elements) def __str__(self): return HTML_HEADER + self._inner_html() + HTML_FOOTER def save_to(self, filepath): with filepath.open(mode="w", encoding="utf-8", errors='replace') as fh_: fh_.write(str(self)) PK!JL%%pdftxt/miner.py""" miner.py provides direct class overrides from PdfMiner.Six for use in PDFtxt. """ from pdfminer.converter import PDFLayoutAnalyzer from pdfminer.pdfinterp import PDFPageInterpreter from pdfminer.pdfinterp import PDFResourceManager from pdfminer.layout import LAParams, LTPage, LTChar, LTTextBoxVertical, IndexAssigner from pdfminer.pdfdocument import PDFDocument, PDFTextExtractionNotAllowed from pdfminer.pdfparser import PDFParser from pdfminer.pdfpage import PDFPage from pdfminer.utils import apply_matrix_pt from pdftxt.grid import fetch_column_boundaries, fetch_row_boundaries from pdftxt.grid import fetch_table_cells, fetch_row_cells from pdftxt.grid import allocate_characters_to_table from pdftxt.grid import CollapsedCharacter from pdftxt.util import character_in_region from pdftxt.util import fetch_left_most_coordinate from pdftxt.util import fetch_right_most_coordinate def is_white_space(txtobj): return txtobj.get_text().strip() != "" class PdfTxtParams(LAParams): """Wraps LAParams, adding the existing PDFMiner `exclude_white_space` option and the PdfTxt-specific option `collapse_lines`. """ # pylint: disable=too-many-arguments,too-few-public-methods def __init__( self, line_overlap=0.5, char_margin=2.0, line_margin=0.5, word_margin=0.1, boxes_flow=0.5, collapse_lines=False, exclude_white_space=True, column_boundaries=None, ): super(PdfTxtParams, self).__init__( line_overlap=line_overlap, char_margin=char_margin, line_margin=line_margin, word_margin=word_margin, boxes_flow=boxes_flow, ) self.collapse_lines = collapse_lines self.exclude_white_space = exclude_white_space self.column_boundaries = column_boundaries def make_param_obj(line_overlap, char_margin, line_margin, word_margin, boxes_flow): laparams = LAParams() laparams.line_overlap = line_overlap laparams.char_margin = char_margin laparams.line_margin = line_margin laparams.word_margin = word_margin laparams.boxes_flow = boxes_flow return laparams class PdfTxtPage(LTPage): """ Inherits pdfminer LTPage object and changes the functionality to: 1. ignore everything but LTChar objects in the analysis method 2. include the ability to filter for and analyze smaller regions of the page. """ def __init__(self, pageno, bbox, rotate=0): super(PdfTxtPage, self).__init__(pageno, bbox, rotate=rotate) # filter out all the character objects in the page self._txt_objs = None def fetch_characters(self): self.characters = list(filter(lambda obj: isinstance(obj, LTChar), self)) def _fetch_bounding_box(self, x0=None, y0=None, x1=None, y1=None): # default coordinates to page size x1 = self.x1 if not x1 else x1 y1 = self.y1 if not y1 else y1 x0 = self.x0 if not x0 else x0 y0 = self.y0 if not y0 else y0 return x0, y0, x1, y1 def filter_characters(self, x0=None, y0=None, x1=None, y1=None): x0, y0, x1, y1 = self._fetch_bounding_box(x0, y0, x1, y1) fchar = [] for c in self.characters: if character_in_region(c, x0, y0, x1, y1): fchar.append(c) return fchar def _analyze_characters(self, characters, laparams, exclude_white_space=True): # convert text objects into text lines textlines = list(self.group_objects(laparams, characters)) textlines = list(filter(lambda obj: not obj.is_empty(), textlines)) # convert text lines in to text boxes textboxes = list(self.group_textlines(laparams, textlines)) if -1 <= laparams.boxes_flow and laparams.boxes_flow <= +1 and textboxes: self.groups = self.group_textboxes(laparams, textboxes) assigner = IndexAssigner() for group in self.groups: group.analyze(laparams) assigner.run(group) textboxes.sort(key=lambda box: box.index) else: def getkey(box): if isinstance(box, LTTextBoxVertical): return (0, -box.x1, box.y0) return (1, box.y0, box.x0) textboxes.sort(key=getkey) if exclude_white_space: return list(filter(is_white_space, textboxes)) return textboxes def analyze(self, region, layout_params): """Does standard PDFMiner analysis, but only on characters within the selected region. Parameters deviat from parent class method. """ # pylint: disable=arguments-differ characters = self.filter_characters( x0=region.x0, y0=region.y0, x1=region.x1, y1=region.y1 ) if not characters: return [] return self._analyze_characters( characters, layout_params, exclude_white_space=layout_params.exclude_white_space, ) def analyze_grid(self, region, layout_params): """Analyzes the region for natural vertical and horizontal gaps in the text that extend to the edges of the region and returns text objects organized by table/row/cell. """ characters = self.filter_characters( x0=region.x0, y0=region.y0, x1=region.x1, y1=region.y1 ) # import ipdb;ipdb.set_trace() if not characters: return [] if layout_params.collapse_lines is True: CollapsedCharacter.collapse(characters) if layout_params.column_boundaries: columns = layout_params.column_boundaries else: columns = fetch_column_boundaries( CollapsedCharacter.get_uncollapsed(characters), region.x0, region.x1 ) rows = fetch_row_boundaries(characters, region.y0, region.y1) table = fetch_table_cells(rows, columns) allocate_characters_to_table(characters, table) for cell in table: if layout_params.collapse_lines is True: CollapsedCharacter.uncollapse(cell.characters) if not cell.characters: continue cell.text = self._analyze_characters( cell.characters, layout_params, exclude_white_space=layout_params.exclude_white_space, ) return table def analyze_rows(self, region, layout_params): """Analyzes the region for the natural horizontal gap in the text that extend from the left to right edges of the region and returns text objects organized by table and row. """ characters = self.filter_characters( x0=region.x0, y0=region.y0, x1=region.x1, y1=region.y1 ) if not characters: return [] if layout_params.collapse_lines is True: CollapsedCharacter.collapse(characters) x0 = fetch_left_most_coordinate(characters) x1 = fetch_right_most_coordinate(characters) row_boundaries = fetch_row_boundaries(characters, region.y0, region.y1) rows = fetch_row_cells(row_boundaries, x0, x1) allocate_characters_to_table(characters, rows) for cell in rows: if layout_params.collapse_lines is True: CollapsedCharacter.uncollapse(cell.characters) if not cell.characters: continue text = self._analyze_characters( cell.characters, layout_params, exclude_white_space=layout_params.exclude_white_space, ) cell.text = text return rows class PdfTxtLayoutAnalyzer(PDFLayoutAnalyzer): """ Inherits PDFLayoutAnalyzer, replacing the LTPage object with pdftxt.PDFTextPage object, and removes the text analysis to be explicity called by the user. We are also embedding the resource manager and PDFPageInterpreter objects in this class. """ def __init__(self): # creating the resource manager here # examples show this as being external self.rsrcmgr = rsrcmgr = PDFResourceManager() super(PdfTxtLayoutAnalyzer, self).__init__(rsrcmgr) def begin_page(self, page, ctm): # override existing - only change here # is replacement of LTPage object with a PdfTxtPage (x0, y0, x1, y1) = page.mediabox (x0, y0) = apply_matrix_pt(ctm, (x0, y0)) (x1, y1) = apply_matrix_pt(ctm, (x1, y1)) mediabox = (0, 0, abs(x0 - x1), abs(y0 - y1)) self.cur_item = PdfTxtPage(self.pageno, mediabox) def end_page(self, page): # override existing, skipping the self.cur_item.analyse() step assert not self._stack, str(len(self._stack)) assert isinstance(self.cur_item, LTPage), str(type(self.cur_item)) self.pageno += 1 self.cur_item.fetch_characters() self.receive_layout(self.cur_item) def get_result(self): return self.cur_item def create_interpreter(self): # convenience function return PDFPageInterpreter(self.rsrcmgr, self) class PdfTxtDocument(PDFDocument): def __init__(self, pdf_file_handle, passwd=""): parser = PDFParser(pdf_file_handle) super(PdfTxtDocument, self).__init__(parser, password=passwd) def assert_extractable(self): if not self.is_extractable: raise PDFTextExtractionNotAllowed def create_pages(self): return PDFPage.create_pages(self) PK! pdftxt/region.pyfrom collections import namedtuple from .util import PdfTxtError, parse_float_list PT2IN = 72.571_429 PT2CM = 28.346_456_7 def pt2in(points): return points / PT2IN def pt2cm(points): return points / PT2CM def in2pt(inches): return inches * PT2IN def cm2pt(cm): return cm * PT2CM Region = namedtuple("Region", "x0 y0 x1 y1") def parse_region(text, x1=0, y1=0): """ Parse commandline argument text in the form of: "(10,20,30,40)" Into a Region tuple: Region(x0=10, y0=20, x1=30, y1=40) The x1 and y1 arguments are place holders for the pages x1 and y1 coordinates. """ # if isinstance(text, list): # text = text[0] # coord = [float(r.strip()) for r in text.strip(")").strip("(").split(",")] coord = parse_float_list(text) if len(coord) != 4: raise PdfTxtError( 'Region argument must contain 4 arguments. Example: "(0,0,45,100)".' ) # The default region is the total page, which is represented as # "(0,0,0,0)". The x1 and y1 coordinates are unknown until the page # has been parsed. In this case, we need to swap out the x1 and y1 # zero values in the default for the actual x1 and y1 page coordinates. coord[2] = x1 if coord[2] == 0 else coord[2] coord[3] = y1 if coord[3] == 0 else coord[3] if coord[2] <= 0 or coord[3] <= 0 or coord[2] < coord[0] or coord[3] < coord[1]: raise PdfTxtError( 'X1 and y1 coordinates must be greater than 0. Example: "(0,0,45,100)".' ) return Region(*coord) def calculate_predefined_regions(width, height): """ Generate a list of pre-defined Regions based on the provided width and height variables. """ half_height = height / 2 half_width = width / 2 return { "full-page": Region(0, 0, width, height), "top-half": Region(0, half_height, width, height), "bottom-half": Region(0, 0, width, half_height), "left-half": Region(0, 0, half_width, height), "right-half": Region(half_width, 0, width, height), "top-left": Region(0, half_height, half_width, height), "top-right": Region(half_width, half_height, width, height), "bottom-left": Region(0, 0, half_width, half_height), "bottom-right": Region(half_width, 0, width, half_height), } def fetch_region(page, region_arg): """ Determine the region from the commandline arguments. """ predefined_regions = calculate_predefined_regions(page.width, page.height) region = predefined_regions.get(region_arg) if not region: region = parse_region(region_arg, page.x1, page.y1) return region PK!s pdftxt/util.pyimport re from math import ceil, floor class PdfTxtError(Exception): pass def character_in_region(c, x0, y0, x1, y1): return c.x0 >= x0 and c.x1 <= x1 and c.y0 >= y0 and c.y1 <= y1 def fetch_left_most_coordinate(characters): return floor(min([c.x0 for c in characters])) def fetch_right_most_coordinate(characters): return ceil(max([c.x1 for c in characters])) def parse_float_list(text): """ text: expecting a comma separated list of numbers optionally contained in (brackets). Returns a list of floats. """ if text is None: return [] if isinstance(text, list): text = text[0] return [float(r.strip()) for r in text.strip(")").strip("(").split(",")] def parse_page_notation(text): """ Acceptable notations: ALL (accepts all pages) 1+ (accepts all pages) 2+ (pages 2 to end) 2 (just page 2) 2-4 (just pages 2-4) 2,5,8 (pages 2, 5 and 8) Return a function for matching page number. """ text = text.strip().lower() def matcher(rx, txt): m = re.match(rx, txt) if m: return m.groupdict() return None # return accept all pages if text.strip().lower() in ("", "all", "1+", "1-"): def _accept_all(pageno): return True return _accept_all # return accept page plus number match = matcher(r"^(?P\d+)\+$", text) if match: accepted_number = int(match["pageno"]) def _accept_ge(pageno): return pageno >= accepted_number return _accept_ge # return accept single page number match = matcher(r"^(?P\d+)$", text) if match: accepted_number = int(match["pageno"]) def _accept_eq(pageno): return pageno == accepted_number return _accept_eq # return accept page in match = matcher(r"^(?P\d+) ?- ?(?P\d+)$", text) if match: start_no = int(match["pgstart"]) end_no = int(match["pgend"]) accepted_number = list(range(start_no, end_no + 1)) def _accept_in(pageno): return pageno in accepted_number return _accept_in match = matcher(r"^(?P(\d+,)+\d)$", text) if match: accepted_number = [int(t) for t in match["pages"].split(",")] def _accept_in(pageno): return pageno in accepted_number return _accept_in raise PdfTxtError("Invalid page selection notation.") PK!HN'*'pdftxt-0.3.2.dist-info/entry_points.txtN+I/N.,()*HI+(Pz9Vy\\PK!WSiopdftxt-0.3.2.dist-info/LICENSECopyright (c) 2019 Mark Gemmill. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PK!HڽTUpdftxt-0.3.2.dist-info/WHEEL A н#Z;/"d&F[xzw@Zpy3Fv]\fi4WZ^EgM_-]#0(q7PK!H\!pdftxt-0.3.2.dist-info/METADATAUMo6WLC| ɢI6&"H#ITɑSR6(̛7G$KXFb*+L z͒EmUINa>}32#mDpG/񓢧6{F,jU2oUe`<)ek1jiM y!`%U\s*2]EVwcqҿ$b|kUR7$0\3);16WXgZ=2)k8 P@M'>s![jZWʶRy+w49Z?p1 ^)%Uw6/~WfޞFҲTOF_pHh9gmGQzf]+JMwxX{)[;n}.X 'Bҝ>$I0vGjEJnHMm 04[@Y|kYa8}ݔgn\r=E :9™c@k"un ϶~B_Ms}FcoPK!H2=pdftxt-0.3.2.dist-info/RECORDǒP}? 䰘ʥ *aCp.ѧ7XVMj:77+'$[Xr+KVpdkTP-ۣt@ 2()Du',) eؙ)N;ؑɻ)l0PZHt0 EY]Yi U:am}g~LK"1Drw$FpxeE0"nh\6\{+[-"Gʴ\y2z:\.8Zÿ/NR v, zNp>B[z|_SvN*n¨h Ht? 0gtn1rPQmƠIg]Z UDCfi*{eV7:Drp /= JGن^(ѨFu? V_PK!WSioLICENSEPK!nj README.mdPK!ތm22 pdftxt/__init__.pyPK!dg 4 pdftxt/api.pyPK!; Hpdftxt/cli.pyPK!e'(pdftxt/grid.pyPK!lECpdftxt/html.pyPK!JL%% `pdftxt/miner.pyPK! pdftxt/region.pyPK!s pdftxt/util.pyPK!HN'*'ښpdftxt-0.3.2.dist-info/entry_points.txtPK!WSioFpdftxt-0.3.2.dist-info/LICENSEPK!HڽTU|pdftxt-0.3.2.dist-info/WHEELPK!H\! pdftxt-0.3.2.dist-info/METADATAPK!H2=pdftxt-0.3.2.dist-info/RECORDPK