{ "info": { "author": "Phil Gooch and Kyle Cronan", "author_email": "philgooch@users.noreply.github.com", "bugtrack_url": null, "classifiers": [ "License :: OSI Approved :: GNU General Public License (GPL)", "Natural Language :: English", "Operating System :: OS Independent", "Programming Language :: Python", "Topic :: Office/Business", "Topic :: Software Development :: Libraries", "Topic :: Text Processing" ], "description": "\n This is a Python 3 module and command line utility that analyzes XML output from the\n program `pdftohtml` in order to extract tables from PDF files and output the data as CSV.\n \n For example:\n \n pdftohtml -xml -stdout file.pdf | pdftable -f file%d.csv\n \n See also `pdftable -h` and http://sourceforge.net/projects/pdftable\n \n Original author: (c) 2009 Kyle Cronan \n \n This Python 3 implementation: (c) 2017 Phil Gooch\n \n As per Kyle's code, this version is licensed under GPLv3. See LICENSE file.\n \n # Installation\n \n Install `pdftohtml` via `poppler-utils` (Linux) or `poppler` (Mac OSX)\n \n Then install the module\n \n python setup.py install\n \n or\n pip install pdftablr\n \n ## Command line usage\n \n pdftohtml -xml -stdout file.pdf | pdftable -f file%d.csv\n \n ## Module usage\n \n from pdftablr.table_extractor import Extractor\n \n # XML file created from pdftohtml\n input_path = '/path/to/file.xml'\n \n # Output CSV file\n output_path = '/path/to/output.csv'\n \n with open(output_path, 'w') as output_file:\n table_extractor = Extractor(output_file=output_file)\n \n with open(input_path) as f:\n table_extractor.read_file(f)\n \n tables = table_extractor.extract()\n for table in tables:\n table.output(writer=None)\n \n ", "description_content_type": null, "docs_url": null, "download_url": "https://github.com/philgooch/pdftable/archive/v0.1.0.tar.gz", "downloads": { "last_day": -1, "last_month": -1, "last_week": -1 }, "home_page": "https://github.com/philgooch/pdftable", "keywords": "python3", "license": "GPLv3", "maintainer": "", "maintainer_email": "", "name": "pdftablr", "package_url": "https://pypi.org/project/pdftablr/", "platform": "", "project_url": "https://pypi.org/project/pdftablr/", "project_urls": { "Download": "https://github.com/philgooch/pdftable/archive/v0.1.0.tar.gz", "Homepage": "https://github.com/philgooch/pdftable" }, "release_url": "https://pypi.org/project/pdftablr/0.1.0/", "requires_dist": null, "requires_python": "", "summary": "Python3 implementation of Kyle Cronan's pdftable module, with unit tests", "version": "0.1.0" }, "last_serial": 3296579, "releases": { "0.1.0": [ { "comment_text": "", "digests": { "md5": "cd004355619ecbf2883360170ad71dfe", "sha256": "485c4ce1df97176231ff6269eaaab3c1c58f578b5cea395c645ebb4f5e662a1f" }, "downloads": -1, "filename": "pdftablr-0.1.0.tar.gz", "has_sig": false, "md5_digest": "cd004355619ecbf2883360170ad71dfe", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 7356, "upload_time": "2017-11-01T09:29:13", "url": "https://files.pythonhosted.org/packages/2f/b6/e9c613e989f7e95d21e83d988fd27141e52460b1d973a4f6a74051d0af5a/pdftablr-0.1.0.tar.gz" } ] }, "urls": [ { "comment_text": "", "digests": { "md5": "cd004355619ecbf2883360170ad71dfe", "sha256": "485c4ce1df97176231ff6269eaaab3c1c58f578b5cea395c645ebb4f5e662a1f" }, "downloads": -1, "filename": "pdftablr-0.1.0.tar.gz", "has_sig": false, "md5_digest": "cd004355619ecbf2883360170ad71dfe", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 7356, "upload_time": "2017-11-01T09:29:13", "url": "https://files.pythonhosted.org/packages/2f/b6/e9c613e989f7e95d21e83d988fd27141e52460b1d973a4f6a74051d0af5a/pdftablr-0.1.0.tar.gz" } ] }