{ "info": { "author": "Daniel Perez Rada", "author_email": "dperezrada@gmail.com", "bugtrack_url": null, "classifiers": [ "Development Status :: 3 - Alpha", "Intended Audience :: Developers", "License :: OSI Approved :: BSD License", "Topic :: Utilities" ], "description": ".. -*- mode: rst; coding: utf-8 -*-\n\nWelcome to Html2Data\n====================\n\n:Author: * Daniel Perez Rada \n\nDescription\n===========\nA simple way to transform a HTML file or URL to structured data. You only need to define the xpath to the element. Optionaly you can define functions to be applied after. You can easily write XPATH using the firebug extension, copy XPATH (I recommend edit the XPATH given by firebug, making it shorter).\n\nExample\n=======\nImport\n------\n::\n\n>>> from html2data import HTML2Data\n\nCreate instance\n---------------\n::\n\n\t>>> html = \"\"\"\n\t \t\n\t \tExample Page\n\t \t\n\t\t\t\n\t\t\t\n\t \t\t

Title

\n\t \t\t
This is not a valid HTML\n\t\t\t\n\t\t\"\"\"\n\t>>> h2d_instance = HTML2Data(html = html) #You can also create it from a url = url\n\nUsing XPATH config\n--------------------\nOne you have the object \n::\n\n\t>>> config = [\n {'name': 'header_title', 'xpath': '//head/title/text()'},\n {'name': 'body_title', 'xpath': '//h1/b/text()'},\n {'name': 'description', 'xpath': '//div[@class=\"description\"]/text()'},\n ]\n\n\t>>> h2d_instance.parse(config = config)\n\t{'header_title': 'Example Page', 'body_title': 'Title', 'description': 'This is not a valid HTML'}\n\nUsing CSS SELECTOR config\n-------------------------\n::\n\n\t>>> config = [\n\t {'name': 'header_title', 'css': 'head title'},\n\t {'name': 'body_title', 'css': 'h1 b '},\n\t {'name': 'description', 'css': 'div.description'},\n\t ]\n\n\t>>> h2d_instance.parse(config = config)\n\t{'header_title': 'Example Page', 'body_title': 'Title', 'description': 'This is not a valid HTML'}\n\n\nReal life example\n-----------------\n::\n\n\timport urllib2\n\n\tfrom html2data import HTML2Data\n\n\tresponse = urllib2.urlopen('http://sil.senado.cl/cgi-bin/sil_ultproy.pl')\n\thtml = response.read()\n\n\tconfig = [\n\t {'name': 'fecha', 'css': 'td:nth-child(1)'},\n\t {'name': 'id', 'css': 'td:nth-child(2) a'},\n\t {'name': 'nombre', 'css': 'td:nth-child(3)'},\n\t {'name': 'estado', 'css': 'td:nth-child(4)'},\n\t]\n\n\thtml_instance = HTML2Data(html = html)\n\trows = html_instance.parse_one(css = 'td td tr', multiple = True, text = False)\n\tfor row_element in rows:\n\t row_in_html = HTML2Data(tree = row_element)\n\t print row_in_html.parse(config = config)\n\nYou will get something like:\n::\n\n\t{'nombre': 'Reforma Constitucional que restablece obligatoriedad del voto.', 'fecha': '24/11/2011', 'estado': 'En tramitaci\u00f3n', 'id': '8062-07'}\n\t..\n\t{'nombre': 'Proh\u00edbe el anatocismo.', 'fecha': '02/11/2011', 'estado': 'En tramitaci\u00f3n', 'id': '8007-03'}\n\n\nRequirement\n===========\n\n * lxml 2.0+\n * httplib2\n\nTests\n=====\nRequirement\n-----------\n\n * ludibrio\n * nose\n\nRun\n---\n\n >> nosetests", "description_content_type": null, "docs_url": null, "download_url": "UNKNOWN", "downloads": { "last_day": -1, "last_month": -1, "last_week": -1 }, "home_page": "https://github.com/dperezrada/html2data", "keywords": "html2data html data xpath crawler transform", "license": "BSD", "maintainer": null, "maintainer_email": null, "name": "html2data", "package_url": "https://pypi.org/project/html2data/", "platform": "UNKNOWN", "project_url": "https://pypi.org/project/html2data/", "project_urls": { "Download": "UNKNOWN", "Homepage": "https://github.com/dperezrada/html2data" }, "release_url": "https://pypi.org/project/html2data/0.4.3/", "requires_dist": null, "requires_python": null, "summary": "A simple way to transform a HTML file or URL to structured data.", "version": "0.4.3" }, "last_serial": 793036, "releases": { "0.1": [ { "comment_text": "", "digests": { "md5": "dac674411abd2550ca01ef47e4db0c6c", "sha256": "5366153f6b059c0c62b3d65f664563987a312d7f0f78f05f42bc0ec723f21d12" }, "downloads": -1, "filename": "html2data-0.1.tar.gz", "has_sig": false, "md5_digest": "dac674411abd2550ca01ef47e4db0c6c", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 1944, "upload_time": "2010-08-30T00:27:53", "url": "https://files.pythonhosted.org/packages/53/d7/9a4a5ceba2a9f4bb7692551fa448e5da28dc1a7f3c4f1652715af00e87dd/html2data-0.1.tar.gz" } ], "0.2": [ { "comment_text": "", "digests": { "md5": "72f5a884625f7f299ff3b2b7f901384b", "sha256": "e1f007745dc45dee46cb3a0adf899706a5a1f7c4d43293a6553afcc61e2e5de8" }, "downloads": -1, "filename": "html2data-0.2.tar.gz", "has_sig": false, "md5_digest": "72f5a884625f7f299ff3b2b7f901384b", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 2085, "upload_time": "2010-08-30T01:23:09", "url": "https://files.pythonhosted.org/packages/df/f8/3794cfc594507443c52827298809e4b6e50209390eeb930810e8c3295863/html2data-0.2.tar.gz" } ], "0.3": [ { "comment_text": "", "digests": { "md5": "bea39d30c952a8057fdef1ff5c7fe647", "sha256": "6de4aa992a1ba76d030db89fbbaefaa4c54c6e1e24b7e43449e5eeca7e038986" }, "downloads": -1, "filename": "html2data-0.3.tar.gz", "has_sig": false, "md5_digest": "bea39d30c952a8057fdef1ff5c7fe647", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 2433, "upload_time": "2010-09-02T15:36:03", "url": "https://files.pythonhosted.org/packages/d9/30/e3a33f1fd937a6ac26c509b867a6242d51598ffbaec6242b98c51efa96c0/html2data-0.3.tar.gz" } ], "0.4": [ { "comment_text": "", "digests": { "md5": "d29d2a15fb9579bd9c8f5c7ba27f8b77", "sha256": "af7290e935282f861e44bc7cad5a4ed8a7d6912279ae6c49ba25a65685942521" }, "downloads": -1, "filename": "html2data-0.4.tar.gz", "has_sig": false, "md5_digest": "d29d2a15fb9579bd9c8f5c7ba27f8b77", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 3331, "upload_time": "2011-11-10T13:48:08", "url": "https://files.pythonhosted.org/packages/3e/e4/c73febf873f4ee38e6e2cc976db7489affed8d06394f5cb431b462144d78/html2data-0.4.tar.gz" } ], "0.4.1": [ { "comment_text": "", "digests": { "md5": "c8ca4dd0759f06381e9efce50fb48bb1", "sha256": "4956968a43a5bba4b797432eeda9925ee2e04dd4b7b946033c4df76dfcc46af9" }, "downloads": -1, "filename": "html2data-0.4.1.tar.gz", "has_sig": false, "md5_digest": "c8ca4dd0759f06381e9efce50fb48bb1", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 3509, "upload_time": "2011-11-10T14:46:08", "url": "https://files.pythonhosted.org/packages/5c/32/5a68e5a5aed43d9f3833a76d0e6d362fbb3d73a609d32f8cb99688dd1173/html2data-0.4.1.tar.gz" } ], "0.4.2": [ { "comment_text": "", "digests": { "md5": "0596eadf7a3b7f2487616c9fc4fd9ba0", "sha256": "49d60b4a765480a998ff1113a3b0c8e5d98a8c496333a3c480804f1b2fd726f2" }, "downloads": -1, "filename": "html2data-0.4.2.tar.gz", "has_sig": false, "md5_digest": "0596eadf7a3b7f2487616c9fc4fd9ba0", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 3833, "upload_time": "2011-11-25T02:42:24", "url": "https://files.pythonhosted.org/packages/b0/3b/04496322d1e893440b83805dfa6e513a10a1d1efe94f6e3c492346b1a974/html2data-0.4.2.tar.gz" } ], "0.4.3": [ { "comment_text": "", "digests": { "md5": "a3710a11125fc0ce353f44e28178e9cb", "sha256": "0c3335cadb1ac350c3a6dae571a96b45289c7dceb8de95dbe24ca9fa49199628" }, "downloads": -1, "filename": "html2data-0.4.3.tar.gz", "has_sig": false, "md5_digest": "a3710a11125fc0ce353f44e28178e9cb", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 4337, "upload_time": "2011-11-25T11:58:13", "url": "https://files.pythonhosted.org/packages/bd/91/b718643eaa9dd8307236ddf3100c3e40236e0aa03b0e08237a55998d0b40/html2data-0.4.3.tar.gz" } ] }, "urls": [ { "comment_text": "", "digests": { "md5": "a3710a11125fc0ce353f44e28178e9cb", "sha256": "0c3335cadb1ac350c3a6dae571a96b45289c7dceb8de95dbe24ca9fa49199628" }, "downloads": -1, "filename": "html2data-0.4.3.tar.gz", "has_sig": false, "md5_digest": "a3710a11125fc0ce353f44e28178e9cb", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 4337, "upload_time": "2011-11-25T11:58:13", "url": "https://files.pythonhosted.org/packages/bd/91/b718643eaa9dd8307236ddf3100c3e40236e0aa03b0e08237a55998d0b40/html2data-0.4.3.tar.gz" } ] }