{ "info": { "author": "Chris Spencer", "author_email": "chrisspen@gmail.com", "bugtrack_url": null, "classifiers": [ "Development Status :: 5 - Production/Stable", "License :: OSI Approved :: GNU Lesser General Public License v3 (LGPLv3)", "Operating System :: OS Independent", "Programming Language :: Python", "Programming Language :: Python :: 2.7", "Programming Language :: Python :: 3.2", "Topic :: Software Development :: Libraries :: Python Modules", "Topic :: Text Processing :: General" ], "description": "# Webarticle2Text - Extracts the main article text from a webpage.\n\n[![](https://img.shields.io/pypi/v/webarticle2text.svg)](https://pypi.python.org/pypi/webarticle2text) [![Build Status](https://img.shields.io/travis/chrisspen/webarticle2text.svg?branch=master)](https://travis-ci.org/chrisspen/webarticle2text) [![](https://pyup.io/repos/github/chrisspen/webarticle2text/shield.svg)](https://pyup.io/repos/github/chrisspen/webarticle2text)\n\n## Overview\n-----------\n\nThis project is obsolete and now only serves as a reference. I recommend you instead use [newspaper](https://github.com/codelucas/newspaper), which is an order-of-magnitude more accurate than any other article extraction library I've encountered.\n\nPlease see `compare.csv` for a performance comparison of several similar tools.\n\nThis attempts to locate and extract the largest cluster of text in a\nwebpage. It does this by walking the DOM-tree, identifying all text\nsegments and their depth inside the DOM, appends all text at roughly\nthe same depth, and then returns the chunk with the largest total\nlength.\n\nThis approach usually works well with typical news sites where one\nnews article is displayed per URL. This approach usually fails with\nURLs displaying multiple news blurbs (e.g. news aggregators).\n\n## Installation\n---------------\n\nYou may need to install the tidylib system package, which you can get on Ubuntu 12.04 using:\n\n sudo apt-get install libtidy-0.99-0\n\nor on Fedora using:\n\n sudo yum install libtidy\n\nThen, simply install the package using pip:\n\n pip install webarticle2text\n\n## Usage\n--------\n\nYou can invoke the script either as a Python module:\n\n from webarticle2text import webarticle2text\n print webarticle2text.extractFromURL(\"http://some/arbitrary/url\")\n\nor as a standalone command line script:\n\n webarticle2text.py http://some/arbitrary/url\n \nNote, to use it from the command line, you'll need to ensure it has execute\npermission and is located in your PATH. On most platforms, this should\nautomatically be done by setup.py.\n\n## Development\n\nTests require the Python development headers to be installed, which you can install on Ubuntu with:\n\n sudo apt-get install python-dev python3-dev python3.4-dev\n\nTo run unittests across multiple Python versions, install:\n\n sudo apt-get install python3.4-minimal python3.4-dev python3.5-minimal python3.5-dev\n\nTo run all [tests](http://tox.readthedocs.org/en/latest/):\n\n export TESTNAME=; tox\n\nTo run tests for a specific environment (e.g. Python 2.7):\n \n export TESTNAME=; tox -e py27\n\nTo run a specific test:\n \n export TESTNAME=.test_extract; tox -e py27\n\n## History\n----------\n\n* 1.0.0 (2008.9.16) Initial public release.\n* 1.2.0 (2011.1.3) Update to support Unicode.\n* 1.2.2 (2011.12.17) Cleaned up installation procedure and documentation and moved to github.com. \n* 1.2.3 (2011.12.21) Fixed encoding error when redirecting stdout. e.g. webarticle2text.py http://some/arbitrary/url > output.txt\n* 1.2.5 (2012.11.5) Added the option to specify user-agent header to use when requesting URLs.\n* 2.0.0 (2014.4.20) Added support for Python 3.2.\n", "description_content_type": null, "docs_url": null, "download_url": "UNKNOWN", "downloads": { "last_day": -1, "last_month": -1, "last_week": -1 }, "home_page": "https://github.com/chrisspen/webarticle2text", "keywords": null, "license": "LGPL License", "maintainer": null, "maintainer_email": null, "name": "webarticle2text", "package_url": "https://pypi.org/project/webarticle2text/", "platform": "OS Independent", "project_url": "https://pypi.org/project/webarticle2text/", "project_urls": { "Download": "UNKNOWN", "Homepage": "https://github.com/chrisspen/webarticle2text" }, "release_url": "https://pypi.org/project/webarticle2text/3.0.2/", "requires_dist": null, "requires_python": null, "summary": "Extracts the main article text from a webpage.", "version": "3.0.2" }, "last_serial": 2983640, "releases": { "1.2.4": [ { "comment_text": "", "digests": { "md5": "c49bf4f0c0186bb51d51993091e0bcfd", "sha256": "e29df6bf837e284bb44998a4e1035e225957943b4e59f2e4c4c705a362ff8d26" }, "downloads": -1, "filename": "webarticle2text-1.2.4.tar.gz", "has_sig": false, "md5_digest": "c49bf4f0c0186bb51d51993091e0bcfd", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 5302, "upload_time": "2012-09-30T23:30:26", "url": "https://files.pythonhosted.org/packages/21/00/e471c20c5ae84cb149be789d9223640772ecf5486a38bdf40a49cdd2977f/webarticle2text-1.2.4.tar.gz" } ], "1.2.5": [ { "comment_text": "", "digests": { "md5": "ec2746e583a867e49c38aeade0f10d41", "sha256": "24de681b9078f64f41175253a47580eb18d3d92f4d637ebcc33a5b1ed6b11da8" }, "downloads": -1, "filename": "webarticle2text-1.2.5.tar.gz", "has_sig": false, "md5_digest": "ec2746e583a867e49c38aeade0f10d41", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 5423, "upload_time": "2013-05-22T19:26:32", "url": "https://files.pythonhosted.org/packages/08/9b/bbc159efeafe44bade572d3ae0295d67bb7bb4245272823804e3b988195a/webarticle2text-1.2.5.tar.gz" } ], "2.0.1": [ { "comment_text": "", "digests": { "md5": "fce867d5d43dd955fdaab9c2bc827718", "sha256": "9dda7c4a1b7ff7dc85bf1f8e518fc8732fc5f931aaa972838d105261b0f621c7" }, "downloads": -1, "filename": "webarticle2text-2.0.1.tar.gz", "has_sig": false, "md5_digest": "fce867d5d43dd955fdaab9c2bc827718", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 7353, "upload_time": "2014-07-28T01:59:14", "url": "https://files.pythonhosted.org/packages/39/05/16d547cbfb37d0ef85922e1b634a77e6865458a2fc9c763412e798648a8b/webarticle2text-2.0.1.tar.gz" } ], "2.0.2": [ { "comment_text": "", "digests": { "md5": "8b3a4d1a6b370a3313fd07f29ae1fd85", "sha256": "3fd5be4f4d062e66fbf58edb9af2601799f96faf91636f972e2f7e475c3fa24b" }, "downloads": -1, "filename": "webarticle2text-2.0.2.tar.gz", "has_sig": false, "md5_digest": "8b3a4d1a6b370a3313fd07f29ae1fd85", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 7358, "upload_time": "2014-09-09T21:08:04", "url": "https://files.pythonhosted.org/packages/e2/44/3706f76c16df1f1ae1643f4103dcc154a420e5326ffffc7409ee249c316c/webarticle2text-2.0.2.tar.gz" } ], "2.0.3": [ { "comment_text": "", "digests": { "md5": "24713d1580c2d3560e16853b1a0e75b9", "sha256": "782c6e508b549e74446d7b7b04166e86048ee147ebd732ddb67b251c14564cd1" }, "downloads": -1, "filename": "webarticle2text-2.0.3.tar.gz", "has_sig": false, "md5_digest": "24713d1580c2d3560e16853b1a0e75b9", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 7352, "upload_time": "2014-11-14T15:04:14", "url": "https://files.pythonhosted.org/packages/a1/e2/f3d39b87779b4cc0ba66dcd608132d162d7b8f463eca994eae7daca9b2a1/webarticle2text-2.0.3.tar.gz" } ], "3.0.2": [ { "comment_text": "", "digests": { "md5": "1d8e5b8615751d862a7a77cbb96969a3", "sha256": "032bdb1f53c8558006c44fb0ec23349056aa69697ac371394b16a8b7cfb381ad" }, "downloads": -1, "filename": "webarticle2text-3.0.2.tar.gz", "has_sig": false, "md5_digest": "1d8e5b8615751d862a7a77cbb96969a3", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 658787, "upload_time": "2017-06-28T02:16:40", "url": "https://files.pythonhosted.org/packages/fa/0b/c970f74c22879fc40ab87ef8fef38b0c166962e6031d82cf3a8e997dca44/webarticle2text-3.0.2.tar.gz" } ] }, "urls": [ { "comment_text": "", "digests": { "md5": "1d8e5b8615751d862a7a77cbb96969a3", "sha256": "032bdb1f53c8558006c44fb0ec23349056aa69697ac371394b16a8b7cfb381ad" }, "downloads": -1, "filename": "webarticle2text-3.0.2.tar.gz", "has_sig": false, "md5_digest": "1d8e5b8615751d862a7a77cbb96969a3", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 658787, "upload_time": "2017-06-28T02:16:40", "url": "https://files.pythonhosted.org/packages/fa/0b/c970f74c22879fc40ab87ef8fef38b0c166962e6031d82cf3a8e997dca44/webarticle2text-3.0.2.tar.gz" } ] }