{ "info": { "author": "Alister Cordiner", "author_email": "alister@cordiner.net", "bugtrack_url": null, "classifiers": [ "Development Status :: 3 - Alpha", "Environment :: Web Environment", "Intended Audience :: Developers", "License :: OSI Approved :: GNU General Public License v2 or later (GPLv2+)", "Natural Language :: English", "Operating System :: OS Independent", "Programming Language :: Python", "Programming Language :: Python :: 2", "Programming Language :: Python :: 2.7", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.5", "Programming Language :: Python :: 3.6", "Topic :: Software Development :: Libraries :: Application Frameworks" ], "description": "===============\nscrapy-twostage\n===============\n\nHave you ever written a web scraper, only to find out after\na long time that there's some extra data on the pages you\nshould've been scraping all along?\n\nOr a change on a website means your scraper stops working,\nand you lose days or weeks of data until you can find the\ntime to fix it?\n\nThis library aims to solve this problem by splitting a `Scrapy\n`_ scraper up into two asynchronous stages:\n\n1. **Download stage** - The website is crawled, and the pages to\n be scraped are downloaded and saved to disk.\n2. **Extract stage** - The pages to be scraped are loaded from disk.\n The desired data is extracted from the pages and exported (e.g. to\n a file or database).\n\nThe crawler logic for the download stage should be kept as simple\nas possible. It would typically open a known URL and perform very\nsimple actions such as clicking a \"next page\" button or submitting\na search query. This reduces the risk of the downloader breaking if\nthere are minor changes made to the website.\n\nAnd since all of the raw data is being saved, if you ever decide to\nchange your extractor logic, you can simply re-run the extractor on\nall of the data that has been downloaded.\n\nInstallation\n=============\n\nDownloading and installing from PyPI\n------------------------------------\n\nTo install using ``pip``::\n\n $ pip install scrapy-twostage\n\nOr to install using ``easy_install``::\n\n $ easy_install scrapy-twostage\n\nDownloading and installing from source\n--------------------------------------\n\nDownload the latest version of ``scrapy-twostage`` from\nhttp://pypi.python.org/pypi/scrapy-twostage/.\n\nYou can install it by doing the following::\n\n $ tar xvfz scrapy-twostage-0.0.0.tar.gz\n $ cd scrapy-twostage-0.0.0\n # python setup.py install # as root\n\nUsing the development version\n------------------------------\n\nYou can clone the git repository by doing the following::\n\n $ git clone git://github.com/acordiner/scrapy-twostage.git\n\nUsing scrapy-twostage\n=====================\n\nComing soon...\n\nBug tracker\n===========\n\nIf you have any suggestions, bug reports or annoyances please report them\nat http://github.com/acordiner/scrapy-twostage/issues/\n\nLicense\n=======\n\nThis software is licensed under the ``GPL v2 License``. See the ``LICENSE``\nfile in the top distribution directory for the full license text.", "description_content_type": null, "docs_url": null, "download_url": "", "downloads": { "last_day": -1, "last_month": -1, "last_week": -1 }, "home_page": "http://github.com/acordiner/scrapy-twostage", "keywords": "", "license": "GPL v2", "maintainer": "", "maintainer_email": "", "name": "scrapy-twostage", "package_url": "https://pypi.org/project/scrapy-twostage/", "platform": "UNKNOWN", "project_url": "https://pypi.org/project/scrapy-twostage/", "project_urls": { "Homepage": "http://github.com/acordiner/scrapy-twostage" }, "release_url": "https://pypi.org/project/scrapy-twostage/0.0.4/", "requires_dist": null, "requires_python": "", "summary": "Two stage Scrapy spider: download and extract", "version": "0.0.4" }, "last_serial": 2717103, "releases": { "0.0.1": [ { "comment_text": "", "digests": { "md5": "fb66373e80f9bb47d7d9eaf409784062", "sha256": "f0505c15c4a1b7ab9d378e2953e581185fd07549f792b2863c95274fb017f508" }, "downloads": -1, "filename": "scrapy-twostage-0.0.1.tar.gz", "has_sig": false, "md5_digest": "fb66373e80f9bb47d7d9eaf409784062", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 4372, "upload_time": "2017-03-19T12:01:59", "url": "https://files.pythonhosted.org/packages/df/9f/b9ba8fb31d807f19a7b56f3bcf1a1a2de1fe3cfbb48c9da22c561731621b/scrapy-twostage-0.0.1.tar.gz" } ], "0.0.2": [ { "comment_text": "", "digests": { "md5": "5a7096db3f1e0ae58ebd144d2a3ae848", "sha256": "6be65becaa4bd23b56cfbbfc8d3513ca2d31fe32985dca69dfe4d696b10102f5" }, "downloads": -1, "filename": "scrapy-twostage-0.0.2.tar.gz", "has_sig": false, "md5_digest": "5a7096db3f1e0ae58ebd144d2a3ae848", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 4324, "upload_time": "2017-03-19T12:04:41", "url": "https://files.pythonhosted.org/packages/7e/ae/7fad8050d300c00da26a51c66976fceaa5ba61eb48b7ebc129e1c21d0839/scrapy-twostage-0.0.2.tar.gz" } ], "0.0.3": [ { "comment_text": "", "digests": { "md5": "30eef4461a243fe4654953e8f439d498", "sha256": "c24eef7fa352f6b3d173a92de38a20e0f2c355d2287e769726e0c3e022f2e55c" }, "downloads": -1, "filename": "scrapy-twostage-0.0.3.tar.gz", "has_sig": false, "md5_digest": "30eef4461a243fe4654953e8f439d498", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 4383, "upload_time": "2017-03-19T12:43:30", "url": "https://files.pythonhosted.org/packages/cb/49/6b5dcff3ab3cd65ea73cc26455e2b68aba0ede92d1c84be0a26f47f2a41f/scrapy-twostage-0.0.3.tar.gz" } ], "0.0.4": [ { "comment_text": "", "digests": { "md5": "f6be4cfb6804522445c3bde1047f1c3c", "sha256": "9c7e43ed1bdb3906030af562cb488c29aadd4fa916e4643dc40ac811c05f5984" }, "downloads": -1, "filename": "scrapy-twostage-0.0.4.tar.gz", "has_sig": false, "md5_digest": "f6be4cfb6804522445c3bde1047f1c3c", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 4366, "upload_time": "2017-03-20T09:35:33", "url": "https://files.pythonhosted.org/packages/9d/23/fd935c8442ad9093f7c357ea7a26dbc7d4e03df526eaa40db3f1c0654a69/scrapy-twostage-0.0.4.tar.gz" } ] }, "urls": [ { "comment_text": "", "digests": { "md5": "f6be4cfb6804522445c3bde1047f1c3c", "sha256": "9c7e43ed1bdb3906030af562cb488c29aadd4fa916e4643dc40ac811c05f5984" }, "downloads": -1, "filename": "scrapy-twostage-0.0.4.tar.gz", "has_sig": false, "md5_digest": "f6be4cfb6804522445c3bde1047f1c3c", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 4366, "upload_time": "2017-03-20T09:35:33", "url": "https://files.pythonhosted.org/packages/9d/23/fd935c8442ad9093f7c357ea7a26dbc7d4e03df526eaa40db3f1c0654a69/scrapy-twostage-0.0.4.tar.gz" } ] }