{ "info": { "author": "Lucas Simpson", "author_email": "lucassimpson05@gmail.com", "bugtrack_url": null, "classifiers": [ "Development Status :: 3 - Alpha", "Intended Audience :: Developers", "License :: OSI Approved :: MIT License", "Programming Language :: Python :: 3", "Topic :: Software Development" ], "description": "Concurrent Flood Scraper\n========================\n\nIt's probably exactly what you think it is, based off the name\n--------------------------------------------------------------\n\nGET a page. scrape for urls, filter those according to some regex. Put all those in a master queue. Scrape page for any data you want. Repeat...\n\nThere's a small demo in the wikipedia_demo. There you can see how easy it is to set up to fit your web scraping needs!\n\n\nSpecifics\n=========\n\n1. Create a child class of concurrentfloodscraper.Scraper and implement the scrape_page(self, text) method. text is the raw html. In this method you do the specific scraping required. Note that only urls that match the class url_filter_regex will be added to the master queue.\n\n2. Annotate your Scraper subclass with concurrentfloodscraper.Route. The single parameter is a regex; URL's that match the regex will be parsed with that scraper.\n\n3. Repeat steps 1 and 2 for as many different types of pages you expect to be scraping from.\n\n4. Create an instance of concurrentfloodscraper.ConcurrentFloodScraper, pass it the root URL, the number of threads to use, and a page limit. Page limit defaults to None, which means 'go forever'.\n\n5. Start the ConcurrentFloodScraper instance, and enjoy the magic!", "description_content_type": null, "docs_url": null, "download_url": "", "downloads": { "last_day": -1, "last_month": -1, "last_week": -1 }, "home_page": "https://github.com/LucasSimpson/ConcurrentFloodScraper", "keywords": "crawl crawler scrape scraper web internet", "license": "MIT", "maintainer": "", "maintainer_email": "", "name": "concurrentfloodscraper", "package_url": "https://pypi.org/project/concurrentfloodscraper/", "platform": "UNKNOWN", "project_url": "https://pypi.org/project/concurrentfloodscraper/", "project_urls": { "Homepage": "https://github.com/LucasSimpson/ConcurrentFloodScraper" }, "release_url": "https://pypi.org/project/concurrentfloodscraper/1.0.1/", "requires_dist": [ "requests" ], "requires_python": "", "summary": "A concurrent flood web scraper.", "version": "1.0.1" }, "last_serial": 2642155, "releases": { "1.0.0": [ { "comment_text": "", "digests": { "md5": "f6ff065d8c786d35016f2de1452a22f2", "sha256": "717bad5c60d21f5be2a8e2d14bcb425dff1ec13760a842a018a947025fbc5f9a" }, "downloads": -1, "filename": "concurrentfloodscraper-1.0.0-py3-none-any.whl", "has_sig": false, "md5_digest": "f6ff065d8c786d35016f2de1452a22f2", "packagetype": "bdist_wheel", "python_version": "py3", "requires_python": null, "size": 9640, "upload_time": "2017-02-14T20:49:12", "url": "https://files.pythonhosted.org/packages/0e/a9/59d0b305df5e6be03c5a69533e2e5707f21946757ef55b51347787541195/concurrentfloodscraper-1.0.0-py3-none-any.whl" }, { "comment_text": "", "digests": { "md5": "270bbb9ae6eaab4e43317df2a3b0a803", "sha256": "1c6ee38f60cdc141e8d95c11fcd88e17df2d98eb453767c39a137978eaf2d698" }, "downloads": -1, "filename": "concurrentfloodscraper-1.0.0.tar.gz", "has_sig": false, "md5_digest": "270bbb9ae6eaab4e43317df2a3b0a803", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 6278, "upload_time": "2017-02-14T20:49:14", "url": 
"https://files.pythonhosted.org/packages/d4/15/786a012c9f4e0edd7955fea0114674bbaf54e29158dbb02f586bd996bffd/concurrentfloodscraper-1.0.0.tar.gz" } ], "1.0.1": [ { "comment_text": "", "digests": { "md5": "30db68b52d4375893c571360abe8ad4e", "sha256": "27a56763c000c81d987efc3bf82835772f0695899d088b61e434d29bf0fac8a8" }, "downloads": -1, "filename": "concurrentfloodscraper-1.0.1-py3-none-any.whl", "has_sig": false, "md5_digest": "30db68b52d4375893c571360abe8ad4e", "packagetype": "bdist_wheel", "python_version": "py3", "requires_python": null, "size": 9780, "upload_time": "2017-02-14T20:59:20", "url": "https://files.pythonhosted.org/packages/e0/00/995311f710f0a7217b65cbf128a636d7d14e764a5e4883b9b5e6beb31a84/concurrentfloodscraper-1.0.1-py3-none-any.whl" } ] }, "urls": [ { "comment_text": "", "digests": { "md5": "30db68b52d4375893c571360abe8ad4e", "sha256": "27a56763c000c81d987efc3bf82835772f0695899d088b61e434d29bf0fac8a8" }, "downloads": -1, "filename": "concurrentfloodscraper-1.0.1-py3-none-any.whl", "has_sig": false, "md5_digest": "30db68b52d4375893c571360abe8ad4e", "packagetype": "bdist_wheel", "python_version": "py3", "requires_python": null, "size": 9780, "upload_time": "2017-02-14T20:59:20", "url": "https://files.pythonhosted.org/packages/e0/00/995311f710f0a7217b65cbf128a636d7d14e764a5e4883b9b5e6beb31a84/concurrentfloodscraper-1.0.1-py3-none-any.whl" } ] }