{ "info": { "author": "weaming", "author_email": "garden.yuen@gmail.com", "bugtrack_url": null, "classifiers": [], "description": "## Install\n\n`pip3 install simple-crawler`\n\nSet the environment variable `AUTO_CHARSET=1` to pass `bytes` to beautifulsoup4 and let it detect the charset.\n\n## Classes\n\n* `URL`: define a URL\n* `URLExt`: class to handle `URL`\n* `Page`: define a request result of a `URL`\n * `url`: type `URL`\n * `content`, `text`, `json`: response content properties from library `requests`\n * `type`: the response body type, an enum that allows `BYTES`, `TEXT`, `HTML`, `JSON`\n * `is_html`: check whether the response is HTML according to the response header `Content-Type`\n * `soup`: a `BeautifulSoup` object containing the HTML if `is_html`\n* `Crawler`: schedule the crawler by calling `handler_page()` recursively\n\n## Example\n\n```\nfrom simple_crawler import *\n\n\nclass MyCrawler(Crawler):\n name = 'output.txt'\n async def custom_handle_page(self, page):\n print(page.url)\n tags = page.soup.select(\"#container\")\n tag = tags and tags[0]\n with open(self.name, 'a') as f:\n f.write(tag.text)\n # do some async call\n\n def filter_url(self, url: URL) -> bool:\n return url.url.startswith(\"https://xxx.com/xxx\")\n\n\nloop = get_event_loop(True)\nc = MyCrawler(\"https://xxx.com/xxx\", loop, concurrency=10)\nschedule_future_in_loop(c.start(), loop=loop)\n```\n\n## TODO\n\n* [x] Speed up using async or threading\n\n\n", "description_content_type": "text/markdown", "docs_url": null, "download_url": "", "downloads": { "last_day": -1, "last_month": -1, "last_week": -1 }, "home_page": "https://github.com/weaming/simple-crawler", "keywords": "crawler", "license": "", "maintainer": "", "maintainer_email": "", "name": "simple-crawler", "package_url": "https://pypi.org/project/simple-crawler/", "platform": "", "project_url": "https://pypi.org/project/simple-crawler/", "project_urls": { "Bug Reports": "https://github.com/weaming/simple-crawler", "Homepage": 
"https://github.com/weaming/simple-crawler", "Source": "https://github.com/weaming/simple-crawler" }, "release_url": "https://pypi.org/project/simple-crawler/1.2/", "requires_dist": [ "beautifulsoup4", "requests" ], "requires_python": "", "summary": "my simple crawler", "version": "1.2" }, "last_serial": 5133517, "releases": { "0.1": [ { "comment_text": "", "digests": { "md5": "7d5d447e9a82dc28d26f496e7a61f440", "sha256": "6507c469adb397d4686ad2d5951170061212f58fbdddf5ef49e4655b1960b484" }, "downloads": -1, "filename": "simple_crawler-0.1-py3-none-any.whl", "has_sig": false, "md5_digest": "7d5d447e9a82dc28d26f496e7a61f440", "packagetype": "bdist_wheel", "python_version": "py3", "requires_python": null, "size": 3131, "upload_time": "2019-04-12T03:23:00", "url": "https://files.pythonhosted.org/packages/40/d2/f2d67c6958ef1e4c23db45b5fbdb11ca2e8e33463b9b590ef8973cbc06a5/simple_crawler-0.1-py3-none-any.whl" } ], "0.2": [ { "comment_text": "", "digests": { "md5": "d3d503511340371386be47c4150232ff", "sha256": "879ad5a13bc34a2de0a5eaa1c7eb9559b0eebfee1e6f53bde694c773265919b4" }, "downloads": -1, "filename": "simple_crawler-0.2-py3-none-any.whl", "has_sig": false, "md5_digest": "d3d503511340371386be47c4150232ff", "packagetype": "bdist_wheel", "python_version": "py3", "requires_python": null, "size": 3449, "upload_time": "2019-04-12T03:37:17", "url": "https://files.pythonhosted.org/packages/bd/69/5573ee31e2c0b066ada6a9c9e9c3ff9f359ae8742f4e1fc4c654881d8849/simple_crawler-0.2-py3-none-any.whl" } ], "0.3": [ { "comment_text": "", "digests": { "md5": "c313e050b45c519dbeeb3c8a6752d4e8", "sha256": "9ea7ac1b5e7fe826d66c7bc126a437931be9450d2742ba51d2d9ed3bd83ce6b0" }, "downloads": -1, "filename": "simple_crawler-0.3-py3-none-any.whl", "has_sig": false, "md5_digest": "c313e050b45c519dbeeb3c8a6752d4e8", "packagetype": "bdist_wheel", "python_version": "py3", "requires_python": null, "size": 3428, "upload_time": "2019-04-12T04:09:34", "url": 
"https://files.pythonhosted.org/packages/d1/29/03a5dea9d35361bf43931f7dbe3cf63c2d77d0a688b9c5e8749421468e65/simple_crawler-0.3-py3-none-any.whl" } ], "1.0": [ { "comment_text": "", "digests": { "md5": "161015b60a0c2513dfeb6a456c9e7ddc", "sha256": "8e75cd18d07a478c0c5a17218c4097826fcf87c4d25da855a34f32c1e9bcc8d5" }, "downloads": -1, "filename": "simple_crawler-1.0-py3-none-any.whl", "has_sig": false, "md5_digest": "161015b60a0c2513dfeb6a456c9e7ddc", "packagetype": "bdist_wheel", "python_version": "py3", "requires_python": null, "size": 4015, "upload_time": "2019-04-12T10:56:31", "url": "https://files.pythonhosted.org/packages/21/bd/e934c6b01d2f8b9c8c148d5eb6527965aaf2533cecd8ca4983464d9891de/simple_crawler-1.0-py3-none-any.whl" } ], "1.1": [ { "comment_text": "", "digests": { "md5": "d9c3185abc00a9361e2d4b43a5176ea6", "sha256": "27a6b6dab8b6d39a3f41efa6e3474f60f92040a2ca6ced4a173e38ac89dcc0c1" }, "downloads": -1, "filename": "simple_crawler-1.1-py3-none-any.whl", "has_sig": false, "md5_digest": "d9c3185abc00a9361e2d4b43a5176ea6", "packagetype": "bdist_wheel", "python_version": "py3", "requires_python": null, "size": 4017, "upload_time": "2019-04-12T10:59:00", "url": "https://files.pythonhosted.org/packages/fb/79/4c639a0b0ef2c55e87aaaef5ae8cfa15509af977c71f10a3fc1916748f81/simple_crawler-1.1-py3-none-any.whl" } ], "1.2": [ { "comment_text": "", "digests": { "md5": "0b3e4430c45c054d877cf6fc701a649c", "sha256": "f61142598cf572cdc323aa12c1bc6619ad033f21254e9986e7acefde07c035bc" }, "downloads": -1, "filename": "simple_crawler-1.2-py3-none-any.whl", "has_sig": false, "md5_digest": "0b3e4430c45c054d877cf6fc701a649c", "packagetype": "bdist_wheel", "python_version": "py3", "requires_python": null, "size": 4021, "upload_time": "2019-04-12T11:03:12", "url": "https://files.pythonhosted.org/packages/fd/58/c4e510ff908d0bd3ee3f457f53960e8059ed37b425a8458bd11e511366aa/simple_crawler-1.2-py3-none-any.whl" } ] }, "urls": [ { "comment_text": "", "digests": { "md5": 
"0b3e4430c45c054d877cf6fc701a649c", "sha256": "f61142598cf572cdc323aa12c1bc6619ad033f21254e9986e7acefde07c035bc" }, "downloads": -1, "filename": "simple_crawler-1.2-py3-none-any.whl", "has_sig": false, "md5_digest": "0b3e4430c45c054d877cf6fc701a649c", "packagetype": "bdist_wheel", "python_version": "py3", "requires_python": null, "size": 4021, "upload_time": "2019-04-12T11:03:12", "url": "https://files.pythonhosted.org/packages/fd/58/c4e510ff908d0bd3ee3f457f53960e8059ed37b425a8458bd11e511366aa/simple_crawler-1.2-py3-none-any.whl" } ] }