{ "info": { "author": "Alexander Schepanovski", "author_email": "suor.web@gmail.com", "bugtrack_url": null, "classifiers": [ "Development Status :: 3 - Alpha", "Intended Audience :: Developers", "License :: OSI Approved :: BSD License", "Operating System :: OS Independent", "Programming Language :: Python", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.6", "Programming Language :: Python :: Implementation :: CPython", "Topic :: Software Development :: Libraries :: Python Modules" ], "description": "AioScrape\n=========\n\nA scraping library on top of `aiohttp `_ and `parsechain `_. Note that this is **alpha** software.\n\n\nInstallation\n-------------\n\n::\n\n pip install aioscrape\n\n\nUsage\n-----\n\n.. code:: python\n\n from aioscrape import run, fetch, settings\n from aioscrape.middleware import last_fetch, make_filecache\n from aioscrape.utils import SOME_HEADERS # To not look like a bot\n\n from urllib.parse import urljoin\n from parsechain import C\n from funcy import lcat, lconcat\n\n\n def main():\n # Settings are scoped and can be redefined later with another \"with\"\n cache = make_filecache('.fcache')\n with settings(headers=SOME_HEADERS, middleware=[cache, last_fetch]):\n print(run(scrape_all()))\n\n\n async def scrape_all():\n # All the settings in scope like headers and middleware are applied to fetch()\n start_page = await fetch(START_URL)\n\n # AioScrape integrates with parsechain to make extracting a breeze\n urls = start_page.css('.pagingLinks a').attrs('href')\n list_urls = [urljoin(start_page.url, page_url) for page_url in urls]\n\n # Using asyncio.wait() and friends to run requests in parallel\n list_pages = [start_page] + await wait_all(map(fetch, list_urls))\n\n # Scrape articles\n result = lcat(await wait_all(map(scrape_articles, list_pages)))\n write_to_csv('export.csv', result)\n\n\n async def scrape_articles(list_page):\n urls = list_page.css('#headlines .titleLink').attrs('href')\n abs_urls = [urljoin(list_page.url, url) for url in urls]\n return await wait_all(map(scrape_article, abs_urls))\n\n\n async def scrape_article(url):\n resp = await fetch(url)\n return resp.root.multi({\n 'url': C.const(resp.url),\n 'title': C.microdata('headline').first,\n 'date': C.microdata('datePublished').first,\n 'text': C.microdata('articleBody').first,\n 'contacts': C.css('.sidebars .contact p')\n .map(C.inner_html + html_to_text) + lconcat + ''.join,\n })\n\n\n if __name__ == '__main__':\n main()\n\n\nTODO\n----\n\n- Response.follow()\n- non-GET requests\n- work with forms\n", "description_content_type": "", "docs_url": null, "download_url": "", "downloads": { "last_day": -1, "last_month": -1, "last_week": -1 }, "home_page": "http://github.com/Suor/aioscrape", "keywords": "", "license": "BSD", "maintainer": "", "maintainer_email": "", "name": "aioscrape", "package_url": "https://pypi.org/project/aioscrape/", "platform": "", "project_url": "https://pypi.org/project/aioscrape/", "project_urls": { "Homepage": "http://github.com/Suor/aioscrape" }, "release_url": "https://pypi.org/project/aioscrape/0.0.2/", "requires_dist": null, "requires_python": "", "summary": "Async scraping library", "version": "0.0.2" }, "last_serial": 5255604, "releases": { "0.0.1": [ { "comment_text": "", "digests": { "md5": "9fbb67c8c65957c45910e2fd431d98dd", "sha256": "984c0da6775a04f4a18df6a5f95112a9794844f69fee5d503d18a77177b3b71b" }, "downloads": -1, "filename": "aioscrape-0.0.1-py2.py3-none-any.whl", "has_sig": false, "md5_digest": "9fbb67c8c65957c45910e2fd431d98dd", "packagetype": "bdist_wheel", "python_version": "2.7", "requires_python": null, "size": 6961, "upload_time": "2019-02-08T09:57:23", "url": "https://files.pythonhosted.org/packages/bb/c0/df53b8c8838fdb9fc7bc03f2204141520aec9705333c0f624f70eaa9cb80/aioscrape-0.0.1-py2.py3-none-any.whl" }, { "comment_text": "", "digests": { "md5": "44af30e822c0fd0f5fce4c37655eff15", "sha256": "a17fb6cd3b481b0946dc0bf28dbc25b6b36313ea97cab69e23b623b6a3eff99f" }, "downloads": -1, "filename": "aioscrape-0.0.1.tar.gz", "has_sig": false, "md5_digest": "44af30e822c0fd0f5fce4c37655eff15", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 4534, "upload_time": "2019-02-08T09:57:20", "url": "https://files.pythonhosted.org/packages/20/36/a88813b2a51f6f4799844afa6ee213a0ce6da6e1096fd47de6caa5144d1e/aioscrape-0.0.1.tar.gz" } ], "0.0.2": [ { "comment_text": "", "digests": { "md5": "1b9ee39a8e5e858f99a263390725143b", "sha256": "80e16ade3c66d571de176a59b44eba5b5762868dcb380839eed31a7d5dc8c6b7" }, "downloads": -1, "filename": "aioscrape-0.0.2-py2.py3-none-any.whl", "has_sig": false, "md5_digest": "1b9ee39a8e5e858f99a263390725143b", "packagetype": "bdist_wheel", "python_version": "2.7", "requires_python": null, "size": 9015, "upload_time": "2019-05-11T09:38:45", "url": "https://files.pythonhosted.org/packages/b0/b6/4e86c9d78cf1716e451d3cb70d2ff0e1ac48d2448ba01e81b3653a3c128d/aioscrape-0.0.2-py2.py3-none-any.whl" }, { "comment_text": "", "digests": { "md5": "16d964f39b6221fad8e177fc0525d810", "sha256": "0610acaf29b493b94cc7e9744f10ebbfc25d836e05e9100b2a28d50b1ab71bbd" }, "downloads": -1, "filename": "aioscrape-0.0.2.tar.gz", "has_sig": false, "md5_digest": "16d964f39b6221fad8e177fc0525d810", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 6478, "upload_time": "2019-05-11T09:38:43", "url": "https://files.pythonhosted.org/packages/ac/76/93fa57f4c0458a1dc1de563ab0ce6d40fada2cecc5df13abaa7e0afaefc1/aioscrape-0.0.2.tar.gz" } ] }, "urls": [ { "comment_text": "", "digests": { "md5": "1b9ee39a8e5e858f99a263390725143b", "sha256": "80e16ade3c66d571de176a59b44eba5b5762868dcb380839eed31a7d5dc8c6b7" }, "downloads": -1, "filename": "aioscrape-0.0.2-py2.py3-none-any.whl", "has_sig": false, "md5_digest": "1b9ee39a8e5e858f99a263390725143b", "packagetype": "bdist_wheel", "python_version": "2.7", "requires_python": null, "size": 9015, "upload_time": "2019-05-11T09:38:45", "url": "https://files.pythonhosted.org/packages/b0/b6/4e86c9d78cf1716e451d3cb70d2ff0e1ac48d2448ba01e81b3653a3c128d/aioscrape-0.0.2-py2.py3-none-any.whl" }, { "comment_text": "", "digests": { "md5": "16d964f39b6221fad8e177fc0525d810", "sha256": "0610acaf29b493b94cc7e9744f10ebbfc25d836e05e9100b2a28d50b1ab71bbd" }, "downloads": -1, "filename": "aioscrape-0.0.2.tar.gz", "has_sig": false, "md5_digest": "16d964f39b6221fad8e177fc0525d810", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 6478, "upload_time": "2019-05-11T09:38:43", "url": "https://files.pythonhosted.org/packages/ac/76/93fa57f4c0458a1dc1de563ab0ce6d40fada2cecc5df13abaa7e0afaefc1/aioscrape-0.0.2.tar.gz" } ] }