{ "info": { "author": "Peng Shiyu", "author_email": "pengshiyuyx@gmail.com", "bugtrack_url": null, "classifiers": [ "License :: OSI Approved :: MIT License", "Operating System :: OS Independent", "Programming Language :: Python :: 3" ], "description": "PageParser\n==========\n\n|Build Status| |GitHub|\n\n\u9879\u76ee\u7b80\u4ecb\n--------\n\n\u9879\u76ee\u540d\u79f0\uff1a\u516d\u884c\u4ee3\u7801\u5199\u722c\u866b\n\n\u82f1\u6587\u540d\u79f0\uff1aPageParser\n\n\u9879\u76ee\u7b80\u4ecb\uff1a\u4e00\u4e2a\u722c\u866b\u4f7f\u7528\u7684\u7f51\u9875\u89e3\u6790\u5305\uff0c\u5b9e\u73b0\u6700\u5927\u9650\u5ea6\u7684\u4ee3\u7801\u590d\u7528\n\n\u9879\u76ee\u76ee\u6807\uff1a\u4e0d\u61c2\u7f51\u9875\u89e3\u6790\u4e5f\u80fd\u5199\u722c\u866b\n\n\u5b89\u88c5\u6a21\u5757\n--------\n\n::\n\n pip install page-parser\n\n\u6700\u5c0f\u9879\u76ee\u793a\u4f8b\uff1a\n\n.. code:: python\n\n import requests\n from page_parser import BaiduParser\n\n # 1\u3001\u4e0b\u8f7d\u7f51\u9875\n response = requests.get(\"https://www.baidu.com/\")\n html = response.content.decode(\"utf-8\")\n\n # 2\u3001\u89e3\u6790\u7f51\u9875\n items = BaiduParser.parse_index(html)\n\n # 3\u3001\u8f93\u51fa\u6570\u636e\n for item in items: print(item)\n # {'title': '\u767e\u5ea6\u4e00\u4e0b\uff0c\u4f60\u5c31\u77e5\u9053'}\n\n\u652f\u6301\u7f51\u9875\n--------\n\n+--------+------------+--------------------+-----------------------------------------------------+\n| \u5e8f\u53f7 | \u7f51\u7ad9 | \u7f51\u9875\u540d\u79f0 | \u7f51\u9875\u5730\u5740 |\n+========+============+====================+=====================================================+\n| 1 | \u767e\u5ea6 | \u4e3b\u9875 | https://www.baidu.com/ |\n+--------+------------+--------------------+-----------------------------------------------------+\n| 2 | \u8c46\u74e3 | \u7535\u5f71 \u6b63\u5728\u70ed\u6620 | https://movie.douban.com/ |\n+--------+------------+--------------------+-----------------------------------------------------+\n| 3 | \u62c9\u52fe | \u62db\u8058\u804c\u4f4d\u5217\u8868\u9875 | https://www.lagou.com/zhaopin/ |\n+--------+------------+--------------------+-----------------------------------------------------+\n| 4 | \u4f01\u67e5\u67e5 | \u878d\u8d44\u4e8b\u4ef6\u9875 | https://www.qichacha.com/elib\\_financing |\n+--------+------------+--------------------+-----------------------------------------------------+\n| 5 | \u897f\u523a\u4ee3\u7406 | \u4e3b\u9875 | http://www.xicidaili.com/ |\n+--------+------------+--------------------+-----------------------------------------------------+\n| 6 | \u897f\u523a\u4ee3\u7406 | \u56fd\u5185\u9ad8\u533f\u4ee3\u7406 | http://www.xicidaili.com/nn/ |\n+--------+------------+--------------------+-----------------------------------------------------+\n| 7 | \u897f\u523a\u4ee3\u7406 | \u56fd\u5185\u666e\u901a\u4ee3\u7406 | http://www.xicidaili.com/nt/ |\n+--------+------------+--------------------+-----------------------------------------------------+\n| 8 | \u897f\u523a\u4ee3\u7406 | \u56fd\u5185HTTPS\u4ee3\u7406 | http://www.xicidaili.com/wn/ |\n+--------+------------+--------------------+-----------------------------------------------------+\n| 9 | \u897f\u523a\u4ee3\u7406 | \u56fd\u5185HTTP\u4ee3\u7406 | http://www.xicidaili.com/wt/ |\n+--------+------------+--------------------+-----------------------------------------------------+\n| 10 | \u641c\u72d7\u641c\u7d22 | \u5fae\u4fe1\u516c\u4f17\u53f7\u641c\u7d22\u9875 | https://weixin.sogou.com/weixin?type=1&query=\u767e\u5ea6 |\n+--------+------------+--------------------+-----------------------------------------------------+\n| 11 | \u714e\u86cb\u7f51 | \u4e3b\u9875\u5217\u8868 | http://jandan.net/ |\n+--------+------------+--------------------+-----------------------------------------------------+\n| 12 | \u4f2f\u4e50\u5728\u7ebf | python\u680f\u76ee | http://python.jobbole.com/ |\n+--------+------------+--------------------+-----------------------------------------------------+\n\n\u4f7f\u7528\u793a\u4f8b\n--------\n\n.. code:: python\n\n # -*- coding: utf-8 -*-\n\n import requests\n from page_parser import BaiduParser\n\n # 1\u3001\u4e0b\u8f7d\u7f51\u9875\n url = \"https://www.baidu.com/\"\n response = requests.get(url)\n response.encoding = response.apparent_encoding\n\n # 2\u3001\u89e3\u6790\u7f51\u9875\n items = BaiduParser.parse_index(response.text)\n\n # 3\u3001\u8f93\u51fa\u6570\u636e\n for item in items:\n print(item)\n\n # {'title': '\u767e\u5ea6\u4e00\u4e0b\uff0c\u4f60\u5c31\u77e5\u9053'}\n\n\u7f51\u7edc\u722c\u866b\u5de5\u4f5c\u6d41\u7a0b\uff1a\n------------------\n\n::\n\n \u9875\u9762\u4e0b\u8f7d\u5668 -> \u9875\u9762\u89e3\u6790\u5668 -> \u6570\u636e\u5b58\u50a8\n\n``\u9875\u9762\u4e0b\u8f7d\u5668``: \u4e3b\u8981\u6d89\u53ca\u9632\u722c\u653b\u7834\uff0c\u65b9\u6cd5\u5404\u5f02\uff0c\u722c\u866b\u7684\u96be\u70b9\u4e5f\u5728\u6b64\n\n``\u9875\u9762\u89e3\u6790\u5668``:\n\u4e00\u822c\u9875\u9762\u5728\u4e00\u6bb5\u65f6\u95f4\u5185\u662f\u56fa\u5b9a\u7684\uff0c\u6bcf\u4e2a\u4eba\u4e0b\u8f7d\u9875\u9762\u540e\u90fd\u9700\u8981\u89e3\u6790\u51fa\u9875\u9762\u5185\u5bb9\uff0c\u5c5e\u4e8e\u91cd\u590d\u5de5\u4f5c\n\n``\u6570\u636e\u5b58\u50a8``: \u4e0d\u7ba1\u662f\u5b58\u50a8\u5230\u4ec0\u4e48\u6587\u4ef6\u6216\u6570\u636e\u5e93\uff0c\u4e3b\u8981\u770b\u4e1a\u52a1\u9700\u6c42\n\n\u6b64\u9879\u76ee\u5c31\u662f\u5c06\u8fd9\u9879\u5de5\u4f5c\u62bd\u79bb\u51fa\u6765\uff0c\u8ba9\u7f51\u7edc\u722c\u866b\u7a0b\u5e8f\u91cd\u70b9\u5173\u6ce8\u4e8e\uff1a\u7f51\u9875\u4e0b\u8f7d\uff0c\u800c\u4e0d\u662f\u91cd\u590d\u7684\u7f51\u9875\u89e3\u6790\n\n\u9879\u76ee\u8bf4\u660e\n--------\n\n\u6b64\u9879\u76ee\u53ef\u4ee5\u548cpython \u7684requests \u548cscrapy \u914d\u5408\u4f7f\u7528\n\n\u5f53\u7136\u5982\u679c\u8981\u548c\u5176\u4ed6\u7f16\u7a0b\u8bed\u8a00\u4f7f\u7528\uff0c\u53ef\u4ee5\u4f7f\u7528flask\u7b49\u7f51\u7edc\u6846\u67b6\u518d\u6b21\u5bf9\u6b64\u9879\u76ee\u8fdb\u884c\u5c01\u88c5\uff0c\u63d0\u4f9b\u7f51\u7edc\u63a5\u53e3\u5373\u53ef\n\n\u53d1\u8d77\u4eba\uff1amouday\n\n\u53d1\u8d77\u65f6\u95f4\uff1a2018-10-13\n\n\u9700\u8981\u66f4\u591a\u7684\u4eba\u4e00\u8d77\u6765\u7ef4\u62a4\n\n\u8d21\u732e\u4ee3\u7801\n--------\n\n\u8d21\u732e\u7684\u4ee3\u7801\u7edf\u4e00\u653e\u5165\u6587\u4ef6\u5939\uff1apage\\_parser\n\n\u4ee3\u7801\u793a\u4f8b\uff0c\u5982\u6ca1\u6709\u66f4\u597d\u7684\u7406\u7531\uff0c\u5e94\u8be5\u6309\u7167\u4e0b\u9762\u7684\u683c\u5f0f\uff0c\u4fbf\u4e8e\u4f7f\u7528\u8005\u8c03\u7528\n\nbaidu\\_parser.py\n\n.. code:: python\n\n\n # -*- coding: utf-8 -*-\n\n # @Date : 2018-10-13\n # @Author : Peng Shiyu\n\n from parsel import Selector\n\n\n class BaiduParser(object):\n \"\"\"\n \u767e\u5ea6\u7f51\uff1ahttps://www.baidu.com/\n \"\"\"\n\n @staticmethod\n def parse_index(html):\n \"\"\"\n \u89e3\u6790\u4e3b\u9875\uff1ahttps://www.baidu.com/\n 2018-10-13 pengshiyuyx@gmai.com\n :param html: {str} \u7f51\u9875\u6587\u672c\n :return: {iterator} \u62bd\u53d6\u7684\u5185\u5bb9\n \"\"\"\n sel = Selector(html)\n title = sel.css(\"title::text\").extract_first()\n item = {\n \"title\": title\n }\n yield item\n\n\n if __name__ == '__main__':\n import requests\n response = requests.get(\"https://www.baidu.com/\")\n response.encoding = response.apparent_encoding\n items = BaiduParser.parse_index(response.text)\n for item in items:\n print(item)\n\n # {'title': '\u767e\u5ea6\u4e00\u4e0b\uff0c\u4f60\u5c31\u77e5\u9053'}\n\n\u8bf4\u660e\uff1a\n------\n\n\u539f\u5219\uff1a\n~~~~~~\n\n1. \u6309\u7167\u7f51\u7ad9\u5206\u7c7b\u5efa\u7acb\u89e3\u6790\u7c7b\n\n2. \u89e3\u6790\u65b9\u6cd5\u5305\u542b\u5728\u89e3\u6790\u7c7b\u4e2d \u4e3a\u65b9\u4fbf\u8c03\u7528\u9700\u8981\u9759\u6001\u65b9\u6cd5\n\n3. \u56e0\u4e3a\u7f51\u9875\u89e3\u6790\u6709\u65f6\u6548\u6027\uff0c\u6240\u4ee5\u5fc5\u987b\\ ``\u6ce8\u660e\u65e5\u671f``\n\n\u547d\u540d\u89c4\u5219\uff1a\n~~~~~~~~~~\n\n\u4f8b\u5982:\n\n::\n\n \u6587\u4ef6\u540d\uff1abaidu_parser\n \u7c7b\u540d\uff1aBaiduParser\n \u65b9\u6cd5\u540d\uff1aparse_index\n\n\u5176\u4ed6\n~~~~\n\n1. \u5fc5\u8981\u7684\u4ee3\u7801\u6ce8\u91ca\n\n2. \u5fc5\u8981\u7684\u6d4b\u8bd5\u4ee3\u7801\n\n3. \u5176\u4ed6\u5fc5\u8981\u7684\u4ee3\u7801\n\n\u52a0\u5165\u6211\u4eec\n--------\n\n\u57fa\u672c\u8981\u6c42\n~~~~~~~~\n\n1. python\u7684\u57fa\u672c\u8bed\u6cd5 + \u9762\u5411\u5bf9\u8c61 + \u8fed\u4ee3\u5668\uff08yield\uff09\n2. \u638c\u63e1\u7684\u5e93\uff1arequests\u3001parsel\u3001scrapy\uff08\u4e86\u89e3\u5373\u53ef\uff09\n3. \u89e3\u6790\u5e93\u7edf\u4e00\u4f7f\u7528parsel\uff08\u57fa\u4e8expath\uff09\uff0c\u7b80\u5355\u9ad8\u6548\uff0c\u4e0escrapy\u65e0\u7f1d\u8854\u63a5\n4. \u4e0d\u592a\u61c2\u4e5f\u6ca1\u5173\u7cfb\uff0c\u81ea\u5df1\u770b\u53c2\u8003\u6587\u7ae0\uff0c\u53ea\u8981\u613f\u610f\u5b66\u5c31\u4f1a\uff0c\u77ac\u95f4\u63d0\u5347\u81ea\u5df1\n\n\u53c2\u8003\u6587\u7ae0\uff1a\n\n1. `Python\u7f16\u7a0b\uff1aclass\u7c7b\u9762\u5411\u5bf9\u8c61 `__\n\n2. `Python\u7f16\u7a0b\uff1a\u751f\u6210\u5668yield\u4e0eyield\n from\u533a\u522b\u7b80\u5355\u7406\u89e3 `__\n\n3. `Python\u722c\u866b\uff1arequests\u5e93\u57fa\u672c\u4f7f\u7528 `__\n\n4. `Python\u7f51\u7edc\u722c\u866b\u4e4bscrapy\u6846\u67b6 `__\n\n5. `Python\u722c\u866b\uff1axpath\u5e38\u7528\u65b9\u6cd5\u793a\u4f8b `__\n\n6. `python\u722c\u866b\uff1ascrapy\u6846\u67b6xpath\u548ccss\u9009\u62e9\u5668\u8bed\u6cd5 `__\n\n\u8054\u7cfb\u65b9\u5f0f\n~~~~~~~~\n\nPageParser QQ\u7fa4\u53f7: 932301512\n\n.. figure:: images/page-parser-min.jpeg\n :alt: \n\n.. |Build Status| image:: https://travis-ci.org/mouday/PageParser.svg?branch=master\n :target: https://travis-ci.org/mouday/PageParser\n.. |GitHub| image:: https://img.shields.io/github/license/mashape/apistatus.svg\n\n\n", "description_content_type": "", "docs_url": null, "download_url": "", "downloads": { "last_day": -1, "last_month": -1, "last_week": -1 }, "home_page": "https://github.com/mouday/PageParser", "keywords": "", "license": "", "maintainer": "", "maintainer_email": "", "name": "page-parser", "package_url": "https://pypi.org/project/page-parser/", "platform": "", "project_url": "https://pypi.org/project/page-parser/", "project_urls": { "Homepage": "https://github.com/mouday/PageParser" }, "release_url": "https://pypi.org/project/page-parser/0.0.4/", "requires_dist": [ "parsel (>=1.4.0)", "requests (>=2.18.4)" ], "requires_python": "", "summary": "web crawler or spider parse page", "version": "0.0.4" }, "last_serial": 4966287, "releases": { "0.0.1": [ { "comment_text": "", "digests": { "md5": "7fb555f80ca9c034e88e37873f8414b2", "sha256": "cf1486e545d0a16f02138ce2b9fc07f78c1f5ff2f2634ecac6451b54ddfef4a8" }, "downloads": -1, "filename": "page_parser-0.0.1-py3-none-any.whl", "has_sig": false, "md5_digest": "7fb555f80ca9c034e88e37873f8414b2", "packagetype": "bdist_wheel", "python_version": "py3", "requires_python": null, "size": 5066, "upload_time": "2018-10-15T07:57:21", "url": "https://files.pythonhosted.org/packages/1d/18/3f0ff2f366ba2495e6cfa1f15f623a3e6ce36cc1cf50f254768ba2784061/page_parser-0.0.1-py3-none-any.whl" }, { "comment_text": "", "digests": { "md5": "f138b0594c97070015ade754be9dbb15", "sha256": "86cfd50d015efe09072766deb188121b07c5be6916838830b2d13b8c23a2e690" }, "downloads": -1, "filename": "page-parser-0.0.1.tar.gz", "has_sig": false, "md5_digest": "f138b0594c97070015ade754be9dbb15", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 3263, "upload_time": "2018-10-15T07:57:23", "url": "https://files.pythonhosted.org/packages/ce/c1/6b611999aac0372c4b2fe28cab02b8ba796ac0ff0871874578e3bf9e27c0/page-parser-0.0.1.tar.gz" } ], "0.0.2": [ { "comment_text": "", "digests": { "md5": "3a3a235c893bfaf85ae1efd28536c9f8", "sha256": "c34f3104a557b51aa279019c7dd749b51b465fe70825ba865e220c47acc17b58" }, "downloads": -1, "filename": "page_parser-0.0.2-py3-none-any.whl", "has_sig": false, "md5_digest": "3a3a235c893bfaf85ae1efd28536c9f8", "packagetype": "bdist_wheel", "python_version": "py3", "requires_python": null, "size": 99894, "upload_time": "2018-10-15T08:32:34", "url": "https://files.pythonhosted.org/packages/8f/2d/76862275e2c041143dc5befb5fb7295a5308ba59dddb49f6dcaf49c59293/page_parser-0.0.2-py3-none-any.whl" }, { "comment_text": "", "digests": { "md5": "3c225fd10bcdf70e35b3580b16f08027", "sha256": "1874d3990a466bd03b09ffe4c977acabcbc81dda2b3fabf40907a7384ffc430f" }, "downloads": -1, "filename": "page_parser-0.0.2.tar.gz", "has_sig": false, "md5_digest": "3c225fd10bcdf70e35b3580b16f08027", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 96216, "upload_time": "2018-10-15T08:32:35", "url": "https://files.pythonhosted.org/packages/fe/ef/66e5f6c0656e25261b41767acb5f1750a02fa74db393910b6bd7d201b643/page_parser-0.0.2.tar.gz" } ], "0.0.3": [ { "comment_text": "", "digests": { "md5": "99a823c9f617d681e59b9f20d8d58847", "sha256": "cc2d96ea1ff8a5c413bffe7c421a164320b66340802a105acea75910de0d50b4" }, "downloads": -1, "filename": "page_parser-0.0.3-py3-none-any.whl", "has_sig": false, "md5_digest": "99a823c9f617d681e59b9f20d8d58847", "packagetype": "bdist_wheel", "python_version": "py3", "requires_python": null, "size": 32847, "upload_time": "2018-10-17T05:35:34", "url": "https://files.pythonhosted.org/packages/7b/d9/c6800f0f27c220a857b92c68295590a9b8aaf84dce8faedc8df96d6eb62a/page_parser-0.0.3-py3-none-any.whl" }, { "comment_text": "", "digests": { "md5": "c12b7ffe00f2353ab6d76e06453769e3", "sha256": "2c0738742b503fe603c404ff1bb50a41bbf1405b4e07c81660087125b234fde9" }, "downloads": -1, "filename": "page_parser-0.0.3.tar.gz", "has_sig": false, "md5_digest": "c12b7ffe00f2353ab6d76e06453769e3", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 26232, "upload_time": "2018-10-17T05:35:36", "url": "https://files.pythonhosted.org/packages/72/88/5ad59981d212f69883038ba267acf421b7dc5957c0e492d8bfcb490fbcd9/page_parser-0.0.3.tar.gz" } ], "0.0.4": [ { "comment_text": "", "digests": { "md5": "8bc631693a0ad573083a2073e67940bc", "sha256": "0679f154273e3f71773071c9651df2e3f07d999859c8a4eb8c492dcdbf897d48" }, "downloads": -1, "filename": "page_parser-0.0.4-py3-none-any.whl", "has_sig": false, "md5_digest": "8bc631693a0ad573083a2073e67940bc", "packagetype": "bdist_wheel", "python_version": "py3", "requires_python": null, "size": 14828, "upload_time": "2019-03-21T02:15:04", "url": "https://files.pythonhosted.org/packages/5c/ec/9640dfcbb0440a7bf94deb317615800768d57b2b27bb3fa9d0e8353296cb/page_parser-0.0.4-py3-none-any.whl" }, { "comment_text": "", "digests": { "md5": "d97c0d866d041b86cfb11928dd6ffed5", "sha256": "7bffbb1b502f9c0c7a260aefba280a700bea4c95310b05e2271863f9be1a731e" }, "downloads": -1, "filename": "page_parser-0.0.4.tar.gz", "has_sig": false, "md5_digest": "d97c0d866d041b86cfb11928dd6ffed5", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 7737, "upload_time": "2019-03-21T02:15:06", "url": "https://files.pythonhosted.org/packages/46/66/9a32790324fe241c3c4cee6eb9b9e3605ea83bcc312aa56e00958353e182/page_parser-0.0.4.tar.gz" } ] }, "urls": [ { "comment_text": "", "digests": { "md5": "8bc631693a0ad573083a2073e67940bc", "sha256": "0679f154273e3f71773071c9651df2e3f07d999859c8a4eb8c492dcdbf897d48" }, "downloads": -1, "filename": "page_parser-0.0.4-py3-none-any.whl", "has_sig": false, "md5_digest": "8bc631693a0ad573083a2073e67940bc", "packagetype": "bdist_wheel", "python_version": "py3", "requires_python": null, "size": 14828, "upload_time": "2019-03-21T02:15:04", "url": "https://files.pythonhosted.org/packages/5c/ec/9640dfcbb0440a7bf94deb317615800768d57b2b27bb3fa9d0e8353296cb/page_parser-0.0.4-py3-none-any.whl" }, { "comment_text": "", "digests": { "md5": "d97c0d866d041b86cfb11928dd6ffed5", "sha256": "7bffbb1b502f9c0c7a260aefba280a700bea4c95310b05e2271863f9be1a731e" }, "downloads": -1, "filename": "page_parser-0.0.4.tar.gz", "has_sig": false, "md5_digest": "d97c0d866d041b86cfb11928dd6ffed5", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 7737, "upload_time": "2019-03-21T02:15:06", "url": "https://files.pythonhosted.org/packages/46/66/9a32790324fe241c3c4cee6eb9b9e3605ea83bcc312aa56e00958353e182/page_parser-0.0.4.tar.gz" } ] }