{ "info": { "author": "huangzhen", "author_email": "huangzhen@baixing.com", "bugtrack_url": null, "classifiers": [ "Environment :: Web Environment", "Intended Audience :: Developers", "License :: OSI Approved :: MIT License", "Operating System :: MacOS", "Operating System :: Microsoft", "Operating System :: POSIX", "Operating System :: Unix", "Programming Language :: Python :: 2.6", "Programming Language :: Python :: 2.7", "Programming Language :: Python :: 3.3", "Programming Language :: Python :: 3.4", "Programming Language :: Python :: 3.5", "Programming Language :: Python :: 3.6", "Topic :: Internet", "Topic :: Multimedia :: Video", "Topic :: Software Development :: Libraries :: Python Modules" ], "description": "## Parser Engine \n\u4ee3\u53f7PE\uff0c\u4e3a[scrapy](https://scrapy.org/)\u6846\u67b6\u91cf\u8eab\u5b9a\u5236\u7684 **\"\u53ef\u914d\u7f6e\u5316\u7684\u54cd\u5e94\u89e3\u6790\u5668\u5f15\u64ce\"**\u3002\n\n\u4e3b\u8981\u652f\u6301\u4ee5\u4e0b\u7279\u6027\uff1a\n- [x] \u57fa\u4e8expath\u3001jsonpath\u7b49\u89c4\u5219\u89e3\u6790html\u548cjson\u683c\u5f0f\u7684http\u8bf7\u6c42\u54cd\u5e94\u4f53\n- [x] \u8f93\u51fa\u503c\u57fa\u4e8e`scrapy.Item`\uff0c\u81ea\u52a8\u5b9a\u4f4d\u5e76\u8fd4\u56de\u5b9e\u4f8b\u5316\u7684`Item`\n- [x] \u652f\u6301 **\u7236\u8282\u70b9**\u3001**\u5217\u8868for\u5faa\u73af\u89e3\u6790**\n- [x] \u5b57\u6bb5\u679a\u4e3e\u503c\u7684\u6620\u5c04\n- [x] \u8bbe\u7f6e\u5b57\u6bb5\u4e3a\u5fc5\u6709\u5b57\u6bb5\uff0c\u7f3a\u5931\u65f6\u4e22\u5f03\u6574\u4e2a`Item`\n- [x] \u5c06list\u7c7b\u578b\u7684\u5b57\u6bb5\u62fc\u63a5\u6210\u5b57\u7b26\u4e32\n\n### \u5b89\u88c5\n- \u5b89\u88c5\u6700\u65b0\u4f53\u9a8c\u7248: \n >`pip install git+https://github.com/Danceiny/parser_engine`\n \n- \u5b89\u88c5\u7a33\u5b9a\u7248\uff1a\n >`pip install -U parser_engine`\n\n### \u66f4\u65b0\u65e5\u5fd7\n\u8bf7\u79fb\u6b65[CHANGELOG.md](CHANGELOG.md)\u3002\n\n### 
\u901f\u89c8\n>\u5982\u4f55\u4f7f\u7528PE\u4ece\u96f6\u5f00\u59cb\u5feb\u901f\u7f16\u5199\u4e00\u4e2a\u7f51\u7ad9\u7684\u722c\u866b\uff0c\u5e76\u6301\u4e45\u5316\u6570\u636e\uff1f\u53ef\u79fb\u6b65[\u5feb\u901f\u5f00\u59cb](./TUTORIAL.md)\u3002\n\n- \u6781\u7b80\u7248\uff0c\u4f7f\u7528`CrawlSpider`\u7684rules\u673a\u5236\u3002\n```python\nfrom parser_engine import TemplateAnnotation\nfrom scrapy.spiders.crawl import CrawlSpider\n@TemplateAnnotation(tpls=\"demo\")\nclass DemoSpider4(CrawlSpider):\n name = \"demo4\"\n start_urls = [\n \"http://github.cannot.cc/baixing-helper\"\n ]\n```\n\n- \u4f7f\u7528scrapy_redis\uff0c\u89e3\u6790start_urls\u7684\u54cd\u5e94\u3002\n```python\nfrom parser_engine import TemplateAnnotation\nfrom parser_engine.clue.spider import ClueSpider\n@TemplateAnnotation(start_url_tpl=({\n \"name\": \"zhongguozhongqi_xiaoshouwangluo\",\n \"itemname\": \"HuocheDealerItem\",\n \"parent\": {\n \"xpath\": \"//tr[@class=\\\"bgcolor2\\\"]\"\n },\n \"fields\": [\n {\n \"key\": \"area\",\n \"xpath\": \"td[1]/text()\",\n \"value_type\": \"stripped_string\"\n }, {\n \"key\": \"leads_name\",\n \"xpath\": \"td[2]/text()\",\n \"value_type\": \"stripped_string\"\n }, {\n \"key\": \"address\",\n \"xpath\": \"td[3]/text()\",\n \"value_type\": \"stripped_string\"\n }, {\n \"key\": \"phone\",\n \"xpath\": \"td[5]/text()\",\n \"value_type\": \"stripped_string\"\n }\n ]\n}), channel='zhongguozhongqi', leads_src='\u4e2d\u56fd\u91cd\u6c7d')\nclass ZhongguozhongqiSpider(ClueSpider):\n name = 'zhongguozhongqi'\n def parse(self, response):\n items = self._parse_start_url(response)\n for item in items:\n phone = item.get('phone')\n if phone:\n item['phone'] = phone.replace('\u3001', ',')\n yield item\n self.finish_clue(response, len(items))\n```\n\n- \u4f7f\u7528scrapy_redis\uff0c\u7075\u6d3b\u8fd0\u7528\u591a\u79cdPE\u7279\u6027\u3002\n```python\nfrom parser_engine.clue.spider import ClueSpider\nfrom parser_engine import TemplateAnnotation\nfrom parser_engine.clue.items 
import ClueItem\nfrom parser_engine.request import TaskRequest\nfrom scrapy import Request\n@TemplateAnnotation(start_url_tpl=({\n \"name\": \"youka_shop_listing_api\",\n \"parent\": {\n \"json_key\": \"data\",\n },\n \"fields\": [{\n \"key\": \"totalPage\",\n \"json_key\": \"totalPage\",\n\n }, {\n \"key\": \"ids\",\n \"json_path\": \"dataList[*].id\"\n }]\n },),\n tpls=({\n \"name\": \"youka_shop_detail_api\",\n \"itemname\": \"HuocheDealerItem\",\n \"parent\": {\n \"json_key\": \"data\",\n },\n \"fields\": [{\n \"key\": \"company_type\",\n \"json_key\": \"category\",\n \"mapper\": {\n 1: \"\u4e8c\u624b\u8f66\u76f4\u8425\u5e97\",\n 2: \"4S\u5e97\"\n }\n }, {\n \"key\": \"dealer_id\",\n \"json_key\": \"id\",\n \"required\": 1,\n }, {\n \"key\": \"leads_name\",\n \"json_key\": \"shopName\",\n }, {\n \"key\": \"area\",\n \"json_path\": \"districtDto.districtName\",\n \"value_type\": \"singleton\"\n }, {\n \"key\": \"city\",\n \"json_path\": \"cityDto.cityName\",\n \"value_type\": \"singleton\"\n }, {\n \"key\": \"service_phone\",\n \"default_value\": \"\",\n }, {\n \"key\": \"wechat\",\n \"json_key\": \"wechat\",\n }, {\n \"key\": \"tags\",\n \"json_key\": \"tags\",\n \"join\": \",\"\n }]\n }), channel='youka', leads_src='\u4f18\u5361')\nclass YoukaSpider(ClueSpider):\n name = 'youka'\n custom_settings = {\n 'CONCURRENT_REQUESTS': 2,\n 'CONCURRENT_REQUESTS_PER_DOMAIN': 1\n }\n def parse(self, response):\n items = self._parse_start_url(response)\n meta = response.meta\n clue_id = meta.get('clue_id')\n from_url = response.request.url\n if meta.get('open_pages'):\n total_page = items[0]['totalPage']\n import re\n current_page = int(re.findall('page=(\\\\d+)', from_url)[0])\n for i in range(1, total_page + 1):\n if current_page == i:\n continue\n url = \"http://www.china2cv.com/truck-foton-web/api/shop/v1/getShopList?page=%d&pageSize=10\" % i\n yield ClueItem({\"project\": \"huoche\", \"spider\": self.name, \"req\": TaskRequest(\n url=url,\n meta={\"from_clue_id\": 
clue_id}\n )})\n for item in items:\n for id in item['ids']:\n r = Request(url=\"http://www.china2cv.com/truck-foton-web/api/shop/v1/getShopInfo?shopId=%d\" % int(id),\n callback=self._response_downloaded)\n r.meta.update(rule=0, from_clue_id=clue_id)\n yield r\n\n def process_results(self, response, results):\n for item in results:\n item['url'] = 'http://www.china2cv.com/storeDetail.html?typess=1&shopId=' + str(item['dealer_id'])\n return results\n```\n\n\u5b8c\u6574\u793a\u4f8b\u8bf7\u53c2\u8003\uff1a[examples](./examples)\u3002\n\n### \u539f\u7406\n- \u89e3\u6790\u5668\n >PE\u5411\u8c03\u7528\u65b9\u63d0\u4f9b\u4e00\u5957\u7b80\u5355\u3001\u6613\u61c2\u7684\u53c2\u6570\uff0c\u5b9e\u9645\u4f1a\u5c06\u5176`\u7f16\u8bd1`\u6210\u8f83\u4e3a\u590d\u6742\u7684xpath\u8868\u8fbe\u5f0f\uff0c\u518d\u501f\u52a9scrapy\u5c01\u88c5\u7684\u89e3\u6790\u5668\u5c06\u6240\u9700\u5185\u5bb9\u63d0\u53d6\u51fa\u6765\u3002\n\n- \u8fd4\u56de\u503c\n >\u901a\u8fc7\u914d\u7f6e`itemname`\u53c2\u6570\uff0cPE\u5c06`\u53cd\u5c04`\u5f97\u5230\u6240\u9700\u7684`Item`\u7c7b\uff0c\u6309\u7167\u914d\u7f6e\u7684\u6620\u5c04\u5173\u7cfb\uff0c\u4ece\u63d0\u53d6\u51fa\u7684\u503c\u521b\u5efa\u4e00\u4e2a`Item`\u5b9e\u4f8b\uff08\u6216\u8005\u591a\u4e2a\uff09\uff0c\u5e76\u8fd4\u56de\u4e00\u4e2a\u53ef\u8fed\u4ee3\u7684`Item`\u5b9e\u4f8b\u5217\u8868\u3002\n \n### \u5df2\u77e5\u95ee\u9898\n\n- \u5982\u679c\u63d0\u53d6\u89c4\u5219\u8f83\u4e3a\u590d\u6742\uff0c\u5efa\u8bae\u76f4\u63a5\u4f7f\u7528xpath\u548ccss\u53c2\u6570\uff0c\u56e0\u4e3aPE\u7684\u53c2\u6570\u7f16\u8bd1\u53ef\u80fd\u5b58\u5728\u95ee\u9898\u3002\n- \u5982\u679c\u5bf9\u6027\u80fd\u6709\u6bd4\u8f83\u5f3a\u7684\u9700\u6c42\uff0c\u4e0d\u5efa\u8bae\u4f7f\u7528\u3002\n\n### \u7279\u6027\u4ecb\u7ecd\n- [x] \u652f\u6301\u8868\u683c\u3001\u5217\u8868\u7b49\u5f62\u5f0f\u7684\u6279\u91cf\u89e3\u6790\n 
>\u901a\u8fc7\u5728\u6a21\u677f\u4e2d\u5b9a\u4e49\u4e00\u4e2a\u7236\u8282\u70b9\uff0c\u4ecehtml\u9875\u9762\u4e2d\u7684\u8868\u683c\u3001\u5217\u8868\u7b49\u7ec4\u4ef6\u4e2d\uff0c\u6279\u91cf\u6293\u53d6\u591a\u4e2a\u540c\u7c7bitem\n \n >\u7528\u6cd5\u793a\u4f8b\uff1a\u4f7f\u7528`{\"parent\": {}, \"fields\": []}`\u8fd9\u6837\u7684\u914d\u7f6e\uff0c\u5c06\u9996\u5148\u67e5\u627e\u5339\u914d`parent`\u7684\u8282\u70b9\uff0c\u7136\u540e\u904d\u5386\u5176\u6bcf\u4e2a\u5b50\u8282\u70b9\uff0c\u5bf9\u6bcf\u4e2a\u5b50\u8282\u70b9\u5e94\u7528`fields`\u89c4\u5219\uff0c\u751f\u6210\u4e00\u4e2aitem\u3002\n \n- [x] Clue Mechanism\n > \u57fa\u4e8escrapy_redis\u7684\u7ebf\u7d22\u673a\u5236\uff0c\u53ef\u6301\u4e45\u5316\uff08\u76ee\u524d\u652f\u6301mysql\uff09\u7ebf\u7d22\uff0c\u65b9\u4fbf\u8ffd\u8e2a\u3002\n\n- [x] \u503c\u6620\u5c04\n >\u4e00\u4e2a\u7b80\u5355\u7684\u9700\u6c42\u573a\u666f\uff1aAPI\u8fd4\u56de\u7684\u6027\u522b\u5b57\u6bb5\u662f0\u548c1\uff0c\u4f46\u662f\u9700\u8981\u5c06\u5176\u8f6c\u6362\u6210\"\u7537\"\u548c\"\u5973\"\u3002\n \n### \u5f85\u505a\u6e05\u5355\n- \u4f18\u5316\n - [ ] \u652f\u6301\u76f4\u63a5\u5728`Item`\u7684\u7c7b\u5b9a\u4e49\u4e2d\u5b9a\u4e49\u6a21\u677f\n >\u7528\u6cd5\u793a\u4f8b\uff1a\u539f\u6a21\u677f\u7684`itemname`\u53c2\u6570\u901a\u8fc7\u6ce8\u89e3\u4f20\u53c2\uff0c\u5176\u4ed6\u7684\u6a21\u677f\u53c2\u6570\u5b9a\u4e49\u5728`Item`\u7c7b\u4e2d\uff0c\u5982\u4e0b\u6240\u793a\u3002\n ```\n class MyItem(scrapy.Item):\n tpl = {\"parent\": {\"xpath\":\"//div[@id=\\\"contentDiv\\\"]//table/tbody/tr[position()>1]\"}\n name = Field(xpath=\"//a[@href]/text()\")\n ```\n\n### scrapy\u914d\u7f6e\u53c2\u6570\n\n- PARSER_ENGINE_CONFIG_FILE \n > \u6a21\u677f\u914d\u7f6e\u6587\u4ef6\u7684\u4f4d\u7f6e\u3002\u9ed8\u8ba4\u662f`parser_engine.json`\uff0c\u4e0e`scrapy.cfg`\u6587\u4ef6\u540c\u7ea7\u3002\n- MYSQL\n > MySQL\u914d\u7f6e\u4fe1\u606f\uff0cdict\u7c7b\u578b\uff0c\u5305\u542b\u4ee5\u4e0b\u5b57\u6bb5\uff1a\n - DATABASE\n - USER\n - PASSWORD\n - PORT 
\u9ed8\u8ba43306\n - HOST \u9ed8\u8ba4127.0.0.1\n\n\u4e0b\u9762\u7684\u5b57\u6bb5\u5728MYSQL\u914d\u7f6e\u7f3a\u5931\u65f6\u751f\u6548\uff1a\n- MYSQL_USER\n- MYSQL_PASSWORD\n- MYSQL_PORT\n- MYSQL_DATABASE\n\n### \u6a21\u677f\u53c2\u6570\n\u6a21\u677f\u914d\u7f6e\u6587\u4ef6\uff0c\u5982`parser_engine.json`\uff0c\u5176\u6784\u6210\u7ed3\u6784\u5982\u4e0b\uff1a\n```json\n{\n \"templates\": [\n {\n \"name\": \"tpl1\" \n },\n {\n \"name\": \"tpl2\"\n }\n ]\n\n}\n\n```\n\u6a21\u677f\u914d\u7f6e\u6587\u4ef6\u4e2d`templates`\u5217\u8868\u4e2d\u7684\u6bcf\u4e00\u9879\uff0c\u5373\u4e3a\u4e00\u4e2a\u6a21\u677f\u3002\n\n\u6240\u8c13\u6a21\u677f\uff0c\u5bf9\u5e94\u7684\u7c7b\u662f[PETemplate](./parser_engine/template.py)\uff0c\u5176\u6784\u6210\u7ed3\u6784\u5982\u4e0b\uff1a\n\n- name \u5fc5\u8981\u3002\u76f8\u5f53\u4e8e\u8be5\u6a21\u677f\u5728\u8be5\u6a21\u677f\u6587\u4ef6\u4e2d\u7684id\n- parent \u4e0d\u5fc5\u8981\u3002\u5982\u679c\u6307\u5b9a\uff0c\u5c06\u6309\u7167\u5176\u6307\u5b9a\u7684\u89c4\u5219\uff0c\u4ece\u54cd\u5e94\u4e2d\u53d6\u51fa\u67d0\u8282\u70b9\u4f5c\u4e3a\u540e\u7eed\u63d0\u53d6\u89c4\u5219\u7684\u6839\u8282\u70b9\u3002\n- itemname \u4e0d\u5fc5\u8981\u3002\u5982\u679c\u6307\u5b9a\uff0c\u5c06\u5c1d\u8bd5\u52a0\u8f7d\u5b9e\u4f8b\u5316\u8be5item\u7c7b\uff1b\u5982\u679c\u6ca1\u6709\u627e\u5230\u7c7b\u6216\u8005\u6ca1\u6709\u6307\u5b9a\uff0c\u5219\u8fd4\u56de`dict`\u7c7b\u578b\u7684\u539f\u59cb\u6570\u636e\u3002\n- extract_keys \u4e0d\u5fc5\u8981\u3002json\u683c\u5f0f\u4e13\u5c5e\uff0c\u7528\u4e8e\u76f4\u63a5\u4ece\u67d0\u4e2ajson\u5bf9\u8c61\u4e2d\u63d0\u53d6\u4e00\u7ec4\u952e\u503c\uff0c\u901a\u5e38\u53ef\u4ee5\u642d\u914d`parent`\u53c2\u6570\u4f7f\u7528\uff0c\u8f83\u4e3a\u9ad8\u6548\u3002\n- extract_keys_map \u4e0d\u5fc5\u8981\u3002\u7c7b\u4f3c`extract_keys`\uff0c\u9002\u7528\u4e8e\u9700\u8981\u8f6c\u6362\u539fjson\u5bf9\u8c61\u4e2d\u7684\u952e\u540d\u7684\u573a\u666f\u3002\n- fields 
\u4e0d\u5fc5\u8981\u3002\u662f\u4e00\u4e2a`\u5b57\u6bb5`\u7684\u6570\u7ec4\u3002\n\n\u6240\u8c13\u5b57\u6bb5\uff0c\u5bf9\u5e94\u7684\u7c7b\u662f[PEField](./parser_engine/template.py)\uff0c\u5c5e\u4e8ePE\u7684\u6838\u5fc3\uff0c\u5176\u6784\u6210\u7ed3\u6784\u5982\u4e0b\uff1a\n- key \u5fc5\u8981\u3002\u63d0\u53d6\u7ed3\u679c\u4fdd\u5b58\u7684\u952e\u540d\uff0c\u901a\u5e38\u662fitem\u7684\u67d0\u4e2a`Field`\u7684\u53d8\u91cf\u540d\u3002\n- xpath\n- css\n- tags \u4e0d\u5fc5\u8981\u3002\u4e00\u7ec4\u6709\u5e8f\u7684html\u6807\u7b7e\u3002\u5982`[\"div\",\"a\"]`\u4f1a\u88ab\u7ffb\u8bd1\u6210`//div/a`\u7684xpath\u3002\n- attr_name \u4e0d\u5fc5\u8981\u3002\u5bf9\u4e8ehtml\u6765\u8bf4\uff0c\u7ecf\u5e38\u9700\u8981\u83b7\u53d6\u67d0\u4e2atag\u7684\u67d0\u4e2a\u5c5e\u6027\u503c\u3002\n- attributes \u4e0d\u5fc5\u8981\u3002\u652f\u6301\u591a\u79cd\u7ed3\u6784\u3002\n - string \u76f4\u63a5\u4f5c\u4e3a`//div/a[{attributes}]`\u4e2d\u7684`{attributes}`\n - map `{\"type\": \"password\", \"id\": \"id1\"}` => `//div/a[@type=\\\"password\\\" and @id=\\\"id1\\\"]`\n - list `[[\"type\",\"=\",\"password\"],[\"id\", \"!=\",\"id1\"]]` => `//div/a[@type=\\\"password\\\" and @id!=\\\"id1\\\"]`\n- json_path \u4e0d\u5fc5\u8981\u3002`json_path`\u5b8c\u5168\u9075\u5faa[json_path\u534f\u8bae](https://goessner.net/articles/JsonPath/)\uff0c[json_path\u5728\u7ebf\u8c03\u8bd5](http://jsonpath.com/)\u3002\n- json_key \u4e0d\u5fc5\u8981\u3002\u76f4\u63a5\u4f5c\u4e3ajson\u53d6\u503c\u7684\u952e\u540d\u3002\n- value_type \u4e0d\u5fc5\u8981\u3002**\u4f46\u5f88\u6709\u7528**\uff0c\u4e3b\u8981\u662f\u56e0\u4e3a\u4e0d\u7ba1\u662fxpath\u8fd8\u662fjson_path\uff0c\u63d0\u53d6\u51fa\u6765\u90fd\u662f\u4e00\u4e2alist\uff0c\u5c3d\u7ba1\u6709\u65f6\u5b57\u6bb5\u660e\u660e\u662f\u5b8c\u5168\u786e\u5b9a\u7684\uff0c\u5982\u679c\u8bbe\u7f6evalue_type=`singleton`\uff0c\u5219PE\u5c06\u63d0\u53d6list\u7684\u7b2c\u4e00\u4e2a\u5143\u7d20\u3002\n- position \u4e0d\u5fc5\u8981\u3002\n- mapper 
\u4e0d\u5fc5\u8981\u3002dict\u7c7b\u578b\uff0c\u7528\u4e8e\u5b9e\u73b0\u503c\u6620\u5c04\uff0c\u4f8b\u5982\uff1a`{1: \"male\", 2: \"female\"}`\u3002\n- join \u4e0d\u5fc5\u8981\u3002str\u7c7b\u578b\uff0c\u7528\u4e8e\u5b9e\u73b0`','.join([])`\u3002\n\n### TemplateAnnotation\u53c2\u6570\u8bf4\u660e\nTemplateAnnotation\u6ce8\u89e3\u4e2d\u4f20\u8fdb\u6765\u7684\u53c2\u6570\uff0c\u9664\u4e86\u4e0b\u9762\u5217\u51fa\u7684\uff0c\u5176\u4ed6\u7684\u53c2\u6570\u90fd\u4f1a\u88ab\u585e\u5230\u8fd4\u56de\u503c\u4e2d\uff08\u5f53\u7136\uff0c\u5982\u679c\u901a\u8fc7\u5b9a\u4e49`itemname`\uff0c\u5b9e\u4f8b\u5316item\u7684\u65f6\u5019\u4f1a\u9759\u9ed8\u629b\u5f03\u90a3\u4e9b\u4e0d\u5c5e\u4e8eitem\u7684\u503c\uff09\u3002\n\n- start_url_tpl: \u6a21\u677f\u7684\u6570\u7ec4\uff0c\u6216\u8005\u6a21\u677fid\u7684\u6570\u7ec4\u3002\n >\u5bf9\u5e94\u4e8estart_urls\u7684\u6a21\u677f\uff0c\u4f1a\u751f\u6210\u4e00\u4e2a`_parse_start_url`\u65b9\u6cd5\u7ed1\u5b9a\u5230spider\u7c7b\u4e0a\uff0c\u8be5\u65b9\u6cd5\u6709\u4e24\u4e2a\u53c2\u6570\uff08\u4e0d\u5305\u62ecself\uff09\uff1a\n - response \n - tpl_index_or_id\uff0c\u9ed8\u8ba4\u662fNone\n \n- tpls: \u6a21\u677f\u7684\u6570\u7ec4\uff0c\u6216\u8005\u6a21\u677fid\u7684\u6570\u7ec4\n\n\u5177\u4f53\u8bf7\u53c2\u8003[decorator.py](./parser_engine/decorator.py)\u4e2d\u7684\u6ce8\u91ca\u53ca\u6e90\u4ee3\u7801\u3002\n\n#### 
Html\u683c\u5f0f\n\u4e3e\u4e2a\u7b80\u5355\u7684\u4f8b\u5b50\u3002\n\n\u76ee\u6807\uff1a\u6293\u53d6[\u6296\u97f3\u7528\u6237\u5173\u952e\u5b57\u641c\u7d22\u6293\u5305\u6570\u636e\u5206\u6790\u811a\u672c\u4f7f\u7528\u6307\u5357](http://github.cannot.cc/baixing-helper/\u6296\u97f3\u7528\u6237\u5173\u952e\u5b57\u641c\u7d22\u6293\u5305\u6570\u636e\u5206\u6790\u811a\u672c\u4f7f\u7528\u6307\u5357.html)\u9875\u9762\u7684\u51e0\u4e2a\u6b65\u9aa4\u6807\u9898\uff0c\u6bcf\u4e2a\u6b65\u9aa4\u662f\u4e00\u4e2a`h3`\u6807\u7b7e\uff0c\u6b65\u9aa4\u6807\u9898\u5728`id`\u5c5e\u6027\u91cc\uff0c\u5e76\u4e14\u9700\u8981\u53bb\u6389\u5f62\u5982`1-`\u7684\u524d\u7f00\u3002\n\u90a3\u4e48\u76f8\u5e94\u7684\u914d\u7f6e\u6587\u4ef6\u662f\uff1a\n```json\n {\n \"name\": \"demo\",\n \"fields\": [\n {\n \"dom_id\": null,\n \"_css\": null,\n \"xpath\": null,\n \"tags\": [\n \"h3\"\n ],\n \"classes\": [],\n \"attributes\": null,\n \"position\": null,\n \"key\": \"\u6b65\u9aa4\",\n \"value_type\": null,\n \"regexp\": \"[\\\\d]{1,2}-(\\\\w+)\",\n \"attr_name\": \"id\"\n }\n ]\n }\n```\n\u8f93\u51fa\n```\n{'\u6b65\u9aa4': ['\u51c6\u5907\u5de5\u4f5c', '\u627e\u5230\u7535\u8111\u7684ip\u5730\u5740\u548c\u7aef\u53e3', '\u786e\u4fdd\u624b\u673a\u4e0e\u7535\u8111\u5efa\u7acb\u8fde\u63a5', '\u6296\u97f3\u641c\u7d22\u5173\u952e\u8bcd', '\u6293\u5305\u6570\u636e\u5bfc\u51fa', '\u63d0\u53d6\u7528\u6237\u4fe1\u606f', '\u63a8\u8350\u5728\u7ebf\u8f6c\u6362\u5de5\u5177', 'python\u811a\u672c\u5bfc\u51fa']}\n```\n\n\u5982\u679c\u53ea\u9700\u8981\u7b2c\u4e8c\u4e2a\u6b65\u9aa4\uff0c\u5c06json\u914d\u7f6e\u4e2d\u7684`position`\u53c2\u6570\u6539\u4e3a`2`\uff0c\u5373\u53ef\u5f97\u5230\u5982\u4e0b\u8f93\u51fa\uff1a\n```\n{'\u6b65\u9aa4': ['\u627e\u5230\u7535\u8111\u7684ip\u5730\u5740\u548c\u7aef\u53e3']}\n```\n\n#### JSON\u683c\u5f0f\n```json\n {\n \"name\": \"json-api-demo\",\n \"fields\": [\n {\n \"key\": \"poi_id\",\n \"json_path\": \"$.pois[:1].id\"\n },\n {\n \"key\": \"\u5730\u540d\",\n \"json_path\": 
\"$.data.name\",\n \"value_type\": \"singleton\"\n },\n {\n \"key\": \"\u4e0b\u7ea7\",\n \"json_path\": \"$.data.children[*].name\"\n }\n ]\n }\n```\n\n\n\u7531\u4e8e`json_path`\u89e3\u6790\u603b\u662f\u8fd4\u56de\u4e00\u4e2alist\uff0c\u5bf9\u4e8e\u4e00\u4e9b\u786e\u5b9a\u7684\u5b57\u6bb5\uff0c\u6bd4\u5982\u901a\u8fc7\u8c03\u7528API`http://172.31.1.4:30815/api/dict/area/0?childrenDepth=1`\uff0c\u60f3\u62ff\u5230\u8be5\u5730\u533a\u7684name\u5b57\u6bb5\uff0c\u5219\u53ef\u4ee5\u8bbe\u7f6e`value_type`\u4e3a`singleton`\uff0c\u5219PE\u4f1a\u505a\u4e00\u6b21\u8f6c\u6362\u3002", "description_content_type": "text/markdown", "docs_url": null, "download_url": "", "downloads": { "last_day": -1, "last_month": -1, "last_week": -1 }, "home_page": "https://gitlab.baixing.cn/spider/parser_engine", "keywords": "", "license": "MIT", "maintainer": "", "maintainer_email": "", "name": "parser-engine", "package_url": "https://pypi.org/project/parser-engine/", "platform": "", "project_url": "https://pypi.org/project/parser-engine/", "project_urls": { "Homepage": "https://gitlab.baixing.cn/spider/parser_engine" }, "release_url": "https://pypi.org/project/parser-engine/0.1.4/", "requires_dist": null, "requires_python": "", "summary": "template-driven parser engine for scrapy", "version": "0.1.4" }, "last_serial": 5149258, "releases": { "0.1.2": [ { "comment_text": "", "digests": { "md5": "a3e4d6b91ccfa80c462ff81f79e63bce", "sha256": "b05d431b9247d7f7bb5352615aad6ebfd8ecdef1a344d0025fef963b69a290ae" }, "downloads": -1, "filename": "parser_engine-0.1.2-py3.6.egg", "has_sig": false, "md5_digest": "a3e4d6b91ccfa80c462ff81f79e63bce", "packagetype": "bdist_egg", "python_version": "3.6", "requires_python": null, "size": 60769, "upload_time": "2019-03-28T15:57:43", "url": "https://files.pythonhosted.org/packages/a9/18/78e2b550329b05e6f72e10b914302109064ac71890a048654ba04483eae1/parser_engine-0.1.2-py3.6.egg" } ], "0.1.4": [ { "comment_text": "", "digests": { "md5": 
"d54e17b9e35514eef9dac2fb9ef44961", "sha256": "97f39f3ec3c341782fc933751650e5a8aa36b63b0eff4c26b5d29dc664fd9d1e" }, "downloads": -1, "filename": "parser_engine-0.1.4-py3.6.egg", "has_sig": false, "md5_digest": "d54e17b9e35514eef9dac2fb9ef44961", "packagetype": "bdist_egg", "python_version": "3.6", "requires_python": null, "size": 63578, "upload_time": "2019-04-16T10:01:25", "url": "https://files.pythonhosted.org/packages/2b/ab/18d9295fc2c92889a98b10156919d4635bb1b8ccade4d4eef177cfd60f8b/parser_engine-0.1.4-py3.6.egg" } ] }, "urls": [ { "comment_text": "", "digests": { "md5": "d54e17b9e35514eef9dac2fb9ef44961", "sha256": "97f39f3ec3c341782fc933751650e5a8aa36b63b0eff4c26b5d29dc664fd9d1e" }, "downloads": -1, "filename": "parser_engine-0.1.4-py3.6.egg", "has_sig": false, "md5_digest": "d54e17b9e35514eef9dac2fb9ef44961", "packagetype": "bdist_egg", "python_version": "3.6", "requires_python": null, "size": 63578, "upload_time": "2019-04-16T10:01:25", "url": "https://files.pythonhosted.org/packages/2b/ab/18d9295fc2c92889a98b10156919d4635bb1b8ccade4d4eef177cfd60f8b/parser_engine-0.1.4-py3.6.egg" } ] }