{ "info": { "author": "Tommy", "author_email": "tooooommy@163.com", "bugtrack_url": null, "classifiers": [ "License :: OSI Approved :: MIT License", "Programming Language :: Python", "Programming Language :: Python :: 3.6", "Programming Language :: Python :: Implementation :: CPython", "Programming Language :: Python :: Implementation :: PyPy" ], "description": "
\n\n
\n\n## \u63cf\u8ff0\nrequests_spider \u662f\u4e00\u4e2a\u8f7b\u91cf\u7ea7\u7684\u5f02\u6b65\u722c\u866b\u6846\u67b6\uff0c\u57fa\u4e8erequests_html\u8fdb\u884c\u4e8c\u6b21\u5f00\u53d1\uff0c\u7c7b\u4f3cflask\n\n## \u5b89\u88c5\npip install requests_spider\n\n## \u4f9d\u8d56\npython: > 3.6\nuvloop\nrequests_html\n\n## \u7528\u6cd5\n##### \u57fa\u7840\u4f8b\u5b50\n```python3\nimport json\nfrom spider import XField, Spider, Model, Response, Request\n\n\nclass Proxy(Model):\n ip = XField(rule='//tr[contains(@class, \"odd\")]/td[2]', first=False)\n port = XField(rule='//tr[contains(@class, \"odd\")]/td[3]', first=False)\n\n async def process(self, response: Response):\n with open('proxy1.txt', 'a+') as file:\n for result in self.merge():\n file.write(json.dumps(result) + '\\n')\n\n\nspider = Spider('proxy', workers=15)\nspider.domains = ['www.xicidaili.com']\nspider.init_requests = [\n Request(url='http://www.xicidaili.com/nn/{}'.format(x), model=Proxy) for x in range(1, 10)\n]\n\nspider.async_limit = 5\n\nif __name__ == '__main__':\n spider.run()\n\n```\n\u722c\u53d6\u4ee3\u7406\u7f51\u7ad9ip\n\n##### \u4e2d\u95f4\u7ec4\u4ef6\n```python3\nimport random\nimport re\n\nfrom spider import Spider, Request, XRequest, Model, XField, RField, Response, Field, asyncio\n\n# \u83b7\u53d6\u67d0\u4e2a\u7528\u6237\u7684\u6240\u6709\u7684\u89c6\u9891\u4fe1\u606f ===> \u83b7\u53d6aid / page\nvideos_url = \"https://space.bilibili.com/ajax/member/getSubmitVideos?mid={mid}\" \\\n \"&pagesize=30&tid=0&page={page}&keyword=&order=pubdate\"\n\n# \u67d0\u4e2a\u89c6\u9891\u63a8\u8350\u7684\u89c6\u9891 ===> \u83b7\u53d6aid\nrecommend_url = \"https://comment.bilibili.com/playtag,{cid}-{aid}?html5=1\"\n\n# \u7528\u6237\u4fe1\u606f post csrf/mid\nuser_url = \"https://space.bilibili.com/ajax/member/GetInfo\"\n\n# av\u9875\u9762\uff0c====> \u83b7\u53d6\u4e0b\u8f7d\u89c6\u9891\u7684url\u3001cid-aid, mid\nav_url = \"https://www.bilibili.com/video/av{aid}\"\n\n\nclass AV(Model):\n urls = RField(rule='\"url\":\"(.*?)\",\"backup_url\"', first=False)\n cid = RField(rule='cid=(.*?)&aid=')\n aid = RField(rule='&aid=(.*?)&pre_ad=')\n mid = RField(rule='\"owner\":{\"mid\":(.*?),')\n\n async def process(self, response: Response):\n print(self['urls'])\n print(self['cid'])\n print(self['aid'])\n print(self['mid'])\n print(self.json())\n if self['mid'] and self['aid'] and self['urls'] and self['cid']:\n # \u63a8\u8350\u89c6\u9891\n yield Request(url=recommend_url.format(cid=self['cid'], aid=self['aid']), model=Recommend)\n\n # \u7528\u6237\u4fe1\u606f\n yield Request(url=user_url, method='POST', data={'csrf': '', 'mid': self['mid']},\n model=UserInfo, not_filter=True)\n\n # \u4e0b\u8f7d\u89c6\u9891\n for order, url in enumerate(self['urls']):\n yield Request(url=url.replace('http', 'https'),\n meta={'name': self['aid'] + '_' + str(order)}, model=Video)\n\n\nclass UserInfo(Model):\n mid = Field()\n name = Field()\n sex = Field()\n rank = Field()\n face = Field()\n regtime = Field()\n birthday = Field()\n sign = Field()\n level_info = Field()\n\n async def process(self, response: Response):\n status = response.json().get('status')\n if status:\n data = response.json().get('data')\n for k in self.keys():\n if k in data:\n self[k] = data[k]\n with open('user_' + str(self['mid']) + '.txt', 'w') as f:\n f.write(self.dumps() + '\\n')\n\n\nclass Recommend(Model):\n\n async def process(self, response: Response):\n for data in response.json():\n yield Request(av_url.format(aid=data[1]), model=AV)\n\n\nclass VideoInfo(Model):\n\n async def process(self, response: Response):\n status = response.json().get('status')\n if status:\n data = response.json().get('data')\n pattern = 'mid=(\\d+?)&pagesize=30&tid=0&page=(\\d+?)&keyword=&order=pubdate'\n patn = re.findall(pattern, response.url)[0]\n print(patn)\n yield Request(url=videos_url.format(mid=patn[0], page=int(patn[1]) + 1), model=VideoInfo),\n for v in data['vlist']:\n yield Request(url=av_url.format(aid=v.get('aid')), model=AV)\n\n\nclass Video(Model):\n\n async def process(self, response: Response):\n file_name = response.current_request.meta.get('name')\n if file_name and response.status_code == 200:\n with open(file_name + '.mp4', 'wb') as f:\n for content in response.iter_content(chunk_size=512):\n f.write(content)\n f.flush()\n\n\nspider = Spider('bilibili', workers=5)\n\nspider.init_requests = [\n Request(url=videos_url.format(mid='35789774', page=1), model=VideoInfo),\n]\nspider.async_limit = 5\n\n\n@spider.Middleware('request')\nasync def test(request):\n print(request.url)\n if request.url.startswith('https://space.bilibili.com/'):\n request.info.update({'headers': {'Referer': 'https://space.bilibili.com/'}})\n else:\n request.info.update({'headers': {'Referer': 'https://bilibili.com/'}})\n\n asyncio.sleep(round(random.random() * 5))\n return request\n\n\nif __name__ == '__main__':\n spider.run()\n```\n\u722c\u53d6bilibili\u7528\u6237\u89c6\u9891\uff0c\u7528\u6237\u8d44\u6599\uff0c\u89c6\u9891\u8d44\u6599\uff0c\u5229\u7528\u4e2d\u95f4\u7ec4\u4ef6\u8fdb\u884c\u5207\u6362headers\n\n## API\n#### Spider\n\u7ee7\u627frequests_html\u7684HTMLSession\n\n- **Spider.async_limit**\n\n \u5229\u7528asyncio.Semaphore\u9650\u5236\u5e76\u53d1\u6570\u91cf\n\n\n- **Spider.queue_timeout**\n\n \u4ece\u961f\u5217\u83b7\u53d6\u6570\u636e\u65f6\u5019\u8d85\u65f6\u8bbe\u7f6e\n\n- **Spider.request_depth**\n\n \u8bf7\u6c42\u7684\u6df1\u5ea6\n\n- **Spider.init_requests**\n\n \u521d\u59cb\u5316\u8bf7\u6c42\n\n- **Spider.domains**\n\n \u722c\u53d6\u57df\u540d\u8bbe\u7f6e\n\n- **Spider.rules**\n\n \u4ece\u54cd\u5e94\u7684\u6570\u636e\u4e2d\u83b7\u53d6\u4e0b\u6b21\u8bf7\u6c42\u7684\u4fe1\u606f\uff0c\u5e76\u52a0\u5165\u961f\u5217\n\n- **Spider.Middleware**\n\n \u4e2d\u95f4\u7ec4\u4ef6\n Middleware('request'), request\u5165\u961f\u4e4b\u524d\u6267\u884c\uff0c\u8fd4\u56derequest, response, None\n Middleware('response'), response\u5165\u961f\u4e4b\u524d\uff0c\u8fd4\u56derequest, response, None\n\n\n##### Model\nModel\u7c7b\u4f3c\u4e00\u4e2a\u5b57\u5178\u7684\u6570\u636e\u6a21\u578b\n\n- **Model.keys**\n\n \u7c7b\u4f3c\u5b57\u5178\u7684keys\n\n- **Model.values**\n\n \u7c7b\u4f3c\u5b57\u5178\u7684values\n\n- **Model.items**\n\n \u7c7b\u4f3c\u5b57\u5178\u7684items\n\n- **Model.json**\n\n \u83b7\u53d6\u6240\u6709Field\u7684\u5b57\u5178\u5f62\u5f0f\n\n- **Model.dumps**\n\n \u83b7\u53d6\u6240\u6709\u7684Field\u7684\u5b57\u7b26\u4e32\n\n- **Model.merge**\n\n \u5f53\u6240\u6709\u7684Field\u4ece\u54cd\u5e94\u6570\u636e\u83b7\u53d6\u7684\u6570\u636e\u662f\u5217\u8868\u7684\u65f6\u5019\uff0c\u5c06\u83b7\u53d6\u7684\u5217\u8868\u5408\u5e76\u6210\u4e3ajson\u6570\u636e\n\n- **Model.process**\n\n \u5904\u7406\u54cd\u5e94\u6570\u636e\n\n\n##### Field\n\n- **Field**\n\n \u4e0d\u5904\u7406\u6216\u5f85\u5904\u7406\u6570\u636e\u9879\n\n- **XField**\n\n \u5229\u7528xpath\u4ece\u54cd\u5e94\u6570\u636e\u4e2d\u83b7\u53d6\u6570\u636e\n\n- **CField**\n\n \u5229\u7528css\u83b7\u53d6\u6570\u636e\n\n- **RField**\n\n \u5229\u7528\u6b63\u5219\u83b7\u53d6\u6570\u636e\n\n##### Request\n\n- **Request**\n \u6b63\u5e38\u7684\u8bf7\u6c42\n\n- **XRequest**\n\n \u5229\u7528xpath, \u7528\u4e8eSpider.rules\n\n- **RRequest**\n\n \u5229\u7528\u6b63\u5219\uff0c\u7528\u4e8eSpider.rules\n\n\n\n## \u4f8b\u5b50\nexamples\u76ee\u5f55\u4e0b\nbilibili.py \u722c\u53d6\u54d4\u54e9\u54d4\u54e9\u7528\u6237\u4fe1\u606f\u3001\u89c6\u9891\u4fe1\u606f\u548c\u89c6\u9891\nqidian.py \u722c\u53d6\u8d77\u70b9\u5c0f\u8bf4\u6708\u7968\u6392\u884c\u5305\u62ec\u8bc4\u5206\nproxy.py \u722c\u53d6\u4ee3\u7406ip\u7f51\u7ad9\u4ee3\u7406\npearvideo.py \u722c\u53d6\u68a8\u89c6\u9891\u7f51\u7ad9\u7684\u89c6\u9891\n\n# License\nMIT", "description_content_type": "", "docs_url": null, "download_url": "", "downloads": { "last_day": -1, "last_month": -1, "last_week": -1 }, "home_page": "https://github.com/Tooooomy/requests_spider", "keywords": "", "license": "MIT", "maintainer": "", "maintainer_email": "", "name": "requests_spider", "package_url": "https://pypi.org/project/requests_spider/", "platform": "all", "project_url": "https://pypi.org/project/requests_spider/", "project_urls": { "Homepage": "https://github.com/Tooooomy/requests_spider" }, "release_url": "https://pypi.org/project/requests_spider/0.0.8/", "requires_dist": null, "requires_python": "", "summary": "Web crawling framework like flask.", "version": "0.0.8" }, "last_serial": 4260355, "releases": { "0.0.4": [ { "comment_text": "", "digests": { "md5": "df96cb9eecd0f428c33b90ce87ff0d47", "sha256": "03223fdf6bc21bbf1e5bfb9758495249deca10149e37e85830195535fb374939" }, "downloads": -1, "filename": "requests_spider-0.0.4.tar.gz", "has_sig": false, "md5_digest": "df96cb9eecd0f428c33b90ce87ff0d47", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 12612, "upload_time": "2018-09-07T08:09:56", "url": "https://files.pythonhosted.org/packages/00/bf/e07d82bbfbdbe49e5d08f4c53684a95fe4b9dae671972f330d2a9768d07e/requests_spider-0.0.4.tar.gz" } ], "0.0.5": [ { "comment_text": "", "digests": { "md5": "04f1f1303c2fdda0ece325481d1af1ca", "sha256": "3f3b19131ab9833fc609003fa034d8994f47c4f68235e43533307cd5768db86e" }, "downloads": -1, "filename": "requests_spider-0.0.5.tar.gz", "has_sig": false, "md5_digest": "04f1f1303c2fdda0ece325481d1af1ca", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 12648, "upload_time": "2018-09-09T06:08:02", "url": "https://files.pythonhosted.org/packages/0c/d7/fbb9ee4d5df258156458c58bb2d677d1c1354b5b0637ab91199a4a860403/requests_spider-0.0.5.tar.gz" } ], "0.0.6": [ { "comment_text": "", "digests": { "md5": "931ca90b4e2c9e0b14c35c58366b6fd2", "sha256": "67c99c61583c860a848cd8060c07d51b6b0952ebb83ca53b77206cf3e7a27598" }, "downloads": -1, "filename": "requests_spider-0.0.6.tar.gz", "has_sig": false, "md5_digest": "931ca90b4e2c9e0b14c35c58366b6fd2", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 12486, "upload_time": "2018-09-10T10:06:20", "url": "https://files.pythonhosted.org/packages/c3/c1/7c1da76db49bf4ef6df90fc6eda8c2f1bfc87106bdf342381ec9c43fbb47/requests_spider-0.0.6.tar.gz" } ], "0.0.7": [ { "comment_text": "", "digests": { "md5": "248acdffd39e3f7116aae5b8ef30c3d1", "sha256": "b629aba23c4f4c5d1ddf5a28824cc7add3873b4d964baa0302f9fedafd5e4356" }, "downloads": -1, "filename": "requests_spider-0.0.7.tar.gz", "has_sig": false, "md5_digest": "248acdffd39e3f7116aae5b8ef30c3d1", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 12333, "upload_time": "2018-09-11T07:50:12", "url": "https://files.pythonhosted.org/packages/1f/57/75e41ffe80fb003ac0a319305327b8e2cad64d8f7c59a8885c87c8616a6c/requests_spider-0.0.7.tar.gz" } ], "0.0.8": [ { "comment_text": "", "digests": { "md5": "752bd64c80de32d4a25fc14793ca2056", "sha256": "4133cc986b7afd9e88258bdd89073b5fa9c0ce6344df40647e4b9b625461d012" }, "downloads": -1, "filename": "requests_spider-0.0.8.tar.gz", "has_sig": false, "md5_digest": "752bd64c80de32d4a25fc14793ca2056", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 12346, "upload_time": "2018-09-11T08:02:07", "url": "https://files.pythonhosted.org/packages/6b/db/9db353c19606b2aefb3c876f14fe9abd69b93c9dbd8c2f6e08f2c18c902b/requests_spider-0.0.8.tar.gz" } ] }, "urls": [ { "comment_text": "", "digests": { "md5": "752bd64c80de32d4a25fc14793ca2056", "sha256": "4133cc986b7afd9e88258bdd89073b5fa9c0ce6344df40647e4b9b625461d012" }, "downloads": -1, "filename": "requests_spider-0.0.8.tar.gz", "has_sig": false, "md5_digest": "752bd64c80de32d4a25fc14793ca2056", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 12346, "upload_time": "2018-09-11T08:02:07", "url": "https://files.pythonhosted.org/packages/6b/db/9db353c19606b2aefb3c876f14fe9abd69b93c9dbd8c2f6e08f2c18c902b/requests_spider-0.0.8.tar.gz" } ] }