{ "info": { "author": "CyberZHG", "author_email": "CyberZHG@gmail.com", "bugtrack_url": null, "classifiers": [ "License :: OSI Approved :: MIT License", "Operating System :: OS Independent", "Programming Language :: Python :: 3" ], "description": "# Wiki-Dump Reader\n\n[![Travis](https://travis-ci.org/CyberZHG/wiki-dump-reader.svg)](https://travis-ci.org/CyberZHG/wiki-dump-reader)\n[![Coverage](https://coveralls.io/repos/github/CyberZHG/wiki-dump-reader/badge.svg?branch=master)](https://coveralls.io/github/CyberZHG/wiki-dump-reader)\n\nExtract corpora from wiki-dump.\n\n## Install\n\n```bash\npip install wiki-dump-reader\n```\n\n## Usage\n\nThe dump file `*wiki-*-pages-articles.xml` should be downloaded first. Then you can iterate and get cleaned text from the text:\n\n```python\nfrom wiki_dump_reader import Cleaner, iterate\n\ncleaner = Cleaner()\nfor title, text in iterate('*wiki-*-pages-articles.xml'):\n text = cleaner.clean_text(text)\n cleaned_text, links = cleaner.build_links(text)\n```\n\nJust ignore `links` if you don't need them:\n\n```\ncleaned_text, _ = cleaner.build_links(text)\n```\n\nSee [examples](tests/targets) for an intuitive feeling.", "description_content_type": "text/markdown", "docs_url": null, "download_url": "", "downloads": { "last_day": -1, "last_month": -1, "last_week": -1 }, "home_page": "https://github.com/CyberZHG/wiki-dump-reader", "keywords": "", "license": "MIT", "maintainer": "", "maintainer_email": "", "name": "wiki-dump-reader", "package_url": "https://pypi.org/project/wiki-dump-reader/", "platform": "", "project_url": "https://pypi.org/project/wiki-dump-reader/", "project_urls": { "Homepage": "https://github.com/CyberZHG/wiki-dump-reader" }, "release_url": "https://pypi.org/project/wiki-dump-reader/0.0.4/", "requires_dist": null, "requires_python": "", "summary": "Extract corpora from Wikipedia dumps", "version": "0.0.4" }, "last_serial": 4766833, "releases": { "0.0.1": [ { "comment_text": "", "digests": { "md5": "88210b0ccf12236da3f26bd9ef430789", "sha256": "17806f5f1c1d61f15ee1261a6235eb64843d272e2528cdb53b78ab1191a6cf33" }, "downloads": -1, "filename": "wiki-dump-reader-0.0.1.tar.gz", "has_sig": false, "md5_digest": "88210b0ccf12236da3f26bd9ef430789", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 2981, "upload_time": "2018-08-07T03:32:02", "url": "https://files.pythonhosted.org/packages/2d/0e/db30299c694b6b2c3821f81ea24384028787d3cac8b38c7b08332b4a93be/wiki-dump-reader-0.0.1.tar.gz" } ], "0.0.2": [ { "comment_text": "", "digests": { "md5": "6b12ac4fe14e00048c057a29ff210532", "sha256": "66d65eacaaafd706ec0acf7fcedd6de5a6f77a3aee863894d324064f1ab97d47" }, "downloads": -1, "filename": "wiki-dump-reader-0.0.2.tar.gz", "has_sig": false, "md5_digest": "6b12ac4fe14e00048c057a29ff210532", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 3311, "upload_time": "2018-08-07T04:03:26", "url": "https://files.pythonhosted.org/packages/e4/a3/53e2d77ba23d8744a4795674692e3f068ea0add8422308c6953d6c0539a0/wiki-dump-reader-0.0.2.tar.gz" } ], "0.0.3": [ { "comment_text": "", "digests": { "md5": "ada1ba472fbf6695b6e30c31775f0312", "sha256": "7328f09353385206e899dd3b0981a2da42841c65aef70f995723d6bda72277d1" }, "downloads": -1, "filename": "wiki-dump-reader-0.0.3.tar.gz", "has_sig": false, "md5_digest": "ada1ba472fbf6695b6e30c31775f0312", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 3309, "upload_time": "2018-08-15T07:45:42", "url": "https://files.pythonhosted.org/packages/2f/62/4fcb9bc186b372165316dbf91c643ce5cba00587473db0e4342af6538804/wiki-dump-reader-0.0.3.tar.gz" } ], "0.0.4": [ { "comment_text": "", "digests": { "md5": "1ee0b84ddb642e703579a1754e69599c", "sha256": "86532997c6870b46182eed6c461049dfebaea37d6f59f90d7ff5bcd4d85db04d" }, "downloads": -1, "filename": "wiki-dump-reader-0.0.4.tar.gz", "has_sig": false, "md5_digest": "1ee0b84ddb642e703579a1754e69599c", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 3416, "upload_time": "2019-02-01T04:12:22", "url": "https://files.pythonhosted.org/packages/ab/79/e70b9c27a3038bad28448e8183ee59a248d968aeb942dff94d61dcf10c45/wiki-dump-reader-0.0.4.tar.gz" } ] }, "urls": [ { "comment_text": "", "digests": { "md5": "1ee0b84ddb642e703579a1754e69599c", "sha256": "86532997c6870b46182eed6c461049dfebaea37d6f59f90d7ff5bcd4d85db04d" }, "downloads": -1, "filename": "wiki-dump-reader-0.0.4.tar.gz", "has_sig": false, "md5_digest": "1ee0b84ddb642e703579a1754e69599c", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 3416, "upload_time": "2019-02-01T04:12:22", "url": "https://files.pythonhosted.org/packages/ab/79/e70b9c27a3038bad28448e8183ee59a248d968aeb942dff94d61dcf10c45/wiki-dump-reader-0.0.4.tar.gz" } ] }