{ "info": { "author": "Kenta Kase", "author_email": "kesin1202000@gmail.com", "bugtrack_url": null, "classifiers": [ "Development Status :: 3 - Alpha", "Intended Audience :: Developers", "Intended Audience :: Information Technology", "Intended Audience :: Science/Research", "License :: OSI Approved :: MIT License", "Natural Language :: Japanese", "Programming Language :: Python :: 3.2", "Programming Language :: Python :: 3.3", "Programming Language :: Python :: 3.4", "Programming Language :: Python :: 3.5", "Topic :: Scientific/Engineering :: Artificial Intelligence", "Topic :: System :: Operating System Kernels :: Linux", "Topic :: Text Processing", "Topic :: Text Processing :: Filters" ], "description": ".. image:: https://travis-ci.org/Kesin11/JpTokenPreprocessing.svg?branch=master\n :target: https://travis-ci.org/Kesin11/JpTokenPreprocessing\n\n\n====================================================\nJpTokenPreprocessing -- Japanese Token Preprocessing\n====================================================\n\nJpTokenPreprocessing is a Python library for token preprocessing. It supports filtering noise (e.g. tokens that are too short, number-only, or symbol-only) and normalizing (alphabet case and Unicode normalization).\nThese are common preprocessing steps for natural language processing (NLP).\n\nUsage\n====================================\n\n.. 
code-block :: python\n\n #coding: utf-8\n # Python3\n from jp_token_preprocessing import JpTokenPreprocessing\n import MeCab\n\n # Return japanese word tokens using morphological analyzer MeCab.\n # And select only noun.\n def tokenize(text):\n tagger = MeCab.Tagger()\n node = tagger.parseToNode(text)\n while node:\n if '\u540d\u8a5e' in node.feature:\n surface = node.surface\n yield surface\n node = node.next\n\n if __name__=='__main__':\n text = \"\"\"\n \u3053\u308c\u306f\u81ea\u7136\u8a00\u8a9e\u51e6\u7406\u306b\u5fc5\u9808\u306a\u524d\u51e6\u7406\u306e\u305f\u3081\u306e\u30e2\u30b8\u30e5\u30fc\u30eb\u3067\u3059\u3002\n \u5f62\u614b\u7d20\u89e3\u6790\u3084\u3001n-gram\u3067\u30c8\u30fc\u30af\u30f3\u5316\u3057\u305f\u5f8c\u306e\u30d5\u30a3\u30eb\u30bf\u30ea\u30f3\u30b0\u3001\u6b63\u898f\u5316\u3092\u88dc\u52a9\u3057\u307e\u3059\u3002\n \u4e00\u8a9e\u3060\u3051\u306e\u30c8\u30fc\u30af\u30f3\u3084'1234'\u306e\u3088\u3046\u306a\u6570\u5b57\u3060\u3051\u306e\u30c8\u30fc\u30af\u30f3\u3001'!!'\u306e\u3088\u3046\u306a\u8a18\u53f7\u3060\u3051\u306e\u30c8\u30fc\u30af\u30f3\u306e\u30d5\u30a3\u30eb\u30bf\u30ea\u30f3\u30b0\u3001\n \u5168\u89d2\u6587\u5b57'\uff30\uff39\uff34\uff28\uff2f\uff2e'\u306e\u534a\u89d2\u5316\u3001\u82f1\u5358\u8a9e'Word'\u306e\u5c0f\u6587\u5b57\u5316\u3068\u3044\u3063\u305f\u6b63\u898f\u5316\u3082\u884c\u3048\u307e\u3059\u3002\n \u3055\u3089\u306b\u5fc5\u305a\u9664\u5916\u3057\u305f\u3044\u30c8\u30fc\u30af\u30f3\u3092\u30b9\u30c8\u30c3\u30d7\u30ef\u30fc\u30c9\u306b\u8a2d\u5b9a\u3059\u308b\u3053\u3068\u3082\u3067\u304d\u307e\u3059\u3002\n \"\"\"\n stopwords = ['\u3053\u308c', '\u3053\u3068']\n\n tokens = tokenize(text)\n \"\"\"\n >>> print(list(tokens))\n\n ['', '', '\u8a00\u8a9e', '\u51e6\u7406', '\u5fc5\u9808', '\u524d', '\u51e6\u7406', '\u305f\u3081', '\u30e2\u30b8\u30e5\u30fc\u30eb', '\u5f62\u614b\u7d20',\n '\u89e3\u6790', 'n', '-', 'gram', '\u30c8\u30fc', '\u30af\u30f3', '\u5316', '\u5f8c', '\u30d5\u30a3\u30eb\u30bf', '\u30ea\u30f3\u30b0', 
'\u6b63\u898f',\n '\u5316', '\u88dc\u52a9', '\u4e00\u8a9e', '\u30c8\u30fc\u30af', '\u30f3', \"'\", '1234', \"'\", '\u3088\u3046', '\u6570\u5b57','\u30c8\u30fc',\n '\u30af\u30f3', \"'!!'\", '\u3088\u3046', '\u8a18\u53f7', '\u30c8\u30fc', '\u30af\u30f3', '\u30d5\u30a3\u30eb\u30bf', '\u30ea\u30f3\u30b0', '\u5168\u89d2',\n '\u6587\u5b57', \"'\", '\uff30\uff39\uff34\uff28\uff2f\uff2e', \"'\", '\u534a\u89d2', '\u5316', '\u82f1\u5358\u8a9e', \"'\", 'Word',\"'\", '\u5c0f\u6587\u5b57',\n '\u5316', '\u6b63\u898f', '\u5316', '\u9664\u5916', '\u30c8\u30fc\u30af', '\u30f3', '\u30b9\u30c8\u30c3\u30d7', '\u30ef\u30fc\u30c9', '\u8a2d\u5b9a', '\u3053\u3068']\n \"\"\"\n\n tokens = tokenize(text)\n preprocessor = JpTokenPreprocessing(number=False,\n symbol=False,\n case='lower',\n unicode='NFKC',\n min_len=2,\n stopwords=stopwords)\n tokens = preprocessor.preprocessing(tokens)\n # Return iterator of tokens. Using list() for print sample.\n \"\"\"\n >>> print(list(tokens))\n ['\u8a00\u8a9e', '\u51e6\u7406', '\u5fc5\u9808', '\u51e6\u7406', '\u305f\u3081', '\u30e2\u30b8\u30e5\u30fc\u30eb', '\u5f62\u614b\u7d20', '\u89e3\u6790', 'gram',\n '\u30c8\u30fc', '\u30af\u30f3', '\u30d5\u30a3\u30eb\u30bf', '\u30ea\u30f3\u30b0', '\u6b63\u898f', '\u88dc\u52a9', '\u4e00\u8a9e', '\u30c8\u30fc\u30af', '\u3088\u3046',\n '\u6570\u5b57', '\u30c8\u30fc', '\u30af\u30f3', '\u3088\u3046', '\u8a18\u53f7', '\u30c8\u30fc', '\u30af\u30f3', '\u30d5\u30a3\u30eb\u30bf', '\u30ea\u30f3\u30b0',\n '\u5168\u89d2', '\u6587\u5b57', 'python', '\u534a\u89d2', '\u82f1\u5358\u8a9e', 'word', '\u5c0f\u6587\u5b57', '\u6b63\u898f', '\u9664\u5916',\n '\u30c8\u30fc\u30af', '\u30b9\u30c8\u30c3\u30d7', '\u30ef\u30fc\u30c9', '\u8a2d\u5b9a']\n \"\"\"\n\nInstallation\n====================================\n\n.. code-block :: bash\n\n pip install JpTokenPreprocessing\n\nMeCab for python3\n-----------------------------------\n\nPlease apply below patch for installing and using MeCab module with python3. 
(2014/09/07 MeCab 0.996)\n\nhttps://code.google.com/p/mecab/issues/detail?id=7\n\nMETHODS\n====================================\n\nJpTokenPreprocessing(args)\n-----------------------------------\n\n- number = BOOL (default: False)\n\n Allow tokens that consist only of numbers.\n\n- symbol = BOOL (default: False)\n\n Allow tokens that consist only of symbols.\n\n- case = 'lower' or 'upper' or 'capitalize'\n\n Normalize alphabet case.\n\n- unicode = 'NFC' or 'NFKC' or 'NFD' or 'NFKD' (default: 'NFKC')\n\n Normalize Unicode strings with unicodedata.normalize().\n\n- min_len = int (default: 2)\n\n Filter out tokens shorter than min_len characters. If min_len = 2, tokens that have only 1 or 0 characters are filtered out.\n\n- stopwords = list (default: [])\n\n Filter out any token that is contained in the stopword list.\n\n- JpTokenPreprocessing.preprocessing(iterable)\n\n Return an iterator of preprocessed tokens.\n\nFuture work\n====================================\n\n- Add hook points for extending the library with your own preprocessing.\n\nAuthors\n====================================\nKenta Kase kesin1202000@gmail.com\n\nLicense\n====================================\nMIT License", "description_content_type": null, "docs_url": null, "download_url": "https://github.com/Kesin11/JpTokenPreprocessing/archive/master.zip", "downloads": { "last_day": -1, "last_month": -1, "last_week": -1 }, "home_page": "https://github.com/Kesin11/JpTokenPreprocessing", "keywords": "NLP,natural language processing,token,japaneese", "license": "MIT", "maintainer": null, "maintainer_email": null, "name": "JpTokenPreprocessing", "package_url": "https://pypi.org/project/JpTokenPreprocessing/", "platform": "UNKNOWN", "project_url": "https://pypi.org/project/JpTokenPreprocessing/", "project_urls": { "Download": "https://github.com/Kesin11/JpTokenPreprocessing/archive/master.zip", "Homepage": "https://github.com/Kesin11/JpTokenPreprocessing" }, "release_url": "https://pypi.org/project/JpTokenPreprocessing/0.1.5a2/", "requires_dist": null, "requires_python": null, "summary": 
"JpTokenPreprocessing is Python library for token preprocessing.", "version": "0.1.5a2" }, "last_serial": 1764412, "releases": { "0.1.1a": [ { "comment_text": "", "digests": { "md5": "fb068edbca0b742e35767cc221fa08e3", "sha256": "524fda6d9520bc31085a75f09468acf02a87f88b0e103123549de28f98279a41" }, "downloads": -1, "filename": "JpTokenPreprocessing-0.1.1a.tar.gz", "has_sig": false, "md5_digest": "fb068edbca0b742e35767cc221fa08e3", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 4228, "upload_time": "2014-09-14T05:33:55", "url": "https://files.pythonhosted.org/packages/a5/69/7354695d1dbbc10643ae9940446a59b249c7334485aaa6901ad787a8caa7/JpTokenPreprocessing-0.1.1a.tar.gz" } ], "0.1.2a": [ { "comment_text": "", "digests": { "md5": "ad23b088321969be465275b2bf71b1e3", "sha256": "02c11055caaac13506fe494364a576be820c128ed686e461ccbb7868f2cd6916" }, "downloads": -1, "filename": "JpTokenPreprocessing-0.1.2a.tar.gz", "has_sig": false, "md5_digest": "ad23b088321969be465275b2bf71b1e3", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 4227, "upload_time": "2014-09-14T05:35:31", "url": "https://files.pythonhosted.org/packages/15/11/114c561707e71c5a8d02a6fc3b291efa4992f13f2104b18bd872b9fff149/JpTokenPreprocessing-0.1.2a.tar.gz" } ], "0.1.3a": [ { "comment_text": "", "digests": { "md5": "3a57eeec880d9848cd8cbf53e4650e11", "sha256": "565bd05bb7242d6ec5b30e42ca1c86e06be8178823c99f60727558da34e255dc" }, "downloads": -1, "filename": "JpTokenPreprocessing-0.1.3a.tar.gz", "has_sig": false, "md5_digest": "3a57eeec880d9848cd8cbf53e4650e11", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 4226, "upload_time": "2014-09-14T05:41:50", "url": "https://files.pythonhosted.org/packages/d9/11/8d507971a7d6576849a9671b9ae1b2513bf6546a6974c4010c999c2d2bed/JpTokenPreprocessing-0.1.3a.tar.gz" } ], "0.1.4a": [ { "comment_text": "", "digests": { "md5": "f9c918e9ca7fcd5419bac01a3e6e7517", "sha256": 
"3136835b68ca6901fd45af3772e3a0dff8f25b128ead44983666fb36ea2d9f62" }, "downloads": -1, "filename": "JpTokenPreprocessing-0.1.4a.tar.gz", "has_sig": false, "md5_digest": "f9c918e9ca7fcd5419bac01a3e6e7517", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 4234, "upload_time": "2014-09-14T05:46:04", "url": "https://files.pythonhosted.org/packages/c5/8e/82219626a18bdf5406215f7b494309a1699462a274a786dc65044e693d8e/JpTokenPreprocessing-0.1.4a.tar.gz" } ], "0.1.5a": [ { "comment_text": "", "digests": { "md5": "27f5c7fa149cc6ade68d612d65a9a88b", "sha256": "bbe0fbd644b36a0a5d56a4864e566496a47f6aa2df7cfba94efe26f8a83ef2ea" }, "downloads": -1, "filename": "JpTokenPreprocessing-0.1.5a.tar.gz", "has_sig": false, "md5_digest": "27f5c7fa149cc6ade68d612d65a9a88b", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 4231, "upload_time": "2014-09-14T05:59:23", "url": "https://files.pythonhosted.org/packages/13/d5/32888ad6560926cbe1cd7eab8d975d8165a4c069c25dae5b4d47a40f2a05/JpTokenPreprocessing-0.1.5a.tar.gz" } ], "0.1.5a2": [ { "comment_text": "", "digests": { "md5": "715b704f4992e85162806a33636c88ca", "sha256": "b3c4d4520cf676f2fb236aed302195c4761751569332ebd11f2ee3ab07766f85" }, "downloads": -1, "filename": "JpTokenPreprocessing-0.1.5a2.tar.gz", "has_sig": false, "md5_digest": "715b704f4992e85162806a33636c88ca", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 4264, "upload_time": "2015-10-12T13:53:21", "url": "https://files.pythonhosted.org/packages/b9/cb/c5c3d000513afaad2c8b2216da83cd64cc4a7b1801d10d9a80f6bc607559/JpTokenPreprocessing-0.1.5a2.tar.gz" } ], "0.1a": [ { "comment_text": "", "digests": { "md5": "29aae6dd1ccd996991fd3c69ef5b89f5", "sha256": "162cc9fedb4229e6ee0274eb8fa5eec130bd2e229c57459322dee70c7462e09a" }, "downloads": -1, "filename": "JpTokenPreprocessing-0.1a.tar.gz", "has_sig": false, "md5_digest": "29aae6dd1ccd996991fd3c69ef5b89f5", "packagetype": "sdist", 
"python_version": "source", "requires_python": null, "size": 3812, "upload_time": "2014-09-14T03:20:30", "url": "https://files.pythonhosted.org/packages/3c/8d/d870b2a0a9d89de139459c435475d32393d9ef0230868d6a440a7338c3ba/JpTokenPreprocessing-0.1a.tar.gz" } ] }, "urls": [ { "comment_text": "", "digests": { "md5": "715b704f4992e85162806a33636c88ca", "sha256": "b3c4d4520cf676f2fb236aed302195c4761751569332ebd11f2ee3ab07766f85" }, "downloads": -1, "filename": "JpTokenPreprocessing-0.1.5a2.tar.gz", "has_sig": false, "md5_digest": "715b704f4992e85162806a33636c88ca", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 4264, "upload_time": "2015-10-12T13:53:21", "url": "https://files.pythonhosted.org/packages/b9/cb/c5c3d000513afaad2c8b2216da83cd64cc4a7b1801d10d9a80f6bc607559/JpTokenPreprocessing-0.1.5a2.tar.gz" } ] }