{
    "info": {
        "author": "Kenta Kase",
        "author_email": "kesin1202000@gmail.com",
        "bugtrack_url": null,
        "classifiers": [
            "Development Status :: 3 - Alpha",
            "Intended Audience :: Developers",
            "Intended Audience :: Information Technology",
            "Intended Audience :: Science/Research",
            "License :: OSI Approved :: MIT License",
            "Natural Language :: Japanese",
            "Programming Language :: Python :: 3.2",
            "Programming Language :: Python :: 3.3",
            "Programming Language :: Python :: 3.4",
            "Programming Language :: Python :: 3.5",
            "Topic :: Scientific/Engineering :: Artificial Intelligence",
            "Topic :: System :: Operating System Kernels :: Linux",
            "Topic :: Text Processing",
            "Topic :: Text Processing :: Filters"
        ],
        "description": ".. image:: https://travis-ci.org/Kesin11/JpTokenPreprocessing.svg?branch=master\n    :target: https://travis-ci.org/Kesin11/JpTokenPreprocessing\n\n\n====================================================\nJpTokenPreprocessing -- Japanese Token Preprocessing\n====================================================\n\nJpTokenPreprocessing is a Python library for token preprocessing. It supports filtering noize (e.g. too short token, only number or only symbol token) and normalizing (support alphabet case and unicode normalize).\nThere are common preprocessing for natural language processing (NLP).\n\nUsage\n====================================\n\n.. code-block :: python\n\n    #coding: utf-8\n    # Python3\n    from jp_token_preprocessing import JpTokenPreprocessing\n    import MeCab\n\n    # Return japanese word tokens using morphological analyzer MeCab.\n    # And select only noun.\n    def tokenize(text):\n        tagger = MeCab.Tagger()\n        node = tagger.parseToNode(text)\n        while node:\n            if '\u540d\u8a5e' in node.feature:\n                surface = node.surface\n                yield surface\n            node = node.next\n\n    if __name__=='__main__':\n        text = \"\"\"\n        \u3053\u308c\u306f\u81ea\u7136\u8a00\u8a9e\u51e6\u7406\u306b\u5fc5\u9808\u306a\u524d\u51e6\u7406\u306e\u305f\u3081\u306e\u30e2\u30b8\u30e5\u30fc\u30eb\u3067\u3059\u3002\n        \u5f62\u614b\u7d20\u89e3\u6790\u3084\u3001n-gram\u3067\u30c8\u30fc\u30af\u30f3\u5316\u3057\u305f\u5f8c\u306e\u30d5\u30a3\u30eb\u30bf\u30ea\u30f3\u30b0\u3001\u6b63\u898f\u5316\u3092\u88dc\u52a9\u3057\u307e\u3059\u3002\n        \u4e00\u8a9e\u3060\u3051\u306e\u30c8\u30fc\u30af\u30f3\u3084'1234'\u306e\u3088\u3046\u306a\u6570\u5b57\u3060\u3051\u306e\u30c8\u30fc\u30af\u30f3\u3001'!!'\u306e\u3088\u3046\u306a\u8a18\u53f7\u3060\u3051\u306e\u30c8\u30fc\u30af\u30f3\u306e\u30d5\u30a3\u30eb\u30bf\u30ea\u30f3\u30b0\u3001\n        
\u5168\u89d2\u6587\u5b57'\uff30\uff39\uff34\uff28\uff2f\uff2e'\u306e\u534a\u89d2\u5316\u3001\u82f1\u5358\u8a9e'Word'\u306e\u5c0f\u6587\u5b57\u5316\u3068\u3044\u3063\u305f\u6b63\u898f\u5316\u3082\u884c\u3048\u307e\u3059\u3002\n        \u3055\u3089\u306b\u5fc5\u305a\u9664\u5916\u3057\u305f\u3044\u30c8\u30fc\u30af\u30f3\u3092\u30b9\u30c8\u30c3\u30d7\u30ef\u30fc\u30c9\u306b\u8a2d\u5b9a\u3059\u308b\u3053\u3068\u3082\u3067\u304d\u307e\u3059\u3002\n        \"\"\"\n        stopwords = ['\u3053\u308c', '\u3053\u3068']\n\n        tokens = tokenize(text)\n        \"\"\"\n        >>> print(list(tokens))\n\n        ['', '', '\u8a00\u8a9e', '\u51e6\u7406', '\u5fc5\u9808', '\u524d', '\u51e6\u7406', '\u305f\u3081', '\u30e2\u30b8\u30e5\u30fc\u30eb', '\u5f62\u614b\u7d20',\n        '\u89e3\u6790', 'n', '-', 'gram', '\u30c8\u30fc', '\u30af\u30f3', '\u5316', '\u5f8c', '\u30d5\u30a3\u30eb\u30bf', '\u30ea\u30f3\u30b0', '\u6b63\u898f',\n        '\u5316', '\u88dc\u52a9', '\u4e00\u8a9e', '\u30c8\u30fc\u30af', '\u30f3', \"'\", '1234', \"'\", '\u3088\u3046', '\u6570\u5b57','\u30c8\u30fc',\n        '\u30af\u30f3', \"'!!'\", '\u3088\u3046', '\u8a18\u53f7', '\u30c8\u30fc', '\u30af\u30f3', '\u30d5\u30a3\u30eb\u30bf', '\u30ea\u30f3\u30b0', '\u5168\u89d2',\n        '\u6587\u5b57', \"'\", '\uff30\uff39\uff34\uff28\uff2f\uff2e', \"'\", '\u534a\u89d2', '\u5316', '\u82f1\u5358\u8a9e', \"'\", 'Word',\"'\", '\u5c0f\u6587\u5b57',\n        '\u5316', '\u6b63\u898f', '\u5316', '\u9664\u5916', '\u30c8\u30fc\u30af', '\u30f3', '\u30b9\u30c8\u30c3\u30d7', '\u30ef\u30fc\u30c9', '\u8a2d\u5b9a', '\u3053\u3068']\n        \"\"\"\n\n        tokens = tokenize(text)\n        preprocessor = JpTokenPreprocessing(number=False,\n                                            symbol=False,\n                                            case='lower',\n                                            unicode='NFKC',\n                                            min_len=2,\n                                            
stopwords=stopwords)\n        tokens = preprocessor.preprocessing(tokens)\n        # Return iterator of tokens. Using list() for print sample.\n        \"\"\"\n        >>> print(list(tokens))\n        ['\u8a00\u8a9e', '\u51e6\u7406', '\u5fc5\u9808', '\u51e6\u7406', '\u305f\u3081', '\u30e2\u30b8\u30e5\u30fc\u30eb', '\u5f62\u614b\u7d20', '\u89e3\u6790', 'gram',\n        '\u30c8\u30fc', '\u30af\u30f3', '\u30d5\u30a3\u30eb\u30bf', '\u30ea\u30f3\u30b0', '\u6b63\u898f', '\u88dc\u52a9', '\u4e00\u8a9e', '\u30c8\u30fc\u30af', '\u3088\u3046',\n        '\u6570\u5b57', '\u30c8\u30fc', '\u30af\u30f3', '\u3088\u3046', '\u8a18\u53f7', '\u30c8\u30fc', '\u30af\u30f3', '\u30d5\u30a3\u30eb\u30bf', '\u30ea\u30f3\u30b0',\n        '\u5168\u89d2', '\u6587\u5b57', 'python', '\u534a\u89d2', '\u82f1\u5358\u8a9e', 'word', '\u5c0f\u6587\u5b57', '\u6b63\u898f', '\u9664\u5916',\n        '\u30c8\u30fc\u30af', '\u30b9\u30c8\u30c3\u30d7', '\u30ef\u30fc\u30c9', '\u8a2d\u5b9a']\n        \"\"\"\n\nInstallation\n====================================\n\n.. code-block :: bash\n\n    pip install JpTokenPreprocessing\n\nMeCab for python3\n-----------------------------------\n\nPlease apply below patch for installing and using MeCab module with python3. (2014/09/07 MeCab 0.996)\n\nhttps://code.google.com/p/mecab/issues/detail?id=7\n\nMETHODS\n====================================\n\nJpTokenPreprocessing(args)\n-----------------------------------\n\n- number = BOOL (default: False)\n\n    Allow only number token.\n\n- symbol = BOOL (default: False)\n\n    Allow only symbol token.\n\n- case = 'lower' or 'upper' or 'capitalize'\n\n    Normalize alphabet case.\n\n- unicode = 'NFC' or 'NFKC' or 'NFD' or 'NFKD'a (default: 'NFKC')\n\n    Normalize unicode string with unicodedata.normalize().\n\n- min_len = int (default: 2)\n\n    Filter out few character token. 
If min_len = 2, tokens with only 0 or 1 characters are filtered out.\n\n- stopwords = list (default: [])\n\n    Filter out any tokens that are contained in the stopword list.\n\n- JpTokenPreprocessing.preprocessing(iterable)\n\n    Return an iterator of preprocessed tokens.\n\nFuture work\n====================================\n\n- Add hook points for extending with custom preprocessing.\n\nAuthors\n====================================\nKenta Kase kesin1202000@gmail.com\n\nLicense\n====================================\nMIT License",
        "description_content_type": null,
        "docs_url": null,
        "download_url": "https://github.com/Kesin11/JpTokenPreprocessing/archive/master.zip",
        "downloads": {
            "last_day": -1,
            "last_month": -1,
            "last_week": -1
        },
        "home_page": "https://github.com/Kesin11/JpTokenPreprocessing",
        "keywords": "NLP,natural language processing,token,japaneese",
        "license": "MIT",
        "maintainer": null,
        "maintainer_email": null,
        "name": "JpTokenPreprocessing",
        "package_url": "https://pypi.org/project/JpTokenPreprocessing/",
        "platform": "UNKNOWN",
        "project_url": "https://pypi.org/project/JpTokenPreprocessing/",
        "project_urls": {
            "Download": "https://github.com/Kesin11/JpTokenPreprocessing/archive/master.zip",
            "Homepage": "https://github.com/Kesin11/JpTokenPreprocessing"
        },
        "release_url": "https://pypi.org/project/JpTokenPreprocessing/0.1.5a2/",
        "requires_dist": null,
        "requires_python": null,
        "summary": "JpTokenPreprocessing is Python library for token preprocessing.",
        "version": "0.1.5a2"
    },
    "last_serial": 1764412,
    "releases": {
        "0.1.1a": [
            {
                "comment_text": "",
                "digests": {
                    "md5": "fb068edbca0b742e35767cc221fa08e3",
                    "sha256": "524fda6d9520bc31085a75f09468acf02a87f88b0e103123549de28f98279a41"
                },
                "downloads": -1,
                "filename": "JpTokenPreprocessing-0.1.1a.tar.gz",
                "has_sig": false,
                "md5_digest": "fb068edbca0b742e35767cc221fa08e3",
                "packagetype": "sdist",
                "python_version": "source",
                "requires_python": null,
                "size": 4228,
                "upload_time": "2014-09-14T05:33:55",
                "url": "https://files.pythonhosted.org/packages/a5/69/7354695d1dbbc10643ae9940446a59b249c7334485aaa6901ad787a8caa7/JpTokenPreprocessing-0.1.1a.tar.gz"
            }
        ],
        "0.1.2a": [
            {
                "comment_text": "",
                "digests": {
                    "md5": "ad23b088321969be465275b2bf71b1e3",
                    "sha256": "02c11055caaac13506fe494364a576be820c128ed686e461ccbb7868f2cd6916"
                },
                "downloads": -1,
                "filename": "JpTokenPreprocessing-0.1.2a.tar.gz",
                "has_sig": false,
                "md5_digest": "ad23b088321969be465275b2bf71b1e3",
                "packagetype": "sdist",
                "python_version": "source",
                "requires_python": null,
                "size": 4227,
                "upload_time": "2014-09-14T05:35:31",
                "url": "https://files.pythonhosted.org/packages/15/11/114c561707e71c5a8d02a6fc3b291efa4992f13f2104b18bd872b9fff149/JpTokenPreprocessing-0.1.2a.tar.gz"
            }
        ],
        "0.1.3a": [
            {
                "comment_text": "",
                "digests": {
                    "md5": "3a57eeec880d9848cd8cbf53e4650e11",
                    "sha256": "565bd05bb7242d6ec5b30e42ca1c86e06be8178823c99f60727558da34e255dc"
                },
                "downloads": -1,
                "filename": "JpTokenPreprocessing-0.1.3a.tar.gz",
                "has_sig": false,
                "md5_digest": "3a57eeec880d9848cd8cbf53e4650e11",
                "packagetype": "sdist",
                "python_version": "source",
                "requires_python": null,
                "size": 4226,
                "upload_time": "2014-09-14T05:41:50",
                "url": "https://files.pythonhosted.org/packages/d9/11/8d507971a7d6576849a9671b9ae1b2513bf6546a6974c4010c999c2d2bed/JpTokenPreprocessing-0.1.3a.tar.gz"
            }
        ],
        "0.1.4a": [
            {
                "comment_text": "",
                "digests": {
                    "md5": "f9c918e9ca7fcd5419bac01a3e6e7517",
                    "sha256": "3136835b68ca6901fd45af3772e3a0dff8f25b128ead44983666fb36ea2d9f62"
                },
                "downloads": -1,
                "filename": "JpTokenPreprocessing-0.1.4a.tar.gz",
                "has_sig": false,
                "md5_digest": "f9c918e9ca7fcd5419bac01a3e6e7517",
                "packagetype": "sdist",
                "python_version": "source",
                "requires_python": null,
                "size": 4234,
                "upload_time": "2014-09-14T05:46:04",
                "url": "https://files.pythonhosted.org/packages/c5/8e/82219626a18bdf5406215f7b494309a1699462a274a786dc65044e693d8e/JpTokenPreprocessing-0.1.4a.tar.gz"
            }
        ],
        "0.1.5a": [
            {
                "comment_text": "",
                "digests": {
                    "md5": "27f5c7fa149cc6ade68d612d65a9a88b",
                    "sha256": "bbe0fbd644b36a0a5d56a4864e566496a47f6aa2df7cfba94efe26f8a83ef2ea"
                },
                "downloads": -1,
                "filename": "JpTokenPreprocessing-0.1.5a.tar.gz",
                "has_sig": false,
                "md5_digest": "27f5c7fa149cc6ade68d612d65a9a88b",
                "packagetype": "sdist",
                "python_version": "source",
                "requires_python": null,
                "size": 4231,
                "upload_time": "2014-09-14T05:59:23",
                "url": "https://files.pythonhosted.org/packages/13/d5/32888ad6560926cbe1cd7eab8d975d8165a4c069c25dae5b4d47a40f2a05/JpTokenPreprocessing-0.1.5a.tar.gz"
            }
        ],
        "0.1.5a2": [
            {
                "comment_text": "",
                "digests": {
                    "md5": "715b704f4992e85162806a33636c88ca",
                    "sha256": "b3c4d4520cf676f2fb236aed302195c4761751569332ebd11f2ee3ab07766f85"
                },
                "downloads": -1,
                "filename": "JpTokenPreprocessing-0.1.5a2.tar.gz",
                "has_sig": false,
                "md5_digest": "715b704f4992e85162806a33636c88ca",
                "packagetype": "sdist",
                "python_version": "source",
                "requires_python": null,
                "size": 4264,
                "upload_time": "2015-10-12T13:53:21",
                "url": "https://files.pythonhosted.org/packages/b9/cb/c5c3d000513afaad2c8b2216da83cd64cc4a7b1801d10d9a80f6bc607559/JpTokenPreprocessing-0.1.5a2.tar.gz"
            }
        ],
        "0.1a": [
            {
                "comment_text": "",
                "digests": {
                    "md5": "29aae6dd1ccd996991fd3c69ef5b89f5",
                    "sha256": "162cc9fedb4229e6ee0274eb8fa5eec130bd2e229c57459322dee70c7462e09a"
                },
                "downloads": -1,
                "filename": "JpTokenPreprocessing-0.1a.tar.gz",
                "has_sig": false,
                "md5_digest": "29aae6dd1ccd996991fd3c69ef5b89f5",
                "packagetype": "sdist",
                "python_version": "source",
                "requires_python": null,
                "size": 3812,
                "upload_time": "2014-09-14T03:20:30",
                "url": "https://files.pythonhosted.org/packages/3c/8d/d870b2a0a9d89de139459c435475d32393d9ef0230868d6a440a7338c3ba/JpTokenPreprocessing-0.1a.tar.gz"
            }
        ]
    },
    "urls": [
        {
            "comment_text": "",
            "digests": {
                "md5": "715b704f4992e85162806a33636c88ca",
                "sha256": "b3c4d4520cf676f2fb236aed302195c4761751569332ebd11f2ee3ab07766f85"
            },
            "downloads": -1,
            "filename": "JpTokenPreprocessing-0.1.5a2.tar.gz",
            "has_sig": false,
            "md5_digest": "715b704f4992e85162806a33636c88ca",
            "packagetype": "sdist",
            "python_version": "source",
            "requires_python": null,
            "size": 4264,
            "upload_time": "2015-10-12T13:53:21",
            "url": "https://files.pythonhosted.org/packages/b9/cb/c5c3d000513afaad2c8b2216da83cd64cc4a7b1801d10d9a80f6bc607559/JpTokenPreprocessing-0.1.5a2.tar.gz"
        }
    ]
}