{ "info": { "author": "Shinya Fujino", "author_email": "shf0811@gmail.com", "bugtrack_url": null, "classifiers": [ "Development Status :: 3 - Alpha", "Environment :: Other Environment", "Intended Audience :: Developers", "License :: OSI Approved :: MIT License", "Operating System :: MacOS :: MacOS X", "Operating System :: Microsoft :: Windows", "Operating System :: POSIX", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.6", "Programming Language :: Python :: 3 :: Only", "Topic :: Education", "Topic :: Software Development :: Libraries :: Python Modules", "Topic :: Text Processing :: Linguistic" ], "description": "chinese\n=======\n\nchinese is a Chinese text analyzer.\n\n.. figure:: https://github.com/morinokami/chinese/blob/master/docs/overview.png?raw=true\n :alt: Overview\n\nNOTE: Python 2.\\* is not supported.\n\nGetting Started\n---------------\n\nInstall chinese using pip:\n\n.. code:: sh\n\n $ pip install chinese\n $ pynlpir update\n\nStart analyzing Chinese text:\n\n.. 
code:: py\n\n    >>> from chinese import ChineseAnalyzer\n    >>> analyzer = ChineseAnalyzer()\n    >>> result = analyzer.parse('\u6211\u5f88\u9ad8\u5174\u8ba4\u8bc6\u4f60')\n    >>> result.tokens()\n    ['\u6211', '\u5f88', '\u9ad8\u5174', '\u8ba4\u8bc6', '\u4f60']\n    >>> result.pinyin()\n    'w\u01d2 h\u011bn g\u0101ox\u00ecng r\u00e8nshi n\u01d0'\n    >>> result.pprint()\n    {'original': '\u6211\u5f88\u9ad8\u5174\u8ba4\u8bc6\u4f60',\n     'parsed': [{'dict_data': [{'definitions': ['I', 'me', 'my'],\n                                'kind': 'Simplified',\n                                'match': '\u6211',\n                                'pinyin': ['wo3']}],\n                 'token': ('\u6211', 0, 1)},\n                {'dict_data': [{'definitions': ['(adverb of degree)',\n                                                'quite',\n                                                'very',\n                                                'awfully'],\n                                'kind': 'Simplified',\n                                'match': '\u5f88',\n                                'pinyin': ['hen3']}],\n                 'token': ('\u5f88', 1, 2)},\n                {'dict_data': [{'definitions': ['happy',\n                                                'glad',\n                                                'willing (to do sth)',\n                                                'in a cheerful mood'],\n                                'kind': 'Simplified',\n                                'match': '\u9ad8\u8208',\n                                'pinyin': ['gao1', 'xing4']}],\n                 'token': ('\u9ad8\u5174', 2, 4)},\n                {'dict_data': [{'definitions': ['to know',\n                                                'to recognize',\n                                                'to be familiar with',\n                                                'to get acquainted with sb',\n                                                'knowledge',\n                                                'understanding',\n                                                'awareness',\n                                                'cognition'],\n                                'kind': 'Simplified',\n                                'match': '\u8a8d\u8b58',\n                                'pinyin': ['ren4', 'shi5']}],\n                 'token': ('\u8ba4\u8bc6', 4, 6)},\n                {'dict_data': [{'definitions': ['you (informal, as opposed to '\n                                                'courteous \u60a8[nin2])'],\n                                'kind': 'Simplified',\n                                'match': '\u4f60',\n                                'pinyin': ['ni3']}],\n                 'token': ('\u4f60', 6, 7)}]}\n    >>> result = analyzer.parse('\u6211\u559c\u6b61\u9019\u500b\u5473\u9053', traditional=True)\n    >>> print(result)\n    {'\u5473\u9053': [{'definitions': ['flavor', 'smell', 'hint of'],\n              'kind': 'Traditional',\n              'match': '\u5473\u9053',\n              'pinyin': ['wei4', 'dao5']}],\n     '\u559c\u6b61': [{'definitions': ['to like', 'to be fond of'],\n              'kind': 'Traditional',\n              'match': '\u559c\u6b22',\n              'pinyin': ['xi3', 'huan5']}],\n     '\u6211': [{'definitions': ['I', 'me', 'my'],\n             'kind': 'Traditional',\n             'match': '\u6211',\n 
'pinyin': ['wo3']}],\n     '\u9019\u500b': [{'definitions': ['this', 'this one'],\n               'kind': 'Traditional',\n               'match': '\u8fd9\u4e2a',\n               'pinyin': ['zhe4', 'ge5']}]}\n\nFeatures\n--------\n\n- ``parse()`` returns a ChineseAnalyzerResult object.\n\n.. code:: py\n\n    >>> from chinese import ChineseAnalyzer\n    >>> analyzer = ChineseAnalyzer()\n    # Basic usage.\n    >>> result = analyzer.parse('\u4f60\u597d\u4e16\u754c')\n    # If the traditional option is set to True, the analyzer tries to parse the\n    # provided text as \u7e41\u4f53\u5b57.\n    >>> result = analyzer.parse('\u4f60\u597d\u4e16\u754c', traditional=True)\n    # The default tokenizer is jieba's; you can also tokenize with pynlpir's.\n    >>> result = analyzer.parse('\u4f60\u597d\u4e16\u754c', using=analyzer.tokenizer.pynlpir)\n    # In addition, a custom tokenizer can be passed to the method.\n    >>> from chinese.tokenizer import TokenizerInterface\n    >>> class MyTokenizer(TokenizerInterface): # A custom tokenizer must inherit from TokenizerInterface\n    ...     # and implement the tokenize() method.\n    ...     def tokenize(self, string):\n    ...         # tokenize() must return a list of tuples containing at least\n    ...         # a string as the first element,\n    ...         # for example: [('token1', ...), ('token2', ...), ...].\n    ...         return [(char,) for char in string] # e.g. a naive one-character tokenizer\n    ...\n    >>> my_tokenizer = MyTokenizer()\n    >>> result = analyzer.parse('\u4f60\u597d\u4e16\u754c', using=my_tokenizer)\n    # You can also specify the dictionary used for looking up each token:\n    # pass a path to a dictionary file, which must follow the structure of\n    # a CC-CEDICT dictionary file.\n    # By default, the CC-CEDICT dictionary is used for lookups.\n    >>> result = analyzer.parse('\u4f60\u597d\u4e16\u754c', dictionary='path/to/dict')\n\n- ``original()`` returns the supplied text as is.\n\n.. 
code:: py\n\n    >>> result = analyzer.parse('\u6211\u6700\u559c\u6b22\u5403\u6c34\u716e\u8089\u7247')\n    >>> result.original()\n    '\u6211\u6700\u559c\u6b22\u5403\u6c34\u716e\u8089\u7247'\n\n- ``tokens()`` returns tokens in the provided text.\n\n.. code:: py\n\n    >>> result = analyzer.parse('\u6211\u7684\u6c49\u8bed\u9a6c\u9a6c\u864e\u864e')\n    >>> result.tokens()\n    ['\u6211', '\u7684', '\u6c49\u8bed', '\u9a6c\u9a6c\u864e\u864e']\n    >>> result.tokens(details=True) # If the details option is set to True, additional information is also attached.\n    [('\u6211', 0, 1), ('\u7684', 1, 2), ('\u6c49\u8bed', 2, 4), ('\u9a6c\u9a6c\u864e\u864e', 4, 8)] # In this case, the positions of tokens are included.\n    >>> result = analyzer.parse('\u7684\u7684\u7684\u7684\u7684\u5728\u7684\u7684\u7684\u7684\u5c31\u4ee5\u548c\u548c\u548c')\n    >>> result.tokens(unique=True) # You can get a unique collection of tokens using the unique option.\n    ['\u7684', '\u5728', '\u5c31', '\u4ee5', '\u548c']\n\n- ``freq()`` returns a Counter object that counts the number of\n  occurrences for each token.\n\n.. code:: py\n\n    >>> result = analyzer.parse('\u7684\u7684\u7684\u7684\u7684\u5728\u7684\u7684\u7684\u7684\u5c31\u4ee5\u548c\u548c\u548c')\n    >>> result.freq()\n    Counter({'\u7684': 9, '\u548c': 3, '\u5728': 1, '\u5c31': 1, '\u4ee5': 1})\n\n- ``sentences()`` returns a list of sentences in the provided text.\n\n.. code:: py\n\n    >>> s = '''\u60a8\u597d\u3002\u8bf7\u95ee\u5c0f\u7f8e\u5728\u5bb6\u5417\uff1f\n    ...\n    ... \u5728\u3002\u8bf7\u7a0d\u7b49\u3002'''\n    >>> result = analyzer.parse(s)\n    >>> result.sentences()\n    ['\u60a8\u597d', '\u8bf7\u95ee\u5c0f\u7f8e\u5728\u5bb6\u5417', '\u5728', '\u8bf7\u7a0d\u7b49']\n\n- ``search()`` returns a list of sentences containing the argument\n  string.\n\n.. 
code:: py\n\n    >>> s = '\u81ea\u7136\u8bed\u8a00\u5904\u7406\u662f\u8ba1\u7b97\u673a\u79d1\u5b66\u9886\u57df\u4e0e\u4eba\u5de5\u667a\u80fd\u9886\u57df\u4e2d\u7684\u4e00\u4e2a\u91cd\u8981\u65b9\u5411\u3002\u5b83\u7814\u7a76\u80fd\u5b9e\u73b0\u4eba\u4e0e\u8ba1\u7b97\u673a\u4e4b\u95f4\u7528\u81ea\u7136\u8bed\u8a00\u8fdb\u884c\u6709\u6548\u901a\u4fe1\u7684\u5404\u79cd\u7406\u8bba\u548c\u65b9\u6cd5\u3002\u81ea\u7136\u8bed\u8a00\u5904\u7406\u662f\u4e00\u95e8\u878d\u8bed\u8a00\u5b66\u3001\u8ba1\u7b97\u673a\u79d1\u5b66\u3001\u6570\u5b66\u4e8e\u4e00\u4f53\u7684\u79d1\u5b66\u3002\u56e0\u6b64\uff0c\u8fd9\u4e00\u9886\u57df\u7684\u7814\u7a76\u5c06\u6d89\u53ca\u81ea\u7136\u8bed\u8a00\uff0c\u5373\u4eba\u4eec\u65e5\u5e38\u4f7f\u7528\u7684\u8bed\u8a00\uff0c\u6240\u4ee5\u5b83\u4e0e\u8bed\u8a00\u5b66\u7684\u7814\u7a76\u6709\u7740\u5bc6\u5207\u7684\u8054\u7cfb\uff0c\u4f46\u53c8\u6709\u91cd\u8981\u7684\u533a\u522b\u3002\u81ea\u7136\u8bed\u8a00\u5904\u7406\u5e76\u4e0d\u662f\u4e00\u822c\u5730\u7814\u7a76\u81ea\u7136\u8bed\u8a00\uff0c\u800c\u5728\u4e8e\u7814\u5236\u80fd\u6709\u6548\u5730\u5b9e\u73b0\u81ea\u7136\u8bed\u8a00\u901a\u4fe1\u7684\u8ba1\u7b97\u673a\u7cfb\u7edf\uff0c\u7279\u522b\u662f\u5176\u4e2d\u7684\u8f6f\u4ef6\u7cfb\u7edf\u3002\u56e0\u800c\u5b83\u662f\u8ba1\u7b97\u673a\u79d1\u5b66\u7684\u4e00\u90e8\u5206\u3002'\n    >>> result = analyzer.parse(s)\n    >>> result.search('\u6570\u5b66')\n    ['\u81ea\u7136\u8bed\u8a00\u5904\u7406\u662f\u4e00\u95e8\u878d\u8bed\u8a00\u5b66\u3001\u8ba1\u7b97\u673a\u79d1\u5b66\u3001\u6570\u5b66\u4e8e\u4e00\u4f53\u7684\u79d1\u5b66']\n\n- ``paragraphs()`` returns a list of paragraphs in the provided text.\n\n.. code:: py\n\n    >>> s = '''\u60a8\u597d\u3002\u8bf7\u95ee\u5c0f\u7f8e\u5728\u5bb6\u5417\uff1f\n    ...\n    ... 
\u5728\u3002\u8bf7\u7a0d\u7b49\u3002'''\n    >>> result = analyzer.parse(s)\n    >>> result.paragraphs()\n    ['\u60a8\u597d\u3002\u8bf7\u95ee\u5c0f\u7f8e\u5728\u5bb6\u5417\uff1f', '\u5728\u3002\u8bf7\u7a0d\u7b49\u3002']\n\n- ``pinyin()`` returns a pinyin representation of the provided text.\n\n.. code:: py\n\n    >>> result = analyzer.parse('\u6211\u559c\u6b22Python\u3002')\n    >>> result.pinyin()\n    'w\u01d2 x\u01d0huan Python.'\n    >>> result = analyzer.parse('\u4e0b\u4e2a\u6708\u6211\u53bb\u6da9\u8c37')\n    >>> result.pinyin() # Sometimes the analyzer cannot find a corresponding pinyin.\n    'xi\u00e0g\u00e8yu\u00e8 w\u01d2 q\u00f9 \u6da9\u8c37'\n    >>> result.pinyin(force=True) # The force option forces it to try to convert an unknown word to pinyin.\n    'xi\u00e0g\u00e8yu\u00e8 w\u01d2 q\u00f9 s\u00e8g\u01d4'\n\n- ``pprint()`` prints a formatted description of the parsed text.\n\n.. code:: py\n\n    >>> result = analyzer.parse('\u6211\u7231\u770b\u4e66')\n    >>> result.pprint()\n    {'original': '\u6211\u7231\u770b\u4e66',\n     'parsed': [{'dict_data': [{'definitions': ['I', 'me', 'my'],\n                                'kind': 'Simplified',\n                                'match': '\u6211',\n                                'pinyin': ['wo3']}],\n                 'token': ('\u6211', 0, 1)},\n                {'dict_data': [{'definitions': ['to love',\n                                                'to be fond of',\n                                                'to like',\n                                                'affection',\n                                                'to be inclined (to do sth)',\n                                                'to tend to (happen)'],\n                                'kind': 'Simplified',\n                                'match': '\u611b',\n                                'pinyin': ['ai4']}],\n                 'token': ('\u7231', 1, 2)},\n                {'dict_data': [{'definitions': ['to read', 'to study'],\n                                'kind': 'Simplified',\n                                'match': '\u770b\u66f8',\n                                'pinyin': ['kan4', 'shu1']}],\n                 'token': ('\u770b\u4e66', 2, 4)}]}\n\n- ``say()`` converts the provided text to audible Chinese speech (macOS\n  only).\n\n.. code:: py\n\n    >>> result = analyzer.parse('\u60a8\u597d\uff0c\u6211\u53ebTing-Ting\u3002\u6211\u8bb2\u4e2d\u6587\u666e\u901a\u8bdd\u3002')\n    >>> result.say() # Output the speech.\n    >>> result.say(out='say.aac') # Save the speech to out.\n\n- Get the number of tokens.\n\n.. 
code:: py\n\n    >>> result = analyzer.parse('\u6211\u662f\u4e2d\u56fd\u4eba')\n    >>> result.tokens()\n    ['\u6211', '\u662f', '\u4e2d\u56fd', '\u4eba']\n    >>> len(result)\n    4\n\n- Check whether a token is in the result.\n\n.. code:: py\n\n    >>> result = analyzer.parse('\u6211\u662f\u4e2d\u56fd\u4eba')\n    >>> '\u4e2d\u56fd' in result\n    True\n    >>> '\u6211\u662f' in result\n    False\n\n- Extract the lookup result.\n\n.. code:: py\n\n    >>> result = analyzer.parse('\u4f60\u53eb\u4ec0\u4e48\u540d\u5b57\uff1f')\n    >>> result.tokens()\n    ['\u4f60', '\u53eb', '\u4ec0\u4e48', '\u540d\u5b57', '\uff1f']\n    >>> shenme = result['\u4ec0\u4e48'] # It's just a list of lookup results.\n    >>> len(shenme) # It has only one entry.\n    1\n    >>> print(shenme[0]) # Print that entry.\n    {'definitions': ['what?', 'something', 'anything'],\n     'kind': 'Simplified',\n     'match': '\u4ec0\u9ebc',\n     'pinyin': ['shen2', 'me5']}\n    >>> shenme_info = shenme[0]\n    >>> shenme_info.definitions # Definitions of the token.\n    ['what?', 'something', 'anything']\n    >>> shenme_info.match # The corresponding \u7e41\u4f53\u5b57.\n    '\u4ec0\u9ebc'\n    >>> shenme_info.pinyin # The pinyin of the token.\n    ['shen2', 'me5']\n\nLicense\n-------\n\nMIT License\n\nThanks\n------\n\njieba and PyNLPIR are used to tokenize\nChinese text.\n\nCC-CEDICT is used to look up information for tokens.\n\n\n", "description_content_type": "", "docs_url": null, "download_url": "", "downloads": { "last_day": -1, "last_month": -1, "last_week": -1 }, "home_page": "https://github.com/morinokami/chinese", "keywords": "Chinese,text analysis", "license": "MIT", "maintainer": "", "maintainer_email": "", "name": "chinese", "package_url": "https://pypi.org/project/chinese/", "platform": "", "project_url": "https://pypi.org/project/chinese/", "project_urls": { "Homepage": "https://github.com/morinokami/chinese" }, "release_url": "https://pypi.org/project/chinese/0.2.1/", "requires_dist": [ "jieba", "pynlpir" ], "requires_python": "", "summary": "Chinese 
text analyzer", "version": "0.2.1" }, "last_serial": 3703203, "releases": { "0.1.0": [ { "comment_text": "", "digests": { "md5": "1cc7cc14d3bdbbe86f524987f46d3159", "sha256": "f64b1ae7ab47d6e462f5b8c5e7b0b5a2aa76b06c47299a076b2e8e58ef205e4e" }, "downloads": -1, "filename": "chinese-0.1.0-py3-none-any.whl", "has_sig": false, "md5_digest": "1cc7cc14d3bdbbe86f524987f46d3159", "packagetype": "bdist_wheel", "python_version": "py3", "requires_python": null, "size": 6595854, "upload_time": "2018-03-19T16:07:00", "url": "https://files.pythonhosted.org/packages/84/a1/8bd5790e838ccf39ca09e77c826c6b8ff94472609812ded82eb90fd8a0b0/chinese-0.1.0-py3-none-any.whl" } ], "0.2.0": [ { "comment_text": "", "digests": { "md5": "6e5113f1461e4249cbf5f9886fbdaa7e", "sha256": "63f2ee4b943dc22a704dc8b8e9b83f708aa3e447973f605eb2bdfc508b1544c1" }, "downloads": -1, "filename": "chinese-0.2.0-py3-none-any.whl", "has_sig": false, "md5_digest": "6e5113f1461e4249cbf5f9886fbdaa7e", "packagetype": "bdist_wheel", "python_version": "py3", "requires_python": null, "size": 6597610, "upload_time": "2018-03-24T16:35:09", "url": "https://files.pythonhosted.org/packages/74/73/8171d46b1e4c341e769f29b73550e7c7e30af1ba9128b6d8a9d04d274ac6/chinese-0.2.0-py3-none-any.whl" } ], "0.2.1": [ { "comment_text": "", "digests": { "md5": "e23b5fd511782f67c9556be82712d7ba", "sha256": "04a8052166fd9524ef5a6022810223da2a765b394b6280b6014f33de635842b1" }, "downloads": -1, "filename": "chinese-0.2.1-py3-none-any.whl", "has_sig": false, "md5_digest": "e23b5fd511782f67c9556be82712d7ba", "packagetype": "bdist_wheel", "python_version": "py3", "requires_python": null, "size": 12583488, "upload_time": "2018-03-25T10:27:30", "url": "https://files.pythonhosted.org/packages/15/fe/35c1cd7792f0c899fbeae66d35491721cae6be6d8a128d4f77e6e3479b3a/chinese-0.2.1-py3-none-any.whl" } ] }, "urls": [ { "comment_text": "", "digests": { "md5": "e23b5fd511782f67c9556be82712d7ba", "sha256": 
"04a8052166fd9524ef5a6022810223da2a765b394b6280b6014f33de635842b1" }, "downloads": -1, "filename": "chinese-0.2.1-py3-none-any.whl", "has_sig": false, "md5_digest": "e23b5fd511782f67c9556be82712d7ba", "packagetype": "bdist_wheel", "python_version": "py3", "requires_python": null, "size": 12583488, "upload_time": "2018-03-25T10:27:30", "url": "https://files.pythonhosted.org/packages/15/fe/35c1cd7792f0c899fbeae66d35491721cae6be6d8a128d4f77e6e3479b3a/chinese-0.2.1-py3-none-any.whl" } ] }