{ "info": { "author": "Milan Straka", "author_email": "straka@ufal.mff.cuni.cz", "bugtrack_url": null, "classifiers": [ "Development Status :: 5 - Production/Stable", "License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)", "Programming Language :: C++", "Programming Language :: Python", "Programming Language :: Python :: 2", "Programming Language :: Python :: 3", "Topic :: Software Development :: Libraries" ], "description": "ufal.nametag\n============\n\nThe ``ufal.nametag`` is a Python binding to NameTag library .\n\nThe bindings is a straightforward conversion of the ``C++`` bindings API.\nIn Python 2, strings can be both ``unicode`` and UTF-8 encoded ``str``, and the\nlibrary always produces ``unicode``. In Python 3, strings must be only ``str``.\n\n\nWrapped C++ API\n---------------\n\nThe C++ API being wrapped follows. For a API reference of the original\nC++ API, see .\n\n::\n\n Helper Structures\n -----------------\n\n typedef vector Forms;\n\n struct TokenRange {\n size_t start;\n size_t length;\n };\n typedef vector TokenRanges;\n\n struct NamedEntity {\n size_t start;\n size_t length;\n string type;\n\n NamedEntity();\n NamedEntity(size_t start, size_t length, const string& type);\n };\n typedef vector NamedEntities;\n\n\n Main Classes\n ------------\n\n class Version {\n public:\n unsigned major;\n unsigned minor;\n unsigned patch;\n string prerelease;\n\n static Version current();\n };\n\n class Tokenizer {\n public:\n virtual void setText(const char* text);\n virtual bool nextSentence(Forms* forms, TokenRanges* tokens);\n\n static Tokenizer* newVerticalTokenizer();\n };\n\n class Ner {\n static ner* load(const char* fname);\n\n virtual void recognize(Forms& forms, NamedEntities& entities) const;\n\n virtual Tokenizer* newTokenizer() const;\n };\n\n\nExamples\n========\n\nrun_ner\n-------\n\nSimple example performing named entity recognition::\n\n import sys\n\n from ufal.nametag import *\n\n def encode_entities(text):\n return text.replace('&', '&').replace('<', '<').replace('>', '>').replace('\"', '"')\n\n def sort_entities(entities):\n return sorted(entities, key=lambda entity: (entity.start, -entity.length))\n\n # In Python2, wrap sys.stdin and sys.stdout to work with unicode.\n if sys.version_info[0] < 3:\n import codecs\n import locale\n encoding = locale.getpreferredencoding()\n sys.stdin = codecs.getreader(encoding)(sys.stdin)\n sys.stdout = codecs.getwriter(encoding)(sys.stdout)\n\n if len(sys.argv) == 1:\n sys.stderr.write('Usage: %s recognizer_model\\n' % sys.argv[0])\n sys.exit(1)\n\n sys.stderr.write('Loading ner: ')\n ner = Ner.load(sys.argv[1])\n if not ner:\n sys.stderr.write(\"Cannot load recognizer from file '%s'\\n\" % sys.argv[1])\n sys.exit(1)\n sys.stderr.write('done\\n')\n\n forms = Forms()\n tokens = TokenRanges()\n entities = NamedEntities()\n sortedEntities = []\n openEntities = []\n tokenizer = ner.newTokenizer()\n if tokenizer is None:\n sys.stderr.write(\"No tokenizer is defined for the supplied model!\")\n sys.exit(1)\n\n not_eof = True\n while not_eof:\n text = ''\n\n # Read block\n while True:\n line = sys.stdin.readline()\n not_eof = bool(line)\n if not not_eof: break\n line = line.rstrip('\\r\\n')\n text += line\n text += '\\n';\n if not line: break\n\n # Tokenize and recognize\n tokenizer.setText(text)\n t = 0\n while tokenizer.nextSentence(forms, tokens):\n ner.recognize(forms, entities)\n sortedEntities = sort_entities(entities)\n\n # Write entities\n e = 0\n for i in range(len(tokens)):\n sys.stdout.write(encode_entities(text[t:tokens[i].start]))\n if (i == 0): sys.stdout.write(\"\")\n\n # Open entities starting at current token\n while (e < len(sortedEntities) and sortedEntities[e].start == i):\n sys.stdout.write('' % encode_entities(sortedEntities[e].type))\n openEntities.append(sortedEntities[e].start + sortedEntities[e].length - 1)\n e = e + 1\n\n # The token itself\n sys.stdout.write('%s' % encode_entities(text[tokens[i].start : tokens[i].start + tokens[i].length]))\n\n # Close entities ending after current token\n while openEntities and openEntities[-1] == i:\n sys.stdout.write('')\n openEntities.pop()\n if (i + 1 == len(tokens)): sys.stdout.write(\"\")\n t = tokens[i].start + tokens[i].length\n # Write rest of the text\n sys.stdout.write(encode_entities(text[t:]))\n\n\nAUTHORS\n=======\n\nMilan Straka \n\nJana Strakov\u00e1 \n\n\nCOPYRIGHT AND LICENCE\n=====================\n\nCopyright 2016 Institute of Formal and Applied Linguistics, Faculty of\nMathematics and Physics, Charles University in Prague, Czech Republic.\n\nThis Source Code Form is subject to the terms of the Mozilla Public\nLicense, v. 2.0. If a copy of the MPL was not distributed with this\nfile, You can obtain one at http://mozilla.org/MPL/2.0/.", "description_content_type": null, "docs_url": null, "download_url": null, "downloads": { "last_day": -1, "last_month": -1, "last_week": -1 }, "home_page": "http://ufal.mff.cuni.cz/nametag", "keywords": null, "license": "MPL 2.0", "maintainer": null, "maintainer_email": null, "name": "ufal.nametag", "package_url": "https://pypi.org/project/ufal.nametag/", "platform": "UNKNOWN", "project_url": "https://pypi.org/project/ufal.nametag/", "project_urls": { "Homepage": "http://ufal.mff.cuni.cz/nametag" }, "release_url": "https://pypi.org/project/ufal.nametag/1.1.2.1/", "requires_dist": null, "requires_python": null, "summary": "Bindings to NameTag library", "version": "1.1.2.1" }, "last_serial": 2991934, "releases": { "1.0.0.1": [ { "comment_text": "", "digests": { "md5": "adb32c31eb405bc4c46ed5ef5ee6aeb3", "sha256": "0ab59f6cd96d341376b9badf3a7e5ce36c2e84fd99eab8a4c5bce1fb25f75606" }, "downloads": -1, "filename": "ufal.nametag-1.0.0.1.tar.gz", "has_sig": false, "md5_digest": "adb32c31eb405bc4c46ed5ef5ee6aeb3", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 256344, "upload_time": "2014-04-11T11:30:39", "url": "https://files.pythonhosted.org/packages/ef/24/8c0521b166d09732ee34566f1c9d7da19360e33f34197239d8d6be0faeb3/ufal.nametag-1.0.0.1.tar.gz" } ], "1.1.1.2": [ { "comment_text": "", "digests": { "md5": "c28efefb2a6ad01c4c580c5f5ddbca8e", "sha256": "f0a575717afed2f326e79bfc155aff5c1f8a6ef39132a303cefcf898910d4830" }, "downloads": -1, "filename": "ufal.nametag-1.1.1.2.tar.gz", "has_sig": false, "md5_digest": "c28efefb2a6ad01c4c580c5f5ddbca8e", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 183626, "upload_time": "2016-05-05T19:21:09", "url": "https://files.pythonhosted.org/packages/6a/b7/9dbf07a4df913b96979fb9b7da8ab8bbf1fbcd9e961e68d787b5b695cb09/ufal.nametag-1.1.1.2.tar.gz" } ], "1.1.2.1": [ { "comment_text": "", "digests": { "md5": "839f35e866e164f46130d493beed5b52", "sha256": "0eea18c2166c4dcdb8b1d8c6af45de33676fe98113f377ff49254c1daa2ad296" }, "downloads": -1, "filename": "ufal.nametag-1.1.2.1.tar.gz", "has_sig": false, "md5_digest": "839f35e866e164f46130d493beed5b52", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 184743, "upload_time": "2017-07-01T06:10:38", "url": "https://files.pythonhosted.org/packages/da/f9/5a8e66b3dec64cd96f2c034b02a2847ae16591c12a405d28165b71cc8493/ufal.nametag-1.1.2.1.tar.gz" } ] }, "urls": [ { "comment_text": "", "digests": { "md5": "839f35e866e164f46130d493beed5b52", "sha256": "0eea18c2166c4dcdb8b1d8c6af45de33676fe98113f377ff49254c1daa2ad296" }, "downloads": -1, "filename": "ufal.nametag-1.1.2.1.tar.gz", "has_sig": false, "md5_digest": "839f35e866e164f46130d493beed5b52", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 184743, "upload_time": "2017-07-01T06:10:38", "url": "https://files.pythonhosted.org/packages/da/f9/5a8e66b3dec64cd96f2c034b02a2847ae16591c12a405d28165b71cc8493/ufal.nametag-1.1.2.1.tar.gz" } ] }