{ "info": { "author": "Johannes Filter", "author_email": "hi@jfilter.de", "bugtrack_url": null, "classifiers": [ "License :: OSI Approved :: MIT License", "Programming Language :: Python :: 3.6", "Programming Language :: Python :: 3.7", "Topic :: Utilities" ], "description": "
\n \"Scissors\"\n
\n\n# German Lemmatizer\n\nA Python package (using a Docker image under the hood) to [lemmatize](https://en.wikipedia.org/wiki/Lemmatisation) German texts.\n\nBuilt upon:\n\n- [IWNLP](https://github.com/Liebeck/spacy-iwnlp) uses the crowd-generated token tables on [de.wiktionary](https://de.wiktionary.org/).\n- [GermaLemma](https://github.com/WZBSocialScienceCenter/germalemma): Looks up lemmas in the [TIGER Corpus](http://www.ims.uni-stuttgart.de/forschung/ressourcen/korpora/TIGERCorpus/) and uses [Pattern](https://www.clips.uantwerpen.be/pattern) as a fallback for some rule-based lemmatizations.\n\nIt works as follows. First [spaCy](https://spacy.io/) tags the token with POS. Then `German Lemmatizer` looks up lemmas on IWNLP and GermaLemma. If they disagree, choose the one from IWNLP. If they agree or only one tool finds it, take it. Try to preserve the casing of the original token.\n\nYou may want to use the underlying Docker image: [german-lemmatizer-docker](https://github.com/jfilter/german-lemmatizer-docker)\n\n## Installation\n\n1. Install [Docker](https://docs.docker.com/).\n2. `pip install german-lemmatizer`\n\n## Usage\n\n1. Read and accept the [license terms of the TIGER Corpus](http://www.ims.uni-stuttgart.de/forschung/ressourcen/korpora/TIGERCorpus/license/htmlicense.html) (free to use for non-commercial purposes).\n2. Make sure the Docker daemon is running.\n3. Write some Python code\n\n```python\nfrom german_lemmatizer import lemmatize\n\nlemmatize(\n ['Johannes war ein guter Sch\u00fcler', 'Sabiene sang zahlreiche Lieder'],\n working_dir='*',\n chunk_size=10000,\n n_jobs=1,\n escape=False,\n remove_stop=False)\n```\n\nThe list of texts is split into chunks (`chunk_size`) and processed in parallel (`n_jobs`).\n\nEnable the `escape` parameter if your text contains newlines. 
`remove_stop` removes stopwords as defined by spaCy.\n\n## License\n\nMIT.\n\n\n", "description_content_type": "text/markdown", "docs_url": null, "download_url": "", "downloads": { "last_day": -1, "last_month": -1, "last_week": -1 }, "home_page": "https://github.com/jfilter/german-lemmatizer", "keywords": "", "license": "MIT", "maintainer": "", "maintainer_email": "", "name": "german-lemmatizer", "package_url": "https://pypi.org/project/german-lemmatizer/", "platform": "", "project_url": "https://pypi.org/project/german-lemmatizer/", "project_urls": { "Homepage": "https://github.com/jfilter/german-lemmatizer" }, "release_url": "https://pypi.org/project/german-lemmatizer/0.1.1/", "requires_dist": [ "docker", "joblib", "tqdm" ], "requires_python": "", "summary": "A Python package (using a Docker image under the hood) to lemmatize German texts.", "version": "0.1.1" }, "last_serial": 5607501, "releases": { "0.1.0": [ { "comment_text": "", "digests": { "md5": "3603d220d66463813f5f8d426b96aa3a", "sha256": "d733b2db7aa3b239cba1b41cf0ca8fdd8f29a02e074d00faf81f926fd611a14e" }, "downloads": -1, "filename": "german_lemmatizer-0.1.0-py3-none-any.whl", "has_sig": false, "md5_digest": "3603d220d66463813f5f8d426b96aa3a", "packagetype": "bdist_wheel", "python_version": "py3", "requires_python": null, "size": 4509, "upload_time": "2019-07-30T12:31:54", "url": "https://files.pythonhosted.org/packages/1b/f4/bf4b39dc2aca1cd855fac9fde5f4beb6725d54c103ffb71592f094956e66/german_lemmatizer-0.1.0-py3-none-any.whl" }, { "comment_text": "", "digests": { "md5": "0d97892a79facb26c4cbfa3d6404626c", "sha256": "d1fe2eb2e8e6e9d2870e2ab75f28295604c405cbd37d8090873010fd3b102c43" }, "downloads": -1, "filename": "german_lemmatizer-0.1.0.tar.gz", "has_sig": false, "md5_digest": "0d97892a79facb26c4cbfa3d6404626c", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 3231, "upload_time": "2019-07-30T12:31:56", "url": 
"https://files.pythonhosted.org/packages/bf/2b/5ee76f2e21129832bf82bf0f355319b2c4b02bc6d9439dc7c42db95766e0/german_lemmatizer-0.1.0.tar.gz" } ], "0.1.1": [ { "comment_text": "", "digests": { "md5": "77dae0fb78551f7091fab11ed629e847", "sha256": "b7275adaab3259f6e907629e148c2f66ad6a9af3cbfb871b01f611f2a3c85092" }, "downloads": -1, "filename": "german_lemmatizer-0.1.1-py3-none-any.whl", "has_sig": false, "md5_digest": "77dae0fb78551f7091fab11ed629e847", "packagetype": "bdist_wheel", "python_version": "py3", "requires_python": null, "size": 4498, "upload_time": "2019-07-30T12:34:44", "url": "https://files.pythonhosted.org/packages/22/06/c1958afb1a0d9979423eb67b2acecff6b95762b346fd9c872cfffa2d867a/german_lemmatizer-0.1.1-py3-none-any.whl" }, { "comment_text": "", "digests": { "md5": "a975285f1f73b0352e9a3f211e636830", "sha256": "a4b638853e0f4549fb2866d89c9208e50692c2b3ddf6e2c6e75456e67aaf0790" }, "downloads": -1, "filename": "german_lemmatizer-0.1.1.tar.gz", "has_sig": false, "md5_digest": "a975285f1f73b0352e9a3f211e636830", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 3220, "upload_time": "2019-07-30T12:34:46", "url": "https://files.pythonhosted.org/packages/64/64/c7c2913cff0eb14d08440cdb9ff7f63292dfb9032f65f4275306e4912f5c/german_lemmatizer-0.1.1.tar.gz" } ] }, "urls": [ { "comment_text": "", "digests": { "md5": "77dae0fb78551f7091fab11ed629e847", "sha256": "b7275adaab3259f6e907629e148c2f66ad6a9af3cbfb871b01f611f2a3c85092" }, "downloads": -1, "filename": "german_lemmatizer-0.1.1-py3-none-any.whl", "has_sig": false, "md5_digest": "77dae0fb78551f7091fab11ed629e847", "packagetype": "bdist_wheel", "python_version": "py3", "requires_python": null, "size": 4498, "upload_time": "2019-07-30T12:34:44", "url": "https://files.pythonhosted.org/packages/22/06/c1958afb1a0d9979423eb67b2acecff6b95762b346fd9c872cfffa2d867a/german_lemmatizer-0.1.1-py3-none-any.whl" }, { "comment_text": "", "digests": { "md5": "a975285f1f73b0352e9a3f211e636830", 
"sha256": "a4b638853e0f4549fb2866d89c9208e50692c2b3ddf6e2c6e75456e67aaf0790" }, "downloads": -1, "filename": "german_lemmatizer-0.1.1.tar.gz", "has_sig": false, "md5_digest": "a975285f1f73b0352e9a3f211e636830", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 3220, "upload_time": "2019-07-30T12:34:46", "url": "https://files.pythonhosted.org/packages/64/64/c7c2913cff0eb14d08440cdb9ff7f63292dfb9032f65f4275306e4912f5c/german_lemmatizer-0.1.1.tar.gz" } ] }