{ "info": { "author": "Marcnuth", "author_email": "hxianxian@gmail.com", "bugtrack_url": null, "classifiers": [ "Programming Language :: Python :: 3", "Topic :: Scientific/Engineering :: Artificial Intelligence", "Topic :: Software Development :: Libraries" ], "description": "# deduplication\nRemove duplicate documents via popular algorithms such as SimHash, SpotSig, Shingling, etc.\n\n## Install\n\nRun following commands:\n\n```\n# install current library\npip install deduplication\n\n# install required pretrained NLP models \npython -m spacy download xx_ent_wiki_sm\npython -m spacy download en_core_web_sm\n```\n\n## Example\n\n__SimHash__\n\n```python\nfrom deduplication import simhash\n\nhashvalue1 = simhash('this is text')\nhashvalue2 = simhash('this is another text', n_block=4)\n```\n\n__L-SimHash__\n\n```python\nfrom deduplication import lsimhash\n\nhashvalue = lsimhash('this is very long article texts. maybe with a lot of sentences.')\n```\n\n## Citation\n\n__SimHash__\n\n```\nSadowski C, Levin G. \nSimhash: Hash-based similarity detection[J]. \nTechnical report, Google, 2007.\n```\n\n", "description_content_type": "text/markdown", "docs_url": null, "download_url": "", "downloads": { "last_day": -1, "last_month": -1, "last_week": -1 }, "home_page": "https://github.com/Marcnuth/deduplication", "keywords": "", "license": "Apache License 2.0", "maintainer": "", "maintainer_email": "", "name": "deduplication", "package_url": "https://pypi.org/project/deduplication/", "platform": "", "project_url": "https://pypi.org/project/deduplication/", "project_urls": { "Homepage": "https://github.com/Marcnuth/deduplication" }, "release_url": "https://pypi.org/project/deduplication/0.0.3/", "requires_dist": [ "spacy (>='2.1.4')" ], "requires_python": "", "summary": "Remove duplicate documents via popular algorithms such as SimHash, SpotSig, Shingling, etc.", "version": "0.0.3" }, "last_serial": 5709482, "releases": { "0.0.1": [ { "comment_text": "", "digests": { "md5": "3c1981ae73531fa00eae820c5dde9c2e", "sha256": "65e152abfe16aa5c31a79107ad07cb7ab2582775062ec92fa6c3f246577670f8" }, "downloads": -1, "filename": "deduplication-0.0.1-py3-none-any.whl", "has_sig": false, "md5_digest": "3c1981ae73531fa00eae820c5dde9c2e", "packagetype": "bdist_wheel", "python_version": "py3", "requires_python": null, "size": 6724, "upload_time": "2019-08-19T04:44:31", "url": "https://files.pythonhosted.org/packages/a8/e3/6a28a7b3d6f15cc3af8b34577ae0af5b3aed01caa9402a7b806823b0c176/deduplication-0.0.1-py3-none-any.whl" }, { "comment_text": "", "digests": { "md5": "f5a2573628c05df2d549f6fcb9b314e2", "sha256": "7b89fbb8f4575b59d8c11dc51c4be5e6a9ac74aa84cf8c2f84ed92c536e03528" }, "downloads": -1, "filename": "deduplication-0.0.1.tar.gz", "has_sig": false, "md5_digest": "f5a2573628c05df2d549f6fcb9b314e2", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 2000, "upload_time": "2019-08-19T04:44:33", "url": "https://files.pythonhosted.org/packages/3b/12/c6902c7b00da3c03403c983945bdb715507df69d29484fc84c6edd35d4c2/deduplication-0.0.1.tar.gz" } ], "0.0.2": [ { "comment_text": "", "digests": { "md5": "dcc67b4d1eacbfa9dd8d224099db7563", "sha256": "00ddbb6bbd71921a21813f97b4dcc691f5e6ad329238c95331351ae523452ddd" }, "downloads": -1, "filename": "deduplication-0.0.2-py3-none-any.whl", "has_sig": false, "md5_digest": "dcc67b4d1eacbfa9dd8d224099db7563", "packagetype": "bdist_wheel", "python_version": "py3", "requires_python": null, "size": 7553, "upload_time": "2019-08-20T03:06:54", "url": "https://files.pythonhosted.org/packages/86/60/59fdd09ec7db2c17811ae56415d95047ab6dd9553552bda32205e4a1b88b/deduplication-0.0.2-py3-none-any.whl" }, { "comment_text": "", "digests": { "md5": "f493ab0b6a9ea52948be3de62112630d", "sha256": "8b6791a64e858cd42c876b8f8e5bac12aea27c0839ae3d54f92222464c72ab62" }, "downloads": -1, "filename": "deduplication-0.0.2.tar.gz", "has_sig": false, "md5_digest": "f493ab0b6a9ea52948be3de62112630d", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 2654, "upload_time": "2019-08-20T03:06:55", "url": "https://files.pythonhosted.org/packages/f9/51/f4c40e406e8fe3e5421d2edfdc0a73944b787c8591b75be680ba25e03b9d/deduplication-0.0.2.tar.gz" } ], "0.0.3": [ { "comment_text": "", "digests": { "md5": "1d04ecf536ef033ac5539f4847e0800c", "sha256": "93d281032bf44c6311b532146a9cb63a39f9b77b1037533f78180b9b3afcdedf" }, "downloads": -1, "filename": "deduplication-0.0.3-py3-none-any.whl", "has_sig": false, "md5_digest": "1d04ecf536ef033ac5539f4847e0800c", "packagetype": "bdist_wheel", "python_version": "py3", "requires_python": null, "size": 7552, "upload_time": "2019-08-21T12:43:29", "url": "https://files.pythonhosted.org/packages/10/fa/2c13ae4cf01ef31991ab3d7ecbc0fe86e24f6b1f9b26c7dde36797c691b9/deduplication-0.0.3-py3-none-any.whl" }, { "comment_text": "", "digests": { "md5": "560fc54f419473a488456643ab707690", "sha256": "545e75b2e6acd9a9ac0d32dfb9e50c6fcb6d11f79eeec5cef9a1ad3182efc983" }, "downloads": -1, "filename": "deduplication-0.0.3.tar.gz", "has_sig": false, "md5_digest": "560fc54f419473a488456643ab707690", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 2661, "upload_time": "2019-08-21T12:43:30", "url": "https://files.pythonhosted.org/packages/63/75/c2c29b42bcdaf9a9790f74e84e035a76e8be9a3f74402ef05db9cdbb8dd2/deduplication-0.0.3.tar.gz" } ] }, "urls": [ { "comment_text": "", "digests": { "md5": "1d04ecf536ef033ac5539f4847e0800c", "sha256": "93d281032bf44c6311b532146a9cb63a39f9b77b1037533f78180b9b3afcdedf" }, "downloads": -1, "filename": "deduplication-0.0.3-py3-none-any.whl", "has_sig": false, "md5_digest": "1d04ecf536ef033ac5539f4847e0800c", "packagetype": "bdist_wheel", "python_version": "py3", "requires_python": null, "size": 7552, "upload_time": "2019-08-21T12:43:29", "url": "https://files.pythonhosted.org/packages/10/fa/2c13ae4cf01ef31991ab3d7ecbc0fe86e24f6b1f9b26c7dde36797c691b9/deduplication-0.0.3-py3-none-any.whl" }, { "comment_text": "", "digests": { "md5": "560fc54f419473a488456643ab707690", "sha256": "545e75b2e6acd9a9ac0d32dfb9e50c6fcb6d11f79eeec5cef9a1ad3182efc983" }, "downloads": -1, "filename": "deduplication-0.0.3.tar.gz", "has_sig": false, "md5_digest": "560fc54f419473a488456643ab707690", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 2661, "upload_time": "2019-08-21T12:43:30", "url": "https://files.pythonhosted.org/packages/63/75/c2c29b42bcdaf9a9790f74e84e035a76e8be9a3f74402ef05db9cdbb8dd2/deduplication-0.0.3.tar.gz" } ] }