{ "info": { "author": "Ayush Singh", "author_email": "singhay@ccs.neu.edu", "bugtrack_url": null, "classifiers": [], "description": "===================\r\nPlagiarism Detector\r\n===================\r\n\r\nCalculates percentage of N-Tuples from file1 found in another file2::\r\n\r\n from plagiarismdetector.detector import Detector\r\n\r\n print Detector.detect(synonyms_file_path,\r\n eval_file_path,\r\n source_file_path,\r\n n_tuples_value=3)\r\n\r\n\r\n\r\nRunning\r\n=======\r\n``python plagiarismdetector/main.py synonyms_file_path eval_file_path source_file_path 3``\r\n\r\n\r\nAssumptions & Overview\r\n======================\r\n* Dependent on Python2.7\r\n* Tokenizer is only adapted to English language text using Penn TreeBank tokenizer, reason being it divides strings based off structures in english language that might fail in other languages e.g. in Hindi since sentence separators and punctuations are entirely different.\r\n* The module is optimized to be as fast as possible, some of the optimizations are:\r\n * Only n-grams for file 2 are generated and stored, file 1 n-tuples are generated but not stored.\r\n * Not holding generated n-grams in memory, a generator is used\r\n * Dictionary of n-grams is created from file2 n-grams for constant time lookup of file1 tuples\r\n * Keys in file2 n-gram dictionary contains hashes for tuples instead of actual tuples to reduce space complexity.\r\n * Since we are only concerned with percentage of file1 n-tuples found in file2, we do not need to store any tuples. Therefore, we first generate n-grams for file2 and then calculate count for file1 on the fly instead of generating all n-tuples of file1 and cross referencing it with those of file2.\r\n\r\n\r\nTesting\r\n=======\r\n``python -m unittest discover tests``\r\n\r\nHelp\r\n====\r\n``python plagiarismdetector/main.py -h``\r\n\r\npositional arguments\r\n--------------------\r\n ======================== ==============================================\r\n ``synonym_file_path`` Path to file to be used for synonyms\r\n ``evaluation_file_path`` Path to file to be evaluated\r\n ``source_file_path`` Path to file to be used as source for matching\r\n ``n-tuples`` Number of N-tuples, Optional and Defaults to 3\r\n ======================== ==============================================\r\n\r\noptional arguments\r\n------------------\r\n -h, --help show this help message and exit\r\n\r\nExample\r\n=======\r\n ================ ===============\r\n Returns 100.0\r\n ================ ===============\r\n Evaluation File go for a run\r\n Source File go for a jog\r\n N-tuples 3\r\n ================ ===============", "description_content_type": null, "docs_url": null, "download_url": "", "downloads": { "last_day": -1, "last_month": -1, "last_week": -1 }, "home_page": "http://pypi.python.org/pypi/PlagiarismDetector/", "keywords": "", "license": "MIT", "maintainer": "", "maintainer_email": "", "name": "PlagiarismDetector", "package_url": "https://pypi.org/project/PlagiarismDetector/", "platform": "", "project_url": "https://pypi.org/project/PlagiarismDetector/", "project_urls": { "Homepage": "http://pypi.python.org/pypi/PlagiarismDetector/" }, "release_url": "https://pypi.org/project/PlagiarismDetector/0.2.2/", "requires_dist": null, "requires_python": "", "summary": "Calculates percentage of N-Tuples from file1 found in another file2", "version": "0.2.2" }, "last_serial": 3166229, "releases": { "0.1.0": [ { "comment_text": "", "digests": { "md5": "b168acd6a6356524fb2baceef32244c8", "sha256": "822a6e24f2a0b786ba22b259879ab9a1722ed05ea253b1cb79ddc3613d738611" }, "downloads": -1, "filename": "PlagiarismDetector-0.1.0.tar.gz", "has_sig": false, "md5_digest": "b168acd6a6356524fb2baceef32244c8", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 5909, "upload_time": "2017-09-10T23:04:59", "url": "https://files.pythonhosted.org/packages/61/20/182fd9fcbc1e73649406126a1bacb0d15592d5d40eb8c7191ebf4b6c514c/PlagiarismDetector-0.1.0.tar.gz" } ], "0.1.1": [ { "comment_text": "", "digests": { "md5": "88d9b5eb76e535060f82a5d92c8fdda4", "sha256": "725203609a2fa70d7aafce547c0bca1bbaba4ed3a69e2ff3b07631d70848deed" }, "downloads": -1, "filename": "PlagiarismDetector-0.1.1.tar.gz", "has_sig": false, "md5_digest": "88d9b5eb76e535060f82a5d92c8fdda4", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 5972, "upload_time": "2017-09-10T23:14:14", "url": "https://files.pythonhosted.org/packages/a6/41/afe25e53c93a2b4f7555eeabf8bb4140ea7994f455980252af534741d764/PlagiarismDetector-0.1.1.tar.gz" } ], "0.1.2": [ { "comment_text": "", "digests": { "md5": "e0c896589e84a3164c8fe0d8ddb0af03", "sha256": "d40ea3c8cdb4a970982e5d4eaa377c2a6115506a160800ba7657bf970791b1b1" }, "downloads": -1, "filename": "PlagiarismDetector-0.1.2.tar.gz", "has_sig": false, "md5_digest": "e0c896589e84a3164c8fe0d8ddb0af03", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 6010, "upload_time": "2017-09-10T23:16:41", "url": "https://files.pythonhosted.org/packages/0d/52/a3398b05f29378fa17520338ca24932f9f242041addda2334f98ebfc6fea/PlagiarismDetector-0.1.2.tar.gz" } ], "0.1.3": [ { "comment_text": "", "digests": { "md5": "541dde0010294605821d68c9cb86f8c5", "sha256": "0a5f5814caf5467307b3604d8acb5147fcc21cccd9d772630f3fcc59463d6dd4" }, "downloads": -1, "filename": "PlagiarismDetector-0.1.3.tar.gz", "has_sig": false, "md5_digest": "541dde0010294605821d68c9cb86f8c5", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 6507, "upload_time": "2017-09-10T23:24:17", "url": "https://files.pythonhosted.org/packages/34/7d/a3c12f86683af92ace5bfadf9bd02a9682ff404b97b67491fae1db69c71b/PlagiarismDetector-0.1.3.tar.gz" } ], "0.1.4": [ { "comment_text": "", "digests": { "md5": "8653ccb493f228ccf679c6f47d460929", "sha256": "578631ed013e8f283a64fbd52c41b50d690e9f05535fd48a415956695adc8323" }, "downloads": -1, "filename": "PlagiarismDetector-0.1.4.tar.gz", "has_sig": false, "md5_digest": "8653ccb493f228ccf679c6f47d460929", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 6571, "upload_time": "2017-09-10T23:35:58", "url": "https://files.pythonhosted.org/packages/66/7e/363db9c13c417d8ac9101ea1e1d644ab198b5f985bfb56846d4f6e038307/PlagiarismDetector-0.1.4.tar.gz" } ], "0.1.5": [ { "comment_text": "", "digests": { "md5": "c1d39b4b44b443425b675f1ed13b4ffa", "sha256": "90171dc7687fede38e64881f961403dab815fc92dfc458d769c85b27a5064926" }, "downloads": -1, "filename": "PlagiarismDetector-0.1.5.tar.gz", "has_sig": false, "md5_digest": "c1d39b4b44b443425b675f1ed13b4ffa", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 6623, "upload_time": "2017-09-10T23:47:03", "url": "https://files.pythonhosted.org/packages/97/cb/f6d26e776f6e0f874e128ad82320ca5fa7caeadfb8b259d6a31843747b7f/PlagiarismDetector-0.1.5.tar.gz" } ], "0.1.6": [ { "comment_text": "", "digests": { "md5": "ae9a641663d9c841c9538bd8e78c2a5d", "sha256": "01bd8da49287bb92eed974e0af7dcb0a1dd797121c8286ff32ad781813ae33a2" }, "downloads": -1, "filename": "PlagiarismDetector-0.1.6.tar.gz", "has_sig": false, "md5_digest": "ae9a641663d9c841c9538bd8e78c2a5d", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 6604, "upload_time": "2017-09-10T23:48:15", "url": "https://files.pythonhosted.org/packages/a2/78/827e6db389c5a29e41f5db20e52f296133c14f75dbf6299c252b17cb4e09/PlagiarismDetector-0.1.6.tar.gz" } ], "0.1.7": [ { "comment_text": "", "digests": { "md5": "01f4ee35f06896f9905447c233637ba8", "sha256": "2f905ad64b6ca6c80c58518c8bcff81ebcee1840fd1238597eb3a71360c348b0" }, "downloads": -1, "filename": "PlagiarismDetector-0.1.7.tar.gz", "has_sig": false, "md5_digest": "01f4ee35f06896f9905447c233637ba8", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 6606, "upload_time": "2017-09-10T23:50:25", "url": "https://files.pythonhosted.org/packages/c2/7a/f2517f169086eb3d43269a3612a3163f64fe04a5098a06e19e65cd625cf3/PlagiarismDetector-0.1.7.tar.gz" } ], "0.1.8": [ { "comment_text": "", "digests": { "md5": "9e799d3668d5d2abb60515a21d1e1c4d", "sha256": "1164c0f52591a8563090c6a318ec12a4e606cc5d2c17d0d680d43d3c3cb60fa4" }, "downloads": -1, "filename": "PlagiarismDetector-0.1.8.tar.gz", "has_sig": false, "md5_digest": "9e799d3668d5d2abb60515a21d1e1c4d", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 6650, "upload_time": "2017-09-10T23:57:20", "url": "https://files.pythonhosted.org/packages/c1/4a/79c47b826300ac0338f759bc49dd7aa5710ef2323e1c5ced95146fa01b2b/PlagiarismDetector-0.1.8.tar.gz" } ], "0.1.9": [ { "comment_text": "", "digests": { "md5": "b8f9f360a701ad2adb2a24ceff7010da", "sha256": "beddd034063ab171027e15c61c00a3a80cec78d2c05eafbe065045b089d4be11" }, "downloads": -1, "filename": "PlagiarismDetector-0.1.9.tar.gz", "has_sig": false, "md5_digest": "b8f9f360a701ad2adb2a24ceff7010da", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 6667, "upload_time": "2017-09-11T00:14:05", "url": "https://files.pythonhosted.org/packages/d3/a1/201021267414200cc0f460b265db788043a8966bbc15d7ac2f91b3d221ab/PlagiarismDetector-0.1.9.tar.gz" } ], "0.2.0": [ { "comment_text": "", "digests": { "md5": "6556ea164c176e7a8852acb1b53a116b", "sha256": "68346cd24fc02d2ffb005db6066ec1153090e3dd7eac339727b20f80fbc94c64" }, "downloads": -1, "filename": "PlagiarismDetector-0.2.0.tar.gz", "has_sig": false, "md5_digest": "6556ea164c176e7a8852acb1b53a116b", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 6683, "upload_time": "2017-09-11T00:16:23", "url": "https://files.pythonhosted.org/packages/25/72/98a6aec90b0f959499a85dfd1697479f9bb4605cd8dcd5b500ebfd7dabdc/PlagiarismDetector-0.2.0.tar.gz" } ], "0.2.1": [ { "comment_text": "", "digests": { "md5": "a8ee66f11ca677e740d0dc9032fc9d9c", "sha256": "2f07b31e91aa806ef4ad99ceabff23a8645bf0c00dcb2f4235f5a8eb00495088" }, "downloads": -1, "filename": "PlagiarismDetector-0.2.1.tar.gz", "has_sig": false, "md5_digest": "a8ee66f11ca677e740d0dc9032fc9d9c", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 6728, "upload_time": "2017-09-11T00:18:06", "url": "https://files.pythonhosted.org/packages/08/ad/a94f90261900d8d162a6aa04c6297489b644030e4466c8952fdf909d22d0/PlagiarismDetector-0.2.1.tar.gz" } ], "0.2.2": [ { "comment_text": "", "digests": { "md5": "71714c5c981eb6e1616b87045edf31ee", "sha256": "0c90c9c1bfbadc5f82773f7afeaf6b7e12b6164bc7446bfa4af120834c9a580e" }, "downloads": -1, "filename": "PlagiarismDetector-0.2.2.tar.gz", "has_sig": false, "md5_digest": "71714c5c981eb6e1616b87045edf31ee", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 6738, "upload_time": "2017-09-11T20:58:47", "url": "https://files.pythonhosted.org/packages/1d/a0/9c75367c9182ecb23e05e8ca90ddf32c664d6ed645aa9200f34b806722df/PlagiarismDetector-0.2.2.tar.gz" } ] }, "urls": [ { "comment_text": "", "digests": { "md5": "71714c5c981eb6e1616b87045edf31ee", "sha256": "0c90c9c1bfbadc5f82773f7afeaf6b7e12b6164bc7446bfa4af120834c9a580e" }, "downloads": -1, "filename": "PlagiarismDetector-0.2.2.tar.gz", "has_sig": false, "md5_digest": "71714c5c981eb6e1616b87045edf31ee", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 6738, "upload_time": "2017-09-11T20:58:47", "url": "https://files.pythonhosted.org/packages/1d/a0/9c75367c9182ecb23e05e8ca90ddf32c664d6ed645aa9200f34b806722df/PlagiarismDetector-0.2.2.tar.gz" } ] }