{ "info": { "author": "Krzysztof Dorosz", "author_email": "cypreess@gmail.com", "bugtrack_url": null, "classifiers": [ "License :: OSI Approved :: GNU Library or Lesser General Public License (LGPL)", "Topic :: Text Processing", "Topic :: Text Processing :: Indexing", "Topic :: Text Processing :: Linguistic", "Topic :: Utilities" ], "description": "Welcome to Corpora!\n===================\n*Corpora* is a lightweight, fast and scalable corpus library able to store a collection of raw text documents with additional key-value headers. It uses Berkeley DB (bsddb3 module) for index managing what guarantee speed and bullet-proof. Text storage model is based on chunked flat, human readable text files. This architecture can easily scale up to millions documents, hundred of gigabytes collections.\n\nCorpora module provides four main features:\n * create a new corpus,\n * append documents to a corpus,\n * random access to any document in a corpus using it's unique ``id``,\n * sequential access to document collection (generator over collection).\n\nKey-Value document headers supports storing any kind of objects seriazable with yaml_. Corpora supports only append & read-only philosophy, for more information please read section :doc:`motivation`.\n\n.. _yaml: http://www.yaml.org/\n\nQuickstart\n----------\nInstallation:\n::\n \n > sudo pip install corpora\n\nBasic usage:\n\n \n >>> from corpora import Corpus\n >>> Corpus.create('/tmp/test_corpus')\n >>> c = Corpus('/tmp/test_corpus')\n >>> c.add('First document', 1)\n >>> c.add('Second document', 2)\n >>> c.save_indexes()\n >>> len(c)\n 2\n >>> c[1]\n ({'id': 1}, u'First document')\n >>> c[2]\n ({'id': 2}, u'Second document')\n >>> for t in c:\n ... print t\n ... \n ({'id': 1}, u'First document')\n ({'id': 2}, u'Second document')", "description_content_type": null, "docs_url": "https://pythonhosted.org/Corpora/", "download_url": "UNKNOWN", "downloads": { "last_day": -1, "last_month": -1, "last_week": -1 }, "home_page": "http://packages.python.org/Corpora", "keywords": "text utf corpus corpora nlp toolkit", "license": "LGPL", "maintainer": null, "maintainer_email": null, "name": "Corpora", "package_url": "https://pypi.org/project/Corpora/", "platform": "UNKNOWN", "project_url": "https://pypi.org/project/Corpora/", "project_urls": { "Download": "UNKNOWN", "Homepage": "http://packages.python.org/Corpora" }, "release_url": "https://pypi.org/project/Corpora/1.0/", "requires_dist": null, "requires_python": null, "summary": "Lightweight, fast and scalable text corpus library.", "version": "1.0" }, "last_serial": 784023, "releases": { "1.0": [ { "comment_text": "", "digests": { "md5": "02781c45591ff458819e13120394828d", "sha256": "208a68da259c6c5ccd36a85d83c5cfdf43f1fb25aa77d21b7a2b0c6bfa1cd1db" }, "downloads": -1, "filename": "Corpora-1.0.tar.gz", "has_sig": false, "md5_digest": "02781c45591ff458819e13120394828d", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 5147, "upload_time": "2011-12-14T18:25:44", "url": "https://files.pythonhosted.org/packages/6c/f5/998ee3d19c64e42a5a3839858ede61ccd504c13f24fbe3bf48ddb6fd3592/Corpora-1.0.tar.gz" } ] }, "urls": [ { "comment_text": "", "digests": { "md5": "02781c45591ff458819e13120394828d", "sha256": "208a68da259c6c5ccd36a85d83c5cfdf43f1fb25aa77d21b7a2b0c6bfa1cd1db" }, "downloads": -1, "filename": "Corpora-1.0.tar.gz", "has_sig": false, "md5_digest": "02781c45591ff458819e13120394828d", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 5147, "upload_time": "2011-12-14T18:25:44", "url": "https://files.pythonhosted.org/packages/6c/f5/998ee3d19c64e42a5a3839858ede61ccd504c13f24fbe3bf48ddb6fd3592/Corpora-1.0.tar.gz" } ] }