{ "info": { "author": "Isaac Sijaranamual", "author_email": "isaacsijaranamual@gmail.com", "bugtrack_url": null, "classifiers": [ "Development Status :: 4 - Beta", "Environment :: Console", "Intended Audience :: Science/Research", "License :: OSI Approved :: Apache Software License", "Operating System :: OS Independent", "Programming Language :: Python :: 2.7", "Topic :: Internet :: WWW/HTTP", "Topic :: Text Processing :: Linguistic", "Topic :: Utilities" ], "description": "Google News Crawler\n===================\n\nA utility to fetch news articles from `Google News`_.\n\nGNC retrieves the latest items from the Google News feeds and stores\nthem in ElasticSearch_ or on disk.\n\nWritten by `Isaac Sijaranamual`_ at the University of Amsterdam/ILPS_.\n\n.. _`Google News`: http://news.google.com/\n.. _ILPS: http://ilps.science.uva.nl/\n.. _ElasticSearch: http://www.elasticsearch.org/\n.. _`Isaac Sijaranamual`: mailto:isaacsijaranamual@gmail.com\n\n\nInstallation\n------------\n\nGoogle News Crawler can be installed with ``pip`` as usual::\n\n pip install google_news_crawler\n\n\nUsage\n-----\n\nRetrieve news items belonging to the 'science/technology' topic for\nthe region Botswana from Google News, storing the articles in an\nElasticSearch instance::\n\n google_news_crawler --datastore=ES --feed=\"http://news.google.com/news?cf=all&ned=en_bw&output=rss&topic=t&sort=newest\"\n\nYou would typically want to run a command like the one above in a\n``crontab`` to periodically fetch all the items::\n\n # m h dom mon dow command\n 01-59/10 * * * * google_news_crawler --log-config=/path/to/gnc/logging.yaml --datastore=ES --feed=\"http://news.google.com/news?cf=all&ned=en_bw&output=rss&topic=t&sort=newest\"\n\nThe complete list of usage options can be obtained with the ``--help``\nargument::\n\n google_news_crawler --help\n\n\nNota Bene\n---------\n\nThe store-to-disk backend is still available, but has been dropped as\na dependency because of a license incompatibility, since warc_ is\nlicensed under the GPL (version 2).\n\n.. _warc: https://pypi.python.org/pypi/warc\n\n\nTODO\n----\n\n* general\n\n * make user-agent configurable\n * expand documentation\n\n* Elasticsearch backend\n\n * make all ES related settings configurable\n * update metadata for existing documents instead of skipping them\n entirely\n * improve index mapping for the documents\n\n\nLicense\n-------\n\nCopyright 2013-2014 Isaac Sijaranamual, University of Amsterdam/ILPS\n\nLicensed under the Apache License, Version 2.0 (the \"License\"); you\nmay not use this Work or Derivative Works except in compliance with\nthe License. You may obtain a copy of the License at\n\n http://www.apache.org/licenses/LICENSE-2.0\n\nUnless required by applicable law or agreed to in writing, software\ndistributed under the License is distributed on an \"AS IS\" BASIS,\nWITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or\nimplied. See the License for the specific language governing\npermissions and limitations under the License.\n\n\n", "description_content_type": null, "docs_url": null, "download_url": "", "downloads": { "last_day": -1, "last_month": -1, "last_week": -1 }, "home_page": "https://bitbucket.org/ilps/google_news_crawler", "keywords": "Google News crawling RSS Atom feed download corpus creation", "license": "Apache License, Version 2.0", "maintainer": "", "maintainer_email": "", "name": "google_news_crawler", "package_url": "https://pypi.org/project/google_news_crawler/", "platform": "", "project_url": "https://pypi.org/project/google_news_crawler/", "project_urls": { "Homepage": "https://bitbucket.org/ilps/google_news_crawler" }, "release_url": "https://pypi.org/project/google_news_crawler/0.3.9/", "requires_dist": [ "docopt", "elasticsearch", "feedparser", "lxml", "pytz", "pyyaml", "requests", "tldextract" ], "requires_python": "", "summary": "Google News Crawler", "version": "0.3.9" }, "last_serial": 2389736, "releases": { "0.3.0": [ { "comment_text": "", "digests": { "md5": "cbc97d556bd75d6bc34e433864761531", "sha256": "f226e31d0e6caf30766292cedd0d095e00a8fbf4c154b6fe45da8f87bfd9f837" }, "downloads": -1, "filename": "google_news_crawler-0.3.0.tar.gz", "has_sig": false, "md5_digest": "cbc97d556bd75d6bc34e433864761531", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 16438, "upload_time": "2014-03-14T14:15:46", "url": "https://files.pythonhosted.org/packages/7d/5e/9173b970ba68aa764b694ebe702323f65bfc0bf5608934999626ade1ffce/google_news_crawler-0.3.0.tar.gz" } ], "0.3.1": [ { "comment_text": "", "digests": { "md5": "910e0adbf9eaa1637bdac91b091be479", "sha256": "4eb4303297244dc93964d7ea23ef78f894bdd8be51b358707c169a29327154e0" }, "downloads": -1, "filename": "google_news_crawler-0.3.1.tar.gz", "has_sig": false, "md5_digest": "910e0adbf9eaa1637bdac91b091be479", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 16429, "upload_time": "2014-03-14T19:17:25", "url": "https://files.pythonhosted.org/packages/a0/b3/055586b032a5f6becff6b7bb58a48405d53d0a49fb5a6feada3cb2755fbc/google_news_crawler-0.3.1.tar.gz" } ], "0.3.2": [ { "comment_text": "", "digests": { "md5": "695f1aa5f61edcacdd21f654f0382231", "sha256": "2f7b326f1190780b767738614f1c75cced2f0fb4a5b6263273122059b439c2c2" }, "downloads": -1, "filename": "google_news_crawler-0.3.2.tar.gz", "has_sig": false, "md5_digest": "695f1aa5f61edcacdd21f654f0382231", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 14695, "upload_time": "2014-03-14T20:38:08", "url": "https://files.pythonhosted.org/packages/69/d0/5fa24cbc7544de45a4414240403ff6f2c7ebea86d483c935a10fd99ec8ca/google_news_crawler-0.3.2.tar.gz" } ], "0.3.3": [ { "comment_text": "", "digests": { "md5": "d5f0a1cad7f51cdd4d963b30f6e0dab7", "sha256": "086892b30e156dc216a8c6e24775dfa855204997812ec28c3e2e2e9c04b3a6b8" }, "downloads": -1, "filename": "google_news_crawler-0.3.3.tar.gz", "has_sig": false, "md5_digest": "d5f0a1cad7f51cdd4d963b30f6e0dab7", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 14688, "upload_time": "2014-03-14T20:59:15", "url": "https://files.pythonhosted.org/packages/16/79/7941dfd52dbdb4825936282ff78bbca83a0deb51e68454b98b5878986ae9/google_news_crawler-0.3.3.tar.gz" } ], "0.3.4": [ { "comment_text": "", "digests": { "md5": "cdee01a5cc42cbcefa9e96b5c8b3be1f", "sha256": "fd7309b7ebdfccb15b388ebda6e5a919b0ce0be784a7daf7eea9f8a5279e362a" }, "downloads": -1, "filename": "google_news_crawler-0.3.4.tar.gz", "has_sig": false, "md5_digest": "cdee01a5cc42cbcefa9e96b5c8b3be1f", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 17044, "upload_time": "2014-03-14T21:14:38", "url": "https://files.pythonhosted.org/packages/01/ce/6ef518cd3df28f64bc16166c64b56c02418984923766a262ad63ba40555d/google_news_crawler-0.3.4.tar.gz" } ], "0.3.5": [ { "comment_text": "", "digests": { "md5": "a0f4eeb5e2a4c970db47152b2504bf49", "sha256": "6f432b88f56882cc1be96c5c0ce2d87dc1d2e7750e28d7e21bf50696a7c18661" }, "downloads": -1, "filename": "google_news_crawler-0.3.5.tar.gz", "has_sig": false, "md5_digest": "a0f4eeb5e2a4c970db47152b2504bf49", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 19215, "upload_time": "2014-03-15T22:13:00", "url": "https://files.pythonhosted.org/packages/d4/68/622956f1615aa42e259ee9312545d496d4fff86b5dfac90d1290aba027f5/google_news_crawler-0.3.5.tar.gz" } ], "0.3.6": [ { "comment_text": "", "digests": { "md5": "027960c1a565f72ec96df3c4f65f97e5", "sha256": "3977903956ecbd5332516c87b8d2910ac40142a8c2613ae0d70618470258269a" }, "downloads": -1, "filename": "google_news_crawler-0.3.6.tar.gz", "has_sig": false, "md5_digest": "027960c1a565f72ec96df3c4f65f97e5", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 19458, "upload_time": "2014-03-17T11:42:05", "url": "https://files.pythonhosted.org/packages/ed/8f/9e95d7d057f8ac13c5a30238034d2061e1b16ce1d6d2362699e1985d097c/google_news_crawler-0.3.6.tar.gz" } ], "0.3.7": [ { "comment_text": "", "digests": { "md5": "94b5dc3a205b286fb3542d7534d87c06", "sha256": "e1b802594f14f787fa86dd61e8567f03d0784fcfaf33205263a01fbe8f21b351" }, "downloads": -1, "filename": "google_news_crawler-0.3.7.tar.gz", "has_sig": false, "md5_digest": "94b5dc3a205b286fb3542d7534d87c06", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 23760, "upload_time": "2014-03-17T16:28:22", "url": "https://files.pythonhosted.org/packages/64/fd/5530c78a0e510790840e0ac7133178c16ac7ca004764e24f9c09a360ec6c/google_news_crawler-0.3.7.tar.gz" } ], "0.3.8": [ { "comment_text": "", "digests": { "md5": "07fea546b8e752558d82db41618a3fda", "sha256": "53991626abd80cbf554b29498465d1a47c27a75d330584c716747f1ca50d8af2" }, "downloads": -1, "filename": "google_news_crawler-0.3.8.tar.gz", "has_sig": false, "md5_digest": "07fea546b8e752558d82db41618a3fda", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 24167, "upload_time": "2014-03-17T16:29:12", "url": "https://files.pythonhosted.org/packages/43/34/c3c528407c5ac906ca76ff5e69ceb48952a5fb1e2f7fd4334be9036df5de/google_news_crawler-0.3.8.tar.gz" } ], "0.3.9": [ { "comment_text": "", "digests": { "md5": "bf559d0e3732537aca03ab3475f90be2", "sha256": "8142acc88cea681628bcfc549db0ccbcfa54d8715bb638fb5653100fc958caa7" }, "downloads": -1, "filename": "google_news_crawler-0.3.9-py2-none-any.whl", "has_sig": false, "md5_digest": "bf559d0e3732537aca03ab3475f90be2", "packagetype": "bdist_wheel", "python_version": "py2", "requires_python": null, "size": 16670, "upload_time": "2016-10-09T20:22:44", "url": "https://files.pythonhosted.org/packages/c0/97/22310b5392066ba055c00848648d815d4f4ba1775960ef1bcd4e739534f3/google_news_crawler-0.3.9-py2-none-any.whl" }, { "comment_text": "", "digests": { "md5": "7cf160c10f5ac60559d7adae85da3c40", "sha256": "7841ad137e3c51bf76e9cde71c921bac1ba4dc082f9f94857b77028789be4336" }, "downloads": -1, "filename": "google_news_crawler-0.3.9.tar.gz", "has_sig": false, "md5_digest": "7cf160c10f5ac60559d7adae85da3c40", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 23274, "upload_time": "2016-10-09T20:22:47", "url": "https://files.pythonhosted.org/packages/83/9d/499e6c0c24ffe0ade0655092fdb3742abd201a67500cc9556be7a77e254d/google_news_crawler-0.3.9.tar.gz" } ] }, "urls": [ { "comment_text": "", "digests": { "md5": "bf559d0e3732537aca03ab3475f90be2", "sha256": "8142acc88cea681628bcfc549db0ccbcfa54d8715bb638fb5653100fc958caa7" }, "downloads": -1, "filename": "google_news_crawler-0.3.9-py2-none-any.whl", "has_sig": false, "md5_digest": "bf559d0e3732537aca03ab3475f90be2", "packagetype": "bdist_wheel", "python_version": "py2", "requires_python": null, "size": 16670, "upload_time": "2016-10-09T20:22:44", "url": "https://files.pythonhosted.org/packages/c0/97/22310b5392066ba055c00848648d815d4f4ba1775960ef1bcd4e739534f3/google_news_crawler-0.3.9-py2-none-any.whl" }, { "comment_text": "", "digests": { "md5": "7cf160c10f5ac60559d7adae85da3c40", "sha256": "7841ad137e3c51bf76e9cde71c921bac1ba4dc082f9f94857b77028789be4336" }, "downloads": -1, "filename": "google_news_crawler-0.3.9.tar.gz", "has_sig": false, "md5_digest": "7cf160c10f5ac60559d7adae85da3c40", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 23274, "upload_time": "2016-10-09T20:22:47", "url": "https://files.pythonhosted.org/packages/83/9d/499e6c0c24ffe0ade0655092fdb3742abd201a67500cc9556be7a77e254d/google_news_crawler-0.3.9.tar.gz" } ] }