{ "info": { "author": "Viet Le", "author_email": "vietlq85@gmail.com", "bugtrack_url": null, "classifiers": [ "Development Status :: 3 - Alpha", "Environment :: Console", "Intended Audience :: Developers", "License :: OSI Approved :: Apache Software License", "Operating System :: OS Independent", "Programming Language :: Python", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.4", "Programming Language :: Python :: 3.5", "Programming Language :: Python :: 3.6", "Topic :: Software Development :: Libraries :: Python Modules" ], "description": "A simple sitemap builder\n========================\n\nThe sitemap builder traverses links from a website and constrains itself to\nthe given domain name. The final result will be a simple sitemap deduced\nfrom the links visited. The crawler will accept & process only URLs with\nhttp or https schemes.\n\nInstallation and usage\n======================\n\nRun the following command to install the tool:\n\n.. code-block:: bash\n\n pip install -U sitemapbuilder\n\nTo run the sitemap builder:\n\n.. code-block:: bash\n\n sitemapbuilder -u 'https://monzo.com' -o test_monzo.dot\n\nSome websites have strong protection and the tool will not work for them:\n\n.. code-block:: bash\n\n sitemapbuilder -u 'https://bloomberg.com' -o test_bloomberg.dot\n\nHighlights\n==========\n\n#. Generate Graphviz `.dot` file showing directed links between pages. One can generate PNG/PDF and other image/document formats.\n#. Have `configurable decay` (maximum depth) to avoid abuse.\n#. Visit web links within the same hostname by default.\n#. Use `5 threads` by default and time out after `10 seconds`.\n#. Time out after `5 seconds` when fetching a URL.\n#. Handle timeout exceptions when querying a website.\n#. Send a `HTTP HEAD` request and verify that `Content-Type` is `text/html` and `charset` is either `UTF-8` or `US-ASCII`.\n#. Have a map of visited URLs to avoid revisiting them.\n#. 
Follow HTTP redirects.\n\nUpcoming features\n=================\n* Configure the number of threads and timeout via cmd args.\n* Allow web links from all subdomains.\n* Allow web links from a list of domains.\n* Allow web links matching a pattern.\n* Add an option for hierarchical sitemap instead of directed graph.\n* Use PriorityQueue instead of Queue to process links with higher decay first.\n* Fine-grained info, warn and error logging.\n* Pass seed links from a file.\n* Save to and resume from a DB/persistent data source.\n* Faster concurrency and better performance with asyncio.\n", "description_content_type": "", "docs_url": null, "download_url": "", "downloads": { "last_day": -1, "last_month": -1, "last_week": -1 }, "home_page": "https://github.com/vietlq/sitemapbuilder", "keywords": "sitemapbuilder sitemap builder http", "license": "", "maintainer": "", "maintainer_email": "", "name": "sitemapbuilder", "package_url": "https://pypi.org/project/sitemapbuilder/", "platform": "", "project_url": "https://pypi.org/project/sitemapbuilder/", "project_urls": { "Homepage": "https://github.com/vietlq/sitemapbuilder" }, "release_url": "https://pypi.org/project/sitemapbuilder/0.0.7/", "requires_dist": null, "requires_python": "", "summary": "Simple sitemap builder", "version": "0.0.7" }, "last_serial": 5299750, "releases": { "0.0.1": [ { "comment_text": "", "digests": { "md5": "490b098defaedb49a966a0fed4e0ea47", "sha256": "e8efb20e56a3aea4e5de576f6467d94448c8366a6b8f9c42327f1de3ad5cf929" }, "downloads": -1, "filename": "sitemapbuilder-0.0.1.tar.gz", "has_sig": false, "md5_digest": "490b098defaedb49a966a0fed4e0ea47", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 5206, "upload_time": "2019-05-12T21:23:36", "url": "https://files.pythonhosted.org/packages/ff/78/956cc67e415827537f39a182d9b7843b209596b7e65b4e6e921aa13eb53b/sitemapbuilder-0.0.1.tar.gz" } ], "0.0.2": [ { "comment_text": "", "digests": { "md5": 
"534e2b8829c94809a344cc968070a7b4", "sha256": "bd007cd549c5304895d1367f3000d33bb98e1022808fcbf21ec47a50a5042429" }, "downloads": -1, "filename": "sitemapbuilder-0.0.2.tar.gz", "has_sig": false, "md5_digest": "534e2b8829c94809a344cc968070a7b4", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 5193, "upload_time": "2019-05-12T21:25:30", "url": "https://files.pythonhosted.org/packages/20/7c/f1ab18f7655fa7cae20ecac1d1b73a3cf5b4b4d8c9215ba862d3100950cd/sitemapbuilder-0.0.2.tar.gz" } ], "0.0.3": [ { "comment_text": "", "digests": { "md5": "56b6ceefbb52a66dcb4bc1c610b47579", "sha256": "8da7b2f6899203eda156f785b94faaffe44f00790daa9989007a0fb2dc33c9e0" }, "downloads": -1, "filename": "sitemapbuilder-0.0.3.tar.gz", "has_sig": false, "md5_digest": "56b6ceefbb52a66dcb4bc1c610b47579", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 5624, "upload_time": "2019-05-12T22:00:55", "url": "https://files.pythonhosted.org/packages/f4/1a/aa96a0f10443694e86766191513a0c10230a9b3a426300d69995842cd880/sitemapbuilder-0.0.3.tar.gz" } ], "0.0.4": [ { "comment_text": "", "digests": { "md5": "318dc4926ed6355918c5c1eff8f650bd", "sha256": "dd5a2e5ea133861921d1b63c36429bf1c62d8f454fe7acb9cb0d64e71c2ba6c5" }, "downloads": -1, "filename": "sitemapbuilder-0.0.4.tar.gz", "has_sig": false, "md5_digest": "318dc4926ed6355918c5c1eff8f650bd", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 5651, "upload_time": "2019-05-13T09:19:30", "url": "https://files.pythonhosted.org/packages/1c/66/75d4d6403685bfbc0086a5049490b735ae1ba8540ad70ea58b062a2d3516/sitemapbuilder-0.0.4.tar.gz" } ], "0.0.5": [ { "comment_text": "", "digests": { "md5": "4019f1117cdb9e6b1a0a77e2709bb84f", "sha256": "2c85e515fedb1a9dd7ce3070daa27bc475def8c17406ca5eeafce169b29c84f5" }, "downloads": -1, "filename": "sitemapbuilder-0.0.5.tar.gz", "has_sig": false, "md5_digest": "4019f1117cdb9e6b1a0a77e2709bb84f", "packagetype": "sdist", 
"python_version": "source", "requires_python": null, "size": 6064, "upload_time": "2019-05-13T10:45:16", "url": "https://files.pythonhosted.org/packages/9e/70/a9cf5c9a93ea06036ae8c72c83a6648c6a5887cb829bfef3a0b096ba8a5a/sitemapbuilder-0.0.5.tar.gz" } ], "0.0.6": [ { "comment_text": "", "digests": { "md5": "6835322503b0b76b3657ac751f68e45e", "sha256": "5be7e99be7dbc5888a258781735bb27f44dcfb17b2454d5dcf8a8904518f3150" }, "downloads": -1, "filename": "sitemapbuilder-0.0.6.tar.gz", "has_sig": false, "md5_digest": "6835322503b0b76b3657ac751f68e45e", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 6114, "upload_time": "2019-05-15T08:07:19", "url": "https://files.pythonhosted.org/packages/7d/2e/cfe86369c8bc8c82445182747db9723deb097ef4b63587b4840504a299f2/sitemapbuilder-0.0.6.tar.gz" } ], "0.0.7": [ { "comment_text": "", "digests": { "md5": "be7e603d126eb38fac3557f840094e0a", "sha256": "e749f0336d4707ce2007d14caf07f90efdebc4ffefadbfddc593898acfd085c0" }, "downloads": -1, "filename": "sitemapbuilder-0.0.7.tar.gz", "has_sig": false, "md5_digest": "be7e603d126eb38fac3557f840094e0a", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 6084, "upload_time": "2019-05-21T20:52:27", "url": "https://files.pythonhosted.org/packages/79/9c/5218276f3476c6d9d77f3737930015435b3b9d2464bfe00f99500436a477/sitemapbuilder-0.0.7.tar.gz" } ] }, "urls": [ { "comment_text": "", "digests": { "md5": "be7e603d126eb38fac3557f840094e0a", "sha256": "e749f0336d4707ce2007d14caf07f90efdebc4ffefadbfddc593898acfd085c0" }, "downloads": -1, "filename": "sitemapbuilder-0.0.7.tar.gz", "has_sig": false, "md5_digest": "be7e603d126eb38fac3557f840094e0a", "packagetype": "sdist", "python_version": "source", "requires_python": null, "size": 6084, "upload_time": "2019-05-21T20:52:27", "url": "https://files.pythonhosted.org/packages/79/9c/5218276f3476c6d9d77f3737930015435b3b9d2464bfe00f99500436a477/sitemapbuilder-0.0.7.tar.gz" } ] }