{ "info": { "author": "wansho", "author_email": "wanshojs@gmail.com", "bugtrack_url": null, "classifiers": [ "Development Status :: 4 - Beta", "License :: OSI Approved :: MIT License", "Operating System :: OS Independent", "Programming Language :: Python :: 3" ], "description": "# weibo-preprocess-toolkit\nWeibo Preprocess Toolkit\n\n## Getting Started\n\n### Installation\n\n```shell\npip install weibo-preprocess-toolkit\n```\n\n### Tutorial\n\n```Python\nfrom weibo_preprocess_toolkit import WeiboPreprocess\n\npreprocess = WeiboPreprocess()\n\ntest_weibo = \"\u6240\u4ee5\u6211\u90fd\u4e0d\u559d\u8499\u725b #\u5357\u4eac\u00b7\u5927\u884c\u5bab[\u5730\u70b9]#\uff0c\u4e00\u76f4\u4e0d\u559c\u6b22\u8499\u725b\u3002\u8b1d\u99ff\u6bc5 \u8d5e[122]\u8f6c\u53d1[11] [\u8d85\u8bdd] \u6536\u85cf09\u670811\u65e5 18:57\u00a0\"\n\n# traditional2simplified\nprint(preprocess.traditional2simplified(test_weibo))\n# \u6240\u4ee5\u6211\u90fd\u4e0d\u559d\u8499\u725b #\u5357\u4eac\u00b7\u5927\u884c\u5bab[\u5730\u70b9]#\uff0c\u4e00\u76f4\u4e0d\u559c\u6b22\u8499\u725b\u3002\u8c22\u9a8f\u6bc5 \u8d5e[122]\u8f6c\u53d1[11] [\u8d85\u8bdd] \u6536\u85cf09\u670811\u65e5 18:57\n\n# clean weibo with simplified Chinese\nprint(preprocess.clean(test_weibo))\n# \u6240\u4ee5\u6211\u90fd\u4e0d\u559d\u8499\u725b \u4e00\u76f4\u4e0d\u559c\u6b22\u8499\u725b \u8c22\u9a8f\u6bc5\n\n# clean weibo \nprint(preprocess.clean(test_weibo, simplified=False))\n# \u6240\u4ee5\u6211\u90fd\u4e0d\u559d\u8499\u725b \u4e00\u76f4\u4e0d\u559c\u6b22\u8499\u725b \u8b1d\u99ff\u6bc5\n\n# seg weibo, keep stop words\nprint(preprocess.cut(test_weibo))\n# ['\u6240\u4ee5', '\u6211', '\u90fd', '\u4e0d\u559d', '\u8499\u725b', '#', '\u5357\u4eac', '\u00b7', '\u5927\u884c\u5bab', '[', '\u5730\u70b9', ']', '#', '\uff0c', '\u4e00\u76f4', '\u4e0d\u559c\u6b22', '\u8499\u725b', '\u3002', '\u8b1d\u99ff\u6bc5', '\u8d5e', '[', '122', ']', '\u8f6c\u53d1', '[', '11', ']', '[', '\u8d85\u8bdd', ']', '\u6536\u85cf', '09', '\u6708', '11', '\u65e5', '18', 
':', '57', '\\xa0']\n\n# seg weibo, don't keep stop words\nprint(preprocess.cut(test_weibo, keep_stop_word=False))\n# ['\u90fd', '\u4e0d\u559d', '\u8499\u725b', '#', '\u5357\u4eac', '\u00b7', '\u5927\u884c\u5bab', '[', '\u5730\u70b9', ']', '#', '\uff0c', '\u4e0d\u559c\u6b22', '\u8499\u725b', '\u3002', '\u8b1d\u99ff\u6bc5', '\u8d5e', '[', '122', ']', '\u8f6c\u53d1', '[', '11', ']', '[', '\u8d85\u8bdd', ']', '\u6536\u85cf', '09', '\u6708', '11', '\u65e5', '18', ':', '57', '\\xa0']\n\n# clean and cut weibo, keep_stop_words, simplified Chinese\nprint(preprocess.preprocess(test_weibo))\n# \u6240\u4ee5 \u6211 \u90fd \u4e0d\u559d \u8499\u725b \u4e00\u76f4 \u4e0d\u559c\u6b22 \u8499\u725b \u8c22\u9a8f\u6bc5\nprint(preprocess.preprocess(test_weibo, simplified=False, keep_stop_word=False))\n# \u90fd \u4e0d\u559d \u8499\u725b \u4e0d\u559c\u6b22 \u8499\u725b \u8b1d\u99ff\u6bc5\n```\n\n\n## Introduction\n\u8be5\u5de5\u5177\u7528\u4e8e\u5fae\u535a\u6587\u672c\u7684\u9884\u5904\u7406\uff1a\u6e05\u6d17 + \u5206\u8bcd\u3002\n\n### Inspiration\n\u5728\u4e2d\u6587 NLP \u9886\u57df\uff0c\u6587\u672c\u6e05\u6d17\u548c\u5206\u8bcd\u5bf9\u4e8e\u6a21\u578b\u7684\u6027\u80fd\u6709\u7740\u5f88\u5927\u7684\u5f71\u54cd\uff0c\u5982\u679c\u8bed\u6599\u5e93\u548c\u6d4b\u8bd5\u96c6/**\u7ebf\u4e0a\u73af\u5883**\u7684\u6587\u672c\u6e05\u6d17\u89c4\u5219\u548c\u5206\u8bcd\u5de5\u5177\u4e0d\u540c\uff0c\u5c31\u4f1a\u5bfc\u81f4\u5728\u8bed\u6599\u5e93\u4e0a\u8bad\u7ec3\u51fa\u6765\u7684\u6a21\u578b\u5728\u6d4b\u8bd5\u96c6\u4e0a\u6548\u679c\u5f88\u5dee\u3002\u4e3e\u4f8b\u6765\u8bf4\uff0c\u8bed\u6599\u5e93\u91c7\u7528\u4e86\u6e05\u6d17\u89c4\u5219 Clean-A \u548c \u5206\u8bcd\u5de5\u5177 Seg-A \u6765\u6e05\u6d17\u548c\u5206\u8bcd\u5fae\u535a\uff0c\u800c\u7528\u6237\u5728\u7ebf\u4e0a\u73af\u5883\u91c7\u7528\u4e86\u53e6\u4e00\u79cd\u6e05\u6d17\u89c4\u5219 Clean-B \u548c\u53e6\u4e00\u79cd\u5206\u8bcd\u5de5\u5177 
Seg-B\uff0c\u90a3\u4e48\u7ebf\u4e0a\u73af\u5883\u5c31\u4f1a\u4ea7\u751f\u5f88\u591a\u4e0d\u5728\u8bed\u6599\u5e93\u8bcd\u5178\u4e2d\u7684**\u672a\u767b\u5f55\u8bcd\uff08Unknown Words\uff09**\uff0c\u8fd9\u4e9b\u672a\u767b\u5f55\u8bcd\u4f1a\u5bfc\u81f4\u9884\u5148\u8bad\u7ec3\u597d\u7684\u6a21\u578b\uff0c\u9762\u5bf9\u7ebf\u4e0a\u73af\u5883\u7684\u53e6\u4e00\u79cd\u89c4\u5219\u65f6\uff0c\u6027\u80fd\u53d8\u5dee\u3002\n\n\u672c\u4eba\u5728\u5bf9\u5fae\u535a\u8fdb\u884c\u60c5\u611f\u5206\u6790\u7684\u8fc7\u7a0b\u4e2d\uff0c\u603b\u7ed3\u4e86\u8f83\u591a\u7684\u5fae\u535a\u6e05\u6d17\u6280\u5de7\u548c\u5206\u8bcd\u89c4\u5219\uff0c\u5e76\u603b\u7ed3\u4e86\u4e00\u4efd\u5fae\u535a\u60c5\u611f\u5206\u6790\u8bcd\u5178\u7528\u4e8e\u4f18\u5316 jieba \u5206\u8bcd\u3002\u6240\u4ee5\u6211\u5728\u8fd9\u91cc\u5c1d\u8bd5\u5bf9\u5fae\u535a\u7684\u6e05\u6d17\u548c\u5206\u8bcd\u89c4\u5219\u8fdb\u884c\u6574\u7406\uff0c\u540c\u65f6\u4e5f\u662f\u4e3a\u4e86\u4fdd\u6301\u8bed\u6599\u5e93\u548c\u7ebf\u4e0a\u73af\u5883\u7684\u89c4\u5219\u540c\u6b65\uff0c\u4e3a\u5176\u4ed6\u7814\u7a76\u8005\u548c\u4f7f\u7528\u6211\u7684\u6a21\u578b\u7684\u4eba\uff0c\u63d0\u4f9b\u4e00\u4e2a\u548c\u8bed\u6599\u5e93\u5339\u914d\u7684\u6e05\u6d17\u548c\u5206\u8bcd\u89c4\u5219\u3002\n\n### Weibo Cleaning\n\n\u672c\u4eba\u5bf9\u5fae\u535a\u6587\u672c\u7684\u6e05\u6d17\u89c4\u5219\u8fdb\u884c\u4e86\u6574\u7406\uff0c\u4e3b\u8981\u6d89\u53ca\u5230\u5982\u4e0b\u7684\u89c4\u5219\uff1a\n\n1. \u4e2d\u6587\u7e41\u4f53\u8f6c\u7b80\u4f53\n2. [\u5fae\u535a\u505c\u7528\u8bcd\u89c4\u52191(\u6b63\u5219\u8868\u8fbe\u5f0f)](weibo_preprocess_toolkit/dictionary/weibo_stopwords1_regex.csv)\uff0c\u5305\u62ec url, email, @\u67d0\u4eba, \u5730\u70b9\uff0c\u2026\u2026 \u7b49\u505c\u7528\u8bcd\u89c4\u5219\n3.
[\u5fae\u535a\u505c\u7528\u8bcd\u89c4\u52192(\u6b63\u5219\u8868\u8fbe\u5f0f)](weibo_preprocess_toolkit/dictionary/weibo_stopwords2_regex.csv)\uff0c\u5305\u62ec \u65f6\u95f4\uff0c\u6570\u5b57\u548c\u5fae\u535a\u4e2d\u5e38\u51fa\u73b0\u7684\u65e0\u610f\u4e49\u7684\u8bcd\u7b49\u505c\u7528\u8bcd\u89c4\u5219\n4. [\u5fae\u535a\u7279\u6b8a\u5b57\u7b26](weibo_preprocess_toolkit/dictionary/special_chars.csv)\n5. \u5176\u4ed6\u7ec6\u8282\u5904\u7406\n\n\u6ce8\u610f\uff1a\u8003\u8651\u5230\u505c\u7528\u8bcd\u5728\u8bcd\u5411\u91cf\u8bad\u7ec3\u4e2d\u8854\u63a5\u4e0a\u4e0b\u6587\u7684\u4f5c\u7528\uff0c\u672c\u5de5\u5177\u5e76\u6ca1\u6709\u5bf9\u5fae\u535a\u7684\u505c\u7528\u8bcd\u8fdb\u884c\u6e05\u6d17\n\n### Weibo Seg\n\n\u57fa\u4e8e jieba \u5206\u8bcd\u5bf9\u5fae\u535a\u6587\u672c\u8fdb\u884c\u5206\u8bcd\u4f18\u5316\uff0c\u4f18\u5316\u7684\u5730\u65b9\u4e3b\u8981\u6709\u4e24\u70b9\uff1a\n\n1. \u6269\u5145 jieba \u5206\u8bcd\u8bcd\u5178\uff0c\u6784\u5efa\u60c5\u611f\u8bcd\u5178\uff0c\u4f18\u5316\u60c5\u611f\u5206\u6790\u7684\u5206\u8bcd\u7ed3\u679c\n2.
\u5bf9\u5426\u5b9a\u524d\u7f00\u8bcd\u8fdb\u884c\u7279\u6b8a\u5904\u7406\n\n## Dependencies\n```bash\npip install jieba\n```\n\n## Acknowledgment\n[jieba \u7ed3\u5df4\u4e2d\u6587\u5206\u8bcd](https://github.com/fxsjy/jieba)\n\n[nstools \u4e2d\u6587\u7e41\u4f53\u8f6c\u7b80\u4f53](https://github.com/skydark/nstools)\n\n[NTUSD \u60c5\u611f\u8bcd\u5178](https://www.aaai.org/Papers/Symposia/Spring/2006/SS-06-03/SS06-03-020.pdf)\n\n[\u54c8\u5de5\u5927\u505c\u7528\u8bcd\u8868](https://github.com/goto456/stopwords)\n\n## License\n\nMIT\n\n", "description_content_type": "text/markdown", "docs_url": null, "download_url": "", "downloads": { "last_day": -1, "last_month": -1, "last_week": -1 }, "home_page": "https://github.com/wansho/weibo-preprocess-toolkit", "keywords": "", "license": "MIT", "maintainer": "", "maintainer_email": "", "name": "weibo-preprocess-toolkit", "package_url": "https://pypi.org/project/weibo-preprocess-toolkit/", "platform": "", "project_url": "https://pypi.org/project/weibo-preprocess-toolkit/", "project_urls": { "Homepage": "https://github.com/wansho/weibo-preprocess-toolkit" }, "release_url": "https://pypi.org/project/weibo-preprocess-toolkit/1.1.0/", "requires_dist": [ "jieba (>=0.39)" ], "requires_python": ">=3", "summary": "Weibo Preprocess Toolkit.", "version": "1.1.0" }, "last_serial": 5526436, "releases": { "1.0.0": [ { "comment_text": "", "digests": { "md5": "6bc25c42902a29fb06420ab517e9c9f5", "sha256": "fa83f4d3d7354d79eef253070cf793dacb99a77285dfe32906500a5c9b333447" }, "downloads": -1, "filename": "weibo_preprocess_toolkit-1.0.0-py3-none-any.whl", "has_sig": false, "md5_digest": "6bc25c42902a29fb06420ab517e9c9f5", "packagetype": "bdist_wheel", "python_version": "py3", "requires_python": ">=3", "size": 252123, "upload_time": "2019-06-10T05:45:54", "url": "https://files.pythonhosted.org/packages/b6/5f/b61b094f36dabee0b69dcd359118a3d94d32c67557fca01cdb8ae35d89f6/weibo_preprocess_toolkit-1.0.0-py3-none-any.whl" }, { "comment_text": "", 
"digests": { "md5": "15365237283b7016aeb0ceaa5c71406f", "sha256": "abb3ddd38c1a27697c0ee97c24ca37c2eb5a2324529c48d2e98bf3f23dc09386" }, "downloads": -1, "filename": "weibo-preprocess-toolkit-1.0.0.tar.gz", "has_sig": false, "md5_digest": "15365237283b7016aeb0ceaa5c71406f", "packagetype": "sdist", "python_version": "source", "requires_python": ">=3", "size": 119212, "upload_time": "2019-06-10T05:46:01", "url": "https://files.pythonhosted.org/packages/02/bf/a0e5acd33e5bfda5b7c82e2e860e8c9ee3b577401f61f3fa2cc671b3fe1c/weibo-preprocess-toolkit-1.0.0.tar.gz" } ], "1.1.0": [ { "comment_text": "", "digests": { "md5": "56df0501228ac7caea820a351a3534c5", "sha256": "c1ee6e0d1ca45e47a5afdf7f52f27b6efdca2203669230af56b9caa857fc37c0" }, "downloads": -1, "filename": "weibo_preprocess_toolkit-1.1.0-py3-none-any.whl", "has_sig": false, "md5_digest": "56df0501228ac7caea820a351a3534c5", "packagetype": "bdist_wheel", "python_version": "py3", "requires_python": ">=3", "size": 254817, "upload_time": "2019-07-13T08:53:18", "url": "https://files.pythonhosted.org/packages/f2/fa/1ac334eeeef0546c8c4752435cb037995564272e1edb1b4354bcd4952272/weibo_preprocess_toolkit-1.1.0-py3-none-any.whl" }, { "comment_text": "", "digests": { "md5": "4f935a3be0c7dd3fe960dfb31928054b", "sha256": "aa33bcc820c810aa5d4007ef0f903e41f4df66a1029c188e883996f78cb6bf7a" }, "downloads": -1, "filename": "weibo-preprocess-toolkit-1.1.0.tar.gz", "has_sig": false, "md5_digest": "4f935a3be0c7dd3fe960dfb31928054b", "packagetype": "sdist", "python_version": "source", "requires_python": ">=3", "size": 200254, "upload_time": "2019-07-13T08:53:21", "url": "https://files.pythonhosted.org/packages/c7/e0/66ae4ab2cd1412689919135e7b5fbbbb5ee435df82c4df6ead5cdcc31890/weibo-preprocess-toolkit-1.1.0.tar.gz" } ] }, "urls": [ { "comment_text": "", "digests": { "md5": "56df0501228ac7caea820a351a3534c5", "sha256": "c1ee6e0d1ca45e47a5afdf7f52f27b6efdca2203669230af56b9caa857fc37c0" }, "downloads": -1, "filename": 
"weibo_preprocess_toolkit-1.1.0-py3-none-any.whl", "has_sig": false, "md5_digest": "56df0501228ac7caea820a351a3534c5", "packagetype": "bdist_wheel", "python_version": "py3", "requires_python": ">=3", "size": 254817, "upload_time": "2019-07-13T08:53:18", "url": "https://files.pythonhosted.org/packages/f2/fa/1ac334eeeef0546c8c4752435cb037995564272e1edb1b4354bcd4952272/weibo_preprocess_toolkit-1.1.0-py3-none-any.whl" }, { "comment_text": "", "digests": { "md5": "4f935a3be0c7dd3fe960dfb31928054b", "sha256": "aa33bcc820c810aa5d4007ef0f903e41f4df66a1029c188e883996f78cb6bf7a" }, "downloads": -1, "filename": "weibo-preprocess-toolkit-1.1.0.tar.gz", "has_sig": false, "md5_digest": "4f935a3be0c7dd3fe960dfb31928054b", "packagetype": "sdist", "python_version": "source", "requires_python": ">=3", "size": 200254, "upload_time": "2019-07-13T08:53:21", "url": "https://files.pythonhosted.org/packages/c7/e0/66ae4ab2cd1412689919135e7b5fbbbb5ee435df82c4df6ead5cdcc31890/weibo-preprocess-toolkit-1.1.0.tar.gz" } ] }