PK f£J šççazuresearchta.py#!/usr/bin/env python # -*- coding: utf-8 -*- import os import sys import argparse if sys.version_info[0] == 3: import http.client as httplib import urllib.request, urllib.parse, urllib.error from urllib.request import urlopen from urllib.parse import urlparse from io import StringIO else: import httplib from urllib import urlopen from urlparse import urlparse from io import open from StringIO import StringIO import simplejson as json from bs4 import BeautifulSoup, NavigableString, Declaration, Comment ### Global Defines _AZURE_SEARCH_TEXT_ANALYZE_VERSION = '0.2.1' _AZURE_SEARCH_API_VERSION = '2015-02-28-Preview' _AZURE_SEARCH_CONFIG_FILE = 'search.conf' def print_out(s): #print (type(outs)) if sys.version_info[0] == 3: sys.stdout.buffer.write(u"{}\n".format(s).encode('utf-8')) else: print(s.encode('utf-8')) def print_err(s): sys.stderr.write(u"[ERROR] {}\n".format(s)) def print_quit(s): print(s) quit() def format_join(l,f): o = u'' n = 0 for i in l: if n==0: o = o + f.format(i) else: o = o + u' ' o = o + f.format(i) n += 1 return o def is_URL(s): o = urlparse(s) if len(o.scheme) > 0: return True return False def read_config(s): config = {} f = open(s) line = f.readline().strip() while line: #print line line = f.readline().strip() # skip if line start from sharp if line[0:1] == '#': continue arrs=line.split('=') if len(arrs) != 2: continue config[arrs[0]] = arrs[1] f.close return config class WebScraper: def __init__(self, url): res = urlopen(url) self.__content = res.read().decode('utf-8') def __get_navigable_strings(self,soup): if isinstance(soup, NavigableString): if type(soup) not in (Comment, Declaration) and soup.strip(): yield soup elif soup.name not in ('script', 'style'): for c in soup.contents: for g in self.__get_navigable_strings(c): yield g def __get_Html_tag_stripped_text(self,s): soup = BeautifulSoup(s, "html.parser") return ' '.join(self.__get_navigable_strings(soup)) def get_content(self): return self.__content def get_html_stripped_content(self): return self.__get_Html_tag_stripped_text(self.__content) class AzureSearchClient: def __init__(self, api_url, api_key): self.__api_url=api_url self.__api_key=api_key self.headers={ 'Content-Type': "application/json; charset=UTF-8", 'Api-Key': self.__api_key, 'Accept': "application/json", 'Accept-Charset':"UTF-8" } def textanalyze(self,index_name, analyzer, text): # Create JSON string for request body reqobject={} reqobject['text'] = text reqobject['analyzer'] = analyzer io=StringIO() json.dump(reqobject, io) req_body = io.getvalue() # HTTP request to Azure search REST API conn = httplib.HTTPSConnection(self.__api_url) conn.request("POST", u"/indexes/{0}/analyze?api-version={1}".format(index_name, _AZURE_SEARCH_API_VERSION), req_body, self.headers) response = conn.getresponse() #print "status:", response.status, response.reason data = (response.read()).decode('utf-8') #print("data:{}".format(data)) conn.close() return data def main(): parser = argparse.ArgumentParser(description='This program do text analysis and generate formatted output by using Azure Search Text Analyze API') parser.add_argument( '-v','--version', action='version', version=_AZURE_SEARCH_TEXT_ANALYZE_VERSION) parser.add_argument( '-c','--conf', default=_AZURE_SEARCH_CONFIG_FILE, help='Azure Search Configuration file. Default:search.conf') parser.add_argument( '-i','--index', help='Azure Search index name') parser.add_argument( '-a','--analyzer', help='Azure Search analyzer name') parser.add_argument( '-t','--text', help='A file path or HTTP(s) URL from which the command line reads the text to analyze') parser.add_argument( '-o','--output', default='normal', help='Output format ("simple" or "normal"). Default:normal') args = parser.parse_args() ### Args Validation if not os.path.exists(args.conf): print_err(u"Azure Search config file doesn't exist: {0}\n" u"Please speicify the file with --conf option\n".format(args.conf)) print_quit(parser.parse_args(['-h'])) if not args.index: print_err(u"Please specify index name with --index option!\n") print_quit(parser.parse_args(['-h'])) if not args.analyzer: print_err(u"Please specify analyzer name with --analyzer option!\n") print_quit(parser.parse_args(['-h'])) if not args.text: print_err(u"Please specify text file with --text option!\n") print_quit(parser.parse_args(['-h'])) if not is_URL(args.text) and not os.path.exists(args.text): print_err(u"Please speicfiy either URL or text file path that really does exist for --text option value!: {}\n".format(args.text)) if args.output !="simple" and args.output !="normal": print_err(u"Please specify either \"simple\" or \"normal\" for --output option value!\n") ## Read from URL target_text = u'' if (is_URL(args.text)): ## Read from URL ws = WebScraper(args.text) target_text = ws.get_html_stripped_content() else: ## Read from file so = u'' with open(args.text, encoding='utf-8') as f: lines = f.readlines() for l in lines: l = l.strip() if len(l) > 1 and not l.isspace(): so = so + l target_text = so ### do Azure Search operations c = read_config(args.conf) client=AzureSearchClient( u"{0}.search.windows.net".format(c["SEARCH_SERVICE_NAME"]), c["SEARCH_API_KEY"]) resstr = client.textanalyze(args.index, args.analyzer, target_text) tokens=[] resobj=json.loads(resstr) tokenobjs = resobj['tokens'] for tokenobj in tokenobjs: tokens.append(tokenobj['token']) ### print TOKENS with specified output format outs = u'' if (args.output == 'simple'): outs = outs + format_join(tokens, u"'{}'" ) else: outs = outs + u'INPUT: ' outs = outs + target_text outs = outs + u'\n' outs = outs + u'TOKENS: ' outs = outs + format_join(tokens, u"[{}]" ) print_out(outs) PKèi£Jƒ;ï€ - -/azure_search_ta-0.2.1.dist-info/DESCRIPTION.rstazure-search-ta =============== Azure Search Test Analyzer API client tool that shows how an analyzer breaks text into tokens utlizing Azure Search `Analyze API `__. Web UI for Test Analyzer API ============================ Web UI Tool that allows you to see how an analyzer breaks text into tokens via Web UI. |image0| Installation is very simple - (1) just copying files under `azure-search-ta/ui `__ onto your web server, (2) Open analyze-api.php with your editor and configure your Azure Search serivce name and Azure Search API Admin key, that's it! Make sure if all related files are accessible from the web server, and also if .php file is executable in the web server. :: vi analyze-api.php $azureSearchAccount=""; $azureSearchApiKey = "" Command-Line Tool ================= 1. Installation --------------- Install `azure-search-ta `__ python package by uinsg `pip `__. Pip is a package management system used to install and manage software packages, such as those found in the `Python Package Index `__. :: pip install azure-search-ta 2. Preparation -------------- 2-1. Create Azure Search Account and configure search.conf ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ To enjoy text analysis using this command, you must create an Azure Search service in the Azure Portal. Please follow the instrucftion below: \* `Create a service `__ Once the Azure search account is created, add Azure Search service name and API Key to the following search.conf file. Regarding API Key, an admin key must be added instead of a query key as the Analyze API request requires an admin key. :: # Azure Search Service Name ( never put space before and after = ) SEARCH_SERVICE_NAME= # Azure Search API Admin Key ( never put space before and after = ) SEARCH_API_KEY= 2-2. Create Index Schema to Analyze Text ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ You need an index name to construct Azure Search Analyze API request internally in the tool. For creating an index, please follow the instruction below - `Create an Azure Search index `__ - `Azure Search Service REST API:Version 2015-02-28-Preview `__ Regardless of your index definitions you can test with any Azure Search's **predefined analyzers**. Therefore the following index schema (index name:'ta') is enough for the testing with predefined analyzers: :: { "name": "ta", "fields": [ { "name":"id", "type":"Edm.String", "key": true, "searchable": false }, { "name":"content", "type":"Edm.String" } ] } In the meanwhile, in order for you to test with your **custom analyzer**, you need to define the custom analyzer in your index definition. Here is a sample index schema (index name: 'tacustom') that has custom analyzer definition: :: { "name":"tacustom", "fields":[ { "name":"id", "type":"Edm.String", "key":true, "searchable":false }, { "name":"content","type":"Edm.String", "analyzer":"my_ngram" } ], "analyzers":[ { "name":"my_ngram", "@odata.type":"#Microsoft.Azure.Search.CustomAnalyzer", "charFilters": ["html_strip"], "tokenizer":"my_tokenizer", "tokenFilters":[ "cjk_width","lowercase" ] } ], "tokenizers":[ { "name":"my_tokenizer", "@odata.type":"#Microsoft.Azure.Search.NGramTokenizer", "minGram":2, "maxGram":5 } ] } [NOTE] For **predefined analyzers**, please refer to `Language support (Azure Search Service REST API) `__ and `this document `__'s `Analyzers' section `__. For **custom analyzers**, please refer to `Custom analyzers in Azure Search `__. 3. Executing command -------------------- azure-search-ta usage ~~~~~~~~~~~~~~~~~~~~~ :: usage: azure-search-ta [-h] [-v] [-c CONF] [-i INDEX] [-a ANALYZER] [-t TEXT] [-o OUTPUT] This program do text analysis and generate formatted output by using Azure Search Analyze API optional arguments: -h, --help show this help message and exit -v, --version show program's version number and exit -c CONF, --conf CONF Azure Search Configuration file. Default:search.conf -i INDEX, --index INDEX Azure Search index name -a ANALYZER, --analyzer ANALYZER Azure Search analyzer name -t TEXT, --text TEXT A file path or HTTP(s) URL from which the command line reads the text to analyze -o OUTPUT, --output OUTPUT Output format ("simple" or "normal"). Default:normal Example1: Analyzing text from a file with ja.microsoft analyzer and 'normal' output format ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Suppose you want to read text from simple1.txt and make analysis for the text with ja.microsoft analyzer :: $ cat sample1.txt å¾è¼©ã¯çŒ«ã§ã‚ã‚‹ $ azure-search-ta -c ./search.conf -i ta -a ja.microsoft --t sample1.txt INPUT: å¾è¼©ã¯çŒ«ã§ã‚ã‚‹ TOKENS: [å¾è¼©] [猫] [ã‚ã‚‹] Example2: Analyzing text from a file with ja.microsoft analyzer and 'simple' output format ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Suppose you want to read text from simple1.txt and make analysis for the text with ja.microsoft analyzer :: $ cat sample1.txt å¾è¼©ã¯çŒ«ã§ã‚ã‚‹ $ azure-search-ta -c ./search.conf -i ta -a ja.microsoft --t sample1.txt -o simple 'å¾è¼©' '猫' 'ã‚ã‚‹' Example3: Analyzing text from a file with custome analyzer and 'simple' output format ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Suppose you want to read text from simple1.txt and make analysis for the text with custom analyzer ('my\_ngram') defined in tacustom index :: $ cat sample1.txt å¾è¼©ã¯çŒ«ã§ã‚ã‚‹ $ azure-search-ta -c ./search.conf -i tacustom -a my_ngram --t sample1.txt -o simple 'å¾è¼©' 'å¾è¼©ã¯' 'å¾è¼©ã¯çŒ«ã§' 'å¾è¼©ã¯çŒ«' '輩ã¯çŒ«ã§ã‚' '輩ã¯' '輩ã¯çŒ«' '輩ã¯çŒ«ã§' 'ã¯çŒ«ã§ã‚' 'ã¯çŒ«ã§' 'ã¯çŒ«' 'ã¯çŒ«ã§ã‚ã‚‹' '猫ã§ã‚' '猫ã§' '猫㧠ã‚ã‚‹' 'ã§ã‚ã‚‹' 'ã§ã‚' 'ã‚ã‚‹' Example4: Analyzing text from URL with ja.lucene analyzer and 'simple' output format ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Suppose you want to read text from URL(http://www.yahoo.co.jp) and make analysis for the text with ja.lucene analyzer :: $ azure-search-ta -i ta -a ja.lucene --t http://www.yahoo.co.jp -o simple 'yahoo' 'japan' 'ヘルプ' 'yahoo' 'japan' 'トップページ' '機能' 'æ­£ã—ã' 'ã”' '利用' 'ã„ãŸã ã' '下記' '環境' 'å¿…è¦' 'windows' 'internet' 'explorer' '9' '0' '以上' 'chrome' '最新' '版' 'firefox' '最新' '版' 'microsoft' 'edge' 'macintosh' 'safari' '5' '0' '以上' 'internet' 'explorer' '9' '0' '以上' 'ã”' '利用' 'å ´åˆ' 'internet' 'explorer' '互æ›' '表示' 'å‚考' '互æ›' '表示' '無効' '化' '試ã—' 'ãã ã•ã‚‹' 'キャンペーン' 'å‚加' 'å®¶é›»' 'ブランド' 'å“' 'ãƒã‚¤ãƒ³ãƒˆ' '11' 'å€' 'ユニãƒãƒ¼ã‚µãƒ«' 'スタジオ' 'ジャパン' 'ã”' '招待' 'é›»å­' '書ç±' '5' '冊' '購入' '555' 'ãƒã‚¤ãƒ³ãƒˆ' ' 進呈' 'ニュース' '6' '時' '34' '分' 'æ›´æ–°' '韓国' 'å‰' '首席' '秘書官' '逮æ•' 'ç”·å…' '䏿˜Ž' '父' '供述' 'æµ®ã‹ã¶' '謎' '事故' '車外' '出る' 'ã¯ã­ã‚‹' '死亡' '麻薬' 'å–引' '疑惑' '市長' '射殺' '比' 'パナ' 'led' 'é›»çƒ' '5' 'å¹´' 'ä¿è¨¼' 'éŽåŽ»' 'ジョコビッãƒ' '世界' '1' 'ä½' '陥è½' 'ガイア' '夜明ã‘' '心' '刺ã•ã‚‹' '訳' 'ã•ã‚“ã¾' 'åˆ' '紅白' '出演' '濃厚' 'ã‚‚ã£ã¨' '見る' '記事' '一覧' '夜' 'ワラ' 'ゴジラ' '11' '月' '5' 'æ—¥' '19' '時' '40' '分' 'é…ä¿¡' '時事' '時事通信' '通信' 'ショッピング' 'ヤフオク' '旅行' 'ホテル' '予約' 'ニュース' '天気' 'スãƒãƒ¼ãƒ„ナビ' 'ファイナンス' 'テレビ' 'gyao' 'y' 'モãƒã‚²' '地域' '地図' '路線' '食ã¹ã‚‹' 'ログ' '求人' 'アルãƒã‚¤ãƒˆ' 'ä¸å‹•産' '自動車' '掲示æ¿' 'ブログ' 'ビューティ' '出会ã„' 'é›»å­' '書ç±' '映画' 'ゲーム' 'å ã„' 'サービス' '一覧' 'ログイン' 'id' 'ã‚‚ã£ã¨' '便利' 'æ–°è¦' 'å–å¾—' 'メール' 'メールアドレス' 'å–å¾—' 'カレンダ' 'カレンダ' '活用' 'ãƒã‚¤ãƒ³ãƒˆ' '確èª' 'ログイン' '履歴' '確èª' '会社' '概è¦' '投資' 'å®¶' '情報' '社会' 'çš„' '責任' '伿¥­' '行動' '憲章' '広 告' '掲載' '採用' '情報' '利用' 'è¦ç´„' 'å…責' '事項' 'メディア' 'ステートメント' 'セキュリティ' 'è€ƒãˆæ–¹' 'プライãƒã‚·' 'ãƒãƒªã‚·' 'copyright' 'c' '2016' 'yahoo' 'japan' 'corporation' 'all' 'rights' 'reserved' Suppose you want to read text from URL(http://news.microsoft.com/ja-jp/) and get the 10 most popular keywords that are contained in the results of test analysis with ja.lucene analyzer :: azure-search-ta -i ta -a ja.lucene --t http://news.microsoft.com/ja-jp/ -o simple | tr " " "\n" | sort |uniq -c | sort -nr |head -10 97 'ストア' 74 'デãƒã‚¤ã‚¹' 71 'マイクロソフト' 39 '日本' 32 'æ ªå¼ä¼šç¤¾' 32 'æ ªå¼' 32 '会社' 30 'ソフトウェア' 29 'microsoft' 27 '2016' Todo ---- - Support HTML output format option Change log ---------- - `Changelog `__ Links ----- - https://pypi.python.org/pypi/azure-search-ta/ - `Azure Search Analyze API `__ - `Language support (Azure Search Service REST API) `__ - `Custom analyzers in Azure Search `__ Contributing ------------ Bug reports and pull requests are welcome on GitHub at https://github.com/yokawasa/azure-search-ta. Copyright --------- .. raw:: html .. raw:: html :: .. raw:: html .. raw:: html :: .. raw:: html .. raw:: html
CopyrightCopyright (c) 2016- Yoichi Kawasaki
LicenseMIT
.. |image0| image:: https://github.com/yokawasa/azure-search-ta/raw/master/img/azure-search-ta-ui.gif PKèi£J¶÷zE660azure_search_ta-0.2.1.dist-info/entry_points.txt[console_scripts] azure-search-ta=azuresearchta:main PKèi£JHäa-azure_search_ta-0.2.1.dist-info/metadata.json{"classifiers": ["Environment :: Console", "Intended Audience :: Developers", "Intended Audience :: System Administrators", "License :: OSI Approved :: MIT License", "Programming Language :: Python :: 2.7", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.3", "Programming Language :: Python :: 3.4", "Programming Language :: Python :: 3.5", "Programming Language :: Python :: 3.6", "Topic :: Utilities"], "download_url": "https://pypi.python.org/pypi/azure-seatch-ta", "extensions": {"python.commands": {"wrap_console": {"azure-search-ta": "azuresearchta:main"}}, "python.details": {"contacts": [{"email": "yoichi.kawasaki@outlook.com", "name": "Yoichi Kawasaki", "role": "author"}], "document_names": {"description": "DESCRIPTION.rst"}, "project_urls": {"Home": "https://github.com/yokawasa/azure-search-ta"}}, "python.exports": {"console_scripts": {"azure-search-ta": "azuresearchta:main"}}}, "extras": [], "generator": "bdist_wheel (0.26.0)", "keywords": ["azure", "search", "azuresearch", "text", "analysis", "api"], "license": "MIT", "metadata_version": "2.0", "name": "azure-search-ta", "platform": "any", "run_requires": [{"requires": ["argparse", "beautifulsoup4", "simplejson"]}], "summary": "Azure Search Test Analyzer API Client Tool", "version": "0.2.1"}PKèi£J‰Än¸-azure_search_ta-0.2.1.dist-info/top_level.txtazuresearchta PKèi£JŒ''\\%azure_search_ta-0.2.1.dist-info/WHEELWheel-Version: 1.0 Generator: bdist_wheel (0.26.0) Root-Is-Purelib: true Tag: py2-none-any PKèi£J)D™Ææ0æ0(azure_search_ta-0.2.1.dist-info/METADATAMetadata-Version: 2.0 Name: azure-search-ta Version: 0.2.1 Summary: Azure Search Test Analyzer API Client Tool Home-page: https://github.com/yokawasa/azure-search-ta Author: Yoichi Kawasaki Author-email: yoichi.kawasaki@outlook.com License: MIT Download-URL: https://pypi.python.org/pypi/azure-seatch-ta Keywords: azure search azuresearch text analysis api Platform: any Classifier: Environment :: Console Classifier: Intended Audience :: Developers Classifier: Intended Audience :: System Administrators Classifier: License :: OSI Approved :: MIT License Classifier: Programming Language :: Python :: 2.7 Classifier: Programming Language :: Python :: 3 Classifier: Programming Language :: Python :: 3.3 Classifier: Programming Language :: Python :: 3.4 Classifier: Programming Language :: Python :: 3.5 Classifier: Programming Language :: Python :: 3.6 Classifier: Topic :: Utilities Requires-Dist: argparse Requires-Dist: beautifulsoup4 Requires-Dist: simplejson azure-search-ta =============== Azure Search Test Analyzer API client tool that shows how an analyzer breaks text into tokens utlizing Azure Search `Analyze API `__. Web UI for Test Analyzer API ============================ Web UI Tool that allows you to see how an analyzer breaks text into tokens via Web UI. |image0| Installation is very simple - (1) just copying files under `azure-search-ta/ui `__ onto your web server, (2) Open analyze-api.php with your editor and configure your Azure Search serivce name and Azure Search API Admin key, that's it! Make sure if all related files are accessible from the web server, and also if .php file is executable in the web server. :: vi analyze-api.php $azureSearchAccount=""; $azureSearchApiKey = "" Command-Line Tool ================= 1. Installation --------------- Install `azure-search-ta `__ python package by uinsg `pip `__. Pip is a package management system used to install and manage software packages, such as those found in the `Python Package Index `__. :: pip install azure-search-ta 2. Preparation -------------- 2-1. Create Azure Search Account and configure search.conf ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ To enjoy text analysis using this command, you must create an Azure Search service in the Azure Portal. Please follow the instrucftion below: \* `Create a service `__ Once the Azure search account is created, add Azure Search service name and API Key to the following search.conf file. Regarding API Key, an admin key must be added instead of a query key as the Analyze API request requires an admin key. :: # Azure Search Service Name ( never put space before and after = ) SEARCH_SERVICE_NAME= # Azure Search API Admin Key ( never put space before and after = ) SEARCH_API_KEY= 2-2. Create Index Schema to Analyze Text ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ You need an index name to construct Azure Search Analyze API request internally in the tool. For creating an index, please follow the instruction below - `Create an Azure Search index `__ - `Azure Search Service REST API:Version 2015-02-28-Preview `__ Regardless of your index definitions you can test with any Azure Search's **predefined analyzers**. Therefore the following index schema (index name:'ta') is enough for the testing with predefined analyzers: :: { "name": "ta", "fields": [ { "name":"id", "type":"Edm.String", "key": true, "searchable": false }, { "name":"content", "type":"Edm.String" } ] } In the meanwhile, in order for you to test with your **custom analyzer**, you need to define the custom analyzer in your index definition. Here is a sample index schema (index name: 'tacustom') that has custom analyzer definition: :: { "name":"tacustom", "fields":[ { "name":"id", "type":"Edm.String", "key":true, "searchable":false }, { "name":"content","type":"Edm.String", "analyzer":"my_ngram" } ], "analyzers":[ { "name":"my_ngram", "@odata.type":"#Microsoft.Azure.Search.CustomAnalyzer", "charFilters": ["html_strip"], "tokenizer":"my_tokenizer", "tokenFilters":[ "cjk_width","lowercase" ] } ], "tokenizers":[ { "name":"my_tokenizer", "@odata.type":"#Microsoft.Azure.Search.NGramTokenizer", "minGram":2, "maxGram":5 } ] } [NOTE] For **predefined analyzers**, please refer to `Language support (Azure Search Service REST API) `__ and `this document `__'s `Analyzers' section `__. For **custom analyzers**, please refer to `Custom analyzers in Azure Search `__. 3. Executing command -------------------- azure-search-ta usage ~~~~~~~~~~~~~~~~~~~~~ :: usage: azure-search-ta [-h] [-v] [-c CONF] [-i INDEX] [-a ANALYZER] [-t TEXT] [-o OUTPUT] This program do text analysis and generate formatted output by using Azure Search Analyze API optional arguments: -h, --help show this help message and exit -v, --version show program's version number and exit -c CONF, --conf CONF Azure Search Configuration file. Default:search.conf -i INDEX, --index INDEX Azure Search index name -a ANALYZER, --analyzer ANALYZER Azure Search analyzer name -t TEXT, --text TEXT A file path or HTTP(s) URL from which the command line reads the text to analyze -o OUTPUT, --output OUTPUT Output format ("simple" or "normal"). Default:normal Example1: Analyzing text from a file with ja.microsoft analyzer and 'normal' output format ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Suppose you want to read text from simple1.txt and make analysis for the text with ja.microsoft analyzer :: $ cat sample1.txt å¾è¼©ã¯çŒ«ã§ã‚ã‚‹ $ azure-search-ta -c ./search.conf -i ta -a ja.microsoft --t sample1.txt INPUT: å¾è¼©ã¯çŒ«ã§ã‚ã‚‹ TOKENS: [å¾è¼©] [猫] [ã‚ã‚‹] Example2: Analyzing text from a file with ja.microsoft analyzer and 'simple' output format ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Suppose you want to read text from simple1.txt and make analysis for the text with ja.microsoft analyzer :: $ cat sample1.txt å¾è¼©ã¯çŒ«ã§ã‚ã‚‹ $ azure-search-ta -c ./search.conf -i ta -a ja.microsoft --t sample1.txt -o simple 'å¾è¼©' '猫' 'ã‚ã‚‹' Example3: Analyzing text from a file with custome analyzer and 'simple' output format ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Suppose you want to read text from simple1.txt and make analysis for the text with custom analyzer ('my\_ngram') defined in tacustom index :: $ cat sample1.txt å¾è¼©ã¯çŒ«ã§ã‚ã‚‹ $ azure-search-ta -c ./search.conf -i tacustom -a my_ngram --t sample1.txt -o simple 'å¾è¼©' 'å¾è¼©ã¯' 'å¾è¼©ã¯çŒ«ã§' 'å¾è¼©ã¯çŒ«' '輩ã¯çŒ«ã§ã‚' '輩ã¯' '輩ã¯çŒ«' '輩ã¯çŒ«ã§' 'ã¯çŒ«ã§ã‚' 'ã¯çŒ«ã§' 'ã¯çŒ«' 'ã¯çŒ«ã§ã‚ã‚‹' '猫ã§ã‚' '猫ã§' '猫㧠ã‚ã‚‹' 'ã§ã‚ã‚‹' 'ã§ã‚' 'ã‚ã‚‹' Example4: Analyzing text from URL with ja.lucene analyzer and 'simple' output format ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Suppose you want to read text from URL(http://www.yahoo.co.jp) and make analysis for the text with ja.lucene analyzer :: $ azure-search-ta -i ta -a ja.lucene --t http://www.yahoo.co.jp -o simple 'yahoo' 'japan' 'ヘルプ' 'yahoo' 'japan' 'トップページ' '機能' 'æ­£ã—ã' 'ã”' '利用' 'ã„ãŸã ã' '下記' '環境' 'å¿…è¦' 'windows' 'internet' 'explorer' '9' '0' '以上' 'chrome' '最新' '版' 'firefox' '最新' '版' 'microsoft' 'edge' 'macintosh' 'safari' '5' '0' '以上' 'internet' 'explorer' '9' '0' '以上' 'ã”' '利用' 'å ´åˆ' 'internet' 'explorer' '互æ›' '表示' 'å‚考' '互æ›' '表示' '無効' '化' '試ã—' 'ãã ã•ã‚‹' 'キャンペーン' 'å‚加' 'å®¶é›»' 'ブランド' 'å“' 'ãƒã‚¤ãƒ³ãƒˆ' '11' 'å€' 'ユニãƒãƒ¼ã‚µãƒ«' 'スタジオ' 'ジャパン' 'ã”' '招待' 'é›»å­' '書ç±' '5' '冊' '購入' '555' 'ãƒã‚¤ãƒ³ãƒˆ' ' 進呈' 'ニュース' '6' '時' '34' '分' 'æ›´æ–°' '韓国' 'å‰' '首席' '秘書官' '逮æ•' 'ç”·å…' '䏿˜Ž' '父' '供述' 'æµ®ã‹ã¶' '謎' '事故' '車外' '出る' 'ã¯ã­ã‚‹' '死亡' '麻薬' 'å–引' '疑惑' '市長' '射殺' '比' 'パナ' 'led' 'é›»çƒ' '5' 'å¹´' 'ä¿è¨¼' 'éŽåŽ»' 'ジョコビッãƒ' '世界' '1' 'ä½' '陥è½' 'ガイア' '夜明ã‘' '心' '刺ã•ã‚‹' '訳' 'ã•ã‚“ã¾' 'åˆ' '紅白' '出演' '濃厚' 'ã‚‚ã£ã¨' '見る' '記事' '一覧' '夜' 'ワラ' 'ゴジラ' '11' '月' '5' 'æ—¥' '19' '時' '40' '分' 'é…ä¿¡' '時事' '時事通信' '通信' 'ショッピング' 'ヤフオク' '旅行' 'ホテル' '予約' 'ニュース' '天気' 'スãƒãƒ¼ãƒ„ナビ' 'ファイナンス' 'テレビ' 'gyao' 'y' 'モãƒã‚²' '地域' '地図' '路線' '食ã¹ã‚‹' 'ログ' '求人' 'アルãƒã‚¤ãƒˆ' 'ä¸å‹•産' '自動車' '掲示æ¿' 'ブログ' 'ビューティ' '出会ã„' 'é›»å­' '書ç±' '映画' 'ゲーム' 'å ã„' 'サービス' '一覧' 'ログイン' 'id' 'ã‚‚ã£ã¨' '便利' 'æ–°è¦' 'å–å¾—' 'メール' 'メールアドレス' 'å–å¾—' 'カレンダ' 'カレンダ' '活用' 'ãƒã‚¤ãƒ³ãƒˆ' '確èª' 'ログイン' '履歴' '確èª' '会社' '概è¦' '投資' 'å®¶' '情報' '社会' 'çš„' '責任' '伿¥­' '行動' '憲章' '広 告' '掲載' '採用' '情報' '利用' 'è¦ç´„' 'å…責' '事項' 'メディア' 'ステートメント' 'セキュリティ' 'è€ƒãˆæ–¹' 'プライãƒã‚·' 'ãƒãƒªã‚·' 'copyright' 'c' '2016' 'yahoo' 'japan' 'corporation' 'all' 'rights' 'reserved' Suppose you want to read text from URL(http://news.microsoft.com/ja-jp/) and get the 10 most popular keywords that are contained in the results of test analysis with ja.lucene analyzer :: azure-search-ta -i ta -a ja.lucene --t http://news.microsoft.com/ja-jp/ -o simple | tr " " "\n" | sort |uniq -c | sort -nr |head -10 97 'ストア' 74 'デãƒã‚¤ã‚¹' 71 'マイクロソフト' 39 '日本' 32 'æ ªå¼ä¼šç¤¾' 32 'æ ªå¼' 32 '会社' 30 'ソフトウェア' 29 'microsoft' 27 '2016' Todo ---- - Support HTML output format option Change log ---------- - `Changelog `__ Links ----- - https://pypi.python.org/pypi/azure-search-ta/ - `Azure Search Analyze API `__ - `Language support (Azure Search Service REST API) `__ - `Custom analyzers in Azure Search `__ Contributing ------------ Bug reports and pull requests are welcome on GitHub at https://github.com/yokawasa/azure-search-ta. Copyright --------- .. raw:: html .. raw:: html :: .. raw:: html .. raw:: html :: .. raw:: html .. raw:: html
CopyrightCopyright (c) 2016- Yoichi Kawasaki
LicenseMIT
.. |image0| image:: https://github.com/yokawasa/azure-search-ta/raw/master/img/azure-search-ta-ui.gif PKèi£Jm RÒÒ&azure_search_ta-0.2.1.dist-info/RECORDazuresearchta.py,sha256=sWfzpgX5jpL6yIbyZlL-91yP4C_kUjt0ekmm6F6ddRk,6887 azure_search_ta-0.2.1.dist-info/DESCRIPTION.rst,sha256=cV_qekkWnijwE8nrXazetj2eHmwXDZd0kuI0x5z98x8,11552 azure_search_ta-0.2.1.dist-info/METADATA,sha256=IsxBuxmxSMYP2nCntPzBUFqoQ1RDB28MEnTEIr0diyM,12518 azure_search_ta-0.2.1.dist-info/RECORD,, azure_search_ta-0.2.1.dist-info/WHEEL,sha256=JTb7YztR8fkPg6aSjc571Q4eiVHCwmUDlX8PhuuqIIE,92 azure_search_ta-0.2.1.dist-info/entry_points.txt,sha256=UU_EO1Q_BXzrrxVIR8Ov1Q9dVjwXw2RUxDNK3pp_msA,54 azure_search_ta-0.2.1.dist-info/metadata.json,sha256=Ks0DHqE_DfXQEv1YVIlu6a7UpepOhfhXRiiPOOBzxOg,1294 azure_search_ta-0.2.1.dist-info/top_level.txt,sha256=9zR187JE4HHY3--y4Y6dU07mXiuOYgQzU9uS6ZraK-g,14 PK f£J šççazuresearchta.pyPKèi£Jƒ;ï€ - -/azure_search_ta-0.2.1.dist-info/DESCRIPTION.rstPKèi£J¶÷zE660‚Hazure_search_ta-0.2.1.dist-info/entry_points.txtPKèi£JHäa-Iazure_search_ta-0.2.1.dist-info/metadata.jsonPKèi£J‰Än¸-_Nazure_search_ta-0.2.1.dist-info/top_level.txtPKèi£JŒ''\\%¸Nazure_search_ta-0.2.1.dist-info/WHEELPKèi£J)D™Ææ0æ0(WOazure_search_ta-0.2.1.dist-info/METADATAPKèi£Jm RÒÒ&ƒ€azure_search_ta-0.2.1.dist-info/RECORDPK¬™ƒ