PKDEwI8o0o0pycogserv/SearchWeb.pyfrom socket import gethostname, gethostbyname from time import sleep import requests from pycogserv.dict_mod import OrderedDictWithPrepend from pycogserv.constants import user_constants, static_constants from pycogserv.validations import QueryChecker, ResponseChecker # import pdb """ Massive swaths of this v5 API interface were graciously stolen from py-bing-search you can find it here: https://github.com/tristantao/py-bing-search Modify query params in class 'constants.' - You can create your own own query-param-dict as a replacement, but use OrderedDict. 'q' must be the first key at runtime. - Dict entries of format "YourDict[key] == None" will be ignored and can therefore be safely included. TODO: - Add image/news/video classes w/ support for API-specific querying --Base Endpoint URLs for these are partially built in class "constants" - implement paging with self.current_offset. """ class BingSearch(object): """ Base-Class to elimnate redundancy for the common functionalities that cut across APIs """ def __init__(self, api_key, query, safe=False, header_dict=user_constants.HEADERS): self.api_key = api_key self.safe = safe self.query = query # Paging-support self.current_offset = 0 self.total_estimated_matches = None self.last_url_sent = None # Cache last response self.last_response = None self.last_response_packaged = None if header_dict is user_constants.HEADERS: self.header = header_dict self.header.prepend('Ocp-Apim-Subscription-Key', api_key) for key, val in list(self.header.items()): if val == None: del self.header[key] else: self.header = self.manual_header_entry() def search(self, limit=50, **kwargs): """ :param limit: number of results to return. max is 50. :return json_results: a mess of json right now...in a dictionary. """ return self._search(limit=50, **kwargs) def manual_header_entry(self): """ Specify your own headers like a BOSS! (Note: spoof At your own peril. Complications abound.) :return: Nothing. This sets input. No type checking for max customizations. """ while True: headr = OrderedDictWithPrepend() if not headr: api_key = eval(input('enter your api key')) ua_str = eval(input('enter a valid User-Agent string')) ipaddr = eval(input('enter your ip address (or leave blank to autodetect)')) if not ipaddr: ipaddr = gethostbyname(gethostname()) headr['Ocp-Apim-Subscription-Key'] = api_key headr['User-Agent'] = ua_str headr['X-Search-ClientIP'] = ipaddr print(('\nYour auth-key is {}\nYour User-Agent string is {}\nYour ip address will appear as {}\n\n\n\n'.format( headr['Ocp-Apim-Subscription-Key'], headr['User-Agent'], headr['X-Search-ClientIP']))) response1 = eval(input( 'To change your auth-key enter (a)\nTo change your User-Agent string enter (u)\nTo change your ip address enter (i)\n\nIf you are satisfied, press (y) to confirm, or (n) to start over.\n> :')) if response1.lower() == 'y': return headr elif response1.lower() == 'n': del headr continue elif response1.lower() == 'a': headr['Ocp-Apim-Subscription-Key'] = eval(input('enter your api key')) continue elif response1.lower() == 'u': headr['User-Agent'] = eval(input('enter a valid User-Agent string')) continue elif response1.lower() == 'i': headr['X-Search-ClientIP'] = eval(input('enter "your" ip address')) continue else: print(('{} is not a valid option. Try again.'.format(response1))) continue class BingWebSearch(BingSearch): """ Web Search Object. Allows for default or manual header entry. Mandatory fields are 'api_key' and 'query' Other defaults will specify you as a firefox user, add no addtnl query params, and will give you the max of 50 results returned for your query Currently no support for paging, but functionality is in the works. """ def __init__(self, api_key, query, safe=False, header_dict=user_constants.HEADERS, addtnl_params=user_constants.INCLUDED_PARAMS): self.BASE_URL = static_constants.WEBSEARCH_ENDPOINT self.param_dict = OrderedDictWithPrepend() if addtnl_params and type(addtnl_params) == OrderedDictWithPrepend: for key, value in list(addtnl_params.items()): if key in static_constants.BASE_QUERY_PARAMS[2:]: self.param_dict[key] = addtnl_params[key] else: raise ValueError('One or more keys in param-dict are not valid params.') elif not addtnl_params: pass else: raise TypeError('Additional params must be in dictionary-format: {param_name : param_val}') ## Build header inside inherited BingSearch class. BingSearch.__init__(self, api_key=api_key, query=query, safe=safe, header_dict=header_dict) ## Run query validations is_ok = QueryChecker.check_web_params(self.param_dict, self.header) if is_ok: print('Query params PASSED validation.') else: raise AttributeError('query checker has a bug') print(('run .search() to run query and print json returned\ncurrent URL format is {}'.format( self.BASE_URL))) def _search(self, limit, override=False, newquery=None): """ Meat-&Potatoes of the search. Inserts search query and makes API call. :param limit: Number of return results. Max is 50 :param override: Set to True if you intend to use 'newquery' to modify the query on the fly :param newquery: enter new query value if you so choose. Will not change query params. :return json_results: list of packaged JSON results returned from Microsoft. see WebResult class below. """ # Allow _search to initialize new query. if override and newquery: self.query = newquery self.current_offset = 0 self.total_estimated_matches = None elif override and not newquery: raise AssertionError('query override has been activated but you have not specified a new query.') # Modify some variable/nonvariable params to enable paging and restrict query to webpages. if 'q' in list(self.param_dict.keys()): if override: del self.param_dict['q'] self.param_dict.prepend('q', self._insert_web_search_query(override=override, newquery=newquery)) else: print(('keeping {} as search-query value'.format(self.query))) pass else: self.param_dict.prepend('q', self._insert_web_search_query(override=override, newquery=newquery)) self.param_dict['offset'] = self.current_offset self.param_dict['responseFilter'] = 'Webpages' if limit > 50 or limit < 1: raise ValueError('limit must be positive integer b/w 1 and 50') else: self.param_dict['count'] = str(limit) # Query the API. Receive response object. try: response_object = requests.get(self.BASE_URL, params=self.param_dict, headers=self.header) except requests.Timeout: print('requests module timed out. Returning NoneType') return None # Handle error-codes and Preempt garbage results if URL is too long. if len(response_object.url) > 1500: raise ValueError('URL too long. Limit URLs to < 1,200 chars.') response_validated = ResponseChecker.validate_request_response(response_object) if response_validated == '429': response_object = self.handle_429_error(url=response_object.url) else: pass # Return packaged JSON or HTML. Update 'current_offset' and 'last_response...' caches. self.last_response = response_object self.last_url_sent = response_object.url if 'textFormat' in list(self.param_dict.keys()) and self.param_dict['textFormat']: if self.param_dict['textFormat'].upper() == 'HTML': self.current_offset += min(50, limit) print('returning HTML w/o packaging. .last_response_packaged will remain set to None.') return response_object.text() else: packaged_json = self.parse_json(response_object.json()) self.last_response_packaged = packaged_json self.current_offset += min(50, limit, len(packaged_json)) return packaged_json def parse_json(self, json_response): """ Takes raw JSON response and packages them as instances of class WebResult. :param json_response: EX -- .json() :return list of WebResult objects: parsed and prettied JSON results with easy data-access. Returned as a LIST of WebResult objects with len == the # of links returned by Bing. """ if not self.total_estimated_matches: print(('Bing says there are an estimated {} results matching your query'.format(json_response['webPages']['totalEstimatedMatches']))) self.total_estimated_matches = int(json_response['webPages']['totalEstimatedMatches']) packaged_json = [WebResult(single_json_entry) for single_json_entry in json_response['webPages']['value']] return packaged_json # return response_data # packaged_results = [WebResult(single_result_json) for single_result_json in json_results['d']['results']] # self.current_offset += min(50, limit, len(packaged_results)) # return packaged_results def handle_429_error(self, url): timeout_cnt = 0 while True: if timeout_cnt < 5: sleep(2) r2 = requests.get(url, self.header) if ResponseChecker.validate_request_response(r2) == '429': timeout_cnt += 1 pass elif r2.status_code == 200: break else: raise AssertionError('response not successful') else: raise IOError(static_constants.ERROR_CODES['429']) return r2 def _insert_web_search_query(self, override=False, newquery=None): if override: return newquery else: return self.query class WebResult(object): ''' Attributes which can be called from WebResult instance(WRi) -- WRi.json: full JSON entry. WRi.url: The URL sent back by Bing. WRi.display_url: Display URL. Not always accurate. WRi.name: The title of the page linked to by WRi.url. WRi.snippet: A snippet of text from the page linked to by WRi.url. WRi.id: the index value for this JSON entry. Used primarily for compound queries. ''' def __init__(self, result): self.json = result self.url = result.get('url') self.display_url = result.get('displayUrl') self.name = result.get('name') self.snippet = result.get('snippet') self.id = result.get('id') try: self.date_crawled = result.get('dateLastCrawled') self.about = result.get('about') except Exception: self.date_crawled = None self.about = None # maintain compatibility self.title = result.get('name') self.description = result.get('snippet') def __repr__(self): return 'WebResponse Obj: {}'.format(self.display_url)PK$IwI73rrpycogserv/__init__.py"""Docstring goes here""" __version__="0.0.2" from pycogserv import constants from pycogserv import SearchWebPK IwÎpycogserv/__main__.py"""Command line parsing""" from pycogserv import SearchWeb import sys if len(sys.argv)==1: print("--help for more options") elif sys.argv[1] == '--help': ''' TODO: ENTER OTHER OPTIONS HERE AND WITH ELSE STATEMENT''' print ('enter api_key, query to start search') else: search = SearchWeb.BingWebSearch(sys.argv[1],sys.argv[2]).search() print(search) PK GwI),b%b%pycogserv/constants.pyfrom _socket import gethostbyname, gethostname # from fake_useragent import UserAgent as UA from pycogserv.dict_mod import OrderedDictWithPrepend class user_constants(): """ This class gives access to the default headers and query-modifiers used when a BingWebSearch object is instantiated. Change the 'None' values to fit your use-case. Any value maked 'None' will be ignored. ANY & ALL other values will not be ignored. Also, DON'T CHANGE THE DICTIONARY KEYS!!!!!! They're defined by Microsoft. These values can also all be accessed and changed on the fly from the REPL. """ ############################################### ## DON'T TOUCH ## ############################################### HEADERS = OrderedDictWithPrepend() INCLUDED_PARAMS = OrderedDictWithPrepend() ############################################### ## Enter default-header customizations here. ## ############################################### HEADERS['User-Agent'] = "Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:16.0.1) Gecko/20121011 Firefox/16.0.1" HEADERS['X-Search-ClientIP'] = gethostbyname(gethostname()) HEADERS['X-MSEdge-ClientID'] = None HEADERS['Accept'] = None HEADERS['Accept-Language'] = None HEADERS['X-Search-Location'] = None ############################################### ## Enter query customizations here. ## ############################################### ## Web Params: INCLUDED_PARAMS['cc'] = None # <--(See constants.COUNTRY_CODES below for available options) INCLUDED_PARAMS['count'] = None # <--(Enter a number from 0-50. Must by type==str. EX: count of 5 should be "5") INCLUDED_PARAMS['freshness'] = None # <--(Poss values are 'Day', 'Week', or 'Month') INCLUDED_PARAMS['mkt'] = None # <--(See constants.MARKET_CODES below for available options) INCLUDED_PARAMS['offset'] = None # <--(Use this in conjunction with totalEstimatedMatches and count to page. Same format as 'count') INCLUDED_PARAMS['responseFilter'] = None # <--(Poss values are 'Computation', 'Images', 'News', 'RelatedSearches', SpellSuggestions', 'TimeZone', 'Videos', or 'Webpages') INCLUDED_PARAMS['safeSearch'] = None # <--(Poss values are 'Off', 'Moderate', and 'Strict.') INCLUDED_PARAMS['setLang'] = None # <--(See ISO 639-1, 2-letter language codes here: https://www.loc.gov/standards/iso639-2/php/code_list.php) INCLUDED_PARAMS['textDecorations'] = None # <--(Case-insensitive boolean. '(t|T)rue', or '(f|F)alse') INCLUDED_PARAMS['textFormat'] = None # <--(Poss values are 'Raw', and 'HTML.' Default is 'Raw' if left blank.) class static_constants(): """ These are both for reference and are used extensively in _methods() So don't modify them!!!!! Just lookie-lookie. """ #################################################### ## ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! !## ## DO NOT modify ANY of the constants below ## ## ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! !## #################################################### #################################################### ## BASE_QUERY_PARAMS[0] & [1] are special! ## #################################################### BASE_QUERY_PARAMS = ( 'q', # <-- if you change this to 'images/search?q' or 'news/search?q' it will change ze behavior much like repsonsefileter 'category', # <--news only 'cc', 'count', 'freshness', 'mkt', 'offset', 'responseFilter', 'safeSearch', 'setLang', 'textDecorations', # <-- bool 'textFormat', ) #################################################### ## Use these in conjunction w/ the cc param ## #################################################### COUNTRY_CODES = { 'Australia': 'AR', 'Austria': 'AU', 'Belgium': 'AT', 'Brazil': 'BE', 'Canada': 'BR', 'Chile': 'CA', 'Denmark': 'CL', 'Finland': 'DK', 'France': 'FI', 'Germany': 'FR', 'Hong Kong SAR': 'DE', 'India': 'HK', 'Indonesia': 'IN', 'Ireland': 'ID', 'Italy': 'IE', 'Japan': 'IT', 'Korea': 'JP', 'Malaysia': 'KR', 'Mexico': 'MY', 'NO': 'CN', 'Netherlands': 'MX', 'New Zealand': 'NL', 'Norway': 'NZ', 'Poland': 'PL', 'Portugal': 'PT', 'Republic of the Philippines': 'PH', 'Russia': 'RU', 'Saudi Arabia': 'SA', 'South Africa': 'ZA', 'Spain': 'ES', 'Sweden': 'SE', 'Switzerland': 'CH', 'Taiwan': 'TW', 'Turkey': 'TR', 'United Kingdom': 'GB', 'United States': 'US' } #################################################### ## Use these in conjunction w/ the mkt param ## #################################################### MARKET_CODES = { 'Argentina-Spanish': 'es-AR', 'Australia-English': 'en-AU', 'Austria-German': 'de-AT', 'Belgium-Dutch': 'nl-BE', 'Belgium-French': 'fr-BE', 'Brazil-Portuguese': 'pt-BR', 'Canada-English': 'en-CA', 'Canada-French': 'fr-CA', 'Chile-Spanish': 'es-CL', 'Denmark-Danish': 'da-DK', 'Finland-Finnish': 'fi-FI', 'France-French': 'fr-FR', 'Germany-German': 'de-DE', 'Hong Kong SAR-Traditional Chinese': 'zh-HK', 'India-English': 'en-IN', 'Indonesia-English': 'en-ID', 'Ireland-English': 'en-IE', 'Italy-Italian': 'it-IT', 'Japan-Japanese': 'ja-JP', 'Korea-Korean': 'ko-KR', 'Malaysia-English': 'en-MY', 'Mexico-Spanish': 'es-MX', 'Netherlands-Dutch': 'nl-NL', 'New Zealand-English': 'en-NZ', 'Norway-Norwegian': 'no-NO', "People's republic of China-Chinese": 'zh-CN', 'Poland-Polish': 'pl-PL', 'Portugal-Portuguese': 'pt-PT', 'Republic of the Philippines-English': 'en-PH', 'Russia-Russian': 'ru-RU', 'Saudi Arabia-Arabic': 'ar-SA', 'South Africa-English': 'en-ZA', 'Spain-Spanish': 'es-ES', 'Sweden-Swedish': 'sv-SE', 'Switzerland-French': 'fr-CH', 'Switzerland-German': 'de-CH', 'Taiwan-Traditional Chinese': 'zh-TW', 'Turkey-Turkish': 'tr-TR', 'United Kingdom-English': 'en-GB', 'United States-English': 'en-US', 'United States-Spanish': 'es-US' } #################################################### ## Various error codes and their descriptions ## #################################################### ERROR_CODES = { '200' : 'The call succeeded', '400' : 'One of the query parameters is missing or not valid', '401' : 'The subscription key is missing or not valid', '403' : "The user is authenticated but doesn't have permission to the requested resource. Bing may also return this status if the caller exceeded their queries per month quota", '404' : 'Page not found: Bing should not be throwing this error. There is likely a fundamental problem with the structure of your query URL.', '410' : 'The request was made using HTTP. Only HTTPS is supported.(BASE_ENDPOINT USES HTTPS. EITHER YOU CHANGED THAT OR YOU ARE NOT AT FAULT)', '429' : 'The user exceeded their queries per second quota', } #################################################### ## These are for adding to the base url ## #################################################### SPECIALTY_APIS = { 'images': 'images/', 'images_trending': 'images/trending/', # <-- works only for mkt= en-US, en-CA, and en-AU 'videos': 'videos/', 'videos_trending': 'videos/trending/', 'videos_details': 'videos/details/', 'news': 'news/', 'news_trending': 'news/trendingtopics/' # <-- works only for en-US and zh-CN } ##################################################### ## This is the basis for all endpoint permutations ## ## available for Bing Search APIs through Azure ## ## Cognitive Services ## ##################################################### BASE_ENDPOINT = 'https://api.cognitive.microsoft.com/bing/v5.0/' ## Commented out Endpoint URLs have special format which is not defined by .../search?q=... ## These are not yet supported. WEBSEARCH_ENDPOINT = BASE_ENDPOINT + 'search?' IMAGESEARCH_ENDPOINT = BASE_ENDPOINT + SPECIALTY_APIS['images'] + 'search?' # IMAGESEARCH_TRENDING_ENDPOINT = BASE_ENDPOINT + SPECIALTY_APIS['images_trending'] + ????? VIDEOSEARCH_ENDPOINT = BASE_ENDPOINT + SPECIALTY_APIS['videos'] + 'search?' # VIDEOSEARCH_TRENDING_ENDPOINT = BASE_ENDPOINT + SPECIALTY_APIS['videos_trending'] + ????? # VIDEOSEARCH_DETAILS_ENDPOINT = BASE_ENDPOINT + SPECIALTY_APIS['images_details'] + ????? NEWSSEARCH_ENDPOINT = BASE_ENDPOINT + SPECIALTY_APIS['news'] + 'search?' # NEWSSEARCH_TRENDING_ENDPOINT = BASE_ENDPOINT + SPECIALTY_APIS['news_trending'] + ?????PKDEwIxvpycogserv/dict_mod.pyfrom collections import OrderedDict class OrderedDictWithPrepend(OrderedDict): """ Extends OrderedDict functionality by adding 'prepend' method. Format is: .prepend(key, value) """ def prepend(self, key, value, dict_setitem=dict.__setitem__): # Python3 try: self.update({key:value}) self.move_to_end(key,last=False) # Python2 except: root = self._OrderedDict__root first = root[1] if key in self: link = self._OrderedDict__map[key] link_prev, link_next, _ = link link_prev[1] = link_next link_next[0] = link_prev link[0] = root link[1] = first root[1] = first[0] = link else: root[1] = first[0] = self._OrderedDict__map[key] = [root, first, key] dict_setitem(self, key, value)PKDEwIpycogserv/validations.pyfrom pycogserv.constants import static_constants class QueryChecker(): """ Isolated human-error-checker class. All methods are static and do not modify state. if/else mess below forgoes optimization in favor of clarity. """ @staticmethod def check_web_params(query_dict, header_dict): responseFilters = ('Computation', 'Images', 'News', 'RelatedSearches', 'SpellSuggestions', 'TimeZone', 'Videos', 'Webpages') if 'cc' in list(query_dict.keys()): if query_dict['cc'] and not header_dict['Accept-Language']: raise AssertionError('Attempt to use cc_country-cc_code without specifying language.') if query_dict['mkt']: raise ReferenceError('cc and mkt cannot be specified simultaneously') if 'count' in list(query_dict.keys()) and query_dict['count']: if int(query_dict['count']) >= 51 or int(query_dict['count']) < 0: raise ValueError('Count specified out of range. 50 max objects returned.') if 'freshness' in list(query_dict.keys()) and query_dict['freshness']: if query_dict['freshness'] not in ('Day', 'Week', 'Month'): raise ValueError('Freshness must be == Day, Week, or Month. Assume Case-Sensitive.') if 'offset' in list(query_dict.keys()) and query_dict['offset']: if int(query_dict['offset']) < 0: raise ValueError('Offset cannot be negative.') if 'responseFilter' in list(query_dict.keys()) and query_dict['responseFilter']: if query_dict['responseFilter'] not in responseFilters: raise ValueError('Improper response filter.') if 'safeSearch' in list(query_dict.keys()) and query_dict['safeSearch']: if query_dict['safeSearch'] not in ('Off', 'Moderate', 'Strict'): raise ValueError('safeSearch setting must be Off, Moderate, or Strict. Assume Case-Sensitive.') if 'X-Search-ClientIP' in list(header_dict.keys()) and header_dict['X-Search-ClientIP']: eval(input('You have specified both an X-Search-ClientIP header and safesearch setting\nplease note: header takes precedence')) if 'setLang' in list(query_dict.keys()): if 'Accept-Language' in list(header_dict.keys()) and header_dict['Accept-Language']: raise AssertionError('Attempt to use both language header and query param.') if 'textDecorations' in list(query_dict.keys()) and query_dict['textDecorations']: if query_dict['textDecorations'].lower() not in ('true', 'false'): raise TypeError('textDecorations is type bool') if 'textFormat' in list(query_dict.keys()) and query_dict['textFormat']: if query_dict['textFormat'] not in ('Raw', 'HTML'): raise ValueError('textFormat must be == Raw or HTML. Assume Case-Sensitive.') return True class ResponseChecker(): """ Meant to examine returned objects and check/handle errors. """ @staticmethod def validate_request_response(response): """ Return nothing if valid response object returned. Otherwise handle or throw exceptions :param response: requests.response object. :return: func will pass or raise exception. That's all. """ if not response.status_code == 200: if response.status_code == 429: print('queries/second quota exceeded. this func will make 5 attempts to resend.') return '429' elif str(response.status_code) in list(static_constants.ERROR_CODES.keys()): raise AssertionError(static_constants.ERROR_CODES[str(response.status_code)]) else: raise ReferenceError('unknown status code returned: {}\nurl string is: {}'.format(response.status_code, response.url)) else: return True PK!H|&Ubpycogserv-0.0.2.dist-info/WHEEL HM K-*ϳR03rOK-J,/RH,Q034 /, (-JLR()*M ILR(4KM̫#DPK!H"pycogserv-0.0.2.dist-info/METADATA=MN0O1pRXz ~Swj[3)8U`ި Ox2pܫ#HeI5Zs(GH8 G ]|))uXplӢ+ok4_WxG>m4E Mٻ5z1p T6 ]JW:Kh=BS PK!HnE pycogserv-0.0.2.dist-info/RECORD}r@@}iҋ,Z@G7RSE! 4'LM~Խݔe_ܟ"gUXU :;6kլy>qg[*N 8ߺ^`i85&5 ^h,-DJ6Rư,L1{ 5I9AFq)WW2$ΰ%=M g:zhxh'7{@aq9h3h@엗pNW%~e}p%H+pl { Sgl#ŪsO¨ySXS[ʳS/C.o|vZ`.0BMUowHxu')h9~RGVz*֓e@di09_TK< '9#/'|ZŖwf(<>d8 _dG0oPKDEwI8o0o0pycogserv/SearchWeb.pyPK$IwI73rr0pycogserv/__init__.pyPK IwÎH1pycogserv/__main__.pyPK GwI),b%b%2pycogserv/constants.pyPKDEwIxvXpycogserv/dict_mod.pyPKDEwI\pycogserv/validations.pyPK!H|&Ubslpycogserv-0.0.2.dist-info/WHEELPK!H"mpycogserv-0.0.2.dist-info/METADATAPK!HnE npycogserv-0.0.2.dist-info/RECORDPK o