PK<šGJogëë"retriever-2.0.0.data/data/CITATIONMorris, B.D. and E.P. White. 2013. The EcoData Retriever: improving access to existing ecological data. PLOS ONE 8:e65848. http://doi.org/doi:10.1371/journal.pone.0065848 @article{morris2013ecodata, title={The EcoData Retriever: Improving Access to Existing Ecological Data}, author={Morris, Benjamin D and White, Ethan P}, journal={PLOS One}, volume={8}, number={6}, pages={e65848}, year={2013}, publisher={Public Library of Science} doi={10.1371/journal.pone.0065848} } PK½nXJ¿¡“KQQretriever/__main__.py"""Data Retriever Wizard Running this module directly will launch the download wizard, allowing the user to choose from all scripts. The main() function can be used for bootstrapping. """ from __future__ import print_function from __future__ import absolute_import from builtins import str from builtins import input from imp import reload import os import platform import sys # sys removes the setdefaultencoding method at startup; reload to get it back reload(sys) if hasattr(sys, 'setdefaultencoding'): # set default encoding to latin-1 to decode source text sys.setdefaultencoding('latin-1') from retriever import VERSION, SCRIPT_LIST, HOME_DIR, sample_script, CITATION from retriever.engines import engine_list from retriever.lib.repository import check_for_updates from retriever.lib.tools import choose_engine, name_matches, reset_retriever from retriever.lib.get_opts import parser from retriever.lib.datapackage import create_json, edit_json def main(): """This function launches the Data Retriever.""" if len(sys.argv) == 1: # if no command line args are passed, show the help options parser.parse_args(['-h']) else: # otherwise, parse them script_list = SCRIPT_LIST() args = parser.parse_args() if args.command == "install" and not args.engine: parser.parse_args(['install','-h']) if args.quiet: sys.stdout = open(os.devnull, 'w') if args.command == 'help': parser.parse_args(['-h']) if hasattr(args, 'compile') and args.compile: script_list = SCRIPT_LIST(force_compile=True) if args.command == 'defaults': for engine_item in engine_list: print("Default options for engine ", engine_item.name) for default_opts in engine_item.required_opts: print(default_opts[0], " ", default_opts[2]) print() return if args.command == 'update': check_for_updates() script_list = SCRIPT_LIST() return elif args.command == 'citation': if args.dataset is None: print("\nCitation for retriever:\n") print(CITATION) else: scripts = name_matches(script_list, args.dataset) for dataset in scripts: print("\nDataset: {}".format(dataset.name)) print("Citation: {}".format(dataset.citation)) print("Description: {}\n".format(dataset.description)) return elif args.command == 'new': f = open(args.filename, 'w') f.write(sample_script) f.close() return elif args.command == 'reset': reset_retriever(args.scope) return elif args.command == 'new_json': # create new JSON script create_json() return elif args.command == 'edit_json': # edit existing JSON script for json_file in [filename for filename in os.listdir(os.path.join(HOME_DIR, 'scripts')) if filename[-5:] == '.json']: if json_file.lower().find(args.filename.lower()) != -1: edit_json(json_file) return raise Exception("File not found") elif args.command == 'delete_json': # delete existing JSON script for json_file in [filename for filename in os.listdir(os.path.join(HOME_DIR, 'scripts')) if filename[-5:] == '.json']: if json_file.lower().find(args.dataset.lower()) != -1: confirm = input("Really remove " + json_file + " and all its contents? (y/N): ") if confirm.lower().strip() in ['y', 'yes']: # raise Exception(json_file) os.remove(os.path.join(HOME_DIR, 'scripts', json_file)) try: os.remove(os.path.join( HOME_DIR, 'scripts', json_file[:-4] + 'py')) except: # Not compiled yet pass return raise Exception("File not found") if args.command == 'ls': # If scripts have never been downloaded there is nothing to list if not script_list: print("No scripts are currently available. Updating scripts now...") check_for_updates() print("\n\nScripts downloaded.\n") script_list = SCRIPT_LIST() all_scripts = [] for script in script_list: if script.shortname: if args.l is not None: script_name = script.name + "\nShortname: " + script.shortname + "\n" if script.tags: script_name += "Tags: " + \ str([tag for tag in script.tags]) + "\n" not_found = 0 for term in args.l: if script_name.lower().find(term.lower()) == -1: not_found = 1 break if not_found == 0: all_scripts.append(script_name) else: script_name = script.shortname all_scripts.append(script_name) all_scripts = sorted(all_scripts, key=lambda s: s.lower()) print("Available datasets : {}\n".format(len(all_scripts))) if args.l is None: from retriever import lscolumns lscolumns.printls(sorted(all_scripts, key=lambda s: s.lower())) else: count = 1 for script in all_scripts: print("%d. %s" % (count, script)) count += 1 return engine = choose_engine(args.__dict__) if hasattr(args, 'debug') and args.debug: debug = True else: debug = False sys.tracebacklimit = 0 if args.dataset is not None: scripts = name_matches(script_list, args.dataset) else: raise Exception("no dataset specified.") if scripts: for dataset in scripts: print("=> Installing", dataset.name) try: dataset.download(engine, debug=debug) dataset.engine.final_cleanup() except KeyboardInterrupt: pass except Exception as e: print(e) if debug: raise print("Done!") else: print("The dataset {} isn't currently available in the Retriever".format( args.dataset)) print("Run 'retriever ls to see a list of currently available datasets") if __name__ == "__main__": main() PKBˆXJ^þÑÚretriever/_version.py__version__ = 'v2.0.0' PK<šGJ¯Éã³¾¾retriever/lscolumns.pyfrom __future__ import print_function from __future__ import division from __future__ import absolute_import from builtins import zip from builtins import range import sys from retriever.term_size import get_terminal_size def get_columns(values, cols): columns = [] col_size = len(values) // cols extra = len(values) % cols n = 0 for i in range(cols): s = col_size if i + 1 <= extra: s += 1 this_column = values[n:n + s] columns.append(this_column) n += s return columns def printls(values, max_width=None, spacing=2): if sys.stdout.isatty() and max_width is None: cols, lines = get_terminal_size() max_width = cols if max_width: # if output to terminal or max_width is specified, use column output for cols in [int((len(values) // float(i)) + 0.5) for i in range(1, len(values) + 1)]: columns = get_columns(values, cols) widths = [max([len(c) for c in column]) + spacing for column in columns] if sum(widths) < max_width: break for pos in range(len(columns[0])): for column, width in zip(columns, widths): if len(column) > pos: print(column[pos].ljust(width - 1), end=' ') print() else: # otherwise, just output each value, one per line for value in values: print(value) PK½nXJ‹êýW W retriever/__init__.py"""Data Retriever This package contains a framework for creating and running scripts designed to download published ecological data, and store the data in a database. """ from __future__ import print_function from __future__ import absolute_import from builtins import str import io import os import sys import csv from os.path import join, isfile, getmtime, exists from pkg_resources import parse_version import imp import platform from retriever.lib.compile import compile_json from retriever._version import __version__ current_platform = platform.system().lower() if current_platform != 'windows': import pwd VERSION = __version__ COPYRIGHT = "Copyright (C) 2011-2016 Weecology University of Florida" REPO_URL = "https://raw.github.com/weecology/retriever/" MASTER_BRANCH = REPO_URL + "master/" REPOSITORY = MASTER_BRANCH # create the necessary directory structure for storing scripts/raw_data # in the ~/.retriever directory HOME_DIR = os.path.expanduser('~/.retriever/') for dir in (HOME_DIR, os.path.join(HOME_DIR, 'raw_data'), os.path.join(HOME_DIR, 'scripts')): if not os.path.exists(dir): try: os.makedirs(dir) if (current_platform != 'windows') and os.getenv("SUDO_USER"): # owner of .retriever should be user even when installing # w/sudo pw = pwd.getpwnam(os.getenv("SUDO_USER")) os.chown(dir, pw.pw_uid, pw.pw_gid) except OSError: print("The Retriever lacks permission to access the ~/.retriever/ directory.") raise SCRIPT_SEARCH_PATHS = [ "./", 'scripts', os.path.join(HOME_DIR, 'scripts/') ] SCRIPT_WRITE_PATH = SCRIPT_SEARCH_PATHS[-1] DATA_SEARCH_PATHS = [ "./", "{dataset}", "raw_data/{dataset}", os.path.join(HOME_DIR, 'raw_data/{dataset}'), ] DATA_WRITE_PATH = DATA_SEARCH_PATHS[-1] # Create default data directory DATA_DIR = '.' def open_fr(file_name, encoding='ISO-8859-1', encode=True): """Open file for reading respecting Python version and OS differences Sets newline to Linux line endings on Windows and Python 3 When encode=False does not set encoding on nix and Python 3 to keep as bytes """ if sys.version_info >= (3, 0, 0): if os.name == 'nt': file_obj = io.open(file_name, 'r', newline='', encoding=encoding) else: if encode: file_obj = io.open(file_name, "r", encoding=encoding) else: file_obj = io.open(file_name, "r") else: file_obj = io.open(file_name, "r", encoding=encoding) return file_obj def open_fw(file_name, encoding='ISO-8859-1', encode=True): """Open file for writing respecting Python version and OS differences Sets newline to Linux line endings on Python 3 When encode=False does not set encoding on nix and Python 3 to keep as bytes """ if sys.version_info >= (3, 0, 0): if encode: file_obj = io.open(file_name, 'w', newline='', encoding=encoding) else: file_obj = io.open(file_name, 'w', newline='') else: file_obj = io.open(file_name, 'wb',) return file_obj def open_csvw(csv_file, encode=True): """Open a csv writer forcing the use of Linux line endings on Windows Also sets dialect to 'excel' and escape characters to '\\' """ if os.name == 'nt': csv_writer = csv.writer(csv_file, dialect='excel', escapechar='\\', lineterminator='\n') else: csv_writer = csv.writer(csv_file, dialect='excel', escapechar='\\') return csv_writer def to_str(object, object_encoding=sys.stdout): if sys.version_info >= (3, 0, 0): enc = object_encoding.encoding return str(object).encode(enc, errors='backslashreplace').decode("latin-1") else: return object def MODULE_LIST(force_compile=False): """Load scripts from scripts directory and return list of modules.""" modules = [] loaded_scripts = [] for search_path in [search_path for search_path in SCRIPT_SEARCH_PATHS if exists(search_path)]: to_compile = [ file for file in os.listdir(search_path) if file[-5:] == ".json" and file[0] != "_" and ( (not isfile(join(search_path, file[:-5] + '.py'))) or ( isfile(join(search_path, file[:-5] + '.py')) and ( getmtime(join(search_path, file[:-5] + '.py')) < getmtime( join(search_path, file)))) or force_compile)] for script in to_compile: script_name = '.'.join(script.split('.')[:-1]) compile_json(join(search_path, script_name)) files = [file for file in os.listdir(search_path) if file[-3:] == ".py" and file[0] != "_" and '#retriever' in ' '.join(open(join(search_path, file), 'r').readlines()[:2]).lower()] for script in files: script_name = '.'.join(script.split('.')[:-1]) if script_name not in loaded_scripts: loaded_scripts.append(script_name) file, pathname, desc = imp.find_module(script_name, [search_path]) try: new_module = imp.load_module(script_name, file, pathname, desc) if hasattr(new_module.SCRIPT, "retriever_minimum_version"): # a script with retriever_minimum_version should be loaded # only if its compliant with the version of the retriever if not parse_version(VERSION) >= parse_version("{}".format( new_module.SCRIPT.retriever_minimum_version)): print("{} is supported by Retriever version {}".format(script_name, new_module.SCRIPT.retriever_minimum_version)) print("Current version is {}".format(VERSION)) continue # if the script wasn't found in an early search path # make sure it works and then add it new_module.SCRIPT.download modules.append(new_module) except Exception as e: sys.stderr.write("Failed to load script: %s (%s)\nException: %s \n" % ( script_name, search_path, str(e))) return modules def SCRIPT_LIST(force_compile=False): return [module.SCRIPT for module in MODULE_LIST(force_compile)] def ENGINE_LIST(): from retriever.engines import engine_list return engine_list def set_proxy(): """Check for proxies and makes them available to urllib""" proxies = ["https_proxy", "http_proxy", "ftp_proxy", "HTTP_PROXY", "HTTPS_PROXY", "FTP_PROXY"] for proxy in proxies: if os.getenv(proxy): if len(os.environ[proxy]) != 0: for i in proxies: os.environ[i] = os.environ[proxy] break set_proxy() sample_script = """ { "description": "S. K. Morgan Ernest. 2003. Life history characteristics of placental non-volant mammals. Ecology 84:3402.", "homepage": "http://esapubs.org/archive/ecol/E084/093/default.htm", "name": "MammalLH", "resources": [ { "dialect": {}, "mediatype": "text/csv", "name": "species", "schema": {}, "url": "http://esapubs.org/archive/ecol/E084/093/Mammal_lifehistories_v2.txt" } ], "title": "Mammal Life History Database - Ernest, et al., 2003", "urls": { "species": "http://esapubs.org/archive/ecol/E084/093/Mammal_lifehistories_v2.txt" } } """ CITATION = """Morris, B.D. and E.P. White. 2013. The EcoData Retriever: improving access to existing ecological data. PLOS ONE 8:e65848. http://doi.org/doi:10.1371/journal.pone.0065848 @article{morris2013ecodata, title={The EcoData Retriever: Improving Access to Existing Ecological Data}, author={Morris, Benjamin D and White, Ethan P}, journal={PLOS One}, volume={8}, number={6}, pages={e65848}, year={2013}, publisher={Public Library of Science} doi={10.1371/journal.pone.0065848} } """ PK<šGJƒP  retriever/compile.pyfrom __future__ import print_function from __future__ import absolute_import from retriever import MODULE_LIST def compile(): print("Compiling retriever scripts...") MODULE_LIST(force_compile=True) print("done.") if __name__ == "__main__": compile() PK<šGJ|žqŒööretriever/version.py"""Generates a configuration file containing the version number.""" from __future__ import absolute_import import os from retriever import VERSION, MODULE_LIST def get_module_version(): """This function gets the version number of the scripts and returns them in array form.""" modules = MODULE_LIST() scripts = [] for module in modules: if module.SCRIPT.public: if os.path.isfile('.'.join(module.__file__.split('.')[:-1]) + '.json') and module.SCRIPT.version: module_name = module.__name__ + '.json' scripts.append(','.join([module_name, str(module.SCRIPT.version)])) elif os.path.isfile('.'.join(module.__file__.split('.')[:-1]) + '.py') and \ not os.path.isfile('.'.join(module.__file__.split('.')[:-1]) + '.json'): module_name = module.__name__ + '.py' scripts.append(','.join([module_name, str(module.SCRIPT.version)])) scripts = sorted(scripts, key = str.lower) return scripts scripts = get_module_version() if os.path.isfile("version.txt"): os.remove("version.txt") with open("version.txt", "w") as version_file: version_file.write(VERSION) for script in scripts: version_file.write('\n' + script) PK<šGJb¶„ªÁ Á retriever/try_install_all.py"""Attempt to install all datasets into all database management systems This module, when run, attempts to install datasets from all Retriever scripts in the /scripts folder (except for those listed in IGNORE), for each engine in ENGINE_LIST() from __init__.py. In other words, it runs trys to install using all possible combinations of database platform and script and checks to see if there are any errors. It does not check the values in the database. """ from __future__ import print_function from __future__ import absolute_import import os import sys from imp import reload from retriever.lib.tools import choose_engine from retriever import MODULE_LIST, ENGINE_LIST, SCRIPT_LIST reload(sys) if hasattr(sys, 'setdefaultencoding'): sys.setdefaultencoding('latin-1') if os.name == "nt": os_password = "Password12!" else: os_password = "" MODULE_LIST = MODULE_LIST() ENGINE_LIST = ENGINE_LIST() if len(sys.argv) > 1: ENGINE_LIST = [ e for e in ENGINE_LIST if e.name in sys.argv[1:] or e.abbreviation in sys.argv[1:] ] if os.path.exists("test_all"): os.system("rm -r test_all") os.makedirs("test_all") os.chdir("test_all") dbfile = os.path.normpath(os.path.join(os.getcwd(), 'testdb.sqlite')) engine_test = { "postgres": {'engine': 'postgres', 'user': 'postgres', 'password': os_password, 'host': 'localhost', 'port': 5432, 'database': 'postgres', 'database_name': 'testschema', 'table_name': '{db}.{table}'}, "mysql": {'engine': 'mysql', 'user': 'travis', 'password': '', 'host': 'localhost', 'port': 3306, 'database_name': 'testdb', 'table_name': '{db}.{table}'}, "xml": {'engine': 'xml', 'table_name': 'output_file_{table}.xml'}, "json": {'engine': 'json', 'table_name': 'output_file_{table}.json'}, "csv": {'engine': 'csv', 'table_name': 'output_file_{table}.csv'}, "sqlite": {'engine': 'sqlite', 'file': dbfile, 'table_name': '{db}_{table}'} } SCRIPT_LIST = SCRIPT_LIST() TEST_ENGINES = {} IGNORE = ["forest-inventory-analysis", "bioclim", "prism-climate", "vertnet", "NPN", "mammal-super-tree"] IGNORE = [dataset.lower() for dataset in IGNORE] for engine in ENGINE_LIST: if engine.abbreviation in engine_test: try: opts = engine_test[engine.abbreviation] TEST_ENGINES[engine.abbreviation] = choose_engine(opts) except: TEST_ENGINES[engine.abbreviation] = None pass errors = [] for module in MODULE_LIST: for (key, value) in list(TEST_ENGINES.items()): if module.SCRIPT.shortname.lower() not in IGNORE: if value != None: print("==>", module.__name__, value.name, "..........", module.SCRIPT.shortname) try: module.SCRIPT.download(value) except KeyboardInterrupt: pass except Exception as e: print("ERROR.") errors.append((key, module.__name__, e)) else: errors.append((key, "No connection detected......" + module.SCRIPT.shortname)) print('') if errors: print("Engine, Dataset, Error") for error in errors: print(error) else: print("All tests passed") PK<šGJC6¯××retriever/term_size.pyimport os def get_terminal_size(): env = os.environ def ioctl_GWINSZ(fd): try: import fcntl import termios import struct import os cr = struct.unpack('hh', fcntl.ioctl(fd, termios.TIOCGWINSZ, '1234')) except: return return cr cr = ioctl_GWINSZ(0) or ioctl_GWINSZ(1) or ioctl_GWINSZ(2) if not cr: try: fd = os.open(os.ctermid(), os.O_RDONLY) cr = ioctl_GWINSZ(fd) os.close(fd) except: pass if not cr: cr = (env.get('LINES', 25), env.get('COLUMNS', 80)) return int(cr[1]), int(cr[0]) PK<šGJ—úRs??retriever/lib/__init__.py """retriever.lib contains the core Data Retriever modules.""" PK<šGJPý—¯¯retriever/lib/compile.pyfrom builtins import str import json import sys if sys.version_info[0] < 3: from codecs import open script_templates = { "default": """#retriever from retriever.lib.templates import BasicTextTemplate from retriever.lib.models import Table, Cleanup, correct_invalid_value SCRIPT = BasicTextTemplate(%s)""", "html_table": """#retriever from retriever.lib.templates import HtmlTableTemplate from retriever.lib.models import Table, Cleanup, correct_invalid_value SCRIPT = HtmlTableTemplate(%s)""", } def add_dialect(table_dict, table): """ Reads dialect key of JSON script and extracts key-value pairs to store them in python script Contains properties such 'nulls', delimiter', etc """ for (key, val) in table['dialect'].items(): # dialect related key-value pairs # copied as is if key == "missingValues": table_dict[ 'cleanup'] = "Cleanup(correct_invalid_value, nulls=" + str(val) + ")" elif key == "delimiter": table_dict[key] = "'" + str(val) + "'" else: table_dict[key] = val def add_schema(table_dict, table): """ Reads schema key of JSON script and extracts values to store them in python script Contains properties related to table schema, such as 'fields' and cross-tab column name ('ct_column'). """ for (key, val) in table['schema'].items(): # schema related key-value pairs if key == "fields": # fields = columns of the table # list of column tuples column_list = [] for obj in val: # fields is a collection of JSON objects # (similar to a list of dicts in python) if "size" in obj: column_list.append((obj["name"], (obj["type"], obj["size"]))) else: column_list.append((obj["name"], (obj["type"],))) table_dict["columns"] = column_list elif key == "ct_column": table_dict[key] = "'" + val + "'" else: table_dict[key] = val def compile_json(json_file): """ Function to compile JSON script files to python scripts The scripts are created with `retriever create_json 0.6] return [match[0] for match in matches] def final_cleanup(engine): """Perform final cleanup operations after all scripts have run.""" pass config_path = os.path.join(HOME_DIR, 'connections.config') def get_saved_connection(engine_name): """Given the name of an engine, returns the stored connection for that engine from connections.config.""" parameters = {} if os.path.isfile(config_path): config = open(config_path, "r") for line in config: values = line.rstrip('\n').split(',') if values[0] == engine_name: try: parameters = eval(','.join(values[1:])) except: pass return parameters def save_connection(engine_name, values_dict): """Saves connection information for an engine in connections.config.""" lines = [] if os.path.isfile(config_path): config = open(config_path, "r") for line in config: if line.split(',')[0] != engine_name: lines.append('\n' + line.rstrip('\n')) config.close() os.remove(config_path) config = open(config_path, "w") else: config = open(config_path, "w") if "file" in values_dict: values_dict["file"] = os.path.abspath(values_dict["file"]) config.write(engine_name + "," + str(values_dict)) for line in lines: config.write(line) config.close() def get_default_connection(): """Gets the first (most recently used) stored connection from connections.config.""" if os.path.isfile(config_path): config = open(config_path, "r") default_connection = config.readline().split(",")[0] config.close() return default_connection else: return None def choose_engine(opts, choice=True): """Prompts the user to select a database engine""" from retriever.engines import engine_list if "engine" in list(opts.keys()): enginename = opts["engine"] elif opts["command"] == "download": enginename = "download" else: if not choice: return None print("Choose a database engine:") for engine in engine_list: if engine.abbreviation: abbreviation = "(" + engine.abbreviation + ") " else: abbreviation = "" print(" " + abbreviation + engine.name) enginename = input(": ") enginename = enginename.lower() engine = Engine() if not enginename: engine = engine_list[0] else: for thisengine in engine_list: if (enginename == thisengine.name.lower() or thisengine.abbreviation and enginename == thisengine.abbreviation): engine = thisengine engine.opts = opts return engine def reset_retriever(scope): """Remove stored information on scripts, data, and connections""" warning_messages = { 'all': "\nThis will remove existing scripts, cached data, and information on database connections. \nSpecifically it will remove the scripts and raw_data folders and the connections.config file in {}. \nDo you want to proceed? (y/N)\n", 'scripts': "\nThis will remove existing scripts. \nSpecifically it will remove the scripts folder in {}.\nDo you want to proceed? (y/N)\n", 'data': "\nThis will remove raw data cached by the Retriever. \nSpecifically it will remove the raw_data folder in {}. \nDo you want to proceed? (y/N)\n", 'connections': "\nThis will remove stored information on database connections. \nSpecifically it will remove the connections.config file in {}. \nDo you want to proceed? (y/N)\n" } path = os.path.normpath(HOME_DIR) warn_msg = warning_messages[scope].format(path) confirm = input(warn_msg) while not (confirm.lower() in ['y', 'n', '']): print("Please enter either y or n.") confirm = input() if confirm.lower() == 'y': if scope in ['data', 'all']: shutil.rmtree(os.path.join(path, 'raw_data')) if scope in ['scripts', 'all']: shutil.rmtree(os.path.join(path, 'scripts')) if scope in ['connections', 'all']: try: os.remove(os.path.join(path, 'connections.config')) except: pass def json2csv(input_file, output_file=None, header_values=None): """Convert Json file to CSV function is used for only testing and can handle the file of the size """ file_out = open_fr(input_file, encode = False) # set output file name and write header if output_file is None: output_file = os.path.splitext(os.path.basename(input_file))[0] + ".csv" csv_out = open_fw(output_file, encode=False) if os.name == 'nt': outfile = csv.DictWriter(csv_out, dialect='excel', escapechar="\\", lineterminator='\n', fieldnames=header_values) else: outfile = csv.DictWriter(csv_out, dialect='excel', escapechar="\\", fieldnames=header_values) raw_data = json.loads(file_out.read()) outfile.writeheader() for item in raw_data: outfile.writerow(item) file_out.close() os.system("rm -r {}".format(input_file)) return output_file def xml2csv(input_file, outputfile=None, header_values=None, row_tag="row"): """Convert xml to csv function is used for only testing and can handle the file of the size """ file_output = open_fr(input_file, encode=False) # set output file name and write header if outputfile is None: outputfile = os.path.splitext(os.path.basename(input_file))[0] + ".csv" csv_out = open_fw(outputfile) if os.name == 'nt': csv_writer = csv.writer(csv_out, dialect='excel', escapechar='\\', lineterminator='\n') else: csv_writer = csv.writer(csv_out, dialect='excel', escapechar='\\') v = file_output.read() csv_writer.writerow(header_values) tree = ET.parse(newfile(v)) root = tree.getroot() for rows in root.findall(row_tag): x = [name.text for name in header_values for name in rows.findall(name)] csv_writer.writerow(x) file_output.close() os.system("rm -r {}".format(input_file)) return outputfile def getmd5(data, data_type='lines'): """Get MD5 of a data source""" checksum = md5() if data_type == 'lines': for line in data: if type(line) == bytes: checksum.update(line) else: checksum.update(str(line).encode()) return checksum.hexdigest() files = [] if data_type == 'file': files = [os.path.normpath(data)] if data_type == 'dir': for root, directories, filenames in os.walk(os.path.normpath(data)): for filename in sorted(filenames): files.append(os.path.normpath(os.path.join(root, filename))) for file_path in files: # don't use open_fr to keep line endings consistent across OSs if sys.version_info >= (3, 0, 0): if os.name == 'nt': input_file = io.open(file_path, 'r', encoding='ISO-8859-1') else: input_file = open(file_path, 'r', encoding='ISO-8859-1') else: input_file = io.open(file_path, encoding='ISO-8859-1') for line in input_file: if type(line) == bytes: checksum.update(line) else: checksum.update(str(line).encode()) return checksum.hexdigest() def sort_file(file_path): """Sort file by line and return the file function is used for only testing and can handle the file of the size """ file_path = os.path.normpath(file_path) input_file = open_fr(file_path) lines = [line.strip().replace('\x00', '') for line in input_file] input_file.close() outfile = open_fw(file_path) lines.sort() for line in lines: outfile.write(line + "\n") outfile.close() return file_path def sort_csv(filename): """Sort CSV rows minus the header and return the file function is used for only testing and can handle the file of the size """ filename = os.path.normpath(filename) input_file = open_fr(filename) csv_reader_infile = csv.reader(input_file, escapechar="\\") # write the data to a temporary file and sort it temp_path = os.path.normpath("tempfile") temp_file = open_fw(temp_path) csv_writer = open_csvw(temp_file) i = 0 for row in csv_reader_infile: if i == 0: # The first entry is the header line infields = row i += 1 else: csv_writer.writerow(row) input_file.close() temp_file.close() # sort the temp file sorted_txt = sort_file(temp_path) tmp = open_fr(sorted_txt) in_txt = csv.reader(tmp, delimiter=',', escapechar="\\") csv_file = open_fw(filename) csv_writer = open_csvw(csv_file) csv_writer.writerow(infields) csv_writer.writerows(in_txt) tmp.close() csv_file.close() os.remove(os.path.normpath(temp_path)) return filename def create_file(data, output='output_file'): """Writes a string to a file for use by tests""" output_file = os.path.normpath(output) with open(output_file, 'w') as testfile: testfile.write(data) testfile.close() return output_file def file_2string(input_file): """return file contents as a string""" input_file = os.path.normpath(input_file) if sys.version_info >= (3, 0, 0): input = io.open(input_file, 'rU') else: input = io.open(input_file, encoding='ISO-8859-1') obs_out = input.read() return obs_out PK<šGJµ=•‘::retriever/lib/table.pyfrom future import standard_library standard_library.install_aliases() from builtins import next from builtins import object import csv import io import sys from functools import reduce from retriever.lib.cleanup import * class Table(object): """Information about a database table.""" def __init__(self, name, **kwargs): self.name = name self.pk = True self.contains_pk = False self.delimiter = None self.header_rows = 1 self.column_names_row = 1 self.fixed_width = False self.cleanup = Cleanup() self.record_id = 0 self.columns = [] self.replace_columns = [] self.escape_single_quotes = True self.escape_double_quotes = True self.cleaned_columns = False for key, item in list(kwargs.items()): setattr(self, key, item[0] if isinstance(item, tuple) else item) def auto_get_columns(self, header): """Gets the column names from the header row Identifies the column names from the header row. Replaces database keywords with alternatives. Replaces special characters and spaces. """ columns = [self.clean_column_name(x) for x in header] column_values = {x: [] for x in columns if x} self.cleaned_columns = True return [[x, None] for x in columns if x], column_values def clean_column_name(self, column_name): """Clean column names using the expected sql guidelines remove leading whitespaces, replace sql key words, etc.. """ column_name = column_name.lower().strip().replace("\n", "") replace_columns = {old.lower(): new.lower() for old, new in self.replace_columns} column_name = str(replace_columns.get(column_name, column_name).strip()) replace = [ ("%", "percent"), ("&", "and"), ("\xb0", "degrees"), ("^", "_power_"), ("<", "_lthn_"), (">", "_gthn_"), ] replace += [(x, '') for x in (")", "?", "#", ";" "\n", "\r", '"', "'")] replace += [(x, '_') for x in (" ", "(", "/", ".", "-", "*", ":", "[", "]")] column_name = reduce(lambda x, y: x.replace(*y), replace, column_name) while "__" in column_name: column_name = column_name.replace("__", "_") column_name = column_name.lstrip("0123456789_").rstrip("_") replace_dict = { "group": "grp", "order": "ordered", "check": "checked", "references": "refs", "long": "lon", "column": "columns", "cursor": "cursors", "delete": "deleted", "insert": "inserted", "join": "joins", "select": "selects", "table": "tables", "update": "updates", "date": "record_date" } for x in (")", "\n", "\r", '"', "'"): replace_dict[x] = '' for x in (" ", "(", "/", ".", "-"): replace_dict[x] = '_' if column_name in replace_dict: column_name = replace_dict[column_name] return column_name def combine_on_delimiter(self, line_as_list): """Combine a list of values into a line of csv data""" dialect = csv.excel dialect.escapechar = "\\" if sys.version_info >= (3, 0): writer_file = io.StringIO() else: writer_file = io.BytesIO() writer = csv.writer(writer_file, dialect=dialect, delimiter=self.delimiter) writer.writerow(line_as_list) return writer_file.getvalue() def values_from_line(self, line): linevalues = [] if self.columns[0][1][0] == 'pk-auto': column = 1 else: column = 0 for value in line: try: this_column = self.columns[column][1][0] # If data type is "skip" ignore the value if this_column == "skip": pass elif this_column == "combine": # If "combine" append value to end of previous column linevalues[-1] += " " + value else: # Otherwise, add new value linevalues.append(value) except: # too many values for columns; ignore pass column += 1 # make sure we have enough values by padding with None keys = self.get_insert_columns(join=False, create=False) if len(linevalues) < len(keys): linevalues.extend([None for _ in range(len(keys) - len(linevalues))]) return linevalues def get_insert_columns(self, join=True, create=False): """Gets column names for insert statements `create` should be set to `True` if the returned values are going to be used for creating a new table. It includes the `pk_auto` column if present. This column is not included by default because it is not used when generating insert statements for database management systems. """ columns = [] if not self.cleaned_columns: column_names = list(self.columns) self.columns[:] = [] self.columns = [(self.clean_column_name(name[0]), name[1]) for name in column_names] self.cleaned_columns = True for item in self.columns: if not create and item[1][0] == 'pk-auto': # don't include this columns if create=False continue thistype = item[1][0] if (thistype != "skip") and (thistype != "combine"): columns.append(item[0]) if join: return ", ".join(columns) else: return columns def get_column_datatypes(self): """Gets a set of column names for insert statements.""" columns = [] for item in self.get_insert_columns(False): for column in self.columns: if item == column[0]: columns.append(column[1][0]) return columns PK<šGJô僽¬¬retriever/lib/cleanup.pyfrom builtins import object def floatable(value): """Check if a value can be converted to a float""" try: float(value) return True except ValueError: return False def correct_invalid_value(value, args): """This cleanup function replaces null indicators with None.""" try: if value in [item for item in args["nulls"]]: return None if float(value) in [float(item) for item in args["nulls"] if floatable(item)]: return None return value except: return value def no_cleanup(value, args): """Default cleanup function, returns the unchanged value.""" return value class Cleanup(object): """This class represents a custom cleanup function and a dictionary of arguments to be passed to that function.""" def __init__(self, function=no_cleanup, **kwargs): self.function = function self.args = kwargs PK<šGJŸMÞ!¨¨retriever/lib/templates.py"""Class models for dataset scripts from various locations. Scripts should inherit from the most specific class available.""" from __future__ import print_function from builtins import object import os import shutil from retriever import DATA_DIR from retriever.lib.models import * from retriever.lib.tools import choose_engine class Script(object): """This class represents a database toolkit script. Scripts should inherit from this class and execute their code in the download method.""" def __init__(self, name="", description="", shortname="", urls=dict(), tables=dict(), ref="", public=True, addendum=None, citation="Not currently available", retriever_minimum_version="", version="", **kwargs): self.name = name self.shortname = shortname self.filename = __name__ self.description = description self.urls = urls self.tables = tables self.ref = ref self.public = public self.addendum = addendum self.citation = citation self.tags = [] self.retriever_minimum_version = retriever_minimum_version self.version = version for key, item in list(kwargs.items()): setattr(self, key, item[0] if isinstance(item, tuple) else item) def __str__(self): desc = self.name if self.reference_url(): desc += "\n" + self.reference_url() return desc def download(self, engine=None, debug=False): self.engine = self.checkengine(engine) self.engine.debug = debug self.engine.db_name = self.shortname self.engine.create_db() def reference_url(self): if self.ref: return self.ref else: if len(self.urls) == 1: return self.urls[list(self.urls.keys())[0]] else: return None def checkengine(self, engine=None): if engine is None: opts = {} engine = choose_engine(opts) engine.get_input() engine.script = self return engine def exists(self, engine=None): if engine: return engine.exists(self) else: return False def matches_terms(self, terms): try: search_string = ' '.join([ self.name, self.description, self.shortname ] + self.tags).upper() for term in terms: if not term.upper() in search_string: return False return True except: return False class BasicTextTemplate(Script): """Script template based on data files from Ecological Archives.""" def __init__(self, **kwargs): Script.__init__(self, **kwargs) def download(self, engine=None, debug=False): Script.download(self, engine, debug) for key in list(self.urls.keys()): if key not in list(self.tables.keys()): self.tables[key] = Table(key, cleanup=Cleanup(correct_invalid_value, nulls=[-999])) for key, value in list(self.urls.items()): self.engine.auto_create_table(self.tables[key], url=value) self.engine.insert_data_from_url(value) self.tables[key].record_id = 0 return self.engine def reference_url(self): if self.ref: return self.ref else: if len(self.urls) == 1: return '/'.join(self.urls[list(self.urls.keys())[0]].split('/')[0:-1]) + '/' class DownloadOnlyTemplate(Script): """Script template for non-tabular data that are only for download""" def __init__(self, **kwargs): Script.__init__(self, **kwargs) def download(self, engine=None, debug=False): if engine.name != "Download Only": raise Exception("This dataset contains only non-tabular data files, and can only be used with the 'download only' engine.\nTry 'retriever download datasetname instead.") Script.download(self, engine, debug) for filename, url in self.urls.items(): self.engine.download_file(url, filename) if os.path.exists(self.engine.format_filename(filename)): shutil.copy(self.engine.format_filename(filename), DATA_DIR) else: print("{} was not downloaded".format(filename)) print("A file with the same name may be in your working directory") class HtmlTableTemplate(Script): """Script template for parsing data in HTML tables""" pass TEMPLATES = [ ("Basic Text", BasicTextTemplate), ("HTML Table", HtmlTableTemplate), ] PK½nXJ–ƒ!vÍÍretriever/lib/get_opts.pyimport argparse from retriever import VERSION from retriever.engines import engine_list parser = argparse.ArgumentParser(prog="retriever") parser.add_argument('-v', '--version', action='version', version=VERSION) parser.add_argument('-q', '--quiet', help='suppress command-line output', action='store_true') # .............................................................. # subparsers # .............................................................. # retriever HELP subparsers = parser.add_subparsers(help='sub-command help', dest='command') # retriever download/install/update/new help download_parser = subparsers.add_parser('download', help='download raw data files for a dataset') install_parser = subparsers.add_parser('install', help='download and install dataset') default_parser = subparsers.add_parser('defaults', help='displays default options') update_parser = subparsers.add_parser('update', help='download updated versions of scripts') new_parser = subparsers.add_parser('new', help='create a new sample retriever script') new_json_parser = subparsers.add_parser('new_json', help='CLI to create retriever datapackage.json script') edit_json_parser = subparsers.add_parser('edit_json', help='CLI to edit retriever datapackage.json script') delete_json_parser = subparsers.add_parser('delete_json', help='CLI to remove retriever datapackage.json script') ls_parser = subparsers.add_parser('ls', help='display a list all available dataset scripts') citation_parser = subparsers.add_parser('citation', help='view citation') reset_parser = subparsers.add_parser('reset', help='reset retriever: removes configation settings, scripts, and cached data') help_parser = subparsers.add_parser('help', help='') # .............................................................. # subparsers with Arguments # .............................................................. citation_parser.add_argument('dataset', help='dataset name', nargs='?', default=None) new_parser.add_argument('filename', help='new script filename') edit_json_parser.add_argument('filename', help='script filename') reset_parser.add_argument('scope', help='things to reset: all, scripts, data, or connections', choices=['all', 'scripts', 'data', 'connections']) install_parser.add_argument('--compile', help='force re-compile of script before downloading', action='store_true') install_parser.add_argument('--debug', help='run in debug mode', action='store_true') download_parser.add_argument('dataset', help='dataset name') ls_parser.add_argument('-l', help='verbose list of datasets containing following keywords (lists all when no keywords are specified)', nargs='*') delete_json_parser.add_argument('dataset', help='dataset name') # retriever Install {Engine} .. # retriever download [options] install_subparsers = install_parser.add_subparsers(help='engine-specific help', dest='engine') for engine in engine_list: if engine.name == "Download Only": # skip the Download engine and just add attributes pass else: engine_parser = install_subparsers.add_parser(engine.abbreviation, help=engine.name) engine_parser.add_argument('dataset', help='dataset name') abbreviations = set('h') for arg in engine.required_opts: arg_name, help_msg, default = arg[:3] potential_abbreviations = [char for char in arg_name if not char in abbreviations] if potential_abbreviations: abbreviation = potential_abbreviations[0] abbreviations.add(abbreviation) else: abbreviation = '-%s' % arg_name if engine.name == "Download Only" or abbreviation == "download": # add attributes to Download:: (download [-h] [--path [PATH]] # [--subdir [SUBDIR]] # subdir doesn't take any arguments, if included takes True if excluded takes False if arg_name.lower()== "subdir": download_parser.add_argument('--%s' % arg_name, '-%s' % abbreviation, help=help_msg, default=default, action='store_true') # parser.add_argument('--foo', action='store_const', const = False) else: # path must take arguments else it takes default "./" download_parser.add_argument('--%s' % arg_name, '-%s' % abbreviation, help=help_msg, nargs='?', default=default) else: engine_parser.add_argument('--%s' % arg_name, '-%s' % abbreviation, help=help_msg, nargs='?', default=default) PK<šGJ,§%ŽÖÖretriever/lib/models.py"""Data Retriever Data Model This module contains basic class definitions for the Retriever platform. """ from retriever.lib.cleanup import * from retriever.lib.engine import * from retriever.lib.table import * PK<šGJܶHzzretriever/lib/engine.pyfrom __future__ import print_function from __future__ import division from future import standard_library standard_library.install_aliases() from builtins import object from builtins import range from builtins import input from builtins import zip from builtins import next from builtins import str import sys import os import getpass import zipfile import gzip import tarfile import csv import re import io import time from urllib.request import urlretrieve from retriever import DATA_SEARCH_PATHS, DATA_WRITE_PATH, open_fr, open_fw, open_csvw from retriever.lib.cleanup import no_cleanup from retriever.lib.warning import Warning class Engine(object): """A generic database system. Specific database platforms will inherit from this class.""" name = "" instructions = "Enter your database connection information:" db = None table = None _connection = None _cursor = None datatypes = [] required_opts = [] pkformat = "%s PRIMARY KEY %s " script = None debug = False warnings = [] def connect(self, force_reconnect=False): if force_reconnect: self.disconnect() if self._connection is None: self._connection = self.get_connection() return self._connection connection = property(connect) def disconnect(self): if self._connection: self.connection.close() self._connection = None self._cursor = None def get_connection(self): '''This method should be overloaded by specific implementations of Engine.''' pass def add_to_table(self, data_source): """This function adds data to a table from one or more lines specified in engine.table.source.""" if self.table.columns[-1][1][0][:3] == "ct-": lines = gen_from_source(data_source) # cross-tab data real_line_length, real_lines = self.get_ct_data(lines) else: real_lines = gen_from_source(data_source) len_source = gen_from_source(data_source) real_line_length = sum(1 for _ in len_source) total = self.table.record_id + real_line_length count_iter = 1 insert_limit = 400 types = self.table.get_column_datatypes() multiple_values = [] for line in real_lines: if line: # Only process non empty lines self.table.record_id += 1 linevalues = self.table.values_from_line(line) # Build insert statement with the correct number of values try: cleanvalues = [self.format_insert_value(self.table.cleanup.function (linevalues[n], self.table.cleanup.args), types[n]) for n in range(len(linevalues))] except Exception as e: self.warning('Exception in line %s: %s' % (self.table.record_id, e)) continue if line or count_iter == real_line_length: if count_iter % insert_limit == 0 or count_iter == real_line_length: # Add values to the list multiple_values # if multiple_values list is full or we reached the last value in real_line_length # execute the values in multiple_values multiple_values.append(cleanvalues) try: insert_stmt = self.insert_statement(multiple_values) except: if self.debug: print(types) if self.debug: print(linevalues) if self.debug: print(cleanvalues) raise multiple_values = [] try: self.execute(insert_stmt, commit=False) prompt = "Progress: " + str(count_iter) + " / " + str(real_line_length) + " rows inserted into " + self.table_name() + " totaling " + str(total) + ":" sys.stdout.write(prompt + "\b" * len(prompt)) sys.stdout.flush() except: print(insert_stmt) raise else: multiple_values.append(cleanvalues) count_iter += 1 self.connection.commit() print("\n") def get_ct_data(self, lines): """Creates cross tab data""" real_lines = [] for values in lines: initial_cols = len(self.table.columns) - \ (3 if hasattr(self.table, "ct_names") else 2) # add one if auto increment is not set to get the right initial columns if not self.table.columns[0][1][0] == "pk-auto": initial_cols += 1 begin = values[:initial_cols] rest = values[initial_cols:] n = 0 for item in rest: if hasattr(self.table, "ct_names"): name = [self.table.ct_names[n]] n += 1 else: name = [] real_lines.append(begin + name + [item]) real_line_length = len(real_lines) return real_line_length, real_lines def auto_create_table(self, table, url=None, filename=None, pk=None): """Creates a table automatically by analyzing a data source and predicting column names, data types, delimiter, etc.""" if url and not filename: filename = filename_from_url(url) self.table = table if url and not self.find_file(filename): # If the file doesn't exist, download it self.download_file(url, filename) file_path = self.find_file(filename) if not self.table.delimiter: self.set_table_delimiter(file_path) if self.table.header_rows > 0 and not self.table.columns: source = (skip_rows, (self.table.header_rows-1, self.load_data(file_path))) lines = gen_from_source(source) header = next(lines) lines.close() source = (skip_rows, (self.table.header_rows, self.load_data(file_path))) lines = gen_from_source(source) columns, column_values = self.table.auto_get_columns(header) self.auto_get_datatypes(pk, lines, columns, column_values) if self.table.columns[-1][1][0][:3] == "ct-" and hasattr(self.table, "ct_names") and not self.table.ct_column in [c[0] for c in self.table.columns]: self.table.columns = self.table.columns[:-1] + [(self.table.ct_column, ("char", 50))] + [self.table.columns[-1]] self.create_table() def auto_get_datatypes(self, pk, source, columns, column_values): """Determines data types for each column. For string columns adds an additional 100 characters to the maximum observed value to provide extra space for cases where special characters are counted differently by different engines. """ # Get all values for each column lines_to_scan = source # set default column data types as int column_types = [('int',)] * len(columns) max_lengths = [0] * len(columns) # Check the values for each column to determine data type for values in lines_to_scan: if values: for i in range(len(columns)): try: val = u"{}".format(values[i]) if self.table.cleanup.function != no_cleanup: val = self.table.cleanup.function( val, self.table.cleanup.args) if val is not None and val.strip() is not '': if len(str(val)) + 100 > max_lengths[i]: max_lengths[i] = len(str(val)) + 100 if column_types[i][0] in ('int', 'bigint'): try: val = int(val) if column_types[i][0] == 'int' and hasattr(self, 'max_int') and val > self.max_int: column_types[i] = ['bigint', ] except: column_types[i] = ['double', ] if column_types[i][0] == 'double': try: val = float(val) if "e" in str(val) or ("." in str(val) and len(str(val).split(".")[1]) > 10): column_types[i] = ["decimal", "50,30"] except: column_types[i] = ['char', max_lengths[i]] if column_types[i][0] == 'char': if len(str(val)) + 100 > column_types[i][1]: column_types[i][1] = max_lengths[i] except IndexError: pass for i in range(len(columns)): column = columns[i] column[1] = column_types[i] if pk == column[0]: column[1][0] = "pk-" + column[1][0] if pk is None and columns[0][1][0] == 'pk-auto': self.table.columns = [("record_id", ("pk-auto",))] self.table.contains_pk = True else: self.table.columns = [] for column in columns: self.table.columns.append((column[0], tuple(column[1]))) def auto_get_delimiter(self, header): """Determine the delimiter Find out which of a set of common delimiters occurs most in the header line and use this as the delimiter. """ self.table.delimiter = "\t" for other_delimiter in [",", ";"]: if header.count(other_delimiter) > header.count(self.table.delimiter): self.table.delimiter = other_delimiter def convert_data_type(self, datatype): """Converts Retriever generic data types to database platform specific data types """ # get the type from the dataset variables key = datatype[0] thispk = False if key[0:3] == "pk-": key = key[3:] thispk = True elif key[0:3] == "ct-": key = key[3:] # format the dataset type to match engine specific type thistype = "" if key in list(self.datatypes.keys()): thistype = self.datatypes[key] if isinstance(thistype, tuple): if datatype[0] == 'pk-auto': pass elif len(datatype) > 1: thistype = thistype[1] + "(" + str(datatype[1]) + ")" else: thistype = thistype[0] else: if len(datatype) > 1: thistype += "(" + str(datatype[1]) + ")" # set the PRIMARY KEY if thispk: if isinstance(thistype, tuple): thistype = self.pkformat % thistype else: thistype = self.pkformat % (thistype, "") return thistype def create_db(self): """Creates a new database based on settings supplied in Database object engine.db""" db_name = self.database_name() if db_name: print("Creating database " + db_name + "...\n") # Create the database create_stmt = self.create_db_statement() if self.debug: print(create_stmt) try: self.execute(create_stmt) except Exception as e: try: self.connection.rollback() except: pass print("Couldn't create database (%s). Trying to continue anyway." % e) def create_db_statement(self): """Returns a SQL statement to create a database.""" create_stmt = "CREATE DATABASE " + self.database_name() return create_stmt def create_raw_data_dir(self): """Checks to see if the archive directory exists and creates it if necessary.""" path = self.format_data_dir() if not os.path.exists(path): os.makedirs(path) def create_table(self): """Creates a new database table based on settings supplied in Table object engine.table.""" print("Creating table " + self.table_name() + "...") # Try to drop the table if it exists; this may cause an exception if it # doesn't exist, so ignore exceptions try: self.execute(self.drop_statement("TABLE", self.table_name())) except: pass create_stmt = self.create_table_statement() if self.debug: print(create_stmt) try: self.execute(create_stmt) except Exception as e: try: self.connection.rollback() except: pass print("Couldn't create table (%s). Trying to continue anyway." % e) def create_table_statement(self): """Returns a SQL statement to create a table""" create_stmt = "CREATE TABLE " + self.table_name() + " (" columns = self.table.get_insert_columns(join=False, create=True) types = [] for column in self.table.columns: for column_name in columns: if column[0] == column_name: types.append(self.convert_data_type(column[1])) if self.debug: print(columns) column_strings = [] for c, t in zip(columns, types): column_strings.append(c + ' ' + t) create_stmt += ', '.join(column_strings) create_stmt += " );" return create_stmt def database_name(self, name=None): """Returns the name of the database""" if not name: try: name = self.script.shortname except AttributeError: name = "{db}" try: db_name = self.opts["database_name"].format(db=name) except KeyError: db_name = name return db_name.replace('-', '_') def download_file(self, url, filename): """Downloads a file to the raw data directory.""" if not self.find_file(filename): path = self.format_filename(filename) self.create_raw_data_dir() print("\nDownloading " + filename + "...") try: urlretrieve(url, path, reporthook=reporthook) except: # For some urls lacking filenames urlretrieve from the future # package seems to fail. This issue occurred in the PlantTaxonomy # script. If this happens, fall back to the standard Python 2 version. from urllib import urlretrieve as py2urlretrieve py2urlretrieve(url, path, reporthook=reporthook) def download_files_from_archive(self, url, filenames, filetype="zip", keep_in_dir=False, archivename=None): """Downloads files from an archive into the raw data directory. """ print() downloaded = False if archivename: archivename = self.format_filename(archivename) else: archivename = self.format_filename(filename_from_url(url)) if keep_in_dir: archivebase = os.path.splitext(os.path.basename(archivename))[0] archivedir = os.path.join(DATA_WRITE_PATH, archivebase) archivedir = archivedir.format(dataset=self.script.shortname) if not os.path.exists(archivedir): os.makedirs(archivedir) else: archivebase = '' for filename in filenames: if self.find_file(os.path.join(archivebase, filename)): # Use local copy pass else: self.create_raw_data_dir() if not downloaded: self.download_file(url, archivename) downloaded = True if filetype == 'zip': try: archive = zipfile.ZipFile(archivename) if archive.testzip(): # This fixes an issue with the zip files that was causing errors on # Python 3. testzip() returns the names of any files with issues so if # it exists there is a problem. For details of the issue and the fix see: # see """https://stackoverflow.com/questions/41492984/ # zipfile-testzip-returning-different-results-on-python-2-and-python-3""" archive.getinfo(filename).file_size += (2 ** 64) - 1 open_archive_file = archive.open(filename, 'r') except zipfile.BadZipFile as e: print("\n{0} can't be extracted, may be corrupt \n{1}".format(filename, e)) elif filetype == 'gz': # gzip archives can only contain a single file open_archive_file = gzip.open(archivename, 'r') elif filetype == 'tar': archive = tarfile.open(filename, 'r') open_archive_file = archive.extractfile(filename) fileloc = self.format_filename(os.path.join(archivebase, os.path.basename(filename))) unzipped_file = open(fileloc, 'wb') for line in open_archive_file: unzipped_file.write(line) open_archive_file.close() unzipped_file.close() if 'archive' in locals(): archive.close() def drop_statement(self, objecttype, objectname): """Returns a drop table or database SQL statement.""" dropstatement = "DROP %s IF EXISTS %s" % (objecttype, objectname) return dropstatement def escape_single_quotes(self, value): """Escapes single quotes in the value""" return value.replace("'", "\\'") def escape_double_quotes(self, value): """Escapes double quotes in the value""" return value.replace('"', '\\"') def execute(self, statement, commit=True): """Executes the given statement""" self.cursor.execute(statement) if commit: self.connection.commit() def exists(self, script): """Checks to see if the given table exists""" return all([self.table_exists( script.shortname, key ) for key in list(script.urls.keys()) if key]) def final_cleanup(self): """Close the database connection.""" if self.warnings: print('\n'.join(str(w) for w in self.warnings)) self.disconnect() def find_file(self, filename): """Checks for an existing datafile""" for search_path in DATA_SEARCH_PATHS: search_path = search_path.format(dataset=self.script.shortname) file_path = os.path.normpath(os.path.join(search_path, filename)) if file_exists(file_path): return file_path return False def format_data_dir(self): """Returns the correctly formatted raw data directory location.""" return DATA_WRITE_PATH.format(dataset=self.script.shortname) def format_filename(self, filename): """Returns the full path of a file in the archive directory.""" return os.path.join(self.format_data_dir(), filename) def format_insert_value(self, value, datatype, escape=True, processed=False): """Format a value for an insert statement based on data type Different data types need to be formated differently to be properly stored in database management systems. The correct formats are obtained by: 1. Removing extra enclosing quotes 2. Harmonizing null indicators 3. Cleaning up badly formatted integers 4. Obtaining consistent float representations of decimals The optional `escape` argument controls whether additional quotes in strings are escaped, as needed for SQL database management systems (escape=True), or not escaped, as needed for flat file based engines (escape=False). The optional processed argument indicates that the engine has it's own escaping mechanism. i.e the csv engine which uses its own dialect""" datatype = datatype.split('-')[-1] strvalue = str(value).strip() # Remove any quotes already surrounding the string quotes = ["'", '"'] if len(strvalue) > 1 and strvalue[0] == strvalue[-1] and strvalue[0] in quotes: strvalue = strvalue[1:-1] nulls = ("null", "none") if strvalue.lower() in nulls: return "null" elif datatype in ("int", "bigint", "bool"): if strvalue: intvalue = strvalue.split('.')[0] if intvalue: return int(intvalue) else: return "null" else: return "null" elif datatype in ("double", "decimal"): if strvalue.strip(): try: decimals = float(str(strvalue)) return str(decimals) except: return "null" else: return "null" elif datatype == "char": if strvalue.lower() in nulls: return "null" if escape: # automatically escape quotes in string fields if hasattr(self.table, "escape_double_quotes") and self.table.escape_double_quotes: strvalue = self.escape_double_quotes(strvalue) if hasattr(self.table, "escape_single_quotes") and self.table.escape_single_quotes: strvalue = self.escape_single_quotes(strvalue) return "'" + strvalue + "'" if processed: return strvalue else: return "'" + strvalue + "'" else: return "null" def get_cursor(self): """Gets the db cursor.""" if self._cursor is None: self._cursor = self.connection.cursor() return self._cursor cursor = property(get_cursor) def get_input(self): """Manually get user input for connection information when script is run from terminal.""" for opt in self.required_opts: if not (opt[0] in list(self.opts.keys())): if opt[0] == "password": print(opt[1]) self.opts[opt[0]] = getpass.getpass(" ") else: prompt = opt[1] if opt[2]: prompt += " or press Enter for the default, %s" % opt[2] prompt += ': ' self.opts[opt[0]] = input(prompt) if self.opts[opt[0]] in ["", "default"]: self.opts[opt[0]] = opt[2] def insert_data_from_archive(self, url, filenames): """Insert data from files located in an online archive. This function extracts the file, inserts the data, and deletes the file if raw data archiving is not set.""" self.download_files_from_archive(url, filenames) for filename in filenames: file_path = self.find_file(filename) if file_path: self.insert_data_from_file(file_path) else: raise Exception("File not found: %s" % filename) def insert_data_from_file(self, filename): """The default function to insert data from a file. This function simply inserts the data row by row. Database platforms with support for inserting bulk data from files can override this function.""" data_source = (skip_rows, (self.table.header_rows, (self.load_data, (filename, )))) self.add_to_table(data_source) def insert_data_from_url(self, url): """Insert data from a web resource, such as a text file.""" filename = filename_from_url(url) find = self.find_file(filename) if find: # Use local copy self.insert_data_from_file(find) else: # Save a copy of the file locally, then load from that file self.create_raw_data_dir() print("\nSaving a copy of " + filename + "...") self.download_file(url, filename) self.insert_data_from_file(self.find_file(filename)) def insert_statement(self, values): """Returns a SQL statement to insert a set of values.""" columns = self.table.get_insert_columns() types = self.table.get_column_datatypes() columncount = len(self.table.get_insert_columns(join=False, create=False)) insert_stmt = "INSERT INTO {} ({}) VALUES ".format(self.table_name(), columns) for row in values: row_length = len(row) # Add None with appropriate value type for empty cells for i in range(columncount - row_length): row.append(self.format_insert_value(None, types[row_length + i])) insert_stmt += " (" + ", ".join([str(val) for val in row]) + "), " insert_stmt = insert_stmt.rstrip(", ") + ";" if self.debug: print(insert_stmt) return insert_stmt def set_engine_encoding(self): pass def set_table_delimiter(self, file_path): dataset_file = open_fr(file_path) self.auto_get_delimiter(dataset_file.readline()) dataset_file.close() def table_exists(self, dbname, tablename): """This can be overridden to return True if a table exists. It returns False by default.""" return False def table_name(self, name=None, dbname=None): """Returns the full tablename.""" if not name: name = self.table.name if not dbname: dbname = self.database_name() if not dbname: dbname = '' return self.opts["table_name"].format(db=dbname, table=name) def to_csv(self): # due to Cyclic imports we can not move this import to the top from retriever.lib.tools import sort_csv for item in list(self.script.urls.keys()): table_name = self.table_name() csv_file_output = os.path.normpath(table_name + '.csv') csv_file = open_fw(csv_file_output) csv_writer = open_csvw(csv_file) self.get_cursor() self.set_engine_encoding() self.cursor.execute("SELECT * FROM {};".format(table_name)) row = self.cursor.fetchone() colnames = [u'{}'.format(tuple_i[0]) for tuple_i in self.cursor.description] csv_writer.writerow(colnames) while row is not None: csv_writer.writerow(row) row = self.cursor.fetchone() csv_file.close() sort_csv(csv_file_output) self.disconnect() def warning(self, warning): new_warning = Warning('%s:%s' % (self.script.shortname, self.table.name), warning) self.warnings.append(new_warning) def load_data(self, filename): """Generator returning lists of values from lines in a data file 1. Works on both delimited (csv module) and fixed width data (extract_fixed_width) 2. Identifies the delimiter if not known 3. Removes extra line endings """ if not self.table.delimiter: self.set_table_delimiter(filename) dataset_file = open_fr(filename) if self.table.fixed_width: for row in dataset_file: yield self.extract_fixed_width(row) else: reg = re.compile("\\r\\n|\n|\r") for row in csv.reader(dataset_file, delimiter=self.table.delimiter): yield [reg.sub(" ", values) for values in row] def extract_fixed_width(self, line): """Splits a line based on the fixed width and returns a list of the values""" pos = 0 values = [] for width in self.table.fixed_width: values.append(line[pos:pos + width].strip()) pos += width return values def skip_rows(rows, source): """Skip over the header lines by reading them before processing.""" lines = gen_from_source(source) for i in range(rows): next(lines) return lines def file_exists(path): """Returns true if a file exists and its size is greater than 0.""" return os.path.isfile(path) and os.path.getsize(path) > 0 def filename_from_url(url): """Extracts and returns the filename from the url""" return url.split('/')[-1].split('?')[0] def gen_from_source(source): """Returns a generator from a source tuple. Source tuples are of the form (callable, args) where callable(\*args) returns either a generator or another source tuple. This allows indefinite regeneration of data sources. """ while isinstance(source, tuple): gen, args = source source = gen(*args) return source def reporthook(count, block_size, total_size): """Generated the progress bar Uses file size to calculate the percentage of file size downloaded. If the total_size of the file being downloaded is not in the header, provide progress as size of bytes downloaded in either KB, MB and GB. """ progress_size = int(count * block_size) if total_size != -1: global start_time if count == 0: start_time = time.time() return duration = time.time() - start_time if duration !=0: speed = int(progress_size / (1024 * duration)) percent = min(int(count*block_size*100/total_size),100) sys.stdout.write("\r%2d%% %d seconds " % (percent, duration)) sys.stdout.flush() else: if 1000 >= progress_size / 1000: sys.stdout.write("\r%d KB" % (progress_size / 1000)) sys.stdout.flush() elif 1000000 >= progress_size / 1000000: sys.stdout.write("\r%d MB" % (progress_size / 1000000)) sys.stdout.flush() elif 1000000000 >= progress_size / 1000000000: sys.stdout.write("\r%d GB" % (progress_size / 1000000000)) sys.stdout.flush() PK<šGJÌ¥K”ùùretriever/lib/warning.pyfrom builtins import object class Warning(object): def __init__(self, location, warning): self.location = location self.warning = warning def __str__(self): return 'WARNING (%s): %s' % (self.location, self.warning) PK<šGJÝZ/ÄÄ%retriever/lib/parse_script_to_json.pyimport json import os from copy import copy JSON_DIR = "../scripts/" SCRIPT_DIR = "../scripts/" def parse_script_to_json(script_file, location=SCRIPT_DIR): definition = open(os.path.join(location, script_file) + ".script", 'r') values = {} tables = [] last_table = {} replace = [] keys_to_ignore = ["template"] urls = {} values["retriever"] = "True" values["version"] = "1.0.0" values["retriever_minimum_version"] = "2.0.dev" for line in [str(line).strip() for line in definition]: if line and ':' in line and not line[0] == '#': split_line = [a.strip() for a in line.split(":")] key = split_line[0].lower() value = ':'.join(split_line[1:]) if key == "name": values["title"] = value elif key == "shortname": values["name"] = value elif key == "description": values["description"] = value elif key == "tags": values["keywords"] = [v.strip() for v in value.split(",")] elif key == "url" or key == "ref": values["homepage"] = value elif key == "citation": values["citation"] = value elif key == "replace": # could be made a dict replace = [(v.split(',')[0].strip(), v.split(',')[1].strip()) for v in [val for val in value.split(';')]] elif key == "table": last_table = {} last_table["name"] = value.split(',')[0].strip() last_table["url"] = ','.join(value.split(',')[1:]).strip() last_table["schema"] = {} last_table["dialect"] = {} tables.append(last_table) urls[last_table["name"]] = last_table["url"] if replace: last_table["dialect"]["replace_columns"] = replace elif key == "*column": if last_table: vs = [v.strip() for v in value.split(',')] if "fields" not in last_table["schema"]: last_table["schema"]["fields"] = [] column = {} column['name'] = vs[0] column['type'] = vs[1] if len(vs) > 2: column['size'] = vs[2] last_table["schema"]["fields"].append(copy(column)) elif key == "*nulls": if last_table: nulls = [eval(v) for v in [val.strip() for val in value.split(',')]] last_table["dialect"]["nulls"] = nulls elif key == "*ct_column": if last_table: last_table["schema"]["ct_column"] = value elif key == "*ct_names": if last_table: last_table["schema"]["ct_names"] = [v.strip() for v in value.split(',')] elif key[0] == "*": # attribute that should be applied to the most recently # declared table key = key[1:] if last_table: try: e = eval(value) except: e = str(value) last_table["dialect"][key] = str(e) else: values[key] = str(value) values["resources"] = tables values["urls"] = urls if 'name' not in values: try: values['name'] = values['title'] except: pass for key, value in values.items(): if key in keys_to_ignore: values.pop(key, None) with open(os.path.join(location, values['name']) + '.json', 'w') as json_file: json_str = json.dumps(values, json_file, sort_keys=True, indent=4, separators=(',', ': ')) json_file.write(json_str + '\n') json_file.close() definition.close() if __name__ == "__main__": for file in os.listdir(SCRIPT_DIR): if file[-6:] == "script": parse_script_to_json(file[:-7]) PK<šGJP2òÑÑretriever/lib/excel.py"""Data Retriever Excel Functions This module contains optional functions for importing data from Excel. """ from builtins import str from builtins import object class Excel(object): @staticmethod def empty_cell(cell): """Tests whether an excel cell is empty or contains only whitespace""" if cell.ctype == 0: return True if str(cell.value).strip() == "": return True return False @staticmethod def cell_value(cell): """Returns the string value of an excel spreadsheet cell""" if (cell.value).__class__.__name__ == 'unicode': return (str(cell.value).encode()).strip() return str(cell.value).strip() PK<šGJ"K¾retriever/lib/repository.py"""Checks the repository for updates.""" from __future__ import print_function from future import standard_library standard_library.install_aliases() import os import sys import urllib.request import urllib.parse import urllib.error import imp from pkg_resources import parse_version from retriever import REPOSITORY, SCRIPT_WRITE_PATH, HOME_DIR from retriever.lib.models import file_exists global abort, executable_name abort = False executable_name = "retriever" def download_from_repository(filepath, newpath, repo=REPOSITORY): """Downloads the latest version of a file from the repository.""" try: filename = filepath.split('/')[-1] urllib.request.urlretrieve(repo + filepath, newpath) except: raise pass def check_for_updates(): """Check for updates to scripts. This updates the HOME_DIR scripts directory with the latest script versions """ try: # open version.txt for current release branch and get script versions version_file = urllib.request.urlopen(REPOSITORY + "version.txt") version_file.readline() # read scripts from the repository and the checksums from the version.txt scripts = [] print("Downloading scripts...") for line in version_file: scripts.append(line.decode().strip('\n').split(',')) total_script_count = len(scripts) # create script directory if not available if not os.path.isdir(SCRIPT_WRITE_PATH): os.makedirs(SCRIPT_WRITE_PATH) update_progressbar(0.0 / float(total_script_count)) for index, script in enumerate(scripts): script_name = script[0] if len(script) > 1: script_version = script[1] else: script_version = None path_script_name = os.path.normpath(os.path.join(HOME_DIR, "scripts", script_name)) if not file_exists(path_script_name): download_from_repository("scripts/" + script_name, os.path.normpath(os.path.join(SCRIPT_WRITE_PATH, script_name))) need_to_download = False try: file_object, pathname, desc = imp.find_module(''.join(script_name.split('.')[:-1]), [SCRIPT_WRITE_PATH]) new_module = imp.load_module(script_name, file_object, pathname, desc) m = str(new_module.SCRIPT.version) need_to_download = parse_version(str(script_version)) > parse_version(m) except: pass if need_to_download: try: os.remove(os.path.normpath(os.path.join(HOME_DIR, "scripts", script_name))) download_from_repository("scripts/" + script_name, os.path.normpath(os.path.join(SCRIPT_WRITE_PATH, script_name))) except Exception as e: print(e) pass update_progressbar(float(index + 1) / float(total_script_count)) except: raise return print("\nThe retriever is up-to-date") def update_progressbar(progress): """Show progressbar Takes a number between 0 and 1 to indicate progress from 0 to 100%. And set the bar_length according to the console size """ try: rows, columns = os.popen('stty size', 'r').read().split() bar_length = int(columns) - 35 if not bar_length > 1: bar_length = 20 except: # Default value if determination of console size fails bar_length = 20 block = int(round(bar_length * progress)) text = "\rDownload Progress: [{0}] {1:.2f}%".format( "#" * block + "-" * (bar_length - block), progress * 100) sys.stdout.write(text) sys.stdout.flush() PK<šGJàߣPy@y@retriever/lib/datapackage.pyfrom __future__ import print_function from builtins import input import os import json from time import sleep from retriever import SCRIPT_LIST, HOME_DIR short_names = [script.shortname.lower() for script in SCRIPT_LIST()] def is_empty(val): """Check if a variable is an empty string or an empty list""" return val == "" or val == [] def clean_input(prompt="", split_char='', ignore_empty=False, dtype=None): """Clean the user-input from the CLI before adding it""" while True: val = input(prompt).strip() # split to list type if split_char specified if split_char != "": val = [v.strip() for v in val.split(split_char) if v.strip() != ""] # do not ignore empty input if not allowed if not ignore_empty and is_empty(val): print("\tError: empty input. Need one or more values.\n") continue # ensure correct input datatype if specified if not is_empty(val) and dtype is not None: try: if not type(eval(val)) == dtype: print("\tError: input doesn't match required type ", dtype, "\n") continue except: print("\tError: illegal argument. Input type should be ", dtype, "\n") continue break return val def get_replace_columns(dialect): """Get list of tuples with old and new names for the columns in the table""" val = clean_input("replace_columns (separated by ';', with comma-separated values) (press return to skip): ", split_char=';', ignore_empty=True) if val == "" or val == []: # return and dont add key to dialect dict if empty val return dialect['replace_columns'] = [] for v in val: try: pair = v.split(',') dialect['replace_columns'].append((pair[0].strip(), pair[1].strip())) except IndexError: continue def get_nulls(dialect): """Get list of strings that denote null in the dataset""" val = clean_input("nulls (separated by ';') (press return to skip): ", split_char=';', ignore_empty=True) if val == "" or val == []: # return and dont add key to dialect dict if empty val return dialect['nulls'] = val # change list to single value if size == 1 if len(dialect['nulls']) == 1: dialect['nulls'] = dialect['nulls'][0] def get_delimiter(dialect): """Get the string delimiter for the dataset file(s)""" val = clean_input("delimiter (press return to skip): ", ignore_empty=True) if val == "" or val == []: # return and dont add key to dialect dict if empty val return dialect['delimiter'] = val def get_do_not_bulk_insert(dialect): """Set do_not_bulk_insert property""" val = clean_input("do_not_bulk_insert (bool = True/False) (press return to skip): ", ignore_empty=True, dtype=bool) if val == "" or val == []: # return and dont add key to dialect dict if empty val return dialect['do_not_bulk_insert'] = val def get_contains_pk(dialect): """Set contains_pk property""" val = clean_input("contains_pk (bool = True/False) (press return to skip): ", ignore_empty=True, dtype=bool) if val == "" or val == []: # return and dont add key to dialect dict if empty val return dialect['contains_pk'] = val def get_escape_single_quotes(dialect): """Set escape_single_quotes property""" val = clean_input("escape_single_quotes (bool = True/False) (press return to skip): ", ignore_empty=True, dtype=bool) if val == "" or val == []: # return and dont add key to dialect dict if empty val return dialect['escape_single_quotes'] = val def get_escape_double_quotes(dialect): """Set escape_double_quotes property""" val = clean_input("escape_double_quotes (bool = True/False) (press return to skip): ", ignore_empty=True, dtype=bool) if val == "" or val == []: # return and dont add key to dialect dict if empty val return dialect['escape_double_quotes'] = val def get_fixed_width(dialect): """Set fixed_width property""" val = clean_input("fixed_width (bool = True/False) (press return to skip): ", ignore_empty=True, dtype=bool) if val == "" or val == []: # return and dont add key to dialect dict if empty val return dialect['fixed_width'] = val def get_header_rows(dialect): """Get number of rows considered as the header""" val = clean_input("header_rows (int) (press return to skip): ", ignore_empty=True, dtype=int) if val == "" or val == []: # return and dont add key to dialect dict if empty val return dialect['header_rows'] = val def create_json(): ''' Creates datapackage.JSON script. http://specs.frictionlessdata.io/data-packages/#descriptor-datapackagejson Takes input from user via command line. Usage: retriever create_json ''' contents = {} tableUrls = {} script_exists = True while script_exists: contents['name'] = clean_input("name (a short unique identifier; only lowercase letters and - allowed): ") script_exists = contents['name'].lower() in short_names if script_exists: print("Dataset already available. Check the list or try a different shortname") contents['title'] = clean_input("title: ", ignore_empty=True) contents['description'] = clean_input("description: ", ignore_empty=True) contents['citation'] = clean_input("citation: ", ignore_empty=True) contents['homepage'] = clean_input("homepage (for the entire dataset): ", ignore_empty=True) contents['keywords'] = clean_input("keywords (separated by ';'): ", split_char=';', ignore_empty=True) contents['resources'] = [] contents['retriever'] = "True" contents['retriever_minimum_version'] = "2.0.dev" contents['version'] = "1.0.0"; # Add tables - while True: addTable = clean_input("\nAdd Table? (y/N): ") if addTable.lower() in ["n", "no"]: break elif addTable.lower() not in ["y", "yes"]: print("Not a valid option\n") continue else: table = {} table['name'] = clean_input("table-name: ") table['url'] = clean_input("table-url: ") table['dialect'] = {} tableUrls[table['name']] = table['url'] # get table properties (dialect) # refer retriever.lib.table.Table get_replace_columns(table['dialect']) get_nulls(table['dialect']) get_delimiter(table['dialect']) get_do_not_bulk_insert(table['dialect']) get_contains_pk(table['dialect']) get_escape_single_quotes(table['dialect']) get_escape_double_quotes(table['dialect']) get_fixed_width(table['dialect']) get_header_rows(table['dialect']) # set table schema table['schema'] = {} table['schema']["fields"] = [] print("Enter columns [format = name, type, (optional) size] (press return to skip):\n\n") while True: # get column list (optional) try: col_list = clean_input("", split_char = ',', ignore_empty = True) if col_list == []: break elif type(col_list) != list: raise Exception col_list = [c.strip() for c in col_list] col_obj = {} # dict to store column data col_obj["name"] = col_list[0] col_obj["type"] = col_list[1] if len(col_list) > 2: if type(eval(col_list[2])) != int: raise Exception col_obj["size"] = col_list[2] table["schema"]["fields"].append(col_obj) except: print("Exception occured. Check the input format again.\n") pass isCT = clean_input( "Add crosstab columns? (y,N): ", ignore_empty=True) if isCT.lower() in ["y", "yes"]: ct_column = clean_input("Crosstab column name: ") ct_names = [] print("Enter names of crosstab column values (Press return after each name):\n") name = clean_input() while name != "": ct_names.append(name) name = clean_input() table['schema']['ct_column'] = ct_column table['schema']['ct_names'] = ct_names contents['resources'].append(table) contents['urls'] = tableUrls file_name = contents['name'] + ".json" with open(os.path.join(HOME_DIR, 'scripts', file_name), 'w') as output_file: json_str = json.dumps(contents, output_file, sort_keys=True, indent=4, separators=(',', ': ')) output_file.write(json_str + '\n') print("\nScript written to " + file_name) output_file.close() def edit_dict(obj, tabwidth=0): ''' Recursive helper function for edit_json() to edit a datapackage.JSON script file. ''' for (key, val) in obj.items(): print('\n' + " " * tabwidth + "->" + key + " (", type(val), ") :\n") if type(val) == list: for v in val: print(" " * tabwidth + str(v) + '\n\n') elif type(val) == dict: for item in val.items(): print(" " * tabwidth + str(item) + '\n\n') else: print(" " * tabwidth + str(val) + '\n\n') while True: try: if isinstance(val, dict): if val != {}: print(" '" + key + "' has the following keys:\n" + str(obj[key].keys()) + "\n") do_edit = clean_input( "Edit the values for these sub-keys of " + key + "? (y/N): ") if do_edit.lower() in ['y', 'yes']: edit_dict(obj[key], tabwidth + 1) print("Select one of the following for the key '" + key + "': \n") print("1. Add an item") print("2. Modify an item") print("3. Delete an item") print("4. Remove from script") print("5. Continue (no changes)\n") selection = clean_input("\nYour choice: ") if selection == '1': add_key = clean_input('Enter new key: ') add_val = clean_input('Enter new value: ') obj[key][add_key] = add_val elif selection == '2': mod_key = clean_input('Enter the key: ') if mod_key not in val: print("Invalid input! Key not found.") continue mod_val = clean_input('Enter new value: ') obj[key][mod_key] = mod_val elif selection == '3': del_key = clean_input('Enter key to be deleted: ') if del_key not in val: print("Invalid key: Not found") continue print("Removed " + str(del_key) + " : " + str(obj[key].pop(del_key))) elif selection == '4': do_remove = clean_input( "Are you sure (completely remove this entry)? (y/n): ") if do_remove.lower() in ['y', 'yes']: obj.pop(key) print("Removed " + key + " from script.\n") else: print("Aborted.") sleep(1) elif selection == '5' or selection == "": pass else: raise RuntimeError("Invalid input!") elif isinstance(val, list): for i in range(len(val)): print(i + 1, '. ', str(val[i])) if isinstance(val[i], dict): do_edit = clean_input( "\nEdit this dict in '" + key + "'? (y/N): ") if do_edit.lower() in ['y', 'yes']: edit_dict(obj[key][i], tabwidth + 2) print("Select one of the following for the key '" + key + "': \n") print("1. Add an item") print("2. Delete an item") print("3. Remove from script") print("4. Continue (no changes)\n") selection = clean_input("\nYour choice: ") if selection == '1': add_val = clean_input('Enter new value: ') obj[key].append(add_val) elif selection == '2': del_val = clean_input('Enter value to be deleted: ') if del_val not in obj[key]: print("Invalid value: Not found.") continue print("Removed " + str(obj[key].pop(del_key))) elif selection == '3': do_remove = clean_input( "Are you sure (completely remove this entry)? (y/n): ") if do_remove.lower() in ['y', 'yes']: obj.pop(key) print("Removed " + key + " from script.\n") else: print("Aborted.") sleep(1) elif selection == '4' or selection == "": pass else: raise RuntimeError("Invalid input!") else: print("Select one of the following for the key '" + key + "': \n") print("1. Modify value") print("2. Remove from script") print("3. Continue (no changes)\n") selection = clean_input("\nYour choice: ") if selection == '1': new_val = clean_input('Enter new value: ') obj[key] = new_val elif selection == '2': do_remove = clean_input( "Are you sure (completely remove this entry)? (y/n): ") if do_remove.lower() in ['y', 'yes']: obj.pop(key) print("Removed " + key + " from script.\n") else: print("Aborted.") sleep(1) elif selection == '3' or selection == "": pass else: raise RuntimeError("Invalid input!") break except RuntimeError: continue def edit_json(json_file): ''' Edits existing datapackage.JSON script. Usage: retriever edit_json Note: Name of script is the dataset shortname. ''' try: contents = json.load( open(os.path.join(HOME_DIR, 'scripts', json_file), 'r')) except FileNotFoundError: print("Script not found.") return edit_dict(contents, 1) file_name = contents['name'] + ".json" with open(os.path.join(HOME_DIR, 'scripts', file_name), 'w') as output_file: json_str = json.dumps(contents, output_file, sort_keys=True, indent=4, separators=(',', ': ')) output_file.write(json_str + '\n') print("\nScript written to " + os.path.join(HOME_DIR, 'scripts', file_name)) output_file.close() PK<šGJÔ0ðVretriever/engines/xmlengine.pyimport os from builtins import str from builtins import object from builtins import range from retriever.lib.models import Engine from retriever import DATA_DIR, open_fr, open_fw from retriever.lib.tools import xml2csv, sort_csv class DummyConnection(object): def cursor(self): pass def commit(self): pass def rollback(self): pass def close(self): pass class DummyCursor(DummyConnection): pass class engine(Engine): """Engine instance for writing data to a XML file.""" name = "XML" abbreviation = "xml" datatypes = { "auto": "INTEGER", "int": "INTEGER", "bigint": "INTEGER", "double": "REAL", "decimal": "REAL", "char": "TEXT", "bool": "INTEGER", } required_opts = [ ("table_name", "Format of table name", os.path.join(DATA_DIR, "{db}_{table}.xml")), ] table_names = [] def create_db(self): """Override create_db since there is no database just an XML file""" return None def create_table(self): """Create the table by creating an empty XML file""" self.output_file = open_fw(self.table_name()) self.output_file.write(u'') self.output_file.write(u'\n') self.table_names.append((self.output_file, self.table_name())) self.auto_column_number = 1 def disconnect(self): """Close out the xml files Close all the file objects that have been created Re-write the files stripping off the last comma and then close with a closing tag) """ if self.table_names: for output_file_i, file_name in self.table_names: output_file_i.close() current_input_file = open_fr(file_name) file_contents = current_input_file.readlines() current_input_file.close() file_contents[-1] = file_contents[-1].strip(',') current_output_file = open_fw(file_name) current_output_file.writelines(file_contents) current_output_file.write(u'\n') current_output_file.close() self.table_names = [] def execute(self, statement, commit=True): """Write a line to the output file""" self.output_file.writelines(statement) def format_insert_value(self, value, datatype): """Formats a value for an insert statement""" v = Engine.format_insert_value(self, value, datatype, escape=False, processed=True) if v == 'null': return "" try: if len(v) > 1 and v[0] == v[-1] == "'": v = '"%s"' % v[1:-1] except: pass return v def insert_statement(self, values): if not hasattr(self, 'auto_column_number'): self.auto_column_number = 1 keys = self.table.get_insert_columns(join=False, create=True) if self.table.columns[0][1][0][3:] == 'auto': newrows = [] for rows in values: insert_stmt = [self.auto_column_number] + rows newrows.append(insert_stmt) self.auto_column_number += 1 else: newrows = values xml_lines = ['\n\n{}'.format(self._format_single_row(keys, line_data)) for line_data in newrows] return xml_lines def _format_single_row(self, keys, line_data): return ''.join(' <{key}>{value}\n'.format(key=key, value=value) for key, value in zip(keys, line_data)) def table_exists(self, dbname, tablename): """Check to see if the data file currently exists""" tablename = self.table_name(name=tablename, dbname=dbname) return os.path.exists(tablename) def to_csv(self): """Export table from xml engine to CSV file""" for keys in list(self.script.tables): table_name = self.opts['table_name'].format(db=self.db_name, table=keys) header = self.script.tables[keys].get_insert_columns(join=False, create=True) csv_outfile = xml2csv(table_name, header_values=header) sort_csv(csv_outfile) def get_connection(self): """Gets the db connection.""" self.get_input() return DummyConnection() PK<šGJš©CÌ**retriever/engines/mysql.pyfrom __future__ import print_function from builtins import str import os from retriever.lib.models import Engine, no_cleanup class engine(Engine): """Engine instance for MySQL.""" name = "MySQL" abbreviation = "mysql" datatypes = { "auto": "INT(5) NOT NULL AUTO_INCREMENT", "int": "INT", "bigint": "BIGINT", "double": "DOUBLE", "decimal": "DECIMAL", "char": ("TEXT", "VARCHAR"), "bool": "BOOL", } max_int = 4294967295 required_opts = [("user", "Enter your MySQL username", "root"), ("password", "Enter your password", ""), ("host", "Enter your MySQL host", "localhost"), ("port", "Enter your MySQL port", 3306), ("database_name", "Format of database name", "{db}"), ("table_name", "Format of table name", "{db}.{table}"), ] def create_db_statement(self): """Returns a SQL statement to create a database.""" createstatement = "CREATE DATABASE IF NOT EXISTS " + self.database_name() return createstatement def insert_data_from_file(self, filename): """Calls MySQL "LOAD DATA LOCAL INFILE" statement to perform a bulk insert.""" mysql_set_autocommit_off = """SET autocommit=0; SET UNIQUE_CHECKS=0; SET FOREIGN_KEY_CHECKS=0; SET sql_log_bin=0;""" mysql_set_autocommit_on = """SET GLOBAL innodb_flush_log_at_trx_commit=1; COMMIT; SET autocommit=1; SET unique_checks=1; SET foreign_key_checks=1;""" self.get_cursor() ct = len([True for c in self.table.columns if c[1][0][:3] == "ct-"]) != 0 if (self.table.cleanup.function == no_cleanup and not self.table.fixed_width and not ct and (not hasattr(self.table, "do_not_bulk_insert") or not self.table.do_not_bulk_insert)): print ("Inserting data from " + os.path.basename(filename) + "...") columns = self.table.get_insert_columns() statement = """ LOAD DATA LOCAL INFILE '""" + filename.replace("\\", "\\\\") + """' INTO TABLE """ + self.table_name() + """ FIELDS TERMINATED BY '""" + self.table.delimiter + """' OPTIONALLY ENCLOSED BY '"' LINES TERMINATED BY '\\n' IGNORE """ + str(self.table.header_rows) + """ LINES (""" + columns + ")" try: self.cursor.execute(mysql_set_autocommit_off) self.cursor.execute(statement) self.cursor.execute(mysql_set_autocommit_on) except Exception as e: self.disconnect() # If the execute fails the database connection can get hung up self.cursor.execute(mysql_set_autocommit_on) return Engine.insert_data_from_file(self, filename) else: return Engine.insert_data_from_file(self, filename) def table_exists(self, dbname, tablename): """Checks to see if the given table exists""" if not hasattr(self, 'existing_table_names'): self.cursor.execute( "SELECT table_schema, table_name " "FROM information_schema.tables WHERE table_schema NOT IN " "('mysql', 'information_schema', 'performance_schema');") self.existing_table_names = set() for schema, table in self.cursor: self.existing_table_names.add((schema.lower(), table.lower())) return (dbname.lower(), tablename.lower()) in self.existing_table_names def set_engine_encoding(self, encoding='ISO-8859-1'): """Set MySQL database encoding to match data encoding Defaults to latin1 and falls back to it if an unknown encoding is provided """ encoding_lookup = {'ISO-8859-1': 'latin1'} db_encoding = encoding_lookup.get(encoding, 'latin1') self.execute("SET NAMES '{0}';".format(db_encoding)) def get_connection(self): """Gets the db connection.""" args = {'host': self.opts['host'], 'port': int(self.opts['port']), 'user': self.opts['user'], 'passwd': self.opts['password']} import pymysql as dbapi import pymysql.constants.CLIENT as client args['client_flag'] = client.LOCAL_FILES self.get_input() return dbapi.connect(**args) PK<šGJ€Åeretriever/engines/__init__.py"""Contains DBMS-specific Engine implementations.""" engines = [ "mysql", "postgres", "sqlite", "msaccess", "csvengine", "download_only", "jsonengine", "xmlengine" ] engine_module_list = [ __import__("retriever.engines." + module, fromlist="engines") for module in engines ] engine_list = [module.engine() for module in engine_module_list] PK<šGJÛ7Ñ5®®retriever/engines/jsonengine.py"""Engine for writing data to a JSON file""" from builtins import zip from builtins import object from builtins import range import os import json from retriever.lib.models import Engine from retriever import DATA_DIR, open_fw, open_fr from collections import OrderedDict from retriever.lib.tools import json2csv, sort_csv class DummyConnection(object): def cursor(self): pass def commit(self): pass def rollback(self): pass def close(self): pass class DummyCursor(DummyConnection): pass class engine(Engine): """Engine instance for writing data to a CSV file.""" name = "JSON" abbreviation = "json" datatypes = { "auto": "INTEGER", "int": "INTEGER", "bigint": "INTEGER", "double": "REAL", "decimal": "REAL", "char": "TEXT", "bool": "INTEGER", } required_opts = [ ("table_name", "Format of table name", os.path.join(DATA_DIR, "{db}_{table}.json")), ] table_names = [] def create_db(self): """Override create_db since there is no database just a JSON file""" return None def create_table(self): """Create the table by creating an empty json file""" self.output_file = open_fw(self.table_name()) self.output_file.write("[") self.table_names.append((self.output_file, self.table_name())) self.auto_column_number = 1 def disconnect(self): """Close out the JSON with a `\\n]}` and close the file. Close all the file objects that have been created Re-write the files stripping off the last comma and then close with a `\\n]}`. """ if self.table_names: for output_file_i, file_name in self.table_names: output_file_i.close() current_input_file = open_fr(file_name) file_contents = current_input_file.readlines() current_input_file.close() file_contents[-1] = file_contents[-1].strip(',\n') current_output_file = open_fw(file_name) current_output_file.writelines(file_contents) current_output_file.writelines(['\n]']) current_output_file.close() self.table_names = [] def execute(self, statement, commit=True): """Write a line to the output file""" self.output_file.writelines(statement) def format_insert_value(self, value, datatype): """Formats a value for an insert statement""" v = Engine.format_insert_value(self, value, datatype, escape=False, processed=True) if v == 'null': return "" try: if len(v) > 1 and v[0] == v[-1] == "'": v = '"%s"' % v[1:-1] except: pass return v def insert_statement(self, values): if not hasattr(self, 'auto_column_number'): self.auto_column_number = 1 keys = self.table.get_insert_columns(join=False, create=True) if self.table.columns[0][1][0][3:] == 'auto': newrows = [] for rows in values: insert_stmt = [self.auto_column_number] + rows newrows.append(insert_stmt) self.auto_column_number += 1 else: newrows = values json_dumps = [] for line_data in newrows: tuples = (zip(keys, line_data)) write_data = OrderedDict(tuples) json_dumps.append(json.dumps(write_data, ensure_ascii=False) + ",") return json_dumps tuples = (zip(keys, [value for value in values])) write_data = OrderedDict(tuples) return json.dumps(write_data, ensure_ascii=False) def table_exists(self, dbname, tablename): """Check to see if the data file currently exists""" tablename = self.table_name(name=tablename, dbname=dbname) return os.path.exists(tablename) def to_csv(self): """Export table from json engine to CSV file""" for keys in list(self.script.tables): table_name = self.opts['table_name'].format(db=self.db_name, table=keys) header = self.script.tables[keys].get_insert_columns(join=False, create=True) csv_outfile = json2csv(table_name, header_values=header) sort_csv(csv_outfile) def get_connection(self): """Gets the db connection.""" self.get_input() return DummyConnection() PK<šGJdÃð«´ ´ retriever/engines/sqlite.pyfrom builtins import range import os from retriever.lib.models import Engine, no_cleanup from retriever import DATA_DIR class engine(Engine): """Engine instance for SQLite.""" name = "SQLite" abbreviation = "sqlite" datatypes = { "auto": ("INTEGER", "AUTOINCREMENT"), "int": "INTEGER", "bigint": "INTEGER", "double": "REAL", "decimal": "REAL", "char": "TEXT", "bool": "INTEGER", } required_opts = [("file", "Enter the filename of your SQLite database", os.path.join(DATA_DIR, "sqlite.db"), ""), ("table_name", "Format of table name", "{db}_{table}"), ] def create_db(self): """SQLite doesn't create databases; each database is a file and needs a separate connection.""" return None def escape_single_quotes(self, line): """Escapes single quotes in the line""" return line.replace("'", "''") def get_bulk_insert_statement(self): """Get insert statement for bulk inserts This places ?'s instead of the actual values so that executemany() can operate as designed """ columns = self.table.get_insert_columns() types = self.table.get_column_datatypes() column_count = len(self.table.get_insert_columns(False)) insert_stmt = "INSERT INTO " + self.table_name() insert_stmt += " (" + columns + ")" insert_stmt += " VALUES (" for i in range(0, column_count): insert_stmt += "?, " insert_stmt = insert_stmt.rstrip(", ") + ")" return insert_stmt def table_exists(self, dbname, tablename): """Determine if the table already exists in the database""" if not hasattr(self, 'existing_table_names'): self.cursor.execute( "SELECT name FROM sqlite_master WHERE type='table';") self.existing_table_names = set() for line in self.cursor: self.existing_table_names.add(line[0].lower()) return self.table_name(name=tablename, dbname=dbname).lower() in self.existing_table_names def to_csv(self): Engine.to_csv(self) def get_connection(self): """Gets the db connection.""" import sqlite3 as dbapi self.get_input() return dbapi.connect(self.opts["file"]) PK<šGJ±Ó~j¦¦retriever/engines/msaccess.pyfrom __future__ import print_function from builtins import str import os from retriever.lib.models import Engine, no_cleanup from retriever import DATA_DIR, current_platform class engine(Engine): """Engine instance for Microsoft Access.""" name = "Microsoft Access" instructions = """Create a database in Microsoft Access, close Access, then \nselect your database file using this dialog.""" abbreviation = "msaccess" datatypes = { "auto": "AUTOINCREMENT", "int": "INTEGER", "bigint": "INTEGER", "double": "NUMERIC", "decimal": "NUMERIC", "char": "VARCHAR", "bool": "BIT", } required_opts = [("file", "Enter the filename of your Access database", os.path.join(DATA_DIR, "access.mdb"), "Access databases (*.mdb, *.accdb)|*.mdb;*.accdb"), ("table_name", "Format of table name", "[{db} {table}]"), ] def convert_data_type(self, datatype): """MS Access can't handle complex Decimal types""" converted = Engine.convert_data_type(self, datatype) if "NUMERIC" in converted: converted = "NUMERIC" elif "VARCHAR" in converted: try: length = int(converted.split('(')[1].split(')')[0].split(',')[0]) if length > 255: converted = "TEXT" except: pass return converted def create_db(self): """MS Access doesn't create databases.""" return None def drop_statement(self, objecttype, objectname): """Returns a drop table or database SQL statement.""" dropstatement = "DROP %s %s" % (objecttype, objectname) return dropstatement def escape_single_quotes(self, value): """Escapes the single quotes in the value""" return value.replace("'", "''") def insert_data_from_file(self, filename): """Perform a bulk insert.""" self.get_cursor() ct = len([True for c in self.table.columns if c[1][0][:3] == "ct-"]) != 0 if ((self.table.cleanup.function == no_cleanup and not self.table.fixed_width and self.table.header_rows < 2) and (self.table.delimiter in ["\t", ","]) and not ct and (not hasattr(self.table, "do_not_bulk_insert") or not self.table.do_not_bulk_insert) ): print ("Inserting data from " + os.path.basename(filename) + "...") if self.table.delimiter == "\t": fmt = "TabDelimited" elif self.table.delimiter == ",": fmt = "CSVDelimited" if self.table.header_rows == 1: hdr = "Yes" else: hdr = "No" columns = self.table.get_insert_columns() need_to_delete = False add_to_record_id = 0 if self.table.pk and not self.table.contains_pk: if '.' in os.path.basename(filename): proper_name = filename.split('.') newfilename = '.'.join((proper_name[0:-1]) if len(proper_name) > 0 else proper_name[0] ) + "_new." + filename.split(".")[-1] else: newfilename = filename + "_new" if not os.path.isfile(newfilename): print("Adding index to " + os.path.abspath(newfilename) + "...") read = open(filename, "rb") write = open(newfilename, "wb") to_write = "" for line in read: to_write += str(id) + self.table.delimiter + line.replace("\n", "\r\n") add_to_record_id += 1 self.table.record_id += add_to_record_id write.write(to_write) write.close() read.close() need_to_delete = True columns = "record_id, " + columns else: newfilename = filename newfilename = os.path.abspath(newfilename) filename_length = (len(os.path.basename(newfilename)) * -1) - 1 filepath = newfilename[:filename_length] statement = """ INSERT INTO """ + self.table_name() + " (" + columns + """) SELECT * FROM [""" + os.path.basename(newfilename) + '''] IN "''' + filepath + '''" "Text;FMT=''' + fmt + ''';HDR=''' + hdr + ''';"''' try: self.execute(statement) except: print("Couldn't bulk insert. Trying manual insert.") self.connection.rollback() self.table.record_id -= add_to_record_id return Engine.insert_data_from_file(self, filename) if need_to_delete: os.remove(newfilename) else: return Engine.insert_data_from_file(self, filename) def table_exists(self, dbname, tablename): """Determine if the table already exists in the database""" if not hasattr(self, 'existing_table_names'): self.existing_table_names = set() for row in self.cursor.tables(): tableinfo = row[2] if not tableinfo.startswith("MSys"): # ignore system tables database, table = tableinfo.split() self.existing_table_names.add((database, table)) return self.table_name(name=tablename, dbname=dbname).lower() in self.existing_table_names def get_connection(self): """Gets the db connection.""" if current_platform != "windows": raise Exception("MS Access can only be used in Windows.") import pypyodbc as dbapi self.get_input() if not os.path.exists(self.opts['file']) and self.opts['file'].endswith('.mdb'): dbapi.win_create_mdb(self.opts['file']) connection_string = ("DRIVER={Microsoft Access Driver (*.mdb, *.accdb)};DBQ=" + os.path.abspath(self.opts["file"]).replace("/", "//") + ";") return dbapi.connect(connection_string, autocommit=False) PK<šGJW¹MYýý"retriever/engines/download_only.pyfrom __future__ import print_function from builtins import object import os import platform import shutil import inspect from retriever.lib.engine import filename_from_url from retriever.lib.models import Engine, no_cleanup from retriever import DATA_DIR, HOME_DIR class DummyConnection(object): def cursor(self): pass def commit(self): pass def rollback(self): pass def close(self): pass class DummyCursor(DummyConnection): pass class engine(Engine): """Engine instance for writing data to a CSV file.""" name = "Download Only" abbreviation = "download" required_opts = [("path", "File path to copy data files", "./"), ("subdir", "Keep the subdirectories for archived files", False) ] def table_exists(self, dbname, tablename): """Checks if the file to be downloaded already exists""" try: tablename = self.table_name(name=tablename, dbname=dbname) return os.path.exists(tablename) except: return False def get_connection(self): """Gets the db connection.""" self.get_input() return DummyConnection() def final_cleanup(self): """Copies downloaded files to desired directory Copies the downloaded files into the chosen directory unless files with the same name already exist in the directory. """ if hasattr(self, "all_files"): for file_name in self.all_files: file_path, file_name_nopath = os.path.split(file_name) subdir = os.path.split(file_path)[1] if self.opts['subdir'] else '' dest_path = os.path.join(self.opts['path'], subdir) if os.path.isfile(os.path.join(dest_path, file_name_nopath)): print ("File already exists at specified location") elif os.path.abspath(file_path) == os.path.abspath(os.path.join(DATA_DIR, subdir)): print ("%s is already in the working directory" % file_name_nopath) print("Keeping existing copy.") else: print("Copying %s from %s" % (file_name_nopath, file_path)) if os.path.isdir(dest_path): try: shutil.copy(file_name, dest_path) except: print("Couldn't copy file to %s" % dest_path) else: try: print("Creating directory %s" % dest_path) os.makedirs(dest_path) shutil.copy(file_name, dest_path) except: print("Couldn't create directory %s" % dest_path) self.all_files = set() def auto_create_table(self, table, url=None, filename=None, pk=None): """Download the file if it doesn't exist""" if url and not filename: filename = filename_from_url(url) if url and not self.find_file(filename): # If the file doesn't exist, download it self.download_file(url, filename) def insert_data_from_url(self, url): """Insert data from a web resource""" filename = filename_from_url(url) find = self.find_file(filename) if not find: self.create_raw_data_dir() self.download_file(url, filename) def find_file(self, filename): """Checks for the given file and adds it to the list of all files""" result = Engine.find_file(self, filename) if not hasattr(self, "all_files"): self.all_files = set() if result: self.all_files.add(result) return result def register_files(self, filenames): """Identify a list of files to be moved by the download When downloading archives with multiple files the engine needs to be informed of all of the file names so that it can move them. """ full_filenames = {self.find_file(filename) for filename in filenames if self.find_file(filename)} self.all_files = self.all_files.union(full_filenames) # replace all other methods with a function that does nothing def dummy_method(self, *args, **kwargs): pass methods = inspect.getmembers(engine, predicate=inspect.ismethod) keep_methods = {'table_exists', 'get_connection', 'final_cleanup', 'auto_create_table', 'insert_data_from_url', } remove_methods = ['insert_data_from_file', 'create_db'] for name, method in methods: if (name not in keep_methods and 'download' not in name and 'file' not in name and 'dir' not in name): setattr(engine, name, dummy_method) for name in remove_methods: setattr(engine, name, dummy_method) PK<šGJ£ÃÚààretriever/engines/postgres.pyimport os from retriever.lib.models import Engine, no_cleanup class engine(Engine): """Engine instance for PostgreSQL.""" name = "PostgreSQL" abbreviation = "postgres" datatypes = { "auto": "serial", "int": "integer", "bigint": "bigint", "double": "double precision", "decimal": "decimal", "char": "varchar", "bool": "boolean", } max_int = 2147483647 required_opts = [("user", "Enter your PostgreSQL username", "postgres"), ("password", "Enter your password", ""), ("host", "Enter your PostgreSQL host", "localhost"), ("port", "Enter your PostgreSQL port", 5432), ("database", "Enter your PostgreSQL database name", "postgres"), ("database_name", "Format of schema name", "{db}"), ("table_name", "Format of table name", "{db}.{table}"), ] def create_db_statement(self): """In PostgreSQL, the equivalent of a SQL database is a schema.""" return Engine.create_db_statement(self).replace("DATABASE", "SCHEMA") def create_db(self): """Creates the database""" try: Engine.create_db(self) except: self.connection.rollback() pass def create_table(self): """PostgreSQL needs to commit operations individually.""" Engine.create_table(self) self.connection.commit() def drop_statement(self, objecttype, objectname): """In PostgreSQL, the equivalent of a SQL database is a schema.""" statement = Engine.drop_statement(self, objecttype, objectname) statement += " CASCADE;" return statement.replace(" DATABASE ", " SCHEMA ") def escape_single_quotes(self, value): """Escapes single quotes in the value""" return value.replace("'", "''") def insert_data_from_file(self, filename): """Use PostgreSQL's "COPY FROM" statement to perform a bulk insert.""" self.get_cursor() ct = len([True for c in self.table.columns if c[1][0][:3] == "ct-"]) != 0 if (([self.table.cleanup.function, self.table.delimiter, self.table.header_rows] == [no_cleanup, ",", 1]) and not self.table.fixed_width and not ct and (not hasattr(self.table, "do_not_bulk_insert") or not self.table.do_not_bulk_insert)): columns = self.table.get_insert_columns() filename = os.path.abspath(filename) statement = """ COPY """ + self.table_name() + " (" + columns + """) FROM '""" + filename.replace("\\", "\\\\") + """' WITH DELIMITER ',' CSV HEADER;""" try: self.execute("BEGIN") self.execute(statement) self.execute("COMMIT") except: self.connection.rollback() return Engine.insert_data_from_file(self, filename) else: return Engine.insert_data_from_file(self, filename) def insert_statement(self, values): """Returns a SQL statement to insert a set of values""" statement = Engine.insert_statement(self, values) if isinstance(statement, bytes): statement = statement.decode("utf-8", "ignore") return statement def table_exists(self, dbname, tablename): """Checks to see if the given table exists""" if not hasattr(self, 'existing_table_names'): self.cursor.execute( "SELECT schemaname, tablename FROM pg_tables WHERE schemaname NOT LIKE 'pg_%';") self.existing_table_names = set() for schema, table in self.cursor: self.existing_table_names.add((schema.lower(), table.lower())) return (dbname.lower(), tablename.lower()) in self.existing_table_names def format_insert_value(self, value, datatype): """Formats a value for an insert statement""" if datatype == "bool": try: if int(value) == 1: return "TRUE" elif int(value) == 0: return "FALSE" except: pass return Engine.format_insert_value(self, value, datatype) def get_connection(self): """Gets the db connection.""" import psycopg2 as dbapi self.get_input() conn = dbapi.connect(host=self.opts["host"], port=int(self.opts["port"]), user=self.opts["user"], password=self.opts["password"], database=self.opts["database"]) conn.set_client_encoding('Latin1') return conn PK<šGJå|è  retriever/engines/csvengine.pyfrom builtins import str from builtins import object import os import io import sys import csv from retriever.lib.models import Engine from retriever import DATA_DIR, open_fw, open_csvw from retriever.lib.tools import sort_csv class DummyConnection(object): def cursor(self): pass def commit(self): pass def rollback(self): pass def close(self): pass class DummyCursor(DummyConnection): pass class engine(Engine): """Engine instance for writing data to a CSV file.""" name = "CSV" abbreviation = "csv" datatypes = { "auto": "INTEGER", "int": "INTEGER", "bigint": "INTEGER", "double": "REAL", "decimal": "REAL", "char": "TEXT", "bool": "INTEGER", } required_opts = [ ("table_name", "Format of table name", os.path.join(DATA_DIR, "{db}_{table}.csv")), ] table_names = [] def create_db(self): """Override create_db since there is no database just a CSV file""" return None def create_table(self): """Create the table by creating an empty csv file""" self.auto_column_number = 1 self.file = open_fw(self.table_name()) self.output_file = open_csvw(self.file) self.output_file.writerow([u'{}'.format(val) for val in self.table.get_insert_columns(join=False,create=True)]) self.table_names.append((self.file, self.table_name())) def disconnect(self): """Close the last file in the dataset""" for output_tuple in self.table_names: output_tuple[0].close() def execute(self, statement, commit=True): """Write a line to the output file""" self.output_file.writerows(statement) def format_insert_value(self, value, datatype): """Formats a value for an insert statement""" v = Engine.format_insert_value(self, value, datatype, escape=False, processed=True) if v == 'null': return "" try: if len(v) > 1 and v[0] == v[-1] == "'": v = '"%s"' % v[1:-1] except: pass return v def insert_statement(self, values): """Returns a comma delimited row of values""" if not hasattr(self, 'auto_column_number'): self.auto_column_number = 1 if self.table.columns[0][1][0][3:] == 'auto': newrows = [] for rows in values: insert_stmt = [self.auto_column_number] + rows newrows.append(insert_stmt) self.auto_column_number += 1 return newrows else: return values def table_exists(self, db_name, table_name): """Check to see if the data file currently exists""" table_name = self.table_name(name=table_name, dbname=db_name) return os.path.exists(table_name) def to_csv(self): """Export sorted version of CSV file""" for keys in self.script.tables: table_name = self.opts['table_name'].format(db=self.db_name, table=keys) sort_csv(table_name) def get_connection(self): """Gets the db connection.""" self.get_input() return DummyConnection() PKBˆXJ^-Ò )retriever-2.0.0.dist-info/DESCRIPTION.rstUNKNOWN PKBˆXJ{­Ìc77*retriever-2.0.0.dist-info/entry_points.txt[console_scripts] retriever = retriever.__main__:main PKBˆXJÙ:ÿÂ[['retriever-2.0.0.dist-info/metadata.json{"classifiers": ["Intended Audience :: Science/Research", "License :: OSI Approved :: MIT License", "Programming Language :: Python", "Programming Language :: Python :: 2", "Programming Language :: Python :: 3"], "extensions": {"python.commands": {"wrap_console": {"retriever": "retriever.__main__:main"}}, "python.details": {"contacts": [{"email": "ethan@weecology.org", "name": "Ben Morris, Akash Goel, Henry Senyondo, Ethan White", "role": "author"}], "document_names": {"description": "DESCRIPTION.rst"}, "project_urls": {"Home": "https://github.com/weecology/retriever"}}, "python.exports": {"console_scripts": {"retriever": "retriever.__main__:main"}}}, "extras": [], "generator": "bdist_wheel (0.26.0)", "metadata_version": "2.0", "name": "retriever", "run_requires": [{"requires": ["future", "xlrd"]}], "summary": "Data Retriever", "version": "2.0.0"}PKBˆXJ’× 'retriever-2.0.0.dist-info/top_level.txtretriever PKBˆXJìndªnnretriever-2.0.0.dist-info/WHEELWheel-Version: 1.0 Generator: bdist_wheel (0.26.0) Root-Is-Purelib: true Tag: py2-none-any Tag: py3-none-any PKBˆXJ1ŸÐe%%"retriever-2.0.0.dist-info/METADATAMetadata-Version: 2.0 Name: retriever Version: 2.0.0 Summary: Data Retriever Home-page: https://github.com/weecology/retriever Author: Ben Morris, Akash Goel, Henry Senyondo, Ethan White Author-email: ethan@weecology.org License: UNKNOWN Platform: UNKNOWN Classifier: Intended Audience :: Science/Research Classifier: License :: OSI Approved :: MIT License Classifier: Programming Language :: Python Classifier: Programming Language :: Python :: 2 Classifier: Programming Language :: Python :: 3 Requires-Dist: future Requires-Dist: xlrd UNKNOWN PKBˆXJ8òÀÊÊ Ê retriever-2.0.0.dist-info/RECORDretriever/__init__.py,sha256=7x2LTDEV-iD9tPWsdwrJ_TvdhUWhvI0oPUL-6A784E0,8279 retriever/__main__.py,sha256=T8M6EfV6rAjiQHUzoATmYJatKYl5WQganyLsuCbfeNM,7249 retriever/_version.py,sha256=7ujN2-4P4fezkmKSLS5qMCWRjIlrfYCpaPnEQ8PzvdI,23 retriever/compile.py,sha256=kaq9P_PqsP3AlrdbyeRF7-TEEmmwe3CMrlBEvorMTe0,269 retriever/lscolumns.py,sha256=EKRr2kE52_QjgOr7wS5m5uK0Cb1A551eZ4nfpO-AdCI,1470 retriever/term_size.py,sha256=Zxno4Rsfvy5i-n2ZOvQympBVfFVlZRhHSBn1deVucsg,727 retriever/try_install_all.py,sha256=NrK7nOlfpXhjvCPnJbtzwXwXxEayI409nECzuQ2P9FE,3521 retriever/version.py,sha256=axm4SEu_DDItzIQ26S-f_B96_zgWwVfap4_uDn0N0nk,1270 retriever/engines/__init__.py,sha256=ZhgoxCrUcJDl7Jm04amJKD5vK7bEJZ5-XKP1Z0QpvTc,383 retriever/engines/csvengine.py,sha256=vkRZUnsHoXQ4D-VgQQ8TeOKizMNFWNDheyenF-3tJUk,3266 retriever/engines/download_only.py,sha256=gpkBs03PeLxSg7teWS-F5SHDyb6Xog89lYElOOoIacU,5117 retriever/engines/jsonengine.py,sha256=lq_MXf1y_sqw3AA0XOV9pPUi0CG8xeMl75XOpzphUJo,4526 retriever/engines/msaccess.py,sha256=u6Y5JgGAnzJ4rB0ghe8a-lWEQrSlrHL4cFasKhSA3pI,6310 retriever/engines/mysql.py,sha256=NFfP70KaidkEq5QLhy0k9XQThGinVurkJnmARilJ0Jw,4650 retriever/engines/postgres.py,sha256=2iG9-4Db3AfsC2k0eFVwt22viJN56g1xW208tHM4Q9M,5088 retriever/engines/sqlite.py,sha256=fER9tagBt5znYiznaQjI8OXRfhLLjkfJ1b6Qk5MjYK8,2484 retriever/engines/xmlengine.py,sha256=XUmFQmpLQAJQ0Mvn0O-we4QHW1hx_zVv1v-793vpkRg,4383 retriever/lib/__init__.py,sha256=Njb9Jb2B7GSu6Zp5d9tnwHlqgdysz6EIE55zLfDO53Q,63 retriever/lib/cleanup.py,sha256=jEjUaRvtrC-v29DXKLhUp3Tf_yjh7aJacsdr-cjmn_I,940 retriever/lib/compile.py,sha256=olRTivWdq2pUU0trx4rViTHw2lHb3sfu_aoH3fat5tE,5551 retriever/lib/datapackage.py,sha256=3T0ycKXCHhbOa4zmaAU8Apkv4FYx7ed9vBpDX6z5w8M,16505 retriever/lib/engine.py,sha256=qpl-gFk3ZONcH9o5OcXy7SJJD22z19Pj3afjytNJ4K8,31254 retriever/lib/excel.py,sha256=WORssmNvhtQEFJ3zwhbkfoCKwktKG-MCDEh-4WyRIII,721 retriever/lib/get_opts.py,sha256=aTu0DCr6mp520EFPBIKg4ONr5xUCLFAFejT25gz1N0E,4557 retriever/lib/models.py,sha256=nAyUlZe3V8qTVJiDa7DbFHby1lfXlxxCBpPCI7LqdJI,214 retriever/lib/parse_script_to_json.py,sha256=3S0ygjSWh05cYIgQLBxY-DGbsQUQaBYqMmmwKg1U-fI,4292 retriever/lib/repository.py,sha256=qeJDb09siC0QObjsXZkVdlx-e0kG_qFCkRYIdPljbos,3844 retriever/lib/table.py,sha256=qLEw6_RyfhtUunMw7oB0pTA0sAG2WNnkz4Qtai_HoOM,6202 retriever/lib/templates.py,sha256=b1n9jmtUi-ZuWgofAVpsw4V9xmzWVEEbwNpJ7Jui_VI,4776 retriever/lib/tools.py,sha256=fbvKKXl3kCQk9vgt8P2z8Tr-iITF9eEICZbrqcEClW0,11128 retriever/lib/warning.py,sha256=prkyxn5tdgRc5r3qEmfCxSb05vtrPLZ3sGALOTyuIjA,249 retriever-2.0.0.data/data/CITATION,sha256=j4dGFLzR4DsE3Zvn80R9fWESwg3z4o2RDVPLTF4PruM,491 retriever-2.0.0.dist-info/DESCRIPTION.rst,sha256=OCTuuN6LcWulhHS3d5rfjdsQtW22n7HENFRh6jC6ego,10 retriever-2.0.0.dist-info/METADATA,sha256=wZ6XXTrqomJ-N8vmBgQpOv5Zl3tnTDdFIViD_BISFhg,549 retriever-2.0.0.dist-info/RECORD,, retriever-2.0.0.dist-info/WHEEL,sha256=GrqQvamwgBV4nLoJe0vhYRSWzWsx7xjlt74FT0SWYfE,110 retriever-2.0.0.dist-info/entry_points.txt,sha256=imLI3cuO8LYg3lPN23KSEGAqu8tDALNsVz3JTn83OlI,55 retriever-2.0.0.dist-info/metadata.json,sha256=sltuuneopDSBrTzwVruOVC83LlKQ3uXXRfmsMUT0RSs,859 retriever-2.0.0.dist-info/top_level.txt,sha256=dbHHtqGXUEUzkJvQccDHAT7gGBQjTdndc64V6l1Dap4,10 PK<šGJogëë"retriever-2.0.0.data/data/CITATIONPK½nXJ¿¡“KQQ+retriever/__main__.pyPKBˆXJ^þÑÚ¯retriever/_version.pyPK<šGJ¯Éã³¾¾ùretriever/lscolumns.pyPK½nXJ‹êýW W ë$retriever/__init__.pyPK<šGJƒP  uEretriever/compile.pyPK<šGJ|žqŒöö´Fretriever/version.pyPK<šGJb¶„ªÁ Á ÜKretriever/try_install_all.pyPK<šGJC6¯×××Yretriever/term_size.pyPK<šGJ—úRs??â\retriever/lib/__init__.pyPK<šGJPý—¯¯X]retriever/lib/compile.pyPK<šGJ²ÌÕ{x+x+=sretriever/lib/tools.pyPK<šGJµ=•‘::éžretriever/lib/table.pyPK<šGJô僽¬¬W·retriever/lib/cleanup.pyPK<šGJŸMÞ!¨¨9»retriever/lib/templates.pyPK½nXJ–ƒ!vÍÍÎretriever/lib/get_opts.pyPK<šGJ,§%ŽÖÖàretriever/lib/models.pyPK<šGJܶHzz(áretriever/lib/engine.pyPK<šGJÌ¥K”ùùs[retriever/lib/warning.pyPK<šGJÝZ/ÄÄ%¢\retriever/lib/parse_script_to_json.pyPK<šGJP2òÑÑ©mretriever/lib/excel.pyPK<šGJ"K¾®pretriever/lib/repository.pyPK<šGJàߣPy@y@ëretriever/lib/datapackage.pyPK<šGJÔ0ðVžÀretriever/engines/xmlengine.pyPK<šGJš©CÌ**ùÑretriever/engines/mysql.pyPK<šGJ€Åe[äretriever/engines/__init__.pyPK<šGJÛ7Ñ5®®æretriever/engines/jsonengine.pyPK<šGJdÃð«´ ´ øretriever/engines/sqlite.pyPK<šGJ±Ó~j¦¦íretriever/engines/msaccess.pyPK<šGJW¹MYýý"Îretriever/engines/download_only.pyPK<šGJ£ÃÚàà /retriever/engines/postgres.pyPK<šGJå|è  &Cretriever/engines/csvengine.pyPKBˆXJ^-Ò )$Pretriever-2.0.0.dist-info/DESCRIPTION.rstPKBˆXJ{­Ìc77*uPretriever-2.0.0.dist-info/entry_points.txtPKBˆXJÙ:ÿÂ[['ôPretriever-2.0.0.dist-info/metadata.jsonPKBˆXJ’× '”Tretriever-2.0.0.dist-info/top_level.txtPKBˆXJìndªnnãTretriever-2.0.0.dist-info/WHEELPKBˆXJ1ŸÐe%%"ŽUretriever-2.0.0.dist-info/METADATAPKBˆXJ8òÀÊÊ Ê óWretriever-2.0.0.dist-info/RECORDPK''> ûd