#!/usr/bin/env python

from __future__ import print_function
from __future__ import unicode_literals

# Export of CODEX events table to EventFlow [1] input file format [2]
# [1] http://hcil.umd.edu/eventflow/
# [2] http://www.cs.umd.edu/hcil/eventflow/manual/chapter_start.html#1.4

import csv
import gzip
import logging
import logging.config
import sys, os

import docopt
import psycopg2
import psycopg2.extras
import six

# parse the command line; docopt derives the accepted options, their
# defaults, and the positional arguments directly from the usage/help
# text below, so this string is both the documentation and the parser spec
options = docopt.docopt("""
usage: codex-to-eventflow [options] <output_file>
                          [--group-events <rule>]...
                          [--include-events <class>]...
                          [--exclude-events <class>]...

options:
  -s <string>, --schema <string>    Database schema [default: public]
  -d <string>, --database <string>  Database name or URI [default: postgres]
  --events-table <string>           Events table name [default: codex_events]
  --group-events <rule>             Group events following a rule (see below)
  --include-events <class>          Exclude any event not of this class
  --exclude-events <class>          Exclude any event of this class
  --no-attributes                   Exclude the event attributes
  -v, --verbose                     Display additional debugging information
  -h, --help                        Display this help then exit

notes:
  - by default all events from --events-table are exported
  - the --group-events option can be used to group events under a given name
    based on them having a given value for a key in the 'event_data' field;
    the grouping rule is <name>,<class>[.<key>],<value>[,<value>...] with <name>
    the resulting group name, <class> the class of events to group, <key> one
    of the key in 'event_data', and <value> one or more values a given event
    must have for the <key> to be placed in this <name> group. For example,
    "--group-events my_drug,drug_exposure.drug_concept_id,1,2,3,4" will create
    a group 'my_drug' out of all 'drug_exposure' events for which the
    'drug_concept_id' field in 'event_data' has a value of 1 to 4. If no <key>
    is provided a default concept_id field is selected
  - the --include-events option behaves as a white list; i.e., only event
    classes in this list will be considered, excluding all others
  - the --exclude-events option behaves as a black list; i.e., only event
    classes not in this list will be considered
  - if both --include-events and --exclude-events are used only event classes
    in the first list and not in the second one will be considered
""")

# route every logger to a stream handler with a timestamped format;
# --verbose lowers the root threshold from INFO to DEBUG
_LOGGING_CONFIG = {
    "version": 1,
    "disable_existing_loggers": True,
    "formatters": {
        "default": {
            "format": "[%(asctime)s] %(levelname)s: %(message)s",
            },
        },
    "handlers": {
        "default": {
            "class": "logging.StreamHandler",
            "formatter": "default",
            },
        },
    "loggers": {
        "": {
            "handlers": ["default"],
            "level": logging.DEBUG if options["--verbose"] else logging.INFO,
            "propagate": True,
            },
        },
    }

logging.config.dictConfig(_LOGGING_CONFIG)

logger = logging.getLogger(os.path.basename(__file__))

def error (msg, is_exception = False):
    """Report a fatal error and abort the program with exit status 1.

    When *is_exception* is set and --verbose was given, the current
    exception traceback is logged as well; otherwise only *msg* is logged.
    """
    # pick the logging function first, then emit the message once;
    # options["--verbose"] is only consulted when is_exception is set
    report = logger.exception if (is_exception and options["--verbose"]) \
             else logger.error
    report(msg)
    sys.exit(1)

# default concept to report for each event class; maps an event class
# name to the key in 'event_data' holding its main concept identifier
DEFAULT_CONCEPT = dict(
    observation_period = "period_type_concept_id",
    specimen_collection = "specimen_concept_id",
    death = "death_type_concept_id",
    visit = "visit_concept_id",
    procedure = "procedure_concept_id",
    drug_exposure = "drug_concept_id",
    device_exposure = "device_concept_id",
    condition = "condition_concept_id",
    measurement = "measurement_concept_id",
    observation = "observation_concept_id")

# parse each --group-events rule into (event_class, key, values), keyed
# by the user-chosen group name; rule syntax is
#   <name>,<class>[.<key>],<value>[,<value>...]
group_rules = {}
for entry in options["--group-events"]:
    group_rule = entry.split(',')

    if (len(group_rule) < 3):
        # NOTE: the option is --group-events (the previous message
        # referred to a non-existent --events-group option)
        error("invalid --group-events rule: %s" % entry)

    (group_name, class_and_key), values = group_rule[:2], group_rule[2:]

    # <class> may optionally carry a single '.'-separated <key>
    if (class_and_key.count('.') == 0):
        event_class, key = class_and_key, None
    elif (class_and_key.count('.') == 1):
        event_class, key = class_and_key.split('.')
    else:
        error(("invalid --group-events rule: "
               "invalid class and key"))

    if (event_class not in DEFAULT_CONCEPT):
        error(("invalid --group-events rule: "
               "unknown event class: %s" % event_class))

    # no explicit <key>: default to the class' main concept identifier
    if (key is None):
        key = DEFAULT_CONCEPT[event_class]

    if (group_name in group_rules):
        error(("invalid --group-events rule: "
               "group name '%s' is already used" % group_name))

    group_rules[group_name] = (event_class, key, values)

# optional white/black lists of event classes (--include-events /
# --exclude-events); None means "no filtering on that side".  A set is
# the idiomatic structure for pure membership tests (the previous code
# used dict.fromkeys, i.e. a dict with all-None values, for the same
# purpose); only 'in' tests are performed on these downstream
event_class_white_list = None
if options["--include-events"]:
    event_class_white_list = set(options["--include-events"])

event_class_black_list = None
if options["--exclude-events"]:
    event_class_black_list = set(options["--exclude-events"])

# whether the 'event_data' attributes are exported (--no-attributes off)
include_attributes = (not options["--no-attributes"])

def force_print (text):
    """Write *text* plus a newline to stdout and flush immediately,
    so progress messages appear even when stdout is block-buffered."""
    sys.stdout.write("%s\n" % (text,))
    sys.stdout.flush()

# announce the export target up front, before any long-running work
force_print("exporting events from '%s.%s'\n" % (
    options["--schema"], options["--events-table"]))

#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::

logger.info("connecting to database server")

logger.debug("psycopg2.__version__ = %s", psycopg2.__version__)

postgres_uri = options["--database"]
postgres_schema = options["--schema"]

try:
    # if the value provided is a URI, pass it as such
    if (postgres_uri.startswith("postgresql://")):
        connection = psycopg2.connect(postgres_uri)

    # if not, consider it to be a database name
    else:
        connection = psycopg2.connect(database = postgres_uri)

    # make blocking libpq calls interruptible (e.g. by Ctrl-C): psycopg2
    # will poll the connection via select() instead of waiting synchronously
    psycopg2.extensions.set_wait_callback(
        psycopg2.extras.wait_select)

    # scope unqualified table names (e.g. --events-table) to the
    # requested schema for the rest of this session
    # NOTE(review): this SET runs in a transaction that is not explicitly
    # committed here; it remains effective only while the session's
    # transaction is not rolled back -- confirm this is intended
    with connection.cursor() as cursor:
        cursor.execute("SET search_path TO %s", (postgres_schema,))

except psycopg2.Error as e:
    # any connection/setup failure is fatal; the traceback is logged
    # when --verbose was given (see error())
    error("error from database server:\n%s" % e, True)

logger.info("connecting to database server: done")

#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::

logger.info("exporting events")

def format_entry (entry):
    """Convert one row of the events table to an EventFlow record.

    Args:
        entry: 5-tuple (person_id, event_class, start_time, end_time,
            event_data) as produced by the export query; 'end_time' is the
            empty string for instantaneous events and 'event_data' is a
            mapping of the event attributes

    Returns:
        A (record_id, event_category, start_time, end_time, attributes)
        tuple ready for the CSV writer, or None if the event is excluded
        by the --include-events/--exclude-events options
    """
    person_id, event_class, start_time, end_time, event_data = entry

    # exclude this event if not in an existing white list
    if (event_class_white_list is not None) and \
       (event_class not in event_class_white_list):
        return None

    # exclude this event if in an existing black list
    if (event_class_black_list is not None) and \
       (event_class in event_class_black_list):
        return None

    # place this event in a user-defined group, if one matches
    event_category = None
    for (group_name, (event_class_, key, values)) in group_rules.items():
        if (event_class != event_class_):
            continue

        if (key not in event_data):
            continue

        # rule values come from the command line as strings, so the
        # event value is compared as a string too
        if (str(event_data[key]) in values):
            event_category = group_name
            break

    # if not, create a group out of the event main concept identifier
    if (event_category is None):
        relevant_concept = DEFAULT_CONCEPT.get(event_class)
        if (relevant_concept is not None):
            relevant_concept = event_data.get(relevant_concept)

        event_category = event_class.upper()
        if (relevant_concept is not None):
            # NOTE(review): assumes the concept identifier is an integer;
            # a non-numeric value would raise TypeError here -- confirm
            # against the events table schema
            event_category += "_%d" % relevant_concept

    # since EventFlow cannot manage a mix of instants and
    # periods for the same event category, we add a prefix
    event_category += "_instant" if (end_time == '') else "_period"

    # we transform the JSON-formated event data to EventFlow
    # "key=value; key=value" attribute specifications, skipping
    # empty attributes
    if (include_attributes):
        attributes = []
        for (key, value) in event_data.items():
            if (value is None) or (str(value) == ''):
                continue
            attributes.append("%s=%s" % (key, value))

        event_data = "; ".join(attributes)
    else:
        event_data = ''

    return person_id, event_category, start_time, end_time, event_data

# open the output file; a '.gz' suffix triggers on-the-fly compression
o_fn = options["<output_file>"]
if (o_fn.lower().endswith(".gz")):
    # NOTE(review): under Python 3, csv.writer emits str, which cannot be
    # written to a binary-mode gzip handle ("wt" would be needed); this
    # works only under Python 2 -- confirm the targeted interpreter
    o_fh = gzip.open(o_fn, "wb")
else:
    # NOTE(review): Python 3's csv module recommends newline='' here to
    # avoid blank lines on Windows -- same interpreter caveat as above
    o_fh = open(o_fn, "w")

o = csv.writer(o_fh)
n_entries = 0

# number of exported rows per progress mark (see the dot printing below)
BLOCK_SIZE = 10000

# the connection context manager commits on success, rolls back on error
with connection:
    try:
        # a named cursor makes psycopg2 use a server-side cursor, so
        # rows are streamed in batches instead of fetched all at once
        with connection.cursor("codex_events_cursor") as cursor:
            # NOTE(review): the table name is interpolated directly into
            # the SQL text; it comes from the --events-table command-line
            # option and cannot be parameterized as a value (it is an
            # identifier), but a hostile value could inject SQL --
            # acceptable for a local CLI tool, confirm
            cursor.execute("""\
                SELECT person_id,
                       event_class,
                       lower(event_period)::text AS start_time,
                       (CASE codex_is_period(event_period)
                            WHEN true THEN (upper(event_period)-1)::text
                            WHEN false THEN ''
                        END) AS end_time,
                       event_data
                  FROM %(events_table)s
                """ % {
                    "events_table": options["--events-table"]})

            for entry in cursor:
                # format_entry() returns None for filtered-out events
                entry = format_entry(entry)
                if (entry is None):
                    continue

                o.writerow(entry)
                n_entries += 1

                # textual progress bar: one dot per BLOCK_SIZE entries...
                if (n_entries % (1 * BLOCK_SIZE) == 0):
                    sys.stdout.write('.')
                    sys.stdout.flush()

                # ...grouped by five...
                if (n_entries % (5 * BLOCK_SIZE) == 0):
                    sys.stdout.write(' ')
                    sys.stdout.flush()

                # ...with a line break and a running total every 50 dots
                if (n_entries % (50 * BLOCK_SIZE) == 0):
                    sys.stdout.write('\n')
                    logger.info("exported {:,} events".format(n_entries))

    except (psycopg2.extensions.QueryCanceledError, KeyboardInterrupt):
        # user interruption (Ctrl-C) cancels the server-side query;
        # undo the transaction before reporting and exiting
        connection.rollback()
        sys.stdout.write('\n')
        error("operation cancelled by user")

o_fh.close()
sys.stdout.write('\n')

logger.info("exporting events: done ({:,} entries processed)".format(n_entries))
logger.info("all done.")
