# Source code for omniduct.databases.presto

from __future__ import absolute_import

import ast
import logging
import re
import sys

import pandas.io.sql
import six
from interface_meta import override
from future.utils import raise_with_traceback

from omniduct.utils.debug import logger

from .base import DatabaseClient
from ._schemas import SchemasMixin
from . import _pandas


class PrestoClient(DatabaseClient, SchemasMixin):
    """
    This Duct connects to a Facebook Presto server instance using the `pyhive`
    library.

    In addition to the standard `DatabaseClient` API, `PrestoClient` adds a
    `.schemas` descriptor attribute, which enables a tab completion driven
    exploration of a Presto database's schemas and tables.

    Attributes:
        catalog (str): The default catalog to use in database queries.
        schema (str): The default schema/database to use in database queries.
        server_protocol (str): The protocol used to reach the Presto REST
            service ('http' or 'https').
        source (str): The source label sent along with queries; falls back to
            'omniduct' when unset.
        connection_options (dict): Additional options to pass on to
            `pyhive.presto.connect(...)`.
    """

    # Class-level settings; presumably consumed by the `DatabaseClient`
    # machinery defined outside this file (verify against the base class).
    PROTOCOLS = ['presto']
    DEFAULT_PORT = 3506
    SUPPORTS_SESSION_PROPERTIES = True
    # Table identifiers are made of these parts, joined with the separator and
    # quoted with the quote character below.
    NAMESPACE_NAMES = ['catalog', 'schema', 'table']
    NAMESPACE_QUOTECHAR = '"'
    NAMESPACE_SEPARATOR = '.'

    @property
    @override
    def NAMESPACE_DEFAULT(self):
        # Default namespace parts: the catalog and schema currently configured
        # on this client.
        return dict(catalog=self.catalog, schema=self.schema)

    @property
    @override
    def NAMESPACE_DEFAULTS_WRITE(self):
        # Work on a copy of the read defaults so they are left untouched, then
        # point the schema at the current username.
        write_defaults = self.NAMESPACE_DEFAULTS_READ.copy()
        write_defaults.update(schema=self.username)
        return write_defaults

    @override
    def _init(self, catalog='default', schema='default', server_protocol='http', source=None, requests_session=None):
        """
        catalog (str): Catalog used by default in database queries.
        schema (str): Schema/database used by default in database queries.
        server_protocol (str): Protocol used to reach the Presto REST service,
            either 'http' or 'https'. (default='http')
        source (str): Label identifying the source of queries; "omniduct" is
            used when no (or an empty) value is given.
        requests_session (requests.Session): Optional requests.Session object
            handed to the pyhive Cursor, for advanced usage such as custom
            headers, cookie values, retry logic, etc.
        """
        # Assignment order is kept as-is in case attribute assignment is
        # intercepted by the base class (not visible from this file).
        self.catalog, self.schema = catalog, schema
        self.server_protocol = server_protocol
        self.source = source  # goes through the `source` property setter
        self.__presto = None
        self.connection_fields += ('catalog', 'schema')
        self._requests_session = requests_session

    @property
    def source(self):
        # Source label handed to pyhive cursors created by this client.
        return self._source

    @source.setter
    def source(self, source):
        # Any falsy value (None, '') falls back to the generic 'omniduct' label.
        if source:
            self._source = source
        else:
            self._source = 'omniduct'

    # Connection

    @override
    def _connect(self):
        from sqlalchemy import MetaData, create_engine

        # Silence pyhive logging.
        pyhive_logger = logging.getLogger('pyhive')
        pyhive_logger.setLevel(1000)

        logger.info('Connecting to Presto coordinator...')

        # The engine URL carries host, port, default catalog and default schema.
        engine_url = 'presto://{}:{}/{}/{}'.format(self.host, self.port, self.catalog, self.schema)
        self._sqlalchemy_engine = create_engine(engine_url)
        self._sqlalchemy_metadata = MetaData(self._sqlalchemy_engine)

    @override
    def _is_connected(self):
        # Connection state is tracked through the SQLAlchemy engine, which
        # `_connect` creates and `_disconnect` resets to None. The private
        # `__presto` handle is only ever assigned None in this class, so
        # testing it could never report a live connection.
        # `getattr` covers the case where `_connect` has not run yet and the
        # attribute does not exist.
        return getattr(self, '_sqlalchemy_engine', None) is not None

    @override
    def _disconnect(self):
        logger.info('Disconnecting from Presto coordinator...')

        # Best-effort close of the private handle: it may be missing (if
        # `_init` never ran) or None, and a failing `close()` must not prevent
        # the remaining state from being cleared.
        try:
            presto_handle = self.__presto
        except AttributeError:
            presto_handle = None

        if presto_handle is not None:
            try:
                presto_handle.close()
            except Exception:
                # Only ordinary errors are swallowed; KeyboardInterrupt and
                # SystemExit still propagate.
                logger.warn('Unable to cleanly close the Presto connection; continuing to disconnect.')
            self.__presto = None

        self._sqlalchemy_engine = None
        self._sqlalchemy_metadata = None
        self._schemas = None

    # Querying
    @override
    def _execute(self, statement, cursor, wait, session_properties):
        """
        If something goes wrong, `PrestoClient` will attempt to parse the error
        log and present the user with useful debugging information. If that fails,
        the full traceback will be raised instead.
        """
        # Runs `statement` on `cursor` (a new pyhive cursor is built when none
        # is supplied) and returns the cursor. When `wait` is truthy, the query
        # is polled until it reports FINISHED, updating the progress logger.
        from pyhive import presto  # Imported here due to slow import performance in Python 3
        from pyhive.exc import DatabaseError  # Imported here due to slow import performance in Python 3
        try:
            cursor = cursor or presto.Cursor(
                host=self.host, port=self.port, username=self.username, password=self.password,
                catalog=self.catalog, schema=self.schema, session_props=session_properties,
                poll_interval=1, source=self.source, protocol=self.server_protocol,
                requests_session=self._requests_session
            )
            cursor.execute(statement)
            status = cursor.poll()
            if wait:
                logger.progress(0)
                # status None means command executed successfully
                # See https://github.com/dropbox/PyHive/blob/master/pyhive/presto.py#L234
                while status is not None and status['stats']['state'] != "FINISHED":
                    # Progress is only reported once the total split count is known.
                    if status['stats'].get('totalSplits', 0) > 0:
                        pct_complete = round(status['stats']['completedSplits'] / float(status['stats']['totalSplits']), 4)
                        logger.progress(pct_complete * 100)
                    status = cursor.poll()
                logger.progress(100, complete=True)
            return cursor
        except (DatabaseError, pandas.io.sql.DatabaseError) as e:
            # Attempt to parse the database error before re-raising an exception
            # of the same type with the original traceback attached.
            exc_type, exc_value, exc_traceback = sys.exc_info()

            try:
                message = e.args[0]
                if isinstance(message, six.string_types):
                    # Extract the outermost `{...}` literal from the message text
                    # and evaluate it into a dict.
                    message = ast.literal_eval(re.match(r"[^{]*({.*})[^}]*$", message).group(1))

                # The reported line number is decremented to index into the
                # split statement.
                linenumber = message['errorLocation']['lineNumber'] - 1
                splt = statement.splitlines()
                splt[linenumber] += '   <--  {errorType} ({errorName}) occurred. {message} '.format(**message)
                # Show the offending line with up to one line of context on each side.
                context = '\n\n[Error Context]\n{}\n'.format('\n'.join([splt[ln] for ln in range(max(linenumber - 1, 0),
                                                                                                 min(linenumber + 2, len(splt)))]))

                class ErrContext(object):
                    # Wrapper whose repr is the context text, so the context
                    # appears verbatim when the exception args are rendered.

                    def __repr__(self):
                        return context

                # logged twice so that both notebook and console users see the error context
                # NOTE(review): the exception instance ends up inside its own
                # `args`; confirm this is intended.
                exc_value.args = [exc_value, ErrContext()]
                logger.error(context)
            except Exception:
                # Parsing is best-effort; only ordinary errors are swallowed so
                # that KeyboardInterrupt/SystemExit are not masked here.
                logger.warn(("Omniduct was unable to parse the database error messages. Refer to the "
                             "traceback below for full error details."))

            # Wrap the (possibly annotated) instance in a fresh exception of the
            # same type before re-raising.
            reraised = exc_type(exc_value) if isinstance(exc_type, type) else exc_type

            raise_with_traceback(reraised, exc_traceback)

    @override
    def _query_to_table(self, statement, table, if_exists, **kwargs):
        # Materialise `statement` into `table` with CREATE TABLE ... AS,
        # honouring the `if_exists` policy ('fail', 'replace' or 'append').
        if if_exists == 'append':
            raise NotImplementedError("Append operations have not been implemented for {}.".format(self.__class__.__name__))
        if if_exists == 'fail' and self.table_exists(table):
            raise RuntimeError("Table {} already exists!".format(table))

        create_sql = "CREATE TABLE {table} AS ({statement})".format(
            table=table,
            statement=statement
        )
        if if_exists == 'replace':
            # Drop any existing table first so the CREATE below can succeed.
            script = ['DROP TABLE IF EXISTS {};\n'.format(table), create_sql]
        else:
            script = [create_sql]
        return self.execute('\n'.join(script), **kwargs)

    @override
    def _dataframe_to_table(self, df, table, if_exists='fail', **kwargs):
        """
        If the schema namespace is not specified, `table.schema` will be
        defaulted to your username. Catalog overrides will be ignored, and will
        default to `self.catalog`.
        """
        # Only the table and schema parts of the namespace are forwarded; the
        # upload goes through the SQLAlchemy engine and omits the dataframe index.
        target_name = table.table
        target_schema = table.schema
        engine = self._sqlalchemy_engine
        return _pandas.to_sql(
            df=df,
            name=target_name,
            schema=target_schema,
            con=engine,
            index=False,
            if_exists=if_exists,
            **kwargs
        )

    @override
    def _cursor_empty(self, cursor):
        # This client always reports cursors as non-empty; the `cursor`
        # argument is not inspected.
        return False

    @override
    def _table_list(self, namespace, like=None, **kwargs):
        # Builds `SHOW TABLES [FROM <namespace>] [LIKE '<pattern>']` and runs it
        # through `self.query`, forwarding any extra keyword arguments.
        clauses = ["SHOW TABLES"]
        if namespace:
            clauses.append("FROM " + namespace.name)
        if like is not None:
            # The pattern has to be a complete single-quoted SQL string literal
            # (opening and closing quote); embedded single quotes are doubled
            # so they cannot terminate the literal early.
            clauses.append("LIKE '" + like.replace("'", "''") + "'")
        return self.query(" ".join(clauses), **kwargs)

    @override
    def _table_exists(self, table, **kwargs):
        from pyhive.exc import DatabaseError

        # Existence is probed by describing the table: a `DatabaseError` from
        # the describe call is treated as "table does not exist". Logging is
        # muted during the probe so the expected failure is not reported.
        # The previous `disabled` state is remembered and restored afterwards,
        # so a logger that was already disabled stays disabled.
        was_disabled = getattr(logger, 'disabled', False)
        logger.disabled = True
        try:
            self.table_desc(table, **kwargs)
        except DatabaseError:
            return False
        else:
            return True
        finally:
            logger.disabled = was_disabled

    @override
    def _table_drop(self, table, **kwargs):
        # Issue DROP TABLE for `table`. Extra keyword arguments are forwarded
        # to `execute`, matching how the other table helpers in this class
        # forward theirs, instead of being silently discarded.
        return self.execute("DROP TABLE {table}".format(table=table), **kwargs)

    @override
    def _table_desc(self, table, **kwargs):
        # Run DESCRIBE for `table` and return whatever `self.query` produces.
        describe_sql = "DESCRIBE {0}".format(table)
        return self.query(describe_sql, **kwargs)

    @override
    def _table_partition_cols(self, table, **kwargs):
        # Returns the names of the columns whose 'Extra' description mentions
        # 'partition key', or an empty list when there is no 'Extra' column.
        # Assumes the describe result is a pandas DataFrame with 'Column' and
        # 'Extra' columns -- TODO confirm against `self.query`'s output format.
        desc = self._table_desc(table, **kwargs)
        if 'Extra' not in desc:
            return []
        # `na=False` makes missing 'Extra' values count as "not a partition
        # key"; without it the mask contains NaN and cannot be used for
        # boolean indexing.
        is_partition_key = desc['Extra'].str.contains('partition key', na=False)
        return list(desc[is_partition_key]['Column'])

    @override
    def _table_head(self, table, n=10, **kwargs):
        # Select every column of `table`, limited to the first `n` rows
        # (10 by default), via `self.query`.
        head_sql = "SELECT * FROM {} LIMIT {}".format(table, n)
        return self.query(head_sql, **kwargs)

    @override
    def _table_props(self, table, **kwargs):
        # Table properties are not implemented by this client.
        raise NotImplementedError()