"""
Provide tools for writing and reading the sample HDF files produced by
the sample generation.
"""
# -----------------------------------------------------------------------------
# IMPORTS
# -----------------------------------------------------------------------------
import numpy as np
import pandas as pd
import h5py
from six import iteritems
from pprint import pformat
from warnings import warn
# -----------------------------------------------------------------------------
# CLASS DEFINITIONS
# -----------------------------------------------------------------------------
class SampleFile:
"""
:class:`SampleFile` objects serve as an abstraction for the result
files of the sample generation.
Args:
data (dict): A dictionary containing the following keys:
.. code-block:: python
{'command_line_arguments', 'static_arguments',
'injection_samples', 'noise_samples',
'injection_parameters', 'normalization_parameters'}
            The value for every key must in turn be a dictionary that
            maps the names of sample parameters (e.g., 'h1_snr') to a
            numpy array containing the values for that parameter.
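
            Example (a minimal sketch; the array shapes and values, the
            output file name, and any key names that do not appear
            elsewhere in this module, e.g., 'n_samples' and
            'maximum_snr', are placeholders):

            .. code-block:: python

                import numpy as np

                data = {
                    'command_line_arguments': {'n_samples': '4'},
                    'static_arguments': {'target_sampling_rate': '2048'},
                    'injection_samples': {
                        'event_time': np.zeros(4),
                        'h1_strain': np.zeros((4, 8192)),
                        'l1_strain': np.zeros((4, 8192))},
                    'noise_samples': {
                        'event_time': np.zeros(4),
                        'h1_strain': np.zeros((4, 8192)),
                        'l1_strain': np.zeros((4, 8192))},
                    'injection_parameters': {
                        'mass1': np.zeros(4),
                        'h1_snr': np.zeros(4)},
                    'normalization_parameters': {'maximum_snr': 1.0}}

                sample_file = SampleFile(data=data)
                sample_file.to_hdf('output.hdf')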
"""
def __init__(self,
data=None):
# Perform sanity checks on data
self.__check_data(data)
# If we have received data, store it; else initialize an empty dict
if data is not None:
self.data = data
else:
self.data = dict(command_line_arguments=dict(),
static_arguments=dict(),
injection_samples=dict(),
noise_samples=dict(),
injection_parameters=dict(),
normalization_parameters=dict())
# -------------------------------------------------------------------------
@staticmethod
def __check_data(data):
"""
        Run some sanity checks on `data` and raise an AssertionError
        if any of these checks fail.
Args:
data (dict): A dictionary as specified in the ``__init__``
of this class, that is, a dictionary containing the
following keys:
.. code-block:: python
{'command_line_arguments', 'static_arguments',
'injection_samples', 'noise_samples',
'injection_parameters', 'normalization_parameters'}
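
        For example, passing a dictionary that is missing any of these
        keys to the constructor (e.g., ``SampleFile(data=dict())``)
        raises an AssertionError.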
"""
assert isinstance(data, dict) or data is None, \
'data must be either dict or None!'
if data is not None:
assert 'command_line_arguments' in data.keys(), \
'data must provide key "command_line_arguments"!'
assert 'static_arguments' in data.keys(), \
'data must provide key "static_arguments"!'
assert 'injection_samples' in data.keys(), \
'data must provide key "injection_samples"!'
assert 'noise_samples' in data.keys(), \
'data must provide key "noise_samples"!'
assert 'injection_parameters' in data.keys(), \
'data must provide key "injection_parameters"!'
assert 'normalization_parameters' in data.keys(), \
'data must provide key "normalization_parameters"!'
# -------------------------------------------------------------------------
def __repr__(self):
return pformat(self.data, indent=4)
# -------------------------------------------------------------------------
def __str__(self):
return pformat(self.data, indent=4)
# -------------------------------------------------------------------------
def __getitem__(self, item):
return self.data[item]
# -------------------------------------------------------------------------
def __setitem__(self, key, value):
self.data[key] = value
# -------------------------------------------------------------------------
def read_hdf(self, file_path):
"""
        Read an existing HDF sample file into this :class:`SampleFile`
        object (e.g., to use the object as a convenience wrapper for
        accessing the contents of the file).
Args:
file_path (str): The path to the HDF file to be read into
the :class:`SampleFile` object.
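
        Example (a minimal sketch; the file name 'samples.hdf' is only
        a placeholder):

        .. code-block:: python

            sample_file = SampleFile()
            sample_file.read_hdf('samples.hdf')
            h1_strain = sample_file['injection_samples']['h1_strain']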
"""
# Clear the existing data
self.data = {}
with h5py.File(file_path, 'r') as hdf_file:
# Read in dict with command_line_arguments
self.data['command_line_arguments'] = \
dict(hdf_file['command_line_arguments'].attrs)
self.data['command_line_arguments'] = \
{key: value.decode('ascii') for key, value in
iteritems(self.data['command_line_arguments'])}
# Read in dict with static_arguments
self.data['static_arguments'] = \
dict(hdf_file['static_arguments'].attrs)
self.data['static_arguments'] = \
{key: value.decode('ascii') for key, value in
iteritems(self.data['static_arguments'])}
# Read in group containing injection samples
self.data['injection_samples'] = dict()
for key in ('event_time', 'h1_strain', 'l1_strain'):
try:
self.data['injection_samples'][key] = \
np.array(hdf_file['injection_samples'][key])
except TypeError:
self.data['injection_samples'][key] = np.array(None)
# Read in group containing noise samples
self.data['noise_samples'] = dict()
for key in ('event_time', 'h1_strain', 'l1_strain'):
try:
self.data['noise_samples'][key] = \
np.array(hdf_file['noise_samples'][key])
except TypeError:
self.data['noise_samples'][key] = np.array(None)
# Read in injection parameters
self.data['injection_parameters'] = dict()
for key in hdf_file['/injection_parameters'].keys():
try:
self.data['injection_parameters'][key] = \
np.array(hdf_file['injection_parameters'][key])
except TypeError:
self.data['injection_parameters'][key] = np.array(None)
# Read in dict with normalization parameters
self.data['normalization_parameters'] = \
dict(hdf_file['normalization_parameters'].attrs)
self.data['normalization_parameters'] = \
{key: float(value) for key, value in
iteritems(self.data['normalization_parameters'])}
# -------------------------------------------------------------------------
def to_hdf(self, file_path):
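        """
        Write the contents of the :class:`SampleFile` object to an HDF
        file at the given `file_path` (an existing file at that
        location is overwritten).

        The values of the `command_line_arguments`, `static_arguments`
        and `normalization_parameters` dicts are stored as attributes
        of their respective HDF groups, whereas every entry of
        `injection_samples`, `noise_samples` and `injection_parameters`
        is stored as a separate dataset in its group.

        Args:
            file_path (str): The path of the HDF file to which the
                contents of the :class:`SampleFile` object are written.
        """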
with h5py.File(file_path, 'w') as hdf_file:
# Create group for command_line_arguments and save the values of
# the dict as attributes of the group
group = hdf_file.create_group('command_line_arguments')
for key, value in iteritems(self.data['command_line_arguments']):
group.attrs[key] = str(value)
# Create group for static_arguments and save the values of
# the dict as attributes of the group
group = hdf_file.create_group('static_arguments')
for key, value in iteritems(self.data['static_arguments']):
group.attrs[key] = str(value)
# Create group for injection_samples and save every item of the
# dict as a new dataset
group = hdf_file.create_group('injection_samples')
for key, value in iteritems(self.data['injection_samples']):
dtype = 'float64' if key == 'event_time' else 'float32'
if value is not None:
group.create_dataset(name=key,
shape=value.shape,
dtype=dtype,
data=value)
else:
group.create_dataset(name=key,
shape=None,
dtype=dtype)
# Create group for noise_samples and save every item of the
# dict as a new dataset
group = hdf_file.create_group('noise_samples')
for key, value in iteritems(self.data['noise_samples']):
dtype = 'float64' if key == 'event_time' else 'float32'
if value is not None:
group.create_dataset(name=key,
shape=value.shape,
dtype=dtype,
data=value)
else:
group.create_dataset(name=key,
shape=None,
dtype=dtype)
# Create group for injection_parameters and save every item of the
# dict as a new dataset
group = hdf_file.create_group('injection_parameters')
for key, value in iteritems(self.data['injection_parameters']):
if value is not None:
group.create_dataset(name=key,
shape=value.shape,
dtype='float64',
data=value)
else:
group.create_dataset(name=key,
shape=None,
dtype='float64')
# Create group for normalization_parameters and save every item
# of the dict as a new attribute
group = hdf_file.create_group('normalization_parameters')
for key, value in iteritems(self.data['normalization_parameters']):
group.attrs[key] = float(value)
# -------------------------------------------------------------------------
def as_dataframe(self,
injection_parameters=False,
static_arguments=False,
command_line_arguments=False,
split_injections_noise=False):
"""
Return the contents of the :class:`SampleFile` as a ``pandas``
data frame.
Args:
            injection_parameters (bool): Whether or not to include the
                `injection_parameters` for every sample in the data
                frame.
            static_arguments (bool): Whether or not to include the
                `static_arguments` for every sample in the data frame.
            command_line_arguments (bool): Whether or not to include
                the `command_line_arguments` for every sample in the
                data frame.
            split_injections_noise (bool): If set to True, two
                separate data frames are returned: one containing only
                the samples with an injection, and one containing only
                the pure noise samples.

        Returns:
            One pandas data frame (or two, if `split_injections_noise`
            is set to `True`) containing the samples stored in the
            :class:`SampleFile` object.
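
        Example (a minimal sketch, assuming a :class:`SampleFile`
        instance named `sample_file` whose contents have already been
        read in, e.g., via `read_hdf()`):

        .. code-block:: python

            df_injections, df_noise = sample_file.as_dataframe(
                injection_parameters=True,
                static_arguments=True,
                split_injections_noise=True)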
"""
# Create a data frame for the samples containing an injection
injection_samples = []
if self.data['injection_samples']['event_time'].shape != ():
for i in range(len(self.data['injection_samples']['event_time'])):
_ = {k: v[i] for k, v in
iteritems(self.data['injection_samples'])}
injection_samples.append(_)
            df_injection_samples = \
                pd.DataFrame(injection_samples).sort_index(axis=1)
else:
df_injection_samples = pd.DataFrame()
# Create a data frame for the samples not containing an injection
noise_samples = []
if self.data['noise_samples']['event_time'].shape != ():
for i in range(len(self.data['noise_samples']['event_time'])):
_ = {k: v[i] for k, v in
iteritems(self.data['noise_samples'])}
noise_samples.append(_)
            df_noise_samples = \
                pd.DataFrame(noise_samples).sort_index(axis=1)
else:
df_noise_samples = pd.DataFrame()
# If requested, create a data frame for the injection parameters and
# merge it with the data frame containing the injection samples
if injection_parameters:
injection_params = []
# Check if we even have any injection parameters
if self.data['injection_parameters']['mass1'].shape != ():
for i in range(len(df_injection_samples)):
_ = {k: v[i] for k, v in
iteritems(self.data['injection_parameters'])}
injection_params.append(_)
                df_injection_params = \
                    pd.DataFrame(injection_params).sort_index(axis=1)
else:
df_injection_params = pd.DataFrame()
df = pd.concat([df_injection_samples, df_injection_params],
axis=1, sort=True)
else:
df = df_injection_samples
# If requested, add the static_arguments to the data frame
# containing the injections, and a smaller subset of the
# static_arguments also to the data frame containing the noise
# samples (only those arguments that make sense there)
if static_arguments:
for key, value in iteritems(self.data['static_arguments']):
df[key] = value
if key in ('random_seed', 'target_sampling_rate',
'bandpass_lower', 'bandpass_upper',
'seconds_before_event', 'seconds_after_event',
'sample_length'):
df_noise_samples[key] = value
# Merge the data frames for the samples with and without injections
        df = pd.concat([df, df_noise_samples], ignore_index=True, sort=True)
# If requested, add the command line arguments that were used in the
# creation of the sample file to the combined data frame
if command_line_arguments:
for key, value in iteritems(self.data['command_line_arguments']):
df[key] = value
# Ensure the `event_time` variable is an integer
try:
df['event_time'] = df['event_time'].astype(int)
except KeyError:
warn('\nNo key "event_time": Data frame is probably empty!')
# Either split into two data frames for injection and noise samples
if split_injections_noise:
df_injections = df[df.h1_signal.notnull()]
df_noise = df[~df.h1_signal.notnull()]
return df_injections, df_noise
# Or just return a single data frame containing both types of samples
else:
return df