Skip to content
Snippets Groups Projects
Select Git revision
  • ea0c03d15e2b590cfd13487093bf7ab4d31029e6
  • master default protected
2 results

Is_it_inherent_wandt.py

Blame
  • Forked from Xisco Jimenez Forteza / RDStackingProject
    Source project has a limited visibility.
    samplefiles.py 15.20 KiB
    """
    Provide tools for writing and reading the sample HDF files produced by
    the sample generation.
    """
    
    # -----------------------------------------------------------------------------
    # IMPORTS
    # -----------------------------------------------------------------------------
    
    import numpy as np
    import pandas as pd
    import h5py
    
    from six import iteritems
    from pprint import pformat
    from warnings import warn
    
    
    # -----------------------------------------------------------------------------
    # CLASS DEFINITIONS
    # -----------------------------------------------------------------------------
    
    class SampleFile:
        """
        :class:`SampleFile` objects serve as an abstraction for the result
        files of the sample generation.

        Args:
            data (dict): A dictionary containing the following keys:

                .. code-block:: python

                   {'command_line_arguments', 'static_arguments',
                    'injection_samples', 'noise_samples',
                    'injection_parameters', 'normalization_parameters'}

                The value for every key must again be a dictionary relating
                the names of sample parameters (e.g., 'h1_snr') to a numpy
                array containing the values for that parameter.
                If `data` is None, every key is initialized with an empty
                dictionary.
        """

        # Names of the top-level entries every `data` dictionary must provide
        _REQUIRED_KEYS = ('command_line_arguments', 'static_arguments',
                          'injection_samples', 'noise_samples',
                          'injection_parameters', 'normalization_parameters')

        # Data sets that `read_hdf` reads from the two sample groups
        _SAMPLE_KEYS = ('event_time', 'h1_strain', 'l1_strain')

        # Static arguments that `as_dataframe` also attaches to noise samples
        _NOISE_STATIC_ARGUMENTS = ('random_seed', 'target_sampling_rate',
                                   'bandpass_lower', 'bandpass_upper',
                                   'seconds_before_event',
                                   'seconds_after_event', 'sample_length')

        def __init__(self,
                     data=None):

            # Perform sanity checks on data
            self.__check_data(data)

            # If we have received data, store it; else initialize an empty
            # dict for every required key
            if data is not None:
                self.data = data
            else:
                self.data = {key: dict() for key in self._REQUIRED_KEYS}

        # -------------------------------------------------------------------------

        @staticmethod
        def __check_data(data):
            """
            Run some sanity checks on `data`.

            The checks raise :class:`AssertionError` explicitly (instead of
            using ``assert`` statements) so that they are still performed
            when Python runs with optimizations enabled (``-O``).

            Args:
                data (dict): A dictionary as specified in the ``__init__``
                    of this class, that is, a dictionary containing the
                    following keys:

                    .. code-block:: python

                        {'command_line_arguments', 'static_arguments',
                         'injection_samples', 'noise_samples',
                         'injection_parameters', 'normalization_parameters'}

                    ``None`` is accepted as well.

            Raises:
                AssertionError: If `data` is neither a dict nor None, or if
                    one of the required keys is missing.
            """

            if data is None:
                return

            if not isinstance(data, dict):
                raise AssertionError('data must be either dict or None!')

            for key in SampleFile._REQUIRED_KEYS:
                if key not in data:
                    raise AssertionError(
                        'data must provide key "{}"!'.format(key))

        # -------------------------------------------------------------------------

        def __repr__(self):
            """Return the pretty-printed contents of ``self.data``."""

            return pformat(self.data, indent=4)

        # -------------------------------------------------------------------------

        def __str__(self):
            """Return the pretty-printed contents of ``self.data``."""

            return pformat(self.data, indent=4)

        # -------------------------------------------------------------------------

        def __getitem__(self, item):
            """Return ``self.data[item]``."""

            return self.data[item]

        # -------------------------------------------------------------------------

        def __setitem__(self, key, value):
            """Store `value` under `key` in ``self.data``."""

            self.data[key] = value

        # -------------------------------------------------------------------------

        @staticmethod
        def _decode_attrs(attrs):
            """
            Copy HDF attributes into a plain dict of strings.

            Values of type ``bytes`` are decoded as ASCII; all other values
            (e.g., attributes that are already returned as ``str``) are
            passed through unchanged.
            """

            return {key: (value.decode('ascii')
                          if isinstance(value, bytes) else value)
                    for key, value in dict(attrs).items()}

        # -------------------------------------------------------------------------

        @staticmethod
        def _read_datasets(group, keys):
            """
            Read the data sets named by `keys` from the HDF `group` into a
            dict of numpy arrays.

            A data set whose conversion raises a ``TypeError`` is stored as
            ``np.array(None)``; presumably this is what happens for data
            sets that were written without data by ``to_hdf``.
            """

            datasets = dict()
            for key in keys:
                try:
                    datasets[key] = np.array(group[key])
                except TypeError:
                    datasets[key] = np.array(None)
            return datasets

        # -------------------------------------------------------------------------

        def read_hdf(self, file_path):
            """
            Read in an existing HDF sample file (e.g., to use an instance
            of :class:`SampleFile` as a convenience wrapper for accessing
            the contents of an HDF samples file).

            The previous contents of ``self.data`` are replaced only after
            the file has been read completely, so a failure while reading
            leaves the object unchanged.

            Args:
                file_path (str): The path to the HDF file to be read into
                    the :class:`SampleFile` object.
            """

            # Collect everything in a local dict first; see docstring
            data = {}

            with h5py.File(file_path, 'r') as hdf_file:

                # Read in the dicts that are stored as group attributes
                data['command_line_arguments'] = \
                    self._decode_attrs(hdf_file['command_line_arguments'].attrs)
                data['static_arguments'] = \
                    self._decode_attrs(hdf_file['static_arguments'].attrs)

                # Read in the groups containing injection and noise samples
                for group_name in ('injection_samples', 'noise_samples'):
                    data[group_name] = \
                        self._read_datasets(hdf_file[group_name],
                                            self._SAMPLE_KEYS)

                # Read in injection parameters (all data sets of the group)
                parameter_group = hdf_file['/injection_parameters']
                data['injection_parameters'] = \
                    self._read_datasets(parameter_group,
                                        parameter_group.keys())

                # Read in dict with normalization parameters
                attrs = dict(hdf_file['normalization_parameters'].attrs)
                data['normalization_parameters'] = \
                    {key: float(value) for key, value in attrs.items()}

            self.data = data

        # -------------------------------------------------------------------------

        @staticmethod
        def _write_dataset(group, name, value, dtype):
            """
            Create the data set `name` in the HDF `group`.

            If `value` is None, or the ``np.array(None)`` placeholder that
            ``read_hdf`` stores for data sets without data, a data set
            without shape and data is created; otherwise `value` is written
            with its own shape and the given `dtype`.
            """

            is_placeholder = (isinstance(value, np.ndarray) and
                              value.shape == () and
                              value.dtype == object and
                              value.item() is None)

            if value is None or is_placeholder:
                group.create_dataset(name=name,
                                     shape=None,
                                     dtype=dtype)
            else:
                group.create_dataset(name=name,
                                     shape=value.shape,
                                     dtype=dtype,
                                     data=value)

        # -------------------------------------------------------------------------

        def to_hdf(self, file_path):
            """
            Write the contents of the :class:`SampleFile` to an HDF file.

            The file is opened in mode ``'w'``. Command line arguments and
            static arguments are stored as string attributes, normalization
            parameters as float attributes, and samples and injection
            parameters as data sets of their respective groups.

            Args:
                file_path (str): The path of the HDF file to be written.
            """

            with h5py.File(file_path, 'w') as hdf_file:

                # Save the values of the argument dicts as (string)
                # attributes of a group of the same name
                for group_name in ('command_line_arguments',
                                   'static_arguments'):
                    group = hdf_file.create_group(group_name)
                    for key, value in self.data[group_name].items():
                        group.attrs[key] = str(value)

                # Save every item of the sample dicts as a new data set;
                # the event time is stored with double precision
                for group_name in ('injection_samples', 'noise_samples'):
                    group = hdf_file.create_group(group_name)
                    for key, value in self.data[group_name].items():
                        dtype = 'float64' if key == 'event_time' else 'float32'
                        self._write_dataset(group, key, value, dtype)

                # Save every injection parameter as a new data set
                group = hdf_file.create_group('injection_parameters')
                for key, value in self.data['injection_parameters'].items():
                    self._write_dataset(group, key, value, 'float64')

                # Save every normalization parameter as a new attribute
                group = hdf_file.create_group('normalization_parameters')
                for key, value in \
                        self.data['normalization_parameters'].items():
                    group.attrs[key] = float(value)

        # -------------------------------------------------------------------------

        @staticmethod
        def _count_samples(samples):
            """
            Return the number of samples in the dict `samples`, that is, the
            length of its ``'event_time'`` entry. A missing or 0-dimensional
            entry (e.g., ``np.array(None)``) counts as zero samples.
            """

            event_time = samples.get('event_time')
            if event_time is None or np.ndim(event_time) == 0:
                return 0
            return len(event_time)

        # -------------------------------------------------------------------------

        @staticmethod
        def _columns_to_frame(columns, n_rows):
            """
            Build a data frame with `n_rows` rows from the dict `columns`;
            row ``i`` holds ``value[i]`` for every entry of `columns`.

            0-dimensional entries (which cannot be indexed) are skipped and
            the columns of the result are sorted by name.
            """

            usable = {key: value for key, value in columns.items()
                      if np.ndim(value) > 0}
            if n_rows == 0 or not usable:
                return pd.DataFrame()

            records = [{key: value[i] for key, value in usable.items()}
                       for i in range(n_rows)]
            frame = pd.DataFrame(records)
            return frame.reindex(columns=sorted(frame.columns))

        # -------------------------------------------------------------------------

        def as_dataframe(self,
                         injection_parameters=False,
                         static_arguments=False,
                         command_line_arguments=False,
                         split_injections_noise=False):
            """
            Return the contents of the :class:`SampleFile` as a ``pandas``
            data frame.

            Args:
                injection_parameters (bool): Whether or not to return
                    the `injection parameters` for every sample.
                static_arguments (bool): Whether or not to return
                    the `static_arguments` for every sample.
                command_line_arguments (bool): Whether or not to return
                    the `command_line_arguments` for every sample.
                split_injections_noise (bool): If this is set to True, a
                    separate data frame will be returned for both the
                    samples with and without an injection.

            Returns:
                One (or two, if `split_injections_noise` is set to `True`)
                pandas data frame containing the sample stored in the
                :class:`SampleFile` object. In the combined data frame, the
                samples containing an injection come first, followed by the
                noise samples.
            """

            # Create one data frame for the samples containing an injection
            # and one for the samples not containing an injection
            df_injection_samples = self._columns_to_frame(
                self.data['injection_samples'],
                self._count_samples(self.data['injection_samples']))
            df_noise_samples = self._columns_to_frame(
                self.data['noise_samples'],
                self._count_samples(self.data['noise_samples']))

            # Remember how many rows at the top of the combined data frame
            # belong to injections (needed for the optional split below)
            n_injections = len(df_injection_samples)

            # If requested, create a data frame for the injection parameters
            # and merge it with the data frame containing the injection samples
            if injection_parameters:
                df_injection_params = self._columns_to_frame(
                    self.data['injection_parameters'], n_injections)
                df = pd.concat([df_injection_samples, df_injection_params],
                               axis=1, sort=True)
            else:
                df = df_injection_samples

            # If requested, add the static_arguments to the data frame
            # containing the injections, and a smaller subset of the
            # static_arguments also to the data frame containing the noise
            # samples (only those arguments that make sense there)
            if static_arguments:
                for key, value in self.data['static_arguments'].items():
                    df[key] = value
                    if key in self._NOISE_STATIC_ARGUMENTS:
                        df_noise_samples[key] = value

            # Merge the data frames for the samples with and without
            # injections. Frames without rows are left out of the
            # concatenation itself; their columns are restored afterwards.
            all_columns = sorted(set(df.columns) |
                                 set(df_noise_samples.columns))
            non_empty = [frame for frame in (df, df_noise_samples)
                         if len(frame) > 0]
            if non_empty:
                df = pd.concat(non_empty, ignore_index=True, sort=True)
                df = df.reindex(columns=all_columns)
            else:
                df = pd.DataFrame(columns=all_columns)

            # If requested, add the command line arguments that were used in
            # the creation of the sample file to the combined data frame
            if command_line_arguments:
                for key, value in \
                        self.data['command_line_arguments'].items():
                    df[key] = value

            # Ensure the `event_time` variable is an integer
            try:
                df['event_time'] = df['event_time'].astype(int)
            except KeyError:
                warn('\nNo key "event_time": Data frame is probably empty!')

            # Either split into two data frames for injection and noise
            # samples; the split is positional, so it does not depend on
            # any particular column being present
            if split_injections_noise:
                df_injections = df.iloc[:n_injections].copy()
                df_noise = df.iloc[n_injections:].copy()
                return df_injections, df_noise

            # Or just return a single data frame containing both types of samples
            return df