"""
Provide tools for writing and reading the sample HDF files produced by
the sample generation.
"""
# -----------------------------------------------------------------------------
# IMPORTS
# -----------------------------------------------------------------------------
import numpy as np
import pandas as pd
import h5py
from six import iteritems
from pprint import pformat
from warnings import warn
# -----------------------------------------------------------------------------
# CLASS DEFINITIONS
# -----------------------------------------------------------------------------
class SampleFile:
"""
:class:`SampleFile` objects serve as an abstraction for the result
files of the sample generation.
Args:
data (dict): A dictionary containing the following keys:
.. code-block:: python
{'command_line_arguments', 'static_arguments',
'injection_samples', 'noise_samples',
'injection_parameters', 'normalization_parameters'}
            The value for every key must in turn be a dictionary that
            maps the names of sample parameters (e.g., 'h1_snr') to a
            numpy array containing the values for that parameter.
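
            Example (a minimal sketch; the array shapes and values, the
            output file name, and any key names that do not appear
            elsewhere in this module, e.g., 'n_samples' and
            'maximum_snr', are placeholders):

            .. code-block:: python

                import numpy as np

                data = {
                    'command_line_arguments': {'n_samples': '4'},
                    'static_arguments': {'target_sampling_rate': '2048'},
                    'injection_samples': {
                        'event_time': np.zeros(4),
                        'h1_strain': np.zeros((4, 8192)),
                        'l1_strain': np.zeros((4, 8192))},
                    'noise_samples': {
                        'event_time': np.zeros(4),
                        'h1_strain': np.zeros((4, 8192)),
                        'l1_strain': np.zeros((4, 8192))},
                    'injection_parameters': {
                        'mass1': np.zeros(4),
                        'h1_snr': np.zeros(4)},
                    'normalization_parameters': {'maximum_snr': 1.0}}

                sample_file = SampleFile(data=data)
                sample_file.to_hdf('output.hdf')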
"""
def __init__(self,
data=None):
# Perform sanity checks on data
self.__check_data(data)
# If we have received data, store it; else initialize an empty dict
if data is not None:
self.data = data
else:
self.data = dict(command_line_arguments=dict(),
static_arguments=dict(),
injection_samples=dict(),
noise_samples=dict(),
injection_parameters=dict(),
normalization_parameters=dict())
# -------------------------------------------------------------------------
@staticmethod
def __check_data(data):
"""
        Run some sanity checks on `data` and raise an AssertionError
        if any of these checks fail.
Args:
data (dict): A dictionary as specified in the ``__init__``
of this class, that is, a dictionary containing the
following keys:
.. code-block:: python
{'command_line_arguments', 'static_arguments',
'injection_samples', 'noise_samples',
'injection_parameters', 'normalization_parameters'}
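
        For example, passing a dictionary that is missing any of these
        keys to the constructor (e.g., ``SampleFile(data=dict())``)
        raises an AssertionError.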
"""
assert isinstance(data, dict) or data is None, \
'data must be either dict or None!'
if data is not None:
assert 'command_line_arguments' in data.keys(), \
'data must provide key "command_line_arguments"!'
assert 'static_arguments' in data.keys(), \
'data must provide key "static_arguments"!'
assert 'injection_samples' in data.keys(), \
'data must provide key "injection_samples"!'
assert 'noise_samples' in data.keys(), \
'data must provide key "noise_samples"!'
assert 'injection_parameters' in data.keys(), \
'data must provide key "injection_parameters"!'
assert 'normalization_parameters' in data.keys(), \
'data must provide key "normalization_parameters"!'
# -------------------------------------------------------------------------
def __repr__(self):
return pformat(self.data, indent=4)
# -------------------------------------------------------------------------
def __str__(self):
return pformat(self.data, indent=4)
# -------------------------------------------------------------------------
def __getitem__(self, item):
return self.data[item]
# -------------------------------------------------------------------------
def __setitem__(self, key, value):
self.data[key] = value
# -------------------------------------------------------------------------
def read_hdf(self, file_path):
"""
        Read an existing HDF sample file into this :class:`SampleFile`
        object (e.g., to use the object as a convenience wrapper for
        accessing the contents of the file).
Args:
file_path (str): The path to the HDF file to be read into
the :class:`SampleFile` object.
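
        Example (a minimal sketch; the file name 'samples.hdf' is only
        a placeholder):

        .. code-block:: python

            sample_file = SampleFile()
            sample_file.read_hdf('samples.hdf')
            h1_strain = sample_file['injection_samples']['h1_strain']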
"""
# Clear the existing data
self.data = {}
with h5py.File(file_path, 'r') as hdf_file:
# Read in dict with command_line_arguments
self.data['command_line_arguments'] = \
dict(hdf_file['command_line_arguments'].attrs)
self.data['command_line_arguments'] = \
{key: value.decode('ascii') for key, value in
iteritems(self.data['command_line_arguments'])}
# Read in dict with static_arguments
self.data['static_arguments'] = \
dict(hdf_file['static_arguments'].attrs)
self.data['static_arguments'] = \
{key: value.decode('ascii') for key, value in
iteritems(self.data['static_arguments'])}
# Read in group containing injection samples
self.data['injection_samples'] = dict()
for key in ('event_time', 'h1_strain', 'l1_strain'):
try:
self.data['injection_samples'][key] = \
np.array(hdf_file['injection_samples'][key])
except TypeError:
self.data['injection_samples'][key] = np.array(None)
# Read in group containing noise samples
self.data['noise_samples'] = dict()
for key in ('event_time', 'h1_strain', 'l1_strain'):
try:
self.data['noise_samples'][key] = \
np.array(hdf_file['noise_samples'][key])
except TypeError:
self.data['noise_samples'][key] = np.array(None)
# Read in injection parameters
self.data['injection_parameters'] = dict()
for key in hdf_file['/injection_parameters'].keys():
try:
self.data['injection_parameters'][key] = \
np.array(hdf_file['injection_parameters'][key])
except TypeError:
self.data['injection_parameters'][key] = np.array(None)
# Read in dict with normalization parameters
self.data['normalization_parameters'] = \
dict(hdf_file['normalization_parameters'].attrs)
self.data['normalization_parameters'] = \
{key: float(value) for key, value in
iteritems(self.data['normalization_parameters'])}
# -------------------------------------------------------------------------
def to_hdf(self, file_path):
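        """
        Write the contents of the :class:`SampleFile` object to an HDF
        file at the given `file_path` (an existing file at that
        location is overwritten).

        The values of the `command_line_arguments`, `static_arguments`
        and `normalization_parameters` dicts are stored as attributes
        of their respective HDF groups, whereas every entry of
        `injection_samples`, `noise_samples` and `injection_parameters`
        is stored as a separate dataset in its group.

        Args:
            file_path (str): The path of the HDF file to which the
                contents of the :class:`SampleFile` object are written.
        """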
with h5py.File(file_path, 'w') as hdf_file:
# Create group for command_line_arguments and save the values of
# the dict as attributes of the group
group = hdf_file.create_group('command_line_arguments')
for key, value in iteritems(self.data['command_line_arguments']):
group.attrs[key] = str(value)
# Create group for static_arguments and save the values of
# the dict as attributes of the group
group = hdf_file.create_group('static_arguments')
for key, value in iteritems(self.data['static_arguments']):
group.attrs[key] = str(value)
# Create group for injection_samples and save every item of the
# dict as a new dataset
group = hdf_file.create_group('injection_samples')
for key, value in iteritems(self.data['injection_samples']):
dtype = 'float64' if key == 'event_time' else 'float32'
if value is not None:
group.create_dataset(name=key,
shape=value.shape,
dtype=dtype,
data=value)
else:
group.create_dataset(name=key,
shape=None,
dtype=dtype)
# Create group for noise_samples and save every item of the
# dict as a new dataset
group = hdf_file.create_group('noise_samples')
for key, value in iteritems(self.data['noise_samples']):
dtype = 'float64' if key == 'event_time' else 'float32'
if value is not None:
group.create_dataset(name=key,
shape=value.shape,
dtype=dtype,
data=value)
else:
group.create_dataset(name=key,
shape=None,
dtype=dtype)
# Create group for injection_parameters and save every item of the
# dict as a new dataset
group = hdf_file.create_group('injection_parameters')
for key, value in iteritems(self.data['injection_parameters']):
if value is not None:
group.create_dataset(name=key,
shape=value.shape,
dtype='float64',
data=value)
else:
group.create_dataset(name=key,
shape=None,
dtype='float64')
# Create group for normalization_parameters and save every item
# of the dict as a new attribute
group = hdf_file.create_group('normalization_parameters')
for key, value in iteritems(self.data['normalization_parameters']):
group.attrs[key] = float(value)
# -------------------------------------------------------------------------
def as_dataframe(self,
injection_parameters=False,
static_arguments=False,
command_line_arguments=False,
split_injections_noise=False):
"""
Return the contents of the :class:`SampleFile` as a ``pandas``
data frame.
Args:
            injection_parameters (bool): Whether or not to include the
                `injection_parameters` for every sample in the data
                frame.
            static_arguments (bool): Whether or not to include the
                `static_arguments` for every sample in the data frame.
            command_line_arguments (bool): Whether or not to include
                the `command_line_arguments` for every sample in the
                data frame.
            split_injections_noise (bool): If set to True, two
                separate data frames are returned: one containing only
                the samples with an injection, and one containing only
                the pure noise samples.

        Returns:
            One pandas data frame (or two, if `split_injections_noise`
            is set to `True`) containing the samples stored in the
            :class:`SampleFile` object.
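
        Example (a minimal sketch, assuming a :class:`SampleFile`
        instance named `sample_file` whose contents have already been
        read in, e.g., via `read_hdf()`):

        .. code-block:: python

            df_injections, df_noise = sample_file.as_dataframe(
                injection_parameters=True,
                static_arguments=True,
                split_injections_noise=True)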
"""
# Create a data frame for the samples containing an injection
injection_samples = []
if self.data['injection_samples']['event_time'].shape != ():
for i in range(len(self.data['injection_samples']['event_time'])):
_ = {k: v[i] for k, v in
iteritems(self.data['injection_samples'])}
injection_samples.append(_)
            df_injection_samples = \
                pd.DataFrame(injection_samples).sort_index(axis=1)
else:
df_injection_samples = pd.DataFrame()
# Create a data frame for the samples not containing an injection
noise_samples = []
if self.data['noise_samples']['event_time'].shape != ():
for i in range(len(self.data['noise_samples']['event_time'])):
_ = {k: v[i] for k, v in
iteritems(self.data['noise_samples'])}
noise_samples.append(_)
            df_noise_samples = \
                pd.DataFrame(noise_samples).sort_index(axis=1)
else:
df_noise_samples = pd.DataFrame()
# If requested, create a data frame for the injection parameters and
# merge it with the data frame containing the injection samples
if injection_parameters:
injection_params = []
# Check if we even have any injection parameters
if self.data['injection_parameters']['mass1'].shape != ():
for i in range(len(df_injection_samples)):
_ = {k: v[i] for k, v in
iteritems(self.data['injection_parameters'])}
injection_params.append(_)
                df_injection_params = \
                    pd.DataFrame(injection_params).sort_index(axis=1)
else:
df_injection_params = pd.DataFrame()
df = pd.concat([df_injection_samples, df_injection_params],
axis=1, sort=True)
else:
df = df_injection_samples
# If requested, add the static_arguments to the data frame
# containing the injections, and a smaller subset of the
# static_arguments also to the data frame containing the noise
# samples (only those arguments that make sense there)
if static_arguments:
for key, value in iteritems(self.data['static_arguments']):
df[key] = value
if key in ('random_seed', 'target_sampling_rate',
'bandpass_lower', 'bandpass_upper',
'seconds_before_event', 'seconds_after_event',
'sample_length'):
df_noise_samples[key] = value
# Merge the data frames for the samples with and without injections
        df = pd.concat([df, df_noise_samples], ignore_index=True, sort=True)
# If requested, add the command line arguments that were used in the
# creation of the sample file to the combined data frame
if command_line_arguments:
for key, value in iteritems(self.data['command_line_arguments']):
df[key] = value
# Ensure the `event_time` variable is an integer
try:
df['event_time'] = df['event_time'].astype(int)
except KeyError:
warn('\nNo key "event_time": Data frame is probably empty!')
# Either split into two data frames for injection and noise samples
if split_injections_noise:
df_injections = df[df.h1_signal.notnull()]
df_noise = df[~df.h1_signal.notnull()]
return df_injections, df_noise
# Or just return a single data frame containing both types of samples
else:
return df