Commit ea69739b authored by Yifan Wang's avatar Yifan Wang
Browse files

transplant samples generation to python3

parent 97346f8e
{
"random_seed": 42,
"background_data_directory": null,
"dq_bits": [0, 1, 2, 3],
"inj_bits": [0, 1, 2, 4],
"waveform_params_file_name": "waveform_params.ini",
"max_runtime": 60,
"n_injection_samples": 32,
"n_noise_samples": 16,
"n_processes": 4,
"output_file_name": "default.hdf"
}
; -----------------------------------------------------------------------------
; DECLARE ARGUMENTS
; -----------------------------------------------------------------------------
[variable_args]
; Waveform parameters that will vary in MCMC
mass1 =
mass2 =
spin1z =
spin2z =
ra =
dec =
coa_phase =
inclination =
polarization =
injection_snr =
[static_args]
; Waveform parameters that will not change in MCMC
approximant = SEOBNRv4
domain = time
f_lower = 18
distance = 100
waveform_length = 128
; Width of the background noise interval (in seconds) around the event_time,
; which is used to make the injection. Should be larger than (see below):
; sample_length = seconds_before_event + seconds_after_event
; because we need to crop off the edges that are corrupted by the whitening.
noise_interval_width = 16
; original_sampling_rate = Sampling rate of raw HDF files (usually 4096 Hz)
; target_sampling_rate = Desired sampling rate for sample generation output
original_sampling_rate = 4096
target_sampling_rate = 2048
; Define parameters for the whitening procedure. See documentation of the
; pycbc.types.TimeSeries.whiten() method for an explanation of what these
; values exactly mean.
whitening_segment_duration = 4
whitening_max_filter_duration = 4
; Define the lower and upper bound for the bandpass filter (in Hertz)
bandpass_lower = 20
bandpass_upper = 2048
; Define how to align the sample around the event time. By convention, the
; event time is the H1 time!
; The sum of these values will be the sample_length!
seconds_before_event = 5.5
seconds_after_event = 2.5
; alpha for the Tukey window that is used to "fade on" the waveforms
; It represents the fraction of the window inside the cosine tapered region.
; To turn off the "fade on", simply choose tukey_alpha = 0.
tukey_alpha = 0.25
; -----------------------------------------------------------------------------
; DEFINE DISTRIBUTIONS FOR PARAMETERS
; -----------------------------------------------------------------------------
[prior-mass1]
; Prior for mass1
name = uniform
min-mass1 = 10.
max-mass1 = 80.
[prior-mass2]
; Prior for mass2
name = uniform
min-mass2 = 10.
max-mass2 = 80.
[prior-spin1z]
; Prior for spin1z
name = uniform
min-spin1z = 0
max-spin1z = 0.998
[prior-spin2z]
; Prior for spin2z
name = uniform
min-spin2z = 0
max-spin2z = 0.998
[prior-injection_snr]
; Prior for the injection SNR
name = uniform
min-injection_snr = 5
max-injection_snr = 20
[prior-coa_phase]
; Coalescence phase prior
name = uniform_angle
[prior-inclination]
; Inclination prior
name = sin_angle
[prior-ra+dec]
; Sky position prior
name = uniform_sky
[prior-polarization]
; Polarization prior
name = uniform_angle
"""
The "main script" of this repository: Read in a configuration file and
generate synthetic GW data according to the provided specifications.
"""
# -----------------------------------------------------------------------------
# IMPORTS
# -----------------------------------------------------------------------------
from __future__ import print_function
import argparse
import numpy as np
import os
import sys
import time
from itertools import count
from multiprocessing import Process, Queue
from tqdm import tqdm
from utils.configfiles import read_ini_config, read_json_config
from utils.hdffiles import NoiseTimeline
from utils.samplefiles import SampleFile
from utils.samplegeneration import generate_sample
from utils.waveforms import WaveformParameterGenerator
# -----------------------------------------------------------------------------
# MAIN CODE
# -----------------------------------------------------------------------------
if __name__ == '__main__':

    # -------------------------------------------------------------------------
    # Preliminaries
    # -------------------------------------------------------------------------

    # Start the stopwatch to measure the total runtime of this script
    script_start = time.time()

    print('')
    print('GENERATE A GW DATA SAMPLE FILE')
    print('')

    # -------------------------------------------------------------------------
    # Parse the command line arguments
    # -------------------------------------------------------------------------

    # Set up the parser and add arguments
    parser = argparse.ArgumentParser(description='Generate a GW data sample.')
    parser.add_argument('--config-file',
                        help='Name of the JSON configuration file which '
                             'controls the sample generation process.',
                        default='default.json')

    # Parse the arguments that were passed when calling this script
    print('Parsing command line arguments...', end=' ')
    command_line_arguments = vars(parser.parse_args())
    print('Done!')

    # -------------------------------------------------------------------------
    # Read in JSON config file specifying the sample generation process
    # -------------------------------------------------------------------------

    # Build the full path to the config file
    json_config_name = command_line_arguments['config_file']
    json_config_path = os.path.join('.', 'config_files', json_config_name)

    # Read the JSON configuration into a dict
    print('Reading and validating in JSON configuration file...', end=' ')
    config = read_json_config(json_config_path)
    print('Done!')

    # -------------------------------------------------------------------------
    # Read in INI config file specifying the static_args and variable_args
    # -------------------------------------------------------------------------

    # Build the full path to the waveform params file
    ini_config_name = config['waveform_params_file_name']
    ini_config_path = os.path.join('.', 'config_files', ini_config_name)

    # Read in the variable_arguments and static_arguments
    print('Reading and validating in INI configuration file...', end=' ')
    variable_arguments, static_arguments = read_ini_config(ini_config_path)
    print('Done!\n')

    # -------------------------------------------------------------------------
    # Shortcuts and random seed
    # -------------------------------------------------------------------------

    # Set the random seed for this script
    np.random.seed(config['random_seed'])

    # Define some useful shortcuts
    random_seed = config['random_seed']
    # NOTE(review): max_runtime is read from the config but never enforced
    # in this (sequential) version of the script; it is kept so that config
    # files remain compatible with the multiprocessing-based variant.
    max_runtime = config['max_runtime']
    bkg_data_dir = config['background_data_directory']

    # -------------------------------------------------------------------------
    # Construct a generator for sampling waveform parameters
    # -------------------------------------------------------------------------

    # Initialize a waveform parameter generator that can sample injection
    # parameters from the distributions specified in the config file
    waveform_parameter_generator = \
        WaveformParameterGenerator(config_file=ini_config_path,
                                   random_seed=random_seed)

    # Wrap it in a generator expression so that we can easily sample from it
    # by calling next(waveform_parameters). iter(int, 1) is an infinite
    # iterator (int() == 0 never equals the sentinel 1).
    waveform_parameters = \
        (waveform_parameter_generator.draw() for _ in iter(int, 1))

    # -------------------------------------------------------------------------
    # Construct a generator for sampling valid noise times
    # -------------------------------------------------------------------------

    # If the 'background_data_directory' is None, we will use synthetic noise
    if bkg_data_dir is None:

        print('Using synthetic noise! (background_data_directory = None)\n')

        # Create an iterator that returns a fake "event time", which we will
        # use as a seed for the RNG to ensure the reproducibility of the
        # generated synthetic noise.
        # For the HDF file path that contains that time, we always yield
        # None, so that we know that we need to generate synthetic noise.
        noise_times = ((1000000000 + _, None) for _ in count())

    # Otherwise, we set up a timeline object for the background noise, that
    # is, we read in all HDF files in the raw_data_directory and figure out
    # which parts of it are useable (i.e., have the right data quality and
    # injection bits set as specified in the config file).
    else:

        print('Using real noise from LIGO recordings! '
              '(background_data_directory = {})'.format(bkg_data_dir))
        print('Reading in raw data. This may take several minutes...', end=' ')

        # Create a timeline object by running over all HDF files once
        noise_timeline = NoiseTimeline(background_data_directory=bkg_data_dir,
                                       random_seed=random_seed)

        # Create a noise time generator so that we can sample valid noise
        # times simply by calling next(noise_times)
        delta_t = int(static_arguments['noise_interval_width'] / 2)
        noise_times = (noise_timeline.sample(delta_t=delta_t,
                                             dq_bits=config['dq_bits'],
                                             inj_bits=config['inj_bits'],
                                             return_paths=True)
                       for _ in iter(int, 1))

        print('Done!\n')

    # -------------------------------------------------------------------------
    # Define a convenience function to generate arguments for the simulation
    # -------------------------------------------------------------------------

    def generate_arguments(injection=True):
        """Assemble the keyword arguments for one generate_sample() call."""

        # Only sample waveform parameters if we are making an injection
        waveform_params = next(waveform_parameters) if injection else None

        # Return all necessary arguments as a dictionary
        return dict(static_arguments=static_arguments,
                    event_tuple=next(noise_times),
                    waveform_params=waveform_params)

    # -------------------------------------------------------------------------
    # Finally: Create our samples!
    # -------------------------------------------------------------------------

    # Keep track of all the samples (and parameters) we have generated
    samples = dict(injection_samples=[], noise_samples=[])
    injection_parameters = dict(injection_samples=[], noise_samples=[])

    # Generate both the samples containing an injection and the pure noise
    # samples. BUGFIX: the previous version re-initialized results_list
    # INSIDE the per-sample loop, so only the last generated sample survived
    # the zip(*results_list) below; it also never generated any noise
    # samples, although 'n_noise_samples' is a required config key and the
    # output file format expects them.
    for sample_type, injection in (('injection_samples', True),
                                   ('noise_samples', False)):

        if injection:
            print('Generating samples containing an injection...')
            n_samples = config['n_injection_samples']
        else:
            print('Generating samples *not* containing an injection...')
            n_samples = config['n_noise_samples']
        print('Number of samples:', n_samples)

        # Generate the requested number of samples sequentially
        results_list = []
        for _ in range(n_samples):
            arguments = generate_arguments(injection=injection)
            results_list.append(generate_sample(**arguments))

        # ---------------------------------------------------------------------
        # Process results in the results_list
        # ---------------------------------------------------------------------

        # Separate the samples and the injection parameters, and sort all
        # results by the event_time. Guard against n_samples == 0, where
        # zip(*[]) would raise a ValueError.
        if results_list:
            sample_list, params_list = zip(*results_list)
            idx = np.argsort([_['event_time'] for _ in sample_list])
            samples[sample_type] = [sample_list[i] for i in idx]
            injection_parameters[sample_type] = [params_list[i] for i in idx]

        print('Sample generation completed!\n')

    # -------------------------------------------------------------------------
    # Compute the normalization parameters for this file
    # -------------------------------------------------------------------------

    print('Computing normalization parameters for sample...', end=' ')

    # Gather all samples (with and without injection) in one list
    all_samples = list(samples['injection_samples'] + samples['noise_samples'])

    # Group all samples by detector
    h1_samples = [_['h1_strain'] for _ in all_samples]
    l1_samples = [_['l1_strain'] for _ in all_samples]

    # Stack recordings along first axis
    h1_samples = np.vstack(h1_samples)
    l1_samples = np.vstack(l1_samples)

    # Compute the mean and standard deviation for both detectors as the median
    # of the means / standard deviations for each sample. This is more robust
    # towards outliers than computing "global" parameters by concatenating all
    # samples and treating them as a single, long time series.
    normalization_parameters = \
        dict(h1_mean=np.median(np.mean(h1_samples, axis=1), axis=0),
             l1_mean=np.median(np.mean(l1_samples, axis=1), axis=0),
             h1_std=np.median(np.std(h1_samples, axis=1), axis=0),
             l1_std=np.median(np.std(l1_samples, axis=1), axis=0))

    print('Done!\n')

    # -------------------------------------------------------------------------
    # Create a SampleFile dict from list of samples and save it as an HDF file
    # -------------------------------------------------------------------------

    print('Saving the results to HDF file ...', end=' ')

    # Initialize the dictionary that we use to create a SampleFile object
    sample_file_dict = dict(command_line_arguments=command_line_arguments,
                            injection_parameters=dict(),
                            injection_samples=dict(),
                            noise_samples=dict(),
                            normalization_parameters=normalization_parameters,
                            static_arguments=static_arguments)

    # Collect and add samples (with and without injection)
    for sample_type in ('injection_samples', 'noise_samples'):
        for key in ('event_time', 'h1_strain', 'l1_strain'):
            if samples[sample_type]:
                value = np.array([_[key] for _ in list(samples[sample_type])])
            else:
                value = None
            sample_file_dict[sample_type][key] = value

    # Collect and add injection_parameters (ignore noise samples here, because
    # for those, the injection_parameters are always None)
    other_keys = ['h1_signal', 'h1_output_signal', 'h1_snr',
                  'l1_signal', 'l1_output_signal', 'l1_snr', 'scale_factor']
    for key in list(variable_arguments + other_keys):
        if injection_parameters['injection_samples']:
            value = np.array([_[key] for _ in
                              injection_parameters['injection_samples']])
        else:
            value = None
        sample_file_dict['injection_parameters'][key] = value

    # Construct the path for the output HDF file
    output_dir = os.path.join('.', 'output')
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)
    sample_file_path = os.path.join(output_dir, config['output_file_name'])

    # Create the SampleFile object and save it to the specified output file
    sample_file = SampleFile(data=sample_file_dict)
    sample_file.to_hdf(file_path=sample_file_path)

    print('Done!')

    # Get file size in MB and print the result
    sample_file_size = os.path.getsize(sample_file_path) / 1024**2
    print('Size of resulting HDF file: {:.2f}MB'.format(sample_file_size))
    print('')

    # -------------------------------------------------------------------------
    # Postliminaries
    # -------------------------------------------------------------------------

    # PyCBC always creates a copy of the waveform parameters file, which we
    # can delete at the end of the sample generation process
    duplicate_path = os.path.join('.', config['waveform_params_file_name'])
    if os.path.exists(duplicate_path):
        os.remove(duplicate_path)

    # Print the total run time
    print('Total runtime: {:.1f} seconds!'.format(time.time() - script_start))
    print('')
"""
Provide functions for reading and parsing configuration files.
"""
# -----------------------------------------------------------------------------
# IMPORTS
# -----------------------------------------------------------------------------
import json
import os
from pycbc.workflow import WorkflowConfigParser
from pycbc.distributions import read_params_from_config
from .staticargs import amend_static_args, typecast_static_args
# -----------------------------------------------------------------------------
# FUNCTION DEFINITIONS
# -----------------------------------------------------------------------------
def read_ini_config(file_path):
    """
    Read in a `*.ini` config file, which is used mostly to specify the
    waveform simulation (for example, the waveform model, the parameter
    space for the binary black holes, etc.) and return its contents.

    Args:
        file_path (str): Path to the `*.ini` config file to be read in.

    Returns:
        A tuple `(variable_arguments, static_arguments)` where

        * `variable_arguments` is a list of the parameters that get
          randomly sampled from the specified distributions, usually
          through an instance of
          :class:`utils.waveforms.WaveformParameterGenerator`.
        * `static_arguments` is a dictionary mapping parameter names to
          the values that are shared by every generated example (i.e.,
          the non-physical parameters such as the waveform model and
          the sampling rate).
    """

    # Fail early if the given path does not point to an existing file
    if not os.path.exists(file_path):
        raise IOError('Specified configuration file does not exist: '
                      '{}'.format(file_path))

    # Let PyCBC parse the INI file and split its contents into the
    # randomly-sampled and the fixed parameters
    config_parser = WorkflowConfigParser(configFiles=[file_path])
    variable_arguments, static_arguments = \
        read_params_from_config(config_parser)

    # Cast the static arguments to their proper types, then fill in the
    # derived / amended values before handing everything back
    static_arguments = \
        amend_static_args(typecast_static_args(static_arguments))

    return variable_arguments, static_arguments
def read_json_config(file_path):
    """
    Read in a `*.json` config file, which is used to specify the
    sample generation process itself (for example, the number of
    samples to generate, the number of concurrent processes to use,
    etc.) and return its contents.

    Args:
        file_path (str): Path to the `*.json` config file to be read in.

    Returns:
        A `dict` containing the contents of the given JSON file.
    """

    # Fail early if the given path does not point to an existing file
    if not os.path.exists(file_path):
        raise IOError('Specified configuration file does not exist: '
                      '{}'.format(file_path))

    # Parse the JSON contents of the config file into a dict
    with open(file_path, 'r') as json_file:
        config = json.load(json_file)

    # Every one of these keys must be present in the configuration;
    # verify that and complain about any that are absent
    required_keys = {'background_data_directory', 'dq_bits', 'inj_bits',
                     'waveform_params_file_name', 'max_runtime',
                     'n_injection_samples', 'n_noise_samples', 'n_processes',
                     'random_seed', 'output_file_name'}
    missing_keys = required_keys - set(config)
    if missing_keys:
        raise KeyError('Missing required key(s) in JSON configuration file: '
                       '{}'.format(', '.join(list(missing_keys))))

    return config
"""
Provide classes and functions for reading and writing HDF files.
"""
# -----------------------------------------------------------------------------
# IMPORTS
# -----------------------------------------------------------------------------
from __future__ import print_function
import numpy as np
import h5py
import os
import sys
from pycbc.catalog import Catalog
from pycbc.types.timeseries import TimeSeries
from lal import LIGOTimeGPS
# -----------------------------------------------------------------------------
# FUNCTION DEFINITIONS
# -----------------------------------------------------------------------------
def get_file_paths(directory, extensions=None):
    """
    Take a directory and return the paths to all files in this
    directory and its subdirectories. Optionally filter out only
    files with specific extensions.

    Args:
        directory (str): Path to a directory.
        extensions (list): List of allowed file extensions,
            for example: `['hdf', 'h5']`. If None (default), no
            filtering is applied and all files are returned.

    Returns:
        List of paths of all files matching the above descriptions.
    """

    # Walk over the directory tree and collect the paths of all files
    file_paths = [os.path.join(path, file_name)
                  for path, _, files in os.walk(directory)
                  for file_name in files]

    # If a list of extensions is provided, only keep the corresponding
    # files. str.endswith() accepts a tuple of suffixes, which replaces
    # the nested any([...]) loop of the previous version (an empty
    # extensions list still filters out everything, as before).
    if extensions is not None:
        suffixes = tuple(extensions)
        file_paths = [_ for _ in file_paths if _.endswith(suffixes)]

    return file_paths